001package org.apache.maven.doxia.module.twiki.parser; 002 003/* 004 * Licensed to the Apache Software Foundation (ASF) under one 005 * or more contributor license agreements. See the NOTICE file 006 * distributed with this work for additional information 007 * regarding copyright ownership. The ASF licenses this file 008 * to you under the Apache License, Version 2.0 (the 009 * "License"); you may not use this file except in compliance 010 * with the License. You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, 015 * software distributed under the License is distributed on an 016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 017 * KIND, either express or implied. See the License for the 018 * specific language governing permissions and limitations 019 * under the License. 020 */ 021 022import java.util.ArrayList; 023import java.util.List; 024import java.util.StringTokenizer; 025import java.util.regex.Matcher; 026import java.util.regex.Pattern; 027 028/** 029 * Parse almost plain text in search of WikiWords, links, ... 030 * 031 * @author Juan F. Codagnone 032 * @version $Id$ 033 */ 034public class TextParser 035{ 036 /** 037 * pattern to detect WikiWords 038 */ 039 private static final Pattern WIKIWORD_PATTERN = 040 Pattern.compile( "(!?([A-Z]\\w*[.])?([A-Z][a-z]+){2,}(#\\w*)?)" ); 041 042 /** 043 * pattern to detect SpecificLinks links [[reference][text]] 044 */ 045 private static final Pattern SPECIFICLINK_PATTERN = Pattern.compile( "!?\\[\\[([^\\]]+)\\]\\[([^\\]]+)\\]\\]" ); 046 047 /** 048 * pattern to detect ForcedLinks links [[reference asd]] 049 */ 050 private static final Pattern FORCEDLINK_PATTERN = Pattern.compile( "(!)?(\\[\\[(.+)\\]\\])" ); 051 052 /** 053 * anchor name 054 */ 055 private static final Pattern ANCHOR_PATTERN = Pattern.compile( "#(([A-Z][A-Za-z]*){2,})" ); 056 057 /** 058 * url word 059 */ 060 private static final Pattern URL_PATTERN = Pattern.compile( "(\\w+):[/][/][^\\s]*" ); 061 062 /** 063 * image pattern specification 064 */ 065 private static final Pattern IMAGE_PATTERN = Pattern.compile( "(.*)\\.(png|jpg|gif|bmp)" ); 066 067 /** 068 * image tag pattern specification (used for images at relative URLs) 069 */ 070 private static final Pattern IMAGE_TAG_PATTERN = 071 Pattern.compile( "<img\\b.*?\\bsrc=([\"'])(.*?)\\1.*>", Pattern.CASE_INSENSITIVE ); 072 073 /** HTML tag pattern */ 074 private static final Pattern HTML_TAG_PATTERN = Pattern.compile( "<(/?)([\\w]*)(.*?)(/?)>", Pattern.DOTALL ); 075 076 /** 077 * resolves wikiWordLinks 078 */ 079 private final WikiWordLinkResolver wikiWordLinkResolver; 080 081 /** resolves noautolink tag */ 082 private boolean noautolink; 083 084 /** 085 * Creates the TextParser. 086 * 087 * @param resolver resolver for wikiWord links 088 */ 089 public TextParser( final WikiWordLinkResolver resolver ) 090 { 091 this.wikiWordLinkResolver = resolver; 092 } 093 094 /** 095 * <p>parse.</p> 096 * 097 * @param line line to parse 098 * @return a list of block that represents the input 099 */ 100 public final List<Block> parse( final String line ) 101 { 102 final List<Block> ret = new ArrayList<Block>(); 103 104 final Matcher linkMatcher = SPECIFICLINK_PATTERN.matcher( line ); 105 final Matcher wikiMatcher = WIKIWORD_PATTERN.matcher( line ); 106 final Matcher forcedLinkMatcher = FORCEDLINK_PATTERN.matcher( line ); 107 final Matcher anchorMatcher = ANCHOR_PATTERN.matcher( line ); 108 final Matcher urlMatcher = URL_PATTERN.matcher( line ); 109 final Matcher imageTagMatcher = IMAGE_TAG_PATTERN.matcher( line ); 110 111 final Matcher tagMatcher = HTML_TAG_PATTERN.matcher( line ); 112 Matcher xhtmlMatcher = null; 113 if ( tagMatcher.find() ) 114 { 115 String tag = tagMatcher.group( 2 ); 116 117 Pattern pattern = 118 Pattern.compile( "(\\<" + tag + ".*\\>)(.*)?(\\<\\/" + tag + "\\>)(.*)?", Pattern.DOTALL ); 119 xhtmlMatcher = pattern.matcher( line ); 120 } 121 122 if ( xhtmlMatcher != null && xhtmlMatcher.find() ) 123 { 124 parseXHTML( line, ret, xhtmlMatcher ); 125 } 126 else if ( linkMatcher.find() ) 127 { 128 parseLink( line, ret, linkMatcher ); 129 } 130 else if ( wikiMatcher.find() && startLikeWord( wikiMatcher, line ) && !noautolink ) 131 { 132 parseWiki( line, ret, wikiMatcher ); 133 } 134 else if ( forcedLinkMatcher.find() ) 135 { 136 parseForcedLink( line, ret, forcedLinkMatcher ); 137 } 138 else if ( anchorMatcher.find() && isAWord( anchorMatcher, line ) ) 139 { 140 parseAnchor( line, ret, anchorMatcher ); 141 } 142 else if ( urlMatcher.find() && isAWord( urlMatcher, line ) ) 143 { 144 parseUrl( line, ret, urlMatcher ); 145 } 146 else if ( imageTagMatcher.find() ) 147 { 148 parseImage( line, ret, imageTagMatcher ); 149 } 150 else 151 { 152 if ( line.length() != 0 ) 153 { 154 ret.add( new TextBlock( line ) ); 155 } 156 } 157 158 return ret; 159 } 160 161 /** 162 * Parses the image tag 163 * @param line the line to parse 164 * @param ret where the results live 165 * @param imageTagMatcher image tag matcher 166 */ 167 private void parseImage( final String line, final List<Block> ret, final Matcher imageTagMatcher ) 168 { 169 ret.addAll( parse( line.substring( 0, imageTagMatcher.start() ) ) ); 170 final String src = imageTagMatcher.group( 2 ); 171 ret.add( new ImageBlock( src ) ); 172 ret.addAll( parse( line.substring( imageTagMatcher.end(), line.length() ) ) ); 173 } 174 175 /** 176 * Parses the url 177 * @param line the line to parse 178 * @param ret where the results live 179 * @param urlMatcher url matcher 180 */ 181 private void parseUrl( final String line, final List<Block> ret, final Matcher urlMatcher ) 182 { 183 ret.addAll( parse( line.substring( 0, urlMatcher.start() ) ) ); 184 final String url = urlMatcher.group( 0 ); 185 final Matcher imageMatcher = IMAGE_PATTERN.matcher( url ); 186 if ( imageMatcher.matches() ) 187 { 188 ret.add( new ImageBlock( url ) ); 189 } 190 else 191 { 192 ret.add( new LinkBlock( url, new TextBlock( url ) ) ); 193 } 194 ret.addAll( parse( line.substring( urlMatcher.end(), line.length() ) ) ); 195 } 196 197 /** 198 * Parses the anchor 199 * @param line the line to parse 200 * @param ret where the results live 201 * @param anchorMatcher anchor matcher 202 */ 203 private void parseAnchor( final String line, final List<Block> ret, final Matcher anchorMatcher ) 204 { 205 ret.addAll( parse( line.substring( 0, anchorMatcher.start() ) ) ); 206 ret.add( new AnchorBlock( anchorMatcher.group( 1 ) ) ); 207 ret.addAll( parse( line.substring( anchorMatcher.end(), line.length() ) ) ); 208 } 209 210 /** 211 * Parses the link 212 * @param line line to parse 213 * @param ret where the results live 214 * @param forcedLinkMatcher forced link matcher 215 */ 216 private void parseForcedLink( final String line, final List<Block> ret, final Matcher forcedLinkMatcher ) 217 { 218 if ( forcedLinkMatcher.group( 1 ) != null ) 219 { 220 ret.add( new TextBlock( forcedLinkMatcher.group( 2 ) ) ); 221 } 222 else 223 { 224 final String showText = forcedLinkMatcher.group( 3 ); 225 // mailto link: 226 if ( showText.trim().startsWith( "mailto:" ) ) 227 { 228 String s = showText.trim(); 229 int i = s.indexOf( ' ' ); 230 if ( i == -1 ) 231 { 232 ret.add( new TextBlock( s ) ); 233 } 234 else 235 { 236 ret.add( new LinkBlock( s.substring( 0, i ), new TextBlock( s.substring( i ).trim() ) ) ); 237 } 238 } 239 else 240 { 241 ret.addAll( parse( line.substring( 0, forcedLinkMatcher.start() ) ) ); 242 ret.add( createLink( showText, showText ) ); 243 ret.addAll( parse( line.substring( forcedLinkMatcher.end(), line.length() ) ) ); 244 } 245 } 246 } 247 248 /** 249 * Decides between a WikiWordBlock or a a LinkBlock 250 * @param link the link text 251 * @param showText the show text. 252 * @return either a WikiWordBlock or a LinkBlock 253 */ 254 private Block createLink( final String link, final String showText ) 255 { 256 final Block content; 257 if ( URL_PATTERN.matcher( showText ).matches() && IMAGE_PATTERN.matcher( showText ).matches() ) 258 { 259 content = new ImageBlock( showText ); 260 } 261 else 262 { 263 content = new TextBlock( showText ); 264 } 265 266 if ( URL_PATTERN.matcher( link ).matches() ) 267 { 268 return new LinkBlock( link, content ); 269 } 270 271 final StringTokenizer tokenizer = new StringTokenizer( link ); 272 final StringBuilder sb = new StringBuilder(); 273 274 while ( tokenizer.hasMoreElements() ) 275 { 276 final String s = tokenizer.nextToken(); 277 sb.append( s.substring( 0, 1 ).toUpperCase() ); 278 sb.append( s.substring( 1 ) ); 279 } 280 return new WikiWordBlock( sb.toString(), content, wikiWordLinkResolver ); 281 } 282 283 /** 284 * Parses a wiki word 285 * @param line the line to parse 286 * @param ret where the results live 287 * @param wikiMatcher wiki matcher 288 */ 289 private void parseWiki( final String line, final List<Block> ret, final Matcher wikiMatcher ) 290 { 291 final String wikiWord = wikiMatcher.group(); 292 ret.addAll( parse( line.substring( 0, wikiMatcher.start() ) ) ); 293 if ( wikiWord.startsWith( "!" ) ) 294 { // link prevention 295 ret.add( new TextBlock( wikiWord.substring( 1 ) ) ); 296 } 297 else 298 { 299 ret.add( new WikiWordBlock( wikiWord, wikiWordLinkResolver ) ); 300 } 301 ret.addAll( parse( line.substring( wikiMatcher.end(), line.length() ) ) ); 302 } 303 304 /** 305 * Parses a link 306 * @param line the line to parse 307 * @param ret where the results live 308 * @param linkMatcher link matcher 309 */ 310 private void parseLink( final String line, final List<Block> ret, final Matcher linkMatcher ) 311 { 312 ret.addAll( parse( line.substring( 0, linkMatcher.start() ) ) ); 313 if ( line.charAt( linkMatcher.start() ) == '!' ) 314 { 315 ret.add( new TextBlock( line.substring( linkMatcher.start() + 1, linkMatcher.end() ) ) ); 316 } 317 else 318 { 319 ret.add( createLink( linkMatcher.group( 1 ), linkMatcher.group( 2 ) ) ); 320 } 321 ret.addAll( parse( line.substring( linkMatcher.end(), line.length() ) ) ); 322 } 323 324 /** 325 * Parses xhtml. 326 * 327 * @param line the line to parse 328 * @param ret where the results live 329 * @param xhtmlMatcher xhtml matcher 330 */ 331 private void parseXHTML( final String line, final List<Block> ret, final Matcher xhtmlMatcher ) 332 { 333 ret.addAll( parse( line.substring( 0, xhtmlMatcher.start() ) ) ); 334 if ( xhtmlMatcher.group( 1 ).indexOf( "noautolink" ) != -1 ) 335 { 336 noautolink = true; 337 } 338 else 339 { 340 ret.add( new XHTMLBlock( xhtmlMatcher.group( 1 ) ) ); 341 } 342 343 ret.addAll( parse( xhtmlMatcher.group( 2 ) ) ); 344 345 if ( xhtmlMatcher.group( 1 ).indexOf( "noautolink" ) != -1 ) 346 { 347 noautolink = false; 348 } 349 else 350 { 351 ret.add( new XHTMLBlock( xhtmlMatcher.group( 3 ) ) ); 352 } 353 354 ret.addAll( parse( xhtmlMatcher.group( 4 ) ) ); 355 } 356 357 /** 358 * @param m matcher to test 359 * @param line line to test 360 * @return <code>true</code> if the match on m represent a word (must be 361 * a space before the word or must be the beginning of the line) 362 */ 363 private boolean isAWord( final Matcher m, final String line ) 364 { 365 return startLikeWord( m, line ) && endLikeWord( m, line ); 366 } 367 368 /** 369 * @param m matcher to test 370 * @param line line to test 371 * @return true if it is the beginning of a word 372 */ 373 private boolean startLikeWord( final Matcher m, final String line ) 374 { 375 final int start = m.start(); 376 377 boolean ret = false; 378 if ( start == 0 ) 379 { 380 ret = true; 381 } 382 else if ( start > 0 ) 383 { 384 if ( isSpace( line.charAt( start - 1 ) ) ) 385 { 386 ret = true; 387 } 388 } 389 390 return ret; 391 } 392 393 /** 394 * @param m matcher to test 395 * @param line line to test 396 * @return true if it is the end of a word 397 */ 398 private boolean endLikeWord( final Matcher m, final String line ) 399 { 400 final int end = m.end(); 401 402 boolean ret = true; 403 if ( end < line.length() ) 404 { 405 ret = isSpace( line.charAt( end ) ); 406 } 407 408 return ret; 409 } 410 411 /** 412 * @param c char to test 413 * @return <code>true</code> if c is a space char 414 */ 415 private boolean isSpace( final char c ) 416 { 417 return c == ' ' || c == '\t'; 418 } 419}