001package org.apache.maven.doxia.module.twiki.parser; 002 003/* 004 * Licensed to the Apache Software Foundation (ASF) under one 005 * or more contributor license agreements. See the NOTICE file 006 * distributed with this work for additional information 007 * regarding copyright ownership. The ASF licenses this file 008 * to you under the Apache License, Version 2.0 (the 009 * "License"); you may not use this file except in compliance 010 * with the License. You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, 015 * software distributed under the License is distributed on an 016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 017 * KIND, either express or implied. See the License for the 018 * specific language governing permissions and limitations 019 * under the License. 020 */ 021 022import java.util.ArrayList; 023import java.util.List; 024import java.util.StringTokenizer; 025import java.util.regex.Matcher; 026import java.util.regex.Pattern; 027 028/** 029 * Parse almost plain text in search of WikiWords, links, ... 030 * 031 * @author Juan F. Codagnone 032 */ 033public class TextParser 034{ 035 /** 036 * pattern to detect WikiWords 037 */ 038 private static final Pattern WIKIWORD_PATTERN = 039 Pattern.compile( "(!?([A-Z]\\w*[.])?([A-Z][a-z]+){2,}(#\\w*)?)" ); 040 041 /** 042 * pattern to detect SpecificLinks links [[reference][text]] 043 */ 044 private static final Pattern SPECIFICLINK_PATTERN = Pattern.compile( "!?\\[\\[([^\\]]+)\\]\\[([^\\]]+)\\]\\]" ); 045 046 /** 047 * pattern to detect ForcedLinks links [[reference asd]] 048 */ 049 private static final Pattern FORCEDLINK_PATTERN = Pattern.compile( "(!)?(\\[\\[(.+)\\]\\])" ); 050 051 /** 052 * anchor name 053 */ 054 private static final Pattern ANCHOR_PATTERN = Pattern.compile( "#(([A-Z][A-Za-z]*){2,})" ); 055 056 /** 057 * url word 058 */ 059 private static final Pattern URL_PATTERN = Pattern.compile( "(\\w+):[/][/][^\\s]*" ); 060 061 /** 062 * image pattern specification 063 */ 064 private static final Pattern IMAGE_PATTERN = Pattern.compile( "(.*)\\.(png|jpg|gif|bmp)" ); 065 066 /** 067 * image tag pattern specification (used for images at relative URLs) 068 */ 069 private static final Pattern IMAGE_TAG_PATTERN = 070 Pattern.compile( "<img\\b.*?\\bsrc=([\"'])(.*?)\\1.*>", Pattern.CASE_INSENSITIVE ); 071 072 /** HTML tag pattern */ 073 private static final Pattern HTML_TAG_PATTERN = Pattern.compile( "<(/?)([\\w]*)(.*?)(/?)>", Pattern.DOTALL ); 074 075 /** 076 * resolves wikiWordLinks 077 */ 078 private final WikiWordLinkResolver wikiWordLinkResolver; 079 080 /** resolves noautolink tag */ 081 private boolean noautolink; 082 083 /** 084 * Creates the TextParser. 085 * 086 * @param resolver resolver for wikiWord links 087 */ 088 public TextParser( final WikiWordLinkResolver resolver ) 089 { 090 this.wikiWordLinkResolver = resolver; 091 } 092 093 /** 094 * <p>parse.</p> 095 * 096 * @param line line to parse 097 * @return a list of block that represents the input 098 */ 099 public final List<Block> parse( final String line ) 100 { 101 final List<Block> ret = new ArrayList<>(); 102 103 final Matcher linkMatcher = SPECIFICLINK_PATTERN.matcher( line ); 104 final Matcher wikiMatcher = WIKIWORD_PATTERN.matcher( line ); 105 final Matcher forcedLinkMatcher = FORCEDLINK_PATTERN.matcher( line ); 106 final Matcher anchorMatcher = ANCHOR_PATTERN.matcher( line ); 107 final Matcher urlMatcher = URL_PATTERN.matcher( line ); 108 final Matcher imageTagMatcher = IMAGE_TAG_PATTERN.matcher( line ); 109 110 final Matcher tagMatcher = HTML_TAG_PATTERN.matcher( line ); 111 Matcher xhtmlMatcher = null; 112 if ( tagMatcher.find() ) 113 { 114 String tag = tagMatcher.group( 2 ); 115 116 Pattern pattern = 117 Pattern.compile( "(\\<" + tag + ".*\\>)(.*)?(\\<\\/" + tag + "\\>)(.*)?", Pattern.DOTALL ); 118 xhtmlMatcher = pattern.matcher( line ); 119 } 120 121 if ( xhtmlMatcher != null && xhtmlMatcher.find() ) 122 { 123 parseXHTML( line, ret, xhtmlMatcher ); 124 } 125 else if ( linkMatcher.find() ) 126 { 127 parseLink( line, ret, linkMatcher ); 128 } 129 else if ( wikiMatcher.find() && startLikeWord( wikiMatcher, line ) && !noautolink ) 130 { 131 parseWiki( line, ret, wikiMatcher ); 132 } 133 else if ( forcedLinkMatcher.find() ) 134 { 135 parseForcedLink( line, ret, forcedLinkMatcher ); 136 } 137 else if ( anchorMatcher.find() && isAWord( anchorMatcher, line ) ) 138 { 139 parseAnchor( line, ret, anchorMatcher ); 140 } 141 else if ( urlMatcher.find() && isAWord( urlMatcher, line ) ) 142 { 143 parseUrl( line, ret, urlMatcher ); 144 } 145 else if ( imageTagMatcher.find() ) 146 { 147 parseImage( line, ret, imageTagMatcher ); 148 } 149 else 150 { 151 if ( line.length() != 0 ) 152 { 153 ret.add( new TextBlock( line ) ); 154 } 155 } 156 157 return ret; 158 } 159 160 /** 161 * Parses the image tag 162 * @param line the line to parse 163 * @param ret where the results live 164 * @param imageTagMatcher image tag matcher 165 */ 166 private void parseImage( final String line, final List<Block> ret, final Matcher imageTagMatcher ) 167 { 168 ret.addAll( parse( line.substring( 0, imageTagMatcher.start() ) ) ); 169 final String src = imageTagMatcher.group( 2 ); 170 ret.add( new ImageBlock( src ) ); 171 ret.addAll( parse( line.substring( imageTagMatcher.end() ) ) ); 172 } 173 174 /** 175 * Parses the url 176 * @param line the line to parse 177 * @param ret where the results live 178 * @param urlMatcher url matcher 179 */ 180 private void parseUrl( final String line, final List<Block> ret, final Matcher urlMatcher ) 181 { 182 ret.addAll( parse( line.substring( 0, urlMatcher.start() ) ) ); 183 final String url = urlMatcher.group( 0 ); 184 final Matcher imageMatcher = IMAGE_PATTERN.matcher( url ); 185 if ( imageMatcher.matches() ) 186 { 187 ret.add( new ImageBlock( url ) ); 188 } 189 else 190 { 191 ret.add( new LinkBlock( url, new TextBlock( url ) ) ); 192 } 193 ret.addAll( parse( line.substring( urlMatcher.end() ) ) ); 194 } 195 196 /** 197 * Parses the anchor 198 * @param line the line to parse 199 * @param ret where the results live 200 * @param anchorMatcher anchor matcher 201 */ 202 private void parseAnchor( final String line, final List<Block> ret, final Matcher anchorMatcher ) 203 { 204 ret.addAll( parse( line.substring( 0, anchorMatcher.start() ) ) ); 205 ret.add( new AnchorBlock( anchorMatcher.group( 1 ) ) ); 206 ret.addAll( parse( line.substring( anchorMatcher.end() ) ) ); 207 } 208 209 /** 210 * Parses the link 211 * @param line line to parse 212 * @param ret where the results live 213 * @param forcedLinkMatcher forced link matcher 214 */ 215 private void parseForcedLink( final String line, final List<Block> ret, final Matcher forcedLinkMatcher ) 216 { 217 if ( forcedLinkMatcher.group( 1 ) != null ) 218 { 219 ret.add( new TextBlock( forcedLinkMatcher.group( 2 ) ) ); 220 } 221 else 222 { 223 final String showText = forcedLinkMatcher.group( 3 ); 224 // mailto link: 225 if ( showText.trim().startsWith( "mailto:" ) ) 226 { 227 String s = showText.trim(); 228 int i = s.indexOf( ' ' ); 229 if ( i == -1 ) 230 { 231 ret.add( new TextBlock( s ) ); 232 } 233 else 234 { 235 ret.add( new LinkBlock( s.substring( 0, i ), new TextBlock( s.substring( i ).trim() ) ) ); 236 } 237 } 238 else 239 { 240 ret.addAll( parse( line.substring( 0, forcedLinkMatcher.start() ) ) ); 241 ret.add( createLink( showText, showText ) ); 242 ret.addAll( parse( line.substring( forcedLinkMatcher.end() ) ) ); 243 } 244 } 245 } 246 247 /** 248 * Decides between a WikiWordBlock or a a LinkBlock 249 * @param link the link text 250 * @param showText the show text. 251 * @return either a WikiWordBlock or a LinkBlock 252 */ 253 private Block createLink( final String link, final String showText ) 254 { 255 final Block content; 256 if ( URL_PATTERN.matcher( showText ).matches() && IMAGE_PATTERN.matcher( showText ).matches() ) 257 { 258 content = new ImageBlock( showText ); 259 } 260 else 261 { 262 content = new TextBlock( showText ); 263 } 264 265 if ( URL_PATTERN.matcher( link ).matches() ) 266 { 267 return new LinkBlock( link, content ); 268 } 269 270 final StringTokenizer tokenizer = new StringTokenizer( link ); 271 final StringBuilder sb = new StringBuilder(); 272 273 while ( tokenizer.hasMoreElements() ) 274 { 275 final String s = tokenizer.nextToken(); 276 sb.append( s.substring( 0, 1 ).toUpperCase() ); 277 sb.append( s.substring( 1 ) ); 278 } 279 return new WikiWordBlock( sb.toString(), content, wikiWordLinkResolver ); 280 } 281 282 /** 283 * Parses a wiki word 284 * @param line the line to parse 285 * @param ret where the results live 286 * @param wikiMatcher wiki matcher 287 */ 288 private void parseWiki( final String line, final List<Block> ret, final Matcher wikiMatcher ) 289 { 290 final String wikiWord = wikiMatcher.group(); 291 ret.addAll( parse( line.substring( 0, wikiMatcher.start() ) ) ); 292 if ( wikiWord.startsWith( "!" ) ) 293 { // link prevention 294 ret.add( new TextBlock( wikiWord.substring( 1 ) ) ); 295 } 296 else 297 { 298 ret.add( new WikiWordBlock( wikiWord, wikiWordLinkResolver ) ); 299 } 300 ret.addAll( parse( line.substring( wikiMatcher.end() ) ) ); 301 } 302 303 /** 304 * Parses a link 305 * @param line the line to parse 306 * @param ret where the results live 307 * @param linkMatcher link matcher 308 */ 309 private void parseLink( final String line, final List<Block> ret, final Matcher linkMatcher ) 310 { 311 ret.addAll( parse( line.substring( 0, linkMatcher.start() ) ) ); 312 if ( line.charAt( linkMatcher.start() ) == '!' ) 313 { 314 ret.add( new TextBlock( line.substring( linkMatcher.start() + 1, linkMatcher.end() ) ) ); 315 } 316 else 317 { 318 ret.add( createLink( linkMatcher.group( 1 ), linkMatcher.group( 2 ) ) ); 319 } 320 ret.addAll( parse( line.substring( linkMatcher.end() ) ) ); 321 } 322 323 /** 324 * Parses xhtml. 325 * 326 * @param line the line to parse 327 * @param ret where the results live 328 * @param xhtmlMatcher xhtml matcher 329 */ 330 private void parseXHTML( final String line, final List<Block> ret, final Matcher xhtmlMatcher ) 331 { 332 ret.addAll( parse( line.substring( 0, xhtmlMatcher.start() ) ) ); 333 if ( xhtmlMatcher.group( 1 ).contains( "noautolink" ) ) 334 { 335 noautolink = true; 336 } 337 else 338 { 339 ret.add( new XHTMLBlock( xhtmlMatcher.group( 1 ) ) ); 340 } 341 342 ret.addAll( parse( xhtmlMatcher.group( 2 ) ) ); 343 344 if ( xhtmlMatcher.group( 1 ).contains( "noautolink" ) ) 345 { 346 noautolink = false; 347 } 348 else 349 { 350 ret.add( new XHTMLBlock( xhtmlMatcher.group( 3 ) ) ); 351 } 352 353 ret.addAll( parse( xhtmlMatcher.group( 4 ) ) ); 354 } 355 356 /** 357 * @param m matcher to test 358 * @param line line to test 359 * @return <code>true</code> if the match on m represent a word (must be 360 * a space before the word or must be the beginning of the line) 361 */ 362 private boolean isAWord( final Matcher m, final String line ) 363 { 364 return startLikeWord( m, line ) && endLikeWord( m, line ); 365 } 366 367 /** 368 * @param m matcher to test 369 * @param line line to test 370 * @return true if it is the beginning of a word 371 */ 372 private boolean startLikeWord( final Matcher m, final String line ) 373 { 374 final int start = m.start(); 375 376 boolean ret = false; 377 if ( start == 0 ) 378 { 379 ret = true; 380 } 381 else if ( start > 0 ) 382 { 383 if ( isSpace( line.charAt( start - 1 ) ) ) 384 { 385 ret = true; 386 } 387 } 388 389 return ret; 390 } 391 392 /** 393 * @param m matcher to test 394 * @param line line to test 395 * @return true if it is the end of a word 396 */ 397 private boolean endLikeWord( final Matcher m, final String line ) 398 { 399 final int end = m.end(); 400 401 boolean ret = true; 402 if ( end < line.length() ) 403 { 404 ret = isSpace( line.charAt( end ) ); 405 } 406 407 return ret; 408 } 409 410 /** 411 * @param c char to test 412 * @return <code>true</code> if c is a space char 413 */ 414 private boolean isSpace( final char c ) 415 { 416 return c == ' ' || c == '\t'; 417 } 418}