1 | |
package org.apache.maven.doxia.module.twiki.parser; |
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
|
19 | |
|
20 | |
|
21 | |
|
22 | |
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
27 | |
|
28 | |
|
29 | |
|
30 | |
|
31 | |
|
32 | |
|
33 | |
|
34 | |
public class TextParser |
35 | |
{ |
36 | |
|
37 | |
|
38 | |
|
39 | 1 | private static final Pattern WIKIWORD_PATTERN = |
40 | |
Pattern.compile( "(!?([A-Z]\\w*[.])?([A-Z][a-z]+){2,}(#\\w*)?)" ); |
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | 1 | private static final Pattern SPECIFICLINK_PATTERN = Pattern.compile( "!?\\[\\[([^\\]]+)\\]\\[([^\\]]+)\\]\\]" ); |
46 | |
|
47 | |
|
48 | |
|
49 | |
|
50 | 1 | private static final Pattern FORCEDLINK_PATTERN = Pattern.compile( "(!)?(\\[\\[(.+)\\]\\])" ); |
51 | |
|
52 | |
|
53 | |
|
54 | |
|
55 | 1 | private static final Pattern ANCHOR_PATTERN = Pattern.compile( "#(([A-Z][A-Za-z]*){2,})" ); |
56 | |
|
57 | |
|
58 | |
|
59 | |
|
60 | 1 | private static final Pattern URL_PATTERN = Pattern.compile( "(\\w+):[/][/][^\\s]*" ); |
61 | |
|
62 | |
|
63 | |
|
64 | |
|
65 | 1 | private static final Pattern IMAGE_PATTERN = Pattern.compile( "(.*)\\.(png|jpg|gif|bmp)" ); |
66 | |
|
67 | |
|
68 | |
|
69 | |
|
70 | 1 | private static final Pattern IMAGE_TAG_PATTERN = |
71 | |
Pattern.compile( "<img\\b.*?\\bsrc=([\"'])(.*?)\\1.*>", Pattern.CASE_INSENSITIVE ); |
72 | |
|
73 | |
|
74 | 1 | private static final Pattern HTML_TAG_PATTERN = Pattern.compile( "<(/?)([\\w]*)(.*?)(/?)>", Pattern.DOTALL ); |
75 | |
|
76 | |
|
77 | |
|
78 | |
|
79 | |
private final WikiWordLinkResolver wikiWordLinkResolver; |
80 | |
|
81 | |
|
82 | |
private boolean noautolink; |
83 | |
|
84 | |
|
85 | |
|
86 | |
|
87 | |
|
88 | |
|
89 | |
public TextParser( final WikiWordLinkResolver resolver ) |
90 | 95 | { |
91 | 95 | this.wikiWordLinkResolver = resolver; |
92 | 95 | } |
93 | |
|
94 | |
|
95 | |
|
96 | |
|
97 | |
|
98 | |
|
99 | |
|
100 | |
public final List<Block> parse( final String line ) |
101 | |
{ |
102 | 203 | final List<Block> ret = new ArrayList<Block>(); |
103 | |
|
104 | 203 | final Matcher linkMatcher = SPECIFICLINK_PATTERN.matcher( line ); |
105 | 203 | final Matcher wikiMatcher = WIKIWORD_PATTERN.matcher( line ); |
106 | 203 | final Matcher forcedLinkMatcher = FORCEDLINK_PATTERN.matcher( line ); |
107 | 203 | final Matcher anchorMatcher = ANCHOR_PATTERN.matcher( line ); |
108 | 203 | final Matcher urlMatcher = URL_PATTERN.matcher( line ); |
109 | 203 | final Matcher imageTagMatcher = IMAGE_TAG_PATTERN.matcher( line ); |
110 | |
|
111 | 203 | final Matcher tagMatcher = HTML_TAG_PATTERN.matcher( line ); |
112 | 203 | Matcher xhtmlMatcher = null; |
113 | 203 | if ( tagMatcher.find() ) |
114 | |
{ |
115 | 1 | String tag = tagMatcher.group( 2 ); |
116 | |
|
117 | 1 | Pattern pattern = |
118 | |
Pattern.compile( "(\\<" + tag + ".*\\>)(.*)?(\\<\\/" + tag + "\\>)(.*)?", Pattern.DOTALL ); |
119 | 1 | xhtmlMatcher = pattern.matcher( line ); |
120 | |
} |
121 | |
|
122 | 203 | if ( xhtmlMatcher != null && xhtmlMatcher.find() ) |
123 | |
{ |
124 | 0 | parseXHTML( line, ret, xhtmlMatcher ); |
125 | |
} |
126 | 203 | else if ( linkMatcher.find() ) |
127 | |
{ |
128 | 10 | parseLink( line, ret, linkMatcher ); |
129 | |
} |
130 | 193 | else if ( wikiMatcher.find() && startLikeWord( wikiMatcher, line ) && !noautolink ) |
131 | |
{ |
132 | 7 | parseWiki( line, ret, wikiMatcher ); |
133 | |
} |
134 | 186 | else if ( forcedLinkMatcher.find() ) |
135 | |
{ |
136 | 5 | parseForcedLink( line, ret, forcedLinkMatcher ); |
137 | |
} |
138 | 181 | else if ( anchorMatcher.find() && isAWord( anchorMatcher, line ) ) |
139 | |
{ |
140 | 1 | parseAnchor( line, ret, anchorMatcher ); |
141 | |
} |
142 | 180 | else if ( urlMatcher.find() && isAWord( urlMatcher, line ) ) |
143 | |
{ |
144 | 2 | parseUrl( line, ret, urlMatcher ); |
145 | |
} |
146 | 178 | else if ( imageTagMatcher.find() ) |
147 | |
{ |
148 | 1 | parseImage( line, ret, imageTagMatcher ); |
149 | |
} |
150 | |
else |
151 | |
{ |
152 | 177 | if ( line.length() != 0 ) |
153 | |
{ |
154 | 142 | ret.add( new TextBlock( line ) ); |
155 | |
} |
156 | |
} |
157 | |
|
158 | 203 | return ret; |
159 | |
} |
160 | |
|
161 | |
|
162 | |
|
163 | |
|
164 | |
|
165 | |
|
166 | |
|
167 | |
private void parseImage( final String line, final List<Block> ret, final Matcher imageTagMatcher ) |
168 | |
{ |
169 | 1 | ret.addAll( parse( line.substring( 0, imageTagMatcher.start() ) ) ); |
170 | 1 | final String src = imageTagMatcher.group( 2 ); |
171 | 1 | ret.add( new ImageBlock( src ) ); |
172 | 1 | ret.addAll( parse( line.substring( imageTagMatcher.end(), line.length() ) ) ); |
173 | 1 | } |
174 | |
|
175 | |
|
176 | |
|
177 | |
|
178 | |
|
179 | |
|
180 | |
|
181 | |
private void parseUrl( final String line, final List<Block> ret, final Matcher urlMatcher ) |
182 | |
{ |
183 | 2 | ret.addAll( parse( line.substring( 0, urlMatcher.start() ) ) ); |
184 | 2 | final String url = urlMatcher.group( 0 ); |
185 | 2 | final Matcher imageMatcher = IMAGE_PATTERN.matcher( url ); |
186 | 2 | if ( imageMatcher.matches() ) |
187 | |
{ |
188 | 1 | ret.add( new ImageBlock( url ) ); |
189 | |
} |
190 | |
else |
191 | |
{ |
192 | 1 | ret.add( new LinkBlock( url, new TextBlock( url ) ) ); |
193 | |
} |
194 | 2 | ret.addAll( parse( line.substring( urlMatcher.end(), line.length() ) ) ); |
195 | 2 | } |
196 | |
|
197 | |
|
198 | |
|
199 | |
|
200 | |
|
201 | |
|
202 | |
|
203 | |
private void parseAnchor( final String line, final List<Block> ret, final Matcher anchorMatcher ) |
204 | |
{ |
205 | 1 | ret.addAll( parse( line.substring( 0, anchorMatcher.start() ) ) ); |
206 | 1 | ret.add( new AnchorBlock( anchorMatcher.group( 1 ) ) ); |
207 | 1 | ret.addAll( parse( line.substring( anchorMatcher.end(), line.length() ) ) ); |
208 | 1 | } |
209 | |
|
210 | |
|
211 | |
|
212 | |
|
213 | |
|
214 | |
|
215 | |
|
216 | |
private void parseForcedLink( final String line, final List<Block> ret, final Matcher forcedLinkMatcher ) |
217 | |
{ |
218 | 5 | if ( forcedLinkMatcher.group( 1 ) != null ) |
219 | |
{ |
220 | 1 | ret.add( new TextBlock( forcedLinkMatcher.group( 2 ) ) ); |
221 | |
} |
222 | |
else |
223 | |
{ |
224 | 4 | final String showText = forcedLinkMatcher.group( 3 ); |
225 | |
|
226 | 4 | if ( showText.trim().startsWith( "mailto:" ) ) |
227 | |
{ |
228 | 1 | String s = showText.trim(); |
229 | 1 | int i = s.indexOf( ' ' ); |
230 | 1 | if ( i == -1 ) |
231 | |
{ |
232 | 0 | ret.add( new TextBlock( s ) ); |
233 | |
} |
234 | |
else |
235 | |
{ |
236 | 1 | ret.add( new LinkBlock( s.substring( 0, i ), new TextBlock( s.substring( i ).trim() ) ) ); |
237 | |
} |
238 | 1 | } |
239 | |
else |
240 | |
{ |
241 | 3 | ret.addAll( parse( line.substring( 0, forcedLinkMatcher.start() ) ) ); |
242 | 3 | ret.add( createLink( showText, showText ) ); |
243 | 3 | ret.addAll( parse( line.substring( forcedLinkMatcher.end(), line.length() ) ) ); |
244 | |
} |
245 | |
} |
246 | 5 | } |
247 | |
|
248 | |
|
249 | |
|
250 | |
|
251 | |
|
252 | |
|
253 | |
|
254 | |
private Block createLink( final String link, final String showText ) |
255 | |
{ |
256 | |
final Block content; |
257 | 12 | if ( URL_PATTERN.matcher( showText ).matches() && IMAGE_PATTERN.matcher( showText ).matches() ) |
258 | |
{ |
259 | 1 | content = new ImageBlock( showText ); |
260 | |
} |
261 | |
else |
262 | |
{ |
263 | 11 | content = new TextBlock( showText ); |
264 | |
} |
265 | |
|
266 | 12 | if ( URL_PATTERN.matcher( link ).matches() ) |
267 | |
{ |
268 | 6 | return new LinkBlock( link, content ); |
269 | |
} |
270 | |
|
271 | 6 | final StringTokenizer tokenizer = new StringTokenizer( link ); |
272 | 6 | final StringBuffer sb = new StringBuffer(); |
273 | |
|
274 | 15 | while ( tokenizer.hasMoreElements() ) |
275 | |
{ |
276 | 9 | final String s = tokenizer.nextToken(); |
277 | 9 | sb.append( s.substring( 0, 1 ).toUpperCase() ); |
278 | 9 | sb.append( s.substring( 1 ) ); |
279 | 9 | } |
280 | 6 | return new WikiWordBlock( sb.toString(), content, wikiWordLinkResolver ); |
281 | |
} |
282 | |
|
283 | |
|
284 | |
|
285 | |
|
286 | |
|
287 | |
|
288 | |
|
289 | |
private void parseWiki( final String line, final List<Block> ret, final Matcher wikiMatcher ) |
290 | |
{ |
291 | 7 | final String wikiWord = wikiMatcher.group(); |
292 | 7 | ret.addAll( parse( line.substring( 0, wikiMatcher.start() ) ) ); |
293 | 7 | if ( wikiWord.startsWith( "!" ) ) |
294 | |
{ |
295 | 1 | ret.add( new TextBlock( wikiWord.substring( 1 ) ) ); |
296 | |
} |
297 | |
else |
298 | |
{ |
299 | 6 | ret.add( new WikiWordBlock( wikiWord, wikiWordLinkResolver ) ); |
300 | |
} |
301 | 7 | ret.addAll( parse( line.substring( wikiMatcher.end(), line.length() ) ) ); |
302 | 7 | } |
303 | |
|
304 | |
|
305 | |
|
306 | |
|
307 | |
|
308 | |
|
309 | |
|
310 | |
private void parseLink( final String line, final List<Block> ret, final Matcher linkMatcher ) |
311 | |
{ |
312 | 10 | ret.addAll( parse( line.substring( 0, linkMatcher.start() ) ) ); |
313 | 10 | if ( line.charAt( linkMatcher.start() ) == '!' ) |
314 | |
{ |
315 | 1 | ret.add( new TextBlock( line.substring( linkMatcher.start() + 1, linkMatcher.end() ) ) ); |
316 | |
} |
317 | |
else |
318 | |
{ |
319 | 9 | ret.add( createLink( linkMatcher.group( 1 ), linkMatcher.group( 2 ) ) ); |
320 | |
} |
321 | 10 | ret.addAll( parse( line.substring( linkMatcher.end(), line.length() ) ) ); |
322 | 10 | } |
323 | |
|
324 | |
|
325 | |
|
326 | |
|
327 | |
|
328 | |
|
329 | |
|
330 | |
|
331 | |
private void parseXHTML( final String line, final List<Block> ret, final Matcher xhtmlMatcher ) |
332 | |
{ |
333 | 0 | if ( xhtmlMatcher.group( 1 ).indexOf( "noautolink" ) != -1 ) |
334 | |
{ |
335 | 0 | noautolink = true; |
336 | |
} |
337 | |
else |
338 | |
{ |
339 | 0 | ret.add( new XHTMLBlock( xhtmlMatcher.group( 1 ) ) ); |
340 | |
} |
341 | |
|
342 | 0 | ret.addAll( parse( xhtmlMatcher.group( 2 ) ) ); |
343 | |
|
344 | 0 | if ( xhtmlMatcher.group( 1 ).indexOf( "noautolink" ) != -1 ) |
345 | |
{ |
346 | 0 | noautolink = false; |
347 | |
} |
348 | |
else |
349 | |
{ |
350 | 0 | ret.add( new XHTMLBlock( xhtmlMatcher.group( 3 ) ) ); |
351 | |
} |
352 | |
|
353 | 0 | ret.addAll( parse( xhtmlMatcher.group( 4 ) ) ); |
354 | 0 | } |
355 | |
|
356 | |
|
357 | |
|
358 | |
|
359 | |
|
360 | |
|
361 | |
|
362 | |
private boolean isAWord( final Matcher m, final String line ) |
363 | |
{ |
364 | 4 | return startLikeWord( m, line ) && endLikeWord( m, line ); |
365 | |
} |
366 | |
|
367 | |
|
368 | |
|
369 | |
|
370 | |
|
371 | |
|
372 | |
private boolean startLikeWord( final Matcher m, final String line ) |
373 | |
{ |
374 | 15 | final int start = m.start(); |
375 | |
|
376 | 15 | boolean ret = false; |
377 | 15 | if ( start == 0 ) |
378 | |
{ |
379 | 6 | ret = true; |
380 | |
} |
381 | 9 | else if ( start > 0 ) |
382 | |
{ |
383 | 9 | if ( isSpace( line.charAt( start - 1 ) ) ) |
384 | |
{ |
385 | 5 | ret = true; |
386 | |
} |
387 | |
} |
388 | |
|
389 | 15 | return ret; |
390 | |
} |
391 | |
|
392 | |
|
393 | |
|
394 | |
|
395 | |
|
396 | |
|
397 | |
private boolean endLikeWord( final Matcher m, final String line ) |
398 | |
{ |
399 | 4 | final int end = m.end(); |
400 | |
|
401 | 4 | boolean ret = true; |
402 | 4 | if ( end < line.length() ) |
403 | |
{ |
404 | 4 | ret = isSpace( line.charAt( end ) ); |
405 | |
} |
406 | |
|
407 | 4 | return ret; |
408 | |
} |
409 | |
|
410 | |
|
411 | |
|
412 | |
|
413 | |
|
414 | |
private boolean isSpace( final char c ) |
415 | |
{ |
416 | 13 | return c == ' ' || c == '\t'; |
417 | |
} |
418 | |
} |