1 package org.apache.maven.doxia.parser;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 import java.io.Reader;
23 import java.util.HashMap;
24 import java.util.Map;
25 import java.util.Set;
26 import java.util.TreeSet;
27
28 import javax.swing.text.html.HTML.Attribute;
29
30 import org.apache.maven.doxia.macro.MacroExecutionException;
31 import org.apache.maven.doxia.markup.HtmlMarkup;
32 import org.apache.maven.doxia.sink.Sink;
33 import org.apache.maven.doxia.sink.SinkEventAttributeSet;
34 import org.apache.maven.doxia.sink.SinkEventAttributes;
35 import org.apache.maven.doxia.util.DoxiaUtils;
36
37 import org.codehaus.plexus.util.StringUtils;
38 import org.codehaus.plexus.util.xml.pull.XmlPullParser;
39 import org.codehaus.plexus.util.xml.pull.XmlPullParserException;
40
41
42
43
44
45
46
47
48
49 public class XhtmlBaseParser
50 extends AbstractXmlParser
51 implements HtmlMarkup
52 {
53
54
55
56
57 private boolean scriptBlock;
58
59
60 private boolean isLink;
61
62
63 private boolean isAnchor;
64
65
66 private int orderedListDepth = 0;
67
68
69 private int sectionLevel;
70
71
72 private boolean inVerbatim;
73
74
75 private boolean inFigure;
76
77
78 boolean hasDefinitionListItem = false;
79
80
81 private final SinkEventAttributeSet decoration = new SinkEventAttributeSet();
82
83
84
85 private Map<String, Set<String>> warnMessages;
86
87
88 @Override
89 public void parse( Reader source, Sink sink )
90 throws ParseException
91 {
92 init();
93
94 try
95 {
96 super.parse( source, sink );
97 }
98 finally
99 {
100 logWarnings();
101
102 setSecondParsing( false );
103 init();
104 }
105 }
106
107
108
109
110
111
112
113 @Override
114 protected void initXmlParser( XmlPullParser parser )
115 throws XmlPullParserException
116 {
117 super.initXmlParser( parser );
118
119
120
121
122
123
124
125
126 parser.defineEntityReplacementText( "nbsp", "\u00a0" );
127 parser.defineEntityReplacementText( "iexcl", "\u00a1" );
128 parser.defineEntityReplacementText( "cent", "\u00a2" );
129 parser.defineEntityReplacementText( "pound", "\u00a3" );
130 parser.defineEntityReplacementText( "curren", "\u00a4" );
131 parser.defineEntityReplacementText( "yen", "\u00a5" );
132 parser.defineEntityReplacementText( "brvbar", "\u00a6" );
133 parser.defineEntityReplacementText( "sect", "\u00a7" );
134 parser.defineEntityReplacementText( "uml", "\u00a8" );
135 parser.defineEntityReplacementText( "copy", "\u00a9" );
136 parser.defineEntityReplacementText( "ordf", "\u00aa" );
137 parser.defineEntityReplacementText( "laquo", "\u00ab" );
138 parser.defineEntityReplacementText( "not", "\u00ac" );
139 parser.defineEntityReplacementText( "shy", "\u00ad" );
140 parser.defineEntityReplacementText( "reg", "\u00ae" );
141 parser.defineEntityReplacementText( "macr", "\u00af" );
142 parser.defineEntityReplacementText( "deg", "\u00b0" );
143 parser.defineEntityReplacementText( "plusmn", "\u00b1" );
144 parser.defineEntityReplacementText( "sup2", "\u00b2" );
145 parser.defineEntityReplacementText( "sup3", "\u00b3" );
146 parser.defineEntityReplacementText( "acute", "\u00b4" );
147 parser.defineEntityReplacementText( "micro", "\u00b5" );
148 parser.defineEntityReplacementText( "para", "\u00b6" );
149 parser.defineEntityReplacementText( "middot", "\u00b7" );
150 parser.defineEntityReplacementText( "cedil", "\u00b8" );
151 parser.defineEntityReplacementText( "sup1", "\u00b9" );
152 parser.defineEntityReplacementText( "ordm", "\u00ba" );
153 parser.defineEntityReplacementText( "raquo", "\u00bb" );
154 parser.defineEntityReplacementText( "frac14", "\u00bc" );
155 parser.defineEntityReplacementText( "frac12", "\u00bd" );
156 parser.defineEntityReplacementText( "frac34", "\u00be" );
157 parser.defineEntityReplacementText( "iquest", "\u00bf" );
158 parser.defineEntityReplacementText( "Agrave", "\u00c0" );
159 parser.defineEntityReplacementText( "Aacute", "\u00c1" );
160 parser.defineEntityReplacementText( "Acirc", "\u00c2" );
161 parser.defineEntityReplacementText( "Atilde", "\u00c3" );
162 parser.defineEntityReplacementText( "Auml", "\u00c4" );
163 parser.defineEntityReplacementText( "Aring", "\u00c5" );
164 parser.defineEntityReplacementText( "AElig", "\u00c6" );
165 parser.defineEntityReplacementText( "Ccedil", "\u00c7" );
166 parser.defineEntityReplacementText( "Egrave", "\u00c8" );
167 parser.defineEntityReplacementText( "Eacute", "\u00c9" );
168 parser.defineEntityReplacementText( "Ecirc", "\u00ca" );
169 parser.defineEntityReplacementText( "Euml", "\u00cb" );
170 parser.defineEntityReplacementText( "Igrave", "\u00cc" );
171 parser.defineEntityReplacementText( "Iacute", "\u00cd" );
172 parser.defineEntityReplacementText( "Icirc", "\u00ce" );
173 parser.defineEntityReplacementText( "Iuml", "\u00cf" );
174 parser.defineEntityReplacementText( "ETH", "\u00d0" );
175 parser.defineEntityReplacementText( "Ntilde", "\u00d1" );
176 parser.defineEntityReplacementText( "Ograve", "\u00d2" );
177 parser.defineEntityReplacementText( "Oacute", "\u00d3" );
178 parser.defineEntityReplacementText( "Ocirc", "\u00d4" );
179 parser.defineEntityReplacementText( "Otilde", "\u00d5" );
180 parser.defineEntityReplacementText( "Ouml", "\u00d6" );
181 parser.defineEntityReplacementText( "times", "\u00d7" );
182 parser.defineEntityReplacementText( "Oslash", "\u00d8" );
183 parser.defineEntityReplacementText( "Ugrave", "\u00d9" );
184 parser.defineEntityReplacementText( "Uacute", "\u00da" );
185 parser.defineEntityReplacementText( "Ucirc", "\u00db" );
186 parser.defineEntityReplacementText( "Uuml", "\u00dc" );
187 parser.defineEntityReplacementText( "Yacute", "\u00dd" );
188 parser.defineEntityReplacementText( "THORN", "\u00de" );
189 parser.defineEntityReplacementText( "szlig", "\u00df" );
190 parser.defineEntityReplacementText( "agrave", "\u00e0" );
191 parser.defineEntityReplacementText( "aacute", "\u00e1" );
192 parser.defineEntityReplacementText( "acirc", "\u00e2" );
193 parser.defineEntityReplacementText( "atilde", "\u00e3" );
194 parser.defineEntityReplacementText( "auml", "\u00e4" );
195 parser.defineEntityReplacementText( "aring", "\u00e5" );
196 parser.defineEntityReplacementText( "aelig", "\u00e6" );
197 parser.defineEntityReplacementText( "ccedil", "\u00e7" );
198 parser.defineEntityReplacementText( "egrave", "\u00e8" );
199 parser.defineEntityReplacementText( "eacute", "\u00e9" );
200 parser.defineEntityReplacementText( "ecirc", "\u00ea" );
201 parser.defineEntityReplacementText( "euml", "\u00eb" );
202 parser.defineEntityReplacementText( "igrave", "\u00ec" );
203 parser.defineEntityReplacementText( "iacute", "\u00ed" );
204 parser.defineEntityReplacementText( "icirc", "\u00ee" );
205 parser.defineEntityReplacementText( "iuml", "\u00ef" );
206 parser.defineEntityReplacementText( "eth", "\u00f0" );
207 parser.defineEntityReplacementText( "ntilde", "\u00f1" );
208 parser.defineEntityReplacementText( "ograve", "\u00f2" );
209 parser.defineEntityReplacementText( "oacute", "\u00f3" );
210 parser.defineEntityReplacementText( "ocirc", "\u00f4" );
211 parser.defineEntityReplacementText( "otilde", "\u00f5" );
212 parser.defineEntityReplacementText( "ouml", "\u00f6" );
213 parser.defineEntityReplacementText( "divide", "\u00f7" );
214 parser.defineEntityReplacementText( "oslash", "\u00f8" );
215 parser.defineEntityReplacementText( "ugrave", "\u00f9" );
216 parser.defineEntityReplacementText( "uacute", "\u00fa" );
217 parser.defineEntityReplacementText( "ucirc", "\u00fb" );
218 parser.defineEntityReplacementText( "uuml", "\u00fc" );
219 parser.defineEntityReplacementText( "yacute", "\u00fd" );
220 parser.defineEntityReplacementText( "thorn", "\u00fe" );
221 parser.defineEntityReplacementText( "yuml", "\u00ff" );
222
223
224
225
226
227 parser.defineEntityReplacementText( "OElig", "\u0152" );
228 parser.defineEntityReplacementText( "oelig", "\u0153" );
229 parser.defineEntityReplacementText( "Scaron", "\u0160" );
230 parser.defineEntityReplacementText( "scaron", "\u0161" );
231 parser.defineEntityReplacementText( "Yuml", "\u0178" );
232 parser.defineEntityReplacementText( "circ", "\u02c6" );
233 parser.defineEntityReplacementText( "tilde", "\u02dc" );
234 parser.defineEntityReplacementText( "ensp", "\u2002" );
235 parser.defineEntityReplacementText( "emsp", "\u2003" );
236 parser.defineEntityReplacementText( "thinsp", "\u2009" );
237 parser.defineEntityReplacementText( "zwnj", "\u200c" );
238 parser.defineEntityReplacementText( "zwj", "\u200d" );
239 parser.defineEntityReplacementText( "lrm", "\u200e" );
240 parser.defineEntityReplacementText( "rlm", "\u200f" );
241 parser.defineEntityReplacementText( "ndash", "\u2013" );
242 parser.defineEntityReplacementText( "mdash", "\u2014" );
243 parser.defineEntityReplacementText( "lsquo", "\u2018" );
244 parser.defineEntityReplacementText( "rsquo", "\u2019" );
245 parser.defineEntityReplacementText( "sbquo", "\u201a" );
246 parser.defineEntityReplacementText( "ldquo", "\u201c" );
247 parser.defineEntityReplacementText( "rdquo", "\u201d" );
248 parser.defineEntityReplacementText( "bdquo", "\u201e" );
249 parser.defineEntityReplacementText( "dagger", "\u2020" );
250 parser.defineEntityReplacementText( "Dagger", "\u2021" );
251 parser.defineEntityReplacementText( "permil", "\u2030" );
252 parser.defineEntityReplacementText( "lsaquo", "\u2039" );
253 parser.defineEntityReplacementText( "rsaquo", "\u203a" );
254 parser.defineEntityReplacementText( "euro", "\u20ac" );
255
256
257
258
259
260 parser.defineEntityReplacementText( "fnof", "\u0192" );
261 parser.defineEntityReplacementText( "Alpha", "\u0391" );
262 parser.defineEntityReplacementText( "Beta", "\u0392" );
263 parser.defineEntityReplacementText( "Gamma", "\u0393" );
264 parser.defineEntityReplacementText( "Delta", "\u0394" );
265 parser.defineEntityReplacementText( "Epsilon", "\u0395" );
266 parser.defineEntityReplacementText( "Zeta", "\u0396" );
267 parser.defineEntityReplacementText( "Eta", "\u0397" );
268 parser.defineEntityReplacementText( "Theta", "\u0398" );
269 parser.defineEntityReplacementText( "Iota", "\u0399" );
270 parser.defineEntityReplacementText( "Kappa", "\u039a" );
271 parser.defineEntityReplacementText( "Lambda", "\u039b" );
272 parser.defineEntityReplacementText( "Mu", "\u039c" );
273 parser.defineEntityReplacementText( "Nu", "\u039d" );
274 parser.defineEntityReplacementText( "Xi", "\u039e" );
275 parser.defineEntityReplacementText( "Omicron", "\u039f" );
276 parser.defineEntityReplacementText( "Pi", "\u03a0" );
277 parser.defineEntityReplacementText( "Rho", "\u03a1" );
278 parser.defineEntityReplacementText( "Sigma", "\u03a3" );
279 parser.defineEntityReplacementText( "Tau", "\u03a4" );
280 parser.defineEntityReplacementText( "Upsilon", "\u03a5" );
281 parser.defineEntityReplacementText( "Phi", "\u03a6" );
282 parser.defineEntityReplacementText( "Chi", "\u03a7" );
283 parser.defineEntityReplacementText( "Psi", "\u03a8" );
284 parser.defineEntityReplacementText( "Omega", "\u03a9" );
285 parser.defineEntityReplacementText( "alpha", "\u03b1" );
286 parser.defineEntityReplacementText( "beta", "\u03b2" );
287 parser.defineEntityReplacementText( "gamma", "\u03b3" );
288 parser.defineEntityReplacementText( "delta", "\u03b4" );
289 parser.defineEntityReplacementText( "epsilon", "\u03b5" );
290 parser.defineEntityReplacementText( "zeta", "\u03b6" );
291 parser.defineEntityReplacementText( "eta", "\u03b7" );
292 parser.defineEntityReplacementText( "theta", "\u03b8" );
293 parser.defineEntityReplacementText( "iota", "\u03b9" );
294 parser.defineEntityReplacementText( "kappa", "\u03ba" );
295 parser.defineEntityReplacementText( "lambda", "\u03bb" );
296 parser.defineEntityReplacementText( "mu", "\u03bc" );
297 parser.defineEntityReplacementText( "nu", "\u03bd" );
298 parser.defineEntityReplacementText( "xi", "\u03be" );
299 parser.defineEntityReplacementText( "omicron", "\u03bf" );
300 parser.defineEntityReplacementText( "pi", "\u03c0" );
301 parser.defineEntityReplacementText( "rho", "\u03c1" );
302 parser.defineEntityReplacementText( "sigmaf", "\u03c2" );
303 parser.defineEntityReplacementText( "sigma", "\u03c3" );
304 parser.defineEntityReplacementText( "tau", "\u03c4" );
305 parser.defineEntityReplacementText( "upsilon", "\u03c5" );
306 parser.defineEntityReplacementText( "phi", "\u03c6" );
307 parser.defineEntityReplacementText( "chi", "\u03c7" );
308 parser.defineEntityReplacementText( "psi", "\u03c8" );
309 parser.defineEntityReplacementText( "omega", "\u03c9" );
310 parser.defineEntityReplacementText( "thetasym", "\u03d1" );
311 parser.defineEntityReplacementText( "upsih", "\u03d2" );
312 parser.defineEntityReplacementText( "piv", "\u03d6" );
313 parser.defineEntityReplacementText( "bull", "\u2022" );
314 parser.defineEntityReplacementText( "hellip", "\u2026" );
315 parser.defineEntityReplacementText( "prime", "\u2032" );
316 parser.defineEntityReplacementText( "Prime", "\u2033" );
317 parser.defineEntityReplacementText( "oline", "\u203e" );
318 parser.defineEntityReplacementText( "frasl", "\u2044" );
319 parser.defineEntityReplacementText( "weierp", "\u2118" );
320 parser.defineEntityReplacementText( "image", "\u2111" );
321 parser.defineEntityReplacementText( "real", "\u211c" );
322 parser.defineEntityReplacementText( "trade", "\u2122" );
323 parser.defineEntityReplacementText( "alefsym", "\u2135" );
324 parser.defineEntityReplacementText( "larr", "\u2190" );
325 parser.defineEntityReplacementText( "uarr", "\u2191" );
326 parser.defineEntityReplacementText( "rarr", "\u2192" );
327 parser.defineEntityReplacementText( "darr", "\u2193" );
328 parser.defineEntityReplacementText( "harr", "\u2194" );
329 parser.defineEntityReplacementText( "crarr", "\u21b5" );
330 parser.defineEntityReplacementText( "lArr", "\u21d0" );
331 parser.defineEntityReplacementText( "uArr", "\u21d1" );
332 parser.defineEntityReplacementText( "rArr", "\u21d2" );
333 parser.defineEntityReplacementText( "dArr", "\u21d3" );
334 parser.defineEntityReplacementText( "hArr", "\u21d4" );
335 parser.defineEntityReplacementText( "forall", "\u2200" );
336 parser.defineEntityReplacementText( "part", "\u2202" );
337 parser.defineEntityReplacementText( "exist", "\u2203" );
338 parser.defineEntityReplacementText( "empty", "\u2205" );
339 parser.defineEntityReplacementText( "nabla", "\u2207" );
340 parser.defineEntityReplacementText( "isin", "\u2208" );
341 parser.defineEntityReplacementText( "notin", "\u2209" );
342 parser.defineEntityReplacementText( "ni", "\u220b" );
343 parser.defineEntityReplacementText( "prod", "\u220f" );
344 parser.defineEntityReplacementText( "sum", "\u2211" );
345 parser.defineEntityReplacementText( "minus", "\u2212" );
346 parser.defineEntityReplacementText( "lowast", "\u2217" );
347 parser.defineEntityReplacementText( "radic", "\u221a" );
348 parser.defineEntityReplacementText( "prop", "\u221d" );
349 parser.defineEntityReplacementText( "infin", "\u221e" );
350 parser.defineEntityReplacementText( "ang", "\u2220" );
351 parser.defineEntityReplacementText( "and", "\u2227" );
352 parser.defineEntityReplacementText( "or", "\u2228" );
353 parser.defineEntityReplacementText( "cap", "\u2229" );
354 parser.defineEntityReplacementText( "cup", "\u222a" );
355 parser.defineEntityReplacementText( "int", "\u222b" );
356 parser.defineEntityReplacementText( "there4", "\u2234" );
357 parser.defineEntityReplacementText( "sim", "\u223c" );
358 parser.defineEntityReplacementText( "cong", "\u2245" );
359 parser.defineEntityReplacementText( "asymp", "\u2248" );
360 parser.defineEntityReplacementText( "ne", "\u2260" );
361 parser.defineEntityReplacementText( "equiv", "\u2261" );
362 parser.defineEntityReplacementText( "le", "\u2264" );
363 parser.defineEntityReplacementText( "ge", "\u2265" );
364 parser.defineEntityReplacementText( "sub", "\u2282" );
365 parser.defineEntityReplacementText( "sup", "\u2283" );
366 parser.defineEntityReplacementText( "nsub", "\u2284" );
367 parser.defineEntityReplacementText( "sube", "\u2286" );
368 parser.defineEntityReplacementText( "supe", "\u2287" );
369 parser.defineEntityReplacementText( "oplus", "\u2295" );
370 parser.defineEntityReplacementText( "otimes", "\u2297" );
371 parser.defineEntityReplacementText( "perp", "\u22a5" );
372 parser.defineEntityReplacementText( "sdot", "\u22c5" );
373 parser.defineEntityReplacementText( "lceil", "\u2308" );
374 parser.defineEntityReplacementText( "rceil", "\u2309" );
375 parser.defineEntityReplacementText( "lfloor", "\u230a" );
376 parser.defineEntityReplacementText( "rfloor", "\u230b" );
377 parser.defineEntityReplacementText( "lang", "\u2329" );
378 parser.defineEntityReplacementText( "rang", "\u232a" );
379 parser.defineEntityReplacementText( "loz", "\u25ca" );
380 parser.defineEntityReplacementText( "spades", "\u2660" );
381 parser.defineEntityReplacementText( "clubs", "\u2663" );
382 parser.defineEntityReplacementText( "hearts", "\u2665" );
383 parser.defineEntityReplacementText( "diams", "\u2666" );
384 }
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407 protected boolean baseStartTag( XmlPullParser parser, Sink sink )
408 {
409 boolean visited = true;
410
411 SinkEventAttributeSet attribs = getAttributesFromParser( parser );
412
413 if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
414 {
415 handleSectionStart( sink, Sink.SECTION_LEVEL_1, attribs );
416 }
417 else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
418 {
419 handleSectionStart( sink, Sink.SECTION_LEVEL_2, attribs );
420 }
421 else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
422 {
423 handleSectionStart( sink, Sink.SECTION_LEVEL_3, attribs );
424 }
425 else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
426 {
427 handleSectionStart( sink, Sink.SECTION_LEVEL_4, attribs );
428 }
429 else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
430 {
431 handleSectionStart( sink, Sink.SECTION_LEVEL_5, attribs );
432 }
433 else if ( parser.getName().equals( HtmlMarkup.U.toString() ) )
434 {
435 decoration.addAttribute( SinkEventAttributes.DECORATION, "underline" );
436 }
437 else if ( parser.getName().equals( HtmlMarkup.S.toString() )
438 || parser.getName().equals( HtmlMarkup.STRIKE.toString() )
439 || parser.getName().equals( "del" ) )
440 {
441 decoration.addAttribute( SinkEventAttributes.DECORATION, "line-through" );
442 }
443 else if ( parser.getName().equals( HtmlMarkup.SUB.toString() ) )
444 {
445 decoration.addAttribute( SinkEventAttributes.VALIGN, "sub" );
446 }
447 else if ( parser.getName().equals( HtmlMarkup.SUP.toString() ) )
448 {
449 decoration.addAttribute( SinkEventAttributes.VALIGN, "sup" );
450 }
451 else if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
452 {
453 handlePStart( sink, attribs );
454 }
455 else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
456 {
457 visited = handleDivStart( parser, attribs, sink );
458 }
459 else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
460 {
461 handlePreStart( attribs, sink );
462 }
463 else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
464 {
465 sink.list( attribs );
466 }
467 else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
468 {
469 handleOLStart( parser, sink, attribs );
470 }
471 else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
472 {
473 handleLIStart( sink, attribs );
474 }
475 else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
476 {
477 sink.definitionList( attribs );
478 }
479 else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
480 {
481 if ( hasDefinitionListItem )
482 {
483
484 sink.definitionListItem_();
485 }
486 sink.definitionListItem( attribs );
487 hasDefinitionListItem = true;
488 sink.definedTerm( attribs );
489 }
490 else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
491 {
492 if ( !hasDefinitionListItem )
493 {
494 sink.definitionListItem( attribs );
495 }
496 sink.definition( attribs );
497 }
498 else if ( ( parser.getName().equals( HtmlMarkup.B.toString() ) )
499 || ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) ) )
500 {
501 sink.bold();
502 }
503 else if ( ( parser.getName().equals( HtmlMarkup.I.toString() ) )
504 || ( parser.getName().equals( HtmlMarkup.EM.toString() ) ) )
505 {
506 handleFigureCaptionStart( sink, attribs );
507 }
508 else if ( ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
509 || ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
510 || ( parser.getName().equals( HtmlMarkup.TT.toString() ) ) )
511 {
512 sink.monospaced();
513 }
514 else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
515 {
516 handleAStart( parser, sink, attribs );
517 }
518 else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
519 {
520 handleTableStart( sink, attribs, parser );
521 }
522 else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
523 {
524 sink.tableRow( attribs );
525 }
526 else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
527 {
528 sink.tableHeaderCell( attribs );
529 }
530 else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
531 {
532 sink.tableCell( attribs );
533 }
534 else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
535 {
536 sink.tableCaption( attribs );
537 }
538 else if ( parser.getName().equals( HtmlMarkup.BR.toString() ) )
539 {
540 sink.lineBreak( attribs );
541 }
542 else if ( parser.getName().equals( HtmlMarkup.HR.toString() ) )
543 {
544 sink.horizontalRule( attribs );
545 }
546 else if ( parser.getName().equals( HtmlMarkup.IMG.toString() ) )
547 {
548 handleImgStart( parser, sink, attribs );
549 }
550 else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() )
551 || parser.getName().equals( HtmlMarkup.STYLE.toString() ) )
552 {
553 handleUnknown( parser, sink, TAG_TYPE_START );
554 scriptBlock = true;
555 }
556 else
557 {
558 visited = false;
559 }
560
561 return visited;
562 }
563
564
565
566
567
568
569
570
571
572
573
574
575
576 protected boolean baseEndTag( XmlPullParser parser, Sink sink )
577 {
578 boolean visited = true;
579
580 if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
581 {
582 if ( !inFigure )
583 {
584 sink.paragraph_();
585 }
586 }
587 else if ( parser.getName().equals( HtmlMarkup.U.toString() )
588 || parser.getName().equals( HtmlMarkup.S.toString() )
589 || parser.getName().equals( HtmlMarkup.STRIKE.toString() )
590 || parser.getName().equals( "del" ) )
591 {
592 decoration.removeAttribute( SinkEventAttributes.DECORATION );
593 }
594 else if ( parser.getName().equals( HtmlMarkup.SUB.toString() )
595 || parser.getName().equals( HtmlMarkup.SUP.toString() ) )
596 {
597 decoration.removeAttribute( SinkEventAttributes.VALIGN );
598 }
599 else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
600 {
601 if ( inFigure )
602 {
603 sink.figure_();
604 this.inFigure = false;
605 }
606 else
607 {
608 visited = false;
609 }
610 }
611 else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
612 {
613 verbatim_();
614
615 sink.verbatim_();
616 }
617 else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
618 {
619 sink.list_();
620 }
621 else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
622 {
623 sink.numberedList_();
624 orderedListDepth--;
625 }
626 else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
627 {
628 handleListItemEnd( sink );
629 }
630 else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
631 {
632 if ( hasDefinitionListItem )
633 {
634 sink.definitionListItem_();
635 hasDefinitionListItem = false;
636 }
637 sink.definitionList_();
638 }
639 else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
640 {
641 sink.definedTerm_();
642 }
643 else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
644 {
645 sink.definition_();
646 sink.definitionListItem_();
647 hasDefinitionListItem = false;
648 }
649 else if ( ( parser.getName().equals( HtmlMarkup.B.toString() ) )
650 || ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) ) )
651 {
652 sink.bold_();
653 }
654 else if ( ( parser.getName().equals( HtmlMarkup.I.toString() ) )
655 || ( parser.getName().equals( HtmlMarkup.EM.toString() ) ) )
656 {
657 handleFigureCaptionEnd( sink );
658 }
659 else if ( ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
660 || ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
661 || ( parser.getName().equals( HtmlMarkup.TT.toString() ) ) )
662 {
663 sink.monospaced_();
664 }
665 else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
666 {
667 handleAEnd( sink );
668 }
669
670
671
672
673
674 else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
675 {
676 sink.tableRows_();
677
678 sink.table_();
679 }
680 else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
681 {
682 sink.tableRow_();
683 }
684 else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
685 {
686 sink.tableHeaderCell_();
687 }
688 else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
689 {
690 sink.tableCell_();
691 }
692 else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
693 {
694 sink.tableCaption_();
695 }
696 else if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
697 {
698 sink.sectionTitle1_();
699 }
700 else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
701 {
702 sink.sectionTitle2_();
703 }
704 else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
705 {
706 sink.sectionTitle3_();
707 }
708 else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
709 {
710 sink.sectionTitle4_();
711 }
712 else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
713 {
714 sink.sectionTitle5_();
715 }
716 else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() )
717 || parser.getName().equals( HtmlMarkup.STYLE.toString() ) )
718 {
719 handleUnknown( parser, sink, TAG_TYPE_END );
720
721 scriptBlock = false;
722 }
723 else
724 {
725 visited = false;
726 }
727
728 return visited;
729 }
730
731
732
733
734
735
736
737 protected void handleStartTag( XmlPullParser parser, Sink sink )
738 throws XmlPullParserException, MacroExecutionException
739 {
740 if ( !baseStartTag( parser, sink ) )
741 {
742 if ( getLog().isWarnEnabled() )
743 {
744 String position = "[" + parser.getLineNumber() + ":"
745 + parser.getColumnNumber() + "]";
746 String tag = "<" + parser.getName() + ">";
747
748 getLog().warn( "Unrecognized xml tag: " + tag + " at " + position );
749 }
750 }
751 }
752
753
754
755
756
757
758
759 protected void handleEndTag( XmlPullParser parser, Sink sink )
760 throws XmlPullParserException, MacroExecutionException
761 {
762 if ( !baseEndTag( parser, sink ) )
763 {
764
765 }
766 }
767
768
769 @Override
770 protected void handleText( XmlPullParser parser, Sink sink )
771 throws XmlPullParserException
772 {
773 String text = getText( parser );
774
775
776
777
778
779
780
781 if ( StringUtils.isNotEmpty( text ) && !isScriptBlock() )
782 {
783 sink.text( text, decoration );
784 }
785 }
786
787
788 @Override
789 protected void handleComment( XmlPullParser parser, Sink sink )
790 throws XmlPullParserException
791 {
792 String text = getText( parser ).trim();
793
794 if ( "PB".equals( text ) )
795 {
796 sink.pageBreak();
797 }
798 else
799 {
800 sink.comment( text );
801 }
802 }
803
804
805 @Override
806 protected void handleCdsect( XmlPullParser parser, Sink sink )
807 throws XmlPullParserException
808 {
809 String text = getText( parser );
810
811 if ( isScriptBlock() )
812 {
813 sink.unknown( CDATA, new Object[] { Integer.valueOf( CDATA_TYPE ), text}, null );
814 }
815 else
816 {
817 sink.text( text );
818 }
819 }
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849 protected void consecutiveSections( int newLevel, Sink sink )
850 {
851 closeOpenSections( newLevel, sink );
852 openMissingSections( newLevel, sink );
853
854 this.sectionLevel = newLevel;
855 }
856
857
858
859
860
861
862
863 private void closeOpenSections( int newLevel, Sink sink )
864 {
865 while ( this.sectionLevel >= newLevel )
866 {
867 if ( sectionLevel == Sink.SECTION_LEVEL_5 )
868 {
869 sink.section5_();
870 }
871 else if ( sectionLevel == Sink.SECTION_LEVEL_4 )
872 {
873 sink.section4_();
874 }
875 else if ( sectionLevel == Sink.SECTION_LEVEL_3 )
876 {
877 sink.section3_();
878 }
879 else if ( sectionLevel == Sink.SECTION_LEVEL_2 )
880 {
881 sink.section2_();
882 }
883 else if ( sectionLevel == Sink.SECTION_LEVEL_1 )
884 {
885 sink.section1_();
886 }
887
888 this.sectionLevel--;
889 }
890 }
891
892
893
894
895
896
897
898 private void openMissingSections( int newLevel, Sink sink )
899 {
900 while ( this.sectionLevel < newLevel - 1 )
901 {
902 this.sectionLevel++;
903
904 if ( sectionLevel == Sink.SECTION_LEVEL_5 )
905 {
906 sink.section5();
907 }
908 else if ( sectionLevel == Sink.SECTION_LEVEL_4 )
909 {
910 sink.section4();
911 }
912 else if ( sectionLevel == Sink.SECTION_LEVEL_3 )
913 {
914 sink.section3();
915 }
916 else if ( sectionLevel == Sink.SECTION_LEVEL_2 )
917 {
918 sink.section2();
919 }
920 else if ( sectionLevel == Sink.SECTION_LEVEL_1 )
921 {
922 sink.section1();
923 }
924 }
925 }
926
927
928
929
930
931
932 protected int getSectionLevel()
933 {
934 return this.sectionLevel;
935 }
936
937
938
939
940
941
942 protected void setSectionLevel( int newLevel )
943 {
944 this.sectionLevel = newLevel;
945 }
946
947
948
949
950 protected void verbatim_()
951 {
952 this.inVerbatim = false;
953 }
954
955
956
957
958 protected void verbatim()
959 {
960 this.inVerbatim = true;
961 }
962
963
964
965
966
967
968 protected boolean isVerbatim()
969 {
970 return this.inVerbatim;
971 }
972
973
974
975
976
977
978
979
980 protected boolean isScriptBlock()
981 {
982 return this.scriptBlock;
983 }
984
985
986
987
988
989
990
991
992 protected String validAnchor( String id )
993 {
994 if ( !DoxiaUtils.isValidId( id ) )
995 {
996 String linkAnchor = DoxiaUtils.encodeId( id, true );
997
998 String msg = "Modified invalid link: '" + id + "' to '" + linkAnchor + "'";
999 logMessage( "modifiedLink", msg );
1000
1001 return linkAnchor;
1002 }
1003
1004 return id;
1005 }
1006
1007
1008 @Override
1009 protected void init()
1010 {
1011 super.init();
1012
1013 this.scriptBlock = false;
1014 this.isLink = false;
1015 this.isAnchor = false;
1016 this.orderedListDepth = 0;
1017 this.sectionLevel = 0;
1018 this.inVerbatim = false;
1019 this.inFigure = false;
1020 while ( this.decoration.getAttributeNames().hasMoreElements() )
1021 {
1022 this.decoration.removeAttribute( this.decoration.getAttributeNames().nextElement() );
1023 }
1024 this.warnMessages = null;
1025 }
1026
1027 private void handleAEnd( Sink sink )
1028 {
1029 if ( isLink )
1030 {
1031 sink.link_();
1032 isLink = false;
1033 }
1034 else if ( isAnchor )
1035 {
1036 sink.anchor_();
1037 isAnchor = false;
1038 }
1039 }
1040
1041 private void handleAStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1042 {
1043 String href = parser.getAttributeValue( null, Attribute.HREF.toString() );
1044
1045 if ( href != null )
1046 {
1047 int hashIndex = href.indexOf( '#' );
1048 if ( hashIndex != -1 && !DoxiaUtils.isExternalLink( href ) )
1049 {
1050 String hash = href.substring( hashIndex + 1 );
1051
1052 if ( !DoxiaUtils.isValidId( hash ) )
1053 {
1054 href = href.substring( 0, hashIndex ) + "#" + DoxiaUtils.encodeId( hash, true );
1055
1056 String msg = "Modified invalid link: '" + hash + "' to '" + href + "'";
1057 logMessage( "modifiedLink", msg );
1058 }
1059 }
1060 sink.link( href, attribs );
1061 isLink = true;
1062 }
1063 else
1064 {
1065 String name = parser.getAttributeValue( null, Attribute.NAME.toString() );
1066
1067 if ( name != null )
1068 {
1069 sink.anchor( validAnchor( name ), attribs );
1070 isAnchor = true;
1071 }
1072 else
1073 {
1074 String id = parser.getAttributeValue( null, Attribute.ID.toString() );
1075 if ( id != null )
1076 {
1077 sink.anchor( validAnchor( id ), attribs );
1078 isAnchor = true;
1079 }
1080 }
1081 }
1082 }
1083
1084 private boolean handleDivStart( XmlPullParser parser, SinkEventAttributeSet attribs, Sink sink )
1085 {
1086 boolean visited = true;
1087
1088 String divclass = parser.getAttributeValue( null, Attribute.CLASS.toString() );
1089
1090 if ( "figure".equals( divclass ) )
1091 {
1092 this.inFigure = true;
1093 SinkEventAttributeSet atts = new SinkEventAttributeSet( attribs );
1094 atts.removeAttribute( SinkEventAttributes.CLASS );
1095 sink.figure( atts );
1096 }
1097 else
1098 {
1099 visited = false;
1100 }
1101
1102 return visited;
1103 }
1104
1105 private void handleFigureCaptionEnd( Sink sink )
1106 {
1107 if ( inFigure )
1108 {
1109 sink.figureCaption_();
1110 }
1111 else
1112 {
1113 sink.italic_();
1114 }
1115 }
1116
1117 private void handleFigureCaptionStart( Sink sink, SinkEventAttributeSet attribs )
1118 {
1119 if ( inFigure )
1120 {
1121 sink.figureCaption( attribs );
1122 }
1123 else
1124 {
1125 sink.italic();
1126 }
1127 }
1128
1129 private void handleImgStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1130 {
1131 String src = parser.getAttributeValue( null, Attribute.SRC.toString() );
1132
1133 if ( src != null )
1134 {
1135 sink.figureGraphics( src, attribs );
1136 }
1137 }
1138
1139 private void handleLIStart( Sink sink, SinkEventAttributeSet attribs )
1140 {
1141 if ( orderedListDepth == 0 )
1142 {
1143 sink.listItem( attribs );
1144 }
1145 else
1146 {
1147 sink.numberedListItem( attribs );
1148 }
1149 }
1150
1151 private void handleListItemEnd( Sink sink )
1152 {
1153 if ( orderedListDepth == 0 )
1154 {
1155 sink.listItem_();
1156 }
1157 else
1158 {
1159 sink.numberedListItem_();
1160 }
1161 }
1162
1163 private void handleOLStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1164 {
1165 int numbering = Sink.NUMBERING_DECIMAL;
1166
1167 String style = parser.getAttributeValue( null, Attribute.STYLE.toString() );
1168
1169 if ( style != null )
1170 {
1171 if ( "list-style-type: upper-alpha".equals( style ) )
1172 {
1173 numbering = Sink.NUMBERING_UPPER_ALPHA;
1174 }
1175 else if ( "list-style-type: lower-alpha".equals( style ) )
1176 {
1177 numbering = Sink.NUMBERING_LOWER_ALPHA;
1178 }
1179 else if ( "list-style-type: upper-roman".equals( style ) )
1180 {
1181 numbering = Sink.NUMBERING_UPPER_ROMAN;
1182 }
1183 else if ( "list-style-type: lower-roman".equals( style ) )
1184 {
1185 numbering = Sink.NUMBERING_LOWER_ROMAN;
1186 }
1187 else if ( "list-style-type: decimal".equals( style ) )
1188 {
1189 numbering = Sink.NUMBERING_DECIMAL;
1190 }
1191 }
1192
1193 sink.numberedList( numbering, attribs );
1194 orderedListDepth++;
1195 }
1196
1197 private void handlePStart( Sink sink, SinkEventAttributeSet attribs )
1198 {
1199 if ( !inFigure )
1200 {
1201 sink.paragraph( attribs );
1202 }
1203 }
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215 private void handlePreStart( SinkEventAttributeSet attribs, Sink sink )
1216 {
1217 verbatim();
1218 attribs.removeAttribute( SinkEventAttributes.DECORATION );
1219 sink.verbatim( attribs );
1220 }
1221
1222 private void handleSectionStart( Sink sink, int level, SinkEventAttributeSet attribs )
1223 {
1224 consecutiveSections( level, sink );
1225 sink.section( level, attribs );
1226 sink.sectionTitle( level, attribs );
1227 }
1228
1229 private void handleTableStart( Sink sink, SinkEventAttributeSet attribs, XmlPullParser parser )
1230 {
1231 sink.table( attribs );
1232 String border = parser.getAttributeValue( null, Attribute.BORDER.toString() );
1233 boolean grid = true;
1234
1235 if ( border == null || "0".equals( border ) )
1236 {
1237 grid = false;
1238 }
1239
1240 String align = parser.getAttributeValue( null, Attribute.ALIGN.toString() );
1241 int[] justif = {Sink.JUSTIFY_LEFT};
1242
1243 if ( "center".equals( align ) )
1244 {
1245 justif[0] = Sink.JUSTIFY_CENTER;
1246 }
1247 else if ( "right".equals( align ) )
1248 {
1249 justif[0] = Sink.JUSTIFY_RIGHT;
1250 }
1251
1252 sink.tableRows( justif, grid );
1253 }
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263 private void logMessage( String key, String msg )
1264 {
1265 final String log = "[XHTML Parser] " + msg;
1266 if ( getLog().isDebugEnabled() )
1267 {
1268 getLog().debug( log );
1269
1270 return;
1271 }
1272
1273 if ( warnMessages == null )
1274 {
1275 warnMessages = new HashMap<String, Set<String>>();
1276 }
1277
1278 Set<String> set = warnMessages.get( key );
1279 if ( set == null )
1280 {
1281 set = new TreeSet<String>();
1282 }
1283 set.add( log );
1284 warnMessages.put( key, set );
1285 }
1286
1287
1288
1289
1290 private void logWarnings()
1291 {
1292 if ( getLog().isWarnEnabled() && this.warnMessages != null && !isSecondParsing() )
1293 {
1294 for ( Map.Entry<String, Set<String>> entry : this.warnMessages.entrySet() )
1295 {
1296 for ( String msg : entry.getValue() )
1297 {
1298 getLog().warn( msg );
1299 }
1300 }
1301
1302 this.warnMessages = null;
1303 }
1304 }
1305 }