1 package org.apache.maven.doxia.parser;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 import java.io.Reader;
23 import java.util.HashMap;
24 import java.util.Map;
25 import java.util.Set;
26 import java.util.TreeSet;
27
28 import javax.swing.text.html.HTML.Attribute;
29
30 import org.apache.maven.doxia.macro.MacroExecutionException;
31 import org.apache.maven.doxia.markup.HtmlMarkup;
32 import org.apache.maven.doxia.sink.Sink;
33 import org.apache.maven.doxia.sink.SinkEventAttributes;
34 import org.apache.maven.doxia.sink.impl.SinkEventAttributeSet;
35 import org.apache.maven.doxia.util.DoxiaUtils;
36
37 import org.codehaus.plexus.util.StringUtils;
38 import org.codehaus.plexus.util.xml.pull.XmlPullParser;
39 import org.codehaus.plexus.util.xml.pull.XmlPullParserException;
40
41
42
43
44
45
46
47
48
49 public class XhtmlBaseParser
50 extends AbstractXmlParser
51 implements HtmlMarkup
52 {
53
54
55
56
/**
 * Set when a script or style start tag is seen and cleared on the matching end tag.
 * While set, plain text is not forwarded to the sink and CDATA is emitted as an unknown event.
 */
private boolean scriptBlock;

/** Set when the currently open {@code a} element was emitted as a link, so its end tag closes a link. */
private boolean isLink;

/** Set when the currently open {@code a} element was emitted as an anchor, so its end tag closes an anchor. */
private boolean isAnchor;

/**
 * Nesting depth of ordered lists. Zero means {@code li} elements are emitted as plain list items,
 * any other value means numbered list items. Decremented on every {@code ol} end tag.
 * NOTE(review): presumably incremented when an ol start tag is handled - confirm in handleOLStart.
 */
private int orderedListDepth = 0;

/** Level of the innermost section that is currently open; 0 after {@link #init()}. */
private int sectionLevel;

/** Verbatim flag, switched on by {@link #verbatim()} and off by {@link #verbatim_()}. */
private boolean inVerbatim;

/** Set between the start and end tag of a {@code div} whose class attribute is "figure". */
private boolean inFigure;

/**
 * Set once a {@code dt} start tag has opened a definition list item; cleared when that item
 * is closed by a {@code dd} or {@code dl} end tag.
 */
boolean hasDefinitionListItem = false;

/**
 * Sets of messages keyed by a String; set to {@code null} by {@link #init()}.
 * NOTE(review): presumably filled by logMessage and flushed by logWarnings - confirm.
 */
private Map<String, Set<String>> warnMessages;
83
84
85 @Override
86 public void parse( Reader source, Sink sink )
87 throws ParseException
88 {
89 init();
90
91 try
92 {
93 super.parse( source, sink );
94 }
95 finally
96 {
97 logWarnings();
98
99 setSecondParsing( false );
100 init();
101 }
102 }
103
104
105
106
107
108
109
110 @Override
111 protected void initXmlParser( XmlPullParser parser )
112 throws XmlPullParserException
113 {
114 super.initXmlParser( parser );
115
116
117
118
119
120
121
122
123 parser.defineEntityReplacementText( "nbsp", "\u00a0" );
124 parser.defineEntityReplacementText( "iexcl", "\u00a1" );
125 parser.defineEntityReplacementText( "cent", "\u00a2" );
126 parser.defineEntityReplacementText( "pound", "\u00a3" );
127 parser.defineEntityReplacementText( "curren", "\u00a4" );
128 parser.defineEntityReplacementText( "yen", "\u00a5" );
129 parser.defineEntityReplacementText( "brvbar", "\u00a6" );
130 parser.defineEntityReplacementText( "sect", "\u00a7" );
131 parser.defineEntityReplacementText( "uml", "\u00a8" );
132 parser.defineEntityReplacementText( "copy", "\u00a9" );
133 parser.defineEntityReplacementText( "ordf", "\u00aa" );
134 parser.defineEntityReplacementText( "laquo", "\u00ab" );
135 parser.defineEntityReplacementText( "not", "\u00ac" );
136 parser.defineEntityReplacementText( "shy", "\u00ad" );
137 parser.defineEntityReplacementText( "reg", "\u00ae" );
138 parser.defineEntityReplacementText( "macr", "\u00af" );
139 parser.defineEntityReplacementText( "deg", "\u00b0" );
140 parser.defineEntityReplacementText( "plusmn", "\u00b1" );
141 parser.defineEntityReplacementText( "sup2", "\u00b2" );
142 parser.defineEntityReplacementText( "sup3", "\u00b3" );
143 parser.defineEntityReplacementText( "acute", "\u00b4" );
144 parser.defineEntityReplacementText( "micro", "\u00b5" );
145 parser.defineEntityReplacementText( "para", "\u00b6" );
146 parser.defineEntityReplacementText( "middot", "\u00b7" );
147 parser.defineEntityReplacementText( "cedil", "\u00b8" );
148 parser.defineEntityReplacementText( "sup1", "\u00b9" );
149 parser.defineEntityReplacementText( "ordm", "\u00ba" );
150 parser.defineEntityReplacementText( "raquo", "\u00bb" );
151 parser.defineEntityReplacementText( "frac14", "\u00bc" );
152 parser.defineEntityReplacementText( "frac12", "\u00bd" );
153 parser.defineEntityReplacementText( "frac34", "\u00be" );
154 parser.defineEntityReplacementText( "iquest", "\u00bf" );
155 parser.defineEntityReplacementText( "Agrave", "\u00c0" );
156 parser.defineEntityReplacementText( "Aacute", "\u00c1" );
157 parser.defineEntityReplacementText( "Acirc", "\u00c2" );
158 parser.defineEntityReplacementText( "Atilde", "\u00c3" );
159 parser.defineEntityReplacementText( "Auml", "\u00c4" );
160 parser.defineEntityReplacementText( "Aring", "\u00c5" );
161 parser.defineEntityReplacementText( "AElig", "\u00c6" );
162 parser.defineEntityReplacementText( "Ccedil", "\u00c7" );
163 parser.defineEntityReplacementText( "Egrave", "\u00c8" );
164 parser.defineEntityReplacementText( "Eacute", "\u00c9" );
165 parser.defineEntityReplacementText( "Ecirc", "\u00ca" );
166 parser.defineEntityReplacementText( "Euml", "\u00cb" );
167 parser.defineEntityReplacementText( "Igrave", "\u00cc" );
168 parser.defineEntityReplacementText( "Iacute", "\u00cd" );
169 parser.defineEntityReplacementText( "Icirc", "\u00ce" );
170 parser.defineEntityReplacementText( "Iuml", "\u00cf" );
171 parser.defineEntityReplacementText( "ETH", "\u00d0" );
172 parser.defineEntityReplacementText( "Ntilde", "\u00d1" );
173 parser.defineEntityReplacementText( "Ograve", "\u00d2" );
174 parser.defineEntityReplacementText( "Oacute", "\u00d3" );
175 parser.defineEntityReplacementText( "Ocirc", "\u00d4" );
176 parser.defineEntityReplacementText( "Otilde", "\u00d5" );
177 parser.defineEntityReplacementText( "Ouml", "\u00d6" );
178 parser.defineEntityReplacementText( "times", "\u00d7" );
179 parser.defineEntityReplacementText( "Oslash", "\u00d8" );
180 parser.defineEntityReplacementText( "Ugrave", "\u00d9" );
181 parser.defineEntityReplacementText( "Uacute", "\u00da" );
182 parser.defineEntityReplacementText( "Ucirc", "\u00db" );
183 parser.defineEntityReplacementText( "Uuml", "\u00dc" );
184 parser.defineEntityReplacementText( "Yacute", "\u00dd" );
185 parser.defineEntityReplacementText( "THORN", "\u00de" );
186 parser.defineEntityReplacementText( "szlig", "\u00df" );
187 parser.defineEntityReplacementText( "agrave", "\u00e0" );
188 parser.defineEntityReplacementText( "aacute", "\u00e1" );
189 parser.defineEntityReplacementText( "acirc", "\u00e2" );
190 parser.defineEntityReplacementText( "atilde", "\u00e3" );
191 parser.defineEntityReplacementText( "auml", "\u00e4" );
192 parser.defineEntityReplacementText( "aring", "\u00e5" );
193 parser.defineEntityReplacementText( "aelig", "\u00e6" );
194 parser.defineEntityReplacementText( "ccedil", "\u00e7" );
195 parser.defineEntityReplacementText( "egrave", "\u00e8" );
196 parser.defineEntityReplacementText( "eacute", "\u00e9" );
197 parser.defineEntityReplacementText( "ecirc", "\u00ea" );
198 parser.defineEntityReplacementText( "euml", "\u00eb" );
199 parser.defineEntityReplacementText( "igrave", "\u00ec" );
200 parser.defineEntityReplacementText( "iacute", "\u00ed" );
201 parser.defineEntityReplacementText( "icirc", "\u00ee" );
202 parser.defineEntityReplacementText( "iuml", "\u00ef" );
203 parser.defineEntityReplacementText( "eth", "\u00f0" );
204 parser.defineEntityReplacementText( "ntilde", "\u00f1" );
205 parser.defineEntityReplacementText( "ograve", "\u00f2" );
206 parser.defineEntityReplacementText( "oacute", "\u00f3" );
207 parser.defineEntityReplacementText( "ocirc", "\u00f4" );
208 parser.defineEntityReplacementText( "otilde", "\u00f5" );
209 parser.defineEntityReplacementText( "ouml", "\u00f6" );
210 parser.defineEntityReplacementText( "divide", "\u00f7" );
211 parser.defineEntityReplacementText( "oslash", "\u00f8" );
212 parser.defineEntityReplacementText( "ugrave", "\u00f9" );
213 parser.defineEntityReplacementText( "uacute", "\u00fa" );
214 parser.defineEntityReplacementText( "ucirc", "\u00fb" );
215 parser.defineEntityReplacementText( "uuml", "\u00fc" );
216 parser.defineEntityReplacementText( "yacute", "\u00fd" );
217 parser.defineEntityReplacementText( "thorn", "\u00fe" );
218 parser.defineEntityReplacementText( "yuml", "\u00ff" );
219
220
221
222
223
224 parser.defineEntityReplacementText( "OElig", "\u0152" );
225 parser.defineEntityReplacementText( "oelig", "\u0153" );
226 parser.defineEntityReplacementText( "Scaron", "\u0160" );
227 parser.defineEntityReplacementText( "scaron", "\u0161" );
228 parser.defineEntityReplacementText( "Yuml", "\u0178" );
229 parser.defineEntityReplacementText( "circ", "\u02c6" );
230 parser.defineEntityReplacementText( "tilde", "\u02dc" );
231 parser.defineEntityReplacementText( "ensp", "\u2002" );
232 parser.defineEntityReplacementText( "emsp", "\u2003" );
233 parser.defineEntityReplacementText( "thinsp", "\u2009" );
234 parser.defineEntityReplacementText( "zwnj", "\u200c" );
235 parser.defineEntityReplacementText( "zwj", "\u200d" );
236 parser.defineEntityReplacementText( "lrm", "\u200e" );
237 parser.defineEntityReplacementText( "rlm", "\u200f" );
238 parser.defineEntityReplacementText( "ndash", "\u2013" );
239 parser.defineEntityReplacementText( "mdash", "\u2014" );
240 parser.defineEntityReplacementText( "lsquo", "\u2018" );
241 parser.defineEntityReplacementText( "rsquo", "\u2019" );
242 parser.defineEntityReplacementText( "sbquo", "\u201a" );
243 parser.defineEntityReplacementText( "ldquo", "\u201c" );
244 parser.defineEntityReplacementText( "rdquo", "\u201d" );
245 parser.defineEntityReplacementText( "bdquo", "\u201e" );
246 parser.defineEntityReplacementText( "dagger", "\u2020" );
247 parser.defineEntityReplacementText( "Dagger", "\u2021" );
248 parser.defineEntityReplacementText( "permil", "\u2030" );
249 parser.defineEntityReplacementText( "lsaquo", "\u2039" );
250 parser.defineEntityReplacementText( "rsaquo", "\u203a" );
251 parser.defineEntityReplacementText( "euro", "\u20ac" );
252
253
254
255
256
257 parser.defineEntityReplacementText( "fnof", "\u0192" );
258 parser.defineEntityReplacementText( "Alpha", "\u0391" );
259 parser.defineEntityReplacementText( "Beta", "\u0392" );
260 parser.defineEntityReplacementText( "Gamma", "\u0393" );
261 parser.defineEntityReplacementText( "Delta", "\u0394" );
262 parser.defineEntityReplacementText( "Epsilon", "\u0395" );
263 parser.defineEntityReplacementText( "Zeta", "\u0396" );
264 parser.defineEntityReplacementText( "Eta", "\u0397" );
265 parser.defineEntityReplacementText( "Theta", "\u0398" );
266 parser.defineEntityReplacementText( "Iota", "\u0399" );
267 parser.defineEntityReplacementText( "Kappa", "\u039a" );
268 parser.defineEntityReplacementText( "Lambda", "\u039b" );
269 parser.defineEntityReplacementText( "Mu", "\u039c" );
270 parser.defineEntityReplacementText( "Nu", "\u039d" );
271 parser.defineEntityReplacementText( "Xi", "\u039e" );
272 parser.defineEntityReplacementText( "Omicron", "\u039f" );
273 parser.defineEntityReplacementText( "Pi", "\u03a0" );
274 parser.defineEntityReplacementText( "Rho", "\u03a1" );
275 parser.defineEntityReplacementText( "Sigma", "\u03a3" );
276 parser.defineEntityReplacementText( "Tau", "\u03a4" );
277 parser.defineEntityReplacementText( "Upsilon", "\u03a5" );
278 parser.defineEntityReplacementText( "Phi", "\u03a6" );
279 parser.defineEntityReplacementText( "Chi", "\u03a7" );
280 parser.defineEntityReplacementText( "Psi", "\u03a8" );
281 parser.defineEntityReplacementText( "Omega", "\u03a9" );
282 parser.defineEntityReplacementText( "alpha", "\u03b1" );
283 parser.defineEntityReplacementText( "beta", "\u03b2" );
284 parser.defineEntityReplacementText( "gamma", "\u03b3" );
285 parser.defineEntityReplacementText( "delta", "\u03b4" );
286 parser.defineEntityReplacementText( "epsilon", "\u03b5" );
287 parser.defineEntityReplacementText( "zeta", "\u03b6" );
288 parser.defineEntityReplacementText( "eta", "\u03b7" );
289 parser.defineEntityReplacementText( "theta", "\u03b8" );
290 parser.defineEntityReplacementText( "iota", "\u03b9" );
291 parser.defineEntityReplacementText( "kappa", "\u03ba" );
292 parser.defineEntityReplacementText( "lambda", "\u03bb" );
293 parser.defineEntityReplacementText( "mu", "\u03bc" );
294 parser.defineEntityReplacementText( "nu", "\u03bd" );
295 parser.defineEntityReplacementText( "xi", "\u03be" );
296 parser.defineEntityReplacementText( "omicron", "\u03bf" );
297 parser.defineEntityReplacementText( "pi", "\u03c0" );
298 parser.defineEntityReplacementText( "rho", "\u03c1" );
299 parser.defineEntityReplacementText( "sigmaf", "\u03c2" );
300 parser.defineEntityReplacementText( "sigma", "\u03c3" );
301 parser.defineEntityReplacementText( "tau", "\u03c4" );
302 parser.defineEntityReplacementText( "upsilon", "\u03c5" );
303 parser.defineEntityReplacementText( "phi", "\u03c6" );
304 parser.defineEntityReplacementText( "chi", "\u03c7" );
305 parser.defineEntityReplacementText( "psi", "\u03c8" );
306 parser.defineEntityReplacementText( "omega", "\u03c9" );
307 parser.defineEntityReplacementText( "thetasym", "\u03d1" );
308 parser.defineEntityReplacementText( "upsih", "\u03d2" );
309 parser.defineEntityReplacementText( "piv", "\u03d6" );
310 parser.defineEntityReplacementText( "bull", "\u2022" );
311 parser.defineEntityReplacementText( "hellip", "\u2026" );
312 parser.defineEntityReplacementText( "prime", "\u2032" );
313 parser.defineEntityReplacementText( "Prime", "\u2033" );
314 parser.defineEntityReplacementText( "oline", "\u203e" );
315 parser.defineEntityReplacementText( "frasl", "\u2044" );
316 parser.defineEntityReplacementText( "weierp", "\u2118" );
317 parser.defineEntityReplacementText( "image", "\u2111" );
318 parser.defineEntityReplacementText( "real", "\u211c" );
319 parser.defineEntityReplacementText( "trade", "\u2122" );
320 parser.defineEntityReplacementText( "alefsym", "\u2135" );
321 parser.defineEntityReplacementText( "larr", "\u2190" );
322 parser.defineEntityReplacementText( "uarr", "\u2191" );
323 parser.defineEntityReplacementText( "rarr", "\u2192" );
324 parser.defineEntityReplacementText( "darr", "\u2193" );
325 parser.defineEntityReplacementText( "harr", "\u2194" );
326 parser.defineEntityReplacementText( "crarr", "\u21b5" );
327 parser.defineEntityReplacementText( "lArr", "\u21d0" );
328 parser.defineEntityReplacementText( "uArr", "\u21d1" );
329 parser.defineEntityReplacementText( "rArr", "\u21d2" );
330 parser.defineEntityReplacementText( "dArr", "\u21d3" );
331 parser.defineEntityReplacementText( "hArr", "\u21d4" );
332 parser.defineEntityReplacementText( "forall", "\u2200" );
333 parser.defineEntityReplacementText( "part", "\u2202" );
334 parser.defineEntityReplacementText( "exist", "\u2203" );
335 parser.defineEntityReplacementText( "empty", "\u2205" );
336 parser.defineEntityReplacementText( "nabla", "\u2207" );
337 parser.defineEntityReplacementText( "isin", "\u2208" );
338 parser.defineEntityReplacementText( "notin", "\u2209" );
339 parser.defineEntityReplacementText( "ni", "\u220b" );
340 parser.defineEntityReplacementText( "prod", "\u220f" );
341 parser.defineEntityReplacementText( "sum", "\u2211" );
342 parser.defineEntityReplacementText( "minus", "\u2212" );
343 parser.defineEntityReplacementText( "lowast", "\u2217" );
344 parser.defineEntityReplacementText( "radic", "\u221a" );
345 parser.defineEntityReplacementText( "prop", "\u221d" );
346 parser.defineEntityReplacementText( "infin", "\u221e" );
347 parser.defineEntityReplacementText( "ang", "\u2220" );
348 parser.defineEntityReplacementText( "and", "\u2227" );
349 parser.defineEntityReplacementText( "or", "\u2228" );
350 parser.defineEntityReplacementText( "cap", "\u2229" );
351 parser.defineEntityReplacementText( "cup", "\u222a" );
352 parser.defineEntityReplacementText( "int", "\u222b" );
353 parser.defineEntityReplacementText( "there4", "\u2234" );
354 parser.defineEntityReplacementText( "sim", "\u223c" );
355 parser.defineEntityReplacementText( "cong", "\u2245" );
356 parser.defineEntityReplacementText( "asymp", "\u2248" );
357 parser.defineEntityReplacementText( "ne", "\u2260" );
358 parser.defineEntityReplacementText( "equiv", "\u2261" );
359 parser.defineEntityReplacementText( "le", "\u2264" );
360 parser.defineEntityReplacementText( "ge", "\u2265" );
361 parser.defineEntityReplacementText( "sub", "\u2282" );
362 parser.defineEntityReplacementText( "sup", "\u2283" );
363 parser.defineEntityReplacementText( "nsub", "\u2284" );
364 parser.defineEntityReplacementText( "sube", "\u2286" );
365 parser.defineEntityReplacementText( "supe", "\u2287" );
366 parser.defineEntityReplacementText( "oplus", "\u2295" );
367 parser.defineEntityReplacementText( "otimes", "\u2297" );
368 parser.defineEntityReplacementText( "perp", "\u22a5" );
369 parser.defineEntityReplacementText( "sdot", "\u22c5" );
370 parser.defineEntityReplacementText( "lceil", "\u2308" );
371 parser.defineEntityReplacementText( "rceil", "\u2309" );
372 parser.defineEntityReplacementText( "lfloor", "\u230a" );
373 parser.defineEntityReplacementText( "rfloor", "\u230b" );
374 parser.defineEntityReplacementText( "lang", "\u2329" );
375 parser.defineEntityReplacementText( "rang", "\u232a" );
376 parser.defineEntityReplacementText( "loz", "\u25ca" );
377 parser.defineEntityReplacementText( "spades", "\u2660" );
378 parser.defineEntityReplacementText( "clubs", "\u2663" );
379 parser.defineEntityReplacementText( "hearts", "\u2665" );
380 parser.defineEntityReplacementText( "diams", "\u2666" );
381 }
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404 protected boolean baseStartTag( XmlPullParser parser, Sink sink )
405 {
406 boolean visited = true;
407
408 SinkEventAttributeSet attribs = getAttributesFromParser( parser );
409
410 if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
411 {
412 handleSectionStart( sink, Sink.SECTION_LEVEL_1, attribs );
413 }
414 else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
415 {
416 handleSectionStart( sink, Sink.SECTION_LEVEL_2, attribs );
417 }
418 else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
419 {
420 handleSectionStart( sink, Sink.SECTION_LEVEL_3, attribs );
421 }
422 else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
423 {
424 handleSectionStart( sink, Sink.SECTION_LEVEL_4, attribs );
425 }
426 else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
427 {
428 handleSectionStart( sink, Sink.SECTION_LEVEL_5, attribs );
429 }
430 else if ( parser.getName().equals( HtmlMarkup.U.toString() ) )
431 {
432 attribs.addAttributes( SinkEventAttributeSet.Semantics.ANNOTATION );
433 sink.inline( attribs );
434 }
435 else if ( parser.getName().equals( HtmlMarkup.S.toString() )
436 || parser.getName().equals( HtmlMarkup.STRIKE.toString() )
437 || parser.getName().equals( "del" ) )
438 {
439 attribs.addAttributes( SinkEventAttributeSet.Semantics.LINE_THROUGH );
440 sink.inline( attribs );
441 }
442 else if ( parser.getName().equals( HtmlMarkup.SUB.toString() ) )
443 {
444 attribs.addAttributes( SinkEventAttributeSet.Semantics.SUBSCRIPT );
445 sink.inline( attribs );
446 }
447 else if ( parser.getName().equals( HtmlMarkup.SUP.toString() ) )
448 {
449 attribs.addAttributes( SinkEventAttributeSet.Semantics.SUPERSCRIPT );
450 sink.inline( attribs );
451 }
452 else if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
453 {
454 handlePStart( sink, attribs );
455 }
456 else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
457 {
458 visited = handleDivStart( parser, attribs, sink );
459 }
460 else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
461 {
462 handlePreStart( attribs, sink );
463 }
464 else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
465 {
466 sink.list( attribs );
467 }
468 else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
469 {
470 handleOLStart( parser, sink, attribs );
471 }
472 else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
473 {
474 handleLIStart( sink, attribs );
475 }
476 else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
477 {
478 sink.definitionList( attribs );
479 }
480 else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
481 {
482 if ( hasDefinitionListItem )
483 {
484
485 sink.definitionListItem_();
486 }
487 sink.definitionListItem( attribs );
488 hasDefinitionListItem = true;
489 sink.definedTerm( attribs );
490 }
491 else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
492 {
493 if ( !hasDefinitionListItem )
494 {
495 sink.definitionListItem( attribs );
496 }
497 sink.definition( attribs );
498 }
499 else if ( ( parser.getName().equals( HtmlMarkup.B.toString() ) )
500 || ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) ) )
501 {
502 sink.inline( SinkEventAttributeSet.Semantics.BOLD );
503 }
504 else if ( ( parser.getName().equals( HtmlMarkup.I.toString() ) )
505 || ( parser.getName().equals( HtmlMarkup.EM.toString() ) ) )
506 {
507 handleFigureCaptionStart( sink, attribs );
508 }
509 else if ( ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
510 || ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
511 || ( parser.getName().equals( HtmlMarkup.TT.toString() ) ) )
512 {
513 sink.inline( SinkEventAttributeSet.Semantics.MONOSPACED );
514 }
515 else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
516 {
517 handleAStart( parser, sink, attribs );
518 }
519 else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
520 {
521 handleTableStart( sink, attribs, parser );
522 }
523 else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
524 {
525 sink.tableRow( attribs );
526 }
527 else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
528 {
529 sink.tableHeaderCell( attribs );
530 }
531 else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
532 {
533 sink.tableCell( attribs );
534 }
535 else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
536 {
537 sink.tableCaption( attribs );
538 }
539 else if ( parser.getName().equals( HtmlMarkup.BR.toString() ) )
540 {
541 sink.lineBreak( attribs );
542 }
543 else if ( parser.getName().equals( HtmlMarkup.HR.toString() ) )
544 {
545 sink.horizontalRule( attribs );
546 }
547 else if ( parser.getName().equals( HtmlMarkup.IMG.toString() ) )
548 {
549 handleImgStart( parser, sink, attribs );
550 }
551 else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() )
552 || parser.getName().equals( HtmlMarkup.STYLE.toString() ) )
553 {
554 handleUnknown( parser, sink, TAG_TYPE_START );
555 scriptBlock = true;
556 }
557 else
558 {
559 visited = false;
560 }
561
562 return visited;
563 }
564
565
566
567
568
569
570
571
572
573
574
575
576
577 protected boolean baseEndTag( XmlPullParser parser, Sink sink )
578 {
579 boolean visited = true;
580
581 if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
582 {
583 if ( !inFigure )
584 {
585 sink.paragraph_();
586 }
587 }
588 else if ( parser.getName().equals( HtmlMarkup.U.toString() )
589 || parser.getName().equals( HtmlMarkup.S.toString() )
590 || parser.getName().equals( HtmlMarkup.STRIKE.toString() )
591 || parser.getName().equals( "del" ) )
592 {
593 sink.inline_();
594 }
595 else if ( parser.getName().equals( HtmlMarkup.SUB.toString() )
596 || parser.getName().equals( HtmlMarkup.SUP.toString() ) )
597 {
598 sink.inline_();
599 }
600 else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
601 {
602 if ( inFigure )
603 {
604 sink.figure_();
605 this.inFigure = false;
606 }
607 else
608 {
609 visited = false;
610 }
611 }
612 else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
613 {
614 verbatim_();
615
616 sink.verbatim_();
617 }
618 else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
619 {
620 sink.list_();
621 }
622 else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
623 {
624 sink.numberedList_();
625 orderedListDepth--;
626 }
627 else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
628 {
629 handleListItemEnd( sink );
630 }
631 else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
632 {
633 if ( hasDefinitionListItem )
634 {
635 sink.definitionListItem_();
636 hasDefinitionListItem = false;
637 }
638 sink.definitionList_();
639 }
640 else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
641 {
642 sink.definedTerm_();
643 }
644 else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
645 {
646 sink.definition_();
647 sink.definitionListItem_();
648 hasDefinitionListItem = false;
649 }
650 else if ( ( parser.getName().equals( HtmlMarkup.B.toString() ) )
651 || ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) ) )
652 {
653 sink.inline_();
654 }
655 else if ( ( parser.getName().equals( HtmlMarkup.I.toString() ) )
656 || ( parser.getName().equals( HtmlMarkup.EM.toString() ) ) )
657 {
658 handleFigureCaptionEnd( sink );
659 }
660 else if ( ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
661 || ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
662 || ( parser.getName().equals( HtmlMarkup.TT.toString() ) ) )
663 {
664 sink.inline_();
665 }
666 else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
667 {
668 handleAEnd( sink );
669 }
670
671
672
673
674
675 else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
676 {
677 sink.tableRows_();
678
679 sink.table_();
680 }
681 else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
682 {
683 sink.tableRow_();
684 }
685 else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
686 {
687 sink.tableHeaderCell_();
688 }
689 else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
690 {
691 sink.tableCell_();
692 }
693 else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
694 {
695 sink.tableCaption_();
696 }
697 else if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
698 {
699 sink.sectionTitle1_();
700 }
701 else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
702 {
703 sink.sectionTitle2_();
704 }
705 else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
706 {
707 sink.sectionTitle3_();
708 }
709 else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
710 {
711 sink.sectionTitle4_();
712 }
713 else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
714 {
715 sink.sectionTitle5_();
716 }
717 else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() )
718 || parser.getName().equals( HtmlMarkup.STYLE.toString() ) )
719 {
720 handleUnknown( parser, sink, TAG_TYPE_END );
721
722 scriptBlock = false;
723 }
724 else
725 {
726 visited = false;
727 }
728
729 return visited;
730 }
731
732
733
734
735
736
737
738 protected void handleStartTag( XmlPullParser parser, Sink sink )
739 throws XmlPullParserException, MacroExecutionException
740 {
741 if ( !baseStartTag( parser, sink ) )
742 {
743 if ( getLog().isWarnEnabled() )
744 {
745 String position = "[" + parser.getLineNumber() + ":"
746 + parser.getColumnNumber() + "]";
747 String tag = "<" + parser.getName() + ">";
748
749 getLog().warn( "Unrecognized xml tag: " + tag + " at " + position );
750 }
751 }
752 }
753
754
755
756
757
758
759
760 protected void handleEndTag( XmlPullParser parser, Sink sink )
761 throws XmlPullParserException, MacroExecutionException
762 {
763 if ( !baseEndTag( parser, sink ) )
764 {
765
766 }
767 }
768
769
770 @Override
771 protected void handleText( XmlPullParser parser, Sink sink )
772 throws XmlPullParserException
773 {
774 String text = getText( parser );
775
776
777
778
779
780
781
782 if ( StringUtils.isNotEmpty( text ) && !isScriptBlock() )
783 {
784 sink.text( text );
785 }
786 }
787
788
789 @Override
790 protected void handleComment( XmlPullParser parser, Sink sink )
791 throws XmlPullParserException
792 {
793 String text = getText( parser );
794
795 if ( "PB".equals( text.trim() ) )
796 {
797 sink.pageBreak();
798 }
799 else
800 {
801 if ( isEmitComments() )
802 {
803 sink.comment( text );
804 }
805 }
806 }
807
808
809 @Override
810 protected void handleCdsect( XmlPullParser parser, Sink sink )
811 throws XmlPullParserException
812 {
813 String text = getText( parser );
814
815 if ( isScriptBlock() )
816 {
817 sink.unknown( CDATA, new Object[] { CDATA_TYPE, text }, null );
818 }
819 else
820 {
821 sink.text( text );
822 }
823 }
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857 protected void consecutiveSections( int newLevel, Sink sink )
858 {
859 closeOpenSections( newLevel, sink );
860 openMissingSections( newLevel, sink );
861
862 this.sectionLevel = newLevel;
863 }
864
865
866
867
868
869
870
871 private void closeOpenSections( int newLevel, Sink sink )
872 {
873 while ( this.sectionLevel >= newLevel )
874 {
875 if ( sectionLevel == Sink.SECTION_LEVEL_5 )
876 {
877 sink.section5_();
878 }
879 else if ( sectionLevel == Sink.SECTION_LEVEL_4 )
880 {
881 sink.section4_();
882 }
883 else if ( sectionLevel == Sink.SECTION_LEVEL_3 )
884 {
885 sink.section3_();
886 }
887 else if ( sectionLevel == Sink.SECTION_LEVEL_2 )
888 {
889 sink.section2_();
890 }
891 else if ( sectionLevel == Sink.SECTION_LEVEL_1 )
892 {
893 sink.section1_();
894 }
895
896 this.sectionLevel--;
897 }
898 }
899
900
901
902
903
904
905
906 private void openMissingSections( int newLevel, Sink sink )
907 {
908 while ( this.sectionLevel < newLevel - 1 )
909 {
910 this.sectionLevel++;
911
912 if ( sectionLevel == Sink.SECTION_LEVEL_5 )
913 {
914 sink.section5();
915 }
916 else if ( sectionLevel == Sink.SECTION_LEVEL_4 )
917 {
918 sink.section4();
919 }
920 else if ( sectionLevel == Sink.SECTION_LEVEL_3 )
921 {
922 sink.section3();
923 }
924 else if ( sectionLevel == Sink.SECTION_LEVEL_2 )
925 {
926 sink.section2();
927 }
928 else if ( sectionLevel == Sink.SECTION_LEVEL_1 )
929 {
930 sink.section1();
931 }
932 }
933 }
934
935
936
937
938
939
940 protected int getSectionLevel()
941 {
942 return this.sectionLevel;
943 }
944
945
946
947
948
949
950 protected void setSectionLevel( int newLevel )
951 {
952 this.sectionLevel = newLevel;
953 }
954
955
956
957
958 protected void verbatim_()
959 {
960 this.inVerbatim = false;
961 }
962
963
964
965
966 protected void verbatim()
967 {
968 this.inVerbatim = true;
969 }
970
971
972
973
974
975
976 protected boolean isVerbatim()
977 {
978 return this.inVerbatim;
979 }
980
981
982
983
984
985
986
987
988 protected boolean isScriptBlock()
989 {
990 return this.scriptBlock;
991 }
992
993
994
995
996
997
998
999
1000 protected String validAnchor( String id )
1001 {
1002 if ( !DoxiaUtils.isValidId( id ) )
1003 {
1004 String linkAnchor = DoxiaUtils.encodeId( id, true );
1005
1006 String msg = "Modified invalid link: '" + id + "' to '" + linkAnchor + "'";
1007 logMessage( "modifiedLink", msg );
1008
1009 return linkAnchor;
1010 }
1011
1012 return id;
1013 }
1014
1015
1016 @Override
1017 protected void init()
1018 {
1019 super.init();
1020
1021 this.scriptBlock = false;
1022 this.isLink = false;
1023 this.isAnchor = false;
1024 this.orderedListDepth = 0;
1025 this.sectionLevel = 0;
1026 this.inVerbatim = false;
1027 this.inFigure = false;
1028 this.warnMessages = null;
1029 }
1030
1031 private void handleAEnd( Sink sink )
1032 {
1033 if ( isLink )
1034 {
1035 sink.link_();
1036 isLink = false;
1037 }
1038 else if ( isAnchor )
1039 {
1040 sink.anchor_();
1041 isAnchor = false;
1042 }
1043 }
1044
1045 private void handleAStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1046 {
1047 String href = parser.getAttributeValue( null, Attribute.HREF.toString() );
1048
1049 if ( href != null )
1050 {
1051 int hashIndex = href.indexOf( '#' );
1052 if ( hashIndex != -1 && !DoxiaUtils.isExternalLink( href ) )
1053 {
1054 String hash = href.substring( hashIndex + 1 );
1055
1056 if ( !DoxiaUtils.isValidId( hash ) )
1057 {
1058 href = href.substring( 0, hashIndex ) + "#" + DoxiaUtils.encodeId( hash, true );
1059
1060 String msg = "Modified invalid link: '" + hash + "' to '" + href + "'";
1061 logMessage( "modifiedLink", msg );
1062 }
1063 }
1064 sink.link( href, attribs );
1065 isLink = true;
1066 }
1067 else
1068 {
1069 String name = parser.getAttributeValue( null, Attribute.NAME.toString() );
1070
1071 if ( name != null )
1072 {
1073 sink.anchor( validAnchor( name ), attribs );
1074 isAnchor = true;
1075 }
1076 else
1077 {
1078 String id = parser.getAttributeValue( null, Attribute.ID.toString() );
1079 if ( id != null )
1080 {
1081 sink.anchor( validAnchor( id ), attribs );
1082 isAnchor = true;
1083 }
1084 }
1085 }
1086 }
1087
1088 private boolean handleDivStart( XmlPullParser parser, SinkEventAttributeSet attribs, Sink sink )
1089 {
1090 boolean visited = true;
1091
1092 String divclass = parser.getAttributeValue( null, Attribute.CLASS.toString() );
1093
1094 if ( "figure".equals( divclass ) )
1095 {
1096 this.inFigure = true;
1097 SinkEventAttributeSet atts = new SinkEventAttributeSet( attribs );
1098 atts.removeAttribute( SinkEventAttributes.CLASS );
1099 sink.figure( atts );
1100 }
1101 else
1102 {
1103 visited = false;
1104 }
1105
1106 return visited;
1107 }
1108
1109 private void handleFigureCaptionEnd( Sink sink )
1110 {
1111 if ( inFigure )
1112 {
1113 sink.figureCaption_();
1114 }
1115 else
1116 {
1117 sink.inline_();
1118 }
1119 }
1120
1121 private void handleFigureCaptionStart( Sink sink, SinkEventAttributeSet attribs )
1122 {
1123 if ( inFigure )
1124 {
1125 sink.figureCaption( attribs );
1126 }
1127 else
1128 {
1129 sink.inline( SinkEventAttributeSet.Semantics.ITALIC );
1130 }
1131 }
1132
1133 private void handleImgStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1134 {
1135 String src = parser.getAttributeValue( null, Attribute.SRC.toString() );
1136
1137 if ( src != null )
1138 {
1139 sink.figureGraphics( src, attribs );
1140 }
1141 }
1142
1143 private void handleLIStart( Sink sink, SinkEventAttributeSet attribs )
1144 {
1145 if ( orderedListDepth == 0 )
1146 {
1147 sink.listItem( attribs );
1148 }
1149 else
1150 {
1151 sink.numberedListItem( attribs );
1152 }
1153 }
1154
1155 private void handleListItemEnd( Sink sink )
1156 {
1157 if ( orderedListDepth == 0 )
1158 {
1159 sink.listItem_();
1160 }
1161 else
1162 {
1163 sink.numberedListItem_();
1164 }
1165 }
1166
1167 private void handleOLStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1168 {
1169 int numbering = Sink.NUMBERING_DECIMAL;
1170
1171 String style = parser.getAttributeValue( null, Attribute.STYLE.toString() );
1172
1173 if ( style != null )
1174 {
1175 switch ( style )
1176 {
1177 case "list-style-type: upper-alpha":
1178 numbering = Sink.NUMBERING_UPPER_ALPHA;
1179 break;
1180 case "list-style-type: lower-alpha":
1181 numbering = Sink.NUMBERING_LOWER_ALPHA;
1182 break;
1183 case "list-style-type: upper-roman":
1184 numbering = Sink.NUMBERING_UPPER_ROMAN;
1185 break;
1186 case "list-style-type: lower-roman":
1187 numbering = Sink.NUMBERING_LOWER_ROMAN;
1188 break;
1189 case "list-style-type: decimal":
1190 numbering = Sink.NUMBERING_DECIMAL;
1191 break;
1192 default:
1193
1194 }
1195 }
1196
1197 sink.numberedList( numbering, attribs );
1198 orderedListDepth++;
1199 }
1200
1201 private void handlePStart( Sink sink, SinkEventAttributeSet attribs )
1202 {
1203 if ( !inFigure )
1204 {
1205 sink.paragraph( attribs );
1206 }
1207 }
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219 private void handlePreStart( SinkEventAttributeSet attribs, Sink sink )
1220 {
1221 verbatim();
1222 sink.verbatim( attribs );
1223 }
1224
1225 private void handleSectionStart( Sink sink, int level, SinkEventAttributeSet attribs )
1226 {
1227 consecutiveSections( level, sink );
1228 sink.section( level, attribs );
1229 sink.sectionTitle( level, attribs );
1230 }
1231
1232 private void handleTableStart( Sink sink, SinkEventAttributeSet attribs, XmlPullParser parser )
1233 {
1234 sink.table( attribs );
1235 String border = parser.getAttributeValue( null, Attribute.BORDER.toString() );
1236 boolean grid = true;
1237
1238 if ( border == null || "0".equals( border ) )
1239 {
1240 grid = false;
1241 }
1242
1243 String align = parser.getAttributeValue( null, Attribute.ALIGN.toString() );
1244 int[] justif = {Sink.JUSTIFY_LEFT};
1245
1246 if ( "center".equals( align ) )
1247 {
1248 justif[0] = Sink.JUSTIFY_CENTER;
1249 }
1250 else if ( "right".equals( align ) )
1251 {
1252 justif[0] = Sink.JUSTIFY_RIGHT;
1253 }
1254
1255 sink.tableRows( justif, grid );
1256 }
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266 private void logMessage( String key, String msg )
1267 {
1268 final String log = "[XHTML Parser] " + msg;
1269 if ( getLog().isDebugEnabled() )
1270 {
1271 getLog().debug( log );
1272
1273 return;
1274 }
1275
1276 if ( warnMessages == null )
1277 {
1278 warnMessages = new HashMap<>();
1279 }
1280
1281 Set<String> set = warnMessages.get( key );
1282 if ( set == null )
1283 {
1284 set = new TreeSet<>();
1285 }
1286 set.add( log );
1287 warnMessages.put( key, set );
1288 }
1289
1290
1291
1292
1293 private void logWarnings()
1294 {
1295 if ( getLog().isWarnEnabled() && this.warnMessages != null && !isSecondParsing() )
1296 {
1297 for ( Map.Entry<String, Set<String>> entry : this.warnMessages.entrySet() )
1298 {
1299 for ( String msg : entry.getValue() )
1300 {
1301 getLog().warn( msg );
1302 }
1303 }
1304
1305 this.warnMessages = null;
1306 }
1307 }
1308 }