1 package org.apache.maven.doxia.parser;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 import java.io.Reader;
23 import java.util.HashMap;
24 import java.util.Map;
25 import java.util.Set;
26 import java.util.TreeSet;
27
28 import javax.swing.text.html.HTML.Attribute;
29
30 import org.apache.maven.doxia.macro.MacroExecutionException;
31 import org.apache.maven.doxia.markup.HtmlMarkup;
32 import org.apache.maven.doxia.sink.Sink;
33 import org.apache.maven.doxia.sink.SinkEventAttributes;
34 import org.apache.maven.doxia.sink.impl.SinkEventAttributeSet;
35 import org.apache.maven.doxia.util.DoxiaUtils;
36
37 import org.codehaus.plexus.util.StringUtils;
38 import org.codehaus.plexus.util.xml.pull.XmlPullParser;
39 import org.codehaus.plexus.util.xml.pull.XmlPullParserException;
40
41
42
43
44
45
46
47
48
49 public class XhtmlBaseParser
50 extends AbstractXmlParser
51 implements HtmlMarkup
52 {
53
54
55
56
/** Set to true when a script or style start tag is seen and back to false on the matching end tag. */
private boolean scriptBlock;


/** True between a sink.link() emitted by handleAStart and the sink.link_() emitted by handleAEnd. */
private boolean isLink;


/** True between a sink.anchor() emitted by handleAStart and the sink.anchor_() emitted by handleAEnd. */
private boolean isAnchor;


/**
 * Ordered-list nesting counter: decremented when an ol end tag is seen, and a non-zero value makes
 * li tags emit numbered list items. NOTE(review): presumably incremented in handleOLStart - confirm.
 */
private int orderedListDepth = 0;


/** Section level most recently recorded by consecutiveSections() or setSectionLevel(); 0 after init(). */
private int sectionLevel;


/** Verbatim flag toggled by verbatim() and verbatim_(), read through isVerbatim(). */
private boolean inVerbatim;


/** True from the start of a div with class "figure" until the next div end tag. */
private boolean inFigure;


/** True while a definition list item opened by a dt tag has not been closed yet. */
boolean hasDefinitionListItem = false;


/** Text attributes (DECORATION, VALIGN) collected from u/s/strike/del/sub/sup tags and passed to sink.text(). */
private final SinkEventAttributeSet decoration = new SinkEventAttributeSet();



/** Reset to null by init(). NOTE(review): presumably filled by logMessage() and flushed by logWarnings() - confirm. */
private Map<String, Set<String>> warnMessages;
86
87
88 @Override
89 public void parse( Reader source, Sink sink )
90 throws ParseException
91 {
92 init();
93
94 try
95 {
96 super.parse( source, sink );
97 }
98 finally
99 {
100 logWarnings();
101
102 setSecondParsing( false );
103 init();
104 }
105 }
106
107
108
109
110
111
112
113 @Override
114 protected void initXmlParser( XmlPullParser parser )
115 throws XmlPullParserException
116 {
117 super.initXmlParser( parser );
118
119
120
121
122
123
124
125
126 parser.defineEntityReplacementText( "nbsp", "\u00a0" );
127 parser.defineEntityReplacementText( "iexcl", "\u00a1" );
128 parser.defineEntityReplacementText( "cent", "\u00a2" );
129 parser.defineEntityReplacementText( "pound", "\u00a3" );
130 parser.defineEntityReplacementText( "curren", "\u00a4" );
131 parser.defineEntityReplacementText( "yen", "\u00a5" );
132 parser.defineEntityReplacementText( "brvbar", "\u00a6" );
133 parser.defineEntityReplacementText( "sect", "\u00a7" );
134 parser.defineEntityReplacementText( "uml", "\u00a8" );
135 parser.defineEntityReplacementText( "copy", "\u00a9" );
136 parser.defineEntityReplacementText( "ordf", "\u00aa" );
137 parser.defineEntityReplacementText( "laquo", "\u00ab" );
138 parser.defineEntityReplacementText( "not", "\u00ac" );
139 parser.defineEntityReplacementText( "shy", "\u00ad" );
140 parser.defineEntityReplacementText( "reg", "\u00ae" );
141 parser.defineEntityReplacementText( "macr", "\u00af" );
142 parser.defineEntityReplacementText( "deg", "\u00b0" );
143 parser.defineEntityReplacementText( "plusmn", "\u00b1" );
144 parser.defineEntityReplacementText( "sup2", "\u00b2" );
145 parser.defineEntityReplacementText( "sup3", "\u00b3" );
146 parser.defineEntityReplacementText( "acute", "\u00b4" );
147 parser.defineEntityReplacementText( "micro", "\u00b5" );
148 parser.defineEntityReplacementText( "para", "\u00b6" );
149 parser.defineEntityReplacementText( "middot", "\u00b7" );
150 parser.defineEntityReplacementText( "cedil", "\u00b8" );
151 parser.defineEntityReplacementText( "sup1", "\u00b9" );
152 parser.defineEntityReplacementText( "ordm", "\u00ba" );
153 parser.defineEntityReplacementText( "raquo", "\u00bb" );
154 parser.defineEntityReplacementText( "frac14", "\u00bc" );
155 parser.defineEntityReplacementText( "frac12", "\u00bd" );
156 parser.defineEntityReplacementText( "frac34", "\u00be" );
157 parser.defineEntityReplacementText( "iquest", "\u00bf" );
158 parser.defineEntityReplacementText( "Agrave", "\u00c0" );
159 parser.defineEntityReplacementText( "Aacute", "\u00c1" );
160 parser.defineEntityReplacementText( "Acirc", "\u00c2" );
161 parser.defineEntityReplacementText( "Atilde", "\u00c3" );
162 parser.defineEntityReplacementText( "Auml", "\u00c4" );
163 parser.defineEntityReplacementText( "Aring", "\u00c5" );
164 parser.defineEntityReplacementText( "AElig", "\u00c6" );
165 parser.defineEntityReplacementText( "Ccedil", "\u00c7" );
166 parser.defineEntityReplacementText( "Egrave", "\u00c8" );
167 parser.defineEntityReplacementText( "Eacute", "\u00c9" );
168 parser.defineEntityReplacementText( "Ecirc", "\u00ca" );
169 parser.defineEntityReplacementText( "Euml", "\u00cb" );
170 parser.defineEntityReplacementText( "Igrave", "\u00cc" );
171 parser.defineEntityReplacementText( "Iacute", "\u00cd" );
172 parser.defineEntityReplacementText( "Icirc", "\u00ce" );
173 parser.defineEntityReplacementText( "Iuml", "\u00cf" );
174 parser.defineEntityReplacementText( "ETH", "\u00d0" );
175 parser.defineEntityReplacementText( "Ntilde", "\u00d1" );
176 parser.defineEntityReplacementText( "Ograve", "\u00d2" );
177 parser.defineEntityReplacementText( "Oacute", "\u00d3" );
178 parser.defineEntityReplacementText( "Ocirc", "\u00d4" );
179 parser.defineEntityReplacementText( "Otilde", "\u00d5" );
180 parser.defineEntityReplacementText( "Ouml", "\u00d6" );
181 parser.defineEntityReplacementText( "times", "\u00d7" );
182 parser.defineEntityReplacementText( "Oslash", "\u00d8" );
183 parser.defineEntityReplacementText( "Ugrave", "\u00d9" );
184 parser.defineEntityReplacementText( "Uacute", "\u00da" );
185 parser.defineEntityReplacementText( "Ucirc", "\u00db" );
186 parser.defineEntityReplacementText( "Uuml", "\u00dc" );
187 parser.defineEntityReplacementText( "Yacute", "\u00dd" );
188 parser.defineEntityReplacementText( "THORN", "\u00de" );
189 parser.defineEntityReplacementText( "szlig", "\u00df" );
190 parser.defineEntityReplacementText( "agrave", "\u00e0" );
191 parser.defineEntityReplacementText( "aacute", "\u00e1" );
192 parser.defineEntityReplacementText( "acirc", "\u00e2" );
193 parser.defineEntityReplacementText( "atilde", "\u00e3" );
194 parser.defineEntityReplacementText( "auml", "\u00e4" );
195 parser.defineEntityReplacementText( "aring", "\u00e5" );
196 parser.defineEntityReplacementText( "aelig", "\u00e6" );
197 parser.defineEntityReplacementText( "ccedil", "\u00e7" );
198 parser.defineEntityReplacementText( "egrave", "\u00e8" );
199 parser.defineEntityReplacementText( "eacute", "\u00e9" );
200 parser.defineEntityReplacementText( "ecirc", "\u00ea" );
201 parser.defineEntityReplacementText( "euml", "\u00eb" );
202 parser.defineEntityReplacementText( "igrave", "\u00ec" );
203 parser.defineEntityReplacementText( "iacute", "\u00ed" );
204 parser.defineEntityReplacementText( "icirc", "\u00ee" );
205 parser.defineEntityReplacementText( "iuml", "\u00ef" );
206 parser.defineEntityReplacementText( "eth", "\u00f0" );
207 parser.defineEntityReplacementText( "ntilde", "\u00f1" );
208 parser.defineEntityReplacementText( "ograve", "\u00f2" );
209 parser.defineEntityReplacementText( "oacute", "\u00f3" );
210 parser.defineEntityReplacementText( "ocirc", "\u00f4" );
211 parser.defineEntityReplacementText( "otilde", "\u00f5" );
212 parser.defineEntityReplacementText( "ouml", "\u00f6" );
213 parser.defineEntityReplacementText( "divide", "\u00f7" );
214 parser.defineEntityReplacementText( "oslash", "\u00f8" );
215 parser.defineEntityReplacementText( "ugrave", "\u00f9" );
216 parser.defineEntityReplacementText( "uacute", "\u00fa" );
217 parser.defineEntityReplacementText( "ucirc", "\u00fb" );
218 parser.defineEntityReplacementText( "uuml", "\u00fc" );
219 parser.defineEntityReplacementText( "yacute", "\u00fd" );
220 parser.defineEntityReplacementText( "thorn", "\u00fe" );
221 parser.defineEntityReplacementText( "yuml", "\u00ff" );
222
223
224
225
226
227 parser.defineEntityReplacementText( "OElig", "\u0152" );
228 parser.defineEntityReplacementText( "oelig", "\u0153" );
229 parser.defineEntityReplacementText( "Scaron", "\u0160" );
230 parser.defineEntityReplacementText( "scaron", "\u0161" );
231 parser.defineEntityReplacementText( "Yuml", "\u0178" );
232 parser.defineEntityReplacementText( "circ", "\u02c6" );
233 parser.defineEntityReplacementText( "tilde", "\u02dc" );
234 parser.defineEntityReplacementText( "ensp", "\u2002" );
235 parser.defineEntityReplacementText( "emsp", "\u2003" );
236 parser.defineEntityReplacementText( "thinsp", "\u2009" );
237 parser.defineEntityReplacementText( "zwnj", "\u200c" );
238 parser.defineEntityReplacementText( "zwj", "\u200d" );
239 parser.defineEntityReplacementText( "lrm", "\u200e" );
240 parser.defineEntityReplacementText( "rlm", "\u200f" );
241 parser.defineEntityReplacementText( "ndash", "\u2013" );
242 parser.defineEntityReplacementText( "mdash", "\u2014" );
243 parser.defineEntityReplacementText( "lsquo", "\u2018" );
244 parser.defineEntityReplacementText( "rsquo", "\u2019" );
245 parser.defineEntityReplacementText( "sbquo", "\u201a" );
246 parser.defineEntityReplacementText( "ldquo", "\u201c" );
247 parser.defineEntityReplacementText( "rdquo", "\u201d" );
248 parser.defineEntityReplacementText( "bdquo", "\u201e" );
249 parser.defineEntityReplacementText( "dagger", "\u2020" );
250 parser.defineEntityReplacementText( "Dagger", "\u2021" );
251 parser.defineEntityReplacementText( "permil", "\u2030" );
252 parser.defineEntityReplacementText( "lsaquo", "\u2039" );
253 parser.defineEntityReplacementText( "rsaquo", "\u203a" );
254 parser.defineEntityReplacementText( "euro", "\u20ac" );
255
256
257
258
259
260 parser.defineEntityReplacementText( "fnof", "\u0192" );
261 parser.defineEntityReplacementText( "Alpha", "\u0391" );
262 parser.defineEntityReplacementText( "Beta", "\u0392" );
263 parser.defineEntityReplacementText( "Gamma", "\u0393" );
264 parser.defineEntityReplacementText( "Delta", "\u0394" );
265 parser.defineEntityReplacementText( "Epsilon", "\u0395" );
266 parser.defineEntityReplacementText( "Zeta", "\u0396" );
267 parser.defineEntityReplacementText( "Eta", "\u0397" );
268 parser.defineEntityReplacementText( "Theta", "\u0398" );
269 parser.defineEntityReplacementText( "Iota", "\u0399" );
270 parser.defineEntityReplacementText( "Kappa", "\u039a" );
271 parser.defineEntityReplacementText( "Lambda", "\u039b" );
272 parser.defineEntityReplacementText( "Mu", "\u039c" );
273 parser.defineEntityReplacementText( "Nu", "\u039d" );
274 parser.defineEntityReplacementText( "Xi", "\u039e" );
275 parser.defineEntityReplacementText( "Omicron", "\u039f" );
276 parser.defineEntityReplacementText( "Pi", "\u03a0" );
277 parser.defineEntityReplacementText( "Rho", "\u03a1" );
278 parser.defineEntityReplacementText( "Sigma", "\u03a3" );
279 parser.defineEntityReplacementText( "Tau", "\u03a4" );
280 parser.defineEntityReplacementText( "Upsilon", "\u03a5" );
281 parser.defineEntityReplacementText( "Phi", "\u03a6" );
282 parser.defineEntityReplacementText( "Chi", "\u03a7" );
283 parser.defineEntityReplacementText( "Psi", "\u03a8" );
284 parser.defineEntityReplacementText( "Omega", "\u03a9" );
285 parser.defineEntityReplacementText( "alpha", "\u03b1" );
286 parser.defineEntityReplacementText( "beta", "\u03b2" );
287 parser.defineEntityReplacementText( "gamma", "\u03b3" );
288 parser.defineEntityReplacementText( "delta", "\u03b4" );
289 parser.defineEntityReplacementText( "epsilon", "\u03b5" );
290 parser.defineEntityReplacementText( "zeta", "\u03b6" );
291 parser.defineEntityReplacementText( "eta", "\u03b7" );
292 parser.defineEntityReplacementText( "theta", "\u03b8" );
293 parser.defineEntityReplacementText( "iota", "\u03b9" );
294 parser.defineEntityReplacementText( "kappa", "\u03ba" );
295 parser.defineEntityReplacementText( "lambda", "\u03bb" );
296 parser.defineEntityReplacementText( "mu", "\u03bc" );
297 parser.defineEntityReplacementText( "nu", "\u03bd" );
298 parser.defineEntityReplacementText( "xi", "\u03be" );
299 parser.defineEntityReplacementText( "omicron", "\u03bf" );
300 parser.defineEntityReplacementText( "pi", "\u03c0" );
301 parser.defineEntityReplacementText( "rho", "\u03c1" );
302 parser.defineEntityReplacementText( "sigmaf", "\u03c2" );
303 parser.defineEntityReplacementText( "sigma", "\u03c3" );
304 parser.defineEntityReplacementText( "tau", "\u03c4" );
305 parser.defineEntityReplacementText( "upsilon", "\u03c5" );
306 parser.defineEntityReplacementText( "phi", "\u03c6" );
307 parser.defineEntityReplacementText( "chi", "\u03c7" );
308 parser.defineEntityReplacementText( "psi", "\u03c8" );
309 parser.defineEntityReplacementText( "omega", "\u03c9" );
310 parser.defineEntityReplacementText( "thetasym", "\u03d1" );
311 parser.defineEntityReplacementText( "upsih", "\u03d2" );
312 parser.defineEntityReplacementText( "piv", "\u03d6" );
313 parser.defineEntityReplacementText( "bull", "\u2022" );
314 parser.defineEntityReplacementText( "hellip", "\u2026" );
315 parser.defineEntityReplacementText( "prime", "\u2032" );
316 parser.defineEntityReplacementText( "Prime", "\u2033" );
317 parser.defineEntityReplacementText( "oline", "\u203e" );
318 parser.defineEntityReplacementText( "frasl", "\u2044" );
319 parser.defineEntityReplacementText( "weierp", "\u2118" );
320 parser.defineEntityReplacementText( "image", "\u2111" );
321 parser.defineEntityReplacementText( "real", "\u211c" );
322 parser.defineEntityReplacementText( "trade", "\u2122" );
323 parser.defineEntityReplacementText( "alefsym", "\u2135" );
324 parser.defineEntityReplacementText( "larr", "\u2190" );
325 parser.defineEntityReplacementText( "uarr", "\u2191" );
326 parser.defineEntityReplacementText( "rarr", "\u2192" );
327 parser.defineEntityReplacementText( "darr", "\u2193" );
328 parser.defineEntityReplacementText( "harr", "\u2194" );
329 parser.defineEntityReplacementText( "crarr", "\u21b5" );
330 parser.defineEntityReplacementText( "lArr", "\u21d0" );
331 parser.defineEntityReplacementText( "uArr", "\u21d1" );
332 parser.defineEntityReplacementText( "rArr", "\u21d2" );
333 parser.defineEntityReplacementText( "dArr", "\u21d3" );
334 parser.defineEntityReplacementText( "hArr", "\u21d4" );
335 parser.defineEntityReplacementText( "forall", "\u2200" );
336 parser.defineEntityReplacementText( "part", "\u2202" );
337 parser.defineEntityReplacementText( "exist", "\u2203" );
338 parser.defineEntityReplacementText( "empty", "\u2205" );
339 parser.defineEntityReplacementText( "nabla", "\u2207" );
340 parser.defineEntityReplacementText( "isin", "\u2208" );
341 parser.defineEntityReplacementText( "notin", "\u2209" );
342 parser.defineEntityReplacementText( "ni", "\u220b" );
343 parser.defineEntityReplacementText( "prod", "\u220f" );
344 parser.defineEntityReplacementText( "sum", "\u2211" );
345 parser.defineEntityReplacementText( "minus", "\u2212" );
346 parser.defineEntityReplacementText( "lowast", "\u2217" );
347 parser.defineEntityReplacementText( "radic", "\u221a" );
348 parser.defineEntityReplacementText( "prop", "\u221d" );
349 parser.defineEntityReplacementText( "infin", "\u221e" );
350 parser.defineEntityReplacementText( "ang", "\u2220" );
351 parser.defineEntityReplacementText( "and", "\u2227" );
352 parser.defineEntityReplacementText( "or", "\u2228" );
353 parser.defineEntityReplacementText( "cap", "\u2229" );
354 parser.defineEntityReplacementText( "cup", "\u222a" );
355 parser.defineEntityReplacementText( "int", "\u222b" );
356 parser.defineEntityReplacementText( "there4", "\u2234" );
357 parser.defineEntityReplacementText( "sim", "\u223c" );
358 parser.defineEntityReplacementText( "cong", "\u2245" );
359 parser.defineEntityReplacementText( "asymp", "\u2248" );
360 parser.defineEntityReplacementText( "ne", "\u2260" );
361 parser.defineEntityReplacementText( "equiv", "\u2261" );
362 parser.defineEntityReplacementText( "le", "\u2264" );
363 parser.defineEntityReplacementText( "ge", "\u2265" );
364 parser.defineEntityReplacementText( "sub", "\u2282" );
365 parser.defineEntityReplacementText( "sup", "\u2283" );
366 parser.defineEntityReplacementText( "nsub", "\u2284" );
367 parser.defineEntityReplacementText( "sube", "\u2286" );
368 parser.defineEntityReplacementText( "supe", "\u2287" );
369 parser.defineEntityReplacementText( "oplus", "\u2295" );
370 parser.defineEntityReplacementText( "otimes", "\u2297" );
371 parser.defineEntityReplacementText( "perp", "\u22a5" );
372 parser.defineEntityReplacementText( "sdot", "\u22c5" );
373 parser.defineEntityReplacementText( "lceil", "\u2308" );
374 parser.defineEntityReplacementText( "rceil", "\u2309" );
375 parser.defineEntityReplacementText( "lfloor", "\u230a" );
376 parser.defineEntityReplacementText( "rfloor", "\u230b" );
377 parser.defineEntityReplacementText( "lang", "\u2329" );
378 parser.defineEntityReplacementText( "rang", "\u232a" );
379 parser.defineEntityReplacementText( "loz", "\u25ca" );
380 parser.defineEntityReplacementText( "spades", "\u2660" );
381 parser.defineEntityReplacementText( "clubs", "\u2663" );
382 parser.defineEntityReplacementText( "hearts", "\u2665" );
383 parser.defineEntityReplacementText( "diams", "\u2666" );
384 }
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407 protected boolean baseStartTag( XmlPullParser parser, Sink sink )
408 {
409 boolean visited = true;
410
411 SinkEventAttributeSet attribs = getAttributesFromParser( parser );
412
413 if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
414 {
415 handleSectionStart( sink, Sink.SECTION_LEVEL_1, attribs );
416 }
417 else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
418 {
419 handleSectionStart( sink, Sink.SECTION_LEVEL_2, attribs );
420 }
421 else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
422 {
423 handleSectionStart( sink, Sink.SECTION_LEVEL_3, attribs );
424 }
425 else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
426 {
427 handleSectionStart( sink, Sink.SECTION_LEVEL_4, attribs );
428 }
429 else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
430 {
431 handleSectionStart( sink, Sink.SECTION_LEVEL_5, attribs );
432 }
433 else if ( parser.getName().equals( HtmlMarkup.U.toString() ) )
434 {
435 decoration.addAttribute( SinkEventAttributes.DECORATION, "underline" );
436 }
437 else if ( parser.getName().equals( HtmlMarkup.S.toString() )
438 || parser.getName().equals( HtmlMarkup.STRIKE.toString() )
439 || parser.getName().equals( "del" ) )
440 {
441 decoration.addAttribute( SinkEventAttributes.DECORATION, "line-through" );
442 }
443 else if ( parser.getName().equals( HtmlMarkup.SUB.toString() ) )
444 {
445 decoration.addAttribute( SinkEventAttributes.VALIGN, "sub" );
446 }
447 else if ( parser.getName().equals( HtmlMarkup.SUP.toString() ) )
448 {
449 decoration.addAttribute( SinkEventAttributes.VALIGN, "sup" );
450 }
451 else if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
452 {
453 handlePStart( sink, attribs );
454 }
455 else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
456 {
457 visited = handleDivStart( parser, attribs, sink );
458 }
459 else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
460 {
461 handlePreStart( attribs, sink );
462 }
463 else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
464 {
465 sink.list( attribs );
466 }
467 else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
468 {
469 handleOLStart( parser, sink, attribs );
470 }
471 else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
472 {
473 handleLIStart( sink, attribs );
474 }
475 else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
476 {
477 sink.definitionList( attribs );
478 }
479 else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
480 {
481 if ( hasDefinitionListItem )
482 {
483
484 sink.definitionListItem_();
485 }
486 sink.definitionListItem( attribs );
487 hasDefinitionListItem = true;
488 sink.definedTerm( attribs );
489 }
490 else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
491 {
492 if ( !hasDefinitionListItem )
493 {
494 sink.definitionListItem( attribs );
495 }
496 sink.definition( attribs );
497 }
498 else if ( ( parser.getName().equals( HtmlMarkup.B.toString() ) )
499 || ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) ) )
500 {
501 sink.bold();
502 }
503 else if ( ( parser.getName().equals( HtmlMarkup.I.toString() ) )
504 || ( parser.getName().equals( HtmlMarkup.EM.toString() ) ) )
505 {
506 handleFigureCaptionStart( sink, attribs );
507 }
508 else if ( ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
509 || ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
510 || ( parser.getName().equals( HtmlMarkup.TT.toString() ) ) )
511 {
512 sink.monospaced();
513 }
514 else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
515 {
516 handleAStart( parser, sink, attribs );
517 }
518 else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
519 {
520 handleTableStart( sink, attribs, parser );
521 }
522 else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
523 {
524 sink.tableRow( attribs );
525 }
526 else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
527 {
528 sink.tableHeaderCell( attribs );
529 }
530 else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
531 {
532 sink.tableCell( attribs );
533 }
534 else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
535 {
536 sink.tableCaption( attribs );
537 }
538 else if ( parser.getName().equals( HtmlMarkup.BR.toString() ) )
539 {
540 sink.lineBreak( attribs );
541 }
542 else if ( parser.getName().equals( HtmlMarkup.HR.toString() ) )
543 {
544 sink.horizontalRule( attribs );
545 }
546 else if ( parser.getName().equals( HtmlMarkup.IMG.toString() ) )
547 {
548 handleImgStart( parser, sink, attribs );
549 }
550 else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() )
551 || parser.getName().equals( HtmlMarkup.STYLE.toString() ) )
552 {
553 handleUnknown( parser, sink, TAG_TYPE_START );
554 scriptBlock = true;
555 }
556 else
557 {
558 visited = false;
559 }
560
561 return visited;
562 }
563
564
565
566
567
568
569
570
571
572
573
574
575
576 protected boolean baseEndTag( XmlPullParser parser, Sink sink )
577 {
578 boolean visited = true;
579
580 if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
581 {
582 if ( !inFigure )
583 {
584 sink.paragraph_();
585 }
586 }
587 else if ( parser.getName().equals( HtmlMarkup.U.toString() )
588 || parser.getName().equals( HtmlMarkup.S.toString() )
589 || parser.getName().equals( HtmlMarkup.STRIKE.toString() )
590 || parser.getName().equals( "del" ) )
591 {
592 decoration.removeAttribute( SinkEventAttributes.DECORATION );
593 }
594 else if ( parser.getName().equals( HtmlMarkup.SUB.toString() )
595 || parser.getName().equals( HtmlMarkup.SUP.toString() ) )
596 {
597 decoration.removeAttribute( SinkEventAttributes.VALIGN );
598 }
599 else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
600 {
601 if ( inFigure )
602 {
603 sink.figure_();
604 this.inFigure = false;
605 }
606 else
607 {
608 visited = false;
609 }
610 }
611 else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
612 {
613 verbatim_();
614
615 sink.verbatim_();
616 }
617 else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
618 {
619 sink.list_();
620 }
621 else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
622 {
623 sink.numberedList_();
624 orderedListDepth--;
625 }
626 else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
627 {
628 handleListItemEnd( sink );
629 }
630 else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
631 {
632 if ( hasDefinitionListItem )
633 {
634 sink.definitionListItem_();
635 hasDefinitionListItem = false;
636 }
637 sink.definitionList_();
638 }
639 else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
640 {
641 sink.definedTerm_();
642 }
643 else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
644 {
645 sink.definition_();
646 sink.definitionListItem_();
647 hasDefinitionListItem = false;
648 }
649 else if ( ( parser.getName().equals( HtmlMarkup.B.toString() ) )
650 || ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) ) )
651 {
652 sink.bold_();
653 }
654 else if ( ( parser.getName().equals( HtmlMarkup.I.toString() ) )
655 || ( parser.getName().equals( HtmlMarkup.EM.toString() ) ) )
656 {
657 handleFigureCaptionEnd( sink );
658 }
659 else if ( ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
660 || ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
661 || ( parser.getName().equals( HtmlMarkup.TT.toString() ) ) )
662 {
663 sink.monospaced_();
664 }
665 else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
666 {
667 handleAEnd( sink );
668 }
669
670
671
672
673
674 else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
675 {
676 sink.tableRows_();
677
678 sink.table_();
679 }
680 else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
681 {
682 sink.tableRow_();
683 }
684 else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
685 {
686 sink.tableHeaderCell_();
687 }
688 else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
689 {
690 sink.tableCell_();
691 }
692 else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
693 {
694 sink.tableCaption_();
695 }
696 else if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
697 {
698 sink.sectionTitle1_();
699 }
700 else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
701 {
702 sink.sectionTitle2_();
703 }
704 else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
705 {
706 sink.sectionTitle3_();
707 }
708 else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
709 {
710 sink.sectionTitle4_();
711 }
712 else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
713 {
714 sink.sectionTitle5_();
715 }
716 else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() )
717 || parser.getName().equals( HtmlMarkup.STYLE.toString() ) )
718 {
719 handleUnknown( parser, sink, TAG_TYPE_END );
720
721 scriptBlock = false;
722 }
723 else
724 {
725 visited = false;
726 }
727
728 return visited;
729 }
730
731
732
733
734
735
736
737 protected void handleStartTag( XmlPullParser parser, Sink sink )
738 throws XmlPullParserException, MacroExecutionException
739 {
740 if ( !baseStartTag( parser, sink ) )
741 {
742 if ( getLog().isWarnEnabled() )
743 {
744 String position = "[" + parser.getLineNumber() + ":"
745 + parser.getColumnNumber() + "]";
746 String tag = "<" + parser.getName() + ">";
747
748 getLog().warn( "Unrecognized xml tag: " + tag + " at " + position );
749 }
750 }
751 }
752
753
754
755
756
757
758
759 protected void handleEndTag( XmlPullParser parser, Sink sink )
760 throws XmlPullParserException, MacroExecutionException
761 {
762 if ( !baseEndTag( parser, sink ) )
763 {
764
765 }
766 }
767
768
769 @Override
770 protected void handleText( XmlPullParser parser, Sink sink )
771 throws XmlPullParserException
772 {
773 String text = getText( parser );
774
775
776
777
778
779
780
781 if ( StringUtils.isNotEmpty( text ) && !isScriptBlock() )
782 {
783 sink.text( text, decoration );
784 }
785 }
786
787
788 @Override
789 protected void handleComment( XmlPullParser parser, Sink sink )
790 throws XmlPullParserException
791 {
792 String text = getText( parser );
793
794 if ( "PB".equals( text.trim() ) )
795 {
796 sink.pageBreak();
797 }
798 else
799 {
800 if ( isEmitComments() )
801 {
802 sink.comment( text );
803 }
804 }
805 }
806
807
808 @Override
809 protected void handleCdsect( XmlPullParser parser, Sink sink )
810 throws XmlPullParserException
811 {
812 String text = getText( parser );
813
814 if ( isScriptBlock() )
815 {
816 sink.unknown( CDATA, new Object[] { Integer.valueOf( CDATA_TYPE ), text}, null );
817 }
818 else
819 {
820 sink.text( text );
821 }
822 }
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852 protected void consecutiveSections( int newLevel, Sink sink )
853 {
854 closeOpenSections( newLevel, sink );
855 openMissingSections( newLevel, sink );
856
857 this.sectionLevel = newLevel;
858 }
859
860
861
862
863
864
865
866 private void closeOpenSections( int newLevel, Sink sink )
867 {
868 while ( this.sectionLevel >= newLevel )
869 {
870 if ( sectionLevel == Sink.SECTION_LEVEL_5 )
871 {
872 sink.section5_();
873 }
874 else if ( sectionLevel == Sink.SECTION_LEVEL_4 )
875 {
876 sink.section4_();
877 }
878 else if ( sectionLevel == Sink.SECTION_LEVEL_3 )
879 {
880 sink.section3_();
881 }
882 else if ( sectionLevel == Sink.SECTION_LEVEL_2 )
883 {
884 sink.section2_();
885 }
886 else if ( sectionLevel == Sink.SECTION_LEVEL_1 )
887 {
888 sink.section1_();
889 }
890
891 this.sectionLevel--;
892 }
893 }
894
895
896
897
898
899
900
901 private void openMissingSections( int newLevel, Sink sink )
902 {
903 while ( this.sectionLevel < newLevel - 1 )
904 {
905 this.sectionLevel++;
906
907 if ( sectionLevel == Sink.SECTION_LEVEL_5 )
908 {
909 sink.section5();
910 }
911 else if ( sectionLevel == Sink.SECTION_LEVEL_4 )
912 {
913 sink.section4();
914 }
915 else if ( sectionLevel == Sink.SECTION_LEVEL_3 )
916 {
917 sink.section3();
918 }
919 else if ( sectionLevel == Sink.SECTION_LEVEL_2 )
920 {
921 sink.section2();
922 }
923 else if ( sectionLevel == Sink.SECTION_LEVEL_1 )
924 {
925 sink.section1();
926 }
927 }
928 }
929
930
931
932
933
934
935 protected int getSectionLevel()
936 {
937 return this.sectionLevel;
938 }
939
940
941
942
943
944
945 protected void setSectionLevel( int newLevel )
946 {
947 this.sectionLevel = newLevel;
948 }
949
950
951
952
953 protected void verbatim_()
954 {
955 this.inVerbatim = false;
956 }
957
958
959
960
961 protected void verbatim()
962 {
963 this.inVerbatim = true;
964 }
965
966
967
968
969
970
971 protected boolean isVerbatim()
972 {
973 return this.inVerbatim;
974 }
975
976
977
978
979
980
981
982
983 protected boolean isScriptBlock()
984 {
985 return this.scriptBlock;
986 }
987
988
989
990
991
992
993
994
995 protected String validAnchor( String id )
996 {
997 if ( !DoxiaUtils.isValidId( id ) )
998 {
999 String linkAnchor = DoxiaUtils.encodeId( id, true );
1000
1001 String msg = "Modified invalid link: '" + id + "' to '" + linkAnchor + "'";
1002 logMessage( "modifiedLink", msg );
1003
1004 return linkAnchor;
1005 }
1006
1007 return id;
1008 }
1009
1010
1011 @Override
1012 protected void init()
1013 {
1014 super.init();
1015
1016 this.scriptBlock = false;
1017 this.isLink = false;
1018 this.isAnchor = false;
1019 this.orderedListDepth = 0;
1020 this.sectionLevel = 0;
1021 this.inVerbatim = false;
1022 this.inFigure = false;
1023 while ( this.decoration.getAttributeNames().hasMoreElements() )
1024 {
1025 this.decoration.removeAttribute( this.decoration.getAttributeNames().nextElement() );
1026 }
1027 this.warnMessages = null;
1028 }
1029
1030 private void handleAEnd( Sink sink )
1031 {
1032 if ( isLink )
1033 {
1034 sink.link_();
1035 isLink = false;
1036 }
1037 else if ( isAnchor )
1038 {
1039 sink.anchor_();
1040 isAnchor = false;
1041 }
1042 }
1043
1044 private void handleAStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1045 {
1046 String href = parser.getAttributeValue( null, Attribute.HREF.toString() );
1047
1048 if ( href != null )
1049 {
1050 int hashIndex = href.indexOf( '#' );
1051 if ( hashIndex != -1 && !DoxiaUtils.isExternalLink( href ) )
1052 {
1053 String hash = href.substring( hashIndex + 1 );
1054
1055 if ( !DoxiaUtils.isValidId( hash ) )
1056 {
1057 href = href.substring( 0, hashIndex ) + "#" + DoxiaUtils.encodeId( hash, true );
1058
1059 String msg = "Modified invalid link: '" + hash + "' to '" + href + "'";
1060 logMessage( "modifiedLink", msg );
1061 }
1062 }
1063 sink.link( href, attribs );
1064 isLink = true;
1065 }
1066 else
1067 {
1068 String name = parser.getAttributeValue( null, Attribute.NAME.toString() );
1069
1070 if ( name != null )
1071 {
1072 sink.anchor( validAnchor( name ), attribs );
1073 isAnchor = true;
1074 }
1075 else
1076 {
1077 String id = parser.getAttributeValue( null, Attribute.ID.toString() );
1078 if ( id != null )
1079 {
1080 sink.anchor( validAnchor( id ), attribs );
1081 isAnchor = true;
1082 }
1083 }
1084 }
1085 }
1086
1087 private boolean handleDivStart( XmlPullParser parser, SinkEventAttributeSet attribs, Sink sink )
1088 {
1089 boolean visited = true;
1090
1091 String divclass = parser.getAttributeValue( null, Attribute.CLASS.toString() );
1092
1093 if ( "figure".equals( divclass ) )
1094 {
1095 this.inFigure = true;
1096 SinkEventAttributeSet atts = new SinkEventAttributeSet( attribs );
1097 atts.removeAttribute( SinkEventAttributes.CLASS );
1098 sink.figure( atts );
1099 }
1100 else
1101 {
1102 visited = false;
1103 }
1104
1105 return visited;
1106 }
1107
1108 private void handleFigureCaptionEnd( Sink sink )
1109 {
1110 if ( inFigure )
1111 {
1112 sink.figureCaption_();
1113 }
1114 else
1115 {
1116 sink.italic_();
1117 }
1118 }
1119
1120 private void handleFigureCaptionStart( Sink sink, SinkEventAttributeSet attribs )
1121 {
1122 if ( inFigure )
1123 {
1124 sink.figureCaption( attribs );
1125 }
1126 else
1127 {
1128 sink.italic();
1129 }
1130 }
1131
1132 private void handleImgStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1133 {
1134 String src = parser.getAttributeValue( null, Attribute.SRC.toString() );
1135
1136 if ( src != null )
1137 {
1138 sink.figureGraphics( src, attribs );
1139 }
1140 }
1141
1142 private void handleLIStart( Sink sink, SinkEventAttributeSet attribs )
1143 {
1144 if ( orderedListDepth == 0 )
1145 {
1146 sink.listItem( attribs );
1147 }
1148 else
1149 {
1150 sink.numberedListItem( attribs );
1151 }
1152 }
1153
1154 private void handleListItemEnd( Sink sink )
1155 {
1156 if ( orderedListDepth == 0 )
1157 {
1158 sink.listItem_();
1159 }
1160 else
1161 {
1162 sink.numberedListItem_();
1163 }
1164 }
1165
1166 private void handleOLStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1167 {
1168 int numbering = Sink.NUMBERING_DECIMAL;
1169
1170 String style = parser.getAttributeValue( null, Attribute.STYLE.toString() );
1171
1172 if ( style != null )
1173 {
1174 if ( "list-style-type: upper-alpha".equals( style ) )
1175 {
1176 numbering = Sink.NUMBERING_UPPER_ALPHA;
1177 }
1178 else if ( "list-style-type: lower-alpha".equals( style ) )
1179 {
1180 numbering = Sink.NUMBERING_LOWER_ALPHA;
1181 }
1182 else if ( "list-style-type: upper-roman".equals( style ) )
1183 {
1184 numbering = Sink.NUMBERING_UPPER_ROMAN;
1185 }
1186 else if ( "list-style-type: lower-roman".equals( style ) )
1187 {
1188 numbering = Sink.NUMBERING_LOWER_ROMAN;
1189 }
1190 else if ( "list-style-type: decimal".equals( style ) )
1191 {
1192 numbering = Sink.NUMBERING_DECIMAL;
1193 }
1194 }
1195
1196 sink.numberedList( numbering, attribs );
1197 orderedListDepth++;
1198 }
1199
1200 private void handlePStart( Sink sink, SinkEventAttributeSet attribs )
1201 {
1202 if ( !inFigure )
1203 {
1204 sink.paragraph( attribs );
1205 }
1206 }
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218 private void handlePreStart( SinkEventAttributeSet attribs, Sink sink )
1219 {
1220 verbatim();
1221 attribs.removeAttribute( SinkEventAttributes.DECORATION );
1222 sink.verbatim( attribs );
1223 }
1224
1225 private void handleSectionStart( Sink sink, int level, SinkEventAttributeSet attribs )
1226 {
1227 consecutiveSections( level, sink );
1228 sink.section( level, attribs );
1229 sink.sectionTitle( level, attribs );
1230 }
1231
1232 private void handleTableStart( Sink sink, SinkEventAttributeSet attribs, XmlPullParser parser )
1233 {
1234 sink.table( attribs );
1235 String border = parser.getAttributeValue( null, Attribute.BORDER.toString() );
1236 boolean grid = true;
1237
1238 if ( border == null || "0".equals( border ) )
1239 {
1240 grid = false;
1241 }
1242
1243 String align = parser.getAttributeValue( null, Attribute.ALIGN.toString() );
1244 int[] justif = {Sink.JUSTIFY_LEFT};
1245
1246 if ( "center".equals( align ) )
1247 {
1248 justif[0] = Sink.JUSTIFY_CENTER;
1249 }
1250 else if ( "right".equals( align ) )
1251 {
1252 justif[0] = Sink.JUSTIFY_RIGHT;
1253 }
1254
1255 sink.tableRows( justif, grid );
1256 }
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266 private void logMessage( String key, String msg )
1267 {
1268 final String log = "[XHTML Parser] " + msg;
1269 if ( getLog().isDebugEnabled() )
1270 {
1271 getLog().debug( log );
1272
1273 return;
1274 }
1275
1276 if ( warnMessages == null )
1277 {
1278 warnMessages = new HashMap<String, Set<String>>();
1279 }
1280
1281 Set<String> set = warnMessages.get( key );
1282 if ( set == null )
1283 {
1284 set = new TreeSet<String>();
1285 }
1286 set.add( log );
1287 warnMessages.put( key, set );
1288 }
1289
1290
1291
1292
1293 private void logWarnings()
1294 {
1295 if ( getLog().isWarnEnabled() && this.warnMessages != null && !isSecondParsing() )
1296 {
1297 for ( Map.Entry<String, Set<String>> entry : this.warnMessages.entrySet() )
1298 {
1299 for ( String msg : entry.getValue() )
1300 {
1301 getLog().warn( msg );
1302 }
1303 }
1304
1305 this.warnMessages = null;
1306 }
1307 }
1308 }