View Javadoc
1   package org.apache.maven.doxia.parser;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.io.Reader;
23  import java.util.HashMap;
24  import java.util.Map;
25  import java.util.Set;
26  import java.util.Stack;
27  import java.util.TreeSet;
28  
29  import javax.swing.text.html.HTML.Attribute;
30  
31  import org.apache.maven.doxia.macro.MacroExecutionException;
32  import org.apache.maven.doxia.markup.HtmlMarkup;
33  import org.apache.maven.doxia.sink.Sink;
34  import org.apache.maven.doxia.sink.SinkEventAttributes;
35  import org.apache.maven.doxia.sink.impl.SinkEventAttributeSet;
36  import org.apache.maven.doxia.util.DoxiaUtils;
37  import org.codehaus.plexus.util.StringUtils;
38  import org.codehaus.plexus.util.xml.pull.XmlPullParser;
39  import org.codehaus.plexus.util.xml.pull.XmlPullParserException;
40  
41  /**
42   * Common base parser for xhtml5 events.
43   */
44  public class Xhtml5BaseParser
45      extends AbstractXmlParser
46          implements HtmlMarkup
47  {
48      /**
49       * True if a <script></script> or <style></style> block is read. CDATA sections within are
50       * handled as rawText.
51       */
52      private boolean scriptBlock;
53  
54      /** Used to distinguish <a href=""> from <a name="">. */
55      private boolean isLink;
56  
57      /** Used to distinguish <a href=""> from <a name="">. */
58      private boolean isAnchor;
59  
60      /** Used for nested lists. */
61      private int orderedListDepth = 0;
62  
63      /** Counts section level. */
64      private int sectionLevel;
65  
66      /** Counts heading level. */
67      private int headingLevel;
68  
69      /** Verbatim flag, true whenever we are inside a <pre> tag. */
70      private boolean inVerbatim;
71  
72      /** Used to keep track of closing tags for content events */
73      private Stack<String> divStack = new Stack<>();
74  
75      /** Used to wrap the definedTerm with its definition, even when one is omitted */
76      boolean hasDefinitionListItem = false;
77  
78      /** Map of warn messages with a String as key to describe the error type and a Set as value.
79       * Used to reduce the number of warn messages. */
80      private Map<String, Set<String>> warnMessages;
81  
82      /** {@inheritDoc} */
83      @Override
84      public void parse( Reader source, Sink sink, String reference )
85          throws ParseException
86      {
87          init();
88  
89          try
90          {
91              super.parse( source, sink, reference );
92          }
93          finally
94          {
95              logWarnings();
96  
97              setSecondParsing( false );
98              init();
99          }
100     }
101 
102     /**
103      * {@inheritDoc}
104      *
105      * Adds all XHTML (HTML 5.2) entities to the parser so that they can be recognized and resolved
106      * without additional DTD.
107      */
108     @Override
109     protected void initXmlParser( XmlPullParser parser )
110         throws XmlPullParserException
111     {
112         super.initXmlParser( parser );
113     }
114 
115     /**
116      * <p>
117      *   Goes through a common list of possible html5 start tags. These include only tags that can go into
118      *   the body of an xhtml5 document and so should be re-usable by different xhtml-based parsers.
119      * </p>
120      * <p>
121      *   The currently handled tags are:
122      * </p>
123      * <p>
124      *   <code>
125      *      &lt;article&gt;, &lt;nav&gt;, &lt;aside&gt;, &lt;section&gt;, &lt;h2&gt;, &lt;h3&gt;, &lt;h4&gt;,
126      *      &lt;h5&gt;, &lt;h6&gt;, &lt;header&gt;, &lt;main&gt;, &lt;footer&gt;, &lt;em&gt;, &lt;strong&gt;,
127      *      &lt;small&gt;, &lt;s&gt;, &lt;cite&gt;, &lt;q&gt;, &lt;dfn&gt;, &lt;abbr&gt;, &lt;i&gt;,
128      *      &lt;b&gt;, &lt;code&gt;, &lt;samp&gt;, &lt;kbd&gt;, &lt;sub&gt;, &lt;sup&gt;, &lt;u&gt;,
129      *      &lt;mark&gt;, &lt;ruby&gt;, &lt;rb&gt;, &lt;rt&gt;, &lt;rtc&gt;, &lt;rp&gt;, &lt;bdi&gt;,
130      *      &lt;bdo&gt;, &lt;span&gt;, &lt;ins&gt;, &lt;del&gt;, &lt;p&gt;, &lt;pre&gt;, &lt;ul&gt;,
131      *      &lt;ol&gt;, &lt;li&gt;, &lt;dl&gt;, &lt;dt&gt;, &lt;dd&gt;, &lt;a&gt;, &lt;table&gt;,
132      *      &lt;tr&gt;, &lt;th&gt;, &lt;td&gt;, &lt;caption&gt;, &lt;br/&gt;, &lt;wbr/&gt;, &lt;hr/&gt;,
133      *      &lt;img/&gt;.
134      *   </code>
135      * </p>
136      *
137      * @param parser A parser.
138      * @param sink the sink to receive the events.
139      * @return True if the event has been handled by this method, i.e. the tag was recognized, false otherwise.
140      */
141     protected boolean baseStartTag( XmlPullParser parser, Sink sink )
142     {
143         boolean visited = true;
144 
145         SinkEventAttributeSet attribs = getAttributesFromParser( parser );
146 
147         if ( parser.getName().equals( HtmlMarkup.ARTICLE.toString() ) )
148         {
149             sink.article( attribs );
150         }
151         else if ( parser.getName().equals( HtmlMarkup.NAV.toString() ) )
152         {
153             sink.navigation( attribs );
154         }
155         else if ( parser.getName().equals( HtmlMarkup.ASIDE.toString() ) )
156         {
157             sink.sidebar( attribs );
158         }
159         else if ( parser.getName().equals( HtmlMarkup.SECTION.toString() ) )
160         {
161             handleSectionStart( sink, attribs );
162         }
163         else if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
164         {
165             handleHeadingStart( sink, Sink.SECTION_LEVEL_1, attribs );
166         }
167         else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
168         {
169             handleHeadingStart( sink, Sink.SECTION_LEVEL_2, attribs );
170         }
171         else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
172         {
173             handleHeadingStart( sink, Sink.SECTION_LEVEL_3, attribs );
174         }
175         else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
176         {
177             handleHeadingStart( sink, Sink.SECTION_LEVEL_4, attribs );
178         }
179         else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
180         {
181             handleHeadingStart( sink, Sink.SECTION_LEVEL_5, attribs );
182         }
183         else if ( parser.getName().equals( HtmlMarkup.HEADER.toString() ) )
184         {
185             sink.header( attribs );
186         }
187         else if ( parser.getName().equals( HtmlMarkup.MAIN.toString() ) )
188         {
189             sink.content( attribs );
190         }
191         else if ( parser.getName().equals( HtmlMarkup.FOOTER.toString() ) )
192         {
193             sink.footer( attribs );
194         }
195         else if ( parser.getName().equals( HtmlMarkup.EM.toString() ) )
196         {
197             attribs.addAttributes( SinkEventAttributeSet.Semantics.EMPHASIS );
198             sink.inline( attribs );
199         }
200         else if ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) )
201         {
202             attribs.addAttributes( SinkEventAttributeSet.Semantics.STRONG );
203             sink.inline( attribs );
204         }
205         else if ( parser.getName().equals( HtmlMarkup.SMALL.toString() ) )
206         {
207             attribs.addAttributes( SinkEventAttributeSet.Semantics.SMALL );
208             sink.inline( attribs );
209         }
210         else if ( parser.getName().equals( HtmlMarkup.S.toString() ) )
211         {
212             attribs.addAttributes( SinkEventAttributeSet.Semantics.LINE_THROUGH );
213             sink.inline( attribs );
214             /* deprecated line-through support */
215         }
216         else if ( parser.getName().equals( HtmlMarkup.CITE.toString() ) )
217         {
218             attribs.addAttributes( SinkEventAttributeSet.Semantics.CITATION );
219             sink.inline( attribs );
220         }
221         else if ( parser.getName().equals( HtmlMarkup.Q.toString() ) )
222         {
223             attribs.addAttributes( SinkEventAttributeSet.Semantics.QUOTE );
224             sink.inline( attribs );
225         }
226         else if ( parser.getName().equals( HtmlMarkup.DFN.toString() ) )
227         {
228             attribs.addAttributes( SinkEventAttributeSet.Semantics.DEFINITION );
229             sink.inline( attribs );
230         }
231         else if ( parser.getName().equals( HtmlMarkup.ABBR.toString() ) )
232         {
233             attribs.addAttributes( SinkEventAttributeSet.Semantics.ABBREVIATION );
234             sink.inline( attribs );
235         }
236         else if ( parser.getName().equals( HtmlMarkup.I.toString() ) )
237         {
238             attribs.addAttributes( SinkEventAttributeSet.Semantics.ITALIC );
239             sink.inline( attribs );
240         }
241         else if ( parser.getName().equals( HtmlMarkup.B.toString() ) )
242         {
243             attribs.addAttributes( SinkEventAttributeSet.Semantics.BOLD );
244             sink.inline( attribs );
245         }
246         else if ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
247         {
248             attribs.addAttributes( SinkEventAttributeSet.Semantics.CODE );
249             sink.inline( attribs );
250         }
251         else if ( parser.getName().equals( HtmlMarkup.VAR.toString() ) )
252         {
253             attribs.addAttributes( SinkEventAttributeSet.Semantics.VARIABLE );
254             sink.inline( attribs );
255         }
256         else if ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
257         {
258             attribs.addAttributes( SinkEventAttributeSet.Semantics.SAMPLE );
259             sink.inline( attribs );
260         }
261         else if ( parser.getName().equals( HtmlMarkup.KBD.toString() ) )
262         {
263             attribs.addAttributes( SinkEventAttributeSet.Semantics.KEYBOARD );
264             sink.inline( attribs );
265         }
266         else if ( parser.getName().equals( HtmlMarkup.SUP.toString() ) )
267         {
268             attribs.addAttributes( SinkEventAttributeSet.Semantics.SUPERSCRIPT );
269             sink.inline( attribs );
270         }
271         else if ( parser.getName().equals( HtmlMarkup.SUB.toString() ) )
272         {
273             attribs.addAttributes( SinkEventAttributeSet.Semantics.SUBSCRIPT );
274             sink.inline( attribs );
275         }
276         else if ( parser.getName().equals( HtmlMarkup.U.toString() ) )
277         {
278             attribs.addAttributes( SinkEventAttributeSet.Semantics.ANNOTATION );
279             sink.inline( attribs );
280         }
281         else if ( parser.getName().equals( HtmlMarkup.MARK.toString() ) )
282         {
283             attribs.addAttributes( SinkEventAttributeSet.Semantics.HIGHLIGHT );
284             sink.inline( attribs );
285         }
286         else if ( parser.getName().equals( HtmlMarkup.RUBY.toString() ) )
287         {
288             attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY );
289             sink.inline( attribs );
290         }
291         else if ( parser.getName().equals( HtmlMarkup.RB.toString() ) )
292         {
293             attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY_BASE );
294             sink.inline( attribs );
295         }
296         else if ( parser.getName().equals( HtmlMarkup.RT.toString() ) )
297         {
298             attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY_TEXT );
299             sink.inline( attribs );
300         }
301         else if ( parser.getName().equals( HtmlMarkup.RTC.toString() ) )
302         {
303             attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY_TEXT_CONTAINER );
304             sink.inline( attribs );
305         }
306         else if ( parser.getName().equals( HtmlMarkup.RP.toString() ) )
307         {
308             attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY_PARANTHESES );
309             sink.inline( attribs );
310         }
311         else if ( parser.getName().equals( HtmlMarkup.BDI.toString() ) )
312         {
313             attribs.addAttributes( SinkEventAttributeSet.Semantics.BIDIRECTIONAL_ISOLATION );
314             sink.inline( attribs );
315         }
316         else if ( parser.getName().equals( HtmlMarkup.BDO.toString() ) )
317         {
318             attribs.addAttributes( SinkEventAttributeSet.Semantics.BIDIRECTIONAL_OVERRIDE );
319             sink.inline( attribs );
320         }
321         else if ( parser.getName().equals( HtmlMarkup.SPAN.toString() ) )
322         {
323             attribs.addAttributes( SinkEventAttributeSet.Semantics.PHRASE );
324             sink.inline( attribs );
325         }
326         else if ( parser.getName().equals( HtmlMarkup.INS.toString() ) )
327         {
328             attribs.addAttributes( SinkEventAttributeSet.Semantics.INSERT );
329             sink.inline( attribs );
330         }
331         else if ( parser.getName().equals( HtmlMarkup.DEL.toString() ) )
332         {
333             attribs.addAttributes( SinkEventAttributeSet.Semantics.DELETE );
334             sink.inline( attribs );
335         }
336         else if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
337         {
338             handlePStart( sink, attribs );
339         }
340         else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
341         {
342             handleDivStart( parser, attribs, sink );
343         }
344         else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
345         {
346             handlePreStart( attribs, sink );
347         }
348         else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
349         {
350             sink.list( attribs );
351         }
352         else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
353         {
354             handleOLStart( parser, sink, attribs );
355         }
356         else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
357         {
358             handleLIStart( sink, attribs );
359         }
360         else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
361         {
362             sink.definitionList( attribs );
363         }
364         else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
365         {
366             if ( hasDefinitionListItem )
367             {
368                 // close previous listItem
369                 sink.definitionListItem_();
370             }
371             sink.definitionListItem( attribs );
372             hasDefinitionListItem = true;
373             sink.definedTerm( attribs );
374         }
375         else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
376         {
377             if ( !hasDefinitionListItem )
378             {
379                 sink.definitionListItem( attribs );
380             }
381             sink.definition( attribs );
382         }
383         else if ( ( parser.getName().equals( HtmlMarkup.FIGURE.toString() ) ) )
384         {
385             sink.figure( attribs );
386         }
387         else if ( ( parser.getName().equals( HtmlMarkup.FIGCAPTION.toString() ) ) )
388         {
389             sink.figureCaption( attribs );
390         }
391         else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
392         {
393             handleAStart( parser, sink, attribs );
394         }
395         else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
396         {
397             handleTableStart( sink, attribs, parser );
398         }
399         else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
400         {
401             sink.tableRow( attribs );
402         }
403         else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
404         {
405             sink.tableHeaderCell( attribs );
406         }
407         else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
408         {
409             sink.tableCell( attribs );
410         }
411         else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
412         {
413             sink.tableCaption( attribs );
414         }
415         else if ( parser.getName().equals( HtmlMarkup.BR.toString() ) )
416         {
417             sink.lineBreak( attribs );
418         }
419         else if ( parser.getName().equals( HtmlMarkup.WBR.toString() ) )
420         {
421             sink.lineBreakOpportunity( attribs );
422         }
423         else if ( parser.getName().equals( HtmlMarkup.HR.toString() ) )
424         {
425             sink.horizontalRule( attribs );
426         }
427         else if ( parser.getName().equals( HtmlMarkup.IMG.toString() ) )
428         {
429             handleImgStart( parser, sink, attribs );
430         }
431         else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() )
432             || parser.getName().equals( HtmlMarkup.STYLE.toString() ) )
433         {
434             handleUnknown( parser, sink, TAG_TYPE_START );
435             scriptBlock = true;
436         }
437         else
438         {
439             visited = false;
440         }
441 
442         return visited;
443     }
444 
445     /**
446      * <p>
447      *   Goes through a common list of possible html end tags.
448      *   These should be re-usable by different xhtml-based parsers.
449      *   The tags handled here are the same as for {@link #baseStartTag(XmlPullParser,Sink)},
450      *   except for the empty elements ({@code <br/>, <hr/>, <img/>}).
451      * </p>
452      *
453      * @param parser A parser.
454      * @param sink the sink to receive the events.
455      * @return True if the event has been handled by this method, false otherwise.
456      */
457     protected boolean baseEndTag( XmlPullParser parser, Sink sink )
458     {
459         boolean visited = true;
460 
461         if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
462         {
463             sink.paragraph_();
464         }
465         else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
466         {
467             handleDivEnd( sink );
468         }
469         else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
470         {
471             verbatim_();
472 
473             sink.verbatim_();
474         }
475         else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
476         {
477             sink.list_();
478         }
479         else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
480         {
481             sink.numberedList_();
482             orderedListDepth--;
483         }
484         else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
485         {
486             handleListItemEnd( sink );
487         }
488         else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
489         {
490             if ( hasDefinitionListItem )
491             {
492                 sink.definitionListItem_();
493                 hasDefinitionListItem = false;
494             }
495             sink.definitionList_();
496         }
497         else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
498         {
499             sink.definedTerm_();
500         }
501         else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
502         {
503             sink.definition_();
504             sink.definitionListItem_();
505             hasDefinitionListItem = false;
506         }
507         else if ( ( parser.getName().equals( HtmlMarkup.FIGURE.toString() ) ) )
508         {
509             sink.figure_();
510         }
511         else if ( ( parser.getName().equals( HtmlMarkup.FIGCAPTION.toString() ) ) )
512         {
513             sink.figureCaption_();
514         }
515         else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
516         {
517             handleAEnd( sink );
518         }
519 
520         else if ( parser.getName().equals( HtmlMarkup.EM.toString() ) )
521         {
522             sink.inline_();
523         }
524         else if ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) )
525         {
526             sink.inline_();
527         }
528         else if ( parser.getName().equals( HtmlMarkup.SMALL.toString() ) )
529         {
530             sink.inline_();
531         }
532         else if ( parser.getName().equals( HtmlMarkup.S.toString() ) )
533         {
534             sink.inline_();
535         }
536         else if ( parser.getName().equals( HtmlMarkup.CITE.toString() ) )
537         {
538             sink.inline_();
539         }
540         else if ( parser.getName().equals( HtmlMarkup.Q.toString() ) )
541         {
542             sink.inline_();
543         }
544         else if ( parser.getName().equals( HtmlMarkup.DFN.toString() ) )
545         {
546             sink.inline_();
547         }
548         else if ( parser.getName().equals( HtmlMarkup.ABBR.toString() ) )
549         {
550             sink.inline_();
551         }
552         else if ( parser.getName().equals( HtmlMarkup.I.toString() ) )
553         {
554             sink.inline_();
555         }
556         else if ( parser.getName().equals( HtmlMarkup.B.toString() ) )
557         {
558             sink.inline_();
559         }
560         else if ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
561         {
562             sink.inline_();
563         }
564         else if ( parser.getName().equals( HtmlMarkup.VAR.toString() ) )
565         {
566             sink.inline_();
567         }
568         else if ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
569         {
570             sink.inline_();
571         }
572         else if ( parser.getName().equals( HtmlMarkup.KBD.toString() ) )
573         {
574             sink.inline_();
575         }
576         else if ( parser.getName().equals( HtmlMarkup.SUP.toString() ) )
577         {
578             sink.inline_();
579         }
580         else if ( parser.getName().equals( HtmlMarkup.SUB.toString() ) )
581         {
582             sink.inline_();
583         }
584         else if ( parser.getName().equals( HtmlMarkup.U.toString() ) )
585         {
586             sink.inline_();
587         }
588         else if ( parser.getName().equals( HtmlMarkup.MARK.toString() ) )
589         {
590             sink.inline_();
591         }
592         else if ( parser.getName().equals( HtmlMarkup.RUBY.toString() ) )
593         {
594             sink.inline_();
595         }
596         else if ( parser.getName().equals( HtmlMarkup.RB.toString() ) )
597         {
598             sink.inline_();
599         }
600         else if ( parser.getName().equals( HtmlMarkup.RT.toString() ) )
601         {
602             sink.inline_();
603         }
604         else if ( parser.getName().equals( HtmlMarkup.RTC.toString() ) )
605         {
606             sink.inline_();
607         }
608         else if ( parser.getName().equals( HtmlMarkup.RP.toString() ) )
609         {
610             sink.inline_();
611         }
612         else if ( parser.getName().equals( HtmlMarkup.BDI.toString() ) )
613         {
614             sink.inline_();
615         }
616         else if ( parser.getName().equals( HtmlMarkup.BDO.toString() ) )
617         {
618             sink.inline_();
619         }
620         else if ( parser.getName().equals( HtmlMarkup.SPAN.toString() ) )
621         {
622             sink.inline_();
623         }
624         else if ( parser.getName().equals( HtmlMarkup.INS.toString() ) )
625         {
626             sink.inline_();
627         }
628         else if ( parser.getName().equals( HtmlMarkup.DEL.toString() ) )
629         {
630             sink.inline_();
631         }
632 
633         // ----------------------------------------------------------------------
634         // Tables
635         // ----------------------------------------------------------------------
636 
637         else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
638         {
639             sink.tableRows_();
640 
641             sink.table_();
642         }
643         else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
644         {
645             sink.tableRow_();
646         }
647         else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
648         {
649             sink.tableHeaderCell_();
650         }
651         else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
652         {
653             sink.tableCell_();
654         }
655         else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
656         {
657             sink.tableCaption_();
658         }
659         else if ( parser.getName().equals( HtmlMarkup.ARTICLE.toString() ) )
660         {
661             sink.article_();
662         }
663         else if ( parser.getName().equals( HtmlMarkup.NAV.toString() ) )
664         {
665             sink.navigation_();
666         }
667         else if ( parser.getName().equals( HtmlMarkup.ASIDE.toString() ) )
668         {
669             sink.sidebar_();
670         }
671         else if ( parser.getName().equals( HtmlMarkup.SECTION.toString() ) )
672         {
673             handleSectionEnd( sink );
674         }
675         else if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
676         {
677             sink.sectionTitle1_();
678         }
679         else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
680         {
681             sink.sectionTitle2_();
682         }
683         else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
684         {
685             sink.sectionTitle3_();
686         }
687         else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
688         {
689             sink.sectionTitle4_();
690         }
691         else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
692         {
693             sink.sectionTitle5_();
694         }
695         else if ( parser.getName().equals( HtmlMarkup.HEADER.toString() ) )
696         {
697             sink.header_();
698         }
699         else if ( parser.getName().equals( HtmlMarkup.MAIN.toString() ) )
700         {
701             sink.content_();
702         }
703         else if ( parser.getName().equals( HtmlMarkup.FOOTER.toString() ) )
704         {
705             sink.footer_();
706         }
707         else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() )
708             || parser.getName().equals( HtmlMarkup.STYLE.toString() ) )
709         {
710             handleUnknown( parser, sink, TAG_TYPE_END );
711 
712             scriptBlock = false;
713         }
714         else
715         {
716             visited = false;
717         }
718 
719         return visited;
720     }
721 
722     /**
723      * {@inheritDoc}
724      *
725      * Just calls {@link #baseStartTag(XmlPullParser,Sink)}, this should be
726      * overridden by implementing parsers to include additional tags.
727      */
728     protected void handleStartTag( XmlPullParser parser, Sink sink )
729         throws XmlPullParserException, MacroExecutionException
730     {
731         if ( !baseStartTag( parser, sink ) )
732         {
733             if ( getLog().isWarnEnabled() )
734             {
735                 String position = "[" + parser.getLineNumber() + ":"
736                     + parser.getColumnNumber() + "]";
737                 String tag = "<" + parser.getName() + ">";
738 
739                 getLog().warn( "Unrecognized xml tag: " + tag + " at " + position );
740             }
741         }
742     }
743 
744     /**
745      * {@inheritDoc}
746      *
747      * Just calls {@link #baseEndTag(XmlPullParser,Sink)}, this should be
748      * overridden by implementing parsers to include additional tags.
749      */
750     protected void handleEndTag( XmlPullParser parser, Sink sink )
751         throws XmlPullParserException, MacroExecutionException
752     {
753         if ( !baseEndTag( parser, sink ) )
754         {
755             // unrecognized tag is already logged in StartTag
756         }
757     }
758 
759     /** {@inheritDoc} */
760     @Override
761     protected void handleText( XmlPullParser parser, Sink sink )
762         throws XmlPullParserException
763     {
764         String text = getText( parser );
765 
766         /*
767          * NOTE: Don't do any whitespace trimming here. Whitespace normalization has already been performed by the
768          * parser so any whitespace that makes it here is significant.
769          *
770          * NOTE: text within script tags is ignored, scripting code should be embedded in CDATA.
771          */
772         if ( StringUtils.isNotEmpty( text ) && !isScriptBlock() )
773         {
774             sink.text( text );
775         }
776     }
777 
778     /** {@inheritDoc} */
779     @Override
780     protected void handleComment( XmlPullParser parser, Sink sink )
781         throws XmlPullParserException
782     {
783         String text = getText( parser );
784 
785         if ( "PB".equals( text.trim() ) )
786         {
787             sink.pageBreak();
788         }
789         else
790         {
791             if ( isEmitComments() )
792             {
793                 sink.comment( text );
794             }
795         }
796     }
797 
798     /** {@inheritDoc} */
799     @Override
800     protected void handleCdsect( XmlPullParser parser, Sink sink )
801         throws XmlPullParserException
802     {
803         String text = getText( parser );
804 
805         if ( isScriptBlock() )
806         {
807             sink.unknown( CDATA, new Object[] { CDATA_TYPE, text }, null );
808         }
809         else
810         {
811             sink.text( text );
812         }
813     }
814 
815     /**
816      * Make sure sections are nested consecutively.
817      *
818      * <p>
819      * HTML5 heading tags H1 to H6 imply sections where they are not
820      * present, that means we have to open close any sections that
821      * are missing in between.
822      * </p>
823      *
824      * <p>
825      * For instance, if the following sequence is parsed:
826      * </p>
827      * <pre>
828      * &lt;h3&gt;&lt;/h3&gt;
829      * &lt;h6&gt;&lt;/h6&gt;
830      * </pre>
831      * <p>
832      * we have to insert two section starts before we open the <code>&lt;h6&gt;</code>.
833      * In the following sequence
834      * </p>
835      * <pre>
836      * &lt;h6&gt;&lt;/h6&gt;
837      * &lt;h3&gt;&lt;/h3&gt;
838      * </pre>
839      * <p>
840      * we have to close two sections before we open the <code>&lt;h3&gt;</code>.
841      * </p>
842      *
843      * <p>The current level is set to newLevel afterwards.</p>
844      *
845      * @param newLevel the new section level, all upper levels have to be closed.
846      * @param sink the sink to receive the events.
847      * @param attribs a {@link org.apache.maven.doxia.sink.impl.SinkEventAttributeSet} object.
848      */
849     protected void consecutiveSections( int newLevel, Sink sink, SinkEventAttributeSet attribs )
850     {
851         closeOpenSections( newLevel, sink );
852         openMissingSections( newLevel, sink );
853 
854         this.headingLevel = newLevel;
855     }
856 
857     /**
858      * Close open sections.
859      *
860      * @param newLevel the new section level, all upper levels have to be closed.
861      * @param sink the sink to receive the events.
862      */
863     private void closeOpenSections( int newLevel, Sink sink )
864     {
865         while ( this.headingLevel >= newLevel
866                 && this.sectionLevel < headingLevel )
867         {
868             if ( headingLevel == Sink.SECTION_LEVEL_5 )
869             {
870                 sink.section5_();
871             }
872             else if ( headingLevel == Sink.SECTION_LEVEL_4 )
873             {
874                 sink.section4_();
875             }
876             else if ( headingLevel == Sink.SECTION_LEVEL_3 )
877             {
878                 sink.section3_();
879             }
880             else if ( headingLevel == Sink.SECTION_LEVEL_2 )
881             {
882                 sink.section2_();
883             }
884             else if ( headingLevel == Sink.SECTION_LEVEL_1 )
885             {
886                 sink.section1_();
887             }
888 
889             this.headingLevel--;
890         }
891     }
892 
893     /**
894      * Open missing sections.
895      *
896      * @param newLevel the new section level, all lower levels have to be opened.
897      * @param sink the sink to receive the events.
898      */
899     private void openMissingSections( int newLevel, Sink sink )
900     {
901         while ( this.headingLevel < newLevel
902                 && this.sectionLevel < newLevel )
903         {
904             this.headingLevel++;
905 
906             if ( headingLevel == Sink.SECTION_LEVEL_5 )
907             {
908                 sink.section5();
909             }
910             else if ( headingLevel == Sink.SECTION_LEVEL_4 )
911             {
912                 sink.section4();
913             }
914             else if ( headingLevel == Sink.SECTION_LEVEL_3 )
915             {
916                 sink.section3();
917             }
918             else if ( headingLevel == Sink.SECTION_LEVEL_2 )
919             {
920                 sink.section2();
921             }
922             else if ( headingLevel == Sink.SECTION_LEVEL_1 )
923             {
924                 sink.section1();
925             }
926         }
927     }
928 
929     /**
930      * Return the current section level.
931      *
932      * @return the current section level.
933      */
934     protected int getSectionLevel()
935     {
936         return this.headingLevel;
937     }
938 
939     /**
940      * Set the current section level.
941      *
942      * @param newLevel the new section level.
943      */
944     protected void setSectionLevel( int newLevel )
945     {
946         this.headingLevel = newLevel;
947     }
948 
949     /**
950      * Stop verbatim mode.
951      */
952     protected void verbatim_()
953     {
954         this.inVerbatim = false;
955     }
956 
957     /**
958      * Start verbatim mode.
959      */
960     protected void verbatim()
961     {
962         this.inVerbatim = true;
963     }
964 
965     /**
966      * Checks if we are currently inside a &lt;pre&gt; tag.
967      *
968      * @return true if we are currently in verbatim mode.
969      */
970     protected boolean isVerbatim()
971     {
972         return this.inVerbatim;
973     }
974 
975     /**
976      * Checks if we are currently inside a &lt;script&gt; tag.
977      *
978      * @return true if we are currently inside <code>&lt;script&gt;</code> tags.
979      * @since 1.1.1.
980      */
981     protected boolean isScriptBlock()
982     {
983         return this.scriptBlock;
984     }
985 
986     /**
987      * Checks if the given id is a valid Doxia id and if not, returns a transformed one.
988      *
989      * @param id The id to validate.
990      * @return A transformed id or the original id if it was already valid.
991      * @see DoxiaUtils#encodeId(String)
992      */
993     protected String validAnchor( String id )
994     {
995         if ( !DoxiaUtils.isValidId( id ) )
996         {
997             String linkAnchor = DoxiaUtils.encodeId( id, true );
998 
999             String msg = "Modified invalid link: '" + id + "' to '" + linkAnchor + "'";
1000             logMessage( "modifiedLink", msg );
1001 
1002             return linkAnchor;
1003         }
1004 
1005         return id;
1006     }
1007 
1008     /** {@inheritDoc} */
1009     @Override
1010     protected void init()
1011     {
1012         super.init();
1013 
1014         this.scriptBlock = false;
1015         this.isLink = false;
1016         this.isAnchor = false;
1017         this.orderedListDepth = 0;
1018         this.headingLevel = 0;
1019         this.inVerbatim = false;
1020         this.warnMessages = null;
1021     }
1022 
1023     private void handleAEnd( Sink sink )
1024     {
1025         if ( isLink )
1026         {
1027             sink.link_();
1028             isLink = false;
1029         }
1030         else if ( isAnchor )
1031         {
1032             sink.anchor_();
1033             isAnchor = false;
1034         }
1035     }
1036 
1037     private void handleAStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1038     {
1039         String href = parser.getAttributeValue( null, Attribute.HREF.toString() );
1040 
1041         if ( href != null )
1042         {
1043             int hashIndex = href.indexOf( '#' );
1044             if ( hashIndex != -1 && !DoxiaUtils.isExternalLink( href ) )
1045             {
1046                 String hash = href.substring( hashIndex + 1 );
1047 
1048                 if ( !DoxiaUtils.isValidId( hash ) )
1049                 {
1050                     href = href.substring( 0, hashIndex ) + "#" + DoxiaUtils.encodeId( hash, true );
1051 
1052                     String msg = "Modified invalid link: '" + hash + "' to '" + href + "'";
1053                     logMessage( "modifiedLink", msg );
1054                 }
1055             }
1056             sink.link( href, attribs );
1057             isLink = true;
1058         }
1059         else
1060         {
1061             String name = parser.getAttributeValue( null, Attribute.NAME.toString() );
1062 
1063             if ( name != null )
1064             {
1065                 sink.anchor( validAnchor( name ), attribs );
1066                 isAnchor = true;
1067             }
1068             else
1069             {
1070                 String id = parser.getAttributeValue( null, Attribute.ID.toString() );
1071                 if ( id != null )
1072                 {
1073                     sink.anchor( validAnchor( id ), attribs );
1074                     isAnchor = true;
1075                 }
1076             }
1077         }
1078     }
1079 
1080     private boolean handleDivStart( XmlPullParser parser, SinkEventAttributeSet attribs, Sink sink )
1081     {
1082         String divclass = parser.getAttributeValue( null, Attribute.CLASS.toString() );
1083 
1084         this.divStack.push( divclass );
1085 
1086         if ( "content".equals( divclass ) )
1087         {
1088             SinkEventAttributeSet atts = new SinkEventAttributeSet( attribs );
1089             atts.removeAttribute( SinkEventAttributes.CLASS );
1090             sink.content( atts );
1091         }
1092         if ( "source".equals( divclass ) )
1093         {
1094             return false;
1095         }
1096         else
1097         {
1098             sink.division( attribs );
1099         }
1100 
1101         return true;
1102     }
1103 
1104     private boolean handleDivEnd( Sink sink )
1105     {
1106         String divclass = divStack.pop();
1107 
1108         if ( "content".equals( divclass ) )
1109         {
1110             sink.content_();
1111         }
1112         if ( "source".equals( divclass ) )
1113         {
1114             return false;
1115         }
1116         else
1117         {
1118             sink.division_();
1119         }
1120 
1121         return true;
1122     }
1123 
1124     private void handleImgStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1125     {
1126         String src = parser.getAttributeValue( null, Attribute.SRC.toString() );
1127 
1128         if ( src != null )
1129         {
1130             sink.figureGraphics( src, attribs );
1131         }
1132     }
1133 
1134     private void handleLIStart( Sink sink, SinkEventAttributeSet attribs )
1135     {
1136         if ( orderedListDepth == 0 )
1137         {
1138             sink.listItem( attribs );
1139         }
1140         else
1141         {
1142             sink.numberedListItem( attribs );
1143         }
1144     }
1145 
1146     private void handleListItemEnd( Sink sink )
1147     {
1148         if ( orderedListDepth == 0 )
1149         {
1150             sink.listItem_();
1151         }
1152         else
1153         {
1154             sink.numberedListItem_();
1155         }
1156     }
1157 
1158     private void handleOLStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1159     {
1160         int numbering = Sink.NUMBERING_DECIMAL;
1161         // this will have to be generalized if we handle styles
1162         String style = parser.getAttributeValue( null, Attribute.STYLE.toString() );
1163 
1164         if ( style != null )
1165         {
1166             switch ( style )
1167             {
1168                 case "list-style-type: upper-alpha":
1169                     numbering = Sink.NUMBERING_UPPER_ALPHA;
1170                     break;
1171                 case "list-style-type: lower-alpha":
1172                     numbering = Sink.NUMBERING_LOWER_ALPHA;
1173                     break;
1174                 case "list-style-type: upper-roman":
1175                     numbering = Sink.NUMBERING_UPPER_ROMAN;
1176                     break;
1177                 case "list-style-type: lower-roman":
1178                     numbering = Sink.NUMBERING_LOWER_ROMAN;
1179                     break;
1180                 case "list-style-type: decimal":
1181                     numbering = Sink.NUMBERING_DECIMAL;
1182                     break;
1183                 default:
1184                     // ignore all other
1185             }
1186         }
1187 
1188         sink.numberedList( numbering, attribs );
1189         orderedListDepth++;
1190     }
1191 
1192     private void handlePStart( Sink sink, SinkEventAttributeSet attribs )
1193     {
1194         sink.paragraph( attribs );
1195     }
1196 
1197     /*
1198      * The PRE element tells visual user agents that the enclosed text is
1199      * "preformatted". When handling preformatted text, visual user agents:
1200      * - May leave white space intact.
1201      * - May render text with a fixed-pitch font.
1202      * - May disable automatic word wrap.
1203      * - Must not disable bidirectional processing.
1204      * Non-visual user agents are not required to respect extra white space
1205      * in the content of a PRE element.
1206      */
1207     private void handlePreStart( SinkEventAttributeSet attribs, Sink sink )
1208     {
1209         verbatim();
1210         sink.verbatim( attribs );
1211     }
1212 
1213     private void handleSectionStart( Sink sink, SinkEventAttributeSet attribs )
1214     {
1215         sink.section( ++sectionLevel, attribs );
1216     }
1217 
1218     private void handleHeadingStart( Sink sink, int level, SinkEventAttributeSet attribs )
1219     {
1220         consecutiveSections( level, sink, attribs );
1221         sink.sectionTitle( level, attribs );
1222     }
1223 
1224     private void handleSectionEnd( Sink sink )
1225     {
1226         closeOpenSections( sectionLevel, sink );
1227         this.headingLevel = 0;
1228 
1229         sink.section_( sectionLevel-- );
1230     }
1231 
1232     private void handleTableStart( Sink sink, SinkEventAttributeSet attribs, XmlPullParser parser )
1233     {
1234         sink.table( attribs );
1235         String border = parser.getAttributeValue( null, Attribute.BORDER.toString() );
1236         boolean grid = true;
1237 
1238         if ( border == null || "0".equals( border ) )
1239         {
1240             grid = false;
1241         }
1242 
1243         String align = parser.getAttributeValue( null, Attribute.ALIGN.toString() );
1244         int[] justif = {Sink.JUSTIFY_LEFT};
1245 
1246         if ( "center".equals( align ) )
1247         {
1248             justif[0] = Sink.JUSTIFY_CENTER;
1249         }
1250         else if ( "right".equals( align ) )
1251         {
1252             justif[0] = Sink.JUSTIFY_RIGHT;
1253         }
1254 
1255         sink.tableRows( justif, grid );
1256     }
1257 
1258     /**
1259      * If debug mode is enabled, log the <code>msg</code> as is, otherwise add unique msg in <code>warnMessages</code>.
1260      *
1261      * @param key not null
1262      * @param msg not null
1263      * @see #parse(Reader, Sink)
1264      * @since 1.1.1
1265      */
1266     private void logMessage( String key, String msg )
1267     {
1268         final String log = "[XHTML Parser] " + msg;
1269         if ( getLog().isDebugEnabled() )
1270         {
1271             getLog().debug( log );
1272 
1273             return;
1274         }
1275 
1276         if ( warnMessages == null )
1277         {
1278             warnMessages = new HashMap<>();
1279         }
1280 
1281         Set<String> set = warnMessages.get( key );
1282         if ( set == null )
1283         {
1284             set = new TreeSet<>();
1285         }
1286         set.add( log );
1287         warnMessages.put( key, set );
1288     }
1289 
1290     /**
1291      * @since 1.1.1
1292      */
1293     private void logWarnings()
1294     {
1295         if ( getLog().isWarnEnabled() && this.warnMessages != null && !isSecondParsing() )
1296         {
1297             for ( Map.Entry<String, Set<String>> entry : this.warnMessages.entrySet() )
1298             {
1299                 for ( String msg : entry.getValue() )
1300                 {
1301                     getLog().warn( msg );
1302                 }
1303             }
1304 
1305             this.warnMessages = null;
1306         }
1307     }
1308 }