001package org.apache.maven.doxia.parser;
002
003/*
004 * Licensed to the Apache Software Foundation (ASF) under one
005 * or more contributor license agreements.  See the NOTICE file
006 * distributed with this work for additional information
007 * regarding copyright ownership.  The ASF licenses this file
008 * to you under the Apache License, Version 2.0 (the
009 * "License"); you may not use this file except in compliance
010 * with the License.  You may obtain a copy of the License at
011 *
012 *   http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing,
015 * software distributed under the License is distributed on an
016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017 * KIND, either express or implied.  See the License for the
018 * specific language governing permissions and limitations
019 * under the License.
020 */
021
022import java.io.Reader;
023import java.util.HashMap;
024import java.util.Map;
025import java.util.Set;
026import java.util.Stack;
027import java.util.TreeSet;
028
029import javax.swing.text.html.HTML.Attribute;
030
031import org.apache.maven.doxia.macro.MacroExecutionException;
032import org.apache.maven.doxia.markup.HtmlMarkup;
033import org.apache.maven.doxia.sink.Sink;
034import org.apache.maven.doxia.sink.SinkEventAttributes;
035import org.apache.maven.doxia.sink.impl.SinkEventAttributeSet;
036import org.apache.maven.doxia.util.DoxiaUtils;
037import org.codehaus.plexus.util.StringUtils;
038import org.codehaus.plexus.util.xml.pull.XmlPullParser;
039import org.codehaus.plexus.util.xml.pull.XmlPullParserException;
040
041/**
042 * Common base parser for xhtml5 events.
043 */
044public class Xhtml5BaseParser
045    extends AbstractXmlParser
046        implements HtmlMarkup
047{
    /**
     * True while a {@code <script>} or {@code <style>} block is being read: the flag is raised when
     * the start tag is handled and lowered again on the matching end tag. CDATA sections within are
     * handled as rawText.
     */
    private boolean scriptBlock;

    /** Used to distinguish {@code <a href="">} from {@code <a name="">}. */
    private boolean isLink;

    /** Used to distinguish {@code <a href="">} from {@code <a name="">}. */
    private boolean isAnchor;

    /** Used for nested lists; decremented whenever an {@code </ol>} end tag is handled. */
    private int orderedListDepth = 0;

    /** Counts section level. Compared against the heading level when implied sections are opened or closed. */
    private int sectionLevel;

    /** Counts heading level. Adjusted while implied sections are opened or closed. */
    private int headingLevel;

    /** Verbatim flag, true whenever we are inside a {@code <pre>} tag. */
    private boolean inVerbatim;

    /** Used to keep track of closing tags for content events. */
    private Stack<String> divStack = new Stack<>();

    /**
     * Used to wrap the definedTerm with its definition, even when one is omitted.
     * Set when a {@code <dt>} opens a definition list item, cleared when {@code </dd>} or
     * {@code </dl>} closes it.
     * NOTE(review): package-private, unlike the other fields - confirm whether that is intentional.
     */
    boolean hasDefinitionListItem = false;

    /**
     * Map of warn messages, with a String key describing the error type and a Set of messages as value.
     * Used to reduce the number of warn messages.
     */
    private Map<String, Set<String>> warnMessages;
081
    /**
     * {@inheritDoc}
     *
     * <p>
     *   Calls {@code init()} before delegating to the superclass implementation. Whether or not
     *   that call completes normally, {@code logWarnings()} is invoked, the second-parsing flag is
     *   set to {@code false} and {@code init()} is called once more.
     * </p>
     */
    @Override
    public void parse( Reader source, Sink sink, String reference )
        throws ParseException
    {
        init();

        try
        {
            super.parse( source, sink, reference );
        }
        finally
        {
            // runs on success and on failure alike; the order of these three calls is kept as is
            logWarnings();

            setSecondParsing( false );
            init();
        }
    }
101
    /**
     * {@inheritDoc}
     *
     * <p>
     *   This override only delegates to the superclass implementation; it registers nothing itself.
     * </p>
     * NOTE(review): the previous documentation stated that all XHTML (HTML 5.2) entities are added to
     * the parser here so that they can be resolved without an additional DTD. The body does not do
     * that - confirm whether the entity registration was removed on purpose.
     */
    @Override
    protected void initXmlParser( XmlPullParser parser )
        throws XmlPullParserException
    {
        super.initXmlParser( parser );
    }
114
115    /**
116     * <p>
117     *   Goes through a common list of possible html5 start tags. These include only tags that can go into
118     *   the body of an xhtml5 document and so should be re-usable by different xhtml-based parsers.
119     * </p>
120     * <p>
121     *   The currently handled tags are:
122     * </p>
123     * <p>
124     *   <code>
125     *      &lt;article&gt;, &lt;nav&gt;, &lt;aside&gt;, &lt;section&gt;, &lt;h2&gt;, &lt;h3&gt;, &lt;h4&gt;,
126     *      &lt;h5&gt;, &lt;h6&gt;, &lt;header&gt;, &lt;main&gt;, &lt;footer&gt;, &lt;em&gt;, &lt;strong&gt;,
127     *      &lt;small&gt;, &lt;s&gt;, &lt;cite&gt;, &lt;q&gt;, &lt;dfn&gt;, &lt;abbr&gt;, &lt;i&gt;,
128     *      &lt;b&gt;, &lt;code&gt;, &lt;samp&gt;, &lt;kbd&gt;, &lt;sub&gt;, &lt;sup&gt;, &lt;u&gt;,
129     *      &lt;mark&gt;, &lt;ruby&gt;, &lt;rb&gt;, &lt;rt&gt;, &lt;rtc&gt;, &lt;rp&gt;, &lt;bdi&gt;,
130     *      &lt;bdo&gt;, &lt;span&gt;, &lt;ins&gt;, &lt;del&gt;, &lt;p&gt;, &lt;pre&gt;, &lt;ul&gt;,
131     *      &lt;ol&gt;, &lt;li&gt;, &lt;dl&gt;, &lt;dt&gt;, &lt;dd&gt;, &lt;a&gt;, &lt;table&gt;,
132     *      &lt;tr&gt;, &lt;th&gt;, &lt;td&gt;, &lt;caption&gt;, &lt;br/&gt;, &lt;wbr/&gt;, &lt;hr/&gt;,
133     *      &lt;img/&gt;.
134     *   </code>
135     * </p>
136     *
137     * @param parser A parser.
138     * @param sink the sink to receive the events.
139     * @return True if the event has been handled by this method, i.e. the tag was recognized, false otherwise.
140     */
141    protected boolean baseStartTag( XmlPullParser parser, Sink sink )
142    {
143        boolean visited = true;
144
145        SinkEventAttributeSet attribs = getAttributesFromParser( parser );
146
147        if ( parser.getName().equals( HtmlMarkup.ARTICLE.toString() ) )
148        {
149            sink.article( attribs );
150        }
151        else if ( parser.getName().equals( HtmlMarkup.NAV.toString() ) )
152        {
153            sink.navigation( attribs );
154        }
155        else if ( parser.getName().equals( HtmlMarkup.ASIDE.toString() ) )
156        {
157            sink.sidebar( attribs );
158        }
159        else if ( parser.getName().equals( HtmlMarkup.SECTION.toString() ) )
160        {
161            handleSectionStart( sink, attribs );
162        }
163        else if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
164        {
165            handleHeadingStart( sink, Sink.SECTION_LEVEL_1, attribs );
166        }
167        else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
168        {
169            handleHeadingStart( sink, Sink.SECTION_LEVEL_2, attribs );
170        }
171        else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
172        {
173            handleHeadingStart( sink, Sink.SECTION_LEVEL_3, attribs );
174        }
175        else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
176        {
177            handleHeadingStart( sink, Sink.SECTION_LEVEL_4, attribs );
178        }
179        else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
180        {
181            handleHeadingStart( sink, Sink.SECTION_LEVEL_5, attribs );
182        }
183        else if ( parser.getName().equals( HtmlMarkup.HEADER.toString() ) )
184        {
185            sink.header( attribs );
186        }
187        else if ( parser.getName().equals( HtmlMarkup.MAIN.toString() ) )
188        {
189            sink.content( attribs );
190        }
191        else if ( parser.getName().equals( HtmlMarkup.FOOTER.toString() ) )
192        {
193            sink.footer( attribs );
194        }
195        else if ( parser.getName().equals( HtmlMarkup.EM.toString() ) )
196        {
197            attribs.addAttributes( SinkEventAttributeSet.Semantics.EMPHASIS );
198            sink.inline( attribs );
199        }
200        else if ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) )
201        {
202            attribs.addAttributes( SinkEventAttributeSet.Semantics.STRONG );
203            sink.inline( attribs );
204        }
205        else if ( parser.getName().equals( HtmlMarkup.SMALL.toString() ) )
206        {
207            attribs.addAttributes( SinkEventAttributeSet.Semantics.SMALL );
208            sink.inline( attribs );
209        }
210        else if ( parser.getName().equals( HtmlMarkup.S.toString() ) )
211        {
212            attribs.addAttributes( SinkEventAttributeSet.Semantics.LINE_THROUGH );
213            sink.inline( attribs );
214            /* deprecated line-through support */
215        }
216        else if ( parser.getName().equals( HtmlMarkup.CITE.toString() ) )
217        {
218            attribs.addAttributes( SinkEventAttributeSet.Semantics.CITATION );
219            sink.inline( attribs );
220        }
221        else if ( parser.getName().equals( HtmlMarkup.Q.toString() ) )
222        {
223            attribs.addAttributes( SinkEventAttributeSet.Semantics.QUOTE );
224            sink.inline( attribs );
225        }
226        else if ( parser.getName().equals( HtmlMarkup.DFN.toString() ) )
227        {
228            attribs.addAttributes( SinkEventAttributeSet.Semantics.DEFINITION );
229            sink.inline( attribs );
230        }
231        else if ( parser.getName().equals( HtmlMarkup.ABBR.toString() ) )
232        {
233            attribs.addAttributes( SinkEventAttributeSet.Semantics.ABBREVIATION );
234            sink.inline( attribs );
235        }
236        else if ( parser.getName().equals( HtmlMarkup.I.toString() ) )
237        {
238            attribs.addAttributes( SinkEventAttributeSet.Semantics.ITALIC );
239            sink.inline( attribs );
240        }
241        else if ( parser.getName().equals( HtmlMarkup.B.toString() ) )
242        {
243            attribs.addAttributes( SinkEventAttributeSet.Semantics.BOLD );
244            sink.inline( attribs );
245        }
246        else if ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
247        {
248            attribs.addAttributes( SinkEventAttributeSet.Semantics.CODE );
249            sink.inline( attribs );
250        }
251        else if ( parser.getName().equals( HtmlMarkup.VAR.toString() ) )
252        {
253            attribs.addAttributes( SinkEventAttributeSet.Semantics.VARIABLE );
254            sink.inline( attribs );
255        }
256        else if ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
257        {
258            attribs.addAttributes( SinkEventAttributeSet.Semantics.SAMPLE );
259            sink.inline( attribs );
260        }
261        else if ( parser.getName().equals( HtmlMarkup.KBD.toString() ) )
262        {
263            attribs.addAttributes( SinkEventAttributeSet.Semantics.KEYBOARD );
264            sink.inline( attribs );
265        }
266        else if ( parser.getName().equals( HtmlMarkup.SUP.toString() ) )
267        {
268            attribs.addAttributes( SinkEventAttributeSet.Semantics.SUPERSCRIPT );
269            sink.inline( attribs );
270        }
271        else if ( parser.getName().equals( HtmlMarkup.SUB.toString() ) )
272        {
273            attribs.addAttributes( SinkEventAttributeSet.Semantics.SUBSCRIPT );
274            sink.inline( attribs );
275        }
276        else if ( parser.getName().equals( HtmlMarkup.U.toString() ) )
277        {
278            attribs.addAttributes( SinkEventAttributeSet.Semantics.ANNOTATION );
279            sink.inline( attribs );
280        }
281        else if ( parser.getName().equals( HtmlMarkup.MARK.toString() ) )
282        {
283            attribs.addAttributes( SinkEventAttributeSet.Semantics.HIGHLIGHT );
284            sink.inline( attribs );
285        }
286        else if ( parser.getName().equals( HtmlMarkup.RUBY.toString() ) )
287        {
288            attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY );
289            sink.inline( attribs );
290        }
291        else if ( parser.getName().equals( HtmlMarkup.RB.toString() ) )
292        {
293            attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY_BASE );
294            sink.inline( attribs );
295        }
296        else if ( parser.getName().equals( HtmlMarkup.RT.toString() ) )
297        {
298            attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY_TEXT );
299            sink.inline( attribs );
300        }
301        else if ( parser.getName().equals( HtmlMarkup.RTC.toString() ) )
302        {
303            attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY_TEXT_CONTAINER );
304            sink.inline( attribs );
305        }
306        else if ( parser.getName().equals( HtmlMarkup.RP.toString() ) )
307        {
308            attribs.addAttributes( SinkEventAttributeSet.Semantics.RUBY_PARANTHESES );
309            sink.inline( attribs );
310        }
311        else if ( parser.getName().equals( HtmlMarkup.BDI.toString() ) )
312        {
313            attribs.addAttributes( SinkEventAttributeSet.Semantics.BIDIRECTIONAL_ISOLATION );
314            sink.inline( attribs );
315        }
316        else if ( parser.getName().equals( HtmlMarkup.BDO.toString() ) )
317        {
318            attribs.addAttributes( SinkEventAttributeSet.Semantics.BIDIRECTIONAL_OVERRIDE );
319            sink.inline( attribs );
320        }
321        else if ( parser.getName().equals( HtmlMarkup.SPAN.toString() ) )
322        {
323            attribs.addAttributes( SinkEventAttributeSet.Semantics.PHRASE );
324            sink.inline( attribs );
325        }
326        else if ( parser.getName().equals( HtmlMarkup.INS.toString() ) )
327        {
328            attribs.addAttributes( SinkEventAttributeSet.Semantics.INSERT );
329            sink.inline( attribs );
330        }
331        else if ( parser.getName().equals( HtmlMarkup.DEL.toString() ) )
332        {
333            attribs.addAttributes( SinkEventAttributeSet.Semantics.DELETE );
334            sink.inline( attribs );
335        }
336        else if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
337        {
338            handlePStart( sink, attribs );
339        }
340        else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
341        {
342            handleDivStart( parser, attribs, sink );
343        }
344        else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
345        {
346            handlePreStart( attribs, sink );
347        }
348        else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
349        {
350            sink.list( attribs );
351        }
352        else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
353        {
354            handleOLStart( parser, sink, attribs );
355        }
356        else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
357        {
358            handleLIStart( sink, attribs );
359        }
360        else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
361        {
362            sink.definitionList( attribs );
363        }
364        else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
365        {
366            if ( hasDefinitionListItem )
367            {
368                // close previous listItem
369                sink.definitionListItem_();
370            }
371            sink.definitionListItem( attribs );
372            hasDefinitionListItem = true;
373            sink.definedTerm( attribs );
374        }
375        else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
376        {
377            if ( !hasDefinitionListItem )
378            {
379                sink.definitionListItem( attribs );
380            }
381            sink.definition( attribs );
382        }
383        else if ( ( parser.getName().equals( HtmlMarkup.FIGURE.toString() ) ) )
384        {
385            sink.figure( attribs );
386        }
387        else if ( ( parser.getName().equals( HtmlMarkup.FIGCAPTION.toString() ) ) )
388        {
389            sink.figureCaption( attribs );
390        }
391        else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
392        {
393            handleAStart( parser, sink, attribs );
394        }
395        else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
396        {
397            handleTableStart( sink, attribs, parser );
398        }
399        else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
400        {
401            sink.tableRow( attribs );
402        }
403        else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
404        {
405            sink.tableHeaderCell( attribs );
406        }
407        else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
408        {
409            sink.tableCell( attribs );
410        }
411        else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
412        {
413            sink.tableCaption( attribs );
414        }
415        else if ( parser.getName().equals( HtmlMarkup.BR.toString() ) )
416        {
417            sink.lineBreak( attribs );
418        }
419        else if ( parser.getName().equals( HtmlMarkup.WBR.toString() ) )
420        {
421            sink.lineBreakOpportunity( attribs );
422        }
423        else if ( parser.getName().equals( HtmlMarkup.HR.toString() ) )
424        {
425            sink.horizontalRule( attribs );
426        }
427        else if ( parser.getName().equals( HtmlMarkup.IMG.toString() ) )
428        {
429            handleImgStart( parser, sink, attribs );
430        }
431        else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() )
432            || parser.getName().equals( HtmlMarkup.STYLE.toString() ) )
433        {
434            handleUnknown( parser, sink, TAG_TYPE_START );
435            scriptBlock = true;
436        }
437        else
438        {
439            visited = false;
440        }
441
442        return visited;
443    }
444
445    /**
446     * <p>
447     *   Goes through a common list of possible html end tags.
448     *   These should be re-usable by different xhtml-based parsers.
449     *   The tags handled here are the same as for {@link #baseStartTag(XmlPullParser,Sink)},
450     *   except for the empty elements ({@code <br/>, <hr/>, <img/>}).
451     * </p>
452     *
453     * @param parser A parser.
454     * @param sink the sink to receive the events.
455     * @return True if the event has been handled by this method, false otherwise.
456     */
457    protected boolean baseEndTag( XmlPullParser parser, Sink sink )
458    {
459        boolean visited = true;
460
461        if ( parser.getName().equals( HtmlMarkup.P.toString() ) )
462        {
463            sink.paragraph_();
464        }
465        else if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
466        {
467            handleDivEnd( sink );
468        }
469        else if ( parser.getName().equals( HtmlMarkup.PRE.toString() ) )
470        {
471            verbatim_();
472
473            sink.verbatim_();
474        }
475        else if ( parser.getName().equals( HtmlMarkup.UL.toString() ) )
476        {
477            sink.list_();
478        }
479        else if ( parser.getName().equals( HtmlMarkup.OL.toString() ) )
480        {
481            sink.numberedList_();
482            orderedListDepth--;
483        }
484        else if ( parser.getName().equals( HtmlMarkup.LI.toString() ) )
485        {
486            handleListItemEnd( sink );
487        }
488        else if ( parser.getName().equals( HtmlMarkup.DL.toString() ) )
489        {
490            if ( hasDefinitionListItem )
491            {
492                sink.definitionListItem_();
493                hasDefinitionListItem = false;
494            }
495            sink.definitionList_();
496        }
497        else if ( parser.getName().equals( HtmlMarkup.DT.toString() ) )
498        {
499            sink.definedTerm_();
500        }
501        else if ( parser.getName().equals( HtmlMarkup.DD.toString() ) )
502        {
503            sink.definition_();
504            sink.definitionListItem_();
505            hasDefinitionListItem = false;
506        }
507        else if ( ( parser.getName().equals( HtmlMarkup.FIGURE.toString() ) ) )
508        {
509            sink.figure_();
510        }
511        else if ( ( parser.getName().equals( HtmlMarkup.FIGCAPTION.toString() ) ) )
512        {
513            sink.figureCaption_();
514        }
515        else if ( parser.getName().equals( HtmlMarkup.A.toString() ) )
516        {
517            handleAEnd( sink );
518        }
519
520        else if ( parser.getName().equals( HtmlMarkup.EM.toString() ) )
521        {
522            sink.inline_();
523        }
524        else if ( parser.getName().equals( HtmlMarkup.STRONG.toString() ) )
525        {
526            sink.inline_();
527        }
528        else if ( parser.getName().equals( HtmlMarkup.SMALL.toString() ) )
529        {
530            sink.inline_();
531        }
532        else if ( parser.getName().equals( HtmlMarkup.S.toString() ) )
533        {
534            sink.inline_();
535        }
536        else if ( parser.getName().equals( HtmlMarkup.CITE.toString() ) )
537        {
538            sink.inline_();
539        }
540        else if ( parser.getName().equals( HtmlMarkup.Q.toString() ) )
541        {
542            sink.inline_();
543        }
544        else if ( parser.getName().equals( HtmlMarkup.DFN.toString() ) )
545        {
546            sink.inline_();
547        }
548        else if ( parser.getName().equals( HtmlMarkup.ABBR.toString() ) )
549        {
550            sink.inline_();
551        }
552        else if ( parser.getName().equals( HtmlMarkup.I.toString() ) )
553        {
554            sink.inline_();
555        }
556        else if ( parser.getName().equals( HtmlMarkup.B.toString() ) )
557        {
558            sink.inline_();
559        }
560        else if ( parser.getName().equals( HtmlMarkup.CODE.toString() ) )
561        {
562            sink.inline_();
563        }
564        else if ( parser.getName().equals( HtmlMarkup.VAR.toString() ) )
565        {
566            sink.inline_();
567        }
568        else if ( parser.getName().equals( HtmlMarkup.SAMP.toString() ) )
569        {
570            sink.inline_();
571        }
572        else if ( parser.getName().equals( HtmlMarkup.KBD.toString() ) )
573        {
574            sink.inline_();
575        }
576        else if ( parser.getName().equals( HtmlMarkup.SUP.toString() ) )
577        {
578            sink.inline_();
579        }
580        else if ( parser.getName().equals( HtmlMarkup.SUB.toString() ) )
581        {
582            sink.inline_();
583        }
584        else if ( parser.getName().equals( HtmlMarkup.U.toString() ) )
585        {
586            sink.inline_();
587        }
588        else if ( parser.getName().equals( HtmlMarkup.MARK.toString() ) )
589        {
590            sink.inline_();
591        }
592        else if ( parser.getName().equals( HtmlMarkup.RUBY.toString() ) )
593        {
594            sink.inline_();
595        }
596        else if ( parser.getName().equals( HtmlMarkup.RB.toString() ) )
597        {
598            sink.inline_();
599        }
600        else if ( parser.getName().equals( HtmlMarkup.RT.toString() ) )
601        {
602            sink.inline_();
603        }
604        else if ( parser.getName().equals( HtmlMarkup.RTC.toString() ) )
605        {
606            sink.inline_();
607        }
608        else if ( parser.getName().equals( HtmlMarkup.RP.toString() ) )
609        {
610            sink.inline_();
611        }
612        else if ( parser.getName().equals( HtmlMarkup.BDI.toString() ) )
613        {
614            sink.inline_();
615        }
616        else if ( parser.getName().equals( HtmlMarkup.BDO.toString() ) )
617        {
618            sink.inline_();
619        }
620        else if ( parser.getName().equals( HtmlMarkup.SPAN.toString() ) )
621        {
622            sink.inline_();
623        }
624        else if ( parser.getName().equals( HtmlMarkup.INS.toString() ) )
625        {
626            sink.inline_();
627        }
628        else if ( parser.getName().equals( HtmlMarkup.DEL.toString() ) )
629        {
630            sink.inline_();
631        }
632
633        // ----------------------------------------------------------------------
634        // Tables
635        // ----------------------------------------------------------------------
636
637        else if ( parser.getName().equals( HtmlMarkup.TABLE.toString() ) )
638        {
639            sink.tableRows_();
640
641            sink.table_();
642        }
643        else if ( parser.getName().equals( HtmlMarkup.TR.toString() ) )
644        {
645            sink.tableRow_();
646        }
647        else if ( parser.getName().equals( HtmlMarkup.TH.toString() ) )
648        {
649            sink.tableHeaderCell_();
650        }
651        else if ( parser.getName().equals( HtmlMarkup.TD.toString() ) )
652        {
653            sink.tableCell_();
654        }
655        else if ( parser.getName().equals( HtmlMarkup.CAPTION.toString() ) )
656        {
657            sink.tableCaption_();
658        }
659        else if ( parser.getName().equals( HtmlMarkup.ARTICLE.toString() ) )
660        {
661            sink.article_();
662        }
663        else if ( parser.getName().equals( HtmlMarkup.NAV.toString() ) )
664        {
665            sink.navigation_();
666        }
667        else if ( parser.getName().equals( HtmlMarkup.ASIDE.toString() ) )
668        {
669            sink.sidebar_();
670        }
671        else if ( parser.getName().equals( HtmlMarkup.SECTION.toString() ) )
672        {
673            handleSectionEnd( sink );
674        }
675        else if ( parser.getName().equals( HtmlMarkup.H2.toString() ) )
676        {
677            sink.sectionTitle1_();
678        }
679        else if ( parser.getName().equals( HtmlMarkup.H3.toString() ) )
680        {
681            sink.sectionTitle2_();
682        }
683        else if ( parser.getName().equals( HtmlMarkup.H4.toString() ) )
684        {
685            sink.sectionTitle3_();
686        }
687        else if ( parser.getName().equals( HtmlMarkup.H5.toString() ) )
688        {
689            sink.sectionTitle4_();
690        }
691        else if ( parser.getName().equals( HtmlMarkup.H6.toString() ) )
692        {
693            sink.sectionTitle5_();
694        }
695        else if ( parser.getName().equals( HtmlMarkup.HEADER.toString() ) )
696        {
697            sink.header_();
698        }
699        else if ( parser.getName().equals( HtmlMarkup.MAIN.toString() ) )
700        {
701            sink.content_();
702        }
703        else if ( parser.getName().equals( HtmlMarkup.FOOTER.toString() ) )
704        {
705            sink.footer_();
706        }
707        else if ( parser.getName().equals( HtmlMarkup.SCRIPT.toString() )
708            || parser.getName().equals( HtmlMarkup.STYLE.toString() ) )
709        {
710            handleUnknown( parser, sink, TAG_TYPE_END );
711
712            scriptBlock = false;
713        }
714        else
715        {
716            visited = false;
717        }
718
719        return visited;
720    }
721
722    /**
723     * {@inheritDoc}
724     *
725     * Just calls {@link #baseStartTag(XmlPullParser,Sink)}, this should be
726     * overridden by implementing parsers to include additional tags.
727     */
728    protected void handleStartTag( XmlPullParser parser, Sink sink )
729        throws XmlPullParserException, MacroExecutionException
730    {
731        if ( !baseStartTag( parser, sink ) )
732        {
733            if ( getLog().isWarnEnabled() )
734            {
735                String position = "[" + parser.getLineNumber() + ":"
736                    + parser.getColumnNumber() + "]";
737                String tag = "<" + parser.getName() + ">";
738
739                getLog().warn( "Unrecognized xml tag: " + tag + " at " + position );
740            }
741        }
742    }
743
744    /**
745     * {@inheritDoc}
746     *
747     * Just calls {@link #baseEndTag(XmlPullParser,Sink)}, this should be
748     * overridden by implementing parsers to include additional tags.
749     */
750    protected void handleEndTag( XmlPullParser parser, Sink sink )
751        throws XmlPullParserException, MacroExecutionException
752    {
753        if ( !baseEndTag( parser, sink ) )
754        {
755            // unrecognized tag is already logged in StartTag
756        }
757    }
758
759    /** {@inheritDoc} */
760    @Override
761    protected void handleText( XmlPullParser parser, Sink sink )
762        throws XmlPullParserException
763    {
764        String text = getText( parser );
765
766        /*
767         * NOTE: Don't do any whitespace trimming here. Whitespace normalization has already been performed by the
768         * parser so any whitespace that makes it here is significant.
769         *
770         * NOTE: text within script tags is ignored, scripting code should be embedded in CDATA.
771         */
772        if ( StringUtils.isNotEmpty( text ) && !isScriptBlock() )
773        {
774            sink.text( text );
775        }
776    }
777
778    /** {@inheritDoc} */
779    @Override
780    protected void handleComment( XmlPullParser parser, Sink sink )
781        throws XmlPullParserException
782    {
783        String text = getText( parser );
784
785        if ( "PB".equals( text.trim() ) )
786        {
787            sink.pageBreak();
788        }
789        else
790        {
791            if ( isEmitComments() )
792            {
793                sink.comment( text );
794            }
795        }
796    }
797
798    /** {@inheritDoc} */
799    @Override
800    protected void handleCdsect( XmlPullParser parser, Sink sink )
801        throws XmlPullParserException
802    {
803        String text = getText( parser );
804
805        if ( isScriptBlock() )
806        {
807            sink.unknown( CDATA, new Object[] { CDATA_TYPE, text }, null );
808        }
809        else
810        {
811            sink.text( text );
812        }
813    }
814
815    /**
816     * Make sure sections are nested consecutively.
817     *
818     * <p>
819     * HTML5 heading tags H1 to H6 imply sections where they are not
820     * present, that means we have to open close any sections that
821     * are missing in between.
822     * </p>
823     *
824     * <p>
825     * For instance, if the following sequence is parsed:
826     * </p>
827     * <pre>
828     * &lt;h3&gt;&lt;/h3&gt;
829     * &lt;h6&gt;&lt;/h6&gt;
830     * </pre>
831     * <p>
832     * we have to insert two section starts before we open the <code>&lt;h6&gt;</code>.
833     * In the following sequence
834     * </p>
835     * <pre>
836     * &lt;h6&gt;&lt;/h6&gt;
837     * &lt;h3&gt;&lt;/h3&gt;
838     * </pre>
839     * <p>
840     * we have to close two sections before we open the <code>&lt;h3&gt;</code>.
841     * </p>
842     *
843     * <p>The current level is set to newLevel afterwards.</p>
844     *
845     * @param newLevel the new section level, all upper levels have to be closed.
846     * @param sink the sink to receive the events.
847     * @param attribs a {@link org.apache.maven.doxia.sink.impl.SinkEventAttributeSet} object.
848     */
849    protected void consecutiveSections( int newLevel, Sink sink, SinkEventAttributeSet attribs )
850    {
851        closeOpenSections( newLevel, sink );
852        openMissingSections( newLevel, sink );
853
854        this.headingLevel = newLevel;
855    }
856
857    /**
858     * Close open sections.
859     *
860     * @param newLevel the new section level, all upper levels have to be closed.
861     * @param sink the sink to receive the events.
862     */
863    private void closeOpenSections( int newLevel, Sink sink )
864    {
865        while ( this.headingLevel >= newLevel
866                && this.sectionLevel < headingLevel )
867        {
868            if ( headingLevel == Sink.SECTION_LEVEL_5 )
869            {
870                sink.section5_();
871            }
872            else if ( headingLevel == Sink.SECTION_LEVEL_4 )
873            {
874                sink.section4_();
875            }
876            else if ( headingLevel == Sink.SECTION_LEVEL_3 )
877            {
878                sink.section3_();
879            }
880            else if ( headingLevel == Sink.SECTION_LEVEL_2 )
881            {
882                sink.section2_();
883            }
884            else if ( headingLevel == Sink.SECTION_LEVEL_1 )
885            {
886                sink.section1_();
887            }
888
889            this.headingLevel--;
890        }
891    }
892
893    /**
894     * Open missing sections.
895     *
896     * @param newLevel the new section level, all lower levels have to be opened.
897     * @param sink the sink to receive the events.
898     */
899    private void openMissingSections( int newLevel, Sink sink )
900    {
901        while ( this.headingLevel < newLevel
902                && this.sectionLevel < newLevel )
903        {
904            this.headingLevel++;
905
906            if ( headingLevel == Sink.SECTION_LEVEL_5 )
907            {
908                sink.section5();
909            }
910            else if ( headingLevel == Sink.SECTION_LEVEL_4 )
911            {
912                sink.section4();
913            }
914            else if ( headingLevel == Sink.SECTION_LEVEL_3 )
915            {
916                sink.section3();
917            }
918            else if ( headingLevel == Sink.SECTION_LEVEL_2 )
919            {
920                sink.section2();
921            }
922            else if ( headingLevel == Sink.SECTION_LEVEL_1 )
923            {
924                sink.section1();
925            }
926        }
927    }
928
929    /**
930     * Return the current section level.
931     *
932     * @return the current section level.
933     */
934    protected int getSectionLevel()
935    {
936        return this.headingLevel;
937    }
938
939    /**
940     * Set the current section level.
941     *
942     * @param newLevel the new section level.
943     */
944    protected void setSectionLevel( int newLevel )
945    {
946        this.headingLevel = newLevel;
947    }
948
949    /**
950     * Stop verbatim mode.
951     */
952    protected void verbatim_()
953    {
954        this.inVerbatim = false;
955    }
956
957    /**
958     * Start verbatim mode.
959     */
960    protected void verbatim()
961    {
962        this.inVerbatim = true;
963    }
964
965    /**
966     * Checks if we are currently inside a &lt;pre&gt; tag.
967     *
968     * @return true if we are currently in verbatim mode.
969     */
970    protected boolean isVerbatim()
971    {
972        return this.inVerbatim;
973    }
974
975    /**
976     * Checks if we are currently inside a &lt;script&gt; tag.
977     *
978     * @return true if we are currently inside <code>&lt;script&gt;</code> tags.
979     * @since 1.1.1.
980     */
981    protected boolean isScriptBlock()
982    {
983        return this.scriptBlock;
984    }
985
986    /**
987     * Checks if the given id is a valid Doxia id and if not, returns a transformed one.
988     *
989     * @param id The id to validate.
990     * @return A transformed id or the original id if it was already valid.
991     * @see DoxiaUtils#encodeId(String)
992     */
993    protected String validAnchor( String id )
994    {
995        if ( !DoxiaUtils.isValidId( id ) )
996        {
997            String linkAnchor = DoxiaUtils.encodeId( id, true );
998
999            String msg = "Modified invalid link: '" + id + "' to '" + linkAnchor + "'";
1000            logMessage( "modifiedLink", msg );
1001
1002            return linkAnchor;
1003        }
1004
1005        return id;
1006    }
1007
1008    /** {@inheritDoc} */
1009    @Override
1010    protected void init()
1011    {
1012        super.init();
1013
1014        this.scriptBlock = false;
1015        this.isLink = false;
1016        this.isAnchor = false;
1017        this.orderedListDepth = 0;
1018        this.headingLevel = 0;
1019        this.inVerbatim = false;
1020        this.warnMessages = null;
1021    }
1022
1023    private void handleAEnd( Sink sink )
1024    {
1025        if ( isLink )
1026        {
1027            sink.link_();
1028            isLink = false;
1029        }
1030        else if ( isAnchor )
1031        {
1032            sink.anchor_();
1033            isAnchor = false;
1034        }
1035    }
1036
1037    private void handleAStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1038    {
1039        String href = parser.getAttributeValue( null, Attribute.HREF.toString() );
1040
1041        if ( href != null )
1042        {
1043            int hashIndex = href.indexOf( '#' );
1044            if ( hashIndex != -1 && !DoxiaUtils.isExternalLink( href ) )
1045            {
1046                String hash = href.substring( hashIndex + 1 );
1047
1048                if ( !DoxiaUtils.isValidId( hash ) )
1049                {
1050                    href = href.substring( 0, hashIndex ) + "#" + DoxiaUtils.encodeId( hash, true );
1051
1052                    String msg = "Modified invalid link: '" + hash + "' to '" + href + "'";
1053                    logMessage( "modifiedLink", msg );
1054                }
1055            }
1056            sink.link( href, attribs );
1057            isLink = true;
1058        }
1059        else
1060        {
1061            String name = parser.getAttributeValue( null, Attribute.NAME.toString() );
1062
1063            if ( name != null )
1064            {
1065                sink.anchor( validAnchor( name ), attribs );
1066                isAnchor = true;
1067            }
1068            else
1069            {
1070                String id = parser.getAttributeValue( null, Attribute.ID.toString() );
1071                if ( id != null )
1072                {
1073                    sink.anchor( validAnchor( id ), attribs );
1074                    isAnchor = true;
1075                }
1076            }
1077        }
1078    }
1079
1080    private boolean handleDivStart( XmlPullParser parser, SinkEventAttributeSet attribs, Sink sink )
1081    {
1082        String divclass = parser.getAttributeValue( null, Attribute.CLASS.toString() );
1083
1084        this.divStack.push( divclass );
1085
1086        if ( "content".equals( divclass ) )
1087        {
1088            SinkEventAttributeSet atts = new SinkEventAttributeSet( attribs );
1089            atts.removeAttribute( SinkEventAttributes.CLASS );
1090            sink.content( atts );
1091        }
1092        if ( "source".equals( divclass ) )
1093        {
1094            return false;
1095        }
1096        else
1097        {
1098            sink.division( attribs );
1099        }
1100
1101        return true;
1102    }
1103
1104    private boolean handleDivEnd( Sink sink )
1105    {
1106        String divclass = divStack.pop();
1107
1108        if ( "content".equals( divclass ) )
1109        {
1110            sink.content_();
1111        }
1112        if ( "source".equals( divclass ) )
1113        {
1114            return false;
1115        }
1116        else
1117        {
1118            sink.division_();
1119        }
1120
1121        return true;
1122    }
1123
1124    private void handleImgStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1125    {
1126        String src = parser.getAttributeValue( null, Attribute.SRC.toString() );
1127
1128        if ( src != null )
1129        {
1130            sink.figureGraphics( src, attribs );
1131        }
1132    }
1133
1134    private void handleLIStart( Sink sink, SinkEventAttributeSet attribs )
1135    {
1136        if ( orderedListDepth == 0 )
1137        {
1138            sink.listItem( attribs );
1139        }
1140        else
1141        {
1142            sink.numberedListItem( attribs );
1143        }
1144    }
1145
1146    private void handleListItemEnd( Sink sink )
1147    {
1148        if ( orderedListDepth == 0 )
1149        {
1150            sink.listItem_();
1151        }
1152        else
1153        {
1154            sink.numberedListItem_();
1155        }
1156    }
1157
1158    private void handleOLStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
1159    {
1160        int numbering = Sink.NUMBERING_DECIMAL;
1161        // this will have to be generalized if we handle styles
1162        String style = parser.getAttributeValue( null, Attribute.STYLE.toString() );
1163
1164        if ( style != null )
1165        {
1166            switch ( style )
1167            {
1168                case "list-style-type: upper-alpha":
1169                    numbering = Sink.NUMBERING_UPPER_ALPHA;
1170                    break;
1171                case "list-style-type: lower-alpha":
1172                    numbering = Sink.NUMBERING_LOWER_ALPHA;
1173                    break;
1174                case "list-style-type: upper-roman":
1175                    numbering = Sink.NUMBERING_UPPER_ROMAN;
1176                    break;
1177                case "list-style-type: lower-roman":
1178                    numbering = Sink.NUMBERING_LOWER_ROMAN;
1179                    break;
1180                case "list-style-type: decimal":
1181                    numbering = Sink.NUMBERING_DECIMAL;
1182                    break;
1183                default:
1184                    // ignore all other
1185            }
1186        }
1187
1188        sink.numberedList( numbering, attribs );
1189        orderedListDepth++;
1190    }
1191
1192    private void handlePStart( Sink sink, SinkEventAttributeSet attribs )
1193    {
1194        sink.paragraph( attribs );
1195    }
1196
1197    /*
1198     * The PRE element tells visual user agents that the enclosed text is
1199     * "preformatted". When handling preformatted text, visual user agents:
1200     * - May leave white space intact.
1201     * - May render text with a fixed-pitch font.
1202     * - May disable automatic word wrap.
1203     * - Must not disable bidirectional processing.
1204     * Non-visual user agents are not required to respect extra white space
1205     * in the content of a PRE element.
1206     */
1207    private void handlePreStart( SinkEventAttributeSet attribs, Sink sink )
1208    {
1209        verbatim();
1210        sink.verbatim( attribs );
1211    }
1212
1213    private void handleSectionStart( Sink sink, SinkEventAttributeSet attribs )
1214    {
1215        sink.section( ++sectionLevel, attribs );
1216    }
1217
1218    private void handleHeadingStart( Sink sink, int level, SinkEventAttributeSet attribs )
1219    {
1220        consecutiveSections( level, sink, attribs );
1221        sink.sectionTitle( level, attribs );
1222    }
1223
1224    private void handleSectionEnd( Sink sink )
1225    {
1226        closeOpenSections( sectionLevel, sink );
1227        this.headingLevel = 0;
1228
1229        sink.section_( sectionLevel-- );
1230    }
1231
1232    private void handleTableStart( Sink sink, SinkEventAttributeSet attribs, XmlPullParser parser )
1233    {
1234        sink.table( attribs );
1235        String border = parser.getAttributeValue( null, Attribute.BORDER.toString() );
1236        boolean grid = true;
1237
1238        if ( border == null || "0".equals( border ) )
1239        {
1240            grid = false;
1241        }
1242
1243        String align = parser.getAttributeValue( null, Attribute.ALIGN.toString() );
1244        int[] justif = {Sink.JUSTIFY_LEFT};
1245
1246        if ( "center".equals( align ) )
1247        {
1248            justif[0] = Sink.JUSTIFY_CENTER;
1249        }
1250        else if ( "right".equals( align ) )
1251        {
1252            justif[0] = Sink.JUSTIFY_RIGHT;
1253        }
1254
1255        sink.tableRows( justif, grid );
1256    }
1257
1258    /**
1259     * If debug mode is enabled, log the <code>msg</code> as is, otherwise add unique msg in <code>warnMessages</code>.
1260     *
1261     * @param key not null
1262     * @param msg not null
1263     * @see #parse(Reader, Sink)
1264     * @since 1.1.1
1265     */
1266    private void logMessage( String key, String msg )
1267    {
1268        final String log = "[XHTML Parser] " + msg;
1269        if ( getLog().isDebugEnabled() )
1270        {
1271            getLog().debug( log );
1272
1273            return;
1274        }
1275
1276        if ( warnMessages == null )
1277        {
1278            warnMessages = new HashMap<>();
1279        }
1280
1281        Set<String> set = warnMessages.get( key );
1282        if ( set == null )
1283        {
1284            set = new TreeSet<>();
1285        }
1286        set.add( log );
1287        warnMessages.put( key, set );
1288    }
1289
1290    /**
1291     * @since 1.1.1
1292     */
1293    private void logWarnings()
1294    {
1295        if ( getLog().isWarnEnabled() && this.warnMessages != null && !isSecondParsing() )
1296        {
1297            for ( Map.Entry<String, Set<String>> entry : this.warnMessages.entrySet() )
1298            {
1299                for ( String msg : entry.getValue() )
1300                {
1301                    getLog().warn( msg );
1302                }
1303            }
1304
1305            this.warnMessages = null;
1306        }
1307    }
1308}