package org.apache.maven.doxia.parser;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import java.util.TreeSet;

import javax.swing.text.html.HTML.Attribute;

import org.apache.maven.doxia.macro.MacroExecutionException;
import org.apache.maven.doxia.markup.HtmlMarkup;
import org.apache.maven.doxia.sink.Sink;
import org.apache.maven.doxia.sink.SinkEventAttributes;
import org.apache.maven.doxia.sink.impl.SinkEventAttributeSet;
import org.apache.maven.doxia.util.DoxiaUtils;
import org.codehaus.plexus.util.StringUtils;
import org.codehaus.plexus.util.xml.pull.XmlPullParser;
import org.codehaus.plexus.util.xml.pull.XmlPullParserException;

/**
 * Common base parser for xhtml5 events.
 * <p>
 * Translates start tags, end tags, text, comments and CDATA sections delivered by an
 * {@link XmlPullParser} into the corresponding {@link Sink} events. The parser keeps per-document
 * state (list depth, section/heading levels, open divs, ...) that is reset by {@link #init()}.
 * </p>
 */
public class Xhtml5BaseParser
    extends AbstractXmlParser
        implements HtmlMarkup
{
    /**
     * Inline (phrasing) elements that are all emitted as {@link Sink#inline(SinkEventAttributes)}
     * events, keyed by tag name and mapped to the semantics attribute added to the event.
     * The map is populated once and never modified afterwards.
     */
    private static final Map<String, SinkEventAttributes> INLINE_SEMANTICS = createInlineSemantics();

    /**
     * True if a {@code <script></script>} or {@code <style></style>} block is read. CDATA sections within are
     * handled as rawText.
     */
    private boolean scriptBlock;

    /** Set while inside an {@code <a href="">}; used to distinguish it from {@code <a name="">}. */
    private boolean isLink;

    /** Set while inside an {@code <a name="">} or {@code <a id="">}; used to distinguish it from a link. */
    private boolean isAnchor;

    /** Used for nested lists: number of currently open {@code <ol>} elements. */
    private int orderedListDepth = 0;

    /** Counts section level, i.e. the number of currently open explicit {@code <section>} elements. */
    private int sectionLevel;

    /** Counts heading level, i.e. the level of the most recent heading (implied sections). */
    private int headingLevel;

    /** Verbatim flag, true whenever we are inside a {@code <pre>} tag. */
    private boolean inVerbatim;

    /**
     * Used to keep track of closing tags for content events: holds the {@code class} attribute of every
     * open {@code <div>}. A {@link Stack} is required here because the class may be {@code null}.
     */
    private Stack<String> divStack = new Stack<>();

    /** Used to wrap the definedTerm with its definition, even when one is omitted */
    boolean hasDefinitionListItem = false;

    /** Map of warn messages with a String as key to describe the error type and a Set as value.
     * Using to reduce warn messages. */
    private Map<String, Set<String>> warnMessages;

    /**
     * Builds the lookup table of inline elements and their semantics.
     *
     * @return a map from tag name to the semantics attribute set of that tag.
     */
    private static Map<String, SinkEventAttributes> createInlineSemantics()
    {
        Map<String, SinkEventAttributes> semantics = new HashMap<>();

        semantics.put( HtmlMarkup.EM.toString(), SinkEventAttributeSet.Semantics.EMPHASIS );
        semantics.put( HtmlMarkup.STRONG.toString(), SinkEventAttributeSet.Semantics.STRONG );
        semantics.put( HtmlMarkup.SMALL.toString(), SinkEventAttributeSet.Semantics.SMALL );
        // deprecated line-through support
        semantics.put( HtmlMarkup.S.toString(), SinkEventAttributeSet.Semantics.LINE_THROUGH );
        semantics.put( HtmlMarkup.CITE.toString(), SinkEventAttributeSet.Semantics.CITATION );
        semantics.put( HtmlMarkup.Q.toString(), SinkEventAttributeSet.Semantics.QUOTE );
        semantics.put( HtmlMarkup.DFN.toString(), SinkEventAttributeSet.Semantics.DEFINITION );
        semantics.put( HtmlMarkup.ABBR.toString(), SinkEventAttributeSet.Semantics.ABBREVIATION );
        semantics.put( HtmlMarkup.I.toString(), SinkEventAttributeSet.Semantics.ITALIC );
        semantics.put( HtmlMarkup.B.toString(), SinkEventAttributeSet.Semantics.BOLD );
        semantics.put( HtmlMarkup.CODE.toString(), SinkEventAttributeSet.Semantics.CODE );
        semantics.put( HtmlMarkup.VAR.toString(), SinkEventAttributeSet.Semantics.VARIABLE );
        semantics.put( HtmlMarkup.SAMP.toString(), SinkEventAttributeSet.Semantics.SAMPLE );
        semantics.put( HtmlMarkup.KBD.toString(), SinkEventAttributeSet.Semantics.KEYBOARD );
        semantics.put( HtmlMarkup.SUP.toString(), SinkEventAttributeSet.Semantics.SUPERSCRIPT );
        semantics.put( HtmlMarkup.SUB.toString(), SinkEventAttributeSet.Semantics.SUBSCRIPT );
        semantics.put( HtmlMarkup.U.toString(), SinkEventAttributeSet.Semantics.ANNOTATION );
        semantics.put( HtmlMarkup.MARK.toString(), SinkEventAttributeSet.Semantics.HIGHLIGHT );
        semantics.put( HtmlMarkup.RUBY.toString(), SinkEventAttributeSet.Semantics.RUBY );
        semantics.put( HtmlMarkup.RB.toString(), SinkEventAttributeSet.Semantics.RUBY_BASE );
        semantics.put( HtmlMarkup.RT.toString(), SinkEventAttributeSet.Semantics.RUBY_TEXT );
        semantics.put( HtmlMarkup.RTC.toString(), SinkEventAttributeSet.Semantics.RUBY_TEXT_CONTAINER );
        semantics.put( HtmlMarkup.RP.toString(), SinkEventAttributeSet.Semantics.RUBY_PARANTHESES );
        semantics.put( HtmlMarkup.BDI.toString(), SinkEventAttributeSet.Semantics.BIDIRECTIONAL_ISOLATION );
        semantics.put( HtmlMarkup.BDO.toString(), SinkEventAttributeSet.Semantics.BIDIRECTIONAL_OVERRIDE );
        semantics.put( HtmlMarkup.SPAN.toString(), SinkEventAttributeSet.Semantics.PHRASE );
        semantics.put( HtmlMarkup.INS.toString(), SinkEventAttributeSet.Semantics.INSERT );
        semantics.put( HtmlMarkup.DEL.toString(), SinkEventAttributeSet.Semantics.DELETE );

        return semantics;
    }

    /**
     * {@inheritDoc}
     * <p>
     * The parser state is (re-)initialized before parsing and again afterwards, and the collected
     * warnings are logged even if parsing fails.
     * </p>
     */
    @Override
    public void parse( Reader source, Sink sink, String reference )
        throws ParseException
    {
        init();

        try
        {
            super.parse( source, sink, reference );
        }
        finally
        {
            // must run before init(), which discards the collected messages
            logWarnings();

            setSecondParsing( false );
            init();
        }
    }

    /**
     * {@inheritDoc}
     * <p>
     * This implementation only delegates to the parent class; no additional parser configuration
     * (such as entity registration) is performed here.
     * </p>
     */
    @Override
    protected void initXmlParser( XmlPullParser parser )
        throws XmlPullParserException
    {
        super.initXmlParser( parser );
    }

    /**
     * <p>
     *   Goes through a common list of possible html5 start tags. These include only tags that can go into
     *   the body of an xhtml5 document and so should be re-usable by different xhtml-based parsers.
     * </p>
     * <p>
     *   The currently handled tags are:
     * </p>
     * <p>
     *   {@code <article>, <nav>, <aside>, <section>, <h2>, <h3>, <h4>,
     *   <h5>, <h6>, <header>, <main>, <footer>, <em>, <strong>,
     *   <small>, <s>, <cite>, <q>, <dfn>, <abbr>, <i>,
     *   <b>, <code>, <var>, <samp>, <kbd>, <sub>, <sup>, <u>,
     *   <mark>, <ruby>, <rb>, <rt>, <rtc>, <rp>, <bdi>,
     *   <bdo>, <span>, <ins>, <del>, <p>, <div>, <pre>, <ul>,
     *   <ol>, <li>, <dl>, <dt>, <dd>, <figure>, <figcaption>, <a>, <table>,
     *   <tr>, <th>, <td>, <caption>, <br/>, <wbr/>, <hr/>,
     *   <img/>, <script>, <style>}.
     * </p>
     *
     * @param parser A parser.
     * @param sink the sink to receive the events.
     * @return True if the event has been handled by this method, i.e. the tag was recognized, false otherwise.
     */
    protected boolean baseStartTag( XmlPullParser parser, Sink sink )
    {
        boolean visited = true;

        SinkEventAttributeSet attribs = getAttributesFromParser( parser );

        // the tag name does not change while a single event is processed, so read it once
        final String tagName = parser.getName();

        if ( tagName.equals( HtmlMarkup.ARTICLE.toString() ) )
        {
            sink.article( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.NAV.toString() ) )
        {
            sink.navigation( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.ASIDE.toString() ) )
        {
            sink.sidebar( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.SECTION.toString() ) )
        {
            handleSectionStart( sink, attribs );
        }
        else if ( tagName.equals( HtmlMarkup.H2.toString() ) )
        {
            handleHeadingStart( sink, Sink.SECTION_LEVEL_1, attribs );
        }
        else if ( tagName.equals( HtmlMarkup.H3.toString() ) )
        {
            handleHeadingStart( sink, Sink.SECTION_LEVEL_2, attribs );
        }
        else if ( tagName.equals( HtmlMarkup.H4.toString() ) )
        {
            handleHeadingStart( sink, Sink.SECTION_LEVEL_3, attribs );
        }
        else if ( tagName.equals( HtmlMarkup.H5.toString() ) )
        {
            handleHeadingStart( sink, Sink.SECTION_LEVEL_4, attribs );
        }
        else if ( tagName.equals( HtmlMarkup.H6.toString() ) )
        {
            handleHeadingStart( sink, Sink.SECTION_LEVEL_5, attribs );
        }
        else if ( tagName.equals( HtmlMarkup.HEADER.toString() ) )
        {
            sink.header( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.MAIN.toString() ) )
        {
            sink.content( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.FOOTER.toString() ) )
        {
            sink.footer( attribs );
        }
        else if ( INLINE_SEMANTICS.containsKey( tagName ) )
        {
            // all inline elements share one event type and differ only in their semantics attribute
            attribs.addAttributes( INLINE_SEMANTICS.get( tagName ) );
            sink.inline( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.P.toString() ) )
        {
            handlePStart( sink, attribs );
        }
        else if ( tagName.equals( HtmlMarkup.DIV.toString() ) )
        {
            handleDivStart( parser, attribs, sink );
        }
        else if ( tagName.equals( HtmlMarkup.PRE.toString() ) )
        {
            handlePreStart( attribs, sink );
        }
        else if ( tagName.equals( HtmlMarkup.UL.toString() ) )
        {
            sink.list( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.OL.toString() ) )
        {
            handleOLStart( parser, sink, attribs );
        }
        else if ( tagName.equals( HtmlMarkup.LI.toString() ) )
        {
            handleLIStart( sink, attribs );
        }
        else if ( tagName.equals( HtmlMarkup.DL.toString() ) )
        {
            sink.definitionList( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.DT.toString() ) )
        {
            if ( hasDefinitionListItem )
            {
                // close previous listItem
                sink.definitionListItem_();
            }
            sink.definitionListItem( attribs );
            hasDefinitionListItem = true;
            sink.definedTerm( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.DD.toString() ) )
        {
            if ( !hasDefinitionListItem )
            {
                // definition without a preceding term: open the item ourselves
                sink.definitionListItem( attribs );
            }
            sink.definition( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.FIGURE.toString() ) )
        {
            sink.figure( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.FIGCAPTION.toString() ) )
        {
            sink.figureCaption( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.A.toString() ) )
        {
            handleAStart( parser, sink, attribs );
        }
        else if ( tagName.equals( HtmlMarkup.TABLE.toString() ) )
        {
            handleTableStart( sink, attribs, parser );
        }
        else if ( tagName.equals( HtmlMarkup.TR.toString() ) )
        {
            sink.tableRow( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.TH.toString() ) )
        {
            sink.tableHeaderCell( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.TD.toString() ) )
        {
            sink.tableCell( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.CAPTION.toString() ) )
        {
            sink.tableCaption( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.BR.toString() ) )
        {
            sink.lineBreak( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.WBR.toString() ) )
        {
            sink.lineBreakOpportunity( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.HR.toString() ) )
        {
            sink.horizontalRule( attribs );
        }
        else if ( tagName.equals( HtmlMarkup.IMG.toString() ) )
        {
            handleImgStart( parser, sink, attribs );
        }
        else if ( tagName.equals( HtmlMarkup.SCRIPT.toString() )
            || tagName.equals( HtmlMarkup.STYLE.toString() ) )
        {
            handleUnknown( parser, sink, TAG_TYPE_START );
            scriptBlock = true;
        }
        else
        {
            visited = false;
        }

        return visited;
    }

    /**
     * <p>
     *   Goes through a common list of possible html end tags.
     *   These should be re-usable by different xhtml-based parsers.
     *   The tags handled here are the same as for {@link #baseStartTag(XmlPullParser,Sink)},
     *   except for the empty elements ({@code <br/>, <wbr/>, <hr/>, <img/>}).
     * </p>
     *
     * @param parser A parser.
     * @param sink the sink to receive the events.
     * @return True if the event has been handled by this method, false otherwise.
     */
    protected boolean baseEndTag( XmlPullParser parser, Sink sink )
    {
        boolean visited = true;

        final String tagName = parser.getName();

        if ( tagName.equals( HtmlMarkup.P.toString() ) )
        {
            sink.paragraph_();
        }
        else if ( tagName.equals( HtmlMarkup.DIV.toString() ) )
        {
            handleDivEnd( sink );
        }
        else if ( tagName.equals( HtmlMarkup.PRE.toString() ) )
        {
            verbatim_();

            sink.verbatim_();
        }
        else if ( tagName.equals( HtmlMarkup.UL.toString() ) )
        {
            sink.list_();
        }
        else if ( tagName.equals( HtmlMarkup.OL.toString() ) )
        {
            sink.numberedList_();
            orderedListDepth--;
        }
        else if ( tagName.equals( HtmlMarkup.LI.toString() ) )
        {
            handleListItemEnd( sink );
        }
        else if ( tagName.equals( HtmlMarkup.DL.toString() ) )
        {
            if ( hasDefinitionListItem )
            {
                // a term without definition is still open: close its item first
                sink.definitionListItem_();
                hasDefinitionListItem = false;
            }
            sink.definitionList_();
        }
        else if ( tagName.equals( HtmlMarkup.DT.toString() ) )
        {
            sink.definedTerm_();
        }
        else if ( tagName.equals( HtmlMarkup.DD.toString() ) )
        {
            sink.definition_();
            sink.definitionListItem_();
            hasDefinitionListItem = false;
        }
        else if ( tagName.equals( HtmlMarkup.FIGURE.toString() ) )
        {
            sink.figure_();
        }
        else if ( tagName.equals( HtmlMarkup.FIGCAPTION.toString() ) )
        {
            sink.figureCaption_();
        }
        else if ( tagName.equals( HtmlMarkup.A.toString() ) )
        {
            handleAEnd( sink );
        }
        else if ( INLINE_SEMANTICS.containsKey( tagName ) )
        {
            sink.inline_();
        }

        // ----------------------------------------------------------------------
        // Tables
        // ----------------------------------------------------------------------

        else if ( tagName.equals( HtmlMarkup.TABLE.toString() ) )
        {
            sink.tableRows_();

            sink.table_();
        }
        else if ( tagName.equals( HtmlMarkup.TR.toString() ) )
        {
            sink.tableRow_();
        }
        else if ( tagName.equals( HtmlMarkup.TH.toString() ) )
        {
            sink.tableHeaderCell_();
        }
        else if ( tagName.equals( HtmlMarkup.TD.toString() ) )
        {
            sink.tableCell_();
        }
        else if ( tagName.equals( HtmlMarkup.CAPTION.toString() ) )
        {
            sink.tableCaption_();
        }

        // ----------------------------------------------------------------------
        // Sectioning
        // ----------------------------------------------------------------------

        else if ( tagName.equals( HtmlMarkup.ARTICLE.toString() ) )
        {
            sink.article_();
        }
        else if ( tagName.equals( HtmlMarkup.NAV.toString() ) )
        {
            sink.navigation_();
        }
        else if ( tagName.equals( HtmlMarkup.ASIDE.toString() ) )
        {
            sink.sidebar_();
        }
        else if ( tagName.equals( HtmlMarkup.SECTION.toString() ) )
        {
            handleSectionEnd( sink );
        }
        else if ( tagName.equals( HtmlMarkup.H2.toString() ) )
        {
            sink.sectionTitle1_();
        }
        else if ( tagName.equals( HtmlMarkup.H3.toString() ) )
        {
            sink.sectionTitle2_();
        }
        else if ( tagName.equals( HtmlMarkup.H4.toString() ) )
        {
            sink.sectionTitle3_();
        }
        else if ( tagName.equals( HtmlMarkup.H5.toString() ) )
        {
            sink.sectionTitle4_();
        }
        else if ( tagName.equals( HtmlMarkup.H6.toString() ) )
        {
            sink.sectionTitle5_();
        }
        else if ( tagName.equals( HtmlMarkup.HEADER.toString() ) )
        {
            sink.header_();
        }
        else if ( tagName.equals( HtmlMarkup.MAIN.toString() ) )
        {
            sink.content_();
        }
        else if ( tagName.equals( HtmlMarkup.FOOTER.toString() ) )
        {
            sink.footer_();
        }
        else if ( tagName.equals( HtmlMarkup.SCRIPT.toString() )
            || tagName.equals( HtmlMarkup.STYLE.toString() ) )
        {
            handleUnknown( parser, sink, TAG_TYPE_END );

            scriptBlock = false;
        }
        else
        {
            visited = false;
        }

        return visited;
    }

    /**
     * {@inheritDoc}
     *
     * Just calls {@link #baseStartTag(XmlPullParser,Sink)}, this should be
     * overridden by implementing parsers to include additional tags.
     * Tags that are not recognized are reported as a warning.
     */
    @Override
    protected void handleStartTag( XmlPullParser parser, Sink sink )
        throws XmlPullParserException, MacroExecutionException
    {
        if ( !baseStartTag( parser, sink ) )
        {
            if ( getLog().isWarnEnabled() )
            {
                String position = "[" + parser.getLineNumber() + ":"
                    + parser.getColumnNumber() + "]";
                String tag = "<" + parser.getName() + ">";

                getLog().warn( "Unrecognized xml tag: " + tag + " at " + position );
            }
        }
    }

    /**
     * {@inheritDoc}
     *
     * Just calls {@link #baseEndTag(XmlPullParser,Sink)}, this should be
     * overridden by implementing parsers to include additional tags.
     */
    @Override
    protected void handleEndTag( XmlPullParser parser, Sink sink )
        throws XmlPullParserException, MacroExecutionException
    {
        if ( !baseEndTag( parser, sink ) )
        {
            // unrecognized tag is already logged in StartTag
        }
    }

    /** {@inheritDoc} */
    @Override
    protected void handleText( XmlPullParser parser, Sink sink )
        throws XmlPullParserException
    {
        String text = getText( parser );

        /*
         * NOTE: Don't do any whitespace trimming here. Whitespace normalization has already been performed by the
         * parser so any whitespace that makes it here is significant.
         *
         * NOTE: text within script tags is ignored, scripting code should be embedded in CDATA.
         */
        if ( StringUtils.isNotEmpty( text ) && !isScriptBlock() )
        {
            sink.text( text );
        }
    }

    /**
     * {@inheritDoc}
     * <p>
     * A comment whose trimmed content is {@code PB} is emitted as a page break; any other comment
     * is forwarded to the sink only if comments are to be emitted.
     * </p>
     */
    @Override
    protected void handleComment( XmlPullParser parser, Sink sink )
        throws XmlPullParserException
    {
        String text = getText( parser );

        if ( "PB".equals( text.trim() ) )
        {
            sink.pageBreak();
        }
        else if ( isEmitComments() )
        {
            sink.comment( text );
        }
    }

    /**
     * {@inheritDoc}
     * <p>
     * Inside a script or style block the CDATA content is passed on as an unknown event, otherwise
     * it is emitted as plain text.
     * </p>
     */
    @Override
    protected void handleCdsect( XmlPullParser parser, Sink sink )
        throws XmlPullParserException
    {
        String text = getText( parser );

        if ( isScriptBlock() )
        {
            sink.unknown( CDATA, new Object[] { CDATA_TYPE, text }, null );
        }
        else
        {
            sink.text( text );
        }
    }

    /**
     * Make sure sections are nested consecutively.
     *
     * <p>
     * HTML5 heading tags H1 to H6 imply sections where they are not
     * present, that means we have to open close any sections that
     * are missing in between.
     * </p>
     *
     * <p>
     * For instance, if the following sequence is parsed:
     * </p>
     * <pre>
     * &lt;h3&gt;&lt;/h3&gt;
     * &lt;h6&gt;&lt;/h6&gt;
     * </pre>
     * <p>
     * we have to insert two section starts before we open the <code>&lt;h6&gt;</code>.
     * In the following sequence
     * </p>
     * <pre>
     * &lt;h6&gt;&lt;/h6&gt;
     * &lt;h3&gt;&lt;/h3&gt;
     * </pre>
     * <p>
     * we have to close two sections before we open the <code>&lt;h3&gt;</code>.
     * </p>
     *
     * <p>The current level is set to newLevel afterwards.</p>
     *
     * @param newLevel the new section level, all upper levels have to be closed.
     * @param sink the sink to receive the events.
     * @param attribs a {@link org.apache.maven.doxia.sink.impl.SinkEventAttributeSet} object
     *                (currently not used by this method).
     */
    protected void consecutiveSections( int newLevel, Sink sink, SinkEventAttributeSet attribs )
    {
        closeOpenSections( newLevel, sink );
        openMissingSections( newLevel, sink );

        this.headingLevel = newLevel;
    }

    /**
     * Close open sections.
     * <p>
     * Only implied sections are closed here, i.e. levels above the number of explicitly opened
     * {@code <section>} elements.
     * </p>
     *
     * @param newLevel the new section level, all upper levels have to be closed.
     * @param sink the sink to receive the events.
     */
    private void closeOpenSections( int newLevel, Sink sink )
    {
        while ( this.headingLevel >= newLevel
                && this.sectionLevel < headingLevel )
        {
            closeSection( headingLevel, sink );

            this.headingLevel--;
        }
    }

    /**
     * Open missing sections.
     *
     * @param newLevel the new section level, all lower levels have to be opened.
     * @param sink the sink to receive the events.
     */
    private void openMissingSections( int newLevel, Sink sink )
    {
        while ( this.headingLevel < newLevel
                && this.sectionLevel < newLevel )
        {
            this.headingLevel++;

            openSection( headingLevel, sink );
        }
    }

    /**
     * Emits the section start event of the given level; levels outside 1 to 5 are ignored.
     *
     * @param level the section level to open.
     * @param sink the sink to receive the event.
     */
    private void openSection( int level, Sink sink )
    {
        switch ( level )
        {
            case Sink.SECTION_LEVEL_5:
                sink.section5();
                break;
            case Sink.SECTION_LEVEL_4:
                sink.section4();
                break;
            case Sink.SECTION_LEVEL_3:
                sink.section3();
                break;
            case Sink.SECTION_LEVEL_2:
                sink.section2();
                break;
            case Sink.SECTION_LEVEL_1:
                sink.section1();
                break;
            default:
                // no section event for other levels
        }
    }

    /**
     * Emits the section end event of the given level; levels outside 1 to 5 are ignored.
     *
     * @param level the section level to close.
     * @param sink the sink to receive the event.
     */
    private void closeSection( int level, Sink sink )
    {
        switch ( level )
        {
            case Sink.SECTION_LEVEL_5:
                sink.section5_();
                break;
            case Sink.SECTION_LEVEL_4:
                sink.section4_();
                break;
            case Sink.SECTION_LEVEL_3:
                sink.section3_();
                break;
            case Sink.SECTION_LEVEL_2:
                sink.section2_();
                break;
            case Sink.SECTION_LEVEL_1:
                sink.section1_();
                break;
            default:
                // no section event for other levels
        }
    }

    /**
     * Return the current section level.
     * <p>
     * Note that this is the level implied by the most recent heading, not the nesting depth of
     * explicit {@code <section>} elements.
     * </p>
     *
     * @return the current section level.
     */
    protected int getSectionLevel()
    {
        return this.headingLevel;
    }

    /**
     * Set the current section level.
     *
     * @param newLevel the new section level.
     */
    protected void setSectionLevel( int newLevel )
    {
        this.headingLevel = newLevel;
    }

    /**
     * Stop verbatim mode.
     */
    protected void verbatim_()
    {
        this.inVerbatim = false;
    }

    /**
     * Start verbatim mode.
     */
    protected void verbatim()
    {
        this.inVerbatim = true;
    }

    /**
     * Checks if we are currently inside a {@code <pre>} tag.
     *
     * @return true if we are currently in verbatim mode.
     */
    protected boolean isVerbatim()
    {
        return this.inVerbatim;
    }

    /**
     * Checks if we are currently inside a {@code <script>} or {@code <style>} tag.
     *
     * @return true if we are currently inside <code>&lt;script&gt;</code> tags.
     * @since 1.1.1.
     */
    protected boolean isScriptBlock()
    {
        return this.scriptBlock;
    }

    /**
     * Checks if the given id is a valid Doxia id and if not, returns a transformed one.
     * A transformation is recorded as a warning message.
     *
     * @param id The id to validate.
     * @return A transformed id or the original id if it was already valid.
     * @see DoxiaUtils#encodeId(String)
     */
    protected String validAnchor( String id )
    {
        if ( !DoxiaUtils.isValidId( id ) )
        {
            String linkAnchor = DoxiaUtils.encodeId( id, true );

            String msg = "Modified invalid link: '" + id + "' to '" + linkAnchor + "'";
            logMessage( "modifiedLink", msg );

            return linkAnchor;
        }

        return id;
    }

    /**
     * {@inheritDoc}
     * <p>
     * Resets all per-document state of this parser, so that an instance can safely be re-used even
     * if a previous parse was aborted in the middle of a document.
     * </p>
     */
    @Override
    protected void init()
    {
        super.init();

        this.scriptBlock = false;
        this.isLink = false;
        this.isAnchor = false;
        this.orderedListDepth = 0;
        this.sectionLevel = 0;
        this.headingLevel = 0;
        this.inVerbatim = false;
        this.hasDefinitionListItem = false;
        // re-created rather than cleared so that this also works before field initializers have run
        this.divStack = new Stack<>();
        this.warnMessages = null;
    }

    /**
     * Closes the link or anchor opened by the matching {@code <a>} start tag, if any.
     *
     * @param sink the sink to receive the event.
     */
    private void handleAEnd( Sink sink )
    {
        if ( isLink )
        {
            sink.link_();
            isLink = false;
        }
        else if ( isAnchor )
        {
            sink.anchor_();
            isAnchor = false;
        }
    }

    /**
     * Handles an {@code <a>} start tag: emits a link if {@code href} is present, otherwise an anchor
     * named after the {@code name} or, failing that, the {@code id} attribute. An {@code <a>} with
     * none of these attributes produces no event.
     *
     * @param parser the parser positioned on the start tag.
     * @param sink the sink to receive the event.
     * @param attribs the attributes of the tag.
     */
    private void handleAStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
    {
        String href = parser.getAttributeValue( null, Attribute.HREF.toString() );

        if ( href != null )
        {
            int hashIndex = href.indexOf( '#' );
            if ( hashIndex != -1 && !DoxiaUtils.isExternalLink( href ) )
            {
                String hash = href.substring( hashIndex + 1 );

                if ( !DoxiaUtils.isValidId( hash ) )
                {
                    // only the fragment of an internal link is rewritten, the path is kept as is
                    href = href.substring( 0, hashIndex ) + "#" + DoxiaUtils.encodeId( hash, true );

                    String msg = "Modified invalid link: '" + hash + "' to '" + href + "'";
                    logMessage( "modifiedLink", msg );
                }
            }
            sink.link( href, attribs );
            isLink = true;
            return;
        }

        String name = parser.getAttributeValue( null, Attribute.NAME.toString() );
        if ( name == null )
        {
            // fall back to the id attribute
            name = parser.getAttributeValue( null, Attribute.ID.toString() );
        }

        if ( name != null )
        {
            sink.anchor( validAnchor( name ), attribs );
            isAnchor = true;
        }
    }

    /**
     * Handles a {@code <div>} start tag. The class of the div is remembered so that
     * {@link #handleDivEnd(Sink)} can emit the matching end events.
     * <p>
     * A div of class {@code content} emits a content event (without the class attribute) followed by
     * a division event; a div of class {@code source} emits nothing; any other div emits a division
     * event.
     * </p>
     *
     * @param parser the parser positioned on the start tag.
     * @param attribs the attributes of the tag.
     * @param sink the sink to receive the events.
     * @return false for a div of class {@code source}, true otherwise.
     */
    private boolean handleDivStart( XmlPullParser parser, SinkEventAttributeSet attribs, Sink sink )
    {
        String divclass = parser.getAttributeValue( null, Attribute.CLASS.toString() );

        // may be null if the div has no class attribute
        this.divStack.push( divclass );

        if ( "content".equals( divclass ) )
        {
            SinkEventAttributeSet atts = new SinkEventAttributeSet( attribs );
            atts.removeAttribute( SinkEventAttributes.CLASS );
            sink.content( atts );
        }

        if ( "source".equals( divclass ) )
        {
            return false;
        }

        sink.division( attribs );

        return true;
    }

    /**
     * Handles a {@code </div>} end tag, mirroring {@link #handleDivStart}.
     *
     * @param sink the sink to receive the events.
     * @return false for a div of class {@code source}, true otherwise.
     */
    private boolean handleDivEnd( Sink sink )
    {
        String divclass = divStack.pop();

        if ( "content".equals( divclass ) )
        {
            sink.content_();
        }

        if ( "source".equals( divclass ) )
        {
            return false;
        }

        sink.division_();

        return true;
    }

    /**
     * Handles an {@code <img/>} tag; images without a {@code src} attribute are ignored.
     *
     * @param parser the parser positioned on the tag.
     * @param sink the sink to receive the event.
     * @param attribs the attributes of the tag.
     */
    private void handleImgStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
    {
        String src = parser.getAttributeValue( null, Attribute.SRC.toString() );

        if ( src != null )
        {
            sink.figureGraphics( src, attribs );
        }
    }

    /**
     * Handles an {@code <li>} start tag: a numbered list item if any ordered list is open,
     * a plain list item otherwise.
     *
     * @param sink the sink to receive the event.
     * @param attribs the attributes of the tag.
     */
    private void handleLIStart( Sink sink, SinkEventAttributeSet attribs )
    {
        if ( orderedListDepth == 0 )
        {
            sink.listItem( attribs );
        }
        else
        {
            sink.numberedListItem( attribs );
        }
    }

    /**
     * Handles an {@code </li>} end tag, mirroring {@link #handleLIStart}.
     *
     * @param sink the sink to receive the event.
     */
    private void handleListItemEnd( Sink sink )
    {
        if ( orderedListDepth == 0 )
        {
            sink.listItem_();
        }
        else
        {
            sink.numberedListItem_();
        }
    }

    /**
     * Handles an {@code <ol>} start tag. The numbering is derived from the {@code style} attribute,
     * which is only recognized if it consists of exactly one of the supported
     * {@code list-style-type} declarations; decimal numbering is used otherwise.
     *
     * @param parser the parser positioned on the start tag.
     * @param sink the sink to receive the event.
     * @param attribs the attributes of the tag.
     */
    private void handleOLStart( XmlPullParser parser, Sink sink, SinkEventAttributeSet attribs )
    {
        int numbering = Sink.NUMBERING_DECIMAL;
        // this will have to be generalized if we handle styles
        String style = parser.getAttributeValue( null, Attribute.STYLE.toString() );

        if ( style != null )
        {
            switch ( style )
            {
                case "list-style-type: upper-alpha":
                    numbering = Sink.NUMBERING_UPPER_ALPHA;
                    break;
                case "list-style-type: lower-alpha":
                    numbering = Sink.NUMBERING_LOWER_ALPHA;
                    break;
                case "list-style-type: upper-roman":
                    numbering = Sink.NUMBERING_UPPER_ROMAN;
                    break;
                case "list-style-type: lower-roman":
                    numbering = Sink.NUMBERING_LOWER_ROMAN;
                    break;
                case "list-style-type: decimal":
                    numbering = Sink.NUMBERING_DECIMAL;
                    break;
                default:
                    // ignore all other
            }
        }

        sink.numberedList( numbering, attribs );
        orderedListDepth++;
    }

    /**
     * Handles a {@code <p>} start tag.
     *
     * @param sink the sink to receive the event.
     * @param attribs the attributes of the tag.
     */
    private void handlePStart( Sink sink, SinkEventAttributeSet attribs )
    {
        sink.paragraph( attribs );
    }

    /*
     * The PRE element tells visual user agents that the enclosed text is
     * "preformatted". When handling preformatted text, visual user agents:
     * - May leave white space intact.
     * - May render text with a fixed-pitch font.
     * - May disable automatic word wrap.
     * - Must not disable bidirectional processing.
     * Non-visual user agents are not required to respect extra white space
     * in the content of a PRE element.
     */
    private void handlePreStart( SinkEventAttributeSet attribs, Sink sink )
    {
        verbatim();
        sink.verbatim( attribs );
    }

    /**
     * Handles a {@code <section>} start tag by opening a section one level deeper than the
     * currently open explicit sections.
     *
     * @param sink the sink to receive the event.
     * @param attribs the attributes of the tag.
     */
    private void handleSectionStart( Sink sink, SinkEventAttributeSet attribs )
    {
        sink.section( ++sectionLevel, attribs );
    }

    /**
     * Handles a heading start tag: adjusts the implied sections to the heading level and then
     * opens the section title.
     *
     * @param sink the sink to receive the events.
     * @param level the section level of the heading.
     * @param attribs the attributes of the tag.
     */
    private void handleHeadingStart( Sink sink, int level, SinkEventAttributeSet attribs )
    {
        consecutiveSections( level, sink, attribs );
        sink.sectionTitle( level, attribs );
    }

    /**
     * Handles a {@code </section>} end tag: closes the sections implied by headings inside the
     * section, resets the heading level and closes the explicit section itself.
     *
     * @param sink the sink to receive the events.
     */
    private void handleSectionEnd( Sink sink )
    {
        closeOpenSections( sectionLevel, sink );
        this.headingLevel = 0;

        sink.section_( sectionLevel-- );
    }

    /**
     * Handles a {@code <table>} start tag. The grid is enabled for any {@code border} attribute
     * other than {@code 0}, and the (single) column justification is taken from the {@code align}
     * attribute, defaulting to left.
     *
     * @param sink the sink to receive the events.
     * @param attribs the attributes of the tag.
     * @param parser the parser positioned on the start tag.
     */
    private void handleTableStart( Sink sink, SinkEventAttributeSet attribs, XmlPullParser parser )
    {
        sink.table( attribs );

        String border = parser.getAttributeValue( null, Attribute.BORDER.toString() );
        boolean grid = border != null && !"0".equals( border );

        String align = parser.getAttributeValue( null, Attribute.ALIGN.toString() );
        int[] justif = {Sink.JUSTIFY_LEFT};

        if ( "center".equals( align ) )
        {
            justif[0] = Sink.JUSTIFY_CENTER;
        }
        else if ( "right".equals( align ) )
        {
            justif[0] = Sink.JUSTIFY_RIGHT;
        }

        sink.tableRows( justif, grid );
    }

    /**
     * If debug mode is enabled, log the <code>msg</code> as is, otherwise add unique msg in <code>warnMessages</code>.
     *
     * @param key not null
     * @param msg not null
     * @see #parse(Reader, Sink, String)
     * @since 1.1.1
     */
    private void logMessage( String key, String msg )
    {
        final String log = "[XHTML Parser] " + msg;
        if ( getLog().isDebugEnabled() )
        {
            getLog().debug( log );

            return;
        }

        if ( warnMessages == null )
        {
            warnMessages = new HashMap<>();
        }

        Set<String> set = warnMessages.get( key );
        if ( set == null )
        {
            // a sorted set: duplicates are dropped and messages are reported in a stable order
            set = new TreeSet<>();
            warnMessages.put( key, set );
        }
        set.add( log );
    }

    /**
     * Logs all collected warn messages and discards them, unless warnings are disabled or this is
     * a second parsing.
     *
     * @since 1.1.1
     */
    private void logWarnings()
    {
        if ( getLog().isWarnEnabled() && this.warnMessages != null && !isSecondParsing() )
        {
            for ( Set<String> messages : this.warnMessages.values() )
            {
                for ( String msg : messages )
                {
                    getLog().warn( msg );
                }
            }

            this.warnMessages = null;
        }
    }
}