View Javadoc
1   package org.apache.maven.doxia.module.markdown;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import com.vladsch.flexmark.ast.Heading;
23  import com.vladsch.flexmark.ast.HtmlCommentBlock;
24  import com.vladsch.flexmark.util.ast.Node;
25  import com.vladsch.flexmark.ast.util.TextCollectingVisitor;
26  import com.vladsch.flexmark.html.HtmlRenderer;
27  import com.vladsch.flexmark.util.options.MutableDataSet;
28  import com.vladsch.flexmark.ext.escaped.character.EscapedCharacterExtension;
29  import com.vladsch.flexmark.ext.abbreviation.AbbreviationExtension;
30  import com.vladsch.flexmark.ext.autolink.AutolinkExtension;
31  import com.vladsch.flexmark.ext.definition.DefinitionExtension;
32  import com.vladsch.flexmark.ext.typographic.TypographicExtension;
33  import com.vladsch.flexmark.ext.tables.TablesExtension;
34  import com.vladsch.flexmark.ext.wikilink.WikiLinkExtension;
35  import com.vladsch.flexmark.ext.gfm.strikethrough.StrikethroughExtension;
36  
37  import org.apache.commons.io.input.CharSequenceReader;
38  import org.apache.maven.doxia.markup.HtmlMarkup;
39  import org.apache.maven.doxia.module.xhtml.XhtmlParser;
40  import org.apache.maven.doxia.parser.AbstractParser;
41  import org.apache.maven.doxia.parser.ParseException;
42  import org.apache.maven.doxia.parser.Parser;
43  import org.apache.maven.doxia.sink.Sink;
44  import org.apache.maven.doxia.util.HtmlTools;
45  import org.codehaus.plexus.component.annotations.Component;
46  import org.codehaus.plexus.component.annotations.Requirement;
47  import org.codehaus.plexus.util.IOUtil;
48  import org.codehaus.plexus.util.xml.pull.XmlPullParser;
49  
50  import java.io.IOException;
51  import java.io.Reader;
52  import java.util.Arrays;
53  import java.util.regex.Matcher;
54  import java.util.regex.Pattern;
55  
56  /**
57   * <p>
58   * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
59   * </p>
60   * <p>
61   * Defers effective parsing to the <a href="https://github.com/vsch/flexmark-java">flexmark-java library</a>,
62   * which generates HTML content then delegates parsing of this content to a slightly modified Doxia Xhtml parser.
63   * (before 1.8, the <a href="http://pegdown.org">PegDown library</a> was used)
64   * </p>
65   *
66   * @author Vladimir Schneider
67   * @author Julien Nicoulaud
68   * @since 1.3
69   */
70  @Component( role = Parser.class, hint = MarkdownParser.ROLE_HINT )
71  public class MarkdownParser
72      extends AbstractParser
73  {
74  
75      /**
76       * The role hint for the {@link MarkdownParser} Plexus component.
77       */
78      public static final String ROLE_HINT = "markdown";
79  
80      /**
81       * Regex that identifies a multimarkdown-style metadata section at the start of the document
82       *
83       * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
84       * first key in the metadata section must be one of these standard keys or else the entire metadata section is
85       * ignored.
86       */
87      private static final Pattern METADATA_SECTION_PATTERN = Pattern.compile(
88              "\\A^\\s*"
89              + "(?:title|author|date|address|affiliation|copyright|email|keywords|language|phone|subtitle)"
90              + "[ \\t]*:[ \\t]*[^\\r\\n]*[ \\t]*$[\\r\\n]+"
91              + "(?:^[ \\t]*[^:\\r\\n]+[ \\t]*:[ \\t]*[^\\r\\n]*[ \\t]*$[\\r\\n]+)*",
92              Pattern.MULTILINE | Pattern.CASE_INSENSITIVE );
93  
94      /**
95       * Regex that captures the key and value of a multimarkdown-style metadata entry.
96       */
97      private static final Pattern METADATA_ENTRY_PATTERN = Pattern.compile(
98              "^[ \\t]*([^:\\r\\n]+?)[ \\t]*:[ \\t]*([^\\r\\n]*)[ \\t]*$",
99              Pattern.MULTILINE );
100 
101     /**
102      * <p>getType.</p>
103      *
104      * @return a int.
105      */
106     @Override
107     public int getType()
108     {
109         return TXT_TYPE;
110     }
111 
112     /**
113      * The parser of the HTML produced by Flexmark, that we will
114      * use to convert this HTML to Sink events
115      */
116     @Requirement
117     private MarkdownHtmlParser parser;
118 
119     /**
120      * Flexmark's Markdown parser (one static instance fits all)
121      */
122     private static final com.vladsch.flexmark.parser.Parser FLEXMARK_PARSER;
123 
124     /**
125      * Flexmark's HTML renderer (its output will be re-parsed and converted to Sink events)
126      */
127     private static final HtmlRenderer FLEXMARK_HTML_RENDERER;
128 
129     // Initialize the Flexmark parser and renderer, once and for all
130     static
131     {
132         MutableDataSet flexmarkOptions = new MutableDataSet();
133 
134         // Enable the extensions that we used to have in Pegdown
135         flexmarkOptions.set( com.vladsch.flexmark.parser.Parser.EXTENSIONS, Arrays.asList(
136                 EscapedCharacterExtension.create(),
137                 AbbreviationExtension.create(),
138                 AutolinkExtension.create(),
139                 DefinitionExtension.create(),
140                 TypographicExtension.create(),
141                 TablesExtension.create(),
142                 WikiLinkExtension.create(),
143                 StrikethroughExtension.create()
144         ) );
145 
146         // Disable wrong apostrophe replacement
147         flexmarkOptions.set( TypographicExtension.SINGLE_QUOTE_UNMATCHED, "&apos;" );
148 
149         // Additional options on the HTML rendering
150         flexmarkOptions.set( HtmlRenderer.HTML_BLOCK_OPEN_TAG_EOL, false );
151         flexmarkOptions.set( HtmlRenderer.HTML_BLOCK_CLOSE_TAG_EOL, false );
152         flexmarkOptions.set( HtmlRenderer.MAX_TRAILING_BLANK_LINES, -1 );
153 
154         // Build the Markdown parser
155         FLEXMARK_PARSER = com.vladsch.flexmark.parser.Parser.builder( flexmarkOptions ).build();
156 
157         // Build the HTML renderer
158         FLEXMARK_HTML_RENDERER = HtmlRenderer.builder( flexmarkOptions )
159                 .linkResolverFactory( new FlexmarkDoxiaLinkResolver.Factory() )
160                 .build();
161 
162     }
163 
164     /** {@inheritDoc} */
165     @Override
166     public void parse( Reader source, Sink sink, String reference )
167         throws ParseException
168     {
169         try
170         {
171             // Markdown to HTML (using flexmark-java library)
172             CharSequence html = toHtml( source );
173 
174             // then HTML to Sink API
175             parser.parse( new CharSequenceReader( html ), sink );
176         }
177         catch ( IOException e )
178         {
179             throw new ParseException( "Failed reading Markdown source document", e );
180         }
181     }
182 
183     /**
184      * uses flexmark-java library to parse content and generate HTML output.
185      *
186      * @param source the Markdown source
187      * @return HTML content generated by flexmark-java
188      * @throws IOException passed through
189      */
190     CharSequence toHtml( Reader source )
191         throws IOException
192     {
193         // Read the source
194         String text = IOUtil.toString( source );
195 
196         // Now, build the HTML document
197         StringBuilder html = new StringBuilder( 1000 );
198         html.append( "<html>" );
199         html.append( "<head>" );
200 
201         // First, we interpret the "metadata" section of the document and add the corresponding HTML headers
202         Matcher metadataMatcher = METADATA_SECTION_PATTERN.matcher( text );
203         boolean haveTitle = false;
204         if ( metadataMatcher.find() )
205         {
206             Matcher entryMatcher = METADATA_ENTRY_PATTERN.matcher( metadataMatcher.group( 0 ) );
207             while ( entryMatcher.find() )
208             {
209                 String key = entryMatcher.group( 1 );
210                 String value = entryMatcher.group( 2 );
211                 if ( "title".equalsIgnoreCase( key ) )
212                 {
213                     haveTitle = true;
214                     html.append( "<title>" );
215                     html.append( HtmlTools.escapeHTML( value, false ) );
216                     html.append( "</title>" );
217                 }
218                 else
219                 {
220                     html.append( "<meta name='" );
221                     html.append( HtmlTools.escapeHTML( key ) );
222                     html.append( "' content='" );
223                     html.append( HtmlTools.escapeHTML( value ) );
224                     html.append( "' />" );
225                 }
226             }
227 
228             // Trim the metadata from the source
229             text = text.substring( metadataMatcher.end( 0 ) );
230 
231         }
232 
233         // Now is the time to parse the Markdown document
234         // (after we've trimmed out the metadatas, and before we check for its headings)
235         Node documentRoot = FLEXMARK_PARSER.parse( text );
236 
237         // Special trick: if there is no title specified as a metadata in the header, we will use the first
238         // heading as the document title
239         if ( !haveTitle && documentRoot.hasChildren() )
240         {
241             // Skip the comment nodes
242             Node firstNode = documentRoot.getFirstChild();
243             while ( firstNode != null && firstNode instanceof HtmlCommentBlock )
244             {
245                 firstNode = firstNode.getNext();
246             }
247 
248             // If this first non-comment node is a heading, we use it as the document title
249             if ( firstNode != null && firstNode instanceof Heading )
250             {
251                 html.append( "<title>" );
252                 TextCollectingVisitor collectingVisitor = new TextCollectingVisitor();
253                 String headingText = collectingVisitor.collectAndGetText( firstNode );
254                 html.append( HtmlTools.escapeHTML( headingText, false ) );
255                 html.append( "</title>" );
256             }
257         }
258         html.append( "</head>" );
259         html.append( "<body>" );
260 
261         // Convert our Markdown document to HTML and append it to our HTML
262         FLEXMARK_HTML_RENDERER.render( documentRoot, html );
263 
264         html.append( "</body>" );
265         html.append( "</html>" );
266 
267         return html;
268     }
269 
270     /**
271      * Internal parser for HTML generated by the Markdown library.
272      *
273      * 2 special things:
274      * <ul>
275      * <li> DIV elements are translated as Unknown Sink events
276      * <li> PRE elements are all considered as boxed
277      * </ul>
278      * PRE elements need to be "boxed" because the XhtmlSink will surround the
279      * corresponding verbatim() Sink event with a DIV element with class="source",
280      * which is how most Maven Skin (incl. Fluido) recognize a block of code, which
281      * needs to be highlighted accordingly.
282      */
283     @Component( role = MarkdownHtmlParser.class )
284     public static class MarkdownHtmlParser
285         extends XhtmlParser
286     {
287         public MarkdownHtmlParser()
288         {
289             super();
290         }
291 
292         @Override
293         protected void init()
294         {
295             super.init();
296             super.boxed = true;
297         }
298 
299         @Override
300         protected boolean baseEndTag( XmlPullParser parser, Sink sink )
301         {
302             boolean visited = super.baseEndTag( parser, sink );
303             if ( !visited )
304             {
305                 if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
306                 {
307                     handleUnknown( parser, sink, TAG_TYPE_END );
308                     visited = true;
309                 }
310             }
311             return visited;
312         }
313 
314         @Override
315         protected boolean baseStartTag( XmlPullParser parser, Sink sink )
316         {
317             boolean visited = super.baseStartTag( parser, sink );
318             if ( !visited )
319             {
320                 if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
321                 {
322                     handleUnknown( parser, sink, TAG_TYPE_START );
323                     super.boxed = true;
324                     visited = true;
325                 }
326             }
327             return visited;
328         }
329     }
330 }