View Javadoc
1   package org.apache.maven.doxia.module.markdown;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import com.vladsch.flexmark.Extension;
23  import com.vladsch.flexmark.ast.Heading;
24  import com.vladsch.flexmark.ast.HtmlCommentBlock;
25  import com.vladsch.flexmark.ast.Node;
26  import com.vladsch.flexmark.ast.util.TextCollectingVisitor;
27  import com.vladsch.flexmark.html.HtmlRenderer;
28  import com.vladsch.flexmark.profiles.pegdown.Extensions;
29  import com.vladsch.flexmark.profiles.pegdown.PegdownOptionsAdapter;
30  import com.vladsch.flexmark.util.options.MutableDataHolder;
31  import org.apache.commons.lang3.StringEscapeUtils;
32  import org.apache.commons.lang3.StringUtils;
33  import org.apache.maven.doxia.markup.HtmlMarkup;
34  import org.apache.maven.doxia.module.xhtml.XhtmlParser;
35  import org.apache.maven.doxia.parser.AbstractParser;
36  import org.apache.maven.doxia.parser.ParseException;
37  import org.apache.maven.doxia.parser.Parser;
38  import org.apache.maven.doxia.sink.Sink;
39  import org.codehaus.plexus.component.annotations.Component;
40  import org.codehaus.plexus.component.annotations.Requirement;
41  import org.codehaus.plexus.util.IOUtil;
42  import org.codehaus.plexus.util.xml.pull.XmlPullParser;
43  
44  import java.io.IOException;
45  import java.io.Reader;
46  import java.io.StringReader;
47  import java.util.ArrayList;
48  import java.util.regex.Matcher;
49  import java.util.regex.Pattern;
50  
51  /**
52   * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
53   * <p/>
54   * Defers effective parsing to the <a href="https://github.com/vsch/flexmark-java">flexmark-java library</a>,
55   * which generates HTML content then delegates parsing of this content to a slightly modified Doxia Xhtml parser.
56   * (before 1.8, the <a href="http://pegdown.org">PegDown library</a> was used)
57   *
58   * @author Vladimir Schneider <vladimir@vladsch.com>
59   * @author Julien Nicoulaud <julien.nicoulaud@gmail.com>
60   * @since 1.3
61   */
62  @Component( role = Parser.class, hint = "markdown" )
63  public class MarkdownParser
64      extends AbstractParser
65  {
66  
67      /**
68       * The role hint for the {@link MarkdownParser} Plexus component.
69       */
70      public static final String ROLE_HINT = "markdown";
71  
72      /**
73       * Regex that identifies a multimarkdown-style metadata section at the start of the document
74       */
75      private static final String MULTI_MARKDOWN_METADATA_SECTION =
76          "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)";
77  
78      /**
79       * Regex that captures the key and value of a multimarkdown-style metadata entry.
80       */
81      private static final String MULTI_MARKDOWN_METADATA_ENTRY =
82          "([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n";
83  
84      /**
85       * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
86       * first key in the metadata section must be one of these standard keys or else the entire metadata section is
87       * ignored.
88       */
89      private static final String[] STANDARD_METADATA_KEYS =
90          { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
91              "subtitle" };
92  
93      public int getType()
94      {
95          return TXT_TYPE;
96      }
97  
98      @Requirement
99      private MarkdownHtmlParser parser;
100 
101     public void parse( Reader source, Sink sink )
102         throws ParseException
103     {
104         try
105         {
106             // Markdown to HTML (using flexmark-java library)
107             String html = toHtml( source );
108             // then HTML to Sink API
109             parser.parse( new StringReader( html ), sink );
110         }
111         catch ( IOException e )
112         {
113             throw new ParseException( "Failed reading Markdown source document", e );
114         }
115     }
116 
117     /**
118      * uses flexmark-java library to parse content and generate HTML output.
119      *
120      * @param source the Markdown source
121      * @return HTML content generated by flexmark-java
122      * @throws IOException passed through
123      */
124     private String toHtml( Reader source )
125         throws IOException
126     {
127         String text = IOUtil.toString( source );
128         MutableDataHolder flexmarkOptions = PegdownOptionsAdapter.flexmarkOptions(
129                 Extensions.ALL & ~( Extensions.HARDWRAPS | Extensions.ANCHORLINKS ) ).toMutable();
130         ArrayList<Extension> extensions = new ArrayList<Extension>();
131         for ( Extension extension : flexmarkOptions.get( com.vladsch.flexmark.parser.Parser.EXTENSIONS ) )
132         {
133             extensions.add( extension );
134         }
135 
136         extensions.add( FlexmarkDoxiaExtension.create() );
137         flexmarkOptions.set( com.vladsch.flexmark.parser.Parser.EXTENSIONS, extensions );
138         flexmarkOptions.set( HtmlRenderer.HTML_BLOCK_OPEN_TAG_EOL, false );
139         flexmarkOptions.set( HtmlRenderer.HTML_BLOCK_CLOSE_TAG_EOL, false );
140         flexmarkOptions.set( HtmlRenderer.MAX_TRAILING_BLANK_LINES, -1 );
141 
142         com.vladsch.flexmark.parser.Parser parser = com.vladsch.flexmark.parser.Parser.builder( flexmarkOptions )
143                 .build();
144         HtmlRenderer renderer = HtmlRenderer.builder( flexmarkOptions ).build();
145 
146         StringBuilder html = new StringBuilder( 1000 );
147         html.append( "<html>" );
148         html.append( "<head>" );
149         Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );
150         Matcher metadataMatcher = metadataPattern.matcher( text );
151         boolean haveTitle = false;
152         if ( metadataMatcher.find() )
153         {
154             metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );
155             Matcher lineMatcher = metadataPattern.matcher( metadataMatcher.group( 1 ) );
156             boolean first = true;
157             while ( lineMatcher.find() )
158             {
159                 String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) );
160                 if ( first )
161                 {
162                     boolean found = false;
163                     for ( String k : STANDARD_METADATA_KEYS )
164                     {
165                         if ( k.equalsIgnoreCase( key ) )
166                         {
167                             found = true;
168                             break;
169                         }
170                     }
171                     if ( !found )
172                     {
173                         break;
174                     }
175                     first = false;
176                 }
177                 String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) );
178                 if ( "title".equalsIgnoreCase( key ) )
179                 {
180                     haveTitle = true;
181                     html.append( "<title>" );
182                     html.append( StringEscapeUtils.escapeXml( value ) );
183                     html.append( "</title>" );
184                 }
185                 else if ( "author".equalsIgnoreCase( key ) )
186                 {
187                     html.append( "<meta name=\'author\' content=\'" );
188                     html.append( StringEscapeUtils.escapeXml( value ) );
189                     html.append( "\' />" );
190                 }
191                 else if ( "date".equalsIgnoreCase( key ) )
192                 {
193                     html.append( "<meta name=\'date\' content=\'" );
194                     html.append( StringEscapeUtils.escapeXml( value ) );
195                     html.append( "\' />" );
196                 }
197                 else
198                 {
199                     html.append( "<meta name=\'" );
200                     html.append( StringEscapeUtils.escapeXml( key ) );
201                     html.append( "\' content=\'" );
202                     html.append( StringEscapeUtils.escapeXml( value ) );
203                     html.append( "\' />" );
204                 }
205             }
206             if ( !first )
207             {
208                 text = text.substring( metadataMatcher.end() );
209             }
210         }
211 
212         Node rootNode = parser.parse( text );
213         String markdownHtml = renderer.render( rootNode );
214 
215         if ( !haveTitle && rootNode.hasChildren() )
216         {
217             // use the first (non-comment) node only if it is a heading
218             Node firstNode = rootNode.getFirstChild();
219             while ( firstNode != null && !( firstNode instanceof Heading ) )
220             {
221                 if ( !( firstNode instanceof HtmlCommentBlock ) )
222                 {
223                     break;
224                 }
225                 firstNode = firstNode.getNext();
226             }
227 
228             if ( firstNode instanceof Heading )
229             {
230                 html.append( "<title>" );
231                 TextCollectingVisitor collectingVisitor = new TextCollectingVisitor();
232                 String headingText = collectingVisitor.collectAndGetText( firstNode );
233                 html.append( StringEscapeUtils.escapeXml( headingText ) );
234                 html.append( "</title>" );
235             }
236         }
237         html.append( "</head>" );
238         html.append( "<body>" );
239         html.append( markdownHtml );
240         html.append( "</body>" );
241         html.append( "</html>" );
242 
243         return html.toString();
244     }
245 
246     /**
247      * Internal parser for HTML generated by the Markdown library.
248      */
249     @Component( role = MarkdownHtmlParser.class )
250     public static class MarkdownHtmlParser
251         extends XhtmlParser
252     {
253         public MarkdownHtmlParser()
254         {
255             super();
256         }
257 
258         @Override
259         protected boolean baseEndTag( XmlPullParser parser, Sink sink )
260         {
261             boolean visited = super.baseEndTag( parser, sink );
262             if ( !visited )
263             {
264                 if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
265                 {
266                     handleUnknown( parser, sink, TAG_TYPE_END );
267                     visited = true;
268                 }
269             }
270             return visited;
271         }
272 
273         @Override
274         protected boolean baseStartTag( XmlPullParser parser, Sink sink )
275         {
276             boolean visited = super.baseStartTag( parser, sink );
277             if ( !visited )
278             {
279                 if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
280                 {
281                     handleUnknown( parser, sink, TAG_TYPE_START );
282                     visited = true;
283                 }
284             }
285             return visited;
286         }
287     }
288 }