001package org.apache.maven.doxia.module.markdown;
002
003/*
004 * Licensed to the Apache Software Foundation (ASF) under one
005 * or more contributor license agreements.  See the NOTICE file
006 * distributed with this work for additional information
007 * regarding copyright ownership.  The ASF licenses this file
008 * to you under the Apache License, Version 2.0 (the
009 * "License"); you may not use this file except in compliance
010 * with the License.  You may obtain a copy of the License at
011 *
012 *   http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing,
015 * software distributed under the License is distributed on an
016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017 * KIND, either express or implied.  See the License for the
018 * specific language governing permissions and limitations
019 * under the License.
020 */
021
022import com.vladsch.flexmark.Extension;
023import com.vladsch.flexmark.ast.Heading;
024import com.vladsch.flexmark.ast.HtmlCommentBlock;
025import com.vladsch.flexmark.ast.Node;
026import com.vladsch.flexmark.ast.util.TextCollectingVisitor;
027import com.vladsch.flexmark.html.HtmlRenderer;
028import com.vladsch.flexmark.profiles.pegdown.Extensions;
029import com.vladsch.flexmark.profiles.pegdown.PegdownOptionsAdapter;
030import com.vladsch.flexmark.util.options.MutableDataHolder;
031import org.apache.commons.lang3.StringEscapeUtils;
032import org.apache.commons.lang3.StringUtils;
033import org.apache.maven.doxia.markup.HtmlMarkup;
034import org.apache.maven.doxia.module.xhtml.XhtmlParser;
035import org.apache.maven.doxia.parser.AbstractParser;
036import org.apache.maven.doxia.parser.ParseException;
037import org.apache.maven.doxia.parser.Parser;
038import org.apache.maven.doxia.sink.Sink;
039import org.codehaus.plexus.component.annotations.Component;
040import org.codehaus.plexus.component.annotations.Requirement;
041import org.codehaus.plexus.util.IOUtil;
042import org.codehaus.plexus.util.xml.pull.XmlPullParser;
043
044import java.io.IOException;
045import java.io.Reader;
046import java.io.StringReader;
047import java.util.ArrayList;
048import java.util.regex.Matcher;
049import java.util.regex.Pattern;
050
051/**
052 * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
053 * <p/>
054 * Defers effective parsing to the <a href="https://github.com/vsch/flexmark-java">flexmark-java library</a>,
055 * which generates HTML content then delegates parsing of this content to a slightly modified Doxia Xhtml parser.
056 * (before 1.8, the <a href="http://pegdown.org">PegDown library</a> was used)
057 *
058 * @author Vladimir Schneider <vladimir@vladsch.com>
059 * @author Julien Nicoulaud <julien.nicoulaud@gmail.com>
060 * @since 1.3
061 */
062@Component( role = Parser.class, hint = "markdown" )
063public class MarkdownParser
064    extends AbstractParser
065{
066
067    /**
068     * The role hint for the {@link MarkdownParser} Plexus component.
069     */
070    public static final String ROLE_HINT = "markdown";
071
072    /**
073     * Regex that identifies a multimarkdown-style metadata section at the start of the document
074     */
075    private static final String MULTI_MARKDOWN_METADATA_SECTION =
076        "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)";
077
078    /**
079     * Regex that captures the key and value of a multimarkdown-style metadata entry.
080     */
081    private static final String MULTI_MARKDOWN_METADATA_ENTRY =
082        "([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n";
083
084    /**
085     * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
086     * first key in the metadata section must be one of these standard keys or else the entire metadata section is
087     * ignored.
088     */
089    private static final String[] STANDARD_METADATA_KEYS =
090        { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
091            "subtitle" };
092
093    public int getType()
094    {
095        return TXT_TYPE;
096    }
097
098    @Requirement
099    private MarkdownHtmlParser parser;
100
101    public void parse( Reader source, Sink sink )
102        throws ParseException
103    {
104        try
105        {
106            // Markdown to HTML (using flexmark-java library)
107            String html = toHtml( source );
108            // then HTML to Sink API
109            parser.parse( new StringReader( html ), sink );
110        }
111        catch ( IOException e )
112        {
113            throw new ParseException( "Failed reading Markdown source document", e );
114        }
115    }
116
117    /**
118     * uses flexmark-java library to parse content and generate HTML output.
119     *
120     * @param source the Markdown source
121     * @return HTML content generated by flexmark-java
122     * @throws IOException passed through
123     */
124    private String toHtml( Reader source )
125        throws IOException
126    {
127        String text = IOUtil.toString( source );
128        MutableDataHolder flexmarkOptions = PegdownOptionsAdapter.flexmarkOptions(
129                Extensions.ALL & ~( Extensions.HARDWRAPS | Extensions.ANCHORLINKS ) ).toMutable();
130        ArrayList<Extension> extensions = new ArrayList<Extension>();
131        for ( Extension extension : flexmarkOptions.get( com.vladsch.flexmark.parser.Parser.EXTENSIONS ) )
132        {
133            extensions.add( extension );
134        }
135
136        extensions.add( FlexmarkDoxiaExtension.create() );
137        flexmarkOptions.set( com.vladsch.flexmark.parser.Parser.EXTENSIONS, extensions );
138        flexmarkOptions.set( HtmlRenderer.HTML_BLOCK_OPEN_TAG_EOL, false );
139        flexmarkOptions.set( HtmlRenderer.HTML_BLOCK_CLOSE_TAG_EOL, false );
140        flexmarkOptions.set( HtmlRenderer.MAX_TRAILING_BLANK_LINES, -1 );
141
142        com.vladsch.flexmark.parser.Parser parser = com.vladsch.flexmark.parser.Parser.builder( flexmarkOptions )
143                .build();
144        HtmlRenderer renderer = HtmlRenderer.builder( flexmarkOptions ).build();
145
146        StringBuilder html = new StringBuilder( 1000 );
147        html.append( "<html>" );
148        html.append( "<head>" );
149        Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );
150        Matcher metadataMatcher = metadataPattern.matcher( text );
151        boolean haveTitle = false;
152        if ( metadataMatcher.find() )
153        {
154            metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );
155            Matcher lineMatcher = metadataPattern.matcher( metadataMatcher.group( 1 ) );
156            boolean first = true;
157            while ( lineMatcher.find() )
158            {
159                String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) );
160                if ( first )
161                {
162                    boolean found = false;
163                    for ( String k : STANDARD_METADATA_KEYS )
164                    {
165                        if ( k.equalsIgnoreCase( key ) )
166                        {
167                            found = true;
168                            break;
169                        }
170                    }
171                    if ( !found )
172                    {
173                        break;
174                    }
175                    first = false;
176                }
177                String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) );
178                if ( "title".equalsIgnoreCase( key ) )
179                {
180                    haveTitle = true;
181                    html.append( "<title>" );
182                    html.append( StringEscapeUtils.escapeXml( value ) );
183                    html.append( "</title>" );
184                }
185                else if ( "author".equalsIgnoreCase( key ) )
186                {
187                    html.append( "<meta name=\'author\' content=\'" );
188                    html.append( StringEscapeUtils.escapeXml( value ) );
189                    html.append( "\' />" );
190                }
191                else if ( "date".equalsIgnoreCase( key ) )
192                {
193                    html.append( "<meta name=\'date\' content=\'" );
194                    html.append( StringEscapeUtils.escapeXml( value ) );
195                    html.append( "\' />" );
196                }
197                else
198                {
199                    html.append( "<meta name=\'" );
200                    html.append( StringEscapeUtils.escapeXml( key ) );
201                    html.append( "\' content=\'" );
202                    html.append( StringEscapeUtils.escapeXml( value ) );
203                    html.append( "\' />" );
204                }
205            }
206            if ( !first )
207            {
208                text = text.substring( metadataMatcher.end() );
209            }
210        }
211
212        Node rootNode = parser.parse( text );
213        String markdownHtml = renderer.render( rootNode );
214
215        if ( !haveTitle && rootNode.hasChildren() )
216        {
217            // use the first (non-comment) node only if it is a heading
218            Node firstNode = rootNode.getFirstChild();
219            while ( firstNode != null && !( firstNode instanceof Heading ) )
220            {
221                if ( !( firstNode instanceof HtmlCommentBlock ) )
222                {
223                    break;
224                }
225                firstNode = firstNode.getNext();
226            }
227
228            if ( firstNode instanceof Heading )
229            {
230                html.append( "<title>" );
231                TextCollectingVisitor collectingVisitor = new TextCollectingVisitor();
232                String headingText = collectingVisitor.collectAndGetText( firstNode );
233                html.append( StringEscapeUtils.escapeXml( headingText ) );
234                html.append( "</title>" );
235            }
236        }
237        html.append( "</head>" );
238        html.append( "<body>" );
239        html.append( markdownHtml );
240        html.append( "</body>" );
241        html.append( "</html>" );
242
243        return html.toString();
244    }
245
246    /**
247     * Internal parser for HTML generated by the Markdown library.
248     */
249    @Component( role = MarkdownHtmlParser.class )
250    public static class MarkdownHtmlParser
251        extends XhtmlParser
252    {
253        public MarkdownHtmlParser()
254        {
255            super();
256        }
257
258        @Override
259        protected boolean baseEndTag( XmlPullParser parser, Sink sink )
260        {
261            boolean visited = super.baseEndTag( parser, sink );
262            if ( !visited )
263            {
264                if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
265                {
266                    handleUnknown( parser, sink, TAG_TYPE_END );
267                    visited = true;
268                }
269            }
270            return visited;
271        }
272
273        @Override
274        protected boolean baseStartTag( XmlPullParser parser, Sink sink )
275        {
276            boolean visited = super.baseStartTag( parser, sink );
277            if ( !visited )
278            {
279                if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) )
280                {
281                    handleUnknown( parser, sink, TAG_TYPE_START );
282                    visited = true;
283                }
284            }
285            return visited;
286        }
287    }
288}