001package org.apache.maven.doxia.module.markdown; 002 003/* 004 * Licensed to the Apache Software Foundation (ASF) under one 005 * or more contributor license agreements. See the NOTICE file 006 * distributed with this work for additional information 007 * regarding copyright ownership. The ASF licenses this file 008 * to you under the Apache License, Version 2.0 (the 009 * "License"); you may not use this file except in compliance 010 * with the License. You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, 015 * software distributed under the License is distributed on an 016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 017 * KIND, either express or implied. See the License for the 018 * specific language governing permissions and limitations 019 * under the License. 020 */ 021 022import com.vladsch.flexmark.Extension; 023import com.vladsch.flexmark.ast.Heading; 024import com.vladsch.flexmark.ast.HtmlCommentBlock; 025import com.vladsch.flexmark.ast.Node; 026import com.vladsch.flexmark.ast.util.TextCollectingVisitor; 027import com.vladsch.flexmark.html.HtmlRenderer; 028import com.vladsch.flexmark.profiles.pegdown.Extensions; 029import com.vladsch.flexmark.profiles.pegdown.PegdownOptionsAdapter; 030import com.vladsch.flexmark.util.options.MutableDataHolder; 031import org.apache.commons.lang3.StringEscapeUtils; 032import org.apache.commons.lang3.StringUtils; 033import org.apache.maven.doxia.markup.HtmlMarkup; 034import org.apache.maven.doxia.module.xhtml.XhtmlParser; 035import org.apache.maven.doxia.parser.AbstractParser; 036import org.apache.maven.doxia.parser.ParseException; 037import org.apache.maven.doxia.parser.Parser; 038import org.apache.maven.doxia.sink.Sink; 039import org.codehaus.plexus.component.annotations.Component; 040import org.codehaus.plexus.component.annotations.Requirement; 041import org.codehaus.plexus.util.IOUtil; 042import org.codehaus.plexus.util.xml.pull.XmlPullParser; 043 044import java.io.IOException; 045import java.io.Reader; 046import java.io.StringReader; 047import java.util.ArrayList; 048import java.util.regex.Matcher; 049import java.util.regex.Pattern; 050 051/** 052 * Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents. 053 * <p/> 054 * Defers effective parsing to the <a href="https://github.com/vsch/flexmark-java">flexmark-java library</a>, 055 * which generates HTML content then delegates parsing of this content to a slightly modified Doxia Xhtml parser. 056 * (before 1.8, the <a href="http://pegdown.org">PegDown library</a> was used) 057 * 058 * @author Vladimir Schneider <vladimir@vladsch.com> 059 * @author Julien Nicoulaud <julien.nicoulaud@gmail.com> 060 * @since 1.3 061 */ 062@Component( role = Parser.class, hint = "markdown" ) 063public class MarkdownParser 064 extends AbstractParser 065{ 066 067 /** 068 * The role hint for the {@link MarkdownParser} Plexus component. 069 */ 070 public static final String ROLE_HINT = "markdown"; 071 072 /** 073 * Regex that identifies a multimarkdown-style metadata section at the start of the document 074 */ 075 private static final String MULTI_MARKDOWN_METADATA_SECTION = 076 "^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)"; 077 078 /** 079 * Regex that captures the key and value of a multimarkdown-style metadata entry. 080 */ 081 private static final String MULTI_MARKDOWN_METADATA_ENTRY = 082 "([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n"; 083 084 /** 085 * In order to ensure that we have minimal risk of false positives when slurping metadata sections, the 086 * first key in the metadata section must be one of these standard keys or else the entire metadata section is 087 * ignored. 088 */ 089 private static final String[] STANDARD_METADATA_KEYS = 090 { "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone", 091 "subtitle" }; 092 093 public int getType() 094 { 095 return TXT_TYPE; 096 } 097 098 @Requirement 099 private MarkdownHtmlParser parser; 100 101 public void parse( Reader source, Sink sink ) 102 throws ParseException 103 { 104 try 105 { 106 // Markdown to HTML (using flexmark-java library) 107 String html = toHtml( source ); 108 // then HTML to Sink API 109 parser.parse( new StringReader( html ), sink ); 110 } 111 catch ( IOException e ) 112 { 113 throw new ParseException( "Failed reading Markdown source document", e ); 114 } 115 } 116 117 /** 118 * uses flexmark-java library to parse content and generate HTML output. 119 * 120 * @param source the Markdown source 121 * @return HTML content generated by flexmark-java 122 * @throws IOException passed through 123 */ 124 private String toHtml( Reader source ) 125 throws IOException 126 { 127 String text = IOUtil.toString( source ); 128 MutableDataHolder flexmarkOptions = PegdownOptionsAdapter.flexmarkOptions( 129 Extensions.ALL & ~( Extensions.HARDWRAPS | Extensions.ANCHORLINKS ) ).toMutable(); 130 ArrayList<Extension> extensions = new ArrayList<Extension>(); 131 for ( Extension extension : flexmarkOptions.get( com.vladsch.flexmark.parser.Parser.EXTENSIONS ) ) 132 { 133 extensions.add( extension ); 134 } 135 136 extensions.add( FlexmarkDoxiaExtension.create() ); 137 flexmarkOptions.set( com.vladsch.flexmark.parser.Parser.EXTENSIONS, extensions ); 138 flexmarkOptions.set( HtmlRenderer.HTML_BLOCK_OPEN_TAG_EOL, false ); 139 flexmarkOptions.set( HtmlRenderer.HTML_BLOCK_CLOSE_TAG_EOL, false ); 140 flexmarkOptions.set( HtmlRenderer.MAX_TRAILING_BLANK_LINES, -1 ); 141 142 com.vladsch.flexmark.parser.Parser parser = com.vladsch.flexmark.parser.Parser.builder( flexmarkOptions ) 143 .build(); 144 HtmlRenderer renderer = HtmlRenderer.builder( flexmarkOptions ).build(); 145 146 StringBuilder html = new StringBuilder( 1000 ); 147 html.append( "<html>" ); 148 html.append( "<head>" ); 149 Pattern metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE ); 150 Matcher metadataMatcher = metadataPattern.matcher( text ); 151 boolean haveTitle = false; 152 if ( metadataMatcher.find() ) 153 { 154 metadataPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE ); 155 Matcher lineMatcher = metadataPattern.matcher( metadataMatcher.group( 1 ) ); 156 boolean first = true; 157 while ( lineMatcher.find() ) 158 { 159 String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) ); 160 if ( first ) 161 { 162 boolean found = false; 163 for ( String k : STANDARD_METADATA_KEYS ) 164 { 165 if ( k.equalsIgnoreCase( key ) ) 166 { 167 found = true; 168 break; 169 } 170 } 171 if ( !found ) 172 { 173 break; 174 } 175 first = false; 176 } 177 String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) ); 178 if ( "title".equalsIgnoreCase( key ) ) 179 { 180 haveTitle = true; 181 html.append( "<title>" ); 182 html.append( StringEscapeUtils.escapeXml( value ) ); 183 html.append( "</title>" ); 184 } 185 else if ( "author".equalsIgnoreCase( key ) ) 186 { 187 html.append( "<meta name=\'author\' content=\'" ); 188 html.append( StringEscapeUtils.escapeXml( value ) ); 189 html.append( "\' />" ); 190 } 191 else if ( "date".equalsIgnoreCase( key ) ) 192 { 193 html.append( "<meta name=\'date\' content=\'" ); 194 html.append( StringEscapeUtils.escapeXml( value ) ); 195 html.append( "\' />" ); 196 } 197 else 198 { 199 html.append( "<meta name=\'" ); 200 html.append( StringEscapeUtils.escapeXml( key ) ); 201 html.append( "\' content=\'" ); 202 html.append( StringEscapeUtils.escapeXml( value ) ); 203 html.append( "\' />" ); 204 } 205 } 206 if ( !first ) 207 { 208 text = text.substring( metadataMatcher.end() ); 209 } 210 } 211 212 Node rootNode = parser.parse( text ); 213 String markdownHtml = renderer.render( rootNode ); 214 215 if ( !haveTitle && rootNode.hasChildren() ) 216 { 217 // use the first (non-comment) node only if it is a heading 218 Node firstNode = rootNode.getFirstChild(); 219 while ( firstNode != null && !( firstNode instanceof Heading ) ) 220 { 221 if ( !( firstNode instanceof HtmlCommentBlock ) ) 222 { 223 break; 224 } 225 firstNode = firstNode.getNext(); 226 } 227 228 if ( firstNode instanceof Heading ) 229 { 230 html.append( "<title>" ); 231 TextCollectingVisitor collectingVisitor = new TextCollectingVisitor(); 232 String headingText = collectingVisitor.collectAndGetText( firstNode ); 233 html.append( StringEscapeUtils.escapeXml( headingText ) ); 234 html.append( "</title>" ); 235 } 236 } 237 html.append( "</head>" ); 238 html.append( "<body>" ); 239 html.append( markdownHtml ); 240 html.append( "</body>" ); 241 html.append( "</html>" ); 242 243 return html.toString(); 244 } 245 246 /** 247 * Internal parser for HTML generated by the Markdown library. 248 */ 249 @Component( role = MarkdownHtmlParser.class ) 250 public static class MarkdownHtmlParser 251 extends XhtmlParser 252 { 253 public MarkdownHtmlParser() 254 { 255 super(); 256 } 257 258 @Override 259 protected boolean baseEndTag( XmlPullParser parser, Sink sink ) 260 { 261 boolean visited = super.baseEndTag( parser, sink ); 262 if ( !visited ) 263 { 264 if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) ) 265 { 266 handleUnknown( parser, sink, TAG_TYPE_END ); 267 visited = true; 268 } 269 } 270 return visited; 271 } 272 273 @Override 274 protected boolean baseStartTag( XmlPullParser parser, Sink sink ) 275 { 276 boolean visited = super.baseStartTag( parser, sink ); 277 if ( !visited ) 278 { 279 if ( parser.getName().equals( HtmlMarkup.DIV.toString() ) ) 280 { 281 handleUnknown( parser, sink, TAG_TYPE_START ); 282 visited = true; 283 } 284 } 285 return visited; 286 } 287 } 288}