View Javadoc
1   package org.apache.maven.tools.plugin.generator;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import org.codehaus.plexus.util.StringUtils;
23  import org.jsoup.Jsoup;
24  import org.jsoup.internal.StringUtil;
25  import org.jsoup.nodes.Document;
26  import org.jsoup.nodes.Element;
27  import org.jsoup.nodes.Node;
28  import org.jsoup.nodes.TextNode;
29  import org.jsoup.select.NodeTraversor;
30  import org.jsoup.select.NodeVisitor;
31  
32  /**
33   * Replaces (X)HTML content by plain text equivalent.
34   * Based on work from 
35   * <a href="https://github.com/jhy/jsoup/blob/master/src/main/java/org/jsoup/examples/HtmlToPlainText.java">
36   * JSoup Example: HtmlToPlainText</a>.
37   */
38  public class HtmlToPlainTextConverter implements Converter
39  {
40      @Override
41      public String convert( String text )
42      {
43          if ( StringUtils.isBlank( text ) )
44          {
45              return text;
46          }
47          Document document = Jsoup.parse( text );
48          return getPlainText( document );
49      }
50  
51      /**
52       * Format an Element to plain-text
53       * 
54       * @param element the root element to format
55       * @return formatted text
56       */
57      private String getPlainText( Element element )
58      {
59          FormattingVisitor formatter = new FormattingVisitor();
60          NodeTraversor.traverse( formatter, element ); // walk the DOM, and call .head() and .tail() for each node
61  
62          return formatter.toString();
63      }
64  
65      // the formatting rules, implemented in a breadth-first DOM traverse
66      private static class FormattingVisitor
67          implements NodeVisitor
68      {
69          private StringBuilder accum = new StringBuilder(); // holds the accumulated text
70  
71          // hit when the node is first seen
72          public void head( Node node, int depth )
73          {
74              String name = node.nodeName();
75              if ( node instanceof TextNode )
76              {
77                  accum.append( ( (TextNode) node ).text() ); // TextNodes carry all user-readable text in the DOM.
78              }
79              else if ( name.equals( "li" ) )
80              {
81                  accum.append( "\n * " );
82              }
83              else if ( name.equals( "dt" ) )
84              {
85                  accum.append( "  " );
86              }
87              else if ( StringUtil.in( name, "p", "h1", "h2", "h3", "h4", "h5", "tr" ) )
88              {
89                  accum.append( "\n" );
90              }
91          }
92  
93          // hit when all of the node's children (if any) have been visited
94          public void tail( Node node, int depth )
95          {
96              String name = node.nodeName();
97              if ( StringUtil.in( name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5" ) )
98              {
99                  accum.append( "\n" );
100             }
101             else if ( name.equals( "a" ) )
102             {
103                 // link is empty if it cannot be made absolute
104                 String link = node.absUrl( "href" );
105                 if ( !link.isEmpty() )
106                 {
107                     accum.append( String.format( " <%s>", link ) );
108                 }
109             }
110         }
111 
112         @Override
113         public String toString()
114         {
115             // collate multiple consecutive spaces
116             return accum.toString().replaceAll( " +", " " ).replace( "\n ", "\n" );
117         }
118     }
119 }