1 package org.apache.maven.tools.plugin.generator;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 import org.codehaus.plexus.util.StringUtils;
23 import org.jsoup.Jsoup;
24 import org.jsoup.internal.StringUtil;
25 import org.jsoup.nodes.Document;
26 import org.jsoup.nodes.Element;
27 import org.jsoup.nodes.Node;
28 import org.jsoup.nodes.TextNode;
29 import org.jsoup.select.NodeTraversor;
30 import org.jsoup.select.NodeVisitor;
31
32
33
34
35
36
37
38 public class HtmlToPlainTextConverter implements Converter
39 {
40 @Override
41 public String convert( String text )
42 {
43 if ( StringUtils.isBlank( text ) )
44 {
45 return text;
46 }
47 Document document = Jsoup.parse( text );
48 return getPlainText( document );
49 }
50
51
52
53
54
55
56
57 private String getPlainText( Element element )
58 {
59 FormattingVisitor formatter = new FormattingVisitor();
60 NodeTraversor.traverse( formatter, element );
61
62 return formatter.toString();
63 }
64
65
66 private static class FormattingVisitor
67 implements NodeVisitor
68 {
69 private StringBuilder accum = new StringBuilder();
70
71
72 public void head( Node node, int depth )
73 {
74 String name = node.nodeName();
75 if ( node instanceof TextNode )
76 {
77 accum.append( ( (TextNode) node ).text() );
78 }
79 else if ( name.equals( "li" ) )
80 {
81 accum.append( "\n * " );
82 }
83 else if ( name.equals( "dt" ) )
84 {
85 accum.append( " " );
86 }
87 else if ( StringUtil.in( name, "p", "h1", "h2", "h3", "h4", "h5", "tr" ) )
88 {
89 accum.append( "\n" );
90 }
91 }
92
93
94 public void tail( Node node, int depth )
95 {
96 String name = node.nodeName();
97 if ( StringUtil.in( name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5" ) )
98 {
99 accum.append( "\n" );
100 }
101 else if ( name.equals( "a" ) )
102 {
103
104 String link = node.absUrl( "href" );
105 if ( !link.isEmpty() )
106 {
107 accum.append( String.format( " <%s>", link ) );
108 }
109 }
110 }
111
112 @Override
113 public String toString()
114 {
115
116 return accum.toString().replaceAll( " +", " " ).replace( "\n ", "\n" );
117 }
118 }
119 }