001package org.apache.maven.tools.plugin.generator;
002
003/*
004 * Licensed to the Apache Software Foundation (ASF) under one
005 * or more contributor license agreements.  See the NOTICE file
006 * distributed with this work for additional information
007 * regarding copyright ownership.  The ASF licenses this file
008 * to you under the Apache License, Version 2.0 (the
009 * "License"); you may not use this file except in compliance
010 * with the License.  You may obtain a copy of the License at
011 *
012 *   http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing,
015 * software distributed under the License is distributed on an
016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017 * KIND, either express or implied.  See the License for the
018 * specific language governing permissions and limitations
019 * under the License.
020 */
021
022import org.codehaus.plexus.util.StringUtils;
023import org.jsoup.Jsoup;
024import org.jsoup.internal.StringUtil;
025import org.jsoup.nodes.Document;
026import org.jsoup.nodes.Element;
027import org.jsoup.nodes.Node;
028import org.jsoup.nodes.TextNode;
029import org.jsoup.select.NodeTraversor;
030import org.jsoup.select.NodeVisitor;
031
032/**
033 * Replaces (X)HTML content by plain text equivalent.
034 * Based on work from 
035 * <a href="https://github.com/jhy/jsoup/blob/master/src/main/java/org/jsoup/examples/HtmlToPlainText.java">
036 * JSoup Example: HtmlToPlainText</a>.
037 */
038public class HtmlToPlainTextConverter implements Converter
039{
040    @Override
041    public String convert( String text )
042    {
043        if ( StringUtils.isBlank( text ) )
044        {
045            return text;
046        }
047        Document document = Jsoup.parse( text );
048        return getPlainText( document );
049    }
050
051    /**
052     * Format an Element to plain-text
053     * 
054     * @param element the root element to format
055     * @return formatted text
056     */
057    private String getPlainText( Element element )
058    {
059        FormattingVisitor formatter = new FormattingVisitor();
060        NodeTraversor.traverse( formatter, element ); // walk the DOM, and call .head() and .tail() for each node
061
062        return formatter.toString();
063    }
064
065    // the formatting rules, implemented in a breadth-first DOM traverse
066    private static class FormattingVisitor
067        implements NodeVisitor
068    {
069        private StringBuilder accum = new StringBuilder(); // holds the accumulated text
070
071        // hit when the node is first seen
072        public void head( Node node, int depth )
073        {
074            String name = node.nodeName();
075            if ( node instanceof TextNode )
076            {
077                accum.append( ( (TextNode) node ).text() ); // TextNodes carry all user-readable text in the DOM.
078            }
079            else if ( name.equals( "li" ) )
080            {
081                accum.append( "\n * " );
082            }
083            else if ( name.equals( "dt" ) )
084            {
085                accum.append( "  " );
086            }
087            else if ( StringUtil.in( name, "p", "h1", "h2", "h3", "h4", "h5", "tr" ) )
088            {
089                accum.append( "\n" );
090            }
091        }
092
093        // hit when all of the node's children (if any) have been visited
094        public void tail( Node node, int depth )
095        {
096            String name = node.nodeName();
097            if ( StringUtil.in( name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5" ) )
098            {
099                accum.append( "\n" );
100            }
101            else if ( name.equals( "a" ) )
102            {
103                // link is empty if it cannot be made absolute
104                String link = node.absUrl( "href" );
105                if ( !link.isEmpty() )
106                {
107                    accum.append( String.format( " <%s>", link ) );
108                }
109            }
110        }
111
112        @Override
113        public String toString()
114        {
115            // collate multiple consecutive spaces
116            return accum.toString().replaceAll( " +", " " ).replace( "\n ", "\n" );
117        }
118    }
119}