001package org.apache.maven.tools.plugin.generator; 002 003/* 004 * Licensed to the Apache Software Foundation (ASF) under one 005 * or more contributor license agreements. See the NOTICE file 006 * distributed with this work for additional information 007 * regarding copyright ownership. The ASF licenses this file 008 * to you under the Apache License, Version 2.0 (the 009 * "License"); you may not use this file except in compliance 010 * with the License. You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, 015 * software distributed under the License is distributed on an 016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 017 * KIND, either express or implied. See the License for the 018 * specific language governing permissions and limitations 019 * under the License. 020 */ 021 022import org.codehaus.plexus.util.StringUtils; 023import org.jsoup.Jsoup; 024import org.jsoup.internal.StringUtil; 025import org.jsoup.nodes.Document; 026import org.jsoup.nodes.Element; 027import org.jsoup.nodes.Node; 028import org.jsoup.nodes.TextNode; 029import org.jsoup.select.NodeTraversor; 030import org.jsoup.select.NodeVisitor; 031 032/** 033 * Replaces (X)HTML content by plain text equivalent. 034 * Based on work from 035 * <a href="https://github.com/jhy/jsoup/blob/master/src/main/java/org/jsoup/examples/HtmlToPlainText.java"> 036 * JSoup Example: HtmlToPlainText</a>. 037 */ 038public class HtmlToPlainTextConverter implements Converter 039{ 040 @Override 041 public String convert( String text ) 042 { 043 if ( StringUtils.isBlank( text ) ) 044 { 045 return text; 046 } 047 Document document = Jsoup.parse( text ); 048 return getPlainText( document ); 049 } 050 051 /** 052 * Format an Element to plain-text 053 * 054 * @param element the root element to format 055 * @return formatted text 056 */ 057 private String getPlainText( Element element ) 058 { 059 FormattingVisitor formatter = new FormattingVisitor(); 060 NodeTraversor.traverse( formatter, element ); // walk the DOM, and call .head() and .tail() for each node 061 062 return formatter.toString(); 063 } 064 065 // the formatting rules, implemented in a breadth-first DOM traverse 066 private static class FormattingVisitor 067 implements NodeVisitor 068 { 069 private StringBuilder accum = new StringBuilder(); // holds the accumulated text 070 071 // hit when the node is first seen 072 public void head( Node node, int depth ) 073 { 074 String name = node.nodeName(); 075 if ( node instanceof TextNode ) 076 { 077 accum.append( ( (TextNode) node ).text() ); // TextNodes carry all user-readable text in the DOM. 078 } 079 else if ( name.equals( "li" ) ) 080 { 081 accum.append( "\n * " ); 082 } 083 else if ( name.equals( "dt" ) ) 084 { 085 accum.append( " " ); 086 } 087 else if ( StringUtil.in( name, "p", "h1", "h2", "h3", "h4", "h5", "tr" ) ) 088 { 089 accum.append( "\n" ); 090 } 091 } 092 093 // hit when all of the node's children (if any) have been visited 094 public void tail( Node node, int depth ) 095 { 096 String name = node.nodeName(); 097 if ( StringUtil.in( name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5" ) ) 098 { 099 accum.append( "\n" ); 100 } 101 else if ( name.equals( "a" ) ) 102 { 103 // link is empty if it cannot be made absolute 104 String link = node.absUrl( "href" ); 105 if ( !link.isEmpty() ) 106 { 107 accum.append( String.format( " <%s>", link ) ); 108 } 109 } 110 } 111 112 @Override 113 public String toString() 114 { 115 // collate multiple consecutive spaces 116 return accum.toString().replaceAll( " +", " " ).replace( "\n ", "\n" ); 117 } 118 } 119}