1 package org.apache.maven.wagon.shared.http;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 import org.apache.maven.wagon.TransferFailedException;
23 import org.apache.xerces.xni.Augmentations;
24 import org.apache.xerces.xni.QName;
25 import org.apache.xerces.xni.XMLAttributes;
26 import org.apache.xerces.xni.parser.XMLInputSource;
27 import org.apache.xerces.xni.parser.XMLParserConfiguration;
28 import org.codehaus.plexus.util.StringUtils;
29 import org.cyberneko.html.HTMLConfiguration;
30 import org.cyberneko.html.filters.DefaultFilter;
31
32 import java.io.IOException;
33 import java.io.InputStream;
34 import java.io.UnsupportedEncodingException;
35 import java.net.URI;
36 import java.net.URISyntaxException;
37 import java.net.URLDecoder;
38 import java.util.ArrayList;
39 import java.util.HashSet;
40 import java.util.List;
41 import java.util.Set;
42 import java.util.regex.Pattern;
43
44
45
46
47 public class HtmlFileListParser
48 {
49
50
51
52
53
54
55 public static List<String> parseFileList( String baseurl, InputStream stream )
56 throws TransferFailedException
57 {
58 try
59 {
60
61 URI baseURI = new URI( baseurl );
62
63 Parser handler = new Parser( baseURI );
64
65 XMLParserConfiguration parser = new HTMLConfiguration();
66 parser.setDocumentHandler( handler );
67 parser.setFeature( "http://cyberneko.org/html/features/augmentations", true );
68 parser.setProperty( "http://cyberneko.org/html/properties/names/elems", "upper" );
69 parser.setProperty( "http://cyberneko.org/html/properties/names/attrs", "upper" );
70 parser.parse( new XMLInputSource( null, baseurl, baseURI.toString(), stream, "UTF-8" ) );
71
72 return new ArrayList<String>( handler.getLinks() );
73
74 }
75 catch ( URISyntaxException e )
76 {
77 throw new TransferFailedException( "Unable to parse as URI: " + baseurl, e );
78 }
79 catch ( IOException e )
80 {
81 throw new TransferFailedException( "I/O error: " + e.getMessage(), e );
82 }
83 }
84
85 private static class Parser
86 extends DefaultFilter
87 {
88
89 private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
90
91
92 private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
93
94
95 private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
96
97
98 private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
99
100 private static final Pattern[] SKIPS =
101 new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
102
103 private Set<String> links = new HashSet<String>();
104
105 private URI baseURI;
106
107 public Parser( URI baseURI )
108 {
109 this.baseURI = baseURI.normalize();
110 }
111
112 public Set<String> getLinks()
113 {
114 return links;
115 }
116
117 public void startElement( QName element, XMLAttributes attrs, Augmentations augs )
118 {
119 if ( "A".equals( element.rawname ) )
120 {
121 String href = attrs.getValue( "HREF" );
122 if ( href != null )
123 {
124 String link = cleanLink( baseURI, href );
125 if ( isAcceptableLink( link ) )
126 {
127 links.add( link );
128 }
129 }
130 }
131 }
132
133 private static String cleanLink( URI baseURI, String link )
134 {
135 if ( StringUtils.isEmpty( link ) )
136 {
137 return "";
138 }
139
140 String ret = link;
141
142 try
143 {
144 URI linkuri = new URI( ret );
145 URI relativeURI = baseURI.relativize( linkuri ).normalize();
146 ret = relativeURI.toASCIIString();
147 if ( ret.startsWith( baseURI.getPath() ) )
148 {
149 ret = ret.substring( baseURI.getPath().length() );
150 }
151
152 ret = URLDecoder.decode( ret, "UTF-8" );
153 }
154 catch ( URISyntaxException e )
155 {
156 }
157 catch ( UnsupportedEncodingException e )
158 {
159 }
160
161 return ret;
162 }
163
164 private static boolean isAcceptableLink( String link )
165 {
166 if ( StringUtils.isEmpty( link ) )
167 {
168 return false;
169 }
170
171 for ( int i = 0; i < SKIPS.length; i++ )
172 {
173 if ( SKIPS[i].matcher( link ).find() )
174 {
175 return false;
176 }
177 }
178
179 return true;
180 }
181 }
182 }