1 | |
package org.apache.maven.wagon.shared.http; |
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
|
19 | |
|
20 | |
|
21 | |
|
22 | |
import org.apache.maven.wagon.TransferFailedException; |
23 | |
import org.apache.xerces.xni.Augmentations; |
24 | |
import org.apache.xerces.xni.QName; |
25 | |
import org.apache.xerces.xni.XMLAttributes; |
26 | |
import org.apache.xerces.xni.parser.XMLInputSource; |
27 | |
import org.apache.xerces.xni.parser.XMLParserConfiguration; |
28 | |
import org.codehaus.plexus.util.StringUtils; |
29 | |
import org.cyberneko.html.HTMLConfiguration; |
30 | |
import org.cyberneko.html.filters.DefaultFilter; |
31 | |
|
32 | |
import java.io.IOException; |
33 | |
import java.io.InputStream; |
34 | |
import java.io.UnsupportedEncodingException; |
35 | |
import java.net.URI; |
36 | |
import java.net.URISyntaxException; |
37 | |
import java.net.URLDecoder; |
38 | |
import java.util.ArrayList; |
39 | |
import java.util.HashSet; |
40 | |
import java.util.List; |
41 | |
import java.util.Set; |
42 | |
import java.util.regex.Pattern; |
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | 0 | public class HtmlFileListParser |
48 | |
{ |
49 | |
|
50 | |
|
51 | |
|
52 | |
|
53 | |
|
54 | |
|
55 | |
public static List<String> parseFileList( String baseurl, InputStream stream ) |
56 | |
throws TransferFailedException |
57 | |
{ |
58 | |
try |
59 | |
{ |
60 | |
|
61 | 10 | URI baseURI = new URI( baseurl ); |
62 | |
|
63 | 10 | Parser handler = new Parser( baseURI ); |
64 | |
|
65 | 10 | XMLParserConfiguration parser = new HTMLConfiguration(); |
66 | 10 | parser.setDocumentHandler( handler ); |
67 | 10 | parser.setFeature( "http://cyberneko.org/html/features/augmentations", true ); |
68 | 10 | parser.setProperty( "http://cyberneko.org/html/properties/names/elems", "upper" ); |
69 | 10 | parser.setProperty( "http://cyberneko.org/html/properties/names/attrs", "upper" ); |
70 | 10 | parser.parse( new XMLInputSource( null, baseurl, baseURI.toString(), stream, "UTF-8" ) ); |
71 | |
|
72 | 10 | return new ArrayList<String>( handler.getLinks() ); |
73 | |
|
74 | |
} |
75 | 0 | catch ( URISyntaxException e ) |
76 | |
{ |
77 | 0 | throw new TransferFailedException( "Unable to parse as URI: " + baseurl, e ); |
78 | |
} |
79 | 0 | catch ( IOException e ) |
80 | |
{ |
81 | 0 | throw new TransferFailedException( "I/O error: " + e.getMessage(), e ); |
82 | |
} |
83 | |
} |
84 | |
|
85 | 0 | private static class Parser |
86 | |
extends DefaultFilter |
87 | |
{ |
88 | |
|
89 | 1 | private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" ); |
90 | |
|
91 | |
|
92 | 1 | private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" ); |
93 | |
|
94 | |
|
95 | 1 | private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" ); |
96 | |
|
97 | |
|
98 | 1 | private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" ); |
99 | |
|
100 | 1 | private static final Pattern[] SKIPS = |
101 | |
new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS }; |
102 | |
|
103 | 10 | private Set<String> links = new HashSet<String>(); |
104 | |
|
105 | |
private URI baseURI; |
106 | |
|
107 | |
public Parser( URI baseURI ) |
108 | 10 | { |
109 | 10 | this.baseURI = baseURI.normalize(); |
110 | 10 | } |
111 | |
|
112 | |
public Set<String> getLinks() |
113 | |
{ |
114 | 10 | return links; |
115 | |
} |
116 | |
|
117 | |
public void startElement( QName element, XMLAttributes attrs, Augmentations augs ) |
118 | |
{ |
119 | 1303 | if ( "A".equals( element.rawname ) ) |
120 | |
{ |
121 | 478 | String href = attrs.getValue( "HREF" ); |
122 | 478 | if ( href != null ) |
123 | |
{ |
124 | 478 | String link = cleanLink( baseURI, href ); |
125 | 478 | if ( isAcceptableLink( link ) ) |
126 | |
{ |
127 | 443 | links.add( link ); |
128 | |
} |
129 | |
} |
130 | |
} |
131 | 1303 | } |
132 | |
|
133 | |
private static String cleanLink( URI baseURI, String link ) |
134 | |
{ |
135 | 478 | if ( StringUtils.isEmpty( link ) ) |
136 | |
{ |
137 | 0 | return ""; |
138 | |
} |
139 | |
|
140 | 478 | String ret = link; |
141 | |
|
142 | |
try |
143 | |
{ |
144 | 478 | URI linkuri = new URI( ret ); |
145 | 478 | URI relativeURI = baseURI.relativize( linkuri ).normalize(); |
146 | 478 | ret = relativeURI.toASCIIString(); |
147 | 478 | if ( ret.startsWith( baseURI.getPath() ) ) |
148 | |
{ |
149 | 110 | ret = ret.substring( baseURI.getPath().length() ); |
150 | |
} |
151 | |
|
152 | 478 | ret = URLDecoder.decode( ret, "UTF-8" ); |
153 | |
} |
154 | 0 | catch ( URISyntaxException e ) |
155 | |
{ |
156 | |
} |
157 | 0 | catch ( UnsupportedEncodingException e ) |
158 | |
{ |
159 | 478 | } |
160 | |
|
161 | 478 | return ret; |
162 | |
} |
163 | |
|
164 | |
private static boolean isAcceptableLink( String link ) |
165 | |
{ |
166 | 478 | if ( StringUtils.isEmpty( link ) ) |
167 | |
{ |
168 | 0 | return false; |
169 | |
} |
170 | |
|
171 | 2265 | for ( int i = 0; i < SKIPS.length; i++ ) |
172 | |
{ |
173 | 1822 | if ( SKIPS[i].matcher( link ).find() ) |
174 | |
{ |
175 | 35 | return false; |
176 | |
} |
177 | |
} |
178 | |
|
179 | 443 | return true; |
180 | |
} |
181 | |
} |
182 | |
} |