1 | |
package org.apache.maven.wagon.shared.http; |
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
|
19 | |
|
20 | |
|
21 | |
|
22 | |
import java.io.IOException; |
23 | |
import java.io.InputStream; |
24 | |
import java.io.UnsupportedEncodingException; |
25 | |
import java.net.URI; |
26 | |
import java.net.URISyntaxException; |
27 | |
import java.net.URLDecoder; |
28 | |
import java.util.ArrayList; |
29 | |
import java.util.HashSet; |
30 | |
import java.util.List; |
31 | |
import java.util.Set; |
32 | |
import java.util.regex.Pattern; |
33 | |
|
34 | |
import org.apache.maven.wagon.TransferFailedException; |
35 | |
import org.apache.xerces.xni.Augmentations; |
36 | |
import org.apache.xerces.xni.QName; |
37 | |
import org.apache.xerces.xni.XMLAttributes; |
38 | |
import org.apache.xerces.xni.parser.XMLInputSource; |
39 | |
import org.apache.xerces.xni.parser.XMLParserConfiguration; |
40 | |
import org.codehaus.plexus.util.StringUtils; |
41 | |
import org.cyberneko.html.HTMLConfiguration; |
42 | |
import org.cyberneko.html.filters.DefaultFilter; |
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | 0 | public class HtmlFileListParser |
48 | |
{ |
49 | |
|
50 | |
|
51 | |
|
52 | |
|
53 | |
|
54 | |
|
55 | |
|
56 | |
public static ListparseFileList( String baseurl, InputStream stream ) |
57 | |
throws TransferFailedException |
58 | |
{ |
59 | |
try |
60 | |
{ |
61 | |
|
62 | 10 | URI baseURI = new URI( baseurl ); |
63 | |
|
64 | 10 | Parser handler = new Parser( baseURI ); |
65 | |
|
66 | 10 | XMLParserConfiguration parser = new HTMLConfiguration(); |
67 | 10 | parser.setDocumentHandler( handler ); |
68 | 10 | parser.setFeature( "http://cyberneko.org/html/features/augmentations", true ); |
69 | 10 | parser.setProperty( "http://cyberneko.org/html/properties/names/elems", "upper" ); |
70 | 10 | parser.setProperty( "http://cyberneko.org/html/properties/names/attrs", "upper" ); |
71 | 10 | parser.parse( new XMLInputSource( null, baseurl, baseURI.toString(), stream, "UTF-8" ) ); |
72 | |
|
73 | 10 | return new ArrayList( handler.getLinks() ); |
74 | |
|
75 | |
} |
76 | 0 | catch ( URISyntaxException e ) |
77 | |
{ |
78 | 0 | throw new TransferFailedException( "Unable to parse as URI: " + baseurl ); |
79 | |
} |
80 | 0 | catch ( IOException e ) |
81 | |
{ |
82 | 0 | throw new TransferFailedException( "I/O error: " + e.getMessage(), e ); |
83 | |
} |
84 | |
} |
85 | |
|
86 | 0 | private static class Parser |
87 | |
extends DefaultFilter |
88 | |
{ |
89 | |
|
90 | 1 | private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" ); |
91 | |
|
92 | |
|
93 | 1 | private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" ); |
94 | |
|
95 | |
|
96 | 1 | private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" ); |
97 | |
|
98 | |
|
99 | 1 | private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" ); |
100 | |
|
101 | 1 | private static final Pattern[] SKIPS = |
102 | |
new Pattern[] { APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS }; |
103 | |
|
104 | 10 | private Set links = new HashSet(); |
105 | |
|
106 | |
private URI baseURI; |
107 | |
|
108 | |
public Parser( URI baseURI ) |
109 | 10 | { |
110 | 10 | this.baseURI = baseURI.normalize(); |
111 | 10 | } |
112 | |
|
113 | |
public Set getLinks() |
114 | |
{ |
115 | 10 | return links; |
116 | |
} |
117 | |
|
118 | |
public void startElement( QName element, XMLAttributes attrs, Augmentations augs ) |
119 | |
{ |
120 | 1303 | if ( "A".equals( element.rawname ) ) |
121 | |
{ |
122 | 478 | String href = attrs.getValue( "HREF" ); |
123 | 478 | if ( href != null ) |
124 | |
{ |
125 | 478 | String link = cleanLink( baseURI, href ); |
126 | 478 | if ( isAcceptableLink( link ) ) |
127 | |
{ |
128 | 443 | links.add( link ); |
129 | |
} |
130 | |
} |
131 | |
} |
132 | 1303 | } |
133 | |
|
134 | |
private static String cleanLink( URI baseURI, String link ) |
135 | |
{ |
136 | 478 | if ( StringUtils.isEmpty( link ) ) |
137 | |
{ |
138 | 0 | return ""; |
139 | |
} |
140 | |
|
141 | 478 | String ret = link; |
142 | |
|
143 | |
try |
144 | |
{ |
145 | 478 | URI linkuri = new URI( ret ); |
146 | 478 | URI relativeURI = baseURI.relativize( linkuri ).normalize(); |
147 | 478 | ret = relativeURI.toASCIIString(); |
148 | 478 | if ( ret.startsWith( baseURI.getPath() ) ) |
149 | |
{ |
150 | 110 | ret = ret.substring( baseURI.getPath().length() ); |
151 | |
} |
152 | |
|
153 | 478 | ret = URLDecoder.decode( ret, "UTF-8" ); |
154 | |
} |
155 | 0 | catch ( URISyntaxException e ) |
156 | |
{ |
157 | |
} |
158 | 0 | catch ( UnsupportedEncodingException e ) |
159 | |
{ |
160 | 478 | } |
161 | |
|
162 | 478 | return ret; |
163 | |
} |
164 | |
|
165 | |
private static boolean isAcceptableLink( String link ) |
166 | |
{ |
167 | 478 | if ( StringUtils.isEmpty( link ) ) |
168 | |
{ |
169 | 0 | return false; |
170 | |
} |
171 | |
|
172 | 2265 | for ( int i = 0; i < SKIPS.length; i++ ) |
173 | |
{ |
174 | 1822 | if ( SKIPS[i].matcher( link ).find() ) |
175 | |
{ |
176 | 35 | return false; |
177 | |
} |
178 | |
} |
179 | |
|
180 | 443 | return true; |
181 | |
} |
182 | |
} |
183 | |
} |