1 | |
package org.apache.maven.wagon.shared.http4; |
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
|
19 | |
|
20 | |
|
21 | |
|
22 | |
import org.apache.commons.io.IOUtils; |
23 | |
import org.apache.maven.wagon.TransferFailedException; |
24 | |
import org.codehaus.plexus.util.StringUtils; |
25 | |
import org.jsoup.Jsoup; |
26 | |
import org.jsoup.nodes.Document; |
27 | |
import org.jsoup.nodes.Element; |
28 | |
import org.jsoup.select.Elements; |
29 | |
|
30 | |
import java.io.IOException; |
31 | |
import java.io.InputStream; |
32 | |
import java.io.UnsupportedEncodingException; |
33 | |
import java.net.URI; |
34 | |
import java.net.URISyntaxException; |
35 | |
import java.net.URLDecoder; |
36 | |
import java.util.ArrayList; |
37 | |
import java.util.HashSet; |
38 | |
import java.util.List; |
39 | |
import java.util.Set; |
40 | |
import java.util.regex.Pattern; |
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | 0 | public class HtmlFileListParser |
46 | |
{ |
47 | |
|
48 | 1 | private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" ); |
49 | |
|
50 | |
|
51 | 1 | private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" ); |
52 | |
|
53 | |
|
54 | 1 | private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" ); |
55 | |
|
56 | |
|
57 | 1 | private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" ); |
58 | |
|
59 | 1 | private static final Pattern[] SKIPS = |
60 | |
new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS }; |
61 | |
|
62 | |
|
63 | |
|
64 | |
|
65 | |
|
66 | |
|
67 | |
|
68 | |
|
69 | |
public static List<String> parseFileList( String baseurl, InputStream stream ) |
70 | |
throws TransferFailedException |
71 | |
{ |
72 | |
try |
73 | |
{ |
74 | 10 | URI baseURI = new URI( baseurl ); |
75 | |
|
76 | |
|
77 | 10 | String content = IOUtils.toString( stream, "utf-8" ); |
78 | 10 | Document doc = Jsoup.parse( content, baseurl ); |
79 | 10 | Elements links = doc.getElementsByTag( "a" ); |
80 | 10 | Set<String> results = new HashSet<String>(); |
81 | 524 | for ( int lx = 0; lx < links.size(); lx++ ) |
82 | |
{ |
83 | 514 | Element link = links.get( lx ); |
84 | |
|
85 | |
|
86 | |
|
87 | 514 | String target = link.attr( "href" ); |
88 | 514 | if ( target != null ) |
89 | |
{ |
90 | 514 | String clean = cleanLink( baseURI, target ); |
91 | 514 | if ( isAcceptableLink( clean ) ) |
92 | |
{ |
93 | 462 | results.add( clean ); |
94 | |
} |
95 | |
} |
96 | |
|
97 | |
} |
98 | |
|
99 | 10 | return new ArrayList<String>( results ); |
100 | |
} |
101 | 0 | catch ( URISyntaxException e ) |
102 | |
{ |
103 | 0 | throw new TransferFailedException( "Unable to parse as base URI: " + baseurl, e ); |
104 | |
} |
105 | 0 | catch ( IOException e ) |
106 | |
{ |
107 | 0 | throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e ); |
108 | |
} |
109 | |
} |
110 | |
|
111 | |
private static String cleanLink( URI baseURI, String link ) |
112 | |
{ |
113 | 514 | if ( StringUtils.isEmpty( link ) ) |
114 | |
{ |
115 | 17 | return ""; |
116 | |
} |
117 | |
|
118 | 497 | String ret = link; |
119 | |
|
120 | |
try |
121 | |
{ |
122 | 497 | URI linkuri = new URI( ret ); |
123 | 497 | if ( link.startsWith( "/" ) ) |
124 | |
{ |
125 | 118 | linkuri = baseURI.resolve( linkuri ); |
126 | |
} |
127 | 497 | URI relativeURI = baseURI.relativize( linkuri ).normalize(); |
128 | 497 | ret = relativeURI.toASCIIString(); |
129 | 497 | if ( ret.startsWith( baseURI.getPath() ) ) |
130 | |
{ |
131 | 0 | ret = ret.substring( baseURI.getPath().length() ); |
132 | |
} |
133 | |
|
134 | 497 | ret = URLDecoder.decode( ret, "UTF-8" ); |
135 | |
} |
136 | 0 | catch ( URISyntaxException e ) |
137 | |
{ |
138 | |
} |
139 | 0 | catch ( UnsupportedEncodingException e ) |
140 | |
{ |
141 | 497 | } |
142 | |
|
143 | 497 | return ret; |
144 | |
} |
145 | |
|
146 | |
private static boolean isAcceptableLink( String link ) |
147 | |
{ |
148 | 514 | if ( StringUtils.isEmpty( link ) ) |
149 | |
{ |
150 | 17 | return false; |
151 | |
} |
152 | |
|
153 | 2360 | for ( int i = 0; i < SKIPS.length; i++ ) |
154 | |
{ |
155 | 1898 | if ( SKIPS[i].matcher( link ).find() ) |
156 | |
{ |
157 | 35 | return false; |
158 | |
} |
159 | |
} |
160 | |
|
161 | 462 | return true; |
162 | |
} |
163 | |
|
164 | |
} |