1 package org.apache.maven.wagon.shared.http;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.UnsupportedEncodingException;
25 import java.net.URI;
26 import java.net.URISyntaxException;
27 import java.net.URLDecoder;
28 import java.util.ArrayList;
29 import java.util.HashSet;
30 import java.util.List;
31 import java.util.Set;
32 import java.util.regex.Pattern;
33
34 import org.apache.commons.io.IOUtils;
35 import org.apache.maven.wagon.TransferFailedException;
36 import org.codehaus.plexus.util.StringUtils;
37 import org.jsoup.Jsoup;
38 import org.jsoup.nodes.Document;
39 import org.jsoup.nodes.Element;
40 import org.jsoup.select.Elements;
41
42
43
44
45 public class HtmlFileListParser
46 {
47
48 private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
49
50
51 private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
52
53
54 private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
55
56
57 private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
58
59 private static final Pattern[] SKIPS = new Pattern[] { APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT,
60 MAILTO_URLS };
61
62
63
64
65
66
67
68
69 public static List
70 throws TransferFailedException
71 {
72 try
73 {
74 URI baseURI = new URI( baseurl );
75
76
77 String content = IOUtils.toString( stream, "utf-8" );
78 Document doc = Jsoup.parse( content, baseurl );
79 Elements links = doc.getElementsByTag( "a" );
80 Set results = new HashSet();
81 for ( int lx = 0; lx < links.size(); lx++ )
82 {
83 Element link = links.get( lx );
84
85
86
87 String target = link.attr( "href" );
88 if ( target != null)
89 {
90 String clean = cleanLink( baseURI, target );
91 if ( isAcceptableLink( clean ))
92 {
93 results.add( clean );
94 }
95 }
96
97 }
98
99 ArrayList resultsAsList = new ArrayList();
100 resultsAsList.addAll( results );
101 return resultsAsList;
102 }
103 catch ( URISyntaxException e )
104 {
105 throw new TransferFailedException( "Unable to parse as base URI: " + baseurl );
106 }
107 catch ( IOException e )
108 {
109 throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e );
110 }
111 }
112
113 private static String cleanLink( URI baseURI, String link )
114 {
115 if ( StringUtils.isEmpty( link ) )
116 {
117 return "";
118 }
119
120 String ret = link;
121
122 try
123 {
124 URI linkuri = new URI( ret );
125 if ( link.startsWith( "/" ))
126 {
127 linkuri = baseURI.resolve( linkuri );
128 }
129 URI relativeURI = baseURI.relativize( linkuri ).normalize();
130 ret = relativeURI.toASCIIString();
131 if ( ret.startsWith( baseURI.getPath() ) )
132 {
133 ret = ret.substring( baseURI.getPath().length() );
134 }
135
136 ret = URLDecoder.decode( ret, "UTF-8" );
137 }
138 catch ( URISyntaxException e )
139 {
140 }
141 catch ( UnsupportedEncodingException e )
142 {
143 }
144
145 return ret;
146 }
147
148 private static boolean isAcceptableLink( String link )
149 {
150 if ( StringUtils.isEmpty( link ) )
151 {
152 return false;
153 }
154
155 for ( int i = 0; i < SKIPS.length; i++ )
156 {
157 if ( SKIPS[i].matcher( link ).find() )
158 {
159 return false;
160 }
161 }
162
163 return true;
164 }
165
166 }