1 package org.apache.maven.wagon.shared.http;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.UnsupportedEncodingException;
25 import java.net.URI;
26 import java.net.URISyntaxException;
27 import java.net.URLDecoder;
28 import java.util.ArrayList;
29 import java.util.HashSet;
30 import java.util.List;
31 import java.util.Set;
32 import java.util.regex.Pattern;
33
34 import org.apache.maven.wagon.TransferFailedException;
35 import org.apache.xerces.xni.Augmentations;
36 import org.apache.xerces.xni.QName;
37 import org.apache.xerces.xni.XMLAttributes;
38 import org.apache.xerces.xni.parser.XMLInputSource;
39 import org.apache.xerces.xni.parser.XMLParserConfiguration;
40 import org.codehaus.plexus.util.StringUtils;
41 import org.cyberneko.html.HTMLConfiguration;
42 import org.cyberneko.html.filters.DefaultFilter;
43
44
45
46
47 public class HtmlFileListParser
48 {
49
50
51
52
53
54
55
56 public static List
57 throws TransferFailedException
58 {
59 try
60 {
61
62 URI baseURI = new URI( baseurl );
63
64 Parser handler = new Parser( baseURI );
65
66 XMLParserConfiguration parser = new HTMLConfiguration();
67 parser.setDocumentHandler( handler );
68 parser.setFeature( "http://cyberneko.org/html/features/augmentations", true );
69 parser.setProperty( "http://cyberneko.org/html/properties/names/elems", "upper" );
70 parser.setProperty( "http://cyberneko.org/html/properties/names/attrs", "upper" );
71 parser.parse( new XMLInputSource( null, baseurl, baseURI.toString(), stream, "UTF-8" ) );
72
73 return new ArrayList( handler.getLinks() );
74
75 }
76 catch ( URISyntaxException e )
77 {
78 throw new TransferFailedException( "Unable to parse as URI: " + baseurl );
79 }
80 catch ( IOException e )
81 {
82 throw new TransferFailedException( "I/O error: " + e.getMessage(), e );
83 }
84 }
85
86 private static class Parser
87 extends DefaultFilter
88 {
89
90 private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
91
92
93 private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
94
95
96 private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
97
98
99 private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
100
101 private static final Pattern[] SKIPS =
102 new Pattern[] { APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
103
104 private Set links = new HashSet();
105
106 private URI baseURI;
107
108 public Parser( URI baseURI )
109 {
110 this.baseURI = baseURI.normalize();
111 }
112
113 public Set getLinks()
114 {
115 return links;
116 }
117
118 public void startElement( QName element, XMLAttributes attrs, Augmentations augs )
119 {
120 if ( "A".equals( element.rawname ) )
121 {
122 String href = attrs.getValue( "HREF" );
123 if ( href != null )
124 {
125 String link = cleanLink( baseURI, href );
126 if ( isAcceptableLink( link ) )
127 {
128 links.add( link );
129 }
130 }
131 }
132 }
133
134 private static String cleanLink( URI baseURI, String link )
135 {
136 if ( StringUtils.isEmpty( link ) )
137 {
138 return "";
139 }
140
141 String ret = link;
142
143 try
144 {
145 URI linkuri = new URI( ret );
146 URI relativeURI = baseURI.relativize( linkuri ).normalize();
147 ret = relativeURI.toASCIIString();
148 if ( ret.startsWith( baseURI.getPath() ) )
149 {
150 ret = ret.substring( baseURI.getPath().length() );
151 }
152
153 ret = URLDecoder.decode( ret, "UTF-8" );
154 }
155 catch ( URISyntaxException e )
156 {
157 }
158 catch ( UnsupportedEncodingException e )
159 {
160 }
161
162 return ret;
163 }
164
165 private static boolean isAcceptableLink( String link )
166 {
167 if ( StringUtils.isEmpty( link ) )
168 {
169 return false;
170 }
171
172 for ( int i = 0; i < SKIPS.length; i++ )
173 {
174 if ( SKIPS[i].matcher( link ).find() )
175 {
176 return false;
177 }
178 }
179
180 return true;
181 }
182 }
183 }