View Javadoc

1   package org.apache.maven.wagon.shared.http;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.io.UnsupportedEncodingException;
25  import java.net.URI;
26  import java.net.URISyntaxException;
27  import java.net.URLDecoder;
28  import java.util.ArrayList;
29  import java.util.HashSet;
30  import java.util.List;
31  import java.util.Set;
32  import java.util.regex.Pattern;
33  
34  import org.apache.maven.wagon.TransferFailedException;
35  import org.apache.xerces.xni.Augmentations;
36  import org.apache.xerces.xni.QName;
37  import org.apache.xerces.xni.XMLAttributes;
38  import org.apache.xerces.xni.parser.XMLInputSource;
39  import org.apache.xerces.xni.parser.XMLParserConfiguration;
40  import org.codehaus.plexus.util.StringUtils;
41  import org.cyberneko.html.HTMLConfiguration;
42  import org.cyberneko.html.filters.DefaultFilter;
43  
44  /**
45   * Html File List Parser.
46   */
47  public class HtmlFileListParser
48  {
49      /**
50       * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list.
51       * 
52       * @param is the input stream.
53       * @return the file list.
54       * @throws TransferFailedException if there was a problem fetching the raw html.
55       */
56      public static List/* <String> */parseFileList( String baseurl, InputStream stream )
57          throws TransferFailedException
58      {
59          try
60          {
61              // Use URI object to get benefits of proper absolute and relative path resolution for free
62              URI baseURI = new URI( baseurl );
63  
64              Parser handler = new Parser( baseURI );
65  
66              XMLParserConfiguration parser = new HTMLConfiguration();
67              parser.setDocumentHandler( handler );
68              parser.setFeature( "http://cyberneko.org/html/features/augmentations", true );
69              parser.setProperty( "http://cyberneko.org/html/properties/names/elems", "upper" );
70              parser.setProperty( "http://cyberneko.org/html/properties/names/attrs", "upper" );
71              parser.parse( new XMLInputSource( null, baseurl, baseURI.toString(), stream, "UTF-8" ) );
72  
73              return new ArrayList( handler.getLinks() );
74  
75          }
76          catch ( URISyntaxException e )
77          {
78              throw new TransferFailedException( "Unable to parse as URI: " + baseurl );
79          }
80          catch ( IOException e )
81          {
82              throw new TransferFailedException( "I/O error: " + e.getMessage(), e );
83          }
84      }
85  
86      private static class Parser
87          extends DefaultFilter
88      {
89          // Apache Fancy Index Sort Headers
90          private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
91  
92          // URLs with excessive paths.
93          private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
94  
95          // URLs that to a parent directory.
96          private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
97  
98          // mailto urls
99          private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
100 
101         private static final Pattern[] SKIPS =
102             new Pattern[] { APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
103         
104         private Set links = new HashSet();
105 
106         private URI baseURI;
107 
108         public Parser( URI baseURI )
109         {
110             this.baseURI = baseURI.normalize();
111         }
112 
113         public Set getLinks()
114         {
115             return links;
116         }
117 
118         public void startElement( QName element, XMLAttributes attrs, Augmentations augs )
119         {
120             if ( "A".equals( element.rawname ) )
121             {
122                 String href = attrs.getValue( "HREF" );
123                 if ( href != null )
124                 {
125                     String link = cleanLink( baseURI, href );
126                     if ( isAcceptableLink( link ) )
127                     {
128                         links.add( link );
129                     }
130                 }
131             }
132         }
133 
134         private static String cleanLink( URI baseURI, String link )
135         {
136             if ( StringUtils.isEmpty( link ) )
137             {
138                 return "";
139             }
140 
141             String ret = link;
142 
143             try
144             {
145                 URI linkuri = new URI( ret );
146                 URI relativeURI = baseURI.relativize( linkuri ).normalize();
147                 ret = relativeURI.toASCIIString();
148                 if ( ret.startsWith( baseURI.getPath() ) )
149                 {
150                     ret = ret.substring( baseURI.getPath().length() );
151                 }
152 
153                 ret = URLDecoder.decode( ret, "UTF-8" );
154             }
155             catch ( URISyntaxException e )
156             {
157             }
158             catch ( UnsupportedEncodingException e )
159             {
160             }
161 
162             return ret;
163         }
164 
165         private static boolean isAcceptableLink( String link )
166         {
167             if ( StringUtils.isEmpty( link ) )
168             {
169                 return false;
170             }
171 
172             for ( int i = 0; i < SKIPS.length; i++ )
173             {
174                 if ( SKIPS[i].matcher( link ).find() )
175                 {
176                     return false;
177                 }
178             }
179 
180             return true;
181         }
182     }
183 }