View Javadoc

1   package org.apache.maven.wagon.shared.http;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.io.UnsupportedEncodingException;
25  import java.net.URI;
26  import java.net.URISyntaxException;
27  import java.net.URLDecoder;
28  import java.util.ArrayList;
29  import java.util.HashSet;
30  import java.util.List;
31  import java.util.Set;
32  import java.util.regex.Pattern;
33  
34  import org.apache.commons.io.IOUtils;
35  import org.apache.maven.wagon.TransferFailedException;
36  import org.codehaus.plexus.util.StringUtils;
37  import org.jsoup.Jsoup;
38  import org.jsoup.nodes.Document;
39  import org.jsoup.nodes.Element;
40  import org.jsoup.select.Elements;
41  
42  /**
43   * Html File List Parser.
44   */
45  public class HtmlFileListParser
46  {
47      // Apache Fancy Index Sort Headers
48      private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
49  
50      // URLs with excessive paths.
51      private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
52  
53      // URLs that to a parent directory.
54      private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
55  
56      // mailto urls
57      private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
58  
59      private static final Pattern[] SKIPS = new Pattern[] { APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT,
60          MAILTO_URLS };
61  
62      /**
63       * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list.
64       * 
65       * @param is the input stream.
66       * @return the file list.
67       * @throws TransferFailedException if there was a problem fetching the raw html.
68       */
69      public static List/* <String> */parseFileList( String baseurl, InputStream stream )
70          throws TransferFailedException
71      {
72          try
73          {
74              URI baseURI = new URI( baseurl );
75              // to make debugging easier, start with a string. This is assuming UTF-8, which might not be a safe
76              // assumption.
77              String content = IOUtils.toString( stream, "utf-8" );
78              Document doc = Jsoup.parse( content, baseurl );
79              Elements links = doc.getElementsByTag( "a" );
80              Set results = new HashSet();
81              for ( int lx = 0; lx < links.size(); lx++ )
82              {
83                  Element link = links.get( lx );
84                  /*
85                   * The abs:href loses directories, so we deal with absolute paths ourselves below in cleanLink
86                   */
87                  String target = link.attr( "href" );
88                  if ( target != null)
89                  {
90                      String clean = cleanLink( baseURI, target );
91                      if ( isAcceptableLink( clean )) 
92                      {
93                          results.add( clean );
94                      }
95                  }
96  
97              }
98  
99              ArrayList resultsAsList = new ArrayList();
100             resultsAsList.addAll( results );
101             return resultsAsList;
102         }
103         catch ( URISyntaxException e )
104         {
105             throw new TransferFailedException( "Unable to parse as base URI: " + baseurl );
106         }
107         catch ( IOException e )
108         {
109             throw new TransferFailedException( "I/O error reading HTML listing of artifacts: " + e.getMessage(), e );
110         }
111     }
112 
113     private static String cleanLink( URI baseURI, String link )
114     {
115         if ( StringUtils.isEmpty( link ) )
116         {
117             return "";
118         }
119 
120         String ret = link;
121 
122         try
123         {
124             URI linkuri = new URI( ret );
125             if ( link.startsWith( "/" )) 
126             {
127                 linkuri =  baseURI.resolve( linkuri );
128             }
129             URI relativeURI = baseURI.relativize( linkuri ).normalize();
130             ret = relativeURI.toASCIIString();
131             if ( ret.startsWith( baseURI.getPath() ) )
132             {
133                 ret = ret.substring( baseURI.getPath().length() );
134             }
135 
136             ret = URLDecoder.decode( ret, "UTF-8" );
137         }
138         catch ( URISyntaxException e )
139         {
140         }
141         catch ( UnsupportedEncodingException e )
142         {
143         }
144 
145         return ret;
146     }
147 
148     private static boolean isAcceptableLink( String link )
149     {
150         if ( StringUtils.isEmpty( link ) )
151         {
152             return false;
153         }
154 
155         for ( int i = 0; i < SKIPS.length; i++ )
156         {
157             if ( SKIPS[i].matcher( link ).find() )
158             {
159                 return false;
160             }
161         }
162 
163         return true;
164     }
165 
166 }