Coverage Report - org.apache.maven.wagon.shared.http.HtmlFileListParser
 
Classes in this File Line Coverage Branch Coverage Complexity
HtmlFileListParser
60%
9/15
N/A
4
HtmlFileListParser$Parser
89%
32/36
81%
13/16
4
 
 1  
 package org.apache.maven.wagon.shared.http;
 2  
 
 3  
 /*
 4  
  * Licensed to the Apache Software Foundation (ASF) under one
 5  
  * or more contributor license agreements.  See the NOTICE file
 6  
  * distributed with this work for additional information
 7  
  * regarding copyright ownership.  The ASF licenses this file
 8  
  * to you under the Apache License, Version 2.0 (the
 9  
  * "License"); you may not use this file except in compliance
 10  
  * with the License.  You may obtain a copy of the License at
 11  
  *
 12  
  *   http://www.apache.org/licenses/LICENSE-2.0
 13  
  *
 14  
  * Unless required by applicable law or agreed to in writing,
 15  
  * software distributed under the License is distributed on an
 16  
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 17  
  * KIND, either express or implied.  See the License for the
 18  
  * specific language governing permissions and limitations
 19  
  * under the License.
 20  
  */
 21  
 
 22  
 import java.io.IOException;
 23  
 import java.io.InputStream;
 24  
 import java.io.UnsupportedEncodingException;
 25  
 import java.net.URI;
 26  
 import java.net.URISyntaxException;
 27  
 import java.net.URLDecoder;
 28  
 import java.util.ArrayList;
 29  
 import java.util.HashSet;
 30  
 import java.util.List;
 31  
 import java.util.Set;
 32  
 import java.util.regex.Pattern;
 33  
 
 34  
 import org.apache.maven.wagon.TransferFailedException;
 35  
 import org.apache.xerces.xni.Augmentations;
 36  
 import org.apache.xerces.xni.QName;
 37  
 import org.apache.xerces.xni.XMLAttributes;
 38  
 import org.apache.xerces.xni.parser.XMLInputSource;
 39  
 import org.apache.xerces.xni.parser.XMLParserConfiguration;
 40  
 import org.codehaus.plexus.util.StringUtils;
 41  
 import org.cyberneko.html.HTMLConfiguration;
 42  
 import org.cyberneko.html.filters.DefaultFilter;
 43  
 
 44  
 /**
 45  
  * Html File List Parser.
 46  
  */
 47  0
 public class HtmlFileListParser
 48  
 {
 49  
     /**
 50  
      * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list.
 51  
      * 
 52  
      * @param is the input stream.
 53  
      * @return the file list.
 54  
      * @throws TransferFailedException if there was a problem fetching the raw html.
 55  
      */
 56  
     public static List/* <String> */parseFileList( String baseurl, InputStream stream )
 57  
         throws TransferFailedException
 58  
     {
 59  
         try
 60  
         {
 61  
             // Use URI object to get benefits of proper absolute and relative path resolution for free
 62  10
             URI baseURI = new URI( baseurl );
 63  
 
 64  10
             Parser handler = new Parser( baseURI );
 65  
 
 66  10
             XMLParserConfiguration parser = new HTMLConfiguration();
 67  10
             parser.setDocumentHandler( handler );
 68  10
             parser.setFeature( "http://cyberneko.org/html/features/augmentations", true );
 69  10
             parser.setProperty( "http://cyberneko.org/html/properties/names/elems", "upper" );
 70  10
             parser.setProperty( "http://cyberneko.org/html/properties/names/attrs", "upper" );
 71  10
             parser.parse( new XMLInputSource( null, baseurl, baseURI.toString(), stream, "UTF-8" ) );
 72  
 
 73  10
             return new ArrayList( handler.getLinks() );
 74  
 
 75  
         }
 76  0
         catch ( URISyntaxException e )
 77  
         {
 78  0
             throw new TransferFailedException( "Unable to parse as URI: " + baseurl );
 79  
         }
 80  0
         catch ( IOException e )
 81  
         {
 82  0
             throw new TransferFailedException( "I/O error: " + e.getMessage(), e );
 83  
         }
 84  
     }
 85  
 
 86  0
     private static class Parser
 87  
         extends DefaultFilter
 88  
     {
 89  
         // Apache Fancy Index Sort Headers
 90  1
         private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
 91  
 
 92  
         // URLs with excessive paths.
 93  1
         private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
 94  
 
 95  
         // URLs that to a parent directory.
 96  1
         private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
 97  
 
 98  
         // mailto urls
 99  1
         private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
 100  
 
 101  1
         private static final Pattern[] SKIPS =
 102  
             new Pattern[] { APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
 103  
         
 104  10
         private Set links = new HashSet();
 105  
 
 106  
         private URI baseURI;
 107  
 
 108  
         public Parser( URI baseURI )
 109  10
         {
 110  10
             this.baseURI = baseURI.normalize();
 111  10
         }
 112  
 
 113  
         public Set getLinks()
 114  
         {
 115  10
             return links;
 116  
         }
 117  
 
 118  
         public void startElement( QName element, XMLAttributes attrs, Augmentations augs )
 119  
         {
 120  1303
             if ( "A".equals( element.rawname ) )
 121  
             {
 122  478
                 String href = attrs.getValue( "HREF" );
 123  478
                 if ( href != null )
 124  
                 {
 125  478
                     String link = cleanLink( baseURI, href );
 126  478
                     if ( isAcceptableLink( link ) )
 127  
                     {
 128  443
                         links.add( link );
 129  
                     }
 130  
                 }
 131  
             }
 132  1303
         }
 133  
 
 134  
         private static String cleanLink( URI baseURI, String link )
 135  
         {
 136  478
             if ( StringUtils.isEmpty( link ) )
 137  
             {
 138  0
                 return "";
 139  
             }
 140  
 
 141  478
             String ret = link;
 142  
 
 143  
             try
 144  
             {
 145  478
                 URI linkuri = new URI( ret );
 146  478
                 URI relativeURI = baseURI.relativize( linkuri ).normalize();
 147  478
                 ret = relativeURI.toASCIIString();
 148  478
                 if ( ret.startsWith( baseURI.getPath() ) )
 149  
                 {
 150  110
                     ret = ret.substring( baseURI.getPath().length() );
 151  
                 }
 152  
 
 153  478
                 ret = URLDecoder.decode( ret, "UTF-8" );
 154  
             }
 155  0
             catch ( URISyntaxException e )
 156  
             {
 157  
             }
 158  0
             catch ( UnsupportedEncodingException e )
 159  
             {
 160  478
             }
 161  
 
 162  478
             return ret;
 163  
         }
 164  
 
 165  
         private static boolean isAcceptableLink( String link )
 166  
         {
 167  478
             if ( StringUtils.isEmpty( link ) )
 168  
             {
 169  0
                 return false;
 170  
             }
 171  
 
 172  2265
             for ( int i = 0; i < SKIPS.length; i++ )
 173  
             {
 174  1822
                 if ( SKIPS[i].matcher( link ).find() )
 175  
                 {
 176  35
                     return false;
 177  
                 }
 178  
             }
 179  
 
 180  443
             return true;
 181  
         }
 182  
     }
 183  
 }