Coverage Report - org.apache.maven.wagon.shared.http.HtmlFileListParser
 
Classes in this File Line Coverage Branch Coverage Complexity
HtmlFileListParser
60 %
9/15
N/A
4
HtmlFileListParser$Parser
88 %
32/36
81 %
13/16
4
 
 1  
 package org.apache.maven.wagon.shared.http;
 2  
 
 3  
 /*
 4  
  * Licensed to the Apache Software Foundation (ASF) under one
 5  
  * or more contributor license agreements.  See the NOTICE file
 6  
  * distributed with this work for additional information
 7  
  * regarding copyright ownership.  The ASF licenses this file
 8  
  * to you under the Apache License, Version 2.0 (the
 9  
  * "License"); you may not use this file except in compliance
 10  
  * with the License.  You may obtain a copy of the License at
 11  
  *
 12  
  *   http://www.apache.org/licenses/LICENSE-2.0
 13  
  *
 14  
  * Unless required by applicable law or agreed to in writing,
 15  
  * software distributed under the License is distributed on an
 16  
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 17  
  * KIND, either express or implied.  See the License for the
 18  
  * specific language governing permissions and limitations
 19  
  * under the License.
 20  
  */
 21  
 
 22  
 import org.apache.maven.wagon.TransferFailedException;
 23  
 import org.apache.xerces.xni.Augmentations;
 24  
 import org.apache.xerces.xni.QName;
 25  
 import org.apache.xerces.xni.XMLAttributes;
 26  
 import org.apache.xerces.xni.parser.XMLInputSource;
 27  
 import org.apache.xerces.xni.parser.XMLParserConfiguration;
 28  
 import org.codehaus.plexus.util.StringUtils;
 29  
 import org.cyberneko.html.HTMLConfiguration;
 30  
 import org.cyberneko.html.filters.DefaultFilter;
 31  
 
 32  
 import java.io.IOException;
 33  
 import java.io.InputStream;
 34  
 import java.io.UnsupportedEncodingException;
 35  
 import java.net.URI;
 36  
 import java.net.URISyntaxException;
 37  
 import java.net.URLDecoder;
 38  
 import java.util.ArrayList;
 39  
 import java.util.HashSet;
 40  
 import java.util.List;
 41  
 import java.util.Set;
 42  
 import java.util.regex.Pattern;
 43  
 
 44  
 /**
 45  
  * Html File List Parser.
 46  
  */
 47  0
 public class HtmlFileListParser
 48  
 {
 49  
     /**
 50  
      * Fetches a raw HTML from a provided InputStream, parses it, and returns the file list.
 51  
      *
 52  
      * @return the file list.
 53  
      * @throws TransferFailedException if there was a problem fetching the raw html.
 54  
      */
 55  
     public static List<String> parseFileList( String baseurl, InputStream stream )
 56  
         throws TransferFailedException
 57  
     {
 58  
         try
 59  
         {
 60  
             // Use URI object to get benefits of proper absolute and relative path resolution for free
 61  10
             URI baseURI = new URI( baseurl );
 62  
 
 63  10
             Parser handler = new Parser( baseURI );
 64  
 
 65  10
             XMLParserConfiguration parser = new HTMLConfiguration();
 66  10
             parser.setDocumentHandler( handler );
 67  10
             parser.setFeature( "http://cyberneko.org/html/features/augmentations", true );
 68  10
             parser.setProperty( "http://cyberneko.org/html/properties/names/elems", "upper" );
 69  10
             parser.setProperty( "http://cyberneko.org/html/properties/names/attrs", "upper" );
 70  10
             parser.parse( new XMLInputSource( null, baseurl, baseURI.toString(), stream, "UTF-8" ) );
 71  
 
 72  10
             return new ArrayList<String>( handler.getLinks() );
 73  
 
 74  
         }
 75  0
         catch ( URISyntaxException e )
 76  
         {
 77  0
             throw new TransferFailedException( "Unable to parse as URI: " + baseurl, e );
 78  
         }
 79  0
         catch ( IOException e )
 80  
         {
 81  0
             throw new TransferFailedException( "I/O error: " + e.getMessage(), e );
 82  
         }
 83  
     }
 84  
 
 85  0
     private static class Parser
 86  
         extends DefaultFilter
 87  
     {
 88  
         // Apache Fancy Index Sort Headers
 89  1
         private static final Pattern APACHE_INDEX_SKIP = Pattern.compile( "\\?[CDMNS]=.*" );
 90  
 
 91  
         // URLs with excessive paths.
 92  1
         private static final Pattern URLS_WITH_PATHS = Pattern.compile( "/[^/]*/" );
 93  
 
 94  
         // URLs that to a parent directory.
 95  1
         private static final Pattern URLS_TO_PARENT = Pattern.compile( "\\.\\./" );
 96  
 
 97  
         // mailto urls
 98  1
         private static final Pattern MAILTO_URLS = Pattern.compile( "mailto:.*" );
 99  
 
 100  1
         private static final Pattern[] SKIPS =
 101  
             new Pattern[]{ APACHE_INDEX_SKIP, URLS_WITH_PATHS, URLS_TO_PARENT, MAILTO_URLS };
 102  
 
 103  10
         private Set<String> links = new HashSet<String>();
 104  
 
 105  
         private URI baseURI;
 106  
 
 107  
         public Parser( URI baseURI )
 108  10
         {
 109  10
             this.baseURI = baseURI.normalize();
 110  10
         }
 111  
 
 112  
         public Set<String> getLinks()
 113  
         {
 114  10
             return links;
 115  
         }
 116  
 
 117  
         public void startElement( QName element, XMLAttributes attrs, Augmentations augs )
 118  
         {
 119  1303
             if ( "A".equals( element.rawname ) )
 120  
             {
 121  478
                 String href = attrs.getValue( "HREF" );
 122  478
                 if ( href != null )
 123  
                 {
 124  478
                     String link = cleanLink( baseURI, href );
 125  478
                     if ( isAcceptableLink( link ) )
 126  
                     {
 127  443
                         links.add( link );
 128  
                     }
 129  
                 }
 130  
             }
 131  1303
         }
 132  
 
 133  
         private static String cleanLink( URI baseURI, String link )
 134  
         {
 135  478
             if ( StringUtils.isEmpty( link ) )
 136  
             {
 137  0
                 return "";
 138  
             }
 139  
 
 140  478
             String ret = link;
 141  
 
 142  
             try
 143  
             {
 144  478
                 URI linkuri = new URI( ret );
 145  478
                 URI relativeURI = baseURI.relativize( linkuri ).normalize();
 146  478
                 ret = relativeURI.toASCIIString();
 147  478
                 if ( ret.startsWith( baseURI.getPath() ) )
 148  
                 {
 149  110
                     ret = ret.substring( baseURI.getPath().length() );
 150  
                 }
 151  
 
 152  478
                 ret = URLDecoder.decode( ret, "UTF-8" );
 153  
             }
 154  0
             catch ( URISyntaxException e )
 155  
             {
 156  
             }
 157  0
             catch ( UnsupportedEncodingException e )
 158  
             {
 159  478
             }
 160  
 
 161  478
             return ret;
 162  
         }
 163  
 
 164  
         private static boolean isAcceptableLink( String link )
 165  
         {
 166  478
             if ( StringUtils.isEmpty( link ) )
 167  
             {
 168  0
                 return false;
 169  
             }
 170  
 
 171  2265
             for ( int i = 0; i < SKIPS.length; i++ )
 172  
             {
 173  1822
                 if ( SKIPS[i].matcher( link ).find() )
 174  
                 {
 175  35
                     return false;
 176  
                 }
 177  
             }
 178  
 
 179  443
             return true;
 180  
         }
 181  
     }
 182  
 }