Coverage Report - org.apache.commons.feedparser.locate.LinkLocator
 
Classes in this File Line Coverage Branch Coverage Complexity
LinkLocator
0%
0/11
0%
0/4
6.2
LinkLocator$1
0%
0/51
0%
0/44
6.2
 
 1  
 /*
 2  
  * Copyright 1999,2004 The Apache Software Foundation.
 3  
  * 
 4  
  * Licensed under the Apache License, Version 2.0 (the "License");
 5  
  * you may not use this file except in compliance with the License.
 6  
  * You may obtain a copy of the License at
 7  
  * 
 8  
  *      http://www.apache.org/licenses/LICENSE-2.0
 9  
  * 
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS,
 12  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13  
  * See the License for the specific language governing permissions and
 14  
  * limitations under the License.
 15  
  */
 16  
 
 17  
 package org.apache.commons.feedparser.locate;
 18  
 
 19  
 import java.net.MalformedURLException;
 20  
 import java.net.URL;
 21  
 import java.util.HashSet;
 22  
 import java.util.List;
 23  
 
 24  
 import org.apache.commons.feedparser.FeedList;
 25  
 
 26  
 /**
 27  
  * Find links by parsing the raw HTML.  We only return links that are on the
 28  
  * same site and point to feed files such as /index.rdf and so forth.
 29  
  *
 30  
  * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
 31  
  */
 32  0
 public class LinkLocator {
 33  
 
 34  
     /**
 35  
      * 
 36  
      *
 37  
      * 
 38  
      */
 39  
     public static final List locate( String resource,
 40  
                                      String content,
 41  
                                      final FeedList list )
 42  
         throws Exception {
 43  
 
 44  
         /**
 45  
          * When we have been given feeds at a higher level (via <link rel> tags
 46  
          * we should prefer these.
 47  
          */
 48  0
         final boolean hasExplicitRSSFeed = list.getAdRSSFeed() != null;
 49  0
         final boolean hasExplicitAtomFeed = list.getAdRSSFeed() != null;
 50  
 
 51  0
         AnchorParserListener listener = new AnchorParserListener() {
 52  
 
 53  0
                 String resource = null;
 54  
                 
 55  0
                 String site = null;
 56  
 
 57  0
                 HashSet seen = new HashSet();
 58  
 
 59  0
                 boolean hasFoundRSSFeed = false;
 60  0
                 boolean hasFoundAtomFeed = false;
 61  
                 
 62  
                 public void setContext( Object context ) {
 63  
 
 64  0
                     resource = (String)context;
 65  
                     
 66  
                     //pass in the resource of the blog
 67  0
                     site = getSite( resource );
 68  
                     
 69  0
                 }
 70  
 
 71  
                 public Object getResult() {
 72  0
                     return list;
 73  
                 }
 74  
 
 75  
                 public boolean onAnchor( String href, String rel, String title ) {
 76  0
                     String current = ResourceExpander.expand( resource, href );
 77  0
                     if ( current == null )
 78  0
                         return true; //obviously not
 79  
 
 80  
                     //FIXME: if it's at the same directory level we should prioritize it.
 81  
                     //for example:
 82  
                     //
 83  
                     // http://peerfear.org/blog/
 84  
                     //
 85  
                     // http://peerfear.org/blog/index.rdf
 86  
                     //
 87  
                     // instead of 
 88  
                     //
 89  
                     // http://peerfear.org/index.rdf
 90  
 
 91  
                     //see if the link is on a different site
 92  0
                     if ( ! site.equals( getSite( current ) ) ) {
 93  0
                         return true;
 94  
                     }
 95  
 
 96  
                     //Radio style feed.  Screw that.
 97  
                     //FIXME: What happens if the Feed Parser is used to
 98  
                     //aggregate feeds on the localhost? This will break that.
 99  
                     //Brad Neuberg, bkn3@columbia.edu
 100  0
                     if ( current.startsWith( "http://127" ) ) 
 101  0
                         return true;
 102  
 
 103  0
                     if ( seen.contains( current ) ) {
 104  0
                         return true;
 105  
                     } 
 106  
 
 107  0
                     seen.add( current );
 108  
 
 109  
                     //FIXME: we should assert tha that these feeds are from the SAME
 110  
                     //domain not a link to another feed.
 111  
 
 112  0
                     boolean isRSSLink = current.endsWith( ".rss" );
 113  
 
 114  
                     //support ROLLER RSS links and explicit link discovery by
 115  
                     //non-extensions.
 116  0
                     if ( isRSSLink == false ) {
 117  
 
 118  0
                         isRSSLink =
 119  
                             title != null &&
 120  
                             title.equalsIgnoreCase( "rss" ) &&
 121  
                             href.indexOf( "rss" ) != -1;
 122  
 
 123  
                     } 
 124  
 
 125  0
                     if ( isRSSLink ) {
 126  
 
 127  
                         //this is an RSS feed.
 128  0
                         FeedReference ref = new FeedReference( current,
 129  
                                                                FeedReference.RSS_MEDIA_TYPE );
 130  
 
 131  
                         
 132  
                         //make sure we haven't already discovered this feed
 133  
                         //through a different process
 134  0
                         if (list.contains(ref))
 135  0
                             return true;
 136  
 
 137  
                         //Make sure to preserve existing AD feeds first.
 138  0
                         if ( ! hasExplicitRSSFeed )
 139  0
                             list.setAdRSSFeed( ref );
 140  
 
 141  0
                         list.add( ref );
 142  
 
 143  0
                         hasFoundRSSFeed = true;
 144  
                         
 145  
                     }
 146  
 
 147  0
                     if ( current.endsWith( ".atom" ) ) {
 148  
 
 149  0
                         FeedReference ref = new FeedReference( current,
 150  
                                                                FeedReference.RSS_MEDIA_TYPE );
 151  
 
 152  
                         //make sure we haven't already discovered this feed
 153  
                         //through a different process
 154  0
                         if (list.contains(ref))
 155  0
                             return true;
 156  
                         
 157  
                         //Make sure to preserve existing AD feeds first.
 158  0
                         if ( ! hasExplicitAtomFeed )
 159  0
                             list.setAdAtomFeed( ref );
 160  
 
 161  0
                         list.add( ref );
 162  
 
 163  0
                         hasFoundAtomFeed = true;
 164  
 
 165  
                     }
 166  
 
 167  0
                     if ( current.endsWith( ".xml" ) ||
 168  
                          current.endsWith( ".rdf" ) ) {
 169  
 
 170  
                         //NOTE that we do allow autodiscovery forfor index.xml
 171  
                         //and index.rdf files but we don't prefer them since
 172  
                         //these extensions are generic.  We would prefer to use
 173  
                         //index.rss or even Atom (though people tend to use Atom
 174  
                         //autodiscovery now).  This is important because if we
 175  
                         //spit back an index.xml file thats NOT RSS or worse an
 176  
                         //index.rdf file thats FOAF then we might break callers.
 177  
 
 178  0
                         FeedReference ref = new FeedReference( current,
 179  
                                                                FeedReference.RSS_MEDIA_TYPE );
 180  
                         
 181  
                         //make sure we haven't already discovered this feed
 182  
                         //through a different process
 183  0
                         if (list.contains(ref))
 184  0
                             return true;
 185  
 
 186  
                         //see if we should RESORT to using this.
 187  
 
 188  0
                         if ( ! hasExplicitRSSFeed && ! hasFoundRSSFeed ) {
 189  
 
 190  
                             //NOTE: when we have found an existing RDF file use
 191  
                             //that instead..  This is probably RSS 1.0 which is
 192  
                             //much better than RSS 0.91
 193  
 
 194  0
                             if ( list.getAdRSSFeed() == null ||
 195  
                                  list.getAdRSSFeed().resource.endsWith( ".rdf" ) == false ) {
 196  
 
 197  0
                                 list.setAdRSSFeed( ref );
 198  
 
 199  
                             }
 200  
 
 201  
                         }
 202  
 
 203  
                         //feed for this blog.
 204  0
                         list.add( ref );
 205  0
                         return true;
 206  
                         
 207  
                     } 
 208  
 
 209  
                     //for coderman's blog at http://www.peertech.org
 210  
                     //FIXME: This is a hack, Brad Neuberg, bkn3@columbia.edu
 211  0
                     if ( current.endsWith( "/node/feed" ) )
 212  0
                         list.add( current );
 213  
 
 214  0
                     return true;
 215  
                     
 216  
                 }
 217  
 
 218  
             };
 219  
 
 220  0
         listener.setContext( resource );
 221  0
         AnchorParser.parseAnchors( content, listener );
 222  
         
 223  0
         return list;
 224  
         
 225  
     }
 226  
 
 227  
     public static String getSite( String resource ) {
 228  
 
 229  
         try {
 230  
 
 231  0
             String site = new URL( resource ).getHost();
 232  0
             return site.replaceAll( "http://www", "http://" );
 233  
             
 234  0
         } catch ( MalformedURLException e ) {
 235  0
             return null;
 236  
         }
 237  
         
 238  
     }
 239  
 
 240  
 }