Coverage Report - org.apache.commons.feedparser.locate.DiscoveryLocator
 
Classes in this File Line Coverage Branch Coverage Complexity
DiscoveryLocator
0%
0/48
0%
0/16
5
 
 1  
 /*
 2  
  * Copyright 1999,2004 The Apache Software Foundation.
 3  
  * 
 4  
  * Licensed under the Apache License, Version 2.0 (the "License");
 5  
  * you may not use this file except in compliance with the License.
 6  
  * You may obtain a copy of the License at
 7  
  * 
 8  
  *      http://www.apache.org/licenses/LICENSE-2.0
 9  
  * 
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS,
 12  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13  
  * See the License for the specific language governing permissions and
 14  
  * limitations under the License.
 15  
  */
 16  
 
 17  
 package org.apache.commons.feedparser.locate;
 18  
 
 19  
 import java.util.HashMap;
 20  
 import java.util.HashSet;
 21  
 import java.util.List;
 22  
 import java.util.regex.Matcher;
 23  
 import java.util.regex.Pattern;
 24  
 
 25  
 import org.apache.commons.feedparser.FeedList;
 26  
 import org.apache.log4j.Logger;
 27  
 
 28  
 /**
 29  
  *
 30  
  * http://www.ietf.org/internet-drafts/draft-ietf-atompub-autodiscovery-00.txt
 31  
  * 
 32  
  * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
 33  
  */
 34  0
 public class DiscoveryLocator {
 35  
     
 36  0
     private static Logger log = Logger.getLogger( DiscoveryLocator.class );
 37  
 
 38  
     /**
 39  
      * Get a FULL link within the content. We then pull the attributes out of
 40  
      * this.
 41  
      */
 42  0
     static Pattern element_pattern =
 43  
         Pattern.compile( "<link[^>]+",
 44  
                          Pattern.CASE_INSENSITIVE );
 45  
 
 46  
     /**
 47  
      * Regex to match on attributes.
 48  
      * 
 49  
      * Implementation: Mon Mar 14 2005 01:59 PM (burton@rojo.com): this is a
 50  
      * pretty difficult regexp to grok.
 51  
      * 
 52  
      * There's are two regexps here.  One for attributes with quotes and one
 53  
      * without. Each regexp has two groups - 1 is the name and 2 is the value.
 54  
      * You can split the regexp on | to better understand each individual
 55  
      * regexp.
 56  
      */
 57  
 
 58  
     // > Attribute values MUST be one of the following: enclosed in double
 59  
     // > quotes, enclosed in single quotes, or not enclosed in quotes at all.
 60  
     //
 61  
     // 
 62  0
     static String ATTR_REGEXP = "([a-zA-Z]+)=[\"']([^\"']+)[\"']|([a-zA-Z]+)=([^\"'>\r\n\t ]+)";
 63  
     
 64  0
     static Pattern ATTR_PATTERN = Pattern.compile( ATTR_REGEXP,
 65  
                                                    Pattern.CASE_INSENSITIVE );
 66  
 
 67  0
     static HashSet mediatypes = new HashSet();
 68  
 
 69  
     static {
 70  
 
 71  0
         mediatypes.add( FeedReference.ATOM_MEDIA_TYPE );
 72  0
         mediatypes.add( FeedReference.RSS_MEDIA_TYPE );
 73  0
         mediatypes.add( FeedReference.XML_MEDIA_TYPE );
 74  
         
 75  0
     }
 76  
 
 77  
     /**
 78  
      * Locate a feed via RSS/Atom auto-discovery.  If both Atom and RSS are
 79  
      * listed we return both.  Actually we return all Atom/RSS or XML feeds
 80  
      * including FOAF.  It's up to the caller to use the correct feed.
 81  
      *
 82  
      * 
 83  
      */
 84  
     public static final List locate( String resource,
 85  
                                      String content,
 86  
                                      FeedList list )
 87  
         throws Exception {
 88  
 
 89  
         //this mechanism is easier but it isn't efficient.  I should just parse
 90  
         //elements forward until I discover </head>.  Also note that this isn't
 91  
         //doing all feed URLs just the first ones it finds.  
 92  
 
 93  0
         Matcher m = element_pattern.matcher( content );
 94  
 
 95  0
         while( m.find() ) {
 96  
             //the value of the link element XML... example:
 97  
             
 98  
             // <link rel="alternate" 
 99  
             //      href="http://www.codinginparadise.org/weblog/atom.xml"
 100  
             //      type="application/atom+xml" 
 101  
             //      title="ATOM" />
 102  
                  
 103  0
             String element = m.group( 0 );
 104  
 
 105  0
             HashMap attributes = getAttributes( element );
 106  
             
 107  0
             String type = (String)attributes.get( "type" );
 108  0
             if (type != null)
 109  0
                 type = type.toLowerCase();
 110  
 
 111  0
             if ( mediatypes.contains( type )  ) {
 112  
 
 113  
                 //expand the href
 114  0
                 String href = (String)attributes.get( "href" );
 115  0
                 log.debug("href="+href);
 116  
 
 117  
                 // http://xml.coverpages.org/draft-ietf-atompub-autodiscovery-00.txt
 118  
                 
 119  
                 // > The href attribute MUST be present in an Atom autodiscovery element,
 120  
                 // > and its value MUST be the URI [RFC2396] of an Atom feed.  The value
 121  
                 // > MAY be a relative URI, and if so, clients MUST resolve it to a full
 122  
                 // > URI (section 5 of [RFC2396]) using the document's base URI (section
 123  
                 // > 12.4 of HTML 4 [W3C.REC-html401-19991224]).
 124  
 
 125  0
                 href = ResourceExpander.expand( resource, href );
 126  
 
 127  0
                 FeedReference feedReference = new FeedReference( href, type );
 128  
                 
 129  0
                 feedReference.title = (String)attributes.get( "title" );
 130  
                 
 131  0
                 list.add( feedReference );
 132  
 
 133  0
                 if ( type.equals( FeedReference.ATOM_MEDIA_TYPE ) )
 134  0
                     list.setFirstAdAtomFeed( feedReference );
 135  
                     
 136  0
                 if ( type.equals( FeedReference.RSS_MEDIA_TYPE ) )
 137  0
                     list.setFirstAdRSSFeed( feedReference );
 138  
 
 139  
             }
 140  
             
 141  0
         }
 142  
         
 143  0
         return list;
 144  
         
 145  
     }
 146  
 
 147  
     /**
 148  
      * Parse attributes within elements into a hashmap.
 149  
      *
 150  
      * 
 151  
      */
 152  
     public static HashMap getAttributes( String content ) {
 153  
 
 154  0
         HashMap map = new HashMap();
 155  
 
 156  0
         Matcher m = ATTR_PATTERN.matcher( content );
 157  
 
 158  0
         int index = 0;
 159  
 
 160  0
         while ( m.find( index ) ) {
 161  
 
 162  0
             String name = m.group( 1 );
 163  0
             String value = null;
 164  
 
 165  
             //Since we use an OR regexp the first match will be 1/2 and the
 166  
             //second will be 3/4
 167  0
             if ( name != null ) {
 168  0
                 value = m.group( 2 );
 169  
             } else {
 170  0
                 name = m.group( 3 );
 171  0
                 value = m.group( 4 );
 172  
             }
 173  
 
 174  
             //String value = m.group( 2 ).toLowerCase().trim();
 175  0
             name = name.toLowerCase().trim();
 176  
             // Some services, such as AOL LiveJournal, are case sensitive
 177  
             // on their resource names; can't do a toLowerCase.
 178  
             // Brad Neuberg, bkn3@columbia.edu
 179  
             // String value = m.group( 2 ).toLowerCase().trim();
 180  0
             value = value.trim();
 181  
 
 182  0
             if ( "".equals( value ) ) 
 183  0
                 value = null; 
 184  
 
 185  0
             map.put( name, value );
 186  
             
 187  0
             index =  m.end();
 188  
             
 189  0
         } 
 190  
 
 191  0
         return map;
 192  
         
 193  
     }
 194  
 
 195  
 }