Coverage Report

Coverage Report - org.apache.commons.feedparser.locate.DiscoveryLocator

Classes in this File

0/48

0/16

 /*
  * Copyright 1999,2004 The Apache Software Foundation.
  * 
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  * 
  *      http://www.apache.org/licenses/LICENSE-2.0
  * 
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.commons.feedparser.locate;
 
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.commons.feedparser.FeedList;
 import org.apache.log4j.Logger;
 
 /**
  *
  * http://www.ietf.org/internet-drafts/draft-ietf-atompub-autodiscovery-00.txt
  * 
  * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
  */
 public class DiscoveryLocator {
     
     private static Logger log = Logger.getLogger( DiscoveryLocator.class );
 
     /**
      * Get a FULL link within the content. We then pull the attributes out of
      * this.
      */
     static Pattern element_pattern =
         Pattern.compile( "<link[^>]+",
                          Pattern.CASE_INSENSITIVE );
 
     /**
      * Regex to match on attributes.
      * 
      * Implementation: Mon Mar 14 2005 01:59 PM (burton@rojo.com): this is a
      * pretty difficult regexp to grok.
      * 
      * There's are two regexps here.  One for attributes with quotes and one
      * without. Each regexp has two groups - 1 is the name and 2 is the value.
      * You can split the regexp on | to better understand each individual
      * regexp.
      */
 
     // > Attribute values MUST be one of the following: enclosed in double
     // > quotes, enclosed in single quotes, or not enclosed in quotes at all.
     //
     // 
     static String ATTR_REGEXP = "([a-zA-Z]+)=[\"']([^\"']+)[\"']|([a-zA-Z]+)=([^\"'>\r\n\t ]+)";
     
     static Pattern ATTR_PATTERN = Pattern.compile( ATTR_REGEXP,
                                                    Pattern.CASE_INSENSITIVE );
 
     static HashSet mediatypes = new HashSet();
 
     static {
 
         mediatypes.add( FeedReference.ATOM_MEDIA_TYPE );
         mediatypes.add( FeedReference.RSS_MEDIA_TYPE );
         mediatypes.add( FeedReference.XML_MEDIA_TYPE );
         
     }
 
     /**
      * Locate a feed via RSS/Atom auto-discovery.  If both Atom and RSS are
      * listed we return both.  Actually we return all Atom/RSS or XML feeds
      * including FOAF.  It's up to the caller to use the correct feed.
      *
      * 
      */
     public static final List locate( String resource,
                                      String content,
                                      FeedList list )
         throws Exception {
 
         //this mechanism is easier but it isn't efficient.  I should just parse
         //elements forward until I discover </head>.  Also note that this isn't
         //doing all feed URLs just the first ones it finds.  
 
         Matcher m = element_pattern.matcher( content );
 
         while( m.find() ) {
             //the value of the link element XML... example:
             
             // <link rel="alternate" 
             //      href="http://www.codinginparadise.org/weblog/atom.xml"
             //      type="application/atom+xml" 
             //      title="ATOM" />
                  
             String element = m.group( 0 );
 
             HashMap attributes = getAttributes( element );
             
             String type = (String)attributes.get( "type" );
             if (type != null)
                 type = type.toLowerCase();
 
             if ( mediatypes.contains( type )  ) {
 
                 //expand the href
                 String href = (String)attributes.get( "href" );
                 log.debug("href="+href);
 
                 // http://xml.coverpages.org/draft-ietf-atompub-autodiscovery-00.txt
                 
                 // > The href attribute MUST be present in an Atom autodiscovery element,
                 // > and its value MUST be the URI [RFC2396] of an Atom feed.  The value
                 // > MAY be a relative URI, and if so, clients MUST resolve it to a full
                 // > URI (section 5 of [RFC2396]) using the document's base URI (section
                 // > 12.4 of HTML 4 [W3C.REC-html401-19991224]).
 
                 href = ResourceExpander.expand( resource, href );
 
                 FeedReference feedReference = new FeedReference( href, type );
                 
                 feedReference.title = (String)attributes.get( "title" );
                 
                 list.add( feedReference );
 
                 if ( type.equals( FeedReference.ATOM_MEDIA_TYPE ) )
                     list.setFirstAdAtomFeed( feedReference );
                     
                 if ( type.equals( FeedReference.RSS_MEDIA_TYPE ) )
                     list.setFirstAdRSSFeed( feedReference );
 
             }
             
         }
         
         return list;
         
     }
 
     /**
      * Parse attributes within elements into a hashmap.
      *
      * 
      */
     public static HashMap getAttributes( String content ) {
 
         HashMap map = new HashMap();
 
         Matcher m = ATTR_PATTERN.matcher( content );
 
         int index = 0;
 
         while ( m.find( index ) ) {
 
             String name = m.group( 1 );
             String value = null;
 
             //Since we use an OR regexp the first match will be 1/2 and the
             //second will be 3/4
             if ( name != null ) {
                 value = m.group( 2 );
             } else {
                 name = m.group( 3 );
                 value = m.group( 4 );
             }
 
             //String value = m.group( 2 ).toLowerCase().trim();
             name = name.toLowerCase().trim();
             // Some services, such as AOL LiveJournal, are case sensitive
             // on their resource names; can't do a toLowerCase.
             // Brad Neuberg, bkn3@columbia.edu
             // String value = m.group( 2 ).toLowerCase().trim();
             value = value.trim();
 
             if ( "".equals( value ) ) 
                 value = null; 
 
             map.put( name, value );
             
             index =  m.end();
             
         } 
 
         return map;
         
     }
 
 }

1		/*
2		* Copyright 1999,2004 The Apache Software Foundation.
3		*
4		* Licensed under the Apache License, Version 2.0 (the "License");
5		* you may not use this file except in compliance with the License.
6		* You may obtain a copy of the License at
7		*
8		* http://www.apache.org/licenses/LICENSE-2.0
9		*
10		* Unless required by applicable law or agreed to in writing, software
11		* distributed under the License is distributed on an "AS IS" BASIS,
12		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13		* See the License for the specific language governing permissions and
14		* limitations under the License.
15		*/
16
17		package org.apache.commons.feedparser.locate;
18
19		import java.util.HashMap;
20		import java.util.HashSet;
21		import java.util.List;
22		import java.util.regex.Matcher;
23		import java.util.regex.Pattern;
24
25		import org.apache.commons.feedparser.FeedList;
26		import org.apache.log4j.Logger;
27
28		/**
29		*
30		* http://www.ietf.org/internet-drafts/draft-ietf-atompub-autodiscovery-00.txt
31		*
32		* @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
33		*/
34	0	public class DiscoveryLocator {
35
36	0	private static Logger log = Logger.getLogger( DiscoveryLocator.class );
37
38		/**
39		* Get a FULL link within the content. We then pull the attributes out of
40		* this.
41		*/
42	0	static Pattern element_pattern =
43		Pattern.compile( "<link[^>]+",
44		Pattern.CASE_INSENSITIVE );
45
46		/**
47		* Regex to match on attributes.
48		*
49		* Implementation: Mon Mar 14 2005 01:59 PM (burton@rojo.com): this is a
50		* pretty difficult regexp to grok.
51		*
52		* There's are two regexps here. One for attributes with quotes and one
53		* without. Each regexp has two groups - 1 is the name and 2 is the value.
54		* You can split the regexp on | to better understand each individual
55		* regexp.
56		*/
57
58		// > Attribute values MUST be one of the following: enclosed in double
59		// > quotes, enclosed in single quotes, or not enclosed in quotes at all.
60		//
61		//
62	0	static String ATTR_REGEXP = "([a-zA-Z]+)=[\"']([^\"']+)[\"']|([a-zA-Z]+)=([^\"'>\r\n\t ]+)";
63
64	0	static Pattern ATTR_PATTERN = Pattern.compile( ATTR_REGEXP,
65		Pattern.CASE_INSENSITIVE );
66
67	0	static HashSet mediatypes = new HashSet();
68
69		static {
70
71	0	mediatypes.add( FeedReference.ATOM_MEDIA_TYPE );
72	0	mediatypes.add( FeedReference.RSS_MEDIA_TYPE );
73	0	mediatypes.add( FeedReference.XML_MEDIA_TYPE );
74
75	0	}
76
77		/**
78		* Locate a feed via RSS/Atom auto-discovery. If both Atom and RSS are
79		* listed we return both. Actually we return all Atom/RSS or XML feeds
80		* including FOAF. It's up to the caller to use the correct feed.
81		*
82		*
83		*/
84		public static final List locate( String resource,
85		String content,
86		FeedList list )
87		throws Exception {
88
89		//this mechanism is easier but it isn't efficient. I should just parse
90		//elements forward until I discover </head>. Also note that this isn't
91		//doing all feed URLs just the first ones it finds.
92
93	0	Matcher m = element_pattern.matcher( content );
94
95	0	while( m.find() ) {
96		//the value of the link element XML... example:
97
98		// <link rel="alternate"
99		// href="http://www.codinginparadise.org/weblog/atom.xml"
100		// type="application/atom+xml"
101		// title="ATOM" />
102
103	0	String element = m.group( 0 );
104
105	0	HashMap attributes = getAttributes( element );
106
107	0	String type = (String)attributes.get( "type" );
108	0	if (type != null)
109	0	type = type.toLowerCase();
110
111	0	if ( mediatypes.contains( type ) ) {
112
113		//expand the href
114	0	String href = (String)attributes.get( "href" );
115	0	log.debug("href="+href);
116
117		// http://xml.coverpages.org/draft-ietf-atompub-autodiscovery-00.txt
118
119		// > The href attribute MUST be present in an Atom autodiscovery element,
120		// > and its value MUST be the URI [RFC2396] of an Atom feed. The value
121		// > MAY be a relative URI, and if so, clients MUST resolve it to a full
122		// > URI (section 5 of [RFC2396]) using the document's base URI (section
123		// > 12.4 of HTML 4 [W3C.REC-html401-19991224]).
124
125	0	href = ResourceExpander.expand( resource, href );
126
127	0	FeedReference feedReference = new FeedReference( href, type );
128
129	0	feedReference.title = (String)attributes.get( "title" );
130
131	0	list.add( feedReference );
132
133	0	if ( type.equals( FeedReference.ATOM_MEDIA_TYPE ) )
134	0	list.setFirstAdAtomFeed( feedReference );
135
136	0	if ( type.equals( FeedReference.RSS_MEDIA_TYPE ) )
137	0	list.setFirstAdRSSFeed( feedReference );
138
139		}
140
141	0	}
142
143	0	return list;
144
145		}
146
147		/**
148		* Parse attributes within elements into a hashmap.
149		*
150		*
151		*/
152		public static HashMap getAttributes( String content ) {
153
154	0	HashMap map = new HashMap();
155
156	0	Matcher m = ATTR_PATTERN.matcher( content );
157
158	0	int index = 0;
159
160	0	while ( m.find( index ) ) {
161
162	0	String name = m.group( 1 );
163	0	String value = null;
164
165		//Since we use an OR regexp the first match will be 1/2 and the
166		//second will be 3/4
167	0	if ( name != null ) {
168	0	value = m.group( 2 );
169		} else {
170	0	name = m.group( 3 );
171	0	value = m.group( 4 );
172		}
173
174		//String value = m.group( 2 ).toLowerCase().trim();
175	0	name = name.toLowerCase().trim();
176		// Some services, such as AOL LiveJournal, are case sensitive
177		// on their resource names; can't do a toLowerCase.
178		// Brad Neuberg, bkn3@columbia.edu
179		// String value = m.group( 2 ).toLowerCase().trim();
180	0	value = value.trim();
181
182	0	if ( "".equals( value ) )
183	0	value = null;
184
185	0	map.put( name, value );
186
187	0	index = m.end();
188
189	0	}
190
191	0	return map;
192
193		}
194
195		}