Coverage Report

Coverage Report - org.apache.commons.feedparser.locate.LinkLocator

Classes in this File

Line Coverage

Branch Coverage

Complexity

LinkLocator

0/11

0/4

6.2

LinkLocator$1

0/51

0/44

6.2

 /*
  * Copyright 1999,2004 The Apache Software Foundation.
  * 
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  * 
  *      http://www.apache.org/licenses/LICENSE-2.0
  * 
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.commons.feedparser.locate;
 
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.HashSet;
 import java.util.List;
 
 import org.apache.commons.feedparser.FeedList;
 
 /**
  * Find links by parsing the raw HTML.  We only return links that are on the
  * same site and link to /index.rdf LINKS and so forth.
  *
  * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
  */
 public class LinkLocator {
 
     /**
      * 
      *
      * 
      */
     public static final List locate( String resource,
                                      String content,
                                      final FeedList list )
         throws Exception {
 
         /**
          * When we have been given feeds at a higher level (via <link rel> tags
          * we should prefer these.
          */
         final boolean hasExplicitRSSFeed = list.getAdRSSFeed() != null;
         final boolean hasExplicitAtomFeed = list.getAdRSSFeed() != null;
 
         AnchorParserListener listener = new AnchorParserListener() {
 
                 String resource = null;
                 
                 String site = null;
 
                 HashSet seen = new HashSet();
 
                 boolean hasFoundRSSFeed = false;
                 boolean hasFoundAtomFeed = false;
                 
                 public void setContext( Object context ) {
 
                     resource = (String)context;
                     
                     //pass in the resource of the blog
                     site = getSite( resource );
                     
                 }
 
                 public Object getResult() {
                     return list;
                 }
 
                 public boolean onAnchor( String href, String rel, String title ) {
                     String current = ResourceExpander.expand( resource, href );
                     if ( current == null )
                         return true; //obviously not
 
                     //FIXME: if it's at the same directory level we should prioritize it.
                     //for example:
                     //
                     // http://peerfear.org/blog/
                     //
                     // http://peerfear.org/blog/index.rdf
                     //
                     // instead of 
                     //
                     // http://peerfear.org/index.rdf
 
                     //see if the link is on a different site
                     if ( ! site.equals( getSite( current ) ) ) {
                         return true;
                     }
 
                     //Radio style feed.  Screw that.
                     //FIXME: What happens if the Feed Parser is used to
                     //aggregate feeds on the localhost? This will break that.
                     //Brad Neuberg, bkn3@columbia.edu
                     if ( current.startsWith( "http://127" ) ) 
                         return true;
 
                     if ( seen.contains( current ) ) {
                         return true;
                     } 
 
                     seen.add( current );
 
                     //FIXME: we should assert tha that these feeds are from the SAME
                     //domain not a link to another feed.
 
                     boolean isRSSLink = current.endsWith( ".rss" );
 
                     //support ROLLER RSS links and explicit link discovery by
                     //non-extensions.
                     if ( isRSSLink == false ) {
 
                         isRSSLink =
                             title != null &&
                             title.equalsIgnoreCase( "rss" ) &&
                             href.indexOf( "rss" ) != -1;
 
                     } 
 
                     if ( isRSSLink ) {
 
                         //this is an RSS feed.
                         FeedReference ref = new FeedReference( current,
                                                                FeedReference.RSS_MEDIA_TYPE );
 
                         
                         //make sure we haven't already discovered this feed
                         //through a different process
                         if (list.contains(ref))
                             return true;
 
                         //Make sure to preserve existing AD feeds first.
                         if ( ! hasExplicitRSSFeed )
                             list.setAdRSSFeed( ref );
 
                         list.add( ref );
 
                         hasFoundRSSFeed = true;
                         
                     }
 
                     if ( current.endsWith( ".atom" ) ) {
 
                         FeedReference ref = new FeedReference( current,
                                                                FeedReference.RSS_MEDIA_TYPE );
 
                         //make sure we haven't already discovered this feed
                         //through a different process
                         if (list.contains(ref))
                             return true;
                         
                         //Make sure to preserve existing AD feeds first.
                         if ( ! hasExplicitAtomFeed )
                             list.setAdAtomFeed( ref );
 
                         list.add( ref );
 
                         hasFoundAtomFeed = true;
 
                     }
 
                     if ( current.endsWith( ".xml" ) ||
                          current.endsWith( ".rdf" ) ) {
 
                         //NOTE that we do allow autodiscovery forfor index.xml
                         //and index.rdf files but we don't prefer them since
                         //these extensions are generic.  We would prefer to use
                         //index.rss or even Atom (though people tend to use Atom
                         //autodiscovery now).  This is important because if we
                         //spit back an index.xml file thats NOT RSS or worse an
                         //index.rdf file thats FOAF then we might break callers.
 
                         FeedReference ref = new FeedReference( current,
                                                                FeedReference.RSS_MEDIA_TYPE );
                         
                         //make sure we haven't already discovered this feed
                         //through a different process
                         if (list.contains(ref))
                             return true;
 
                         //see if we should RESORT to using this.
 
                         if ( ! hasExplicitRSSFeed && ! hasFoundRSSFeed ) {
 
                             //NOTE: when we have found an existing RDF file use
                             //that instead..  This is probably RSS 1.0 which is
                             //much better than RSS 0.91
 
                             if ( list.getAdRSSFeed() == null ||
                                  list.getAdRSSFeed().resource.endsWith( ".rdf" ) == false ) {
 
                                 list.setAdRSSFeed( ref );
 
                             }
 
                         }
 
                         //feed for this blog.
                         list.add( ref );
                         return true;
                         
                     } 
 
                     //for coderman's blog at http://www.peertech.org
                     //FIXME: This is a hack, Brad Neuberg, bkn3@columbia.edu
                     if ( current.endsWith( "/node/feed" ) )
                         list.add( current );
 
                     return true;
                     
                 }
 
             };
 
         listener.setContext( resource );
         AnchorParser.parseAnchors( content, listener );
         
         return list;
         
     }
 
     public static String getSite( String resource ) {
 
         try {
 
             String site = new URL( resource ).getHost();
             return site.replaceAll( "http://www", "http://" );
             
         } catch ( MalformedURLException e ) {
             return null;
         }
         
     }
 
 }

1		/*
2		* Copyright 1999,2004 The Apache Software Foundation.
3		*
4		* Licensed under the Apache License, Version 2.0 (the "License");
5		* you may not use this file except in compliance with the License.
6		* You may obtain a copy of the License at
7		*
8		* http://www.apache.org/licenses/LICENSE-2.0
9		*
10		* Unless required by applicable law or agreed to in writing, software
11		* distributed under the License is distributed on an "AS IS" BASIS,
12		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13		* See the License for the specific language governing permissions and
14		* limitations under the License.
15		*/
16
17		package org.apache.commons.feedparser.locate;
18
19		import java.net.MalformedURLException;
20		import java.net.URL;
21		import java.util.HashSet;
22		import java.util.List;
23
24		import org.apache.commons.feedparser.FeedList;
25
26		/**
27		* Find links by parsing the raw HTML. We only return links that are on the
28		* same site and link to /index.rdf LINKS and so forth.
29		*
30		* @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
31		*/
32	0	public class LinkLocator {
33
34		/**
35		*
36		*
37		*
38		*/
39		public static final List locate( String resource,
40		String content,
41		final FeedList list )
42		throws Exception {
43
44		/**
45		* When we have been given feeds at a higher level (via <link rel> tags
46		* we should prefer these.
47		*/
48	0	final boolean hasExplicitRSSFeed = list.getAdRSSFeed() != null;
49	0	final boolean hasExplicitAtomFeed = list.getAdRSSFeed() != null;
50
51	0	AnchorParserListener listener = new AnchorParserListener() {
52
53	0	String resource = null;
54
55	0	String site = null;
56
57	0	HashSet seen = new HashSet();
58
59	0	boolean hasFoundRSSFeed = false;
60	0	boolean hasFoundAtomFeed = false;
61
62		public void setContext( Object context ) {
63
64	0	resource = (String)context;
65
66		//pass in the resource of the blog
67	0	site = getSite( resource );
68
69	0	}
70
71		public Object getResult() {
72	0	return list;
73		}
74
75		public boolean onAnchor( String href, String rel, String title ) {
76	0	String current = ResourceExpander.expand( resource, href );
77	0	if ( current == null )
78	0	return true; //obviously not
79
80		//FIXME: if it's at the same directory level we should prioritize it.
81		//for example:
82		//
83		// http://peerfear.org/blog/
84		//
85		// http://peerfear.org/blog/index.rdf
86		//
87		// instead of
88		//
89		// http://peerfear.org/index.rdf
90
91		//see if the link is on a different site
92	0	if ( ! site.equals( getSite( current ) ) ) {
93	0	return true;
94		}
95
96		//Radio style feed. Screw that.
97		//FIXME: What happens if the Feed Parser is used to
98		//aggregate feeds on the localhost? This will break that.
99		//Brad Neuberg, bkn3@columbia.edu
100	0	if ( current.startsWith( "http://127" ) )
101	0	return true;
102
103	0	if ( seen.contains( current ) ) {
104	0	return true;
105		}
106
107	0	seen.add( current );
108
109		//FIXME: we should assert tha that these feeds are from the SAME
110		//domain not a link to another feed.
111
112	0	boolean isRSSLink = current.endsWith( ".rss" );
113
114		//support ROLLER RSS links and explicit link discovery by
115		//non-extensions.
116	0	if ( isRSSLink == false ) {
117
118	0	isRSSLink =
119		title != null &&
120		title.equalsIgnoreCase( "rss" ) &&
121		href.indexOf( "rss" ) != -1;
122
123		}
124
125	0	if ( isRSSLink ) {
126
127		//this is an RSS feed.
128	0	FeedReference ref = new FeedReference( current,
129		FeedReference.RSS_MEDIA_TYPE );
130
131
132		//make sure we haven't already discovered this feed
133		//through a different process
134	0	if (list.contains(ref))
135	0	return true;
136
137		//Make sure to preserve existing AD feeds first.
138	0	if ( ! hasExplicitRSSFeed )
139	0	list.setAdRSSFeed( ref );
140
141	0	list.add( ref );
142
143	0	hasFoundRSSFeed = true;
144
145		}
146
147	0	if ( current.endsWith( ".atom" ) ) {
148
149	0	FeedReference ref = new FeedReference( current,
150		FeedReference.RSS_MEDIA_TYPE );
151
152		//make sure we haven't already discovered this feed
153		//through a different process
154	0	if (list.contains(ref))
155	0	return true;
156
157		//Make sure to preserve existing AD feeds first.
158	0	if ( ! hasExplicitAtomFeed )
159	0	list.setAdAtomFeed( ref );
160
161	0	list.add( ref );
162
163	0	hasFoundAtomFeed = true;
164
165		}
166
167	0	if ( current.endsWith( ".xml" ) ||
168		current.endsWith( ".rdf" ) ) {
169
170		//NOTE that we do allow autodiscovery forfor index.xml
171		//and index.rdf files but we don't prefer them since
172		//these extensions are generic. We would prefer to use
173		//index.rss or even Atom (though people tend to use Atom
174		//autodiscovery now). This is important because if we
175		//spit back an index.xml file thats NOT RSS or worse an
176		//index.rdf file thats FOAF then we might break callers.
177
178	0	FeedReference ref = new FeedReference( current,
179		FeedReference.RSS_MEDIA_TYPE );
180
181		//make sure we haven't already discovered this feed
182		//through a different process
183	0	if (list.contains(ref))
184	0	return true;
185
186		//see if we should RESORT to using this.
187
188	0	if ( ! hasExplicitRSSFeed && ! hasFoundRSSFeed ) {
189
190		//NOTE: when we have found an existing RDF file use
191		//that instead.. This is probably RSS 1.0 which is
192		//much better than RSS 0.91
193
194	0	if ( list.getAdRSSFeed() == null ||
195		list.getAdRSSFeed().resource.endsWith( ".rdf" ) == false ) {
196
197	0	list.setAdRSSFeed( ref );
198
199		}
200
201		}
202
203		//feed for this blog.
204	0	list.add( ref );
205	0	return true;
206
207		}
208
209		//for coderman's blog at http://www.peertech.org
210		//FIXME: This is a hack, Brad Neuberg, bkn3@columbia.edu
211	0	if ( current.endsWith( "/node/feed" ) )
212	0	list.add( current );
213
214	0	return true;
215
216		}
217
218		};
219
220	0	listener.setContext( resource );
221	0	AnchorParser.parseAnchors( content, listener );
222
223	0	return list;
224
225		}
226
227		public static String getSite( String resource ) {
228
229		try {
230
231	0	String site = new URL( resource ).getHost();
232	0	return site.replaceAll( "http://www", "http://" );
233
234	0	} catch ( MalformedURLException e ) {
235	0	return null;
236		}
237
238		}
239
240		}