Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
LinkLocator |
|
| 6.2;6.2 | ||||
LinkLocator$1 |
|
| 6.2;6.2 |
1 | /* | |
2 | * Copyright 1999,2004 The Apache Software Foundation. | |
3 | * | |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | * you may not use this file except in compliance with the License. | |
6 | * You may obtain a copy of the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | package org.apache.commons.feedparser.locate; | |
18 | ||
19 | import java.net.MalformedURLException; | |
20 | import java.net.URL; | |
21 | import java.util.HashSet; | |
22 | import java.util.List; | |
23 | ||
24 | import org.apache.commons.feedparser.FeedList; | |
25 | ||
26 | /** | |
27 | * Finds feed links by parsing the anchors in the raw HTML. Only links that | |
28 | * are on the same site and look like feeds (for example /index.rdf, or URLs ending in .rss, .atom or .xml) are returned. | |
29 | * | |
30 | * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a> | |
31 | */ | |
32 | 0 | public class LinkLocator { |
33 | ||
34 | /** | |
35 | * | |
36 | * | |
37 | * | |
38 | */ | |
39 | public static final List locate( String resource, | |
40 | String content, | |
41 | final FeedList list ) | |
42 | throws Exception { | |
43 | ||
44 | /** | |
45 | * When we have been given feeds at a higher level (via <link rel> tags | |
46 | * we should prefer these. | |
47 | */ | |
48 | 0 | final boolean hasExplicitRSSFeed = list.getAdRSSFeed() != null; |
49 | 0 | final boolean hasExplicitAtomFeed = list.getAdRSSFeed() != null; |
50 | ||
51 | 0 | AnchorParserListener listener = new AnchorParserListener() { |
52 | ||
53 | 0 | String resource = null; |
54 | ||
55 | 0 | String site = null; |
56 | ||
57 | 0 | HashSet seen = new HashSet(); |
58 | ||
59 | 0 | boolean hasFoundRSSFeed = false; |
60 | 0 | boolean hasFoundAtomFeed = false; |
61 | ||
62 | public void setContext( Object context ) { | |
63 | ||
64 | 0 | resource = (String)context; |
65 | ||
66 | //pass in the resource of the blog | |
67 | 0 | site = getSite( resource ); |
68 | ||
69 | 0 | } |
70 | ||
71 | public Object getResult() { | |
72 | 0 | return list; |
73 | } | |
74 | ||
75 | public boolean onAnchor( String href, String rel, String title ) { | |
76 | 0 | String current = ResourceExpander.expand( resource, href ); |
77 | 0 | if ( current == null ) |
78 | 0 | return true; //obviously not |
79 | ||
80 | //FIXME: if it's at the same directory level we should prioritize it. | |
81 | //for example: | |
82 | // | |
83 | // http://peerfear.org/blog/ | |
84 | // | |
85 | // http://peerfear.org/blog/index.rdf | |
86 | // | |
87 | // instead of | |
88 | // | |
89 | // http://peerfear.org/index.rdf | |
90 | ||
91 | //see if the link is on a different site | |
92 | 0 | if ( ! site.equals( getSite( current ) ) ) { |
93 | 0 | return true; |
94 | } | |
95 | ||
96 | //Radio style feed. Screw that. | |
97 | //FIXME: What happens if the Feed Parser is used to | |
98 | //aggregate feeds on the localhost? This will break that. | |
99 | //Brad Neuberg, bkn3@columbia.edu | |
100 | 0 | if ( current.startsWith( "http://127" ) ) |
101 | 0 | return true; |
102 | ||
103 | 0 | if ( seen.contains( current ) ) { |
104 | 0 | return true; |
105 | } | |
106 | ||
107 | 0 | seen.add( current ); |
108 | ||
109 | //FIXME: we should assert tha that these feeds are from the SAME | |
110 | //domain not a link to another feed. | |
111 | ||
112 | 0 | boolean isRSSLink = current.endsWith( ".rss" ); |
113 | ||
114 | //support ROLLER RSS links and explicit link discovery by | |
115 | //non-extensions. | |
116 | 0 | if ( isRSSLink == false ) { |
117 | ||
118 | 0 | isRSSLink = |
119 | title != null && | |
120 | title.equalsIgnoreCase( "rss" ) && | |
121 | href.indexOf( "rss" ) != -1; | |
122 | ||
123 | } | |
124 | ||
125 | 0 | if ( isRSSLink ) { |
126 | ||
127 | //this is an RSS feed. | |
128 | 0 | FeedReference ref = new FeedReference( current, |
129 | FeedReference.RSS_MEDIA_TYPE ); | |
130 | ||
131 | ||
132 | //make sure we haven't already discovered this feed | |
133 | //through a different process | |
134 | 0 | if (list.contains(ref)) |
135 | 0 | return true; |
136 | ||
137 | //Make sure to preserve existing AD feeds first. | |
138 | 0 | if ( ! hasExplicitRSSFeed ) |
139 | 0 | list.setAdRSSFeed( ref ); |
140 | ||
141 | 0 | list.add( ref ); |
142 | ||
143 | 0 | hasFoundRSSFeed = true; |
144 | ||
145 | } | |
146 | ||
147 | 0 | if ( current.endsWith( ".atom" ) ) { |
148 | ||
149 | 0 | FeedReference ref = new FeedReference( current, |
150 | FeedReference.RSS_MEDIA_TYPE ); | |
151 | ||
152 | //make sure we haven't already discovered this feed | |
153 | //through a different process | |
154 | 0 | if (list.contains(ref)) |
155 | 0 | return true; |
156 | ||
157 | //Make sure to preserve existing AD feeds first. | |
158 | 0 | if ( ! hasExplicitAtomFeed ) |
159 | 0 | list.setAdAtomFeed( ref ); |
160 | ||
161 | 0 | list.add( ref ); |
162 | ||
163 | 0 | hasFoundAtomFeed = true; |
164 | ||
165 | } | |
166 | ||
167 | 0 | if ( current.endsWith( ".xml" ) || |
168 | current.endsWith( ".rdf" ) ) { | |
169 | ||
170 | //NOTE that we do allow autodiscovery forfor index.xml | |
171 | //and index.rdf files but we don't prefer them since | |
172 | //these extensions are generic. We would prefer to use | |
173 | //index.rss or even Atom (though people tend to use Atom | |
174 | //autodiscovery now). This is important because if we | |
175 | //spit back an index.xml file thats NOT RSS or worse an | |
176 | //index.rdf file thats FOAF then we might break callers. | |
177 | ||
178 | 0 | FeedReference ref = new FeedReference( current, |
179 | FeedReference.RSS_MEDIA_TYPE ); | |
180 | ||
181 | //make sure we haven't already discovered this feed | |
182 | //through a different process | |
183 | 0 | if (list.contains(ref)) |
184 | 0 | return true; |
185 | ||
186 | //see if we should RESORT to using this. | |
187 | ||
188 | 0 | if ( ! hasExplicitRSSFeed && ! hasFoundRSSFeed ) { |
189 | ||
190 | //NOTE: when we have found an existing RDF file use | |
191 | //that instead.. This is probably RSS 1.0 which is | |
192 | //much better than RSS 0.91 | |
193 | ||
194 | 0 | if ( list.getAdRSSFeed() == null || |
195 | list.getAdRSSFeed().resource.endsWith( ".rdf" ) == false ) { | |
196 | ||
197 | 0 | list.setAdRSSFeed( ref ); |
198 | ||
199 | } | |
200 | ||
201 | } | |
202 | ||
203 | //feed for this blog. | |
204 | 0 | list.add( ref ); |
205 | 0 | return true; |
206 | ||
207 | } | |
208 | ||
209 | //for coderman's blog at http://www.peertech.org | |
210 | //FIXME: This is a hack, Brad Neuberg, bkn3@columbia.edu | |
211 | 0 | if ( current.endsWith( "/node/feed" ) ) |
212 | 0 | list.add( current ); |
213 | ||
214 | 0 | return true; |
215 | ||
216 | } | |
217 | ||
218 | }; | |
219 | ||
220 | 0 | listener.setContext( resource ); |
221 | 0 | AnchorParser.parseAnchors( content, listener ); |
222 | ||
223 | 0 | return list; |
224 | ||
225 | } | |
226 | ||
227 | public static String getSite( String resource ) { | |
228 | ||
229 | try { | |
230 | ||
231 | 0 | String site = new URL( resource ).getHost(); |
232 | 0 | return site.replaceAll( "http://www", "http://" ); |
233 | ||
234 | 0 | } catch ( MalformedURLException e ) { |
235 | 0 | return null; |
236 | } | |
237 | ||
238 | } | |
239 | ||
240 | } |