Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
DiscoveryLocator |
|
| 5.0;5 |
1 | /* | |
2 | * Copyright 1999,2004 The Apache Software Foundation. | |
3 | * | |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | * you may not use this file except in compliance with the License. | |
6 | * You may obtain a copy of the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | package org.apache.commons.feedparser.locate; | |
18 | ||
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.feedparser.FeedList;
import org.apache.log4j.Logger;
27 | ||
28 | /** | |
29 | * | |
30 | * http://www.ietf.org/internet-drafts/draft-ietf-atompub-autodiscovery-00.txt | |
31 | * | |
32 | * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a> | |
33 | */ | |
34 | 0 | public class DiscoveryLocator { |
35 | ||
36 | 0 | private static Logger log = Logger.getLogger( DiscoveryLocator.class ); |
37 | ||
38 | /** | |
39 | * Get a FULL link within the content. We then pull the attributes out of | |
40 | * this. | |
41 | */ | |
42 | 0 | static Pattern element_pattern = |
43 | Pattern.compile( "<link[^>]+", | |
44 | Pattern.CASE_INSENSITIVE ); | |
45 | ||
46 | /** | |
47 | * Regex to match on attributes. | |
48 | * | |
49 | * Implementation: Mon Mar 14 2005 01:59 PM (burton@rojo.com): this is a | |
50 | * pretty difficult regexp to grok. | |
51 | * | |
52 | * There's are two regexps here. One for attributes with quotes and one | |
53 | * without. Each regexp has two groups - 1 is the name and 2 is the value. | |
54 | * You can split the regexp on | to better understand each individual | |
55 | * regexp. | |
56 | */ | |
57 | ||
58 | // > Attribute values MUST be one of the following: enclosed in double | |
59 | // > quotes, enclosed in single quotes, or not enclosed in quotes at all. | |
60 | // | |
61 | // | |
62 | 0 | static String ATTR_REGEXP = "([a-zA-Z]+)=[\"']([^\"']+)[\"']|([a-zA-Z]+)=([^\"'>\r\n\t ]+)"; |
63 | ||
64 | 0 | static Pattern ATTR_PATTERN = Pattern.compile( ATTR_REGEXP, |
65 | Pattern.CASE_INSENSITIVE ); | |
66 | ||
67 | 0 | static HashSet mediatypes = new HashSet(); |
68 | ||
69 | static { | |
70 | ||
71 | 0 | mediatypes.add( FeedReference.ATOM_MEDIA_TYPE ); |
72 | 0 | mediatypes.add( FeedReference.RSS_MEDIA_TYPE ); |
73 | 0 | mediatypes.add( FeedReference.XML_MEDIA_TYPE ); |
74 | ||
75 | 0 | } |
76 | ||
77 | /** | |
78 | * Locate a feed via RSS/Atom auto-discovery. If both Atom and RSS are | |
79 | * listed we return both. Actually we return all Atom/RSS or XML feeds | |
80 | * including FOAF. It's up to the caller to use the correct feed. | |
81 | * | |
82 | * | |
83 | */ | |
84 | public static final List locate( String resource, | |
85 | String content, | |
86 | FeedList list ) | |
87 | throws Exception { | |
88 | ||
89 | //this mechanism is easier but it isn't efficient. I should just parse | |
90 | //elements forward until I discover </head>. Also note that this isn't | |
91 | //doing all feed URLs just the first ones it finds. | |
92 | ||
93 | 0 | Matcher m = element_pattern.matcher( content ); |
94 | ||
95 | 0 | while( m.find() ) { |
96 | //the value of the link element XML... example: | |
97 | ||
98 | // <link rel="alternate" | |
99 | // href="http://www.codinginparadise.org/weblog/atom.xml" | |
100 | // type="application/atom+xml" | |
101 | // title="ATOM" /> | |
102 | ||
103 | 0 | String element = m.group( 0 ); |
104 | ||
105 | 0 | HashMap attributes = getAttributes( element ); |
106 | ||
107 | 0 | String type = (String)attributes.get( "type" ); |
108 | 0 | if (type != null) |
109 | 0 | type = type.toLowerCase(); |
110 | ||
111 | 0 | if ( mediatypes.contains( type ) ) { |
112 | ||
113 | //expand the href | |
114 | 0 | String href = (String)attributes.get( "href" ); |
115 | 0 | log.debug("href="+href); |
116 | ||
117 | // http://xml.coverpages.org/draft-ietf-atompub-autodiscovery-00.txt | |
118 | ||
119 | // > The href attribute MUST be present in an Atom autodiscovery element, | |
120 | // > and its value MUST be the URI [RFC2396] of an Atom feed. The value | |
121 | // > MAY be a relative URI, and if so, clients MUST resolve it to a full | |
122 | // > URI (section 5 of [RFC2396]) using the document's base URI (section | |
123 | // > 12.4 of HTML 4 [W3C.REC-html401-19991224]). | |
124 | ||
125 | 0 | href = ResourceExpander.expand( resource, href ); |
126 | ||
127 | 0 | FeedReference feedReference = new FeedReference( href, type ); |
128 | ||
129 | 0 | feedReference.title = (String)attributes.get( "title" ); |
130 | ||
131 | 0 | list.add( feedReference ); |
132 | ||
133 | 0 | if ( type.equals( FeedReference.ATOM_MEDIA_TYPE ) ) |
134 | 0 | list.setFirstAdAtomFeed( feedReference ); |
135 | ||
136 | 0 | if ( type.equals( FeedReference.RSS_MEDIA_TYPE ) ) |
137 | 0 | list.setFirstAdRSSFeed( feedReference ); |
138 | ||
139 | } | |
140 | ||
141 | 0 | } |
142 | ||
143 | 0 | return list; |
144 | ||
145 | } | |
146 | ||
147 | /** | |
148 | * Parse attributes within elements into a hashmap. | |
149 | * | |
150 | * | |
151 | */ | |
152 | public static HashMap getAttributes( String content ) { | |
153 | ||
154 | 0 | HashMap map = new HashMap(); |
155 | ||
156 | 0 | Matcher m = ATTR_PATTERN.matcher( content ); |
157 | ||
158 | 0 | int index = 0; |
159 | ||
160 | 0 | while ( m.find( index ) ) { |
161 | ||
162 | 0 | String name = m.group( 1 ); |
163 | 0 | String value = null; |
164 | ||
165 | //Since we use an OR regexp the first match will be 1/2 and the | |
166 | //second will be 3/4 | |
167 | 0 | if ( name != null ) { |
168 | 0 | value = m.group( 2 ); |
169 | } else { | |
170 | 0 | name = m.group( 3 ); |
171 | 0 | value = m.group( 4 ); |
172 | } | |
173 | ||
174 | //String value = m.group( 2 ).toLowerCase().trim(); | |
175 | 0 | name = name.toLowerCase().trim(); |
176 | // Some services, such as AOL LiveJournal, are case sensitive | |
177 | // on their resource names; can't do a toLowerCase. | |
178 | // Brad Neuberg, bkn3@columbia.edu | |
179 | // String value = m.group( 2 ).toLowerCase().trim(); | |
180 | 0 | value = value.trim(); |
181 | ||
182 | 0 | if ( "".equals( value ) ) |
183 | 0 | value = null; |
184 | ||
185 | 0 | map.put( name, value ); |
186 | ||
187 | 0 | index = m.end(); |
188 | ||
189 | 0 | } |
190 | ||
191 | 0 | return map; |
192 | ||
193 | } | |
194 | ||
195 | } |