Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
FeedParserImpl |
|
| 4.8;4.8 |
1 | /* | |
2 | * Copyright 1999,2004 The Apache Software Foundation. | |
3 | * | |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | * you may not use this file except in compliance with the License. | |
6 | * You may obtain a copy of the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | package org.apache.commons.feedparser; | |
18 | ||
19 | import java.io.ByteArrayInputStream; | |
20 | import java.io.ByteArrayOutputStream; | |
21 | import java.io.IOException; | |
22 | import java.io.InputStream; | |
23 | ||
24 | import org.apache.commons.feedparser.tools.XMLCleanser; | |
25 | import org.apache.commons.feedparser.tools.XMLEncodingParser; | |
26 | import org.apache.log4j.Logger; | |
27 | import org.jdom.input.SAXBuilder; | |
28 | ||
29 | /** | |
30 | * This FeedParser implementation is based on JDOM and Jaxen and is based around | |
31 | * XPath and JDOM iteration. While the implementation is straight forward it | |
32 | * has not been optimized for performance. A SAX based parser would certainly | |
33 | * be less memory intensive but with the downside of being harder to develop. | |
34 | * | |
35 | * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a> | |
36 | * @version $Id: FeedParserImpl.java 373614 2006-01-30 22:31:21Z mvdb $ | |
37 | */ | |
38 | 0 | public class FeedParserImpl implements FeedParser { |
39 | ||
40 | 0 | private static Logger log = Logger.getLogger(FeedParserImpl.class); |
41 | ||
42 | /** | |
43 | * Parse this feed. | |
44 | * | |
45 | * @param resource The URL of the feed being parsed. This is optional and | |
46 | * may be null but is used when an exception is thrown to aid debugging. | |
47 | */ | |
48 | public void parse(FeedParserListener listener, | |
49 | InputStream is, | |
50 | String resource) throws FeedParserException { | |
51 | ||
52 | try { | |
53 | ||
54 | // Need to massage our XML support for UTF-8 to prevent the dreaded | |
55 | // "Invalid byte 1 of 1-byte UTF-8 sequence" content bug in some | |
56 | // default feeds. This was tested a great deal under NewsMonster | |
57 | // and I'm happy with the results. Within FeedParser 2.0 we will be | |
58 | // using SAX2 so this won't be as big of a problem. In FeedParser | |
59 | // 2.0 (or as soon as we use SAX) this code should be totally | |
60 | // removed to use the original stream. | |
61 | ||
62 | 0 | is = getCorrectInputStream( is ); |
63 | ||
64 | //OK. Now we have the right InputStream so we should build our DOM | |
65 | //and exec. | |
66 | 0 | SAXBuilder builder = new SAXBuilder(); |
67 | ||
68 | //NOTE: in b10 of JDOM this won't accept an InputStream and requires | |
69 | //a org.w3c.dom.Document so we'll have to build one here. Will this | |
70 | //slow things down any? | |
71 | ||
72 | 0 | org.jdom.Document doc = builder.build( is ); |
73 | ||
74 | 0 | parse(listener, doc); |
75 | ||
76 | 0 | } catch (FeedParserException fpe) { |
77 | //if an explicit FeedParserException is thrown just rethrow it.. | |
78 | 0 | throw fpe; |
79 | 0 | } catch (Throwable t) { |
80 | ||
81 | //FIXME: when this is a JDOM or XML parser Exception we should | |
82 | //detect when we're working with an XHTML or HTML file and then | |
83 | //parse it with an XFN/XOXO event listener. | |
84 | ||
85 | 0 | throw new FeedParserException(t); |
86 | 0 | } |
87 | ||
88 | 0 | } |
89 | ||
90 | /** | |
91 | * Perform the Xerces UTF8 correction and FeedFilter. | |
92 | */ | |
93 | private InputStream getCorrectInputStream(InputStream is) | |
94 | throws Exception { | |
95 | ||
96 | 0 | byte[] bytes = toByteArray(is); |
97 | ||
98 | //FIXME: if we return the WRONG content type here we will break. | |
99 | //getBytes()... UTF-16 and UTF-32 especially. We should also perform | |
100 | //HTTP Content-Type parsing here to preserve the content type. This can | |
101 | //be fixed by integrating our networking API from NewsMonster. | |
102 | ||
103 | 0 | String encoding = XMLEncodingParser.parse(bytes); |
104 | ||
105 | 0 | if (encoding == null) |
106 | 0 | encoding = "UTF-8"; |
107 | ||
108 | 0 | if ( encoding.startsWith( "UTF" ) ) { |
109 | ||
110 | 0 | String result = XMLCleanser.cleanse( bytes, encoding ); |
111 | 0 | bytes = FeedFilter.parse( result, encoding ); |
112 | ||
113 | 0 | } else { |
114 | ||
115 | 0 | bytes = FeedFilter.parse(bytes, encoding); |
116 | ||
117 | } | |
118 | ||
119 | //remove prefix whitespace, intern HTML entities, etc. | |
120 | ||
121 | //build an input stream from the our bytes for parsing... | |
122 | 0 | is = new ByteArrayInputStream( bytes ); |
123 | ||
124 | 0 | return is; |
125 | ||
126 | } | |
127 | ||
128 | /** | |
129 | * @deprecated Use #parse( FeedParserException, InputStream, String ) | |
130 | */ | |
131 | public void parse(FeedParserListener listener, | |
132 | InputStream is) throws FeedParserException { | |
133 | ||
134 | 0 | parse(listener, is, null); |
135 | ||
136 | 0 | } |
137 | ||
138 | /** | |
139 | * Parse this feed. | |
140 | */ | |
141 | public void parse(FeedParserListener listener, | |
142 | org.jdom.Document doc) throws FeedParserException { | |
143 | ||
144 | try { | |
145 | ||
146 | 0 | String root = doc.getRootElement().getName(); |
147 | ||
148 | //Handle OPML | |
149 | 0 | if ("opml".equals(root)) { |
150 | 0 | OPMLFeedParser.parse(listener, doc); |
151 | 0 | return; |
152 | } | |
153 | ||
154 | //Handle changes.xml | |
155 | 0 | if ("weblogUpdates".equals(root)) { |
156 | 0 | ChangesFeedParser.parse(listener, doc); |
157 | 0 | return; |
158 | } | |
159 | ||
160 | //Handle ATOM | |
161 | 0 | if ( "feed".equals( root ) ) { |
162 | 0 | AtomFeedParser.parse(listener, doc); |
163 | 0 | return; |
164 | } | |
165 | ||
166 | //Handle FOAF | |
167 | 0 | if (doc.getRootElement().getChildren("Person", NS.FOAF).size() > 0) { |
168 | 0 | FOAFFeedParser.parse(listener, doc); |
169 | 0 | return; |
170 | } | |
171 | ||
172 | //FIXME: if this is XHTML we need to handle this with either an XFN | |
173 | //or an XOXO directory parser. There might be more metadata we need | |
174 | //to parse here. (also I wonder if this could be a chance to do | |
175 | //autodiscovery). | |
176 | ||
177 | //fall back on RDF and RSS parsing. | |
178 | ||
179 | //FIXME: if this is an UNKNOWN format We need to throw an | |
180 | //UnsupportedFeedxception (which extends FeedParserException) | |
181 | // | |
182 | // In this situation the ROOT elements should be: rss or RDF | |
183 | ||
184 | 0 | RSSFeedParser.parse(listener, doc); |
185 | ||
186 | 0 | } catch (FeedParserException fpe) { |
187 | //if an explicit FeedParserException is thrown just rethrow it.. | |
188 | 0 | throw fpe; |
189 | 0 | } catch (Throwable t) { |
190 | 0 | throw new FeedParserException(t); |
191 | 0 | } |
192 | ||
193 | 0 | } |
194 | ||
195 | /** | |
196 | * Convert an InputStream to a byte array. | |
197 | */ | |
198 | public byte[] toByteArray(InputStream is) throws IOException { | |
199 | ||
200 | //WARNING: | |
201 | 0 | ByteArrayOutputStream bos = new ByteArrayOutputStream(); |
202 | ||
203 | //now process the Reader... | |
204 | 0 | byte data[] = new byte[200]; |
205 | ||
206 | 0 | int readCount = 0; |
207 | ||
208 | 0 | while ((readCount = is.read(data)) > 0) { |
209 | ||
210 | 0 | bos.write(data, 0, readCount); |
211 | } | |
212 | ||
213 | 0 | is.close(); |
214 | 0 | bos.close(); |
215 | ||
216 | 0 | return bos.toByteArray(); |
217 | ||
218 | } | |
219 | ||
220 | } | |
221 |