Coverage Report

Coverage Report - org.apache.commons.feedparser.FeedParserImpl

Classes in this File

Line Coverage

Branch Coverage

Complexity

FeedParserImpl

0/53

0/14

4.8

 /*
  * Copyright 1999,2004 The Apache Software Foundation.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.commons.feedparser;
 
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 
 import org.apache.commons.feedparser.tools.XMLCleanser;
 import org.apache.commons.feedparser.tools.XMLEncodingParser;
 import org.apache.log4j.Logger;
 import org.jdom.input.SAXBuilder;
 
 /**
  * This FeedParser implementation is based on JDOM and Jaxen and is based around
  * XPath and JDOM iteration.  While the implementation is straight forward it
  * has not been optimized for performance.  A SAX based parser would certainly
  * be less memory intensive but with the downside of being harder to develop.
  *
  * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
  * @version $Id: FeedParserImpl.java 373614 2006-01-30 22:31:21Z mvdb $
  */
 public class FeedParserImpl implements FeedParser {
 
     private static Logger log = Logger.getLogger(FeedParserImpl.class);
 
     /**
      * Parse this feed.
      *
      * @param resource The URL of the feed being parsed.  This is optional and
      *                 may be null but is used when an exception is thrown to aid debugging.
      */
     public void parse(FeedParserListener listener,
                       InputStream is,
                       String resource) throws FeedParserException {
 
         try {
 
             // Need to massage our XML support for UTF-8 to prevent the dreaded
             // "Invalid byte 1 of 1-byte UTF-8 sequence" content bug in some
             // default feeds.  This was tested a great deal under NewsMonster
             // and I'm happy with the results.  Within FeedParser 2.0 we will be
             // using SAX2 so this won't be as big of a problem.  In FeedParser
             // 2.0 (or as soon as we use SAX) this code should be totally
             // removed to use the original stream.
 
             is = getCorrectInputStream( is );
 
             //OK.  Now we have the right InputStream so we should build our DOM
             //and exec.
             SAXBuilder builder = new SAXBuilder();
 
             //NOTE: in b10 of JDOM this won't accept an InputStream and requires
             //a org.w3c.dom.Document so we'll have to build one here.  Will this
             //slow things down any?
 
             org.jdom.Document doc = builder.build( is );
 
             parse(listener, doc);
 
         } catch (FeedParserException fpe) {
             //if an explicit FeedParserException is thrown just rethrow it..
             throw fpe;
         } catch (Throwable t) {
 
             //FIXME: when this is a JDOM or XML parser Exception we should
             //detect when we're working with an XHTML or HTML file and then
             //parse it with an XFN/XOXO event listener.
 
             throw new FeedParserException(t);
         }
 
     }
 
     /**
      * Perform the Xerces UTF8 correction and FeedFilter.
      */
     private InputStream getCorrectInputStream(InputStream is)
             throws Exception {
 
         byte[] bytes = toByteArray(is);
 
         //FIXME: if we return the WRONG content type here we will break.
         //getBytes()... UTF-16 and UTF-32 especially.  We should also perform
         //HTTP Content-Type parsing here to preserve the content type.  This can
         //be fixed by integrating our networking API from NewsMonster.
 
         String encoding = XMLEncodingParser.parse(bytes);
 
         if (encoding == null)
             encoding = "UTF-8";
 
         if ( encoding.startsWith( "UTF" ) ) {
 
             String result = XMLCleanser.cleanse( bytes, encoding );
             bytes = FeedFilter.parse( result, encoding );
 
         } else {
 
             bytes = FeedFilter.parse(bytes, encoding);
 
         }
 
         //remove prefix whitespace, intern HTML entities, etc.
 
         //build an input stream from the our bytes for parsing...
         is = new ByteArrayInputStream( bytes );
 
         return is;
 
     }
 
     /**
      * @deprecated Use #parse( FeedParserException, InputStream, String )
      */
     public void parse(FeedParserListener listener,
                       InputStream is) throws FeedParserException {
 
         parse(listener, is, null);
 
     }
 
     /**
      * Parse this feed.
      */
     public void parse(FeedParserListener listener,
                       org.jdom.Document doc) throws FeedParserException {
 
         try {
 
             String root = doc.getRootElement().getName();
 
             //Handle OPML
             if ("opml".equals(root)) {
                 OPMLFeedParser.parse(listener, doc);
                 return;
             }
 
             //Handle changes.xml
             if ("weblogUpdates".equals(root)) {
                 ChangesFeedParser.parse(listener, doc);
                 return;
             }
 
             //Handle ATOM
             if ( "feed".equals( root ) ) {
                 AtomFeedParser.parse(listener, doc);
                 return;
             }
 
             //Handle FOAF
             if (doc.getRootElement().getChildren("Person", NS.FOAF).size() > 0) {
                 FOAFFeedParser.parse(listener, doc);
                 return;
             }
 
             //FIXME: if this is XHTML we need to handle this with either an XFN
             //or an XOXO directory parser.  There might be more metadata we need
             //to parse here.  (also I wonder if this could be a chance to do
             //autodiscovery).
 
             //fall back on RDF and RSS parsing.
 
             //FIXME: if this is an UNKNOWN format We need to throw an
             //UnsupportedFeedxception (which extends FeedParserException)
             //
             // In this situation the ROOT elements should be: rss or RDF
 
             RSSFeedParser.parse(listener, doc);
 
         } catch (FeedParserException fpe) {
             //if an explicit FeedParserException is thrown just rethrow it..
             throw fpe;
         } catch (Throwable t) {
             throw new FeedParserException(t);
         }
 
     }
 
     /**
      * Convert an InputStream to a byte array.
      */
     public byte[] toByteArray(InputStream is) throws IOException {
 
         //WARNING:
         ByteArrayOutputStream bos = new ByteArrayOutputStream();
 
         //now process the Reader...
         byte data[] = new byte[200];
 
         int readCount = 0;
 
         while ((readCount = is.read(data)) > 0) {
 
             bos.write(data, 0, readCount);
         }
 
         is.close();
         bos.close();
 
         return bos.toByteArray();
 
     }
 
 }
 

1		/*
2		* Copyright 1999,2004 The Apache Software Foundation.
3		*
4		* Licensed under the Apache License, Version 2.0 (the "License");
5		* you may not use this file except in compliance with the License.
6		* You may obtain a copy of the License at
7		*
8		* http://www.apache.org/licenses/LICENSE-2.0
9		*
10		* Unless required by applicable law or agreed to in writing, software
11		* distributed under the License is distributed on an "AS IS" BASIS,
12		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13		* See the License for the specific language governing permissions and
14		* limitations under the License.
15		*/
16
17		package org.apache.commons.feedparser;
18
19		import java.io.ByteArrayInputStream;
20		import java.io.ByteArrayOutputStream;
21		import java.io.IOException;
22		import java.io.InputStream;
23
24		import org.apache.commons.feedparser.tools.XMLCleanser;
25		import org.apache.commons.feedparser.tools.XMLEncodingParser;
26		import org.apache.log4j.Logger;
27		import org.jdom.input.SAXBuilder;
28
29		/**
30		* This FeedParser implementation is based on JDOM and Jaxen and is based around
31		* XPath and JDOM iteration. While the implementation is straight forward it
32		* has not been optimized for performance. A SAX based parser would certainly
33		* be less memory intensive but with the downside of being harder to develop.
34		*
35		* @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
36		* @version $Id: FeedParserImpl.java 373614 2006-01-30 22:31:21Z mvdb $
37		*/
38	0	public class FeedParserImpl implements FeedParser {
39
40	0	private static Logger log = Logger.getLogger(FeedParserImpl.class);
41
42		/**
43		* Parse this feed.
44		*
45		* @param resource The URL of the feed being parsed. This is optional and
46		* may be null but is used when an exception is thrown to aid debugging.
47		*/
48		public void parse(FeedParserListener listener,
49		InputStream is,
50		String resource) throws FeedParserException {
51
52		try {
53
54		// Need to massage our XML support for UTF-8 to prevent the dreaded
55		// "Invalid byte 1 of 1-byte UTF-8 sequence" content bug in some
56		// default feeds. This was tested a great deal under NewsMonster
57		// and I'm happy with the results. Within FeedParser 2.0 we will be
58		// using SAX2 so this won't be as big of a problem. In FeedParser
59		// 2.0 (or as soon as we use SAX) this code should be totally
60		// removed to use the original stream.
61
62	0	is = getCorrectInputStream( is );
63
64		//OK. Now we have the right InputStream so we should build our DOM
65		//and exec.
66	0	SAXBuilder builder = new SAXBuilder();
67
68		//NOTE: in b10 of JDOM this won't accept an InputStream and requires
69		//a org.w3c.dom.Document so we'll have to build one here. Will this
70		//slow things down any?
71
72	0	org.jdom.Document doc = builder.build( is );
73
74	0	parse(listener, doc);
75
76	0	} catch (FeedParserException fpe) {
77		//if an explicit FeedParserException is thrown just rethrow it..
78	0	throw fpe;
79	0	} catch (Throwable t) {
80
81		//FIXME: when this is a JDOM or XML parser Exception we should
82		//detect when we're working with an XHTML or HTML file and then
83		//parse it with an XFN/XOXO event listener.
84
85	0	throw new FeedParserException(t);
86	0	}
87
88	0	}
89
90		/**
91		* Perform the Xerces UTF8 correction and FeedFilter.
92		*/
93		private InputStream getCorrectInputStream(InputStream is)
94		throws Exception {
95
96	0	byte[] bytes = toByteArray(is);
97
98		//FIXME: if we return the WRONG content type here we will break.
99		//getBytes()... UTF-16 and UTF-32 especially. We should also perform
100		//HTTP Content-Type parsing here to preserve the content type. This can
101		//be fixed by integrating our networking API from NewsMonster.
102
103	0	String encoding = XMLEncodingParser.parse(bytes);
104
105	0	if (encoding == null)
106	0	encoding = "UTF-8";
107
108	0	if ( encoding.startsWith( "UTF" ) ) {
109
110	0	String result = XMLCleanser.cleanse( bytes, encoding );
111	0	bytes = FeedFilter.parse( result, encoding );
112
113	0	} else {
114
115	0	bytes = FeedFilter.parse(bytes, encoding);
116
117		}
118
119		//remove prefix whitespace, intern HTML entities, etc.
120
121		//build an input stream from our bytes for parsing...
122	0	is = new ByteArrayInputStream( bytes );
123
124	0	return is;
125
126		}
127
128		/**
129		* @deprecated Use #parse( FeedParserListener, InputStream, String )
130		*/
131		public void parse(FeedParserListener listener,
132		InputStream is) throws FeedParserException {
133
134	0	parse(listener, is, null);
135
136	0	}
137
138		/**
139		* Parse this feed.
140		*/
141		public void parse(FeedParserListener listener,
142		org.jdom.Document doc) throws FeedParserException {
143
144		try {
145
146	0	String root = doc.getRootElement().getName();
147
148		//Handle OPML
149	0	if ("opml".equals(root)) {
150	0	OPMLFeedParser.parse(listener, doc);
151	0	return;
152		}
153
154		//Handle changes.xml
155	0	if ("weblogUpdates".equals(root)) {
156	0	ChangesFeedParser.parse(listener, doc);
157	0	return;
158		}
159
160		//Handle ATOM
161	0	if ( "feed".equals( root ) ) {
162	0	AtomFeedParser.parse(listener, doc);
163	0	return;
164		}
165
166		//Handle FOAF
167	0	if (doc.getRootElement().getChildren("Person", NS.FOAF).size() > 0) {
168	0	FOAFFeedParser.parse(listener, doc);
169	0	return;
170		}
171
172		//FIXME: if this is XHTML we need to handle this with either an XFN
173		//or an XOXO directory parser. There might be more metadata we need
174		//to parse here. (also I wonder if this could be a chance to do
175		//autodiscovery).
176
177		//fall back on RDF and RSS parsing.
178
179		//FIXME: if this is an UNKNOWN format We need to throw an
180		//UnsupportedFeedException (which extends FeedParserException)
181		//
182		// In this situation the ROOT elements should be: rss or RDF
183
184	0	RSSFeedParser.parse(listener, doc);
185
186	0	} catch (FeedParserException fpe) {
187		//if an explicit FeedParserException is thrown just rethrow it..
188	0	throw fpe;
189	0	} catch (Throwable t) {
190	0	throw new FeedParserException(t);
191	0	}
192
193	0	}
194
195		/**
196		* Convert an InputStream to a byte array.
197		*/
198		public byte[] toByteArray(InputStream is) throws IOException {
199
200		//WARNING:
201	0	ByteArrayOutputStream bos = new ByteArrayOutputStream();
202
203		//now process the Reader...
204	0	byte data[] = new byte[200];
205
206	0	int readCount = 0;
207
208	0	while ((readCount = is.read(data)) > 0) {
209
210	0	bos.write(data, 0, readCount);
211		}
212
213	0	is.close();
214	0	bos.close();
215
216	0	return bos.toByteArray();
217
218		}
219
220		}
221