Coverage Report - org.apache.commons.feedparser.FeedParserImpl
 
Classes in this File Line Coverage Branch Coverage Complexity
FeedParserImpl
0%
0/53
0%
0/14
4.8
 
 1  
 /*
 2  
  * Copyright 1999,2004 The Apache Software Foundation.
 3  
  *
 4  
  * Licensed under the Apache License, Version 2.0 (the "License");
 5  
  * you may not use this file except in compliance with the License.
 6  
  * You may obtain a copy of the License at
 7  
  *
 8  
  *      http://www.apache.org/licenses/LICENSE-2.0
 9  
  *
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS,
 12  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13  
  * See the License for the specific language governing permissions and
 14  
  * limitations under the License.
 15  
  */
 16  
 
 17  
 package org.apache.commons.feedparser;
 18  
 
 19  
 import java.io.ByteArrayInputStream;
 20  
 import java.io.ByteArrayOutputStream;
 21  
 import java.io.IOException;
 22  
 import java.io.InputStream;
 23  
 
 24  
 import org.apache.commons.feedparser.tools.XMLCleanser;
 25  
 import org.apache.commons.feedparser.tools.XMLEncodingParser;
 26  
 import org.apache.log4j.Logger;
 27  
 import org.jdom.input.SAXBuilder;
 28  
 
 29  
 /**
 30  
  * This FeedParser implementation is based on JDOM and Jaxen and is based around
 31  
  * XPath and JDOM iteration.  While the implementation is straightforward it
 32  
  * has not been optimized for performance.  A SAX based parser would certainly
 33  
  * be less memory intensive but with the downside of being harder to develop.
 34  
  *
 35  
  * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
 36  
  * @version $Id: FeedParserImpl.java 373614 2006-01-30 22:31:21Z mvdb $
 37  
  */
 38  0
 public class FeedParserImpl implements FeedParser {
 39  
 
 40  0
     private static Logger log = Logger.getLogger(FeedParserImpl.class);
 41  
 
 42  
     /**
 43  
      * Parse this feed.
 44  
      *
 45  
      * @param resource The URL of the feed being parsed.  This is optional and
 46  
      *                 may be null but is used when an exception is thrown to aid debugging.
 47  
      */
 48  
     public void parse(FeedParserListener listener,
 49  
                       InputStream is,
 50  
                       String resource) throws FeedParserException {
 51  
 
 52  
         try {
 53  
 
 54  
             // Need to massage our XML support for UTF-8 to prevent the dreaded
 55  
             // "Invalid byte 1 of 1-byte UTF-8 sequence" content bug in some
 56  
             // default feeds.  This was tested a great deal under NewsMonster
 57  
             // and I'm happy with the results.  Within FeedParser 2.0 we will be
 58  
             // using SAX2 so this won't be as big of a problem.  In FeedParser
 59  
             // 2.0 (or as soon as we use SAX) this code should be totally
 60  
             // removed to use the original stream.
 61  
 
 62  0
             is = getCorrectInputStream( is );
 63  
 
 64  
             //OK.  Now we have the right InputStream so we should build our DOM
 65  
             //and exec.
 66  0
             SAXBuilder builder = new SAXBuilder();
 67  
 
 68  
             //NOTE: in b10 of JDOM this won't accept an InputStream and requires
 69  
             //a org.w3c.dom.Document so we'll have to build one here.  Will this
 70  
             //slow things down any?
 71  
 
 72  0
             org.jdom.Document doc = builder.build( is );
 73  
 
 74  0
             parse(listener, doc);
 75  
 
 76  0
         } catch (FeedParserException fpe) {
 77  
             //if an explicit FeedParserException is thrown just rethrow it..
 78  0
             throw fpe;
 79  0
         } catch (Throwable t) {
 80  
 
 81  
             //FIXME: when this is a JDOM or XML parser Exception we should
 82  
             //detect when we're working with an XHTML or HTML file and then
 83  
             //parse it with an XFN/XOXO event listener.
 84  
 
 85  0
             throw new FeedParserException(t);
 86  0
         }
 87  
 
 88  0
     }
 89  
 
 90  
     /**
 91  
      * Perform the Xerces UTF8 correction and FeedFilter.
 92  
      */
 93  
     private InputStream getCorrectInputStream(InputStream is)
 94  
             throws Exception {
 95  
 
 96  0
         byte[] bytes = toByteArray(is);
 97  
 
 98  
         //FIXME: if we return the WRONG content type here we will break.
 99  
         //getBytes()... UTF-16 and UTF-32 especially.  We should also perform
 100  
         //HTTP Content-Type parsing here to preserve the content type.  This can
 101  
         //be fixed by integrating our networking API from NewsMonster.
 102  
 
 103  0
         String encoding = XMLEncodingParser.parse(bytes);
 104  
 
 105  0
         if (encoding == null)
 106  0
             encoding = "UTF-8";
 107  
 
 108  0
         if ( encoding.startsWith( "UTF" ) ) {
 109  
 
 110  0
             String result = XMLCleanser.cleanse( bytes, encoding );
 111  0
             bytes = FeedFilter.parse( result, encoding );
 112  
 
 113  0
         } else {
 114  
 
 115  0
             bytes = FeedFilter.parse(bytes, encoding);
 116  
 
 117  
         }
 118  
 
 119  
         //remove prefix whitespace, intern HTML entities, etc.
 120  
 
 121  
         //build an input stream from the our bytes for parsing...
 122  0
         is = new ByteArrayInputStream( bytes );
 123  
 
 124  0
         return is;
 125  
 
 126  
     }
 127  
 
 128  
     /**
 129  
      * @deprecated Use #parse( FeedParserException, InputStream, String )
 130  
      */
 131  
     public void parse(FeedParserListener listener,
 132  
                       InputStream is) throws FeedParserException {
 133  
 
 134  0
         parse(listener, is, null);
 135  
 
 136  0
     }
 137  
 
 138  
     /**
 139  
      * Parse this feed.
 140  
      */
 141  
     public void parse(FeedParserListener listener,
 142  
                       org.jdom.Document doc) throws FeedParserException {
 143  
 
 144  
         try {
 145  
 
 146  0
             String root = doc.getRootElement().getName();
 147  
 
 148  
             //Handle OPML
 149  0
             if ("opml".equals(root)) {
 150  0
                 OPMLFeedParser.parse(listener, doc);
 151  0
                 return;
 152  
             }
 153  
 
 154  
             //Handle changes.xml
 155  0
             if ("weblogUpdates".equals(root)) {
 156  0
                 ChangesFeedParser.parse(listener, doc);
 157  0
                 return;
 158  
             }
 159  
 
 160  
             //Handle ATOM
 161  0
             if ( "feed".equals( root ) ) {
 162  0
                 AtomFeedParser.parse(listener, doc);
 163  0
                 return;
 164  
             }
 165  
 
 166  
             //Handle FOAF
 167  0
             if (doc.getRootElement().getChildren("Person", NS.FOAF).size() > 0) {
 168  0
                 FOAFFeedParser.parse(listener, doc);
 169  0
                 return;
 170  
             }
 171  
 
 172  
             //FIXME: if this is XHTML we need to handle this with either an XFN
 173  
             //or an XOXO directory parser.  There might be more metadata we need
 174  
             //to parse here.  (also I wonder if this could be a chance to do
 175  
             //autodiscovery).
 176  
 
 177  
             //fall back on RDF and RSS parsing.
 178  
 
 179  
             //FIXME: if this is an UNKNOWN format We need to throw an
 180  
             //UnsupportedFeedxception (which extends FeedParserException)
 181  
             //
 182  
             // In this situation the ROOT elements should be: rss or RDF
 183  
 
 184  0
             RSSFeedParser.parse(listener, doc);
 185  
 
 186  0
         } catch (FeedParserException fpe) {
 187  
             //if an explicit FeedParserException is thrown just rethrow it..
 188  0
             throw fpe;
 189  0
         } catch (Throwable t) {
 190  0
             throw new FeedParserException(t);
 191  0
         }
 192  
 
 193  0
     }
 194  
 
 195  
     /**
 196  
      * Convert an InputStream to a byte array.
 197  
      */
 198  
     public byte[] toByteArray(InputStream is) throws IOException {
 199  
 
 200  
         //WARNING:
 201  0
         ByteArrayOutputStream bos = new ByteArrayOutputStream();
 202  
 
 203  
         //now process the Reader...
 204  0
         byte data[] = new byte[200];
 205  
 
 206  0
         int readCount = 0;
 207  
 
 208  0
         while ((readCount = is.read(data)) > 0) {
 209  
 
 210  0
             bos.write(data, 0, readCount);
 211  
         }
 212  
 
 213  0
         is.close();
 214  0
         bos.close();
 215  
 
 216  0
         return bos.toByteArray();
 217  
 
 218  
     }
 219  
 
 220  
 }
 221