Coverage Report - org.apache.commons.feedparser.tools.XMLEncodingParser
 
Classes in this File Line Coverage Branch Coverage Complexity
XMLEncodingParser
0%
0/38
0%
0/40
7.667
 
 1  
 /*
 2  
  * Copyright 1999,2004 The Apache Software Foundation.
 3  
  * 
 4  
  * Licensed under the Apache License, Version 2.0 (the "License");
 5  
  * you may not use this file except in compliance with the License.
 6  
  * You may obtain a copy of the License at
 7  
  * 
 8  
  *      http://www.apache.org/licenses/LICENSE-2.0
 9  
  * 
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS,
 12  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13  
  * See the License for the specific language governing permissions and
 14  
  * limitations under the License.
 15  
  */
 16  
 
 17  
 package org.apache.commons.feedparser.tools;
 18  
 
 19  
 
 20  
 /**
 21  
  *
 22  
  * Given an XML document pull out the encoding or the default (UTF-8) if not
 23  
  * specified.
 24  
  *
 25  
  * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
 26  
  */
 27  0
 public class XMLEncodingParser {
 28  
 
 29  
     public static final String ENCODING = "encoding=\"";
 30  
     
 31  
     /**
 32  
      *
 33  
      * 
 34  
      */
 35  
     public static String parse( byte[] content ) throws Exception {
 36  
 
 37  
         //this isn't really pretty but it is fast.
 38  
 
 39  
         //just use the first 100 bytes
 40  
 
 41  
         String str;
 42  
 
 43  0
         if ( content.length > 100 ) {
 44  0
             str = new String( content, 0, 100 );
 45  
         } else {
 46  0
             str = new String( content );
 47  
         }
 48  
 
 49  0
         String result = getEncodingFromBOM( content );
 50  
 
 51  0
         if ( result != null )
 52  0
             return result;
 53  
         
 54  0
         int end = str.indexOf( ">" );
 55  
 
 56  0
         if ( end == -1 )
 57  0
             return "UTF-8";
 58  
 
 59  0
         String decl = str.substring( 0, end );
 60  
 
 61  0
         int index = decl.indexOf( ENCODING );
 62  
         
 63  0
         if ( index != -1 ) {
 64  
 
 65  0
             String encoding = decl.substring( index + ENCODING.length(),
 66  
                                               decl.length() );
 67  
 
 68  0
             end = encoding.indexOf( "\"" );
 69  
             
 70  0
             if ( end == -1 )
 71  0
                 return "UTF-8";
 72  
 
 73  0
             encoding = encoding.substring( 0, end);
 74  0
             encoding = encoding.toUpperCase();
 75  
 
 76  0
             if ( "UTF8".equals( encoding ) )
 77  0
                 encoding = "UTF-8";
 78  
             
 79  0
             return encoding;
 80  
             
 81  
         }
 82  
 
 83  0
         return "UTF-8";
 84  
 
 85  
     }
 86  
 
 87  
     private static String getEncodingFromBOM( byte[] content ) {
 88  
 
 89  
         // Technically speaking if we see a BOM is specified we're supposed to
 90  
         // return UTF-16 or UTF-32 but because we only care about anything UTF
 91  
         // returning UTF-8 is incorrect but acceptable.
 92  
         //
 93  
         // http://www.unicode.org/faq/utf_bom.html#BOM
 94  
 
 95  0
         if ( content.length > 2 ) {
 96  
 
 97  
             //perform UTF-16 tests
 98  0
             if ( content[0] == -1 &&
 99  
                  content[1] == -2 ) 
 100  0
                 return "UTF-16";
 101  
 
 102  0
             if ( content[0] == -2 &&
 103  
                  content[1] == -1 ) 
 104  0
                 return "UTF-16";
 105  
 
 106  
         }
 107  
 
 108  0
         if ( content.length > 4 ) {
 109  
 
 110  
             //perform UTF-16 tests
 111  0
             if ( content[0] == 0 &&
 112  
                  content[1] == 0 &&
 113  
                  content[2] == -2 &&
 114  
                  content[3] == -1 ) 
 115  0
                 return "UTF-32";
 116  
 
 117  0
             if ( content[0] == -1 &&
 118  
                  content[1] == -2 &&
 119  
                  content[2] == 0 &&
 120  
                  content[3] == 0 ) 
 121  0
                 return "UTF-32";
 122  
 
 123  
         }
 124  
 
 125  0
         return null;
 126  
         
 127  
     }
 128  
     
 129  
     public static void main( String[] args ) throws Exception {
 130  
 
 131  0
         System.out.println( parse( "<?xml encoding=\"utf-8\"?>".getBytes() ) );
 132  0
         System.out.println( parse( "<?xml encoding=\"UTF-8\"?>".getBytes() ) );
 133  0
         System.out.println( parse( "<?xml encoding=\"utf8\"?>".getBytes() ) );
 134  
 
 135  0
     }
 136  
 
 137  
 }