00001 /*
00002 * The Apache Software License, Version 1.1
00003 *
00004 * Copyright (c) 1999-2000 The Apache Software Foundation. All rights
00005 * reserved.
00006 *
00007 * Redistribution and use in source and binary forms, with or without
00008 * modification, are permitted provided that the following conditions
00009 * are met:
00010 *
00011 * 1. Redistributions of source code must retain the above copyright
00012 * notice, this list of conditions and the following disclaimer.
00013 *
00014 * 2. Redistributions in binary form must reproduce the above copyright
00015 * notice, this list of conditions and the following disclaimer in
00016 * the documentation and/or other materials provided with the
00017 * distribution.
00018 *
00019 * 3. The end-user documentation included with the redistribution,
00020 * if any, must include the following acknowledgment:
00021 * "This product includes software developed by the
00022 * Apache Software Foundation (http://www.apache.org/)."
00023 * Alternately, this acknowledgment may appear in the software itself,
00024 * if and wherever such third-party acknowledgments normally appear.
00025 *
00026 * 4. The names "Xerces" and "Apache Software Foundation" must
00027 * not be used to endorse or promote products derived from this
00028 * software without prior written permission. For written
00029 * permission, please contact apache\@apache.org.
00030 *
00031 * 5. Products derived from this software may not be called "Apache",
00032 * nor may "Apache" appear in their name, without prior written
00033 * permission of the Apache Software Foundation.
00034 *
00035 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
00036 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
00037 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
00038 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
00039 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00040 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
00041 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
00042 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
00043 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
00044 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
00045 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
00046 * SUCH DAMAGE.
00047 * ====================================================================
00048 *
00049 * This software consists of voluntary contributions made by many
00050 * individuals on behalf of the Apache Software Foundation, and was
00051 * originally based on software copyright (c) 1999, International
00052 * Business Machines, Inc., http://www.ibm.com . For more information
00053 * on the Apache Software Foundation, please see
00054 * <http://www.apache.org/>.
00055 */
00056
00057 /*
00058 * $Log: XMLRecognizer.hpp,v $
00059 * Revision 1.6 2000/02/24 20:00:23 abagchi
00060 * Swat for removing Log from API docs
00061 *
00062 * Revision 1.5 2000/02/15 01:21:31 roddey
00063 * Some initial documentation improvements. More to come...
00064 *
00065 * Revision 1.4 2000/02/06 07:47:48 rahulj
00066 * Year 2K copyright swat.
00067 *
00068 * Revision 1.3 1999/12/18 00:19:03 roddey
00069 * More changes to support the new, completely orthogonal, support for
00070 * intrinsic encodings.
00071 *
00072 * Revision 1.2 1999/11/23 01:49:27 rahulj
00073 * Cannot use class qualifier in class defn. CC under HPUX is happy.
00074 *
00075 * Revision 1.1.1.1 1999/11/09 01:08:37 twl
00076 * Initial checkin
00077 *
00078 * Revision 1.2 1999/11/08 20:44:40 rahul
00079 * Swat for adding in Product name and CVS comment log variable.
00080 *
00081 */
00082
00083 #if !defined(XMLRECOGNIZER_HPP)
00084 #define XMLRECOGNIZER_HPP
00085
00094 class XMLRecognizer
00095 {
00096 public :
00097 // -----------------------------------------------------------------------
00098 // Class types
00099 //
00100 // This enum represents the various encoding families that we have to
00101 // deal with individually at the scanner level. This does not indicate
00102 // the exact encoding, just the rough family that would let us scan
00103 // the XML/TextDecl to find the encoding string.
00104 //
00105 // The 'L's and 'B's stand for little or big endian. We conditionally
00106 // create versions that will automatically map to the local UTF-16 and
00107 // UCS-4 endian modes.
00108 //
00109 // OtherEncoding means that its some transcoder based encoding, i.e. not
00110 // one of the ones that we do internally. Its a special case and should
00111 // never be used directly outside of the reader.
00112 //
00113 // NOTE: Keep this in sync with the name map array in the Cpp file!!
00114 // -----------------------------------------------------------------------
00115 enum Encodings
00116 {
00117 EBCDIC = 0
00118 , UCS_4B = 1
00119 , UCS_4L = 2
00120 , US_ASCII = 3
00121 , UTF_8 = 4
00122 , UTF_16B = 5
00123 , UTF_16L = 6
00124
00125 , Encodings_Count
00126 , Encodings_Min = EBCDIC
00127 , Encodings_Max = UTF_16L
00128
00129 , OtherEncoding = 999
00130
00131 #if defined(ENDIANMODE_BIG)
00132 , Def_UTF16 = UTF_16B
00133 , Def_UCS4 = UCS_4B
00134 #else
00135 , Def_UTF16 = UTF_16L
00136 , Def_UCS4 = UCS_4L
00137 #endif
00138 };
00139
00140
00141 // -----------------------------------------------------------------------
00142 // Public, const static data
00143 //
00144 // These are the byte sequences for each of the encodings that we can
00145 // auto sense, and their lengths.
00146 // -----------------------------------------------------------------------
00147 static const char fgASCIIPre[];
00148 static const unsigned int fgASCIIPreLen;
00149 static const XMLByte fgEBCDICPre[];
00150 static const unsigned int fgEBCDICPreLen;
00151 static const XMLByte fgUTF16BPre[];
00152 static const XMLByte fgUTF16LPre[];
00153 static const unsigned int fgUTF16PreLen;
00154 static const XMLByte fgUCS4BPre[];
00155 static const XMLByte fgUCS4LPre[];
00156 static const unsigned int fgUCS4PreLen;
00157
00158
00159 // -----------------------------------------------------------------------
00160 // Encoding recognition methods
00161 // -----------------------------------------------------------------------
00162 static Encodings basicEncodingProbe
00163 (
00164 const XMLByte* const rawBuffer
00165 , const unsigned int rawByteCount
00166 );
00167
00168 static Encodings encodingForName
00169 (
00170 const XMLCh* const theEncName
00171 );
00172
00173 static const XMLCh* nameForEncoding(const Encodings theEncoding);
00174
00175
00176 private :
00177 // -----------------------------------------------------------------------
00178 // Unimplemented constructors, operators, and destructor
00179 //
00180 // This class is effectively being used as a namespace for some static
00181 // methods.
00182 // -----------------------------------------------------------------------
00183 XMLRecognizer();
00184 ~XMLRecognizer();
00185 void operator=(const XMLRecognizer&);
00186 };
00187
00188 #endif