00001 /*! 00002 * @file unicode.c 00003 * 00004 * @brief Manipulate Unicode (@link #jchar jchar@endlink)[] 00005 * character strings. 00006 * 00007 * There are three character string types in this program: 00008 * null-terminated @link #rchar (rchar)@endlink strings 00009 * @e ala 'C' language, UTF-8 00010 * @link #CONSTANT_Utf8_info (CONSTANT_Utf8_info)@endlink strings, 00011 * and Unicode @link #jchar (jchar)[]@endlink strings. 00012 * 00013 * Unicode (@link #jchar jchar@endlink) character utilities 00014 * that do @e not involve UTF8. 00015 * 00016 * ALL referenced to type (@link #jchar jchar@endlink) involve 00017 * Unicode characters throughout all of the code. Manipulations 00018 * of them should take place @e only through these utilities. 00019 * 00020 * 00021 * @section Control 00022 * 00023 * \$URL: https://svn.apache.org/path/name/unicode.c $ \$Id: unicode.c 0 09/28/2005 dlydick $ 00024 * 00025 * Copyright 2005 The Apache Software Foundation 00026 * or its licensors, as applicable. 00027 * 00028 * Licensed under the Apache License, Version 2.0 ("the License"); 00029 * you may not use this file except in compliance with the License. 00030 * You may obtain a copy of the License at 00031 * 00032 * http://www.apache.org/licenses/LICENSE-2.0 00033 * 00034 * Unless required by applicable law or agreed to in writing, 00035 * software distributed under the License is distributed on an 00036 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 00037 * either express or implied. 00038 * 00039 * See the License for the specific language governing permissions 00040 * and limitations under the License. 00041 * 00042 * @version \$LastChangedRevision: 0 $ 00043 * 00044 * @date \$LastChangedDate: 09/28/2005 $ 00045 * 00046 * @author \$LastChangedBy: dlydick $ 00047 * Original code contributed by Daniel Lydick on 09/28/2005. 00048 * 00049 * @section Reference 00050 * 00051 */ 00052 00053 #include "arch.h" 00054 ARCH_COPYRIGHT_APACHE(unicode, c, "$URL: https://svn.apache.org/path/name/unicode.c $ $Id: unicode.c 0 09/28/2005 dlydick $"); 00055 00056 00057 #include <string.h> 00058 00059 #include "jvmcfg.h" 00060 #include "cfmacros.h" 00061 #include "classfile.h" 00062 00063 00064 /* 00065 * 00066 * Convert Unicode buffer into UTF8 buffer. 00067 * 00068 * 00069 * @param[in] inbfr Unicode character string 00070 * 00071 * @param[in] length Number of bytes in @b inbfr 00072 * 00073 * @param[out] outbfr UTF8 byte string 00074 * 00075 * 00076 * @returns UTF8 structure containing length and character buffer (plus 00077 * tag), but return in (cp_info_dup) for full proper word 00078 * alignment. When done with the data, call HEAP_FREE_DATA() 00079 * on it. 00080 * 00081 * @c @b rc->bytes UTF8 version of @b inbfr string in @b outbfr 00082 * 00083 * @c @b rc->length Number of UTF8 bytes in 00084 * @c @b rc->bytes. This will 00085 * only be the same as input @b length 00086 * when ALL UTF8 characters are 7-bit 00087 * ASCII. It will otherwise be less 00088 * than that. 00089 */ 00090 00091 cp_info_dup *unicode_cnv2utf(jchar *inbfr, jshort length) 00092 { 00093 jshort bytecnvcount = 0; 00094 jshort unicodecnvcount; 00095 jubyte *outbfr; 00096 00097 /* 00098 * Make two passes through input string, one for UTF8 length (for 00099 * proper heap allocation size), one for the conversion. 00100 * 00101 * Pass 1: calculate result size for heap allocation. This is 00102 * merely a stripped-down edition of pass 2, only 00103 * incrementing input buffer pointer and byte count. 00104 */ 00105 for (unicodecnvcount = 0; 00106 unicodecnvcount < length; 00107 unicodecnvcount++) 00108 { 00109 /* Process one-byte UTF8 conversion */ 00110 if ((UTF8_SINGLE_MIN <= *inbfr) && 00111 (UTF8_SINGLE_MAX >= *inbfr)) 00112 { 00113 /* 00114 * Calculate a narrowing conversion, 00115 * but 9 MS bits are all zeroes, so no value change. 00116 */ 00117 inbfr++; 00118 bytecnvcount++; 00119 } 00120 else 00121 { 00122 /* Calculate two-byte UTF8 conversion */ 00123 if (((UNICODE_DOUBLE_MIN <= *inbfr) && 00124 (UNICODE_DOUBLE_MAX >= *inbfr)) 00125 00126 /* Also handle special case of NUL as two-byte character. */ 00127 || (UNICODE_DOUBLE_NUL == *inbfr)) 00128 { 00129 outbfr++; 00130 bytecnvcount++; 00131 00132 inbfr++; 00133 bytecnvcount++; 00134 } 00135 else 00136 { 00137 /* 00138 * Calculate three-byte UTF8 conversion-- all remaining 00139 * cases, UNICODE_TRIPLE_MIN to UNICODE_TRIPLE_MAX 00140 */ 00141 bytecnvcount++; 00142 00143 bytecnvcount++; 00144 00145 inbfr++; 00146 bytecnvcount++; 00147 } 00148 } 00149 } /* for () */ 00150 00151 /* 00152 * Allocate enough heap space for output string, but within the 00153 * context of the output result type. The size calculation 00154 * replaces generic (cp_info) with specifc (CONSTANT_Utf8_info) 00155 * info, adjusting for the amount of string data to be stored 00156 * into the result. 00157 */ 00158 cp_info_dup *rc = HEAP_GET_DATA(sizeof(cp_info_dup) - 00159 sizeof(cp_info) + 00160 sizeof(CONSTANT_Utf8_info) - 00161 sizeof(u1) + 00162 bytecnvcount, 00163 rfalse); 00164 00165 /* Prepare output structure with everything but character cnv */ 00166 CONSTANT_Utf8_info *pcpui = PTR_THIS_CP_Utf8(rc); 00167 pcpui->tag = CONSTANT_Utf8; 00168 pcpui->length = bytecnvcount; 00169 outbfr = (jubyte *) pcpui->bytes; 00170 00171 /* Pass 2: Perform conversion itself */ 00172 bytecnvcount = 0; 00173 00174 for (unicodecnvcount = 0; 00175 unicodecnvcount < length; 00176 unicodecnvcount++) 00177 { 00178 /* Process one-byte UTF8 conversion */ 00179 if ((UTF8_SINGLE_MIN <= *inbfr) && 00180 (UTF8_SINGLE_MAX >= *inbfr)) 00181 { 00182 /* 00183 * Perform a narrowing conversion, 00184 * but 9 MS bits are all zeroes, so no value change. 00185 */ 00186 *outbfr++ = UTF8_SINGLE_MASK0 & ((jbyte) (*inbfr++)); 00187 bytecnvcount++; 00188 } 00189 else 00190 { 00191 /* Process two-byte UTF8 conversion */ 00192 if (((UNICODE_DOUBLE_MIN <= *inbfr) && 00193 (UNICODE_DOUBLE_MAX >= *inbfr)) 00194 00195 /* Also handle special case of NUL as two-byte character. */ 00196 || (UNICODE_DOUBLE_NUL == *inbfr)) 00197 { 00198 *outbfr = (*inbfr >> UTF8_DOUBLE_FIRST_SHIFT) & 00199 UTF8_DOUBLE_FIRST_MASK0; 00200 *outbfr++ |= UTF8_DOUBLE_FIRST_VAL; 00201 bytecnvcount++; 00202 00203 *outbfr = (*inbfr++) & UTF8_DOUBLE_SECOND_MASK0; 00204 *outbfr++ |= UTF8_DOUBLE_SECOND_VAL; 00205 bytecnvcount++; 00206 } 00207 else 00208 { 00209 /* 00210 * Process three-byte UTF8 conversion-- all remaining 00211 * cases, UNICODE_TRIPLE_MIN to UNICODE_TRIPLE_MAX 00212 */ 00213 *outbfr = (*inbfr >> UTF8_TRIPLE_FIRST_SHIFT) & 00214 UTF8_TRIPLE_FIRST_MASK0; 00215 *outbfr++ |= UTF8_TRIPLE_FIRST_VAL; 00216 bytecnvcount++; 00217 00218 *outbfr = (*inbfr >> UTF8_TRIPLE_SECOND_SHIFT) & 00219 UTF8_TRIPLE_SECOND_MASK0; 00220 *outbfr++ |= UTF8_TRIPLE_SECOND_VAL; 00221 bytecnvcount++; 00222 00223 *outbfr = (*inbfr++) & UTF8_TRIPLE_THIRD_MASK0; 00224 *outbfr++ |= UTF8_TRIPLE_THIRD_VAL; 00225 bytecnvcount++; 00226 } 00227 } 00228 } /* for () */ 00229 00230 return(rc); 00231 00232 } /* END of unicode_cnv2utf() */ 00233 00234 00235 /*! 00236 * @brief Compare two Unicode strings of any length, @b s1 minus @b s2 00237 * 00238 * 00239 * @param s1 First string to compare 00240 * 00241 * @param l1 Length of first string 00242 * 00243 * @param s2 Second string to compare 00244 * 00245 * @param l2 Length of second string 00246 * 00247 * 00248 * @returns lexicographical difference of <b><code>s1 - s2</code></b>. 00249 * Notice that the (jchar) data is unsigned, the (jshort) 00250 * result is signed, due to the arithmetic nature of the 00251 * calculation. 00252 * 00253 */ 00254 jshort unicode_strcmp(jchar *s1, u2 l1, jchar *s2, u2 l2) 00255 { 00256 /* Compare shortest common run length */ 00257 rint cmplen = (l1 < l2) ? l1 : l2; 00258 00259 /* Perform Unicode strlen() function */ 00260 rint i; 00261 jshort rc = 0; 00262 00263 for (i = 0; i < cmplen; i++) 00264 { 00265 rc = s1[i] - s2[i]; 00266 if (0 != rc) 00267 { 00268 break; 00269 } 00270 } 00271 00272 /* 00273 * THIS LOGIC IS THE SAME AS FOR s1_s2_strncmp(), BUT 00274 * OPERATES ON (jchar) instead of (rchar). 00275 */ 00276 00277 /* Return from several permutations of strlen */ 00278 if (l1 == l2) 00279 { 00280 return(rc); 00281 } 00282 else 00283 if (l1 > l2) 00284 { 00285 /* 00286 * If a difference existed, return it, else use 00287 * the last character of @b s1 as character minus 00288 * NUL byte (or zero), which equals character. 00289 */ 00290 if (0 != rc) 00291 { 00292 return(rc); 00293 } 00294 00295 /* 00296 * First character of @b s1 past length of @b s2 00297 */ 00298 return((jshort) s1[l2]); 00299 } 00300 else 00301 { 00302 /* If a difference existed, return it, else use end of @b s2 */ 00303 /* 00304 * If a difference existed, return it, else use 00305 * the last character of @b s1 as NUL byte (or zero) 00306 * minus character, which equals negative of character. 00307 */ 00308 if (0 != rc) 00309 { 00310 return(rc); 00311 } 00312 00313 /* First character of @b s2 past length of @b s1 */ 00314 return((jshort) (0 - s2[l1])); 00315 } 00316 } /* END of unicode_strcmp() */ 00317 00318 00319 /* EOF */ 00320