Main Page | Namespace List | Alphabetical List | Data Structures | Directories | File List | Data Fields | Globals | Related Pages

unicode.c

Go to the documentation of this file.
00001 /*!
00002  * @file unicode.c
00003  *
00004  * @brief Manipulate Unicode (@link #jchar jchar@endlink)[]
00005  * character strings.
00006  *
00007  * There are three character string types in this program:
00008  * null-terminated @link #rchar (rchar)@endlink strings
00009  * @e ala 'C' language, UTF-8
00010  * @link #CONSTANT_Utf8_info (CONSTANT_Utf8_info)@endlink strings,
00011  * and Unicode @link #jchar (jchar)[]@endlink strings.
00012  *
00013  * Unicode (@link #jchar jchar@endlink) character utilities
00014  * that do @e not involve UTF8.
00015  *
00016  * ALL references to type (@link #jchar jchar@endlink) involve
00017  * Unicode characters throughout all of the code.  Manipulations
00018  * of them should take place @e only through these utilities.
00019  *
00020  *
00021  * @section Control
00022  *
00023  * \$URL: https://svn.apache.org/path/name/unicode.c $ \$Id: unicode.c 0 09/28/2005 dlydick $
00024  *
00025  * Copyright 2005 The Apache Software Foundation
00026  * or its licensors, as applicable.
00027  *
00028  * Licensed under the Apache License, Version 2.0 ("the License");
00029  * you may not use this file except in compliance with the License.
00030  * You may obtain a copy of the License at
00031  *
00032  *     http://www.apache.org/licenses/LICENSE-2.0
00033  *
00034  * Unless required by applicable law or agreed to in writing,
00035  * software distributed under the License is distributed on an
00036  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
00037  * either express or implied.
00038  *
00039  * See the License for the specific language governing permissions
00040  * and limitations under the License.
00041  *
00042  * @version \$LastChangedRevision: 0 $
00043  *
00044  * @date \$LastChangedDate: 09/28/2005 $
00045  *
00046  * @author \$LastChangedBy: dlydick $
00047  *         Original code contributed by Daniel Lydick on 09/28/2005.
00048  *
00049  * @section Reference
00050  *
00051  */
00052 
00053 #include "arch.h"
00054 ARCH_COPYRIGHT_APACHE(unicode, c, "$URL: https://svn.apache.org/path/name/unicode.c $ $Id: unicode.c 0 09/28/2005 dlydick $");
00055 
00056 
00057 #include <string.h>
00058 
00059 #include "jvmcfg.h" 
00060 #include "cfmacros.h"
00061 #include "classfile.h"
00062 
00063 
00064 /*
00065  *
00066  * Convert Unicode buffer into UTF8 buffer.
00067  *
00068  *
00069  * @param[in]  inbfr   Unicode character string
00070  *
00071  * @param[in]  length  Number of bytes in @b inbfr
00072  *
00073  * @param[out] outbfr  UTF8 byte string
00074  *
00075  *
00076  * @returns  UTF8 structure containing length and character buffer (plus
00077  *           tag), but return in (cp_info_dup) for full proper word
00078  *           alignment. When done with the data, call HEAP_FREE_DATA()
00079  *           on it.
00080  *
00081  *    @c @b rc->bytes    UTF8 version of @b inbfr string in @b outbfr
00082  *
00083  *    @c @b rc->length   Number of UTF8 bytes in
00084  *                        @c @b rc->bytes.  This will
00085  *                        only be the same as input @b length
00086  *                        when ALL UTF8 characters are 7-bit
00087  *                        ASCII.  It will otherwise be less
00088  *                        than that.
00089  */
00090 
00091 cp_info_dup *unicode_cnv2utf(jchar *inbfr, jshort length)
00092 {
00093     jshort bytecnvcount = 0;
00094     jshort unicodecnvcount;
00095     jubyte *outbfr;
00096 
00097     /*
00098      * Make two passes through input string, one for UTF8 length (for
00099      * proper heap allocation size), one for the conversion.
00100      *
00101      * Pass 1: calculate result size for heap allocation.  This is
00102      *         merely a stripped-down edition of pass 2, only
00103      *         incrementing input buffer pointer and byte count.
00104      */
00105     for (unicodecnvcount = 0;
00106          unicodecnvcount < length;
00107          unicodecnvcount++)
00108     {
00109         /* Process one-byte UTF8 conversion */
00110         if ((UTF8_SINGLE_MIN <= *inbfr) &&
00111             (UTF8_SINGLE_MAX >= *inbfr))
00112         {
00113             /*
00114              * Calculate a narrowing conversion,
00115              * but 9 MS bits are all zeroes, so no value change.
00116              */
00117             inbfr++;
00118             bytecnvcount++;
00119         }
00120         else
00121         {
00122             /* Calculate two-byte UTF8 conversion */
00123             if (((UNICODE_DOUBLE_MIN <= *inbfr) &&
00124                  (UNICODE_DOUBLE_MAX >= *inbfr))
00125 
00126             /* Also handle special case of NUL as two-byte character. */
00127                 || (UNICODE_DOUBLE_NUL == *inbfr))
00128             {
00129                 outbfr++;
00130                 bytecnvcount++;
00131 
00132                 inbfr++;
00133                 bytecnvcount++;
00134             }
00135             else
00136             {
00137                 /*
00138                  * Calculate three-byte UTF8 conversion-- all remaining
00139                  * cases, UNICODE_TRIPLE_MIN to UNICODE_TRIPLE_MAX
00140                  */
00141                 bytecnvcount++;
00142 
00143                 bytecnvcount++;
00144 
00145                 inbfr++;
00146                 bytecnvcount++;
00147             }
00148         }
00149     } /* for () */
00150 
00151     /*
00152      * Allocate enough heap space for output string, but within the
00153      * context of the output result type.  The size calculation
00154      * replaces generic (cp_info) with specifc (CONSTANT_Utf8_info)
00155      * info, adjusting for the amount of string data to be stored
00156      * into the result.
00157      */
00158     cp_info_dup *rc = HEAP_GET_DATA(sizeof(cp_info_dup) -
00159                                         sizeof(cp_info) +
00160                                         sizeof(CONSTANT_Utf8_info) -
00161                                         sizeof(u1) +
00162                                         bytecnvcount,
00163                                     rfalse);
00164 
00165     /* Prepare output structure with everything but character cnv */
00166     CONSTANT_Utf8_info *pcpui = PTR_THIS_CP_Utf8(rc);
00167     pcpui->tag = CONSTANT_Utf8;
00168     pcpui->length = bytecnvcount;
00169     outbfr = (jubyte *) pcpui->bytes;
00170 
00171     /* Pass 2:  Perform conversion itself */
00172     bytecnvcount = 0;
00173 
00174     for (unicodecnvcount = 0;
00175          unicodecnvcount < length;
00176          unicodecnvcount++)
00177     {
00178         /* Process one-byte UTF8 conversion */
00179         if ((UTF8_SINGLE_MIN <= *inbfr) &&
00180             (UTF8_SINGLE_MAX >= *inbfr))
00181         {
00182             /*
00183              * Perform a narrowing conversion,
00184              * but 9 MS bits are all zeroes, so no value change.
00185              */
00186             *outbfr++ = UTF8_SINGLE_MASK0 & ((jbyte) (*inbfr++));
00187             bytecnvcount++;
00188         }
00189         else
00190         {
00191             /* Process two-byte UTF8 conversion */
00192             if (((UNICODE_DOUBLE_MIN <= *inbfr) &&
00193                  (UNICODE_DOUBLE_MAX >= *inbfr))
00194 
00195             /* Also handle special case of NUL as two-byte character. */
00196                 || (UNICODE_DOUBLE_NUL == *inbfr))
00197             {
00198                 *outbfr    = (*inbfr >> UTF8_DOUBLE_FIRST_SHIFT) &
00199                             UTF8_DOUBLE_FIRST_MASK0;
00200                 *outbfr++ |= UTF8_DOUBLE_FIRST_VAL;
00201                 bytecnvcount++;
00202 
00203                 *outbfr    = (*inbfr++) & UTF8_DOUBLE_SECOND_MASK0;
00204                 *outbfr++ |= UTF8_DOUBLE_SECOND_VAL;
00205                 bytecnvcount++;
00206             }
00207             else
00208             {
00209                 /*
00210                  * Process three-byte UTF8 conversion-- all remaining
00211                  * cases, UNICODE_TRIPLE_MIN to UNICODE_TRIPLE_MAX
00212                  */
00213                 *outbfr    = (*inbfr >> UTF8_TRIPLE_FIRST_SHIFT) &
00214                             UTF8_TRIPLE_FIRST_MASK0;
00215                 *outbfr++ |= UTF8_TRIPLE_FIRST_VAL;
00216                 bytecnvcount++;
00217 
00218                 *outbfr    = (*inbfr >> UTF8_TRIPLE_SECOND_SHIFT) &
00219                             UTF8_TRIPLE_SECOND_MASK0;
00220                 *outbfr++ |= UTF8_TRIPLE_SECOND_VAL;
00221                 bytecnvcount++;
00222 
00223                 *outbfr    = (*inbfr++) & UTF8_TRIPLE_THIRD_MASK0;
00224                 *outbfr++ |= UTF8_TRIPLE_THIRD_VAL;
00225                 bytecnvcount++;
00226             }
00227         }
00228     } /* for () */
00229 
00230     return(rc);
00231 
00232 } /* END of unicode_cnv2utf() */
00233 
00234 
00235 /*!
00236  * @brief Compare two Unicode strings of any length, @b s1 minus @b s2
00237  *
00238  *
00239  * @param  s1   First string to compare
00240  *
00241  * @param  l1   Length of first string
00242  *
00243  * @param  s2   Second string to compare
00244  *
00245  * @param  l2   Length of second string
00246  *
00247  *
00248  * @returns lexicographical difference of <b><code>s1 - s2</code></b>.
00249  *          Notice that the (jchar) data is unsigned, the (jshort)
00250  *          result is signed, due to the arithmetic nature of the
00251  *          calculation.
00252  *
00253  */
00254 jshort unicode_strcmp(jchar *s1, u2 l1, jchar *s2, u2 l2)
00255 {
00256     /* Compare shortest common run length */
00257     rint cmplen = (l1 < l2) ? l1 : l2;
00258 
00259     /* Perform Unicode strlen() function */
00260     rint i;
00261     jshort rc = 0;
00262 
00263     for (i = 0; i < cmplen; i++)
00264     {
00265         rc = s1[i] - s2[i];
00266         if (0 != rc)
00267         {
00268             break;
00269         }
00270     }
00271 
00272     /*
00273      * THIS LOGIC IS THE SAME AS FOR s1_s2_strncmp(), BUT
00274      * OPERATES ON (jchar) instead of (rchar).
00275      */
00276 
00277     /* Return from several permutations of strlen */
00278     if (l1 == l2)
00279     {
00280         return(rc);
00281     }
00282     else
00283     if (l1 > l2)
00284     {
00285         /*
00286          * If a difference existed, return it, else use
00287          * the last character of @b s1 as character minus
00288          * NUL byte (or zero), which equals character.
00289          */
00290         if (0 != rc)
00291         {
00292             return(rc);
00293         }
00294 
00295         /*
00296          * First character of @b s1 past length of @b s2 
00297          */
00298         return((jshort) s1[l2]);
00299     }
00300     else
00301     {
00302         /* If a difference existed, return it, else use end of @b s2 */
00303         /*
00304          * If a difference existed, return it, else use
00305          * the last character of @b s1 as NUL byte (or zero)
00306          * minus character, which equals negative of character.
00307          */
00308         if (0 != rc)
00309         {
00310             return(rc);
00311         }
00312 
00313         /* First character of @b s2 past length of @b s1 */
00314         return((jshort) (0 - s2[l1]));
00315     }
00316 } /* END of unicode_strcmp() */
00317 
00318 
00319 /* EOF */
00320 

Generated on Fri Sep 30 18:59:35 2005 by  doxygen 1.4.4