Source code

001/*
002 *  Licensed to the Apache Software Foundation (ASF) under one
003 *  or more contributor license agreements.  See the NOTICE file
004 *  distributed with this work for additional information
005 *  regarding copyright ownership.  The ASF licenses this file
006 *  to you under the Apache License, Version 2.0 (the
007 *  "License"); you may not use this file except in compliance
008 *  with the License.  You may obtain a copy of the License at
009 *  
010 *    http://www.apache.org/licenses/LICENSE-2.0
011 *  
012 *  Unless required by applicable law or agreed to in writing,
013 *  software distributed under the License is distributed on an
014 *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 *  KIND, either express or implied.  See the License for the
016 *  specific language governing permissions and limitations
017 *  under the License. 
018 *  
019 */
020package org.apache.directory.shared.util;
021
022
023import java.io.IOException;
024import java.io.ObjectInput;
025import java.io.ObjectOutput;
026
027/**
028 * Various unicode manipulation methods that are more efficient then chaining
029 * operations: all is done in the same buffer without creating a bunch of string
030 * objects.
031 * 
032 * @author <a href="mailto:dev@directory.apache.org">Apache Directory Project</a>
033 */
034public final class Unicode
035{
036    /**
037     * Count the number of bytes needed to return an Unicode char. This can be
038     * from 1 to 6.
039     *
040     * @param bytes The bytes to read
041     * @param pos Position to start counting. It must be a valid start of a
042     *            encoded char !
043     * @return The number of bytes to create a char, or -1 if the encoding is
044     *         wrong. TODO : Should stop after the third byte, as a char is only
045     *         2 bytes long.
046     */
047    public static int countBytesPerChar( byte[] bytes, int pos )
048    {
049        if ( bytes == null )
050        {
051            return -1;
052        }
053
054        if ( ( bytes[pos] & UnicodeConstants.UTF8_MULTI_BYTES_MASK ) == 0 )
055        {
056            return 1;
057        }
058        else if ( ( bytes[pos] & UnicodeConstants.UTF8_TWO_BYTES_MASK ) == UnicodeConstants.UTF8_TWO_BYTES )
059        {
060            return 2;
061        }
062        else if ( ( bytes[pos] & UnicodeConstants.UTF8_THREE_BYTES_MASK ) == UnicodeConstants.UTF8_THREE_BYTES )
063        {
064            return 3;
065        }
066        else if ( ( bytes[pos] & UnicodeConstants.UTF8_FOUR_BYTES_MASK ) == UnicodeConstants.UTF8_FOUR_BYTES )
067        {
068            return 4;
069        }
070        else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES )
071        {
072            return 5;
073        }
074        else if ( ( bytes[pos] & UnicodeConstants.UTF8_SIX_BYTES_MASK ) == UnicodeConstants.UTF8_SIX_BYTES )
075        {
076            return 6;
077        }
078        else
079        {
080            return -1;
081        }
082    }
083
084    /**
085     * Return the Unicode char which is coded in the bytes at position 0.
086     *
087     * @param bytes The byte[] represntation of an Unicode string.
088     * @return The first char found.
089     */
090    public static char bytesToChar( byte[] bytes )
091    {
092        return bytesToChar( bytes, 0 );
093    }
094
095    /**
096     * Return the Unicode char which is coded in the bytes at the given
097     * position.
098     *
099     * @param bytes The byte[] represntation of an Unicode string.
100     * @param pos The current position to start decoding the char
101     * @return The decoded char, or -1 if no char can be decoded TODO : Should
102     *         stop after the third byte, as a char is only 2 bytes long.
103     */
104    public static char bytesToChar( byte[] bytes, int pos )
105    {
106        if ( bytes == null )
107        {
108            return ( char ) -1;
109        }
110
111        if ( ( bytes[pos] & UnicodeConstants.UTF8_MULTI_BYTES_MASK ) == 0 )
112        {
113            return ( char ) bytes[pos];
114        }
115        else
116        {
117            if ( ( bytes[pos] & UnicodeConstants.UTF8_TWO_BYTES_MASK ) == UnicodeConstants.UTF8_TWO_BYTES )
118            {
119                // Two bytes char
120                return ( char ) ( ( ( bytes[pos] & 0x1C ) << 6 ) + // 110x-xxyy
121                                                                    // 10zz-zzzz
122                                                                    // ->
123                                                                    // 0000-0xxx
124                                                                    // 0000-0000
125                    ( ( bytes[pos] & 0x03 ) << 6 ) + // 110x-xxyy 10zz-zzzz
126                                                        // -> 0000-0000
127                                                        // yy00-0000
128                ( bytes[pos + 1] & 0x3F ) // 110x-xxyy 10zz-zzzz -> 0000-0000
129                                            // 00zz-zzzz
130                ); // -> 0000-0xxx yyzz-zzzz (07FF)
131            }
132            else if ( ( bytes[pos] & UnicodeConstants.UTF8_THREE_BYTES_MASK ) == UnicodeConstants.UTF8_THREE_BYTES )
133            {
134                // Three bytes char
135                return ( char ) (
136                // 1110-tttt 10xx-xxyy 10zz-zzzz -> tttt-0000-0000-0000
137                ( ( bytes[pos] & 0x0F ) << 12 )
138                    // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-xxxx-0000-0000
139                    + ( ( bytes[pos + 1] & 0x3C ) << 6 )
140                    // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-0000-yy00-0000
141                    + ( ( bytes[pos + 1] & 0x03 ) << 6 )
142                // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-0000-00zz-zzzz
143                + ( bytes[pos + 2] & 0x3F )
144                // -> tttt-xxxx yyzz-zzzz (FF FF)
145                );
146            }
147            else if ( ( bytes[pos] & UnicodeConstants.UTF8_FOUR_BYTES_MASK ) == UnicodeConstants.UTF8_FOUR_BYTES )
148            {
149                // Four bytes char
150                return ( char ) (
151                // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 000t-tt00
152                // 0000-0000 0000-0000
153                ( ( bytes[pos] & 0x07 ) << 18 )
154                    // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-00uu
155                    // 0000-0000 0000-0000
156                    + ( ( bytes[pos + 1] & 0x30 ) << 16 )
157                    // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000
158                    // vvvv-0000 0000-0000
159                    + ( ( bytes[pos + 1] & 0x0F ) << 12 )
160                    // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000
161                    // 0000-xxxx 0000-0000
162                    + ( ( bytes[pos + 2] & 0x3C ) << 6 )
163                    // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000
164                    // 0000-0000 yy00-0000
165                    + ( ( bytes[pos + 2] & 0x03 ) << 6 )
166                // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000
167                // 0000-0000 00zz-zzzz
168                + ( bytes[pos + 3] & 0x3F )
169                // -> 000t-ttuu vvvv-xxxx yyzz-zzzz (1FFFFF)
170                );
171            }
172            else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES )
173            {
174                // Five bytes char
175                return ( char ) (
176                // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
177                // 0000-00tt 0000-0000 0000-0000 0000-0000
178                ( ( bytes[pos] & 0x03 ) << 24 )
179                    // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
180                    // 0000-0000 uuuu-uu00 0000-0000 0000-0000
181                    + ( ( bytes[pos + 1] & 0x3F ) << 18 )
182                    // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
183                    // 0000-0000 0000-00vv 0000-0000 0000-0000
184                    + ( ( bytes[pos + 2] & 0x30 ) << 12 )
185                    // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
186                    // 0000-0000 0000-0000 wwww-0000 0000-0000
187                    + ( ( bytes[pos + 2] & 0x0F ) << 12 )
188                    // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
189                    // 0000-0000 0000-0000 0000-xxxx 0000-0000
190                    + ( ( bytes[pos + 3] & 0x3C ) << 6 )
191                    // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
192                    // 0000-0000 0000-0000 0000-0000 yy00-0000
193                    + ( ( bytes[pos + 3] & 0x03 ) << 6 )
194                // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
195                // 0000-0000 0000-0000 0000-0000 00zz-zzzz
196                + ( bytes[pos + 4] & 0x3F )
197                // -> 0000-00tt uuuu-uuvv wwww-xxxx yyzz-zzzz (03 FF FF FF)
198                );
199            }
200            else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES )
201            {
202                // Six bytes char
203                return ( char ) (
204                // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz
205                // ->
206                // 0s00-0000 0000-0000 0000-0000 0000-0000
207                ( ( bytes[pos] & 0x01 ) << 30 )
208                    // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz
209                    // ->
210                    // 00tt-tttt 0000-0000 0000-0000 0000-0000
211                    + ( ( bytes[pos + 1] & 0x3F ) << 24 )
212                    // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy
213                    // 10zz-zzzz ->
214                    // 0000-0000 uuuu-uu00 0000-0000 0000-0000
215                    + ( ( bytes[pos + 2] & 0x3F ) << 18 )
216                    // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy
217                    // 10zz-zzzz ->
218                    // 0000-0000 0000-00vv 0000-0000 0000-0000
219                    + ( ( bytes[pos + 3] & 0x30 ) << 12 )
220                    // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy
221                    // 10zz-zzzz ->
222                    // 0000-0000 0000-0000 wwww-0000 0000-0000
223                    + ( ( bytes[pos + 3] & 0x0F ) << 12 )
224                    // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy
225                    // 10zz-zzzz ->
226                    // 0000-0000 0000-0000 0000-xxxx 0000-0000
227                    + ( ( bytes[pos + 4] & 0x3C ) << 6 )
228                    // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy
229                    // 10zz-zzzz ->
230                    // 0000-0000 0000-0000 0000-0000 yy00-0000
231                    + ( ( bytes[pos + 4] & 0x03 ) << 6 )
232                // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz
233                // ->
234                // 0000-0000 0000-0000 0000-0000 00zz-zzzz
235                + ( bytes[pos + 5] & 0x3F )
236                // -> 0stt-tttt uuuu-uuvv wwww-xxxx yyzz-zzzz (7F FF FF FF)
237                );
238            }
239            else
240            {
241                return ( char ) -1;
242            }
243        }
244    }
245
246    /**
247     * Return the number of bytes that hold an Unicode char.
248     *
249     * @param car The character to be decoded
250     * @return The number of bytes to hold the char. TODO : Should stop after
251     *         the third byte, as a char is only 2 bytes long.
252     */
253    public static int countNbBytesPerChar( char car )
254    {
255        if ( ( car & UnicodeConstants.CHAR_ONE_BYTE_MASK ) == 0 )
256        {
257            return 1;
258        }
259        else if ( ( car & UnicodeConstants.CHAR_TWO_BYTES_MASK ) == 0 )
260        {
261            return 2;
262        }
263        else if ( ( car & UnicodeConstants.CHAR_THREE_BYTES_MASK ) == 0 )
264        {
265            return 3;
266        }
267        else if ( ( car & UnicodeConstants.CHAR_FOUR_BYTES_MASK ) == 0 )
268        {
269            return 4;
270        }
271        else if ( ( car & UnicodeConstants.CHAR_FIVE_BYTES_MASK ) == 0 )
272        {
273            return 5;
274        }
275        else if ( ( car & UnicodeConstants.CHAR_SIX_BYTES_MASK ) == 0 )
276        {
277            return 6;
278        }
279        else
280        {
281            return -1;
282        }
283    }
284
285    /**
286     * Count the number of bytes included in the given char[].
287     *
288     * @param chars The char array to decode
289     * @return The number of bytes in the char array
290     */
291    public static int countBytes( char[] chars )
292    {
293        if ( chars == null )
294        {
295            return 0;
296        }
297
298        int nbBytes = 0;
299        int currentPos = 0;
300
301        while ( currentPos < chars.length )
302        {
303            int nbb = countNbBytesPerChar( chars[currentPos] );
304
305            // If the number of bytes necessary to encode a character is
306            // above 3, we will need two UTF-16 chars
307            currentPos += ( nbb < 4 ? 1 : 2 );
308            nbBytes += nbb;
309        }
310
311        return nbBytes;
312    }
313
314    /**
315     * Count the number of chars included in the given byte[].
316     *
317     * @param bytes The byte array to decode
318     * @return The number of char in the byte array
319     */
320    public static int countChars( byte[] bytes )
321    {
322        if ( bytes == null )
323        {
324            return 0;
325        }
326
327        int nbChars = 0;
328        int currentPos = 0;
329
330        while ( currentPos < bytes.length )
331        {
332            currentPos += countBytesPerChar(bytes, currentPos);
333            nbChars++;
334        }
335
336        return nbChars;
337    }
338
339    /**
340     * Return the Unicode char which is coded in the bytes at the given
341     * position.
342     *
343     * @param car The character to be transformed to an array of bytes
344     *
345     * @return The byte array representing the char
346     *
347     * TODO : Should stop after the third byte, as a char is only 2 bytes long.
348     */
349    public static byte[] charToBytes( char car )
350    {
351        byte[] bytes = new byte[countNbBytesPerChar(car)];
352
353        if ( car <= 0x7F )
354        {
355            // Single byte char
356            bytes[0] = ( byte ) car;
357            return bytes;
358        }
359        else if ( car <= 0x7FF )
360        {
361            // two bytes char
362            bytes[0] = ( byte ) ( 0x00C0 + ( ( car & 0x07C0 ) >> 6 ) );
363            bytes[1] = ( byte ) ( 0x0080 + ( car & 0x3F ) );
364        }
365        else
366        {
367            // Three bytes char
368            bytes[0] = ( byte ) ( 0x00E0 + ( ( car & 0xF000 ) >> 12 ) );
369            bytes[1] = ( byte ) ( 0x0080 + ( ( car & 0x0FC0 ) >> 6 ) );
370            bytes[2] = ( byte ) ( 0x0080 + ( car & 0x3F ) );
371        }
372
373        return bytes;
374    }
375
376    /**
377     * Check if the current char is in the unicodeSubset : all chars but
378     * '\0', '(', ')', '*' and '\'
379     *
380     * @param str The string to check
381     * @param pos Position of the current char
382     * @return True if the current char is in the unicode subset
383     */
384    public static boolean isUnicodeSubset( String str, int pos )
385    {
386        if ( ( str == null ) || ( str.length() <= pos ) || ( pos < 0 ) )
387        {
388            return false;
389        }
390
391        char c = str.charAt( pos );
392
393        return ( ( c > 127 ) || UnicodeConstants.UNICODE_SUBSET[c] );
394    }
395
396    /**
397     * Check if the current char is in the unicodeSubset : all chars but
398     * '\0', '(', ')', '*' and '\'
399     *
400     * @param c The char to check
401     * @return True if the current char is in the unicode subset
402     */
403    public static boolean isUnicodeSubset( char c )
404    {
405        return ( ( c > 127 ) || UnicodeConstants.UNICODE_SUBSET[c] );
406    }
407
408    /**
409     *
410     * Writes four bytes of length information to the output stream, followed by the modified UTF-8 representation
411     * of every character in the string str. If str is null, the string value 'null' is written with a length of 0
412     * instead of throwing an NullPointerException. Each character in the string s  is converted to a group of one,
413     * two, or three bytes, depending on the value of the character.
414     *
415     * Due to given restrictions (total number of written bytes in a row can't exceed 65535) the total length is
416     * written in the length information (four bytes (writeInt)) and the string is split into smaller parts
417     * if necessary and written. As each character may be converted to a group of maximum 3 bytes and 65535 bytes
418     * can be written at maximum we're on the save side when writing a chunk of only 21845 (65535/3) characters at
419     * once.
420     *
421     * See also {@link java.io.DataOutput#writeUTF(String)}.
422     *
423     * @param objectOutput The objectOutput to write to
424     * @param str The value to write
425     * @throws java.io.IOException If the value can't be written to the file
426     */
427    public static void writeUTF( ObjectOutput objectOutput, String str ) throws IOException
428    {
429        // Write a 'null' string
430        if ( str == null )
431        {
432            objectOutput.writeInt( 0 );
433            objectOutput.writeUTF( "null" );
434        }
435        else
436        {
437            // Write length of string
438            objectOutput.writeInt( str.length() );
439
440            StringBuffer strBuf = new StringBuffer( str );
441
442            // Write the string in portions not larger than 21845 characters
443            while ( strBuf != null )
444            {
445                if ( strBuf.length() < 21845 )
446                {
447                    objectOutput.writeUTF( strBuf.substring( 0, strBuf.length() ) );
448                    strBuf = null;
449                }
450                else
451                {
452                    objectOutput.writeUTF( strBuf.substring( 0, 21845 ) );
453                    strBuf.delete( 0, 21845 );
454                }
455            }
456        }
457    }
458
459    /**
460     *
461     * Reads in a string that has been encoded using a modified UTF-8  format. The general contract of readUTF  is
462     * that it reads a representation of a Unicode character string encoded in modified UTF-8 format; this string of
463     * characters is then returned as a String.
464     *
465     * First, four bytes are read (readInt) and used to construct an unsigned 16-bit integer in exactly the manner
466     * of the readUnsignedShort  method . This integer value is called the UTF length and specifies the number of
467     * additional bytes to be read. These bytes are then converted to characters by considering them in groups. The
468     * length of each group is computed from the value of the first byte of the group. The byte following a group, if
469     * any, is the first byte of the next group.
470     *
471     *See also {@link java.io.DataInput#readUTF()}.
472     *
473     * @param objectInput The objectInput to read from
474     * @return The read string
475     * @throws java.io.IOException If the value can't be read
476     */
477    public static String readUTF( ObjectInput objectInput ) throws IOException
478    {
479        StringBuffer strBuf = null;
480
481        // Read length of the string
482        int strLength = objectInput.readInt();
483
484        // Start reading the string
485        strBuf = new StringBuffer( objectInput.readUTF() );
486
487        if ( strLength == 0 && strBuf.toString().equals( "null" ) )
488        {
489            // The special case of a 'null' string
490            return null;
491        }
492        else
493        {
494            while ( strLength > strBuf.length() )
495            {
496                strBuf.append( objectInput.readUTF() );
497            }
498            return strBuf.toString();
499        }
500    }
501}