Source code

001/*
002 *  Licensed to the Apache Software Foundation (ASF) under one
003 *  or more contributor license agreements.  See the NOTICE file
004 *  distributed with this work for additional information
005 *  regarding copyright ownership.  The ASF licenses this file
006 *  to you under the Apache License, Version 2.0 (the
007 *  "License"); you may not use this file except in compliance
008 *  with the License.  You may obtain a copy of the License at
009 * 
010 *    http://www.apache.org/licenses/LICENSE-2.0
011 * 
012 *  Unless required by applicable law or agreed to in writing,
013 *  software distributed under the License is distributed on an
014 *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 *  KIND, either express or implied.  See the License for the
016 *  specific language governing permissions and limitations
017 *  under the License.
018 * 
019 */
020package org.apache.directory.api.util;
021
022
023import java.io.IOException;
024import java.io.ObjectInput;
025import java.io.ObjectOutput;
026
027
028/**
029 * Various unicode manipulation methods that are more efficient then chaining
030 * operations: all is done in the same buffer without creating a bunch of string
031 * objects.
032 * 
033 * @author <a href="mailto:dev@directory.apache.org">Apache Directory Project</a>
034 */
035public final class Unicode
036{
037    /**
038     * Count the number of bytes needed to return an Unicode char. This can be
039     * from 1 to 6.
040     *
041     * @param bytes The bytes to read
042     * @param pos Position to start counting. It must be a valid start of a
043     *            encoded char !
044     * @return The number of bytes to create a char, or -1 if the encoding is
045     *         wrong. TODO : Should stop after the third byte, as a char is only
046     *         2 bytes long.
047     */
048    public static int countBytesPerChar( byte[] bytes, int pos )
049    {
050        if ( bytes == null )
051        {
052            return -1;
053        }
054
055        if ( ( bytes[pos] & UnicodeConstants.UTF8_MULTI_BYTES_MASK ) == 0 )
056        {
057            return 1;
058        }
059        else if ( ( bytes[pos] & UnicodeConstants.UTF8_TWO_BYTES_MASK ) == UnicodeConstants.UTF8_TWO_BYTES )
060        {
061            return 2;
062        }
063        else if ( ( bytes[pos] & UnicodeConstants.UTF8_THREE_BYTES_MASK ) == UnicodeConstants.UTF8_THREE_BYTES )
064        {
065            return 3;
066        }
067        else if ( ( bytes[pos] & UnicodeConstants.UTF8_FOUR_BYTES_MASK ) == UnicodeConstants.UTF8_FOUR_BYTES )
068        {
069            return 4;
070        }
071        else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES )
072        {
073            return 5;
074        }
075        else if ( ( bytes[pos] & UnicodeConstants.UTF8_SIX_BYTES_MASK ) == UnicodeConstants.UTF8_SIX_BYTES )
076        {
077            return 6;
078        }
079        else
080        {
081            return -1;
082        }
083    }
084
085
086    /**
087     * Return the Unicode char which is coded in the bytes at position 0.
088     *
089     * @param bytes The byte[] represntation of an Unicode string.
090     * @return The first char found.
091     */
092    public static char bytesToChar( byte[] bytes )
093    {
094        return bytesToChar( bytes, 0 );
095    }
096
097
098    /**
099     * Return the Unicode char which is coded in the bytes at the given
100     * position.
101     *
102     * @param bytes The byte[] represntation of an Unicode string.
103     * @param pos The current position to start decoding the char
104     * @return The decoded char, or -1 if no char can be decoded TODO : Should
105     *         stop after the third byte, as a char is only 2 bytes long.
106     */
107    public static char bytesToChar( byte[] bytes, int pos )
108    {
109        if ( bytes == null )
110        {
111            return ( char ) -1;
112        }
113
114        if ( ( bytes[pos] & UnicodeConstants.UTF8_MULTI_BYTES_MASK ) == 0 )
115        {
116            return ( char ) bytes[pos];
117        }
118        else
119        {
120            if ( ( bytes[pos] & UnicodeConstants.UTF8_TWO_BYTES_MASK ) == UnicodeConstants.UTF8_TWO_BYTES )
121            {
122                // Two bytes char
123                return ( char ) ( ( ( bytes[pos] & 0x1C ) << 6 ) + // 110x-xxyy
124                                                                   // 10zz-zzzz
125                                                                   // ->
126                                                                   // 0000-0xxx
127                                                                   // 0000-0000
128                    ( ( bytes[pos] & 0x03 ) << 6 ) + // 110x-xxyy 10zz-zzzz
129                                                     // -> 0000-0000
130                                                     // yy00-0000
131                ( bytes[pos + 1] & 0x3F ) // 110x-xxyy 10zz-zzzz -> 0000-0000
132                                          // 00zz-zzzz
133                ); // -> 0000-0xxx yyzz-zzzz (07FF)
134            }
135            else if ( ( bytes[pos] & UnicodeConstants.UTF8_THREE_BYTES_MASK ) == UnicodeConstants.UTF8_THREE_BYTES )
136            {
137                // Three bytes char
138                return ( char ) (
139                // 1110-tttt 10xx-xxyy 10zz-zzzz -> tttt-0000-0000-0000
140                ( ( bytes[pos] & 0x0F ) << 12 )
141                    // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-xxxx-0000-0000
142                    + ( ( bytes[pos + 1] & 0x3C ) << 6 )
143                    // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-0000-yy00-0000
144                    + ( ( bytes[pos + 1] & 0x03 ) << 6 )
145                    // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-0000-00zz-zzzz
146                    + ( bytes[pos + 2] & 0x3F )
147                // -> tttt-xxxx yyzz-zzzz (FF FF)
148                );
149            }
150            else if ( ( bytes[pos] & UnicodeConstants.UTF8_FOUR_BYTES_MASK ) == UnicodeConstants.UTF8_FOUR_BYTES )
151            {
152                // Four bytes char
153                return ( char ) (
154                // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 000t-tt00
155                // 0000-0000 0000-0000
156                ( ( bytes[pos] & 0x07 ) << 18 )
157                    // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-00uu
158                    // 0000-0000 0000-0000
159                    + ( ( bytes[pos + 1] & 0x30 ) << 16 )
160                    // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000
161                    // vvvv-0000 0000-0000
162                    + ( ( bytes[pos + 1] & 0x0F ) << 12 )
163                    // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000
164                    // 0000-xxxx 0000-0000
165                    + ( ( bytes[pos + 2] & 0x3C ) << 6 )
166                    // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000
167                    // 0000-0000 yy00-0000
168                    + ( ( bytes[pos + 2] & 0x03 ) << 6 )
169                    // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000
170                    // 0000-0000 00zz-zzzz
171                    + ( bytes[pos + 3] & 0x3F )
172                // -> 000t-ttuu vvvv-xxxx yyzz-zzzz (1FFFFF)
173                );
174            }
175            else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES )
176            {
177                // Five bytes char
178                return ( char ) (
179                // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
180                // 0000-00tt 0000-0000 0000-0000 0000-0000
181                ( ( bytes[pos] & 0x03 ) << 24 )
182                    // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
183                    // 0000-0000 uuuu-uu00 0000-0000 0000-0000
184                    + ( ( bytes[pos + 1] & 0x3F ) << 18 )
185                    // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
186                    // 0000-0000 0000-00vv 0000-0000 0000-0000
187                    + ( ( bytes[pos + 2] & 0x30 ) << 12 )
188                    // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
189                    // 0000-0000 0000-0000 wwww-0000 0000-0000
190                    + ( ( bytes[pos + 2] & 0x0F ) << 12 )
191                    // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
192                    // 0000-0000 0000-0000 0000-xxxx 0000-0000
193                    + ( ( bytes[pos + 3] & 0x3C ) << 6 )
194                    // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
195                    // 0000-0000 0000-0000 0000-0000 yy00-0000
196                    + ( ( bytes[pos + 3] & 0x03 ) << 6 )
197                    // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
198                    // 0000-0000 0000-0000 0000-0000 00zz-zzzz
199                    + ( bytes[pos + 4] & 0x3F )
200                // -> 0000-00tt uuuu-uuvv wwww-xxxx yyzz-zzzz (03 FF FF FF)
201                );
202            }
203            else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES )
204            {
205                // Six bytes char
206                return ( char ) (
207                // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz
208                // ->
209                // 0s00-0000 0000-0000 0000-0000 0000-0000
210                ( ( bytes[pos] & 0x01 ) << 30 )
211                    // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz
212                    // ->
213                    // 00tt-tttt 0000-0000 0000-0000 0000-0000
214                    + ( ( bytes[pos + 1] & 0x3F ) << 24 )
215                    // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy
216                    // 10zz-zzzz ->
217                    // 0000-0000 uuuu-uu00 0000-0000 0000-0000
218                    + ( ( bytes[pos + 2] & 0x3F ) << 18 )
219                    // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy
220                    // 10zz-zzzz ->
221                    // 0000-0000 0000-00vv 0000-0000 0000-0000
222                    + ( ( bytes[pos + 3] & 0x30 ) << 12 )
223                    // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy
224                    // 10zz-zzzz ->
225                    // 0000-0000 0000-0000 wwww-0000 0000-0000
226                    + ( ( bytes[pos + 3] & 0x0F ) << 12 )
227                    // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy
228                    // 10zz-zzzz ->
229                    // 0000-0000 0000-0000 0000-xxxx 0000-0000
230                    + ( ( bytes[pos + 4] & 0x3C ) << 6 )
231                    // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy
232                    // 10zz-zzzz ->
233                    // 0000-0000 0000-0000 0000-0000 yy00-0000
234                    + ( ( bytes[pos + 4] & 0x03 ) << 6 )
235                    // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz
236                    // ->
237                    // 0000-0000 0000-0000 0000-0000 00zz-zzzz
238                    + ( bytes[pos + 5] & 0x3F )
239                // -> 0stt-tttt uuuu-uuvv wwww-xxxx yyzz-zzzz (7F FF FF FF)
240                );
241            }
242            else
243            {
244                return ( char ) -1;
245            }
246        }
247    }
248
249
250    /**
251     * Return the number of bytes that hold an Unicode char.
252     *
253     * @param car The character to be decoded
254     * @return The number of bytes to hold the char. TODO : Should stop after
255     *         the third byte, as a char is only 2 bytes long.
256     */
257    public static int countNbBytesPerChar( char car )
258    {
259        if ( ( car & UnicodeConstants.CHAR_ONE_BYTE_MASK ) == 0 )
260        {
261            return 1;
262        }
263        else if ( ( car & UnicodeConstants.CHAR_TWO_BYTES_MASK ) == 0 )
264        {
265            return 2;
266        }
267        else if ( ( car & UnicodeConstants.CHAR_THREE_BYTES_MASK ) == 0 )
268        {
269            return 3;
270        }
271        else if ( ( car & UnicodeConstants.CHAR_FOUR_BYTES_MASK ) == 0 )
272        {
273            return 4;
274        }
275        else if ( ( car & UnicodeConstants.CHAR_FIVE_BYTES_MASK ) == 0 )
276        {
277            return 5;
278        }
279        else if ( ( car & UnicodeConstants.CHAR_SIX_BYTES_MASK ) == 0 )
280        {
281            return 6;
282        }
283        else
284        {
285            return -1;
286        }
287    }
288
289
290    /**
291     * Count the number of bytes included in the given char[].
292     *
293     * @param chars The char array to decode
294     * @return The number of bytes in the char array
295     */
296    public static int countBytes( char[] chars )
297    {
298        if ( chars == null )
299        {
300            return 0;
301        }
302
303        int nbBytes = 0;
304        int currentPos = 0;
305
306        while ( currentPos < chars.length )
307        {
308            int nbb = countNbBytesPerChar( chars[currentPos] );
309
310            // If the number of bytes necessary to encode a character is
311            // above 3, we will need two UTF-16 chars
312            currentPos += ( nbb < 4 ? 1 : 2 );
313            nbBytes += nbb;
314        }
315
316        return nbBytes;
317    }
318
319
320    /**
321     * Count the number of chars included in the given byte[].
322     *
323     * @param bytes The byte array to decode
324     * @return The number of char in the byte array
325     */
326    public static int countChars( byte[] bytes )
327    {
328        if ( bytes == null )
329        {
330            return 0;
331        }
332
333        int nbChars = 0;
334        int currentPos = 0;
335
336        while ( currentPos < bytes.length )
337        {
338            currentPos += countBytesPerChar( bytes, currentPos );
339            nbChars++;
340        }
341
342        return nbChars;
343    }
344
345
346    /**
347     * Return the Unicode char which is coded in the bytes at the given
348     * position.
349     *
350     * @param car The character to be transformed to an array of bytes
351     *
352     * @return The byte array representing the char
353     *
354     * TODO : Should stop after the third byte, as a char is only 2 bytes long.
355     */
356    public static byte[] charToBytes( char car )
357    {
358        byte[] bytes = new byte[countNbBytesPerChar( car )];
359
360        if ( car <= 0x7F )
361        {
362            // Single byte char
363            bytes[0] = ( byte ) car;
364            return bytes;
365        }
366        else if ( car <= 0x7FF )
367        {
368            // two bytes char
369            bytes[0] = ( byte ) ( 0x00C0 + ( ( car & 0x07C0 ) >> 6 ) );
370            bytes[1] = ( byte ) ( 0x0080 + ( car & 0x3F ) );
371        }
372        else
373        {
374            // Three bytes char
375            bytes[0] = ( byte ) ( 0x00E0 + ( ( car & 0xF000 ) >> 12 ) );
376            bytes[1] = ( byte ) ( 0x0080 + ( ( car & 0x0FC0 ) >> 6 ) );
377            bytes[2] = ( byte ) ( 0x0080 + ( car & 0x3F ) );
378        }
379
380        return bytes;
381    }
382
383
384    /**
385     * Check if the current char is in the unicodeSubset : all chars but
386     * '\0', '(', ')', '*' and '\'
387     *
388     * @param str The string to check
389     * @param pos Position of the current char
390     * @return True if the current char is in the unicode subset
391     */
392    public static boolean isUnicodeSubset( String str, int pos )
393    {
394        if ( ( str == null ) || ( str.length() <= pos ) || ( pos < 0 ) )
395        {
396            return false;
397        }
398
399        char c = str.charAt( pos );
400
401        return ( ( c > 127 ) || UnicodeConstants.UNICODE_SUBSET[c] );
402    }
403
404
405    /**
406     * Check if the current char is in the unicodeSubset : all chars but
407     * '\0', '(', ')', '*' and '\'
408     *
409     * @param c The char to check
410     * @return True if the current char is in the unicode subset
411     */
412    public static boolean isUnicodeSubset( char c )
413    {
414        return ( ( c > 127 ) || UnicodeConstants.UNICODE_SUBSET[c] );
415    }
416
417
418    /**
419     * Check if the current byte is in the unicodeSubset : all chars but
420     * '\0', '(', ')', '*' and '\'
421     *
422     * @param b The byte to check
423     * @return True if the current byte is in the unicode subset
424     */
425    public static boolean isUnicodeSubset( byte b )
426    {
427        return ( ( b < 0 ) || ( b > 127 ) || UnicodeConstants.UNICODE_SUBSET[b] );
428    }
429
430
431    /**
432     *
433     * Writes four bytes of length information to the output stream, followed by the modified UTF-8 representation
434     * of every character in the string str. If str is null, the string value 'null' is written with a length of 0
435     * instead of throwing an NullPointerException. Each character in the string s  is converted to a group of one,
436     * two, or three bytes, depending on the value of the character.
437     *
438     * Due to given restrictions (total number of written bytes in a row can't exceed 65535) the total length is
439     * written in the length information (four bytes (writeInt)) and the string is split into smaller parts
440     * if necessary and written. As each character may be converted to a group of maximum 3 bytes and 65535 bytes
441     * can be written at maximum we're on the save side when writing a chunk of only 21845 (65535/3) characters at
442     * once.
443     *
444     * See also {@link java.io.DataOutput#writeUTF(String)}.
445     *
446     * @param objectOutput The objectOutput to write to
447     * @param str The value to write
448     * @throws java.io.IOException If the value can't be written to the file
449     */
450    public static void writeUTF( ObjectOutput objectOutput, String str ) throws IOException
451    {
452        // Write a 'null' string
453        if ( str == null )
454        {
455            objectOutput.writeInt( 0 );
456            objectOutput.writeUTF( "null" );
457        }
458        else
459        {
460            // Write length of string
461            objectOutput.writeInt( str.length() );
462
463            StringBuffer strBuf = new StringBuffer( str );
464
465            // Write the string in portions not larger than 21845 characters
466            while ( strBuf != null )
467            {
468                if ( strBuf.length() < 21845 )
469                {
470                    objectOutput.writeUTF( strBuf.substring( 0, strBuf.length() ) );
471                    strBuf = null;
472                }
473                else
474                {
475                    objectOutput.writeUTF( strBuf.substring( 0, 21845 ) );
476                    strBuf.delete( 0, 21845 );
477                }
478            }
479        }
480    }
481
482
483    /**
484     *
485     * Reads in a string that has been encoded using a modified UTF-8  format. The general contract of readUTF  is
486     * that it reads a representation of a Unicode character string encoded in modified UTF-8 format; this string of
487     * characters is then returned as a String.
488     *
489     * First, four bytes are read (readInt) and used to construct an unsigned 16-bit integer in exactly the manner
490     * of the readUnsignedShort  method . This integer value is called the UTF length and specifies the number of
491     * additional bytes to be read. These bytes are then converted to characters by considering them in groups. The
492     * length of each group is computed from the value of the first byte of the group. The byte following a group, if
493     * any, is the first byte of the next group.
494     *
495     *See also {@link java.io.DataInput#readUTF()}.
496     *
497     * @param objectInput The objectInput to read from
498     * @return The read string
499     * @throws java.io.IOException If the value can't be read
500     */
501    public static String readUTF( ObjectInput objectInput ) throws IOException
502    {
503        StringBuffer strBuf = null;
504
505        // Read length of the string
506        int strLength = objectInput.readInt();
507
508        // Start reading the string
509        strBuf = new StringBuffer( objectInput.readUTF() );
510
511        if ( strLength == 0 && strBuf.toString().equals( "null" ) )
512        {
513            // The special case of a 'null' string
514            return null;
515        }
516        else
517        {
518            while ( strLength > strBuf.length() )
519            {
520                strBuf.append( objectInput.readUTF() );
521            }
522            return strBuf.toString();
523        }
524    }
525}