View Javadoc
1   /*
2    *  Licensed to the Apache Software Foundation (ASF) under one
3    *  or more contributor license agreements.  See the NOTICE file
4    *  distributed with this work for additional information
5    *  regarding copyright ownership.  The ASF licenses this file
6    *  to you under the Apache License, Version 2.0 (the
7    *  "License"); you may not use this file except in compliance
8    *  with the License.  You may obtain a copy of the License at
9    * 
10   *    http://www.apache.org/licenses/LICENSE-2.0
11   * 
12   *  Unless required by applicable law or agreed to in writing,
13   *  software distributed under the License is distributed on an
14   *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   *  KIND, either express or implied.  See the License for the
16   *  specific language governing permissions and limitations
17   *  under the License.
18   * 
19   */
20  package org.apache.directory.api.util;
21  
22  
23  import java.io.IOException;
24  import java.io.ObjectInput;
25  import java.io.ObjectOutput;
26  
27  
28  /**
29   * Various unicode manipulation methods that are more efficient then chaining
30   * operations: all is done in the same buffer without creating a bunch of string
31   * objects.
32   * 
33   * @author <a href="mailto:dev@directory.apache.org">Apache Directory Project</a>
34   */
35  public final class Unicode
36  {
37      /**
38       * Count the number of bytes needed to return an Unicode char. This can be
39       * from 1 to 6.
40       *
41       * @param bytes The bytes to read
42       * @param pos Position to start counting. It must be a valid start of a
43       *            encoded char !
44       * @return The number of bytes to create a char, or -1 if the encoding is
45       *         wrong. TODO : Should stop after the third byte, as a char is only
46       *         2 bytes long.
47       */
48      public static int countBytesPerChar( byte[] bytes, int pos )
49      {
50          if ( bytes == null )
51          {
52              return -1;
53          }
54  
55          if ( ( bytes[pos] & UnicodeConstants.UTF8_MULTI_BYTES_MASK ) == 0 )
56          {
57              return 1;
58          }
59          else if ( ( bytes[pos] & UnicodeConstants.UTF8_TWO_BYTES_MASK ) == UnicodeConstants.UTF8_TWO_BYTES )
60          {
61              return 2;
62          }
63          else if ( ( bytes[pos] & UnicodeConstants.UTF8_THREE_BYTES_MASK ) == UnicodeConstants.UTF8_THREE_BYTES )
64          {
65              return 3;
66          }
67          else if ( ( bytes[pos] & UnicodeConstants.UTF8_FOUR_BYTES_MASK ) == UnicodeConstants.UTF8_FOUR_BYTES )
68          {
69              return 4;
70          }
71          else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES )
72          {
73              return 5;
74          }
75          else if ( ( bytes[pos] & UnicodeConstants.UTF8_SIX_BYTES_MASK ) == UnicodeConstants.UTF8_SIX_BYTES )
76          {
77              return 6;
78          }
79          else
80          {
81              return -1;
82          }
83      }
84  
85  
86      /**
87       * Return the Unicode char which is coded in the bytes at position 0.
88       *
89       * @param bytes The byte[] represntation of an Unicode string.
90       * @return The first char found.
91       */
92      public static char bytesToChar( byte[] bytes )
93      {
94          return bytesToChar( bytes, 0 );
95      }
96  
97  
98      /**
99       * Return the Unicode char which is coded in the bytes at the given
100      * position.
101      *
102      * @param bytes The byte[] represntation of an Unicode string.
103      * @param pos The current position to start decoding the char
104      * @return The decoded char, or -1 if no char can be decoded TODO : Should
105      *         stop after the third byte, as a char is only 2 bytes long.
106      */
107     public static char bytesToChar( byte[] bytes, int pos )
108     {
109         if ( bytes == null )
110         {
111             return ( char ) -1;
112         }
113 
114         if ( ( bytes[pos] & UnicodeConstants.UTF8_MULTI_BYTES_MASK ) == 0 )
115         {
116             return ( char ) bytes[pos];
117         }
118         else
119         {
120             if ( ( bytes[pos] & UnicodeConstants.UTF8_TWO_BYTES_MASK ) == UnicodeConstants.UTF8_TWO_BYTES )
121             {
122                 // Two bytes char
123                 return ( char ) ( ( ( bytes[pos] & 0x1C ) << 6 ) + // 110x-xxyy
124                                                                    // 10zz-zzzz
125                                                                    // ->
126                                                                    // 0000-0xxx
127                                                                    // 0000-0000
128                     ( ( bytes[pos] & 0x03 ) << 6 ) + // 110x-xxyy 10zz-zzzz
129                                                      // -> 0000-0000
130                                                      // yy00-0000
131                 ( bytes[pos + 1] & 0x3F ) // 110x-xxyy 10zz-zzzz -> 0000-0000
132                                           // 00zz-zzzz
133                 ); // -> 0000-0xxx yyzz-zzzz (07FF)
134             }
135             else if ( ( bytes[pos] & UnicodeConstants.UTF8_THREE_BYTES_MASK ) == UnicodeConstants.UTF8_THREE_BYTES )
136             {
137                 // Three bytes char
138                 return ( char ) (
139                 // 1110-tttt 10xx-xxyy 10zz-zzzz -> tttt-0000-0000-0000
140                 ( ( bytes[pos] & 0x0F ) << 12 )
141                     // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-xxxx-0000-0000
142                     + ( ( bytes[pos + 1] & 0x3C ) << 6 )
143                     // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-0000-yy00-0000
144                     + ( ( bytes[pos + 1] & 0x03 ) << 6 )
145                     // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-0000-00zz-zzzz
146                     + ( bytes[pos + 2] & 0x3F )
147                 // -> tttt-xxxx yyzz-zzzz (FF FF)
148                 );
149             }
150             else if ( ( bytes[pos] & UnicodeConstants.UTF8_FOUR_BYTES_MASK ) == UnicodeConstants.UTF8_FOUR_BYTES )
151             {
152                 // Four bytes char
153                 return ( char ) (
154                 // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 000t-tt00
155                 // 0000-0000 0000-0000
156                 ( ( bytes[pos] & 0x07 ) << 18 )
157                     // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-00uu
158                     // 0000-0000 0000-0000
159                     + ( ( bytes[pos + 1] & 0x30 ) << 16 )
160                     // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000
161                     // vvvv-0000 0000-0000
162                     + ( ( bytes[pos + 1] & 0x0F ) << 12 )
163                     // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000
164                     // 0000-xxxx 0000-0000
165                     + ( ( bytes[pos + 2] & 0x3C ) << 6 )
166                     // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000
167                     // 0000-0000 yy00-0000
168                     + ( ( bytes[pos + 2] & 0x03 ) << 6 )
169                     // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000
170                     // 0000-0000 00zz-zzzz
171                     + ( bytes[pos + 3] & 0x3F )
172                 // -> 000t-ttuu vvvv-xxxx yyzz-zzzz (1FFFFF)
173                 );
174             }
175             else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES )
176             {
177                 // Five bytes char
178                 return ( char ) (
179                 // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
180                 // 0000-00tt 0000-0000 0000-0000 0000-0000
181                 ( ( bytes[pos] & 0x03 ) << 24 )
182                     // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
183                     // 0000-0000 uuuu-uu00 0000-0000 0000-0000
184                     + ( ( bytes[pos + 1] & 0x3F ) << 18 )
185                     // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
186                     // 0000-0000 0000-00vv 0000-0000 0000-0000
187                     + ( ( bytes[pos + 2] & 0x30 ) << 12 )
188                     // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
189                     // 0000-0000 0000-0000 wwww-0000 0000-0000
190                     + ( ( bytes[pos + 2] & 0x0F ) << 12 )
191                     // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
192                     // 0000-0000 0000-0000 0000-xxxx 0000-0000
193                     + ( ( bytes[pos + 3] & 0x3C ) << 6 )
194                     // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
195                     // 0000-0000 0000-0000 0000-0000 yy00-0000
196                     + ( ( bytes[pos + 3] & 0x03 ) << 6 )
197                     // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
198                     // 0000-0000 0000-0000 0000-0000 00zz-zzzz
199                     + ( bytes[pos + 4] & 0x3F )
200                 // -> 0000-00tt uuuu-uuvv wwww-xxxx yyzz-zzzz (03 FF FF FF)
201                 );
202             }
203             else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES )
204             {
205                 // Six bytes char
206                 return ( char ) (
207                 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz
208                 // ->
209                 // 0s00-0000 0000-0000 0000-0000 0000-0000
210                 ( ( bytes[pos] & 0x01 ) << 30 )
211                     // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz
212                     // ->
213                     // 00tt-tttt 0000-0000 0000-0000 0000-0000
214                     + ( ( bytes[pos + 1] & 0x3F ) << 24 )
215                     // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy
216                     // 10zz-zzzz ->
217                     // 0000-0000 uuuu-uu00 0000-0000 0000-0000
218                     + ( ( bytes[pos + 2] & 0x3F ) << 18 )
219                     // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy
220                     // 10zz-zzzz ->
221                     // 0000-0000 0000-00vv 0000-0000 0000-0000
222                     + ( ( bytes[pos + 3] & 0x30 ) << 12 )
223                     // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy
224                     // 10zz-zzzz ->
225                     // 0000-0000 0000-0000 wwww-0000 0000-0000
226                     + ( ( bytes[pos + 3] & 0x0F ) << 12 )
227                     // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy
228                     // 10zz-zzzz ->
229                     // 0000-0000 0000-0000 0000-xxxx 0000-0000
230                     + ( ( bytes[pos + 4] & 0x3C ) << 6 )
231                     // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy
232                     // 10zz-zzzz ->
233                     // 0000-0000 0000-0000 0000-0000 yy00-0000
234                     + ( ( bytes[pos + 4] & 0x03 ) << 6 )
235                     // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz
236                     // ->
237                     // 0000-0000 0000-0000 0000-0000 00zz-zzzz
238                     + ( bytes[pos + 5] & 0x3F )
239                 // -> 0stt-tttt uuuu-uuvv wwww-xxxx yyzz-zzzz (7F FF FF FF)
240                 );
241             }
242             else
243             {
244                 return ( char ) -1;
245             }
246         }
247     }
248 
249 
250     /**
251      * Return the number of bytes that hold an Unicode char.
252      *
253      * @param car The character to be decoded
254      * @return The number of bytes to hold the char. TODO : Should stop after
255      *         the third byte, as a char is only 2 bytes long.
256      */
257     public static int countNbBytesPerChar( char car )
258     {
259         if ( ( car & UnicodeConstants.CHAR_ONE_BYTE_MASK ) == 0 )
260         {
261             return 1;
262         }
263         else if ( ( car & UnicodeConstants.CHAR_TWO_BYTES_MASK ) == 0 )
264         {
265             return 2;
266         }
267         else if ( ( car & UnicodeConstants.CHAR_THREE_BYTES_MASK ) == 0 )
268         {
269             return 3;
270         }
271         else if ( ( car & UnicodeConstants.CHAR_FOUR_BYTES_MASK ) == 0 )
272         {
273             return 4;
274         }
275         else if ( ( car & UnicodeConstants.CHAR_FIVE_BYTES_MASK ) == 0 )
276         {
277             return 5;
278         }
279         else if ( ( car & UnicodeConstants.CHAR_SIX_BYTES_MASK ) == 0 )
280         {
281             return 6;
282         }
283         else
284         {
285             return -1;
286         }
287     }
288 
289 
290     /**
291      * Count the number of bytes included in the given char[].
292      *
293      * @param chars The char array to decode
294      * @return The number of bytes in the char array
295      */
296     public static int countBytes( char[] chars )
297     {
298         if ( chars == null )
299         {
300             return 0;
301         }
302 
303         int nbBytes = 0;
304         int currentPos = 0;
305 
306         while ( currentPos < chars.length )
307         {
308             int nbb = countNbBytesPerChar( chars[currentPos] );
309 
310             // If the number of bytes necessary to encode a character is
311             // above 3, we will need two UTF-16 chars
312             currentPos += ( nbb < 4 ? 1 : 2 );
313             nbBytes += nbb;
314         }
315 
316         return nbBytes;
317     }
318 
319 
320     /**
321      * Count the number of chars included in the given byte[].
322      *
323      * @param bytes The byte array to decode
324      * @return The number of char in the byte array
325      */
326     public static int countChars( byte[] bytes )
327     {
328         if ( bytes == null )
329         {
330             return 0;
331         }
332 
333         int nbChars = 0;
334         int currentPos = 0;
335 
336         while ( currentPos < bytes.length )
337         {
338             currentPos += countBytesPerChar( bytes, currentPos );
339             nbChars++;
340         }
341 
342         return nbChars;
343     }
344 
345 
346     /**
347      * Return the Unicode char which is coded in the bytes at the given
348      * position.
349      *
350      * @param car The character to be transformed to an array of bytes
351      *
352      * @return The byte array representing the char
353      *
354      * TODO : Should stop after the third byte, as a char is only 2 bytes long.
355      */
356     public static byte[] charToBytes( char car )
357     {
358         byte[] bytes = new byte[countNbBytesPerChar( car )];
359 
360         if ( car <= 0x7F )
361         {
362             // Single byte char
363             bytes[0] = ( byte ) car;
364             return bytes;
365         }
366         else if ( car <= 0x7FF )
367         {
368             // two bytes char
369             bytes[0] = ( byte ) ( 0x00C0 + ( ( car & 0x07C0 ) >> 6 ) );
370             bytes[1] = ( byte ) ( 0x0080 + ( car & 0x3F ) );
371         }
372         else
373         {
374             // Three bytes char
375             bytes[0] = ( byte ) ( 0x00E0 + ( ( car & 0xF000 ) >> 12 ) );
376             bytes[1] = ( byte ) ( 0x0080 + ( ( car & 0x0FC0 ) >> 6 ) );
377             bytes[2] = ( byte ) ( 0x0080 + ( car & 0x3F ) );
378         }
379 
380         return bytes;
381     }
382 
383 
384     /**
385      * Check if the current char is in the unicodeSubset : all chars but
386      * '\0', '(', ')', '*' and '\'
387      *
388      * @param str The string to check
389      * @param pos Position of the current char
390      * @return True if the current char is in the unicode subset
391      */
392     public static boolean isUnicodeSubset( String str, int pos )
393     {
394         if ( ( str == null ) || ( str.length() <= pos ) || ( pos < 0 ) )
395         {
396             return false;
397         }
398 
399         char c = str.charAt( pos );
400 
401         return ( ( c > 127 ) || UnicodeConstants.UNICODE_SUBSET[c] );
402     }
403 
404 
405     /**
406      * Check if the current char is in the unicodeSubset : all chars but
407      * '\0', '(', ')', '*' and '\'
408      *
409      * @param c The char to check
410      * @return True if the current char is in the unicode subset
411      */
412     public static boolean isUnicodeSubset( char c )
413     {
414         return ( ( c > 127 ) || UnicodeConstants.UNICODE_SUBSET[c] );
415     }
416 
417 
418     /**
419      * Check if the current byte is in the unicodeSubset : all chars but
420      * '\0', '(', ')', '*' and '\'
421      *
422      * @param b The byte to check
423      * @return True if the current byte is in the unicode subset
424      */
425     public static boolean isUnicodeSubset( byte b )
426     {
427         return ( ( b < 0 ) || ( b > 127 ) || UnicodeConstants.UNICODE_SUBSET[b] );
428     }
429 
430 
431     /**
432      *
433      * Writes four bytes of length information to the output stream, followed by the modified UTF-8 representation
434      * of every character in the string str. If str is null, the string value 'null' is written with a length of 0
435      * instead of throwing an NullPointerException. Each character in the string s  is converted to a group of one,
436      * two, or three bytes, depending on the value of the character.
437      *
438      * Due to given restrictions (total number of written bytes in a row can't exceed 65535) the total length is
439      * written in the length information (four bytes (writeInt)) and the string is split into smaller parts
440      * if necessary and written. As each character may be converted to a group of maximum 3 bytes and 65535 bytes
441      * can be written at maximum we're on the save side when writing a chunk of only 21845 (65535/3) characters at
442      * once.
443      *
444      * See also {@link java.io.DataOutput#writeUTF(String)}.
445      *
446      * @param objectOutput The objectOutput to write to
447      * @param str The value to write
448      * @throws java.io.IOException If the value can't be written to the file
449      */
450     public static void writeUTF( ObjectOutput objectOutput, String str ) throws IOException
451     {
452         // Write a 'null' string
453         if ( str == null )
454         {
455             objectOutput.writeInt( 0 );
456             objectOutput.writeUTF( "null" );
457         }
458         else
459         {
460             // Write length of string
461             objectOutput.writeInt( str.length() );
462 
463             StringBuffer strBuf = new StringBuffer( str );
464 
465             // Write the string in portions not larger than 21845 characters
466             while ( strBuf != null )
467             {
468                 if ( strBuf.length() < 21845 )
469                 {
470                     objectOutput.writeUTF( strBuf.substring( 0, strBuf.length() ) );
471                     strBuf = null;
472                 }
473                 else
474                 {
475                     objectOutput.writeUTF( strBuf.substring( 0, 21845 ) );
476                     strBuf.delete( 0, 21845 );
477                 }
478             }
479         }
480     }
481 
482 
483     /**
484      *
485      * Reads in a string that has been encoded using a modified UTF-8  format. The general contract of readUTF  is
486      * that it reads a representation of a Unicode character string encoded in modified UTF-8 format; this string of
487      * characters is then returned as a String.
488      *
489      * First, four bytes are read (readInt) and used to construct an unsigned 16-bit integer in exactly the manner
490      * of the readUnsignedShort  method . This integer value is called the UTF length and specifies the number of
491      * additional bytes to be read. These bytes are then converted to characters by considering them in groups. The
492      * length of each group is computed from the value of the first byte of the group. The byte following a group, if
493      * any, is the first byte of the next group.
494      *
495      *See also {@link java.io.DataInput#readUTF()}.
496      *
497      * @param objectInput The objectInput to read from
498      * @return The read string
499      * @throws java.io.IOException If the value can't be read
500      */
501     public static String readUTF( ObjectInput objectInput ) throws IOException
502     {
503         StringBuffer strBuf = null;
504 
505         // Read length of the string
506         int strLength = objectInput.readInt();
507 
508         // Start reading the string
509         strBuf = new StringBuffer( objectInput.readUTF() );
510 
511         if ( strLength == 0 && strBuf.toString().equals( "null" ) )
512         {
513             // The special case of a 'null' string
514             return null;
515         }
516         else
517         {
518             while ( strLength > strBuf.length() )
519             {
520                 strBuf.append( objectInput.readUTF() );
521             }
522             return strBuf.toString();
523         }
524     }
525 }