001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 * 019 */ 020package org.apache.directory.shared.util; 021 022 023import java.io.IOException; 024import java.io.ObjectInput; 025import java.io.ObjectOutput; 026 027/** 028 * Various unicode manipulation methods that are more efficient then chaining 029 * operations: all is done in the same buffer without creating a bunch of string 030 * objects. 031 * 032 * @author <a href="mailto:dev@directory.apache.org">Apache Directory Project</a> 033 */ 034public final class Unicode 035{ 036 /** 037 * Count the number of bytes needed to return an Unicode char. This can be 038 * from 1 to 6. 039 * 040 * @param bytes The bytes to read 041 * @param pos Position to start counting. It must be a valid start of a 042 * encoded char ! 043 * @return The number of bytes to create a char, or -1 if the encoding is 044 * wrong. TODO : Should stop after the third byte, as a char is only 045 * 2 bytes long. 046 */ 047 public static int countBytesPerChar( byte[] bytes, int pos ) 048 { 049 if ( bytes == null ) 050 { 051 return -1; 052 } 053 054 if ( ( bytes[pos] & UnicodeConstants.UTF8_MULTI_BYTES_MASK ) == 0 ) 055 { 056 return 1; 057 } 058 else if ( ( bytes[pos] & UnicodeConstants.UTF8_TWO_BYTES_MASK ) == UnicodeConstants.UTF8_TWO_BYTES ) 059 { 060 return 2; 061 } 062 else if ( ( bytes[pos] & UnicodeConstants.UTF8_THREE_BYTES_MASK ) == UnicodeConstants.UTF8_THREE_BYTES ) 063 { 064 return 3; 065 } 066 else if ( ( bytes[pos] & UnicodeConstants.UTF8_FOUR_BYTES_MASK ) == UnicodeConstants.UTF8_FOUR_BYTES ) 067 { 068 return 4; 069 } 070 else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES ) 071 { 072 return 5; 073 } 074 else if ( ( bytes[pos] & UnicodeConstants.UTF8_SIX_BYTES_MASK ) == UnicodeConstants.UTF8_SIX_BYTES ) 075 { 076 return 6; 077 } 078 else 079 { 080 return -1; 081 } 082 } 083 084 /** 085 * Return the Unicode char which is coded in the bytes at position 0. 086 * 087 * @param bytes The byte[] represntation of an Unicode string. 088 * @return The first char found. 089 */ 090 public static char bytesToChar( byte[] bytes ) 091 { 092 return bytesToChar( bytes, 0 ); 093 } 094 095 /** 096 * Return the Unicode char which is coded in the bytes at the given 097 * position. 098 * 099 * @param bytes The byte[] represntation of an Unicode string. 100 * @param pos The current position to start decoding the char 101 * @return The decoded char, or -1 if no char can be decoded TODO : Should 102 * stop after the third byte, as a char is only 2 bytes long. 103 */ 104 public static char bytesToChar( byte[] bytes, int pos ) 105 { 106 if ( bytes == null ) 107 { 108 return ( char ) -1; 109 } 110 111 if ( ( bytes[pos] & UnicodeConstants.UTF8_MULTI_BYTES_MASK ) == 0 ) 112 { 113 return ( char ) bytes[pos]; 114 } 115 else 116 { 117 if ( ( bytes[pos] & UnicodeConstants.UTF8_TWO_BYTES_MASK ) == UnicodeConstants.UTF8_TWO_BYTES ) 118 { 119 // Two bytes char 120 return ( char ) ( ( ( bytes[pos] & 0x1C ) << 6 ) + // 110x-xxyy 121 // 10zz-zzzz 122 // -> 123 // 0000-0xxx 124 // 0000-0000 125 ( ( bytes[pos] & 0x03 ) << 6 ) + // 110x-xxyy 10zz-zzzz 126 // -> 0000-0000 127 // yy00-0000 128 ( bytes[pos + 1] & 0x3F ) // 110x-xxyy 10zz-zzzz -> 0000-0000 129 // 00zz-zzzz 130 ); // -> 0000-0xxx yyzz-zzzz (07FF) 131 } 132 else if ( ( bytes[pos] & UnicodeConstants.UTF8_THREE_BYTES_MASK ) == UnicodeConstants.UTF8_THREE_BYTES ) 133 { 134 // Three bytes char 135 return ( char ) ( 136 // 1110-tttt 10xx-xxyy 10zz-zzzz -> tttt-0000-0000-0000 137 ( ( bytes[pos] & 0x0F ) << 12 ) 138 // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-xxxx-0000-0000 139 + ( ( bytes[pos + 1] & 0x3C ) << 6 ) 140 // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-0000-yy00-0000 141 + ( ( bytes[pos + 1] & 0x03 ) << 6 ) 142 // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-0000-00zz-zzzz 143 + ( bytes[pos + 2] & 0x3F ) 144 // -> tttt-xxxx yyzz-zzzz (FF FF) 145 ); 146 } 147 else if ( ( bytes[pos] & UnicodeConstants.UTF8_FOUR_BYTES_MASK ) == UnicodeConstants.UTF8_FOUR_BYTES ) 148 { 149 // Four bytes char 150 return ( char ) ( 151 // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 000t-tt00 152 // 0000-0000 0000-0000 153 ( ( bytes[pos] & 0x07 ) << 18 ) 154 // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-00uu 155 // 0000-0000 0000-0000 156 + ( ( bytes[pos + 1] & 0x30 ) << 16 ) 157 // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000 158 // vvvv-0000 0000-0000 159 + ( ( bytes[pos + 1] & 0x0F ) << 12 ) 160 // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000 161 // 0000-xxxx 0000-0000 162 + ( ( bytes[pos + 2] & 0x3C ) << 6 ) 163 // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000 164 // 0000-0000 yy00-0000 165 + ( ( bytes[pos + 2] & 0x03 ) << 6 ) 166 // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000 167 // 0000-0000 00zz-zzzz 168 + ( bytes[pos + 3] & 0x3F ) 169 // -> 000t-ttuu vvvv-xxxx yyzz-zzzz (1FFFFF) 170 ); 171 } 172 else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES ) 173 { 174 // Five bytes char 175 return ( char ) ( 176 // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 177 // 0000-00tt 0000-0000 0000-0000 0000-0000 178 ( ( bytes[pos] & 0x03 ) << 24 ) 179 // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 180 // 0000-0000 uuuu-uu00 0000-0000 0000-0000 181 + ( ( bytes[pos + 1] & 0x3F ) << 18 ) 182 // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 183 // 0000-0000 0000-00vv 0000-0000 0000-0000 184 + ( ( bytes[pos + 2] & 0x30 ) << 12 ) 185 // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 186 // 0000-0000 0000-0000 wwww-0000 0000-0000 187 + ( ( bytes[pos + 2] & 0x0F ) << 12 ) 188 // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 189 // 0000-0000 0000-0000 0000-xxxx 0000-0000 190 + ( ( bytes[pos + 3] & 0x3C ) << 6 ) 191 // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 192 // 0000-0000 0000-0000 0000-0000 yy00-0000 193 + ( ( bytes[pos + 3] & 0x03 ) << 6 ) 194 // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 195 // 0000-0000 0000-0000 0000-0000 00zz-zzzz 196 + ( bytes[pos + 4] & 0x3F ) 197 // -> 0000-00tt uuuu-uuvv wwww-xxxx yyzz-zzzz (03 FF FF FF) 198 ); 199 } 200 else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES ) 201 { 202 // Six bytes char 203 return ( char ) ( 204 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz 205 // -> 206 // 0s00-0000 0000-0000 0000-0000 0000-0000 207 ( ( bytes[pos] & 0x01 ) << 30 ) 208 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz 209 // -> 210 // 00tt-tttt 0000-0000 0000-0000 0000-0000 211 + ( ( bytes[pos + 1] & 0x3F ) << 24 ) 212 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 213 // 10zz-zzzz -> 214 // 0000-0000 uuuu-uu00 0000-0000 0000-0000 215 + ( ( bytes[pos + 2] & 0x3F ) << 18 ) 216 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 217 // 10zz-zzzz -> 218 // 0000-0000 0000-00vv 0000-0000 0000-0000 219 + ( ( bytes[pos + 3] & 0x30 ) << 12 ) 220 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 221 // 10zz-zzzz -> 222 // 0000-0000 0000-0000 wwww-0000 0000-0000 223 + ( ( bytes[pos + 3] & 0x0F ) << 12 ) 224 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 225 // 10zz-zzzz -> 226 // 0000-0000 0000-0000 0000-xxxx 0000-0000 227 + ( ( bytes[pos + 4] & 0x3C ) << 6 ) 228 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 229 // 10zz-zzzz -> 230 // 0000-0000 0000-0000 0000-0000 yy00-0000 231 + ( ( bytes[pos + 4] & 0x03 ) << 6 ) 232 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz 233 // -> 234 // 0000-0000 0000-0000 0000-0000 00zz-zzzz 235 + ( bytes[pos + 5] & 0x3F ) 236 // -> 0stt-tttt uuuu-uuvv wwww-xxxx yyzz-zzzz (7F FF FF FF) 237 ); 238 } 239 else 240 { 241 return ( char ) -1; 242 } 243 } 244 } 245 246 /** 247 * Return the number of bytes that hold an Unicode char. 248 * 249 * @param car The character to be decoded 250 * @return The number of bytes to hold the char. TODO : Should stop after 251 * the third byte, as a char is only 2 bytes long. 252 */ 253 public static int countNbBytesPerChar( char car ) 254 { 255 if ( ( car & UnicodeConstants.CHAR_ONE_BYTE_MASK ) == 0 ) 256 { 257 return 1; 258 } 259 else if ( ( car & UnicodeConstants.CHAR_TWO_BYTES_MASK ) == 0 ) 260 { 261 return 2; 262 } 263 else if ( ( car & UnicodeConstants.CHAR_THREE_BYTES_MASK ) == 0 ) 264 { 265 return 3; 266 } 267 else if ( ( car & UnicodeConstants.CHAR_FOUR_BYTES_MASK ) == 0 ) 268 { 269 return 4; 270 } 271 else if ( ( car & UnicodeConstants.CHAR_FIVE_BYTES_MASK ) == 0 ) 272 { 273 return 5; 274 } 275 else if ( ( car & UnicodeConstants.CHAR_SIX_BYTES_MASK ) == 0 ) 276 { 277 return 6; 278 } 279 else 280 { 281 return -1; 282 } 283 } 284 285 /** 286 * Count the number of bytes included in the given char[]. 287 * 288 * @param chars The char array to decode 289 * @return The number of bytes in the char array 290 */ 291 public static int countBytes( char[] chars ) 292 { 293 if ( chars == null ) 294 { 295 return 0; 296 } 297 298 int nbBytes = 0; 299 int currentPos = 0; 300 301 while ( currentPos < chars.length ) 302 { 303 int nbb = countNbBytesPerChar( chars[currentPos] ); 304 305 // If the number of bytes necessary to encode a character is 306 // above 3, we will need two UTF-16 chars 307 currentPos += ( nbb < 4 ? 1 : 2 ); 308 nbBytes += nbb; 309 } 310 311 return nbBytes; 312 } 313 314 /** 315 * Count the number of chars included in the given byte[]. 316 * 317 * @param bytes The byte array to decode 318 * @return The number of char in the byte array 319 */ 320 public static int countChars( byte[] bytes ) 321 { 322 if ( bytes == null ) 323 { 324 return 0; 325 } 326 327 int nbChars = 0; 328 int currentPos = 0; 329 330 while ( currentPos < bytes.length ) 331 { 332 currentPos += countBytesPerChar(bytes, currentPos); 333 nbChars++; 334 } 335 336 return nbChars; 337 } 338 339 /** 340 * Return the Unicode char which is coded in the bytes at the given 341 * position. 342 * 343 * @param car The character to be transformed to an array of bytes 344 * 345 * @return The byte array representing the char 346 * 347 * TODO : Should stop after the third byte, as a char is only 2 bytes long. 348 */ 349 public static byte[] charToBytes( char car ) 350 { 351 byte[] bytes = new byte[countNbBytesPerChar(car)]; 352 353 if ( car <= 0x7F ) 354 { 355 // Single byte char 356 bytes[0] = ( byte ) car; 357 return bytes; 358 } 359 else if ( car <= 0x7FF ) 360 { 361 // two bytes char 362 bytes[0] = ( byte ) ( 0x00C0 + ( ( car & 0x07C0 ) >> 6 ) ); 363 bytes[1] = ( byte ) ( 0x0080 + ( car & 0x3F ) ); 364 } 365 else 366 { 367 // Three bytes char 368 bytes[0] = ( byte ) ( 0x00E0 + ( ( car & 0xF000 ) >> 12 ) ); 369 bytes[1] = ( byte ) ( 0x0080 + ( ( car & 0x0FC0 ) >> 6 ) ); 370 bytes[2] = ( byte ) ( 0x0080 + ( car & 0x3F ) ); 371 } 372 373 return bytes; 374 } 375 376 /** 377 * Check if the current char is in the unicodeSubset : all chars but 378 * '\0', '(', ')', '*' and '\' 379 * 380 * @param str The string to check 381 * @param pos Position of the current char 382 * @return True if the current char is in the unicode subset 383 */ 384 public static boolean isUnicodeSubset( String str, int pos ) 385 { 386 if ( ( str == null ) || ( str.length() <= pos ) || ( pos < 0 ) ) 387 { 388 return false; 389 } 390 391 char c = str.charAt( pos ); 392 393 return ( ( c > 127 ) || UnicodeConstants.UNICODE_SUBSET[c] ); 394 } 395 396 /** 397 * Check if the current char is in the unicodeSubset : all chars but 398 * '\0', '(', ')', '*' and '\' 399 * 400 * @param c The char to check 401 * @return True if the current char is in the unicode subset 402 */ 403 public static boolean isUnicodeSubset( char c ) 404 { 405 return ( ( c > 127 ) || UnicodeConstants.UNICODE_SUBSET[c] ); 406 } 407 408 /** 409 * 410 * Writes four bytes of length information to the output stream, followed by the modified UTF-8 representation 411 * of every character in the string str. If str is null, the string value 'null' is written with a length of 0 412 * instead of throwing an NullPointerException. Each character in the string s is converted to a group of one, 413 * two, or three bytes, depending on the value of the character. 414 * 415 * Due to given restrictions (total number of written bytes in a row can't exceed 65535) the total length is 416 * written in the length information (four bytes (writeInt)) and the string is split into smaller parts 417 * if necessary and written. As each character may be converted to a group of maximum 3 bytes and 65535 bytes 418 * can be written at maximum we're on the save side when writing a chunk of only 21845 (65535/3) characters at 419 * once. 420 * 421 * See also {@link java.io.DataOutput#writeUTF(String)}. 422 * 423 * @param objectOutput The objectOutput to write to 424 * @param str The value to write 425 * @throws java.io.IOException If the value can't be written to the file 426 */ 427 public static void writeUTF( ObjectOutput objectOutput, String str ) throws IOException 428 { 429 // Write a 'null' string 430 if ( str == null ) 431 { 432 objectOutput.writeInt( 0 ); 433 objectOutput.writeUTF( "null" ); 434 } 435 else 436 { 437 // Write length of string 438 objectOutput.writeInt( str.length() ); 439 440 StringBuffer strBuf = new StringBuffer( str ); 441 442 // Write the string in portions not larger than 21845 characters 443 while ( strBuf != null ) 444 { 445 if ( strBuf.length() < 21845 ) 446 { 447 objectOutput.writeUTF( strBuf.substring( 0, strBuf.length() ) ); 448 strBuf = null; 449 } 450 else 451 { 452 objectOutput.writeUTF( strBuf.substring( 0, 21845 ) ); 453 strBuf.delete( 0, 21845 ); 454 } 455 } 456 } 457 } 458 459 /** 460 * 461 * Reads in a string that has been encoded using a modified UTF-8 format. The general contract of readUTF is 462 * that it reads a representation of a Unicode character string encoded in modified UTF-8 format; this string of 463 * characters is then returned as a String. 464 * 465 * First, four bytes are read (readInt) and used to construct an unsigned 16-bit integer in exactly the manner 466 * of the readUnsignedShort method . This integer value is called the UTF length and specifies the number of 467 * additional bytes to be read. These bytes are then converted to characters by considering them in groups. The 468 * length of each group is computed from the value of the first byte of the group. The byte following a group, if 469 * any, is the first byte of the next group. 470 * 471 *See also {@link java.io.DataInput#readUTF()}. 472 * 473 * @param objectInput The objectInput to read from 474 * @return The read string 475 * @throws java.io.IOException If the value can't be read 476 */ 477 public static String readUTF( ObjectInput objectInput ) throws IOException 478 { 479 StringBuffer strBuf = null; 480 481 // Read length of the string 482 int strLength = objectInput.readInt(); 483 484 // Start reading the string 485 strBuf = new StringBuffer( objectInput.readUTF() ); 486 487 if ( strLength == 0 && strBuf.toString().equals( "null" ) ) 488 { 489 // The special case of a 'null' string 490 return null; 491 } 492 else 493 { 494 while ( strLength > strBuf.length() ) 495 { 496 strBuf.append( objectInput.readUTF() ); 497 } 498 return strBuf.toString(); 499 } 500 } 501}