001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 * 019 */ 020package org.apache.directory.api.util; 021 022 023import java.io.IOException; 024import java.io.ObjectInput; 025import java.io.ObjectOutput; 026 027 028/** 029 * Various unicode manipulation methods that are more efficient then chaining 030 * operations: all is done in the same buffer without creating a bunch of string 031 * objects. 032 * 033 * @author <a href="mailto:dev@directory.apache.org">Apache Directory Project</a> 034 */ 035public final class Unicode 036{ 037 /** 038 * Count the number of bytes needed to return an Unicode char. This can be 039 * from 1 to 6. 040 * 041 * @param bytes The bytes to read 042 * @param pos Position to start counting. It must be a valid start of a 043 * encoded char ! 044 * @return The number of bytes to create a char, or -1 if the encoding is 045 * wrong. TODO : Should stop after the third byte, as a char is only 046 * 2 bytes long. 047 */ 048 public static int countBytesPerChar( byte[] bytes, int pos ) 049 { 050 if ( bytes == null ) 051 { 052 return -1; 053 } 054 055 if ( ( bytes[pos] & UnicodeConstants.UTF8_MULTI_BYTES_MASK ) == 0 ) 056 { 057 return 1; 058 } 059 else if ( ( bytes[pos] & UnicodeConstants.UTF8_TWO_BYTES_MASK ) == UnicodeConstants.UTF8_TWO_BYTES ) 060 { 061 return 2; 062 } 063 else if ( ( bytes[pos] & UnicodeConstants.UTF8_THREE_BYTES_MASK ) == UnicodeConstants.UTF8_THREE_BYTES ) 064 { 065 return 3; 066 } 067 else if ( ( bytes[pos] & UnicodeConstants.UTF8_FOUR_BYTES_MASK ) == UnicodeConstants.UTF8_FOUR_BYTES ) 068 { 069 return 4; 070 } 071 else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES ) 072 { 073 return 5; 074 } 075 else if ( ( bytes[pos] & UnicodeConstants.UTF8_SIX_BYTES_MASK ) == UnicodeConstants.UTF8_SIX_BYTES ) 076 { 077 return 6; 078 } 079 else 080 { 081 return -1; 082 } 083 } 084 085 086 /** 087 * Return the Unicode char which is coded in the bytes at position 0. 088 * 089 * @param bytes The byte[] represntation of an Unicode string. 090 * @return The first char found. 091 */ 092 public static char bytesToChar( byte[] bytes ) 093 { 094 return bytesToChar( bytes, 0 ); 095 } 096 097 098 /** 099 * Return the Unicode char which is coded in the bytes at the given 100 * position. 101 * 102 * @param bytes The byte[] represntation of an Unicode string. 103 * @param pos The current position to start decoding the char 104 * @return The decoded char, or -1 if no char can be decoded TODO : Should 105 * stop after the third byte, as a char is only 2 bytes long. 106 */ 107 public static char bytesToChar( byte[] bytes, int pos ) 108 { 109 if ( bytes == null ) 110 { 111 return ( char ) -1; 112 } 113 114 if ( ( bytes[pos] & UnicodeConstants.UTF8_MULTI_BYTES_MASK ) == 0 ) 115 { 116 return ( char ) bytes[pos]; 117 } 118 else 119 { 120 if ( ( bytes[pos] & UnicodeConstants.UTF8_TWO_BYTES_MASK ) == UnicodeConstants.UTF8_TWO_BYTES ) 121 { 122 // Two bytes char 123 return ( char ) ( ( ( bytes[pos] & 0x1C ) << 6 ) + // 110x-xxyy 124 // 10zz-zzzz 125 // -> 126 // 0000-0xxx 127 // 0000-0000 128 ( ( bytes[pos] & 0x03 ) << 6 ) + // 110x-xxyy 10zz-zzzz 129 // -> 0000-0000 130 // yy00-0000 131 ( bytes[pos + 1] & 0x3F ) // 110x-xxyy 10zz-zzzz -> 0000-0000 132 // 00zz-zzzz 133 ); // -> 0000-0xxx yyzz-zzzz (07FF) 134 } 135 else if ( ( bytes[pos] & UnicodeConstants.UTF8_THREE_BYTES_MASK ) == UnicodeConstants.UTF8_THREE_BYTES ) 136 { 137 // Three bytes char 138 return ( char ) ( 139 // 1110-tttt 10xx-xxyy 10zz-zzzz -> tttt-0000-0000-0000 140 ( ( bytes[pos] & 0x0F ) << 12 ) 141 // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-xxxx-0000-0000 142 + ( ( bytes[pos + 1] & 0x3C ) << 6 ) 143 // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-0000-yy00-0000 144 + ( ( bytes[pos + 1] & 0x03 ) << 6 ) 145 // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-0000-00zz-zzzz 146 + ( bytes[pos + 2] & 0x3F ) 147 // -> tttt-xxxx yyzz-zzzz (FF FF) 148 ); 149 } 150 else if ( ( bytes[pos] & UnicodeConstants.UTF8_FOUR_BYTES_MASK ) == UnicodeConstants.UTF8_FOUR_BYTES ) 151 { 152 // Four bytes char 153 return ( char ) ( 154 // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 000t-tt00 155 // 0000-0000 0000-0000 156 ( ( bytes[pos] & 0x07 ) << 18 ) 157 // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-00uu 158 // 0000-0000 0000-0000 159 + ( ( bytes[pos + 1] & 0x30 ) << 16 ) 160 // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000 161 // vvvv-0000 0000-0000 162 + ( ( bytes[pos + 1] & 0x0F ) << 12 ) 163 // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000 164 // 0000-xxxx 0000-0000 165 + ( ( bytes[pos + 2] & 0x3C ) << 6 ) 166 // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000 167 // 0000-0000 yy00-0000 168 + ( ( bytes[pos + 2] & 0x03 ) << 6 ) 169 // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000 170 // 0000-0000 00zz-zzzz 171 + ( bytes[pos + 3] & 0x3F ) 172 // -> 000t-ttuu vvvv-xxxx yyzz-zzzz (1FFFFF) 173 ); 174 } 175 else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES ) 176 { 177 // Five bytes char 178 return ( char ) ( 179 // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 180 // 0000-00tt 0000-0000 0000-0000 0000-0000 181 ( ( bytes[pos] & 0x03 ) << 24 ) 182 // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 183 // 0000-0000 uuuu-uu00 0000-0000 0000-0000 184 + ( ( bytes[pos + 1] & 0x3F ) << 18 ) 185 // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 186 // 0000-0000 0000-00vv 0000-0000 0000-0000 187 + ( ( bytes[pos + 2] & 0x30 ) << 12 ) 188 // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 189 // 0000-0000 0000-0000 wwww-0000 0000-0000 190 + ( ( bytes[pos + 2] & 0x0F ) << 12 ) 191 // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 192 // 0000-0000 0000-0000 0000-xxxx 0000-0000 193 + ( ( bytes[pos + 3] & 0x3C ) << 6 ) 194 // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 195 // 0000-0000 0000-0000 0000-0000 yy00-0000 196 + ( ( bytes[pos + 3] & 0x03 ) << 6 ) 197 // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 198 // 0000-0000 0000-0000 0000-0000 00zz-zzzz 199 + ( bytes[pos + 4] & 0x3F ) 200 // -> 0000-00tt uuuu-uuvv wwww-xxxx yyzz-zzzz (03 FF FF FF) 201 ); 202 } 203 else if ( ( bytes[pos] & UnicodeConstants.UTF8_FIVE_BYTES_MASK ) == UnicodeConstants.UTF8_FIVE_BYTES ) 204 { 205 // Six bytes char 206 return ( char ) ( 207 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz 208 // -> 209 // 0s00-0000 0000-0000 0000-0000 0000-0000 210 ( ( bytes[pos] & 0x01 ) << 30 ) 211 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz 212 // -> 213 // 00tt-tttt 0000-0000 0000-0000 0000-0000 214 + ( ( bytes[pos + 1] & 0x3F ) << 24 ) 215 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 216 // 10zz-zzzz -> 217 // 0000-0000 uuuu-uu00 0000-0000 0000-0000 218 + ( ( bytes[pos + 2] & 0x3F ) << 18 ) 219 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 220 // 10zz-zzzz -> 221 // 0000-0000 0000-00vv 0000-0000 0000-0000 222 + ( ( bytes[pos + 3] & 0x30 ) << 12 ) 223 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 224 // 10zz-zzzz -> 225 // 0000-0000 0000-0000 wwww-0000 0000-0000 226 + ( ( bytes[pos + 3] & 0x0F ) << 12 ) 227 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 228 // 10zz-zzzz -> 229 // 0000-0000 0000-0000 0000-xxxx 0000-0000 230 + ( ( bytes[pos + 4] & 0x3C ) << 6 ) 231 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 232 // 10zz-zzzz -> 233 // 0000-0000 0000-0000 0000-0000 yy00-0000 234 + ( ( bytes[pos + 4] & 0x03 ) << 6 ) 235 // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz 236 // -> 237 // 0000-0000 0000-0000 0000-0000 00zz-zzzz 238 + ( bytes[pos + 5] & 0x3F ) 239 // -> 0stt-tttt uuuu-uuvv wwww-xxxx yyzz-zzzz (7F FF FF FF) 240 ); 241 } 242 else 243 { 244 return ( char ) -1; 245 } 246 } 247 } 248 249 250 /** 251 * Return the number of bytes that hold an Unicode char. 252 * 253 * @param car The character to be decoded 254 * @return The number of bytes to hold the char. TODO : Should stop after 255 * the third byte, as a char is only 2 bytes long. 256 */ 257 public static int countNbBytesPerChar( char car ) 258 { 259 if ( ( car & UnicodeConstants.CHAR_ONE_BYTE_MASK ) == 0 ) 260 { 261 return 1; 262 } 263 else if ( ( car & UnicodeConstants.CHAR_TWO_BYTES_MASK ) == 0 ) 264 { 265 return 2; 266 } 267 else if ( ( car & UnicodeConstants.CHAR_THREE_BYTES_MASK ) == 0 ) 268 { 269 return 3; 270 } 271 else if ( ( car & UnicodeConstants.CHAR_FOUR_BYTES_MASK ) == 0 ) 272 { 273 return 4; 274 } 275 else if ( ( car & UnicodeConstants.CHAR_FIVE_BYTES_MASK ) == 0 ) 276 { 277 return 5; 278 } 279 else if ( ( car & UnicodeConstants.CHAR_SIX_BYTES_MASK ) == 0 ) 280 { 281 return 6; 282 } 283 else 284 { 285 return -1; 286 } 287 } 288 289 290 /** 291 * Count the number of bytes included in the given char[]. 292 * 293 * @param chars The char array to decode 294 * @return The number of bytes in the char array 295 */ 296 public static int countBytes( char[] chars ) 297 { 298 if ( chars == null ) 299 { 300 return 0; 301 } 302 303 int nbBytes = 0; 304 int currentPos = 0; 305 306 while ( currentPos < chars.length ) 307 { 308 int nbb = countNbBytesPerChar( chars[currentPos] ); 309 310 // If the number of bytes necessary to encode a character is 311 // above 3, we will need two UTF-16 chars 312 currentPos += ( nbb < 4 ? 1 : 2 ); 313 nbBytes += nbb; 314 } 315 316 return nbBytes; 317 } 318 319 320 /** 321 * Count the number of chars included in the given byte[]. 322 * 323 * @param bytes The byte array to decode 324 * @return The number of char in the byte array 325 */ 326 public static int countChars( byte[] bytes ) 327 { 328 if ( bytes == null ) 329 { 330 return 0; 331 } 332 333 int nbChars = 0; 334 int currentPos = 0; 335 336 while ( currentPos < bytes.length ) 337 { 338 currentPos += countBytesPerChar( bytes, currentPos ); 339 nbChars++; 340 } 341 342 return nbChars; 343 } 344 345 346 /** 347 * Return the Unicode char which is coded in the bytes at the given 348 * position. 349 * 350 * @param car The character to be transformed to an array of bytes 351 * 352 * @return The byte array representing the char 353 * 354 * TODO : Should stop after the third byte, as a char is only 2 bytes long. 355 */ 356 public static byte[] charToBytes( char car ) 357 { 358 byte[] bytes = new byte[countNbBytesPerChar( car )]; 359 360 if ( car <= 0x7F ) 361 { 362 // Single byte char 363 bytes[0] = ( byte ) car; 364 return bytes; 365 } 366 else if ( car <= 0x7FF ) 367 { 368 // two bytes char 369 bytes[0] = ( byte ) ( 0x00C0 + ( ( car & 0x07C0 ) >> 6 ) ); 370 bytes[1] = ( byte ) ( 0x0080 + ( car & 0x3F ) ); 371 } 372 else 373 { 374 // Three bytes char 375 bytes[0] = ( byte ) ( 0x00E0 + ( ( car & 0xF000 ) >> 12 ) ); 376 bytes[1] = ( byte ) ( 0x0080 + ( ( car & 0x0FC0 ) >> 6 ) ); 377 bytes[2] = ( byte ) ( 0x0080 + ( car & 0x3F ) ); 378 } 379 380 return bytes; 381 } 382 383 384 /** 385 * Check if the current char is in the unicodeSubset : all chars but 386 * '\0', '(', ')', '*' and '\' 387 * 388 * @param str The string to check 389 * @param pos Position of the current char 390 * @return True if the current char is in the unicode subset 391 */ 392 public static boolean isUnicodeSubset( String str, int pos ) 393 { 394 if ( ( str == null ) || ( str.length() <= pos ) || ( pos < 0 ) ) 395 { 396 return false; 397 } 398 399 char c = str.charAt( pos ); 400 401 return ( ( c > 127 ) || UnicodeConstants.UNICODE_SUBSET[c] ); 402 } 403 404 405 /** 406 * Check if the current char is in the unicodeSubset : all chars but 407 * '\0', '(', ')', '*' and '\' 408 * 409 * @param c The char to check 410 * @return True if the current char is in the unicode subset 411 */ 412 public static boolean isUnicodeSubset( char c ) 413 { 414 return ( ( c > 127 ) || UnicodeConstants.UNICODE_SUBSET[c] ); 415 } 416 417 418 /** 419 * Check if the current byte is in the unicodeSubset : all chars but 420 * '\0', '(', ')', '*' and '\' 421 * 422 * @param b The byte to check 423 * @return True if the current byte is in the unicode subset 424 */ 425 public static boolean isUnicodeSubset( byte b ) 426 { 427 return ( ( b < 0 ) || ( b > 127 ) || UnicodeConstants.UNICODE_SUBSET[b] ); 428 } 429 430 431 /** 432 * 433 * Writes four bytes of length information to the output stream, followed by the modified UTF-8 representation 434 * of every character in the string str. If str is null, the string value 'null' is written with a length of 0 435 * instead of throwing an NullPointerException. Each character in the string s is converted to a group of one, 436 * two, or three bytes, depending on the value of the character. 437 * 438 * Due to given restrictions (total number of written bytes in a row can't exceed 65535) the total length is 439 * written in the length information (four bytes (writeInt)) and the string is split into smaller parts 440 * if necessary and written. As each character may be converted to a group of maximum 3 bytes and 65535 bytes 441 * can be written at maximum we're on the save side when writing a chunk of only 21845 (65535/3) characters at 442 * once. 443 * 444 * See also {@link java.io.DataOutput#writeUTF(String)}. 445 * 446 * @param objectOutput The objectOutput to write to 447 * @param str The value to write 448 * @throws java.io.IOException If the value can't be written to the file 449 */ 450 public static void writeUTF( ObjectOutput objectOutput, String str ) throws IOException 451 { 452 // Write a 'null' string 453 if ( str == null ) 454 { 455 objectOutput.writeInt( 0 ); 456 objectOutput.writeUTF( "null" ); 457 } 458 else 459 { 460 // Write length of string 461 objectOutput.writeInt( str.length() ); 462 463 StringBuffer strBuf = new StringBuffer( str ); 464 465 // Write the string in portions not larger than 21845 characters 466 while ( strBuf != null ) 467 { 468 if ( strBuf.length() < 21845 ) 469 { 470 objectOutput.writeUTF( strBuf.substring( 0, strBuf.length() ) ); 471 strBuf = null; 472 } 473 else 474 { 475 objectOutput.writeUTF( strBuf.substring( 0, 21845 ) ); 476 strBuf.delete( 0, 21845 ); 477 } 478 } 479 } 480 } 481 482 483 /** 484 * 485 * Reads in a string that has been encoded using a modified UTF-8 format. The general contract of readUTF is 486 * that it reads a representation of a Unicode character string encoded in modified UTF-8 format; this string of 487 * characters is then returned as a String. 488 * 489 * First, four bytes are read (readInt) and used to construct an unsigned 16-bit integer in exactly the manner 490 * of the readUnsignedShort method . This integer value is called the UTF length and specifies the number of 491 * additional bytes to be read. These bytes are then converted to characters by considering them in groups. The 492 * length of each group is computed from the value of the first byte of the group. The byte following a group, if 493 * any, is the first byte of the next group. 494 * 495 *See also {@link java.io.DataInput#readUTF()}. 496 * 497 * @param objectInput The objectInput to read from 498 * @return The read string 499 * @throws java.io.IOException If the value can't be read 500 */ 501 public static String readUTF( ObjectInput objectInput ) throws IOException 502 { 503 StringBuffer strBuf = null; 504 505 // Read length of the string 506 int strLength = objectInput.readInt(); 507 508 // Start reading the string 509 strBuf = new StringBuffer( objectInput.readUTF() ); 510 511 if ( strLength == 0 && strBuf.toString().equals( "null" ) ) 512 { 513 // The special case of a 'null' string 514 return null; 515 } 516 else 517 { 518 while ( strLength > strBuf.length() ) 519 { 520 strBuf.append( objectInput.readUTF() ); 521 } 522 return strBuf.toString(); 523 } 524 } 525}