/**
 * Get the escaped hex representation of a unicode codepoint.
 *
 * The codepoint is passed as a hexadecimal string (for example "00E9"). It is
 * encoded as UTF-8 and every resulting byte is written as a literal "\xNN"
 * escape (backslash, "x", two lowercase hex digits), so the returned text can
 * be embedded in a double-quoted PHP string in generated source code.
 *
 * Each byte is always written with two hex digits. A one-digit escape such
 * as "\xa" would be misread if the character following it happened to be a
 * hex digit.
 *
 * What is going on:
 * http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&item_id=IWS-AppendixA
 * http://developers.sun.com/dev/gadc/technicalpublications/articles/utf8.html
 *
 * @param string $codepoint Codepoint in hexadecimal notation, e.g. "0041".
 * @return string One "\xNN" escape per UTF-8 byte, or an empty string when
 *                the value is 0x110000 or larger.
 */
function getHexStringFromCodepoint( $codepoint )
{
    // The argument arrives as hex text; convert it to a number first.
    $codepoint = hexdec( $codepoint );

    // The comments below explain what is done with the bitwise calculations
    // (U is the codepoint, Cn the n-th byte of the UTF-8 sequence).
    if ( $codepoint < 0x80 )
    {
        // C1 = U
        $bytes = array( $codepoint );
    }
    elseif ( $codepoint < 0x800 )
    {
        // C1 = U \ 64 + 192
        // C2 = U mod 64 + 128
        $bytes = array(
            $codepoint >> 6 | 0xc0,
            $codepoint & 0x3f | 0x80,
        );
    }
    elseif ( $codepoint < 0x10000 )
    {
        // C1 = U \ 4096 + 224
        // C2 = (U mod 4096) \ 64 + 128
        // C3 = U mod 64 + 128
        $bytes = array(
            $codepoint >> 12 | 0xe0,
            $codepoint >> 6 & 0x3f | 0x80,
            $codepoint & 0x3f | 0x80,
        );
    }
    elseif ( $codepoint < 0x110000 )
    {
        // C1 = U \ 262144 + 240
        // C2 = (U mod 262144) \ 4096 + 128
        // C3 = (U mod 4096) \ 64 + 128
        // C4 = U mod 64 + 128
        $bytes = array(
            $codepoint >> 18 | 0xf0,
            $codepoint >> 12 & 0x3f | 0x80,
            $codepoint >> 6 & 0x3f | 0x80,
            $codepoint & 0x3f | 0x80,
        );
    }
    else
    {
        // Outside the Unicode range: nothing to encode.
        return '';
    }

    $result = '';
    foreach ( $bytes as $byte )
    {
        // %02x zero-pads, so bytes below 0x10 still yield two hex digits.
        $result .= sprintf( '\\x%02x', $byte );
    }

    return $result;
}