/*
 * (c) Copyright 2010 Talis Systems Ltd.
 * All rights reserved.
 * [See end of file]
 */

package dev;

import java.nio.ByteBuffer ;

import migrate.lib.VarInteger ;
import org.openjena.atlas.lib.Pair ;
import org.openjena.atlas.lib.StrUtils ;

/**
 * A compact string held as UTF-8 bytes.
 *
 * <p>See also:
 * <ul>
 * <li>http://www.unicode.org/reports/tr6/ -- a compression scheme for Unicode.</li>
 * <li>ftp://ftp.unicode.org/Public/PROGRAMS/SCSU/</li>
 * </ul>
 *
 * <p>Intended storage format (design notes; the current "V1" implementation
 * below only wraps the UTF-8 bytes of the string and applies none of the
 * prefix substitutions):
 * <pre>
 *   [int - allocation space]
 *   int   - length
 *   bytes - UTF8
 * except:
 *   ints are held as Varints
 *   bytes (FF, 00) is "http://"
 *   bytes (FF, 01) is "http://www."
 * </pre>
 *
 * <p>FF is an illegal UTF8 byte, as a first byte of a UTF-8 codepoint sequence.
 * F5-FF would be 4 byte sequences above 10FFFF.
 * 80-BF not in a first byte.
 *
 * <p>Eventually, will be a slice of a very large ByteBuffer and we will do our own malloc.
 *
 * <pre>
 * 00000000-01111111  00-7F  0-127    Single-byte encoding (compatible with US-ASCII)
 * 10000000-10111111  80-BF  128-191  Second, third, or fourth byte of a multi-byte sequence
 * 11000000-11000001  C0-C1  192-193  Overlong encoding: start of 2-byte sequence, but would encode a code point &le; 127
 * 11000010-11011111  C2-DF  194-223  Start of 2-byte sequence
 * 11100000-11101111  E0-EF  224-239  Start of 3-byte sequence
 * 11110000-11110100  F0-F4  240-244  Start of 4-byte sequence
 * 11110101-11110111  F5-F7  245-247  Restricted by RFC 3629: start of 4-byte sequence for codepoint above 10FFFF
 * 11111000-11111011  F8-FB  248-251  Restricted by RFC 3629: start of 5-byte sequence
 * 11111100-11111101  FC-FD  252-253  Restricted by RFC 3629: start of 6-byte sequence
 * 11111110-11111111  FE-FF  254-255  Invalid: not defined by original UTF-8 specification
 * </pre>
 */
public class StringUTF8
{
    /**
     * Create a compact string holding the given text.
     *
     * <p>The constructor is private, so this factory is the only way to obtain
     * an instance; it therefore never returns {@code null}.
     *
     * @param string the text to store; must not be {@code null}
     * @return a new {@code StringUTF8} for {@code string}
     * @throws NullPointerException if {@code string} is {@code null}
     */
    public static StringUTF8 alloc(String string)
    {
        if ( string == null )
        {
            throw new NullPointerException("string") ;
        }
        return new StringUTF8(string) ;
    }

    // V1 - object allocation.

    /** Number of bytes wrapped by {@link #bytes}, as recorded at construction. */
    private final VarInteger length ;

    /** The bytes produced by {@code StrUtils.asUTF8bytes} for the original string. */
    private final ByteBuffer bytes ;

    /**
     * Convert the string to bytes and wrap them; the byte count is recorded
     * as a {@code VarInteger}.
     *
     * @param string the text to store
     */
    private StringUTF8(String string)
    {
        // A copy ...
        byte[] rawbytes = StrUtils.asUTF8bytes(string) ;
        bytes = ByteBuffer.wrap(rawbytes) ;
        length = VarInteger.valueOf(rawbytes.length) ;
    }

    /**
     * Convert the stored bytes back to a {@code String}.
     *
     * <p>Reads the whole backing array of the buffer. That is exactly the
     * string's bytes here because the buffer is created by wrapping a
     * dedicated array in the constructor.
     *
     * @return the string form of the stored bytes
     */
    public String asString()
    {
        return StrUtils.fromUTF8bytes(bytes.array()) ;
    }

    /** Same as {@link #asString()}. */
    @Override
    public String toString()
    {
        return asString() ;
    }

    /** The single allocation space currently available (1000*1000 bytes). */
    static ByteBuffer space1 = ByteBuffer.allocate(1000*1000) ;

    /**
     * Placeholder: not implemented yet, always returns {@code null}.
     *
     * @param string the text space would be allocated for (currently ignored)
     * @return always {@code null}
     */
    private static Pair allocSpace(String string)
    {
        return null ;
    }

    /**
     * Look up the buffer for an allocation space.
     *
     * <p>Only allocation space 0 exists; it maps to {@link #space1}.
     * {@code allocId} is currently not used.
     *
     * @param allocSpaceId id of the allocation space; its value must be 0
     * @param allocId      id of the allocation within the space (unused)
     * @return the buffer backing the allocation space
     * @throws IllegalArgumentException if the value of {@code allocSpaceId} is not 0
     */
    private static ByteBuffer allocSpace(VarInteger allocSpaceId, VarInteger allocId)
    {
        if ( allocSpaceId.value() != 0 )
        {
            throw new IllegalArgumentException("Unknown allocation space id: " + allocSpaceId.value()) ;
        }
        return space1 ;
    }
}

/*
 * (c) Copyright 2010 Talis Systems Ltd.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */