/*
 * (c) Copyright 2010 Talis Systems Ltd.
 * (c) Copyright 2011 Epimorphics Ltd.
 * All rights reserved.
 * [See end of file]
 */

package org.openjena.atlas.io;

import java.io.ByteArrayInputStream ;
import java.io.IOException ;
import java.io.InputStream ;
import java.io.Reader ;

import org.openjena.atlas.AtlasException ;

/** Fast and streaming UTF-8 */
public final class InStreamUTF8 extends Reader implements CharStream
{
    // TODO Add line and col counts.
    // See arq.utf8. 
    // TODO Better ready()/available() in InputStreamBuffered
    
    // The standard Java way of doing this is via charset decoders.
    // One small disadvantage is that bad UTF-8 does not get flagged as to
    // the byte position of the error.
    
    // This class collects knowledge of how UTF-8 encoding works;
    // the Java classes are usually slightly faster compared to using
    // this class with an InputStreamBuffered but the difference is small.
    // This class generated meaningful error messages (when line/col added).
    
    // The Java classes copy-convert a byte buffer into a char buffer.
    // Sometimes, for example in a parser, this isn't a convenient model
    // because the app is looking one character at a time and accumulating
    // the chars until it sees the end of a token of arbitrary length
    // or processes escape sequences.  
    //
    // The app might use a StringBuilder so the bytes get copied into
    // a char buffer and out again.  Instead, this code assumes the
    // app is in charge of that.
    
    // UTF-8 (UTF-16) is different from other character sets because 
    // the relationship with Java's internal character representation is
    // arithmetic, not a character mapping. 
    
    // Todo: chars > 16 bits -> surrogate pairs. 
    
    /*
     * http://en.wikipedia.org/wiki/UTF-8
     * http://tools.ietf.org/html/rfc3629
     * http://www.ietf.org/rfc/rfc3629.txt
     * 
     * Unicode                                  Byte1       Byte2       Byte3       Byte4
     * U+0000–U+007F    0 to 127                0xxxxxxx
     * U+0080–U+07FF    128 to 2,047            110yyyxx    10xxxxxx 
     * U+0800–U+FFFF    2,048 to 65,535         1110yyyy    10yyyyxx    10xxxxxx
     * U+10000–U+10FFFF 65,536 to 1,114,111     11110zzz    10zzyyyy    10yyyyxx    10xxxxxx
     * 
     * Restricted cases (RFC 3629)
     * 11110101-11110111    F5-F7   245-247     start of 4-byte sequence for codepoint above 10FFFF
     * 11111000-11111011    F8-FB   248-251     start of 5-byte sequence
     * 11111100-11111101    FC-FD   252-253     start of 6-byte sequence
     * 
     * Illegal:
     * 11000000-11000001    C0-C1   192-193     Overlong encoding: start of a 2-byte sequence, but code point <= 127
     * 11111110-11111111    FE-FF   254-255     Invalid: not defined by original UTF-8 specification
     */
    
    // There is some sort of stream decoder backing the Sun implementation 
    // of CharsetDecoder (sun.io.StreamDecoder) but it's not on all platforms
    // I want a known decoder specifically for UTF8
    
    private InputStreamBuffered input ;
    //private long count = 0 ;

    public InStreamUTF8(InputStream in)
    {
        if ( in instanceof InputStreamBuffered )
        {
            input = (InputStreamBuffered)in ;
            return ;
        }
        input = new InputStreamBuffered(in) ;
    }
    
    public InStreamUTF8(InputStreamBuffered in) { input = in ; }
    

    @Override
    public boolean ready() throws IOException
    {
        return input.available() > 0 ;
    }
    
    @Override
    public void close() throws IOException
    { input.close() ; }
    
    //@Override
    public void closeStream()
    { IO.close(input) ; }

    @Override
    public int read(char[] cbuf, int off, int len) throws IOException
    {
        // Doing this on a block of bytes may be faster.
        for ( int i = off ; i < off+len ; i++ )
        {
            int x = read() ;
            if ( x == -1 )
            {
                if ( i == off )
                    return -1 ;
                return (i-off) ;
            }
            cbuf[i] = (char)x ;
        }
        return len ; 
    }

    @Override
    public final int read() throws IOException
    { 
        int ch = advance(input) ;
        //if ( ! Character.isDefined(ch) ) throw new AtlasException(String.format("Undefined codepoint: 0x%04X", ch)) ;
        return ch ;
    }
    
    
    /** Next codepoint, given the first byte of any UTF-8 byte sequence is already known.
     *  Not necessarily a valid char (this function can be used a straight UTF8 decoder
     */
    public final int advance()
    { return advance(input) ; }
    
    /** Next codepoint */
    public static final int advance(InputStreamBuffered input)
    {
        int x = input.advance() ;
        if ( x == -1 ) return -1 ;
        return advance(input, x) ;
    }
    
    /** Next codepoint, given the first byte of any UTF-8 byte sequence is already known.
     * Not necessarily a valid char (this function can be used a straight UTF8 decoder
     */
    
    public static final int advance(InputStreamBuffered input, int x)
    {
        //count++ ;
        // Fastpath
        if ( x == -1 || x <= 127 ) 
        {
            //count++ ;
            return x ;
        }

        // 10 => extension byte
        // 110..... => 2 bytes
        if ( (x & 0xE0) == 0xC0 )
        {
            int ch = readMultiBytes(input, x & 0x1F, 2) ;
            // count += 2 ;
            return ch ;
            
        }
        //  1110.... => 3 bytes : 16 bits : not outside 16bit chars 
        if ( (x & 0xF0) == 0xE0 ) 
        {
            int ch = readMultiBytes(input, x & 0x0F, 3) ;
            // count += 3 ;
            //if ( ! Character.isDefined(ch) ) throw new AtlasException(String.format("Undefined codepoint: 0x%04X", ch)) ;
            return ch ;
        }

        // Looking like 4 byte charcater.
        int ch = -2 ;
        // 11110zzz => 4 bytes.
        if ( (x & 0xF8) == 0xF0 )
        {
             ch = readMultiBytes(input, x & 0x08, 4) ;
             // Opsp - need two returns. Character.toChars(ch, chars, 0) ;
             // count += 4 ;
        }
             
        else 
            IO.exception(new IOException("Illegal UTF-8: "+x)) ;

        // This test will go off.  We're processing a 4 byte sequence but Java only supports 16 bit chars. 
        if ( ch > Character.MAX_VALUE )
            throw new AtlasException("Out of range character (must use a surrogate pair)") ;
        if ( ! Character.isDefined(ch) ) throw new AtlasException(String.format("Undefined codepoint: 0x%04X", ch)) ;
        return ch ;
    }
    
    private static int readMultiBytes(InputStreamBuffered input, int start, int len) //throws IOException
    {
        //System.out.print(" -("+len+")") ; p(start) ;
        
        int x = start ;
        for ( int i = 0 ; i < len-1 ; i++ )
        {
            int x2 = input.advance() ;
            if ( x2 == -1 )
                throw new AtlasException("Premature end to UTF-8 sequence at end of input") ;
            
            if ( (x2 & 0xC0) != 0x80 )
                //throw new AtlasException("Illegal UTF-8 processing character "+count+": "+x2) ;
                throw new AtlasException(String.format("Illegal UTF-8 processing character: 0x%04X",x2)) ;
            // 6 bits of x2
            x = (x << 6) | (x2 & 0x3F); 
        }
        return x ;
    }

    private static void p(int ch)
    {
        System.out.printf(" %02X", ch) ;
        if ( ch == -1 )
            System.out.println();
    }
    
    public static String decode(byte[] bytes)
    {
        try
        {
            char[] chars = new char[bytes.length] ;
            InputStream in = new ByteArrayInputStream(bytes) ;
            StringBuilder buff = new StringBuilder() ;
            Reader r = new InStreamUTF8(in) ;
            int len ;
            len = r.read(chars) ;
            return new String(chars, 0, len) ;
        } catch (IOException ex)
        {
            IO.exception(ex) ;
            return null ;
        }
    }
}

/*
 * (c) Copyright 2010 Talis Systems Ltd.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */