// -----------------------------------------------------------------------
//
//
// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Some of this code came from the excellent Unicode
// conversion examples from: http://www.unicode.org/Public/PROGRAMS/CVTUTF
//
// Full Copyright for that code follows:
// Copyright 2001-2004 Unicode, Inc.
//
// Disclaimer
//
// This source code is provided as is by Unicode, Inc. No claims are
// made as to fitness for any particular purpose. No warranties of any
// kind are expressed or implied. The recipient agrees to determine
// applicability of information provided. If this file has been
// purchased on magnetic or optical media from Unicode, Inc., the
// sole remedy for any claim will be exchange of defective media
// within 90 days of receipt.
//
// Limitations on Rights to Redistribute This Code
//
// Unicode, Inc. hereby grants the right to freely use the information
// supplied in this file in the creation of products supporting the
// Unicode Standard, and to make copies of this file in any form
// for internal or external distribution as long as this notice
// remains attached.
//
// Additional code came from the IBM ICU library.
//
// http://www.icu-project.org
//
// Full Copyright for that code follows.
//
//
// Copyright (C) 1999-2010, International Business Machines
// Corporation and others. All Rights Reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, and/or sell copies of the
// Software, and to permit persons to whom the Software is furnished to do so,
// provided that the above copyright notice(s) and this permission notice appear
// in all copies of the Software and that both the above copyright notice(s) and
// this permission notice appear in supporting documentation.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
// IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
// LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
// ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
// IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
// OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
//
// Except as contained in this notice, the name of a copyright holder shall not
// be used in advertising or otherwise to promote the sale, use or other
// dealings in this Software without prior written authorization of the
// copyright holder.
//
//
// -----------------------------------------------------------------------
namespace Lucene.Net.Util
{
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Support;
/// <summary>
/// Utility for encoding .NET UTF-16 text (<c>char[]</c> or <c>string</c>) into UTF-8 bytes
/// without always allocating a new <c>byte[]</c>: the destination's existing buffer is
/// reused whenever it is already large enough.
/// </summary>
public class UnicodeUtil
{
    /// <summary>
    /// Unicode Surrogate High Start (0xD800)
    /// </summary>
    public static readonly int UnicodeSurrogateHighStart = 0xD800;

    /// <summary>
    /// Unicode Surrogate High End (0xDBFF)
    /// </summary>
    public static readonly int UnicodeSurrogateHighEnd = 0xDBFF;

    /// <summary>
    /// Unicode Surrogate Low Start (0xDC00)
    /// </summary>
    public static readonly int UnicodeSurrogateLowStart = 0xDC00;

    /// <summary>
    /// Unicode Surrogate Low End (0xDFFF)
    /// </summary>
    public static readonly int UnicodeSurrogateLowEnd = 0xDFFF;

    /// <summary>
    /// Unicode Replacement Character (0xFFFD)
    /// </summary>
    public static readonly int UnicodeReplacementCharacter = 0xFFFD;

    /// <summary>
    /// Character Minimum Supplementary Code Point (0x10000) equivalent to java's
    /// Character.MIN_SUPPLEMENTARY_CODE_POINT
    /// </summary>
    public static readonly int CharacterMinimumSupplementaryCodePoint = 0x10000;

    //// private static readonly long unicodeMaxBmp = 0x0000FFFF;

    // Number of bits the high (lead) surrogate is shifted by when combining a surrogate pair.
    private static readonly long halfShift = 10;

    //// private static readonly long halfMask = 0x3FFL;

    /*
    ///
    /// Shift value for lead surrogate to form a supplementary character.
    ///
    private static readonly int leadSurrogateShift = 10;
    ///
    /// Mask to retrieve the significant value from a trail surrogate.
    ///
    private static readonly int trailSurrogateMask = 0x3FF;
    ///
    /// Trail surrogate minimum value (0xDC00)
    ///
    private static readonly int trailSurrogateMinValue = 0xDC00;
    ///
    /// Lead surrogate minimum value (0xD800)
    ///
    private static readonly int leadSurrogateMinValue = 0xD800;
    ///
    /// The minimum value for Supplementary code points
    ///
    private static readonly int supplementaryMinValue = 0x10000;
    ///
    /// Value that all lead surrogate starts with
    ///
    private static readonly int LeadSurrogateOffset = leadSurrogateMinValue
    - (supplementaryMinValue >> leadSurrogateShift);
    */

    // Constant folded into the surrogate-pair computation so that
    // (high << 10) + low + surrogateOffset
    // equals 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00).
    // NOTE: static initializers run in textual order, so this must stay below the
    // fields it reads.
    private static readonly int surrogateOffset = CharacterMinimumSupplementaryCodePoint -
        (UnicodeSurrogateHighStart << (int)halfShift) -
        UnicodeSurrogateLowStart;

    /// <summary>
    /// Encodes a range of UTF-16 code units from a <c>char[]</c> into UTF-8, storing the
    /// result in <paramref name="destination"/>.
    /// </summary>
    /// <param name="source">The UTF-16 characters to encode.</param>
    /// <param name="offset">Index of the first code unit in <paramref name="source"/> to encode.</param>
    /// <param name="length">Number of UTF-16 code units to encode.</param>
    /// <param name="destination">Receives the UTF-8 bytes; its offset is reset to 0 and its length set to the number of bytes written.</param>
    internal static void UTF16toUTF8(char[] source, int offset, int length, BytesRef destination)
    {
        UTF16toUTF8(source.ToCharSequence(), offset, length, destination);
    }

    /// <summary>
    /// Encodes a range of UTF-16 code units from a <c>string</c> into UTF-8, storing the
    /// result in <paramref name="destination"/>.
    /// </summary>
    /// <param name="source">The UTF-16 string to encode.</param>
    /// <param name="offset">Index of the first code unit in <paramref name="source"/> to encode.</param>
    /// <param name="length">Number of UTF-16 code units to encode.</param>
    /// <param name="destination">Receives the UTF-8 bytes; its offset is reset to 0 and its length set to the number of bytes written.</param>
    internal static void UTF16toUTF8(string source, int offset, int length, BytesRef destination)
    {
        UTF16toUTF8(source.ToCharSequence(), offset, length, destination);
    }

    // TODO: change source to IEnumerable once Portable Class Libraries support IEnumerable on string.

    /// <summary>
    /// Shared implementation: encodes <paramref name="length"/> UTF-16 code units of
    /// <paramref name="sequence"/>, starting at <paramref name="offset"/>, as UTF-8.
    /// The bytes are written from index 0 of the destination buffer. If the current buffer
    /// is smaller than <c>length * 4</c> it is replaced by a new array of that size
    /// (previous contents are not copied). A valid surrogate pair becomes one four-byte
    /// sequence; an unpaired or out-of-order surrogate is replaced by U+FFFD (EF BF BD).
    /// </summary>
    /// <param name="sequence">The UTF-16 code units to encode.</param>
    /// <param name="offset">Index of the first code unit to encode.</param>
    /// <param name="length">Number of code units to encode.</param>
    /// <param name="destination">Receives the UTF-8 bytes; its offset is reset to 0 and its length set to the number of bytes written.</param>
    private static void UTF16toUTF8(ICharSequence sequence, int offset, int length, BytesRef destination)
    {
        int position = 0;
        int i = offset, end = offset + length, maxLength = length * 4;

        // Reuse the caller's buffer when it is big enough for the worst case.
        byte[] bytes = destination.Bytes;
        if (bytes.Length < maxLength)
        {
            bytes = destination.Bytes = new byte[maxLength];
        }

        destination.Offset = 0;

        while (i < end)
        {
            int codeUnit = sequence.CharAt(i++);

            if (codeUnit < 0x80)
            {
                // U+0000..U+007F: one byte.
                bytes[position++] = (byte)codeUnit;
            }
            else if (codeUnit < 0x800)
            {
                // U+0080..U+07FF: two bytes.
                bytes[position++] = (byte)(0xC0 | (codeUnit >> 6));
                bytes[position++] = (byte)(0x80 | (codeUnit & 0x3F));
            }
            else if (codeUnit < 0xD800 || codeUnit > 0xDFFF)
            {
                // Remaining BMP characters outside the surrogate range U+D800..U+DFFF:
                // three bytes. The two range tests must be OR-ed; with AND the condition
                // can never hold and every such character would be treated as a surrogate.
                bytes[position++] = (byte)(0xE0 | (codeUnit >> 12));
                bytes[position++] = (byte)(0x80 | ((codeUnit >> 6) & 0x3F));
                bytes[position++] = (byte)(0x80 | (codeUnit & 0x3F));
            }
            else
            {
                // Surrogate range. Only a high surrogate (below 0xDC00) that is followed,
                // within the requested range, by a low surrogate forms a valid pair.
                if (codeUnit < 0xDC00 && i < end)
                {
                    int trail = sequence.CharAt(i);

                    if (trail >= 0xDC00 && trail <= 0xDFFF)
                    {
                        // Combine the pair into a supplementary code point and consume
                        // the low surrogate.
                        int codePoint = (codeUnit << 10) + trail + surrogateOffset;
                        i++;

                        bytes[position++] = (byte)(0xF0 | (codePoint >> 18));
                        bytes[position++] = (byte)(0x80 | ((codePoint >> 12) & 0x3F));
                        bytes[position++] = (byte)(0x80 | ((codePoint >> 6) & 0x3F));
                        bytes[position++] = (byte)(0x80 | (codePoint & 0x3F));
                        continue;
                    }
                }

                // Unpaired high surrogate or out-of-order low surrogate: emit the UTF-8
                // encoding of the replacement character U+FFFD.
                bytes[position++] = 0xEF;
                bytes[position++] = 0xBF;
                bytes[position++] = 0xBD;
            }
        }

        destination.Length = position;
    }
}
}