/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using Lucene.Net.Documents;
using Lucene.Net.Search;
using Lucene.Net.Support;
using NumericTokenStream = Lucene.Net.Analysis.NumericTokenStream;
namespace Lucene.Net.Util
{
/// This is a helper class to generate prefix-encoded representations for numerical values
/// and supplies converters to represent float/double values as sortable integers/longs.
///
/// To quickly execute range queries in Apache Lucene, a range is divided recursively
/// into multiple intervals for searching: The center of the range is searched only with
/// the lowest possible precision in the trie, while the boundaries are matched
/// more exactly. This reduces the number of terms dramatically.
///
/// This class generates terms to achive this: First the numerical integer values need to
/// be converted to strings. For that integer values (32 bit or 64 bit) are made unsigned
/// and the bits are converted to ASCII chars with each 7 bit. The resulting string is
/// sortable like the original integer value. Each value is also prefixed
/// (in the first char) by the shift value (number of bits removed) used
/// during encoding.
///
/// To also index floating point numbers, this class supplies two methods to convert them
/// to integer values by changing their bit layout: ,
/// . You will have no precision loss by
/// converting floating point numbers to integers and back (only that the integer form
/// is not usable). Other data types like dates can easily converted to longs or ints (e.g.
/// date to long: ).
///
/// For easy usage, the trie algorithm is implemented for indexing inside
/// that can index int, long,
/// float, and double. For querying,
/// and implement the query part
/// for the same data types.
///
/// This class can also be used, to generate lexicographically sortable (according
/// ) representations of numeric data types for other
/// usages (e.g. sorting).
///
/// NOTE: This API is experimental and
/// might change in incompatible ways in the next release.
///
///
/// 2.9
///
public sealed class NumericUtils
{
private NumericUtils()
{
} // no instance!
/// The default precision step used by , ,
/// , and as default
///
public const int PRECISION_STEP_DEFAULT = 4;
/// Expert: Longs are stored at lower precision by shifting off lower bits. The shift count is
/// stored as SHIFT_START_LONG+shift in the first character
///
public static char SHIFT_START_LONG = (char) 0x20;
/// Expert: The maximum term length (used for char[] buffer size)
/// for encoding long values.
///
///
///
public const int BUF_SIZE_LONG = 63 / 7 + 2;
/// Expert: Integers are stored at lower precision by shifting off lower bits. The shift count is
/// stored as SHIFT_START_INT+shift in the first character
///
public static char SHIFT_START_INT = (char) 0x60;
/// Expert: The maximum term length (used for char[] buffer size)
/// for encoding int values.
///
///
///
public const int BUF_SIZE_INT = 31 / 7 + 2;
/// Expert: Returns prefix coded bits after reducing the precision by shift bits.
/// This is method is used by .
///
/// the numeric value
///
/// how many bits to strip from the right
///
/// that will contain the encoded chars, must be at least of
/// length
///
/// number of chars written to buffer
///
public static int LongToPrefixCoded(long val, int shift, char[] buffer)
{
if (shift > 63 || shift < 0)
throw new System.ArgumentException("Illegal shift value, must be 0..63");
int nChars = (63 - shift) / 7 + 1, len = nChars + 1;
buffer[0] = (char) (SHIFT_START_LONG + shift);
ulong sortableBits = BitConverter.ToUInt64(BitConverter.GetBytes(val), 0) ^ 0x8000000000000000L;
sortableBits = sortableBits >> shift;
while (nChars >= 1)
{
// Store 7 bits per character for good efficiency when UTF-8 encoding.
// The whole number is right-justified so that lucene can prefix-encode
// the terms more efficiently.
buffer[nChars--] = (char) (sortableBits & 0x7f);
sortableBits = sortableBits >> 7;
}
return len;
}
/// Expert: Returns prefix coded bits after reducing the precision by shift bits.
/// This is method is used by .
///
/// the numeric value
///
/// how many bits to strip from the right
///
public static System.String LongToPrefixCoded(long val, int shift)
{
char[] buffer = new char[BUF_SIZE_LONG];
int len = LongToPrefixCoded(val, shift, buffer);
return new System.String(buffer, 0, len);
}
/// This is a convenience method, that returns prefix coded bits of a long without
/// reducing the precision. It can be used to store the full precision value as a
/// stored field in index.
/// To decode, use .
///
public static System.String LongToPrefixCoded(long val)
{
return LongToPrefixCoded(val, 0);
}
/// Expert: Returns prefix coded bits after reducing the precision by shift bits.
/// This is method is used by .
///
/// the numeric value
///
/// how many bits to strip from the right
///
/// that will contain the encoded chars, must be at least of
/// length
///
/// number of chars written to buffer
///
public static int IntToPrefixCoded(int val, int shift, char[] buffer)
{
if (shift > 31 || shift < 0)
throw new System.ArgumentException("Illegal shift value, must be 0..31");
int nChars = (31 - shift) / 7 + 1, len = nChars + 1;
buffer[0] = (char) (SHIFT_START_INT + shift);
int sortableBits = val ^ unchecked((int) 0x80000000);
sortableBits = Number.URShift(sortableBits, shift);
while (nChars >= 1)
{
// Store 7 bits per character for good efficiency when UTF-8 encoding.
// The whole number is right-justified so that lucene can prefix-encode
// the terms more efficiently.
buffer[nChars--] = (char) (sortableBits & 0x7f);
sortableBits = Number.URShift(sortableBits, 7);
}
return len;
}
/// Expert: Returns prefix coded bits after reducing the precision by shift bits.
/// This is method is used by .
///
/// the numeric value
///
/// how many bits to strip from the right
///
public static System.String IntToPrefixCoded(int val, int shift)
{
char[] buffer = new char[BUF_SIZE_INT];
int len = IntToPrefixCoded(val, shift, buffer);
return new System.String(buffer, 0, len);
}
/// This is a convenience method, that returns prefix coded bits of an int without
/// reducing the precision. It can be used to store the full precision value as a
/// stored field in index.
/// To decode, use .
///
public static System.String IntToPrefixCoded(int val)
{
return IntToPrefixCoded(val, 0);
}
/// Returns a long from prefixCoded characters.
/// Rightmost bits will be zero for lower precision codes.
/// This method can be used to decode e.g. a stored field.
///
/// NumberFormatException if the supplied string is
/// not correctly prefix encoded.
///
///
///
public static long PrefixCodedToLong(System.String prefixCoded)
{
int shift = prefixCoded[0] - SHIFT_START_LONG;
if (shift > 63 || shift < 0)
throw new System.FormatException("Invalid shift value in prefixCoded string (is encoded value really a LONG?)");
ulong sortableBits = 0UL;
for (int i = 1, len = prefixCoded.Length; i < len; i++)
{
sortableBits <<= 7;
char ch = prefixCoded[i];
if (ch > 0x7f)
{
throw new System.FormatException("Invalid prefixCoded numerical value representation (char " + System.Convert.ToString((int) ch, 16) + " at position " + i + " is invalid)");
}
sortableBits |= (ulong) ch;
}
return BitConverter.ToInt64(BitConverter.GetBytes((sortableBits << shift) ^ 0x8000000000000000L), 0);
}
/// Returns an int from prefixCoded characters.
/// Rightmost bits will be zero for lower precision codes.
/// This method can be used to decode e.g. a stored field.
///
/// NumberFormatException if the supplied string is
/// not correctly prefix encoded.
///
///
///
public static int PrefixCodedToInt(System.String prefixCoded)
{
int shift = prefixCoded[0] - SHIFT_START_INT;
if (shift > 31 || shift < 0)
throw new System.FormatException("Invalid shift value in prefixCoded string (is encoded value really an INT?)");
int sortableBits = 0;
for (int i = 1, len = prefixCoded.Length; i < len; i++)
{
sortableBits <<= 7;
char ch = prefixCoded[i];
if (ch > 0x7f)
{
throw new System.FormatException("Invalid prefixCoded numerical value representation (char " + System.Convert.ToString((int) ch, 16) + " at position " + i + " is invalid)");
}
sortableBits |= (int) ch;
}
return (sortableBits << shift) ^ unchecked((int) 0x80000000);
}
/// Converts a double value to a sortable signed long.
/// The value is converted by getting their IEEE 754 floating-point "double format"
/// bit layout and then some bits are swapped, to be able to compare the result as long.
/// By this the precision is not reduced, but the value can easily used as a long.
///
///
///
public static long DoubleToSortableLong(double val)
{
long f = BitConverter.DoubleToInt64Bits(val); // {{Aroush-2.9}} will this work the same as 'java.lang.Double.doubleToRawLongBits()'?
if (f < 0)
f ^= 0x7fffffffffffffffL;
return f;
}
/// Convenience method: this just returns:
/// longToPrefixCoded(doubleToSortableLong(val))
///
public static System.String DoubleToPrefixCoded(double val)
{
return LongToPrefixCoded(DoubleToSortableLong(val));
}
/// Converts a sortable long back to a double.
///
///
public static double SortableLongToDouble(long val)
{
if (val < 0)
val ^= 0x7fffffffffffffffL;
return BitConverter.Int64BitsToDouble(val);
}
/// Convenience method: this just returns:
/// sortableLongToDouble(prefixCodedToLong(val))
///
public static double PrefixCodedToDouble(System.String val)
{
return SortableLongToDouble(PrefixCodedToLong(val));
}
/// Converts a float value to a sortable signed int.
/// The value is converted by getting their IEEE 754 floating-point "float format"
/// bit layout and then some bits are swapped, to be able to compare the result as int.
/// By this the precision is not reduced, but the value can easily used as an int.
///
///
///
public static int FloatToSortableInt(float val)
{
int f = BitConverter.ToInt32(BitConverter.GetBytes(val), 0);
if (f < 0)
f ^= 0x7fffffff;
return f;
}
/// Convenience method: this just returns:
/// intToPrefixCoded(floatToSortableInt(val))
///
public static System.String FloatToPrefixCoded(float val)
{
return IntToPrefixCoded(FloatToSortableInt(val));
}
/// Converts a sortable int back to a float.
///
///
public static float SortableIntToFloat(int val)
{
if (val < 0)
val ^= 0x7fffffff;
return BitConverter.ToSingle(BitConverter.GetBytes(val), 0);
}
/// Convenience method: this just returns:
/// sortableIntToFloat(prefixCodedToInt(val))
///
public static float PrefixCodedToFloat(System.String val)
{
return SortableIntToFloat(PrefixCodedToInt(val));
}
/// Expert: Splits a long range recursively.
/// You may implement a builder that adds clauses to a
/// for each call to its
///
/// method.
/// This method is used by .
///
public static void SplitLongRange(LongRangeBuilder builder, int precisionStep, long minBound, long maxBound)
{
SplitRange(builder, 64, precisionStep, minBound, maxBound);
}
/// Expert: Splits an int range recursively.
/// You may implement a builder that adds clauses to a
/// for each call to its
///
/// method.
/// This method is used by .
///
public static void SplitIntRange(IntRangeBuilder builder, int precisionStep, int minBound, int maxBound)
{
SplitRange(builder, 32, precisionStep, (long) minBound, (long) maxBound);
}
/// This helper does the splitting for both 32 and 64 bit.
private static void SplitRange(System.Object builder, int valSize, int precisionStep, long minBound, long maxBound)
{
if (precisionStep < 1)
throw new System.ArgumentException("precisionStep must be >=1");
if (minBound > maxBound)
return ;
for (int shift = 0; ; shift += precisionStep)
{
// calculate new bounds for inner precision
long diff = 1L << (shift + precisionStep);
long mask = ((1L << precisionStep) - 1L) << shift;
bool hasLower = (minBound & mask) != 0L;
bool hasUpper = (maxBound & mask) != mask;
long nextMinBound = (hasLower?(minBound + diff):minBound) & ~ mask;
long nextMaxBound = (hasUpper?(maxBound - diff):maxBound) & ~ mask;
bool lowerWrapped = nextMinBound < minBound,
upperWrapped = nextMaxBound > maxBound;
if (shift+precisionStep>=valSize || nextMinBound>nextMaxBound || lowerWrapped || upperWrapped)
{
// We are in the lowest precision or the next precision is not available.
AddRange(builder, valSize, minBound, maxBound, shift);
// exit the split recursion loop
break;
}
if (hasLower)
AddRange(builder, valSize, minBound, minBound | mask, shift);
if (hasUpper)
AddRange(builder, valSize, maxBound & ~ mask, maxBound, shift);
// recurse to next precision
minBound = nextMinBound;
maxBound = nextMaxBound;
}
}
/// Helper that delegates to correct range builder
private static void AddRange(System.Object builder, int valSize, long minBound, long maxBound, int shift)
{
// for the max bound set all lower bits (that were shifted away):
// this is important for testing or other usages of the splitted range
// (e.g. to reconstruct the full range). The prefixEncoding will remove
// the bits anyway, so they do not hurt!
maxBound |= (1L << shift) - 1L;
// delegate to correct range builder
switch (valSize)
{
case 64:
((LongRangeBuilder) builder).AddRange(minBound, maxBound, shift);
break;
case 32:
((IntRangeBuilder) builder).AddRange((int) minBound, (int) maxBound, shift);
break;
default:
// Should not happen!
throw new System.ArgumentException("valSize must be 32 or 64.");
}
}
/// Expert: Callback for .
/// You need to overwrite only one of the methods.
/// NOTE: This is a very low-level interface,
/// the method signatures may change in later versions.
///
public abstract class LongRangeBuilder
{
/// Overwrite this method, if you like to receive the already prefix encoded range bounds.
/// You can directly build classical (inclusive) range queries from them.
///
public virtual void AddRange(System.String minPrefixCoded, System.String maxPrefixCoded)
{
throw new System.NotSupportedException();
}
/// Overwrite this method, if you like to receive the raw long range bounds.
/// You can use this for e.g. debugging purposes (print out range bounds).
///
public virtual void AddRange(long min, long max, int shift)
{
AddRange(Lucene.Net.Util.NumericUtils.LongToPrefixCoded(min, shift), Lucene.Net.Util.NumericUtils.LongToPrefixCoded(max, shift));
}
}
/// Expert: Callback for .
/// You need to overwrite only one of the methods.
/// NOTE: This is a very low-level interface,
/// the method signatures may change in later versions.
///
public abstract class IntRangeBuilder
{
/// Overwrite this method, if you like to receive the already prefix encoded range bounds.
/// You can directly build classical range (inclusive) queries from them.
///
public virtual void AddRange(System.String minPrefixCoded, System.String maxPrefixCoded)
{
throw new System.NotSupportedException();
}
/// Overwrite this method, if you like to receive the raw int range bounds.
/// You can use this for e.g. debugging purposes (print out range bounds).
///
public virtual void AddRange(int min, int max, int shift)
{
AddRange(Lucene.Net.Util.NumericUtils.IntToPrefixCoded(min, shift), Lucene.Net.Util.NumericUtils.IntToPrefixCoded(max, shift));
}
}
}
}