package org.apache.lucene.util; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.analysis.NumericTokenStream; // for javadocs import org.apache.lucene.document.NumericField; // for javadocs import org.apache.lucene.search.NumericRangeQuery; // for javadocs import org.apache.lucene.search.NumericRangeFilter; // for javadocs /** * This is a helper class to generate prefix-encoded representations for numerical values * and supplies converters to represent float/double values as sortable integers/longs. * *
To quickly execute range queries in Apache Lucene, a range is divided recursively * into multiple intervals for searching: The center of the range is searched only with * the lowest possible precision in the trie, while the boundaries are matched * more exactly. This reduces the number of terms dramatically. * *
This class generates terms to achieve this: First the numerical integer values need to
* be converted to strings. For that integer values (32 bit or 64 bit) are made unsigned
* and the bits are converted to ASCII chars with each 7 bit. The resulting string is
* sortable like the original integer value. Each value is also prefixed
* (in the first char) by the shift value (number of bits removed) used
* during encoding.
*
*
To also index floating point numbers, this class supplies two methods to convert them * to integer values by changing their bit layout: {@link #doubleToSortableLong}, * {@link #floatToSortableInt}. You will have no precision loss by * converting floating point numbers to integers and back (only that the integer form * is not usable). Other data types like dates can easily converted to longs or ints (e.g. * date to long: {@link java.util.Date#getTime}). * *
For easy usage, the trie algorithm is implemented for indexing inside
* {@link NumericTokenStream} that can index int, long,
* float, and double. For querying,
* {@link NumericRangeQuery} and {@link NumericRangeFilter} implement the query part
* for the same data types.
*
*
This class can also be used, to generate lexicographically sortable (according * {@link String#compareTo(String)}) representations of numeric data types for other * usages (e.g. sorting). * *
NOTE: This API is experimental and
* might change in incompatible ways in the next release.
*
* @since 2.9
*/
public final class NumericUtils {
private NumericUtils() {} // no instance!
/**
* The default precision step used by {@link NumericField}, {@link NumericTokenStream},
* {@link NumericRangeQuery}, and {@link NumericRangeFilter} as default
*/
public static final int PRECISION_STEP_DEFAULT = 4;
/**
* Expert: Longs are stored at lower precision by shifting off lower bits. The shift count is
* stored as SHIFT_START_LONG+shift in the first character
*/
public static final char SHIFT_START_LONG = (char)0x20;
/**
* Expert: The maximum term length (used for char[] buffer size)
* for encoding long values.
* @see #longToPrefixCoded(long,int,char[])
*/
public static final int BUF_SIZE_LONG = 63/7 + 2;
/**
* Expert: Integers are stored at lower precision by shifting off lower bits. The shift count is
* stored as SHIFT_START_INT+shift in the first character
*/
public static final char SHIFT_START_INT = (char)0x60;
/**
* Expert: The maximum term length (used for char[] buffer size)
* for encoding int values.
* @see #intToPrefixCoded(int,int,char[])
*/
public static final int BUF_SIZE_INT = 31/7 + 2;
/**
* Expert: Returns prefix coded bits after reducing the precision by shift bits.
* This is method is used by {@link NumericTokenStream}.
* @param val the numeric value
* @param shift how many bits to strip from the right
* @param buffer that will contain the encoded chars, must be at least of {@link #BUF_SIZE_LONG}
* length
* @return number of chars written to buffer
*/
public static int longToPrefixCoded(final long val, final int shift, final char[] buffer) {
if (shift>63 || shift<0)
throw new IllegalArgumentException("Illegal shift value, must be 0..63");
int nChars = (63-shift)/7 + 1, len = nChars+1;
buffer[0] = (char)(SHIFT_START_LONG + shift);
long sortableBits = val ^ 0x8000000000000000L;
sortableBits >>>= shift;
while (nChars>=1) {
// Store 7 bits per character for good efficiency when UTF-8 encoding.
// The whole number is right-justified so that lucene can prefix-encode
// the terms more efficiently.
buffer[nChars--] = (char)(sortableBits & 0x7f);
sortableBits >>>= 7;
}
return len;
}
/**
* Expert: Returns prefix coded bits after reducing the precision by shift bits.
* This is method is used by {@link LongRangeBuilder}.
* @param val the numeric value
* @param shift how many bits to strip from the right
*/
public static String longToPrefixCoded(final long val, final int shift) {
final char[] buffer = new char[BUF_SIZE_LONG];
final int len = longToPrefixCoded(val, shift, buffer);
return new String(buffer, 0, len);
}
/**
* This is a convenience method, that returns prefix coded bits of a long without
* reducing the precision. It can be used to store the full precision value as a
* stored field in index.
*
To decode, use {@link #prefixCodedToLong}.
*/
public static String longToPrefixCoded(final long val) {
return longToPrefixCoded(val, 0);
}
/**
* Expert: Returns prefix coded bits after reducing the precision by shift bits.
* This is method is used by {@link NumericTokenStream}.
* @param val the numeric value
* @param shift how many bits to strip from the right
* @param buffer that will contain the encoded chars, must be at least of {@link #BUF_SIZE_INT}
* length
* @return number of chars written to buffer
*/
public static int intToPrefixCoded(final int val, final int shift, final char[] buffer) {
if (shift>31 || shift<0)
throw new IllegalArgumentException("Illegal shift value, must be 0..31");
int nChars = (31-shift)/7 + 1, len = nChars+1;
buffer[0] = (char)(SHIFT_START_INT + shift);
int sortableBits = val ^ 0x80000000;
sortableBits >>>= shift;
while (nChars>=1) {
// Store 7 bits per character for good efficiency when UTF-8 encoding.
// The whole number is right-justified so that lucene can prefix-encode
// the terms more efficiently.
buffer[nChars--] = (char)(sortableBits & 0x7f);
sortableBits >>>= 7;
}
return len;
}
/**
* Expert: Returns prefix coded bits after reducing the precision by shift bits.
* This is method is used by {@link IntRangeBuilder}.
* @param val the numeric value
* @param shift how many bits to strip from the right
*/
public static String intToPrefixCoded(final int val, final int shift) {
final char[] buffer = new char[BUF_SIZE_INT];
final int len = intToPrefixCoded(val, shift, buffer);
return new String(buffer, 0, len);
}
/**
* This is a convenience method, that returns prefix coded bits of an int without
* reducing the precision. It can be used to store the full precision value as a
* stored field in index.
*
To decode, use {@link #prefixCodedToInt}.
*/
public static String intToPrefixCoded(final int val) {
return intToPrefixCoded(val, 0);
}
/**
* Returns a long from prefixCoded characters.
* Rightmost bits will be zero for lower precision codes.
* This method can be used to decode e.g. a stored field.
* @throws NumberFormatException if the supplied string is
* not correctly prefix encoded.
* @see #longToPrefixCoded(long)
*/
public static long prefixCodedToLong(final String prefixCoded) {
final int shift = prefixCoded.charAt(0)-SHIFT_START_LONG;
if (shift>63 || shift<0)
throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really a LONG?)");
long sortableBits = 0L;
for (int i=1, len=prefixCoded.length(); i This method is used by {@link NumericRangeQuery}.
*/
public static void splitLongRange(final LongRangeBuilder builder,
final int precisionStep, final long minBound, final long maxBound
) {
splitRange(builder, 64, precisionStep, minBound, maxBound);
}
/**
* Expert: Splits an int range recursively.
* You may implement a builder that adds clauses to a
* {@link org.apache.lucene.search.BooleanQuery} for each call to its
* {@link IntRangeBuilder#addRange(String,String)}
* method.
* This method is used by {@link NumericRangeQuery}.
*/
public static void splitIntRange(final IntRangeBuilder builder,
final int precisionStep, final int minBound, final int maxBound
) {
splitRange(builder, 32, precisionStep, (long)minBound, (long)maxBound);
}
/** This helper does the splitting for both 32 and 64 bit. */
private static void splitRange(
final Object builder, final int valSize,
final int precisionStep, long minBound, long maxBound
) {
if (precisionStep < 1)
throw new IllegalArgumentException("precisionStep must be >=1");
if (minBound > maxBound) return;
for (int shift=0; ; shift += precisionStep) {
// calculate new bounds for inner precision
final long diff = 1L << (shift+precisionStep),
mask = ((1L< NOTE: This is a very low-level interface,
* the method signatures may change in later versions.
*/
public static abstract class LongRangeBuilder {
/**
* Overwrite this method, if you like to receive the already prefix encoded range bounds.
* You can directly build classical (inclusive) range queries from them.
*/
public void addRange(String minPrefixCoded, String maxPrefixCoded) {
throw new UnsupportedOperationException();
}
/**
* Overwrite this method, if you like to receive the raw long range bounds.
* You can use this for e.g. debugging purposes (print out range bounds).
*/
public void addRange(final long min, final long max, final int shift) {
addRange(longToPrefixCoded(min, shift), longToPrefixCoded(max, shift));
}
}
/**
* Expert: Callback for {@link #splitIntRange}.
* You need to overwrite only one of the methods.
* NOTE: This is a very low-level interface,
* the method signatures may change in later versions.
*/
public static abstract class IntRangeBuilder {
/**
* Overwrite this method, if you like to receive the already prefix encoded range bounds.
* You can directly build classical range (inclusive) queries from them.
*/
public void addRange(String minPrefixCoded, String maxPrefixCoded) {
throw new UnsupportedOperationException();
}
/**
* Overwrite this method, if you like to receive the raw int range bounds.
* You can use this for e.g. debugging purposes (print out range bounds).
*/
public void addRange(final int min, final int max, final int shift) {
addRange(intToPrefixCoded(min, shift), intToPrefixCoded(max, shift));
}
}
}
double value to a sortable signed long.
* The value is converted by getting their IEEE 754 floating-point "double format"
* bit layout and then some bits are swapped, to be able to compare the result as long.
* By this the precision is not reduced, but the value can easily used as a long.
* @see #sortableLongToDouble
*/
public static long doubleToSortableLong(double val) {
long f = Double.doubleToRawLongBits(val);
if (f<0) f ^= 0x7fffffffffffffffL;
return f;
}
/**
* Convenience method: this just returns:
* longToPrefixCoded(doubleToSortableLong(val))
*/
public static String doubleToPrefixCoded(double val) {
return longToPrefixCoded(doubleToSortableLong(val));
}
/**
* Converts a sortable long back to a double.
* @see #doubleToSortableLong
*/
public static double sortableLongToDouble(long val) {
if (val<0) val ^= 0x7fffffffffffffffL;
return Double.longBitsToDouble(val);
}
/**
* Convenience method: this just returns:
* sortableLongToDouble(prefixCodedToLong(val))
*/
public static double prefixCodedToDouble(String val) {
return sortableLongToDouble(prefixCodedToLong(val));
}
/**
* Converts a float value to a sortable signed int.
* The value is converted by getting their IEEE 754 floating-point "float format"
* bit layout and then some bits are swapped, to be able to compare the result as int.
* By this the precision is not reduced, but the value can easily used as an int.
* @see #sortableIntToFloat
*/
public static int floatToSortableInt(float val) {
int f = Float.floatToRawIntBits(val);
if (f<0) f ^= 0x7fffffff;
return f;
}
/**
* Convenience method: this just returns:
* intToPrefixCoded(floatToSortableInt(val))
*/
public static String floatToPrefixCoded(float val) {
return intToPrefixCoded(floatToSortableInt(val));
}
/**
* Converts a sortable int back to a float.
* @see #floatToSortableInt
*/
public static float sortableIntToFloat(int val) {
if (val<0) val ^= 0x7fffffff;
return Float.intBitsToFloat(val);
}
/**
* Convenience method: this just returns:
* sortableIntToFloat(prefixCodedToInt(val))
*/
public static float prefixCodedToFloat(String val) {
return sortableIntToFloat(prefixCodedToInt(val));
}
/**
* Expert: Splits a long range recursively.
* You may implement a builder that adds clauses to a
* {@link org.apache.lucene.search.BooleanQuery} for each call to its
* {@link LongRangeBuilder#addRange(String,String)}
* method.
*