/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ using System; using Lucene.Net.Support; using DocIdSet = Lucene.Net.Search.DocIdSet; using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator; namespace Lucene.Net.Util { /// An "open" BitSet implementation that allows direct access to the array of words /// storing the bits. ///

/// Unlike java.util.bitset, the fact that bits are packed into an array of longs /// is part of the interface. This allows efficient implementation of other algorithms /// by someone other than the author. It also allows one to efficiently implement /// alternate serialization or interchange formats. ///

/// OpenBitSet is faster than java.util.BitSet in most operations /// and *much* faster at calculating cardinality of sets and results of set operations. /// It can also handle sets of larger cardinality (up to 64 * 2**32-1) ///

/// The goals of OpenBitSet are the fastest implementation possible, and /// maximum code reuse. Extra safety and encapsulation /// may always be built on top, but if that's built in, the cost can never be removed (and /// hence people re-implement their own version in order to get better performance). /// If you want a "safe", totally encapsulated (and slower and limited) BitSet /// class, use java.util.BitSet. ///

///

Performance Results

/// /// Test system: Pentium 4, Sun Java 1.5_06 -server -Xbatch -Xmx64M ///
BitSet size = 1,000,000 ///
Results are java.util.BitSet time divided by OpenBitSet time. /// /// /// /// /// /// /// /// /// /// ///
cardinality intersect_count union nextSetBit get iterator
50% full 3.36 3.96 1.44 1.46 1.99 1.58
1% full 3.31 3.90   1.04   0.99
///
/// Test system: AMD Opteron, 64 bit linux, Sun Java 1.5_06 -server -Xbatch -Xmx64M ///
BitSet size = 1,000,000 ///
Results are java.util.BitSet time divided by OpenBitSet time. /// /// /// /// /// /// /// /// /// /// ///
cardinality intersect_count union nextSetBit get iterator
50% full 2.50 3.50 1.00 1.03 1.12 1.25
1% full 2.51 3.49   1.00   1.02
///
/// $Id$ /// [Serializable] public class OpenBitSet:DocIdSet, System.ICloneable { protected internal long[] internalbits; protected internal int wlen; // number of words (elements) used in the array /// Constructs an OpenBitSet large enough to hold numBits. /// /// /// /// public OpenBitSet(long numBits) { internalbits = new long[Bits2words(numBits)]; wlen = internalbits.Length; } public OpenBitSet():this(64) { } /// Constructs an OpenBitSet from an existing long[]. ///
/// The first 64 bits are in long[0], /// with bit index 0 at the least significant bit, and bit index 63 at the most significant. /// Given a bit index, /// the word containing it is long[index/64], and it is at bit number index%64 within that word. ///

/// numWords are the number of elements in the array that contain /// set bits (non-zero longs). /// numWords should be <= bits.length, and /// any existing words in the array at position >= numWords should be zero. /// ///

public OpenBitSet(long[] bits, int numWords) { this.internalbits = bits; this.wlen = numWords; } public override DocIdSetIterator Iterator() { return new OpenBitSetIterator(internalbits, wlen); } /// This DocIdSet implementation is cacheable. public override bool IsCacheable { get { return true; } } /// Returns the current capacity in bits (1 greater than the index of the last bit) public virtual long Capacity() { return internalbits.Length << 6; } /// Returns the current capacity of this set. Included for /// compatibility. This is *not* equal to /// public virtual long Size() { return Capacity(); } /// Returns true if there are no set bits public virtual bool IsEmpty() { return Cardinality() == 0; } /// Expert: Gets or sets the long[] storing the bits [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Performance", "CA1819:PropertiesShouldNotReturnArrays")] public virtual long[] Bits { set { this.internalbits = value; } get { return internalbits; } } /// Expert: gets or sets the number of longs in the array that are in use public virtual int NumWords { get { return wlen; } set { this.wlen = value; } } /// Returns true or false for the specified bit index. public virtual bool Get(int index) { int i = index >> 6; // div 64 // signed shift will keep a negative index and force an // array-index-out-of-bounds-exception, removing the need for an explicit check. if (i >= internalbits.Length) return false; int bit = index & 0x3f; // mod 64 long bitmask = 1L << bit; return (internalbits[i] & bitmask) != 0; } /// Returns true or false for the specified bit index. /// The index should be less than the OpenBitSet size /// public virtual bool FastGet(int index) { int i = index >> 6; // div 64 // signed shift will keep a negative index and force an // array-index-out-of-bounds-exception, removing the need for an explicit check. int bit = index & 0x3f; // mod 64 long bitmask = 1L << bit; return (internalbits[i] & bitmask) != 0; } /// Returns true or false for the specified bit index public virtual bool Get(long index) { int i = (int) (index >> 6); // div 64 if (i >= internalbits.Length) return false; int bit = (int) index & 0x3f; // mod 64 long bitmask = 1L << bit; return (internalbits[i] & bitmask) != 0; } /// Returns true or false for the specified bit index. /// The index should be less than the OpenBitSet size. /// public virtual bool FastGet(long index) { int i = (int) (index >> 6); // div 64 int bit = (int) index & 0x3f; // mod 64 long bitmask = 1L << bit; return (internalbits[i] & bitmask) != 0; } /* // alternate implementation of get() public boolean get1(int index) { int i = index >> 6; // div 64 int bit = index & 0x3f; // mod 64 return ((bits[i]>>>bit) & 0x01) != 0; // this does a long shift and a bittest (on x86) vs // a long shift, and a long AND, (the test for zero is prob a no-op) // testing on a P4 indicates this is slower than (bits[i] & bitmask) != 0; } */ /// returns 1 if the bit is set, 0 if not. /// The index should be less than the OpenBitSet size /// public virtual int GetBit(int index) { int i = index >> 6; // div 64 int bit = index & 0x3f; // mod 64 return ((int )((ulong) (internalbits[i]) >> bit)) & 0x01; } /* public boolean get2(int index) { int word = index >> 6; // div 64 int bit = index & 0x0000003f; // mod 64 return (bits[word] << bit) < 0; // hmmm, this would work if bit order were reversed // we could right shift and check for parity bit, if it was available to us. } */ /// sets a bit, expanding the set size if necessary public virtual void Set(long index) { int wordNum = ExpandingWordNum(index); int bit = (int) index & 0x3f; long bitmask = 1L << bit; internalbits[wordNum] |= bitmask; } /// Sets the bit at the specified index. /// The index should be less than the OpenBitSet size. /// public virtual void FastSet(int index) { int wordNum = index >> 6; // div 64 int bit = index & 0x3f; // mod 64 long bitmask = 1L << bit; internalbits[wordNum] |= bitmask; } /// Sets the bit at the specified index. /// The index should be less than the OpenBitSet size. /// public virtual void FastSet(long index) { int wordNum = (int) (index >> 6); int bit = (int) index & 0x3f; long bitmask = 1L << bit; internalbits[wordNum] |= bitmask; } /// Sets a range of bits, expanding the set size if necessary /// /// /// lower index /// /// one-past the last bit to set /// public virtual void Set(long startIndex, long endIndex) { if (endIndex <= startIndex) return ; int startWord = (int) (startIndex >> 6); // since endIndex is one past the end, this is index of the last // word to be changed. int endWord = ExpandingWordNum(endIndex - 1); long startmask = - 1L << (int) startIndex; long endmask = (long) (0xffffffffffffffffUL >> (int) - endIndex); // 64-(endIndex&0x3f) is the same as -endIndex due to wrap if (startWord == endWord) { internalbits[startWord] |= (startmask & endmask); return ; } internalbits[startWord] |= startmask; for (int i = startWord + 1; i < endWord; i++) internalbits[i] = -1L; internalbits[endWord] |= endmask; } protected internal virtual int ExpandingWordNum(long index) { int wordNum = (int) (index >> 6); if (wordNum >= wlen) { EnsureCapacity(index + 1); wlen = wordNum + 1; } return wordNum; } /// clears a bit. /// The index should be less than the OpenBitSet size. /// public virtual void FastClear(int index) { int wordNum = index >> 6; int bit = index & 0x03f; long bitmask = 1L << bit; internalbits[wordNum] &= ~ bitmask; // hmmm, it takes one more instruction to clear than it does to set... any // way to work around this? If there were only 63 bits per word, we could // use a right shift of 10111111...111 in binary to position the 0 in the // correct place (using sign extension). // Could also use Long.rotateRight() or rotateLeft() *if* they were converted // by the JVM into a native instruction. // bits[word] &= Long.rotateLeft(0xfffffffe,bit); } /// clears a bit. /// The index should be less than the OpenBitSet size. /// public virtual void FastClear(long index) { int wordNum = (int) (index >> 6); // div 64 int bit = (int) index & 0x3f; // mod 64 long bitmask = 1L << bit; internalbits[wordNum] &= ~ bitmask; } /// clears a bit, allowing access beyond the current set size without changing the size. public virtual void Clear(long index) { int wordNum = (int) (index >> 6); // div 64 if (wordNum >= wlen) return ; int bit = (int) index & 0x3f; // mod 64 long bitmask = 1L << bit; internalbits[wordNum] &= ~ bitmask; } /// Clears a range of bits. Clearing past the end does not change the size of the set. /// /// /// lower index /// /// one-past the last bit to clear /// public virtual void Clear(int startIndex, int endIndex) { if (endIndex <= startIndex) return ; int startWord = (startIndex >> 6); if (startWord >= wlen) return ; // since endIndex is one past the end, this is index of the last // word to be changed. int endWord = ((endIndex - 1) >> 6); long startmask = - 1L << startIndex; long endmask = (long) (0xffffffffffffffffUL >> - endIndex); // 64-(endIndex&0x3f) is the same as -endIndex due to wrap // invert masks since we are clearing startmask = ~ startmask; endmask = ~ endmask; if (startWord == endWord) { internalbits[startWord] &= (startmask | endmask); return ; } internalbits[startWord] &= startmask; int middle = System.Math.Min(wlen, endWord); for (int i = startWord + 1; i < middle; i++) internalbits[i] = 0L; if (endWord < wlen) { internalbits[endWord] &= endmask; } } /// Clears a range of bits. Clearing past the end does not change the size of the set. /// /// /// lower index /// /// one-past the last bit to clear /// public virtual void Clear(long startIndex, long endIndex) { if (endIndex <= startIndex) return ; int startWord = (int) (startIndex >> 6); if (startWord >= wlen) return ; // since endIndex is one past the end, this is index of the last // word to be changed. int endWord = (int) ((endIndex - 1) >> 6); long startmask = - 1L << (int) startIndex; long endmask = (long) (0xffffffffffffffffUL >> (int) - endIndex); // 64-(endIndex&0x3f) is the same as -endIndex due to wrap // invert masks since we are clearing startmask = ~ startmask; endmask = ~ endmask; if (startWord == endWord) { internalbits[startWord] &= (startmask | endmask); return ; } internalbits[startWord] &= startmask; int middle = System.Math.Min(wlen, endWord); for (int i = startWord + 1; i < middle; i++) internalbits[i] = 0L; if (endWord < wlen) { internalbits[endWord] &= endmask; } } /// Sets a bit and returns the previous value. /// The index should be less than the OpenBitSet size. /// public virtual bool GetAndSet(int index) { int wordNum = index >> 6; // div 64 int bit = index & 0x3f; // mod 64 long bitmask = 1L << bit; bool val = (internalbits[wordNum] & bitmask) != 0; internalbits[wordNum] |= bitmask; return val; } /// Sets a bit and returns the previous value. /// The index should be less than the OpenBitSet size. /// public virtual bool GetAndSet(long index) { int wordNum = (int) (index >> 6); // div 64 int bit = (int) index & 0x3f; // mod 64 long bitmask = 1L << bit; bool val = (internalbits[wordNum] & bitmask) != 0; internalbits[wordNum] |= bitmask; return val; } /// flips a bit. /// The index should be less than the OpenBitSet size. /// public virtual void FastFlip(int index) { int wordNum = index >> 6; // div 64 int bit = index & 0x3f; // mod 64 long bitmask = 1L << bit; internalbits[wordNum] ^= bitmask; } /// flips a bit. /// The index should be less than the OpenBitSet size. /// public virtual void FastFlip(long index) { int wordNum = (int) (index >> 6); // div 64 int bit = (int) index & 0x3f; // mod 64 long bitmask = 1L << bit; internalbits[wordNum] ^= bitmask; } /// flips a bit, expanding the set size if necessary public virtual void Flip(long index) { int wordNum = ExpandingWordNum(index); int bit = (int) index & 0x3f; // mod 64 long bitmask = 1L << bit; internalbits[wordNum] ^= bitmask; } /// flips a bit and returns the resulting bit value. /// The index should be less than the OpenBitSet size. /// public virtual bool FlipAndGet(int index) { int wordNum = index >> 6; // div 64 int bit = index & 0x3f; // mod 64 long bitmask = 1L << bit; internalbits[wordNum] ^= bitmask; return (internalbits[wordNum] & bitmask) != 0; } /// flips a bit and returns the resulting bit value. /// The index should be less than the OpenBitSet size. /// public virtual bool FlipAndGet(long index) { int wordNum = (int) (index >> 6); // div 64 int bit = (int) index & 0x3f; // mod 64 long bitmask = 1L << bit; internalbits[wordNum] ^= bitmask; return (internalbits[wordNum] & bitmask) != 0; } /// Flips a range of bits, expanding the set size if necessary /// /// /// lower index /// /// one-past the last bit to flip /// public virtual void Flip(long startIndex, long endIndex) { if (endIndex <= startIndex) return ; int startWord = (int) (startIndex >> 6); // since endIndex is one past the end, this is index of the last // word to be changed. int endWord = ExpandingWordNum(endIndex - 1); /* Grrr, java shifting wraps around so -1L>>>64 == -1 * for that reason, make sure not to use endmask if the bits to flip will * be zero in the last word (redefine endWord to be the last changed...) long startmask = -1L << (startIndex & 0x3f); // example: 11111...111000 long endmask = -1L >>> (64-(endIndex & 0x3f)); // example: 00111...111111 ***/ long startmask = - 1L << (int) startIndex; long endmask = (long) (0xffffffffffffffffUL >> (int) - endIndex); // 64-(endIndex&0x3f) is the same as -endIndex due to wrap if (startWord == endWord) { internalbits[startWord] ^= (startmask & endmask); return ; } internalbits[startWord] ^= startmask; for (int i = startWord + 1; i < endWord; i++) { internalbits[i] = ~ internalbits[i]; } internalbits[endWord] ^= endmask; } /* public static int pop(long v0, long v1, long v2, long v3) { // derived from pop_array by setting last four elems to 0. // exchanges one pop() call for 10 elementary operations // saving about 7 instructions... is there a better way? long twosA=v0 & v1; long ones=v0^v1; long u2=ones^v2; long twosB =(ones&v2)|(u2&v3); ones=u2^v3; long fours=(twosA&twosB); long twos=twosA^twosB; return (pop(fours)<<2) + (pop(twos)<<1) + pop(ones); } */ /// the number of set bits /// public virtual long Cardinality() { return BitUtil.Pop_array(internalbits, 0, wlen); } /// Returns the popcount or cardinality of the intersection of the two sets. /// Neither set is modified. /// public static long IntersectionCount(OpenBitSet a, OpenBitSet b) { return BitUtil.Pop_intersect(a.internalbits, b.internalbits, 0, System.Math.Min(a.wlen, b.wlen)); } /// Returns the popcount or cardinality of the union of the two sets. /// Neither set is modified. /// public static long UnionCount(OpenBitSet a, OpenBitSet b) { long tot = BitUtil.Pop_union(a.internalbits, b.internalbits, 0, System.Math.Min(a.wlen, b.wlen)); if (a.wlen < b.wlen) { tot += BitUtil.Pop_array(b.internalbits, a.wlen, b.wlen - a.wlen); } else if (a.wlen > b.wlen) { tot += BitUtil.Pop_array(a.internalbits, b.wlen, a.wlen - b.wlen); } return tot; } /// Returns the popcount or cardinality of "a and not b" /// or "intersection(a, not(b))". /// Neither set is modified. /// public static long AndNotCount(OpenBitSet a, OpenBitSet b) { long tot = BitUtil.Pop_andnot(a.internalbits, b.internalbits, 0, System.Math.Min(a.wlen, b.wlen)); if (a.wlen > b.wlen) { tot += BitUtil.Pop_array(a.internalbits, b.wlen, a.wlen - b.wlen); } return tot; } /// Returns the popcount or cardinality of the exclusive-or of the two sets. /// Neither set is modified. /// public static long XorCount(OpenBitSet a, OpenBitSet b) { long tot = BitUtil.Pop_xor(a.internalbits, b.internalbits, 0, System.Math.Min(a.wlen, b.wlen)); if (a.wlen < b.wlen) { tot += BitUtil.Pop_array(b.internalbits, a.wlen, b.wlen - a.wlen); } else if (a.wlen > b.wlen) { tot += BitUtil.Pop_array(a.internalbits, b.wlen, a.wlen - b.wlen); } return tot; } /// Returns the index of the first set bit starting at the index specified. /// -1 is returned if there are no more set bits. /// public virtual int NextSetBit(int index) { int i = index >> 6; if (i >= wlen) return - 1; int subIndex = index & 0x3f; // index within the word long word = internalbits[i] >> subIndex; // skip all the bits to the right of index if (word != 0) { return (i << 6) + subIndex + BitUtil.Ntz(word); } while (++i < wlen) { word = internalbits[i]; if (word != 0) return (i << 6) + BitUtil.Ntz(word); } return - 1; } /// Returns the index of the first set bit starting at the index specified. /// -1 is returned if there are no more set bits. /// public virtual long NextSetBit(long index) { int i = (int) (index >> 6); if (i >= wlen) return - 1; int subIndex = (int) index & 0x3f; // index within the word long word = (long) ((ulong) internalbits[i] >> subIndex); // skip all the bits to the right of index if (word != 0) { return (((long) i) << 6) + (subIndex + BitUtil.Ntz(word)); } while (++i < wlen) { word = internalbits[i]; if (word != 0) return (((long) i) << 6) + BitUtil.Ntz(word); } return - 1; } public virtual System.Object Clone() { try { OpenBitSet obs = new OpenBitSet((long[]) internalbits.Clone(), wlen); //obs.bits = new long[obs.bits.Length]; //obs.bits.CopyTo(obs.bits, 0); // hopefully an array clone is as fast(er) than arraycopy return obs; } catch (System.Exception e) { throw new System.SystemException(e.Message, e); } } /// this = this AND other public virtual void Intersect(OpenBitSet other) { int newLen = System.Math.Min(this.wlen, other.wlen); long[] thisArr = this.internalbits; long[] otherArr = other.internalbits; // testing against zero can be more efficient int pos = newLen; while (--pos >= 0) { thisArr[pos] &= otherArr[pos]; } if (this.wlen > newLen) { // fill zeros from the new shorter length to the old length for (int i = newLen; i < this.wlen; i++) internalbits[i] = 0L; } this.wlen = newLen; } /// this = this OR other public virtual void Union(OpenBitSet other) { int newLen = System.Math.Max(wlen, other.wlen); EnsureCapacityWords(newLen); long[] thisArr = this.internalbits; long[] otherArr = other.internalbits; int pos = System.Math.Min(wlen, other.wlen); while (--pos >= 0) { thisArr[pos] |= otherArr[pos]; } if (this.wlen < newLen) { Array.Copy(otherArr, this.wlen, thisArr, this.wlen, newLen - this.wlen); } this.wlen = newLen; } /// Remove all elements set in other. this = this AND_NOT other public virtual void Remove(OpenBitSet other) { int idx = System.Math.Min(wlen, other.wlen); long[] thisArr = this.internalbits; long[] otherArr = other.internalbits; while (--idx >= 0) { thisArr[idx] &= ~ otherArr[idx]; } } /// this = this XOR other public virtual void Xor(OpenBitSet other) { int newLen = System.Math.Max(wlen, other.wlen); EnsureCapacityWords(newLen); long[] thisArr = this.internalbits; long[] otherArr = other.internalbits; int pos = System.Math.Min(wlen, other.wlen); while (--pos >= 0) { thisArr[pos] ^= otherArr[pos]; } if (this.wlen < newLen) { Array.Copy(otherArr, this.wlen, thisArr, this.wlen, newLen - this.wlen); } this.wlen = newLen; } // some BitSet compatability methods //* see */ public virtual void And(OpenBitSet other) { Intersect(other); } //* see */ public virtual void Or(OpenBitSet other) { Union(other); } //* see */ public virtual void AndNot(OpenBitSet other) { Remove(other); } /// returns true if the sets have any elements in common public virtual bool Intersects(OpenBitSet other) { int pos = System.Math.Min(this.wlen, other.wlen); long[] thisArr = this.internalbits; long[] otherArr = other.internalbits; while (--pos >= 0) { if ((thisArr[pos] & otherArr[pos]) != 0) return true; } return false; } /// Expand the long[] with the size given as a number of words (64 bit longs). /// getNumWords() is unchanged by this call. /// public virtual void EnsureCapacityWords(int numWords) { if (internalbits.Length < numWords) { internalbits = ArrayUtil.Grow(internalbits, numWords); } } /// Ensure that the long[] is big enough to hold numBits, expanding it if necessary. /// getNumWords() is unchanged by this call. /// public virtual void EnsureCapacity(long numBits) { EnsureCapacityWords(Bits2words(numBits)); } /// Lowers numWords, the number of words in use, /// by checking for trailing zero words. /// public virtual void TrimTrailingZeros() { int idx = wlen - 1; while (idx >= 0 && internalbits[idx] == 0) idx--; wlen = idx + 1; } /// returns the number of 64 bit words it would take to hold numBits public static int Bits2words(long numBits) { return (int) ((((numBits - 1) >> 6)) + 1); } /// returns true if both sets have the same bits set public override bool Equals(System.Object o) { if (this == o) return true; if (!(o is OpenBitSet)) return false; OpenBitSet a; OpenBitSet b = (OpenBitSet) o; // make a the larger set. if (b.wlen > this.wlen) { a = b; b = this; } else { a = this; } // check for any set bits out of the range of b for (int i = a.wlen - 1; i >= b.wlen; i--) { if (a.internalbits[i] != 0) return false; } for (int i = b.wlen - 1; i >= 0; i--) { if (a.internalbits[i] != b.internalbits[i]) return false; } return true; } public override int GetHashCode() { // Start with a zero hash and use a mix that results in zero if the input is zero. // This effectively truncates trailing zeros without an explicit check. long h = 0; for (int i = internalbits.Length; --i >= 0; ) { h ^= internalbits[i]; h = (h << 1) | (Number.URShift(h, 63)); // rotate left } // fold leftmost bits into right and add a constant to prevent // empty sets from returning 0, which is too common. return (int)(((h >> 32) ^ h) + 0x98761234); } } }