/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Documents;
using Lucene.Net.Support;
using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;

namespace Lucene.Net.Index
{
    sealed class TermsHashPerField : InvertedDocConsumerPerField
    {
        private void InitBlock()
        {
            postingsHashHalfSize = postingsHashSize / 2;
            postingsHashMask = postingsHashSize - 1;
            postingsHash = new RawPostingList[postingsHashSize];
        }

        internal TermsHashConsumerPerField consumer;
        internal TermsHashPerField nextPerField;
        internal TermsHashPerThread perThread;
        internal DocumentsWriter.DocState docState;
        internal FieldInvertState fieldState;
        internal ITermAttribute termAtt;

        // Copied from our perThread
        internal CharBlockPool charPool;
        internal IntBlockPool intPool;
        internal ByteBlockPool bytePool;

        internal int streamCount;
        internal int numPostingInt;

        internal FieldInfo fieldInfo;

        internal bool postingsCompacted;
        internal int numPostings;
        private int postingsHashSize = 4;
        private int postingsHashHalfSize;
        private int postingsHashMask;
        private RawPostingList[] postingsHash;
        private RawPostingList p;

        public TermsHashPerField(DocInverterPerField docInverterPerField, TermsHashPerThread perThread, TermsHashPerThread nextPerThread, FieldInfo fieldInfo)
        {
            InitBlock();
            this.perThread = perThread;
            intPool = perThread.intPool;
            charPool = perThread.charPool;
            bytePool = perThread.bytePool;
            docState = perThread.docState;
            fieldState = docInverterPerField.fieldState;
            this.consumer = perThread.consumer.AddField(this, fieldInfo);
            streamCount = consumer.GetStreamCount();
            numPostingInt = 2 * streamCount;
            this.fieldInfo = fieldInfo;
            if (nextPerThread != null)
                nextPerField = (TermsHashPerField) nextPerThread.AddField(docInverterPerField, fieldInfo);
            else
                nextPerField = null;
        }

        internal void ShrinkHash(int targetSize)
        {
            System.Diagnostics.Debug.Assert(postingsCompacted || numPostings == 0);

            // Note: targetSize is currently unused; the hash always
            // shrinks back to the minimum size of 4 entries.
            int newSize = 4;

            if (newSize != postingsHash.Length)
            {
                postingsHash = new RawPostingList[newSize];
                postingsHashSize = newSize;
                postingsHashHalfSize = newSize / 2;
                postingsHashMask = newSize - 1;
            }
            System.Array.Clear(postingsHash, 0, postingsHash.Length);
        }

        public void Reset()
        {
            if (!postingsCompacted)
                CompactPostings();
            System.Diagnostics.Debug.Assert(numPostings <= postingsHash.Length);
            if (numPostings > 0)
            {
                perThread.termsHash.RecyclePostings(postingsHash, numPostings);
                Array.Clear(postingsHash, 0, numPostings);
                numPostings = 0;
            }
            postingsCompacted = false;
            if (nextPerField != null)
                nextPerField.Reset();
        }

        public override void Abort()
        {
            lock (this)
            {
                Reset();
                if (nextPerField != null)
                    nextPerField.Abort();
            }
        }
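        // Note on the pooled addressing scheme used by the methods below
        // (an explanatory sketch derived from this code, not a spec):
        // textStart, intStart and byteStart are global offsets into the
        // shared CharBlockPool / IntBlockPool / ByteBlockPool. Because
        // every block size is a power of two, a global offset splits into
        // a buffer index (high bits) and a position inside that buffer
        // (low bits), e.g. for the int pool:
        //
        //   int[] ints = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
        //   int upto = intStart & DocumentsWriter.INT_BLOCK_MASK;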
        public void InitReader(ByteSliceReader reader, RawPostingList p, int stream)
        {
            System.Diagnostics.Debug.Assert(stream < streamCount);
            int[] ints = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
            int upto = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
            reader.Init(bytePool, p.byteStart + stream * ByteBlockPool.FIRST_LEVEL_SIZE, ints[upto + stream]);
        }

        private void CompactPostings()
        {
            lock (this)
            {
                // Move all non-null entries to the front of the hash
                // array so entries 0..numPostings-1 are dense.
                int upto = 0;
                for (int i = 0; i < postingsHashSize; i++)
                {
                    if (postingsHash[i] != null)
                    {
                        if (upto < i)
                        {
                            postingsHash[upto] = postingsHash[i];
                            postingsHash[i] = null;
                        }
                        upto++;
                    }
                }

                System.Diagnostics.Debug.Assert(upto == numPostings);
                postingsCompacted = true;
            }
        }

        /// <summary>Collapse the hash table and sort in-place.</summary>
        public RawPostingList[] SortPostings()
        {
            CompactPostings();
            QuickSort(postingsHash, 0, numPostings - 1);
            return postingsHash;
        }

        internal void QuickSort(RawPostingList[] postings, int lo, int hi)
        {
            if (lo >= hi)
                return;
            else if (hi == 1 + lo)
            {
                if (ComparePostings(postings[lo], postings[hi]) > 0)
                {
                    RawPostingList tmp = postings[lo];
                    postings[lo] = postings[hi];
                    postings[hi] = tmp;
                }
                return;
            }

            // Median-of-three pivot selection: order lo, mid and hi,
            // then partition around the median element.
            int mid = Number.URShift(lo + hi, 1);

            if (ComparePostings(postings[lo], postings[mid]) > 0)
            {
                RawPostingList tmp = postings[lo];
                postings[lo] = postings[mid];
                postings[mid] = tmp;
            }

            if (ComparePostings(postings[mid], postings[hi]) > 0)
            {
                RawPostingList tmp = postings[mid];
                postings[mid] = postings[hi];
                postings[hi] = tmp;

                if (ComparePostings(postings[lo], postings[mid]) > 0)
                {
                    RawPostingList tmp2 = postings[lo];
                    postings[lo] = postings[mid];
                    postings[mid] = tmp2;
                }
            }

            int left = lo + 1;
            int right = hi - 1;

            if (left >= right)
                return;

            RawPostingList partition = postings[mid];

            for (; ; )
            {
                while (ComparePostings(postings[right], partition) > 0)
                    --right;

                while (left < right && ComparePostings(postings[left], partition) <= 0)
                    ++left;

                if (left < right)
                {
                    RawPostingList tmp = postings[left];
                    postings[left] = postings[right];
                    postings[right] = tmp;
                    --right;
                }
                else
                {
                    break;
                }
            }

            QuickSort(postings, lo, left);
            QuickSort(postings, left + 1, hi);
        }

        /// <summary>Compares term text for two Posting instances and returns a
        /// negative value if p1 &lt; p2, a positive value if p1 &gt; p2, else 0.
        /// </summary>
        internal int ComparePostings(RawPostingList p1, RawPostingList p2)
        {
            if (p1 == p2)
                return 0;

            char[] text1 = charPool.buffers[p1.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
            int pos1 = p1.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
            char[] text2 = charPool.buffers[p2.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
            int pos2 = p2.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

            System.Diagnostics.Debug.Assert(text1 != text2 || pos1 != pos2);

            while (true)
            {
                char c1 = text1[pos1++];
                char c2 = text2[pos2++];
                if (c1 != c2)
                {
                    if (0xffff == c2)
                        return 1;
                    else if (0xffff == c1)
                        return -1;
                    else
                        return c1 - c2;
                }
                else
                    // This method should never compare equal postings
                    // unless p1 == p2
                    System.Diagnostics.Debug.Assert(c1 != 0xffff);
            }
        }
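        // Worked example for ComparePostings (illustrative only): terms sit
        // in the char pool followed by a 0xffff terminator, so "ab" is
        // stored as 'a','b',0xffff and "abc" as 'a','b','c',0xffff. The
        // loop walks both until position 2, where c1 == 0xffff and
        // c2 == 'c'; the terminator on the p1 side yields -1, i.e. the
        // shorter term sorts first, exactly as a true prefix should.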
        /// <summary>Test whether the text for current RawPostingList p equals
        /// current tokenText.
        /// </summary>
        private bool PostingEquals(char[] tokenText, int tokenTextLen)
        {
            char[] text = perThread.charPool.buffers[p.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
            System.Diagnostics.Debug.Assert(text != null);
            int pos = p.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

            int tokenPos = 0;
            for (; tokenPos < tokenTextLen; pos++, tokenPos++)
                if (tokenText[tokenPos] != text[pos])
                    return false;
            return 0xffff == text[pos];
        }

        private bool doCall;
        private bool doNextCall;

        internal override void Start(IFieldable f)
        {
            termAtt = fieldState.attributeSource.AddAttribute<ITermAttribute>();
            consumer.Start(f);
            if (nextPerField != null)
            {
                nextPerField.Start(f);
            }
        }

        internal override bool Start(IFieldable[] fields, int count)
        {
            doCall = consumer.Start(fields, count);
            if (nextPerField != null)
                doNextCall = nextPerField.Start(fields, count);
            return doCall || doNextCall;
        }

        // Secondary entry point (for 2nd & subsequent TermsHash),
        // because token text has already been "interned" into
        // textStart, so we hash by textStart
        public void Add(int textStart)
        {
            int code = textStart;

            int hashPos = code & postingsHashMask;

            System.Diagnostics.Debug.Assert(!postingsCompacted);

            // Locate RawPostingList in hash
            p = postingsHash[hashPos];

            if (p != null && p.textStart != textStart)
            {
                // Conflict: keep searching different locations in
                // the hash table.
                int inc = ((code >> 8) + code) | 1;
                do
                {
                    code += inc;
                    hashPos = code & postingsHashMask;
                    p = postingsHash[hashPos];
                }
                while (p != null && p.textStart != textStart);
            }

            if (p == null)
            {
                // First time we are seeing this token since we last
                // flushed the hash.

                // Refill?
                if (0 == perThread.freePostingsCount)
                    perThread.MorePostings();

                // Pull next free RawPostingList from free list
                p = perThread.freePostings[--perThread.freePostingsCount];
                System.Diagnostics.Debug.Assert(p != null);

                p.textStart = textStart;

                System.Diagnostics.Debug.Assert(postingsHash[hashPos] == null);
                postingsHash[hashPos] = p;
                numPostings++;

                if (numPostings == postingsHashHalfSize)
                    RehashPostings(2 * postingsHashSize);

                // Init stream slices
                if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
                    intPool.NextBuffer();

                if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt * ByteBlockPool.FIRST_LEVEL_SIZE)
                    bytePool.NextBuffer();

                intUptos = intPool.buffer;
                intUptoStart = intPool.intUpto;
                intPool.intUpto += streamCount;

                p.intStart = intUptoStart + intPool.intOffset;

                for (int i = 0; i < streamCount; i++)
                {
                    int upto = bytePool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
                    intUptos[intUptoStart + i] = upto + bytePool.byteOffset;
                }
                p.byteStart = intUptos[intUptoStart];

                consumer.NewTerm(p);
            }
            else
            {
                intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
                intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
                consumer.AddTerm(p);
            }
        }
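        // Note on collision handling (explanatory; derived from the probe
        // code above and below): both Add() entry points use open
        // addressing over a power-of-two table. On a conflict the probe
        // step is
        //
        //   int inc = ((code >> 8) + code) | 1;
        //
        // OR-ing with 1 makes the step odd, and an odd step is coprime
        // with the power-of-two table size, so repeated probing visits
        // every slot before any slot repeats.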
        // Primary entry point (for first TermsHash)
        internal override void Add()
        {
            System.Diagnostics.Debug.Assert(!postingsCompacted);

            // We are first in the chain so we must "intern" the
            // term text into textStart address

            // Get the text of this term.
            char[] tokenText = termAtt.TermBuffer();
            int tokenTextLen = termAtt.TermLength();

            // Compute hashcode & replace any invalid UTF16 sequences
            int downto = tokenTextLen;
            int code = 0;
            while (downto > 0)
            {
                char ch = tokenText[--downto];

                if (ch >= UnicodeUtil.UNI_SUR_LOW_START && ch <= UnicodeUtil.UNI_SUR_LOW_END)
                {
                    if (0 == downto)
                    {
                        // Unpaired
                        ch = tokenText[downto] = (char) UnicodeUtil.UNI_REPLACEMENT_CHAR;
                    }
                    else
                    {
                        char ch2 = tokenText[downto - 1];
                        if (ch2 >= UnicodeUtil.UNI_SUR_HIGH_START && ch2 <= UnicodeUtil.UNI_SUR_HIGH_END)
                        {
                            // OK: high followed by low.  This is a valid
                            // surrogate pair.
                            code = ((code * 31) + ch) * 31 + ch2;
                            downto--;
                            continue;
                        }
                        else
                        {
                            // Unpaired
                            ch = tokenText[downto] = (char) UnicodeUtil.UNI_REPLACEMENT_CHAR;
                        }
                    }
                }
                else if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && (ch <= UnicodeUtil.UNI_SUR_HIGH_END || ch == 0xffff))
                {
                    // Unpaired or 0xffff
                    ch = tokenText[downto] = (char) UnicodeUtil.UNI_REPLACEMENT_CHAR;
                }

                code = (code * 31) + ch;
            }

            int hashPos = code & postingsHashMask;

            // Locate RawPostingList in hash
            p = postingsHash[hashPos];

            if (p != null && !PostingEquals(tokenText, tokenTextLen))
            {
                // Conflict: keep searching different locations in
                // the hash table.
                int inc = ((code >> 8) + code) | 1;
                do
                {
                    code += inc;
                    hashPos = code & postingsHashMask;
                    p = postingsHash[hashPos];
                }
                while (p != null && !PostingEquals(tokenText, tokenTextLen));
            }

            if (p == null)
            {
                // First time we are seeing this token since we last
                // flushed the hash.
                int textLen1 = 1 + tokenTextLen;
                if (textLen1 + charPool.charUpto > DocumentsWriter.CHAR_BLOCK_SIZE)
                {
                    if (textLen1 > DocumentsWriter.CHAR_BLOCK_SIZE)
                    {
                        // Just skip this term, to remain as robust as
                        // possible during indexing.  A TokenFilter
                        // can be inserted into the analyzer chain if
                        // other behavior is wanted (pruning the term
                        // to a prefix, throwing an exception, etc).
                        if (docState.maxTermPrefix == null)
                            docState.maxTermPrefix = new System.String(tokenText, 0, 30);

                        consumer.SkippingLongTerm();
                        return;
                    }
                    charPool.NextBuffer();
                }
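                // The term is new, so it is about to be "interned": its
                // chars are copied into the shared char pool and terminated
                // with 0xffff. The sentinel is safe because the cleanup
                // loop above replaced any literal 0xffff in the token with
                // UNI_REPLACEMENT_CHAR, so 0xffff never occurs inside
                // interned text (PostingEquals and ComparePostings rely on
                // this invariant).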
                // Refill?
                if (0 == perThread.freePostingsCount)
                    perThread.MorePostings();

                // Pull next free RawPostingList from free list
                p = perThread.freePostings[--perThread.freePostingsCount];
                System.Diagnostics.Debug.Assert(p != null);

                char[] text = charPool.buffer;
                int textUpto = charPool.charUpto;
                p.textStart = textUpto + charPool.charOffset;
                charPool.charUpto += textLen1;
                Array.Copy(tokenText, 0, text, textUpto, tokenTextLen);
                text[textUpto + tokenTextLen] = (char) 0xffff;

                System.Diagnostics.Debug.Assert(postingsHash[hashPos] == null);

                postingsHash[hashPos] = p;
                numPostings++;

                if (numPostings == postingsHashHalfSize)
                    RehashPostings(2 * postingsHashSize);

                // Init stream slices
                if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
                    intPool.NextBuffer();

                if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt * ByteBlockPool.FIRST_LEVEL_SIZE)
                    bytePool.NextBuffer();

                intUptos = intPool.buffer;
                intUptoStart = intPool.intUpto;
                intPool.intUpto += streamCount;

                p.intStart = intUptoStart + intPool.intOffset;

                for (int i = 0; i < streamCount; i++)
                {
                    int upto = bytePool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
                    intUptos[intUptoStart + i] = upto + bytePool.byteOffset;
                }
                p.byteStart = intUptos[intUptoStart];

                consumer.NewTerm(p);
            }
            else
            {
                intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
                intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
                consumer.AddTerm(p);
            }

            if (doNextCall)
                nextPerField.Add(p.textStart);
        }

        internal int[] intUptos;
        internal int intUptoStart;

        internal void WriteByte(int stream, byte b)
        {
            int upto = intUptos[intUptoStart + stream];
            byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
            System.Diagnostics.Debug.Assert(bytes != null);
            int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK;
            if (bytes[offset] != 0)
            {
                // End of slice; allocate a new one
                offset = bytePool.AllocSlice(bytes, offset);
                bytes = bytePool.buffer;
                intUptos[intUptoStart + stream] = offset + bytePool.byteOffset;
            }
            bytes[offset] = b;
            (intUptos[intUptoStart + stream])++;
        }

        public void WriteBytes(int stream, byte[] b, int offset, int len)
        {
            // TODO: optimize
            int end = offset + len;
            for (int i = offset; i < end; i++)
                WriteByte(stream, b[i]);
        }

        internal void WriteVInt(int stream, int i)
        {
            System.Diagnostics.Debug.Assert(stream < streamCount);
            while ((i & ~0x7F) != 0)
            {
                WriteByte(stream, (byte) ((i & 0x7f) | 0x80));
                i = Number.URShift(i, 7);
            }
            WriteByte(stream, (byte) i);
        }
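        // Worked example for WriteVInt (illustrative only): the value 300
        // (binary 1 0010 1100) is written as two bytes. The low seven bits
        // (010 1100) go out first with the high bit set as a continuation
        // flag, giving 0xAC; the remaining bits (10) follow as 0x02.
        // Values below 128 take a single byte, the common case for the
        // small integers these per-term streams record.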
        internal override void Finish()
        {
            consumer.Finish();
            if (nextPerField != null)
                nextPerField.Finish();
        }

        /// <summary>Called when postings hash is too small (&gt; 50%
        /// occupied) or too large (&lt; 20% occupied).
        /// </summary>
        internal void RehashPostings(int newSize)
        {
            int newMask = newSize - 1;

            RawPostingList[] newHash = new RawPostingList[newSize];
            for (int i = 0; i < postingsHashSize; i++)
            {
                RawPostingList p0 = postingsHash[i];
                if (p0 != null)
                {
                    int code;
                    if (perThread.primary)
                    {
                        // Primary TermsHash: recompute the hash code from
                        // the interned term text (scan forward to the
                        // 0xffff terminator, then hash back to the start).
                        int start = p0.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
                        char[] text = charPool.buffers[p0.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
                        int pos = start;
                        while (text[pos] != 0xffff)
                            pos++;
                        code = 0;
                        while (pos > start)
                            code = (code * 31) + text[--pos];
                    }
                    else
                        // Secondary TermsHash: the textStart address itself
                        // is the hash code.
                        code = p0.textStart;

                    int hashPos = code & newMask;
                    System.Diagnostics.Debug.Assert(hashPos >= 0);
                    if (newHash[hashPos] != null)
                    {
                        int inc = ((code >> 8) + code) | 1;
                        do
                        {
                            code += inc;
                            hashPos = code & newMask;
                        }
                        while (newHash[hashPos] != null);
                    }
                    newHash[hashPos] = p0;
                }
            }

            postingsHashMask = newMask;
            postingsHash = newHash;
            postingsHashSize = newSize;
            postingsHashHalfSize = newSize >> 1;
        }
    }
}