/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Documents;
using IndexOutput = Lucene.Net.Store.IndexOutput;
using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;

namespace Lucene.Net.Index
{
    /// <summary>
    /// Per-field consumer that buffers term vector data (frequencies and,
    /// optionally, positions and offsets) for the current document and
    /// writes it to the per-document term vector output in
    /// <see cref="Finish"/>.
    /// </summary>
    sealed class TermVectorsTermsWriterPerField : TermsHashConsumerPerField
    {
        internal TermVectorsTermsWriterPerThread perThread;
        internal TermsHashPerField termsHashPerField;
        internal TermVectorsTermsWriter termsWriter;
        internal FieldInfo fieldInfo;
        internal DocumentsWriter.DocState docState;
        internal FieldInvertState fieldState;

        // Recomputed for every document in Start(IFieldable[], int).
        internal bool doVectors;
        internal bool doVectorPositions;
        internal bool doVectorOffsets;

        // Largest posting count seen by Finish() since the last ShrinkHash().
        internal int maxNumPostings;

        // Only non-null while offsets are being recorded for the current field.
        internal IOffsetAttribute offsetAttribute = null;

        /// <summary>
        /// Creates the per-field writer and caches the document and field
        /// state exposed by <paramref name="termsHashPerField"/>.
        /// </summary>
        /// <param name="termsHashPerField">Per-field terms hash that owns the postings.</param>
        /// <param name="perThread">Per-thread term vectors writer this field belongs to.</param>
        /// <param name="fieldInfo">Metadata of the field being written.</param>
        public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo)
        {
            this.termsHashPerField = termsHashPerField;
            this.perThread = perThread;
            this.termsWriter = perThread.termsWriter;
            this.fieldInfo = fieldInfo;
            docState = termsHashPerField.docState;
            fieldState = termsHashPerField.fieldState;
        }

        /// <summary>
        /// Number of byte streams used per posting. This class writes
        /// positions to stream 0 and offsets to stream 1.
        /// </summary>
        internal override int GetStreamCount()
        {
            return 2;
        }

        /// <summary>
        /// Inspects the first <paramref name="count"/> field instances of the
        /// current document and decides whether vectors, positions and
        /// offsets must be recorded. When vectors are needed, the
        /// per-document state on the owning thread is set up.
        /// </summary>
        /// <param name="fields">Instances of this field in the current document.</param>
        /// <param name="count">Number of valid entries in <paramref name="fields"/>.</param>
        /// <returns><c>true</c> if term vectors are to be written for this field.</returns>
        internal override bool Start(IFieldable[] fields, int count)
        {
            doVectors = false;
            doVectorPositions = false;
            doVectorOffsets = false;

            for (int i = 0; i < count; i++)
            {
                IFieldable field = fields[i];
                if (field.IsIndexed && field.IsTermVectorStored)
                {
                    doVectors = true;
                    doVectorPositions |= field.IsStorePositionWithTermVector;
                    doVectorOffsets |= field.IsStoreOffsetWithTermVector;
                }
            }

            if (doVectors)
            {
                if (perThread.doc == null)
                {
                    perThread.doc = termsWriter.GetPerDoc();
                    perThread.doc.docID = docState.docID;
                    System.Diagnostics.Debug.Assert(perThread.doc.numVectorFields == 0);
                    System.Diagnostics.Debug.Assert(0 == perThread.doc.perDocTvf.Length);
                    System.Diagnostics.Debug.Assert(0 == perThread.doc.perDocTvf.FilePointer);
                }

                System.Diagnostics.Debug.Assert(perThread.doc.docID == docState.docID);

                if (termsHashPerField.numPostings != 0)
                {
                    // Only necessary if previous doc hit a
                    // non-aborting exception while writing vectors in
                    // this field:
                    termsHashPerField.Reset();
                    perThread.termsHashPerThread.Reset(false);
                }
            }

            // TODO: only if needed for performance
            //perThread.postingsCount = 0;

            return doVectors;
        }

        /// <summary>No per-field state needs to be discarded on abort.</summary>
        public void Abort()
        {
        }

        /// <summary>
        /// Called once per field per document if term vectors
        /// are enabled, to write the vectors to
        /// RAMOutputStream, which is then quickly flushed to
        /// the real term vectors files in the Directory.
        /// </summary>
        internal override void Finish()
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));

            int numPostings = termsHashPerField.numPostings;

            System.Diagnostics.Debug.Assert(numPostings >= 0);

            if (!doVectors || numPostings == 0)
            {
                return;
            }

            if (numPostings > maxNumPostings)
            {
                maxNumPostings = numPostings;
            }

            IndexOutput tvf = perThread.doc.perDocTvf;

            // This is called once, after inverting all occurrences
            // of a given field in the doc.  At this point we flush
            // our hash into the DocWriter.

            System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
            System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));

            perThread.doc.AddField(termsHashPerField.fieldInfo.number);

            RawPostingList[] postings = termsHashPerField.SortPostings();

            // Field header: term count followed by the positions/offsets flag byte.
            tvf.WriteVInt(numPostings);
            byte bits = (byte) (0x0);
            if (doVectorPositions)
            {
                bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
            }
            if (doVectorOffsets)
            {
                bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
            }
            tvf.WriteByte(bits);

            int encoderUpto = 0;
            int lastTermBytesCount = 0;

            ByteSliceReader reader = perThread.vectorSliceReader;
            char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;

            for (int j = 0; j < numPostings; j++)
            {
                TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList) postings[j];
                int freq = posting.freq;

                char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
                int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

                // We swap between two encoders to save copying
                // last Term's byte array
                UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];

                // TODO: we could do this incrementally
                UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
                int termBytesCount = utf8Result.length;

                // TODO: UTF16toUTF8 could tell us this prefix
                // Compute common prefix between last term and
                // this term
                int prefix = 0;
                if (j > 0)
                {
                    // The other encoder slot still holds the previous term's bytes.
                    byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result;
                    byte[] termBytes = perThread.utf8Results[encoderUpto].result;
                    prefix = CommonPrefixLength(lastTermBytes, lastTermBytesCount, termBytes, termBytesCount);
                }

                encoderUpto = 1 - encoderUpto;
                lastTermBytesCount = termBytesCount;

                // Term is stored as (shared prefix length, suffix length, suffix bytes).
                int suffix = termBytesCount - prefix;
                tvf.WriteVInt(prefix);
                tvf.WriteVInt(suffix);
                tvf.WriteBytes(utf8Result.result, prefix, suffix);
                tvf.WriteVInt(freq);

                if (doVectorPositions)
                {
                    // Stream 0 holds the position data buffered by NewTerm/AddTerm.
                    termsHashPerField.InitReader(reader, posting, 0);
                    reader.WriteTo(tvf);
                }

                if (doVectorOffsets)
                {
                    // Stream 1 holds the offset data buffered by NewTerm/AddTerm.
                    termsHashPerField.InitReader(reader, posting, 1);
                    reader.WriteTo(tvf);
                }
            }

            termsHashPerField.Reset();

            // NOTE: we clear, per-field, at the thread level,
            // because term vectors fully write themselves on each
            // field; this saves RAM (eg if large doc has two large
            // fields w/ term vectors on) because we recycle/reuse
            // all RAM after each field:
            perThread.termsHashPerThread.Reset(false);
        }

        /// <summary>
        /// Returns the number of leading bytes that two byte sequences have in common.
        /// </summary>
        /// <param name="previous">Bytes of the previous term.</param>
        /// <param name="previousLength">Number of valid bytes in <paramref name="previous"/>.</param>
        /// <param name="current">Bytes of the current term.</param>
        /// <param name="currentLength">Number of valid bytes in <paramref name="current"/>.</param>
        /// <returns>Length of the shared prefix; never more than the shorter length.</returns>
        private static int CommonPrefixLength(byte[] previous, int previousLength, byte[] current, int currentLength)
        {
            int prefix = 0;
            while (prefix < previousLength && prefix < currentLength)
            {
                if (previous[prefix] != current[prefix])
                {
                    break;
                }
                prefix++;
            }
            return prefix;
        }

        /// <summary>
        /// Shrinks the underlying hash based on the largest posting count
        /// recorded since the previous call, then resets that high-water mark.
        /// </summary>
        internal void ShrinkHash()
        {
            termsHashPerField.ShrinkHash(maxNumPostings);
            maxNumPostings = 0;
        }

        /// <summary>
        /// Prepares for inverting one field instance: obtains the offset
        /// attribute from the field's attribute source when offsets are
        /// being recorded, and clears it otherwise.
        /// </summary>
        /// <param name="f">The field instance about to be processed (not read here).</param>
        internal override void Start(IFieldable f)
        {
            if (doVectorOffsets)
            {
                offsetAttribute = fieldState.attributeSource.AddAttribute<IOffsetAttribute>();
            }
            else
            {
                offsetAttribute = null;
            }
        }

        /// <summary>
        /// Records the first occurrence of a term in the current field:
        /// sets its frequency to 1 and buffers the absolute start offset plus
        /// length (stream 1) and the absolute position (stream 0) as enabled.
        /// </summary>
        /// <param name="p0">Posting for the new term; must be a <see cref="TermVectorsTermsWriter.PostingList"/>.</param>
        internal override void NewTerm(RawPostingList p0)
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.newTerm start"));

            TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;

            p.freq = 1;

            if (doVectorOffsets)
            {
                int startOffset = fieldState.offset + offsetAttribute.StartOffset;
                int endOffset = fieldState.offset + offsetAttribute.EndOffset;

                termsHashPerField.WriteVInt(1, startOffset);
                termsHashPerField.WriteVInt(1, endOffset - startOffset);
                p.lastOffset = endOffset;
            }

            if (doVectorPositions)
            {
                termsHashPerField.WriteVInt(0, fieldState.position);
                p.lastPosition = fieldState.position;
            }
        }

        /// <summary>
        /// Records a further occurrence of an already-seen term: increments
        /// its frequency and buffers the start offset as a delta from the
        /// previous end offset plus the length (stream 1), and the position
        /// as a delta from the previous position (stream 0), as enabled.
        /// </summary>
        /// <param name="p0">Posting for the term; must be a <see cref="TermVectorsTermsWriter.PostingList"/>.</param>
        internal override void AddTerm(RawPostingList p0)
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.addTerm start"));

            TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;

            p.freq++;

            if (doVectorOffsets)
            {
                int startOffset = fieldState.offset + offsetAttribute.StartOffset;
                int endOffset = fieldState.offset + offsetAttribute.EndOffset;

                termsHashPerField.WriteVInt(1, startOffset - p.lastOffset);
                termsHashPerField.WriteVInt(1, endOffset - startOffset);
                p.lastOffset = endOffset;
            }

            if (doVectorPositions)
            {
                termsHashPerField.WriteVInt(0, fieldState.position - p.lastPosition);
                p.lastPosition = fieldState.position;
            }
        }

        /// <summary>Nothing to do for term vectors when an over-long term is skipped.</summary>
        internal override void SkippingLongTerm()
        {
        }
    }
}