/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using IndexOutput = Lucene.Net.Store.IndexOutput;
using RAMOutputStream = Lucene.Net.Store.RAMOutputStream;
using ArrayUtil = Lucene.Net.Util.ArrayUtil;

namespace Lucene.Net.Index
{
    /// <summary>Writes term vectors for a segment: maintains the term vectors
    /// index (tvx), documents (tvd) and fields (tvf) files as each document
    /// is finished.</summary>
    sealed class TermVectorsTermsWriter : TermsHashConsumer
    {
        private void InitBlock()
        {
            docFreeList = new PerDoc[1];
        }

        internal DocumentsWriter docWriter;
        internal TermVectorsWriter termVectorsWriter;
        internal PerDoc[] docFreeList;
        internal int freeCount;
        internal IndexOutput tvx;
        internal IndexOutput tvd;
        internal IndexOutput tvf;
        internal int lastDocID;

        public TermVectorsTermsWriter(DocumentsWriter docWriter)
        {
            InitBlock();
            this.docWriter = docWriter;
        }

        public override TermsHashConsumerPerThread AddThread(TermsHashPerThread termsHashPerThread)
        {
            return new TermVectorsTermsWriterPerThread(termsHashPerThread, this);
        }

        internal override void CreatePostings(RawPostingList[] postings, int start, int count)
        {
            int end = start + count;
            for (int i = start; i < end; i++)
                postings[i] = new PostingList();
        }

        public override void Flush(IDictionary<TermsHashConsumerPerThread, ICollection<TermsHashConsumerPerField>> threadsAndFields, SegmentWriteState state)
        {
            lock (this)
            {
                // NOTE: it's possible that all documents seen in this segment
                // hit non-aborting exceptions, in which case we will
                // not have yet init'd the TermVectorsWriter.  This is
                // actually OK (unlike in the stored fields case)
                // because, although FieldInfos.HasVectors() will return
                // true, the TermVectorsReader gracefully handles
                // non-existence of the term vectors files.
                if (tvx != null)
                {
                    if (state.numDocsInStore > 0)
                        // In case there are some final documents that we
                        // didn't see (because they hit a non-aborting exception):
                        Fill(state.numDocsInStore - docWriter.DocStoreOffset);

                    tvx.Flush();
                    tvd.Flush();
                    tvf.Flush();
                }

                foreach (var entry in threadsAndFields)
                {
                    foreach (var field in entry.Value)
                    {
                        TermVectorsTermsWriterPerField perField = (TermVectorsTermsWriterPerField) field;
                        perField.termsHashPerField.Reset();
                        perField.ShrinkHash();
                    }

                    TermVectorsTermsWriterPerThread perThread = (TermVectorsTermsWriterPerThread) entry.Key;
                    perThread.termsHashPerThread.Reset(true);
                }
            }
        }

        internal override void CloseDocStore(SegmentWriteState state)
        {
            lock (this)
            {
                if (tvx != null)
                {
                    // At least one doc in this run had term vectors
                    // enabled
                    Fill(state.numDocsInStore - docWriter.DocStoreOffset);
                    tvx.Close();
                    tvf.Close();
                    tvd.Close();
                    tvx = null;
                    System.Diagnostics.Debug.Assert(state.docStoreSegmentName != null);
                    System.String fileName = state.docStoreSegmentName + "." + IndexFileNames.VECTORS_INDEX_EXTENSION;
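                    // Sanity-check the size of the term vectors index file (tvx):
                    // as written by this class, tvx holds a 4-byte format header
                    // followed by two 8-byte file pointers per document (one into
                    // tvd, one into tvf), hence the expected length of
                    // 4 + numDocsInStore * 16 bytes checked below.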
                    if (4 + ((long) state.numDocsInStore) * 16 != state.directory.FileLength(fileName))
                        throw new System.SystemException("after flush: tvx size mismatch: " + state.numDocsInStore + " docs vs " + state.directory.FileLength(fileName) + " length in bytes of " + fileName + " file exists?=" + state.directory.FileExists(fileName));

                    state.flushedFiles.Add(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
                    state.flushedFiles.Add(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
                    state.flushedFiles.Add(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);

                    docWriter.RemoveOpenFile(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
                    docWriter.RemoveOpenFile(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
                    docWriter.RemoveOpenFile(state.docStoreSegmentName + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);

                    lastDocID = 0;
                }
            }
        }

        internal int allocCount;

        internal PerDoc GetPerDoc()
        {
            lock (this)
            {
                if (freeCount == 0)
                {
                    allocCount++;
                    if (allocCount > docFreeList.Length)
                    {
                        // Grow our free list up front to make sure we have
                        // enough space to recycle all outstanding PerDoc
                        // instances
                        System.Diagnostics.Debug.Assert(allocCount == 1 + docFreeList.Length);
                        docFreeList = new PerDoc[ArrayUtil.GetNextSize(allocCount)];
                    }
                    return new PerDoc(this);
                }
                else
                    return docFreeList[--freeCount];
            }
        }

        /// <summary>Fills in no-term-vectors for all docs we haven't seen
        /// since the last doc that had term vectors.</summary>
        internal void Fill(int docID)
        {
            int docStoreOffset = docWriter.DocStoreOffset;
            int end = docID + docStoreOffset;
            if (lastDocID < end)
            {
                long tvfPosition = tvf.FilePointer;
                while (lastDocID < end)
                {
                    tvx.WriteLong(tvd.FilePointer);
                    tvd.WriteVInt(0);
                    tvx.WriteLong(tvfPosition);
                    lastDocID++;
                }
            }
        }

        internal void InitTermVectorsWriter()
        {
            lock (this)
            {
                if (tvx == null)
                {
                    System.String docStoreSegment = docWriter.DocStoreSegment;

                    if (docStoreSegment == null)
                        return;

                    System.Diagnostics.Debug.Assert(docStoreSegment != null);

                    // If we hit an exception while init'ing the term
                    // vector output files, we must abort this segment
                    // because those files will be in an unknown
                    // state:
                    tvx = docWriter.directory.CreateOutput(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
                    tvd = docWriter.directory.CreateOutput(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
                    tvf = docWriter.directory.CreateOutput(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);

                    tvx.WriteInt(TermVectorsReader.FORMAT_CURRENT);
                    tvd.WriteInt(TermVectorsReader.FORMAT_CURRENT);
                    tvf.WriteInt(TermVectorsReader.FORMAT_CURRENT);
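                    // Register the three new files as open doc-store files with
                    // DocumentsWriter so they are tracked until the doc store is
                    // closed; CloseDocStore() removes them again via RemoveOpenFile().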
                    docWriter.AddOpenFile(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
                    docWriter.AddOpenFile(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
                    docWriter.AddOpenFile(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);

                    lastDocID = 0;
                }
            }
        }

        internal void FinishDocument(PerDoc perDoc)
        {
            lock (this)
            {
                System.Diagnostics.Debug.Assert(docWriter.writer.TestPoint("TermVectorsTermsWriter.finishDocument start"));

                InitTermVectorsWriter();

                Fill(perDoc.docID);

                // Append term vectors to the real outputs:
                tvx.WriteLong(tvd.FilePointer);
                tvx.WriteLong(tvf.FilePointer);
                tvd.WriteVInt(perDoc.numVectorFields);
                if (perDoc.numVectorFields > 0)
                {
                    for (int i = 0; i < perDoc.numVectorFields; i++)
                        tvd.WriteVInt(perDoc.fieldNumbers[i]);
                    System.Diagnostics.Debug.Assert(0 == perDoc.fieldPointers[0]);
                    long lastPos = perDoc.fieldPointers[0];
                    for (int i = 1; i < perDoc.numVectorFields; i++)
                    {
                        long pos = perDoc.fieldPointers[i];
                        tvd.WriteVLong(pos - lastPos);
                        lastPos = pos;
                    }
                    perDoc.perDocTvf.WriteTo(tvf);
                    perDoc.numVectorFields = 0;
                }

                System.Diagnostics.Debug.Assert(lastDocID == perDoc.docID + docWriter.DocStoreOffset);

                lastDocID++;

                perDoc.Reset();
                Free(perDoc);
                System.Diagnostics.Debug.Assert(docWriter.writer.TestPoint("TermVectorsTermsWriter.finishDocument end"));
            }
        }

        public bool FreeRAM()
        {
            // We don't hold any state beyond one doc, so we don't
            // free persistent RAM here
            return false;
        }

        public override void Abort()
        {
            if (tvx != null)
            {
                try
                {
                    tvx.Close();
                }
                catch (System.Exception)
                {
                }
                tvx = null;
            }
            if (tvd != null)
            {
                try
                {
                    tvd.Close();
                }
                catch (System.Exception)
                {
                }
                tvd = null;
            }
            if (tvf != null)
            {
                try
                {
                    tvf.Close();
                }
                catch (System.Exception)
                {
                }
                tvf = null;
            }
            lastDocID = 0;
        }

        internal void Free(PerDoc doc)
        {
            lock (this)
            {
                System.Diagnostics.Debug.Assert(freeCount < docFreeList.Length);
                docFreeList[freeCount++] = doc;
            }
        }

        internal class PerDoc : DocumentsWriter.DocWriter
        {
            public PerDoc(TermVectorsTermsWriter enclosingInstance)
            {
                InitBlock(enclosingInstance);
            }

            private void InitBlock(TermVectorsTermsWriter enclosingInstance)
            {
                this.enclosingInstance = enclosingInstance;
                buffer = enclosingInstance.docWriter.NewPerDocBuffer();
                perDocTvf = new RAMOutputStream(buffer);
            }

            private TermVectorsTermsWriter enclosingInstance;
            public TermVectorsTermsWriter Enclosing_Instance
            {
                get { return enclosingInstance; }
            }

            internal DocumentsWriter.PerDocBuffer buffer;
            internal RAMOutputStream perDocTvf;
            internal int numVectorFields;

            internal int[] fieldNumbers = new int[1];
            internal long[] fieldPointers = new long[1];

            internal void Reset()
            {
                perDocTvf.Reset();
                buffer.Recycle();
                numVectorFields = 0;
            }

            public override void Abort()
            {
                Reset();
                Enclosing_Instance.Free(this);
            }

            internal void AddField(int fieldNumber)
            {
                if (numVectorFields == fieldNumbers.Length)
                {
                    fieldNumbers = ArrayUtil.Grow(fieldNumbers);
                    fieldPointers = ArrayUtil.Grow(fieldPointers);
                }
                fieldNumbers[numVectorFields] = fieldNumber;
                fieldPointers[numVectorFields] = perDocTvf.FilePointer;
                numVectorFields++;
            }

            public override long SizeInBytes()
            {
                return buffer.SizeInBytes;
            }

            public override void Finish()
            {
                Enclosing_Instance.FinishDocument(this);
            }
        }

        internal sealed class PostingList : RawPostingList
        {
            internal int freq; // How many times this term occurred in the current doc
            internal int lastOffset; // Last offset we saw
            internal int lastPosition; // Last position where this term occurred
        }

        internal override int BytesPerPosting()
        {
            return RawPostingList.BYTES_SIZE + 3 * DocumentsWriter.INT_NUM_BYTE;
        }
    }
}