/* * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ using System; using Directory = Lucene.Net.Store.Directory; using OutputStream = Lucene.Net.Store.OutputStream; using StringHelper = Lucene.Net.Util.StringHelper; namespace Lucene.Net.Index { /// Writer works by opening a document and then opening the fields within the document and then /// writing out the vectors for each Field. /// /// Rough usage: /// /// /// for each document /// { /// writer.openDocument(); /// for each Field on the document /// { /// writer.openField(Field); /// for all of the terms /// { /// writer.addTerm(...) /// } /// writer.closeField /// } /// writer.closeDocument() /// } /// /// sealed public class TermVectorsWriter { public const int FORMAT_VERSION = 1; //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file public const int FORMAT_SIZE = 4; //TODO: Figure out how to write with or w/o position information and read back in public const System.String TVX_EXTENSION = ".tvx"; public const System.String TVD_EXTENSION = ".tvd"; public const System.String TVF_EXTENSION = ".tvf"; private OutputStream tvx = null, tvd = null, tvf = null; private System.Collections.ArrayList fields = null; private System.Collections.ArrayList terms = null; private FieldInfos fieldInfos; private TVField currentField = null; private long currentDocPointer = - 1; /// Create term vectors writer for the specified segment in specified /// directory. A new TermVectorsWriter should be created for each /// segment. The parameter maxFields indicates how many total /// fields are found in this document. Not all of these fields may require /// termvectors to be stored, so the number of calls to /// openField is less or equal to this number. /// public TermVectorsWriter(Directory directory, System.String segment, FieldInfos fieldInfos) { // Open files for TermVector storage tvx = directory.CreateFile(segment + TVX_EXTENSION); tvx.WriteInt(FORMAT_VERSION); tvd = directory.CreateFile(segment + TVD_EXTENSION); tvd.WriteInt(FORMAT_VERSION); tvf = directory.CreateFile(segment + TVF_EXTENSION); tvf.WriteInt(FORMAT_VERSION); this.fieldInfos = fieldInfos; fields = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(fieldInfos.Size())); terms = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10)); } public void OpenDocument() { CloseDocument(); currentDocPointer = tvd.GetFilePointer(); } public void CloseDocument() { if (IsDocumentOpen()) { CloseField(); WriteDoc(); fields.Clear(); currentDocPointer = - 1; } } public bool IsDocumentOpen() { return currentDocPointer != - 1; } /// Start processing a Field. This can be followed by a number of calls to /// addTerm, and a final call to closeField to indicate the end of /// processing of this Field. If a Field was previously open, it is /// closed automatically. /// public void OpenField(System.String field) { if (!IsDocumentOpen()) throw new System.SystemException("Cannot open Field when no document is open."); CloseField(); currentField = new TVField(fieldInfos.FieldNumber(field)); } /// Finished processing current Field. This should be followed by a call to /// openField before future calls to addTerm. /// public void CloseField() { if (IsFieldOpen()) { /* DEBUG */ //System.out.println("closeField()"); /* DEBUG */ // save Field and terms WriteField(); fields.Add(currentField); terms.Clear(); currentField = null; } } /// Return true if a Field is currently open. public bool IsFieldOpen() { return currentField != null; } /// Add term to the Field's term vector. Field must already be open /// of NullPointerException is thrown. Terms should be added in /// increasing order of terms, one call per unique termNum. ProxPointer /// is a pointer into the TermPosition file (prx). Freq is the number of /// times this term appears in this Field, in this document. /// public void AddTerm(System.String termText, int freq) { if (!IsDocumentOpen()) throw new System.SystemException("Cannot add terms when document is not open"); if (!IsFieldOpen()) throw new System.SystemException("Cannot add terms when Field is not open"); AddTermInternal(termText, freq); } private void AddTermInternal(System.String termText, int freq) { currentField.length += freq; TVTerm term = new TVTerm(); term.termText = termText; term.freq = freq; terms.Add(term); } /// Add specified vectors to the document. public void AddVectors(TermFreqVector[] vectors) { if (!IsDocumentOpen()) throw new System.SystemException("Cannot add term vectors when document is not open"); if (IsFieldOpen()) throw new System.SystemException("Cannot add term vectors when Field is open"); for (int i = 0; i < vectors.Length; i++) { AddTermFreqVector(vectors[i]); } } /// Add specified vector to the document. Document must be open but no Field /// should be open or exception is thrown. The same document can have addTerm /// and addVectors calls mixed, however a given Field must either be /// populated with addTerm or with addVector. * /// public void AddTermFreqVector(TermFreqVector vector) { if (!IsDocumentOpen()) throw new System.SystemException("Cannot add term vector when document is not open"); if (IsFieldOpen()) throw new System.SystemException("Cannot add term vector when Field is open"); AddTermFreqVectorInternal(vector); } private void AddTermFreqVectorInternal(TermFreqVector vector) { OpenField(vector.GetField()); for (int i = 0; i < vector.Size(); i++) { AddTermInternal(vector.GetTerms()[i], vector.GetTermFrequencies()[i]); } CloseField(); } /// Close all streams. public /*internal*/ void Close() { try { CloseDocument(); } finally { // make an effort to close all streams we can but remember and re-throw // the first exception encountered in this process System.IO.IOException keep = null; if (tvx != null) try { tvx.Close(); } catch (System.IO.IOException e) { if (keep == null) keep = e; } if (tvd != null) try { tvd.Close(); } catch (System.IO.IOException e) { if (keep == null) keep = e; } if (tvf != null) try { tvf.Close(); } catch (System.IO.IOException e) { if (keep == null) keep = e; } if (keep != null) { throw new System.IO.IOException(keep.StackTrace); } } } private void WriteField() { // remember where this Field is written currentField.tvfPointer = tvf.GetFilePointer(); //System.out.println("Field Pointer: " + currentField.tvfPointer); int size; tvf.WriteVInt(size = terms.Count); tvf.WriteVInt(currentField.length - size); System.String lastTermText = ""; // write term ids and positions for (int i = 0; i < size; i++) { TVTerm term = (TVTerm) terms[i]; //tvf.writeString(term.termText); int start = StringHelper.StringDifference(lastTermText, term.termText); int length = term.termText.Length - start; tvf.WriteVInt(start); // write shared prefix length tvf.WriteVInt(length); // write delta length tvf.WriteChars(term.termText, start, length); // write delta chars tvf.WriteVInt(term.freq); lastTermText = term.termText; } } private void WriteDoc() { if (IsFieldOpen()) throw new System.SystemException("Field is still open while writing document"); //System.out.println("Writing doc pointer: " + currentDocPointer); // write document index record tvx.WriteLong(currentDocPointer); // write document data record int size; // write the number of fields tvd.WriteVInt(size = fields.Count); // write Field numbers int lastFieldNumber = 0; for (int i = 0; i < size; i++) { TVField field = (TVField) fields[i]; tvd.WriteVInt(field.number - lastFieldNumber); lastFieldNumber = field.number; } // write Field pointers long lastFieldPointer = 0; for (int i = 0; i < size; i++) { TVField field = (TVField) fields[i]; tvd.WriteVLong(field.tvfPointer - lastFieldPointer); lastFieldPointer = field.tvfPointer; } //System.out.println("After writing doc pointer: " + tvx.getFilePointer()); } private class TVField { internal int number; internal long tvfPointer = 0; internal int length = 0; // number of distinct term positions internal TVField(int number) { this.number = number; } } private class TVTerm { internal System.String termText; internal int freq = 0; //int positions[] = null; } } }