/*
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
using System;
using Directory = Lucene.Net.Store.Directory;
using InputStream = Lucene.Net.Store.InputStream;

namespace Lucene.Net.Index
{
    /// <summary>
    /// Reads the term vectors of one segment from three streams opened on the
    /// files named <c>segment + TermVectorsWriter.TVX_EXTENSION</c>,
    /// <c>TVD_EXTENSION</c> and <c>TVF_EXTENSION</c>.
    /// Every operation that touches the streams runs under <c>lock (this)</c>.
    /// TODO: relax synchro!
    /// </summary>
    public class TermVectorsReader
    {
        private FieldInfos fieldInfos;

        // Index stream: after a header of TermVectorsWriter.FORMAT_SIZE bytes it is
        // read as one 8-byte pointer into tvd per document.
        private InputStream tvx;

        // Per-document stream: field count, delta-coded field numbers, then
        // delta-coded pointers into tvf.
        private InputStream tvd;

        // Per-field stream holding the terms and their frequencies.
        private InputStream tvf;

        // Number of documents, derived from the length of tvx; stays 0 when the
        // segment has no tvx file.
        private int size;

        /// <summary>
        /// Opens the term vector streams of a segment and validates the format
        /// version of each. When the directory has no tvx file for the segment
        /// the streams stay null and every lookup returns null.
        /// </summary>
        /// <param name="d">The directory holding the segment files</param>
        /// <param name="segment">The segment name the file extensions are appended to</param>
        /// <param name="fieldInfos">Used to map between field names and field numbers</param>
        /// <exception cref="System.IO.IOException">
        /// If a stream reports a format version newer than TermVectorsWriter.FORMAT_VERSION
        /// </exception>
        public /*internal*/ TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos)
        {
            this.fieldInfos = fieldInfos;

            if (d.FileExists(segment + TermVectorsWriter.TVX_EXTENSION))
            {
                try
                {
                    tvx = d.OpenFile(segment + TermVectorsWriter.TVX_EXTENSION);
                    CheckValidFormat(tvx);
                    tvd = d.OpenFile(segment + TermVectorsWriter.TVD_EXTENSION);
                    CheckValidFormat(tvd);
                    tvf = d.OpenFile(segment + TermVectorsWriter.TVF_EXTENSION);
                    CheckValidFormat(tvf);

                    // Divide before narrowing: casting the length first would
                    // truncate a large index before the division.
                    size = (int) (tvx.Length() / 8);
                }
                catch
                {
                    // The caller never receives an instance it could Close(), so
                    // release whatever was opened before the failure.
                    try
                    {
                        CloseStreams();
                    }
                    catch (System.Exception)
                    {
                        // Ignored on purpose: the failure that brought us here is
                        // the one worth reporting, and it is rethrown below.
                    }
                    throw;
                }
            }
        }

        /// <summary>
        /// Reads the format version at the current position of the stream and
        /// rejects versions newer than TermVectorsWriter.FORMAT_VERSION.
        /// </summary>
        /// <param name="in_Renamed">The stream to read the version from</param>
        /// <exception cref="System.IO.IOException">If the version read is too new</exception>
        private void CheckValidFormat(InputStream in_Renamed)
        {
            int format = in_Renamed.ReadInt();
            if (format > TermVectorsWriter.FORMAT_VERSION)
            {
                throw new System.IO.IOException("Incompatible format version: " + format + " expected " + TermVectorsWriter.FORMAT_VERSION + " or less");
            }
        }

        /// <summary>
        /// Closes the three streams. Each stream that was opened is attempted
        /// even if closing an earlier one throws; the exception still propagates.
        /// </summary>
        internal virtual void Close()
        {
            lock (this)
            {
                CloseStreams();
            }
        }

        /// <summary>
        /// Closes tvx, tvd and tvf in that order, skipping those that are null.
        /// The nested finally blocks make sure a failure on one stream does not
        /// leave the remaining ones open.
        /// </summary>
        private void CloseStreams()
        {
            try
            {
                if (tvx != null)
                {
                    tvx.Close();
                }
            }
            finally
            {
                try
                {
                    if (tvd != null)
                    {
                        tvd.Close();
                    }
                }
                finally
                {
                    if (tvf != null)
                    {
                        tvf.Close();
                    }
                }
            }
        }

        /// <summary>
        /// The number of documents in the reader
        /// </summary>
        internal virtual int Size()
        {
            return size;
        }

        /// <summary>
        /// Retrieve the term vector for the given document and Field
        /// </summary>
        /// <param name="docNum">The document number to retrieve the vector for</param>
        /// <param name="field">The Field within the document to retrieve</param>
        /// <returns>
        /// The TermFreqVector for the document and Field or null (also when the
        /// segment has no term vectors, or when reading fails)
        /// </returns>
        public /*internal*/ virtual TermFreqVector Get(int docNum, System.String field)
        {
            lock (this)
            {
                int fieldNumber = fieldInfos.FieldNumber(field);
                TermFreqVector result = null;

                // Check if no term vectors are available for this segment at all
                if (tvx != null)
                {
                    try
                    {
                        // We need to account for the FORMAT_SIZE when seeking in the tvx.
                        // We don't need to do this in other seeks because we already have
                        // the file pointer that was written in another file.
                        tvx.Seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
                        long position = tvx.ReadLong();

                        tvd.Seek(position);
                        int fieldCount = tvd.ReadVInt();

                        // There are only a few fields per document. We opt for a full scan
                        // rather than requiring that they be ordered. We need to read through
                        // all of the fields anyway to get to the tvf pointers.
                        int number = 0;
                        int found = -1;
                        for (int i = 0; i < fieldCount; i++)
                        {
                            number += tvd.ReadVInt();
                            if (number == fieldNumber)
                            {
                                found = i;
                            }
                        }

                        // found stays -1 when this Field, although valid in the segment,
                        // was not found in this document; null is returned in that case.
                        if (found != -1)
                        {
                            // Compute position in the tvf file by summing the deltas
                            // up to and including the matching field.
                            position = 0;
                            for (int i = 0; i <= found; i++)
                            {
                                position += tvd.ReadVLong();
                            }
                            result = ReadTermVector(field, position);
                        }
                    }
                    catch (System.Exception e)
                    {
                        // The catch stays broad and null is still returned, as before,
                        // but the failure is no longer discarded silently: report it
                        // the same way Get(int) does.
                        Console.Error.Write(e.StackTrace);
                        Console.Error.Flush();
                    }
                }
                else
                {
                    System.Console.Out.WriteLine("No tvx file");
                }
                return result;
            }
        }

        /// <summary>
        /// Return all term vectors stored for this document, or null if they
        /// could not be read in.
        /// </summary>
        /// <param name="docNum">The document number to retrieve the vectors for</param>
        /// <returns>
        /// One vector per field recorded for the document; null when the segment
        /// has no term vectors, the document has no fields, or an IOException occurs
        /// </returns>
        internal virtual TermFreqVector[] Get(int docNum)
        {
            lock (this)
            {
                TermFreqVector[] result = null;

                // Check if no term vectors are available for this segment at all
                if (tvx != null)
                {
                    try
                    {
                        // We need to offset by the format header of the tvx
                        tvx.Seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
                        long position = tvx.ReadLong();

                        tvd.Seek(position);
                        int fieldCount = tvd.ReadVInt();

                        // fieldCount == 0: no fields are vectorized for this document
                        if (fieldCount != 0)
                        {
                            // Field numbers are delta-coded; resolve each to its name.
                            int number = 0;
                            System.String[] fields = new System.String[fieldCount];
                            for (int i = 0; i < fieldCount; i++)
                            {
                                number += tvd.ReadVInt();
                                fields[i] = fieldInfos.FieldName(number);
                            }

                            // Compute position in the tvf file
                            position = 0;
                            long[] tvfPointers = new long[fieldCount];
                            for (int i = 0; i < fieldCount; i++)
                            {
                                position += tvd.ReadVLong();
                                tvfPointers[i] = position;
                            }

                            result = ReadTermVectors(fields, tvfPointers);
                        }
                    }
                    catch (System.IO.IOException e)
                    {
                        Console.Error.Write(e.StackTrace);
                        Console.Error.Flush();
                    }
                }
                else
                {
                    System.Console.Out.WriteLine("No tvx file");
                }
                return result;
            }
        }

        /// <summary>
        /// Reads one term vector per field, pairing fields[i] with tvfPointers[i].
        /// </summary>
        /// <param name="fields">The Fields to read in</param>
        /// <param name="tvfPointers">For each Field, the pointer within the tvf file where reading starts</param>
        /// <returns>The term vectors, in the order of <paramref name="fields"/></returns>
        private SegmentTermVector[] ReadTermVectors(System.String[] fields, long[] tvfPointers)
        {
            SegmentTermVector[] res = new SegmentTermVector[fields.Length];
            for (int i = 0; i < fields.Length; i++)
            {
                res[i] = ReadTermVector(fields[i], tvfPointers[i]);
            }
            return res;
        }

        /// <summary>
        /// Reads a single term vector from the tvf stream.
        /// </summary>
        /// <param name="field">The Field to read in</param>
        /// <param name="tvfPointer">The pointer within the tvf file where we should start reading</param>
        /// <returns>The TermVector located at that position</returns>
        /// <exception cref="System.IO.IOException">Presumably raised by the stream on a failed read</exception>
        private SegmentTermVector ReadTermVector(System.String field, long tvfPointer)
        {
            // Now read the data from specified position.
            // We don't need to offset by the FORMAT here since the pointer already includes the offset
            tvf.Seek(tvfPointer);

            int numTerms = tvf.ReadVInt();

            // If no terms - return a constant empty termvector
            if (numTerms == 0)
            {
                return new SegmentTermVector(field, null, null);
            }

            // A second VInt follows the term count. Its value is not used here,
            // but it must be consumed so the stream is positioned on the first term.
            // NOTE(review): presumably (field length - number of terms) as stored
            // by the writer - confirm against TermVectorsWriter.
            tvf.ReadVInt();

            System.String[] terms = new System.String[numTerms];
            int[] termFreqs = new int[numTerms];

            char[] buffer = new char[0];
            System.String previousString = "";

            for (int i = 0; i < numTerms; i++)
            {
                // Each term keeps the first 'start' chars already in the buffer
                // from the previous term and reads 'deltaLength' new chars after them.
                int start = tvf.ReadVInt();
                int deltaLength = tvf.ReadVInt();
                int totalLength = start + deltaLength;

                if (buffer.Length < totalLength)
                {
                    // Grow the buffer, carrying the previous term over so the
                    // reused prefix is still available.
                    char[] grown = new char[totalLength];
                    previousString.CopyTo(0, grown, 0, previousString.Length);
                    buffer = grown;
                }

                tvf.ReadChars(buffer, start, deltaLength);
                terms[i] = new System.String(buffer, 0, totalLength);
                previousString = terms[i];
                termFreqs[i] = tvf.ReadVInt();
            }

            return new SegmentTermVector(field, terms, termFreqs);
        }
    }
}