/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Lucene.Net.Documents;
using Directory = Lucene.Net.Store.Directory;
using IndexInput = Lucene.Net.Store.IndexInput;

namespace Lucene.Net.Index
{
	
	/// <summary>Class responsible for access to stored document fields.
	/// <p/>
	/// It uses &lt;segment&gt;.fdt and &lt;segment&gt;.fdx files.
	/// </summary>
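	/// <example>
	/// A minimal usage sketch; the <c>Directory</c> and <see cref="FieldInfos"/>
	/// instances (<c>dir</c>, <c>fieldInfos</c>) and the segment name "_0" are
	/// illustrative placeholders supplied by the caller, not members of this class:
	/// <code>
	/// FieldsReader reader = new FieldsReader(dir, "_0", fieldInfos);
	/// try
	/// {
	///     for (int i = 0; i &lt; reader.Size(); i++)
	///     {
	///         // A null FieldSelector loads every stored field eagerly.
	///         Document doc = reader.Doc(i, null);
	///     }
	/// }
	/// finally
	/// {
	///     reader.Close();
	/// }
	/// </code>
	/// </example>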
	/// <version>  $Id: FieldsReader.java 507009 2007-02-13 14:06:52Z gsingers $ </version>
	public sealed class FieldsReader
	{
		private FieldInfos fieldInfos;
		
		// The main fieldStream, used only for cloning.
		private IndexInput cloneableFieldsStream;
		
		// This is a clone of cloneableFieldsStream used for reading documents.
		// It should not be cloned outside of a synchronized context.
		private IndexInput fieldsStream;
		
		private IndexInput indexStream;
		private int size;
		
		// Thread-local slot holding a per-thread clone of the fields stream, used by lazy fields.
		private System.LocalDataStoreSlot fieldsStreamTL = System.Threading.Thread.AllocateDataSlot();
		
		public FieldsReader(Directory d, System.String segment, FieldInfos fn)
		{
			fieldInfos = fn;
			
			cloneableFieldsStream = d.OpenInput(segment + ".fdt");
			fieldsStream = (IndexInput) cloneableFieldsStream.Clone();
			indexStream = d.OpenInput(segment + ".fdx");
			// Each index entry is one long (8 bytes) pointing into the .fdt file.
			size = (int) (indexStream.Length() / 8);
		}
		
		/// <summary>Closes the underlying <see cref="Lucene.Net.Store.IndexInput"/> streams, including any ones associated with a
		/// lazy implementation of a Field.  This means that the Fields values will not be accessible.
		/// </summary>
		/// <throws>  IOException </throws>
		public void Close()
		{
			fieldsStream.Close();
			cloneableFieldsStream.Close();
			indexStream.Close();
			IndexInput localFieldsStream = (IndexInput) System.Threading.Thread.GetData(fieldsStreamTL);
			if (localFieldsStream != null)
			{
				localFieldsStream.Close();
				System.Threading.Thread.SetData(fieldsStreamTL, null);
			}
		}
		
		public int Size()
		{
			return size;
		}
		
		public Document Doc(int n, FieldSelector fieldSelector)
		{
			indexStream.Seek(n * 8L);
			long position = indexStream.ReadLong();
			fieldsStream.Seek(position);
			
			Document doc = new Document();
			int numFields = fieldsStream.ReadVInt();
			for (int i = 0; i < numFields; i++)
			{
				int fieldNumber = fieldsStream.ReadVInt();
				FieldInfo fi = fieldInfos.FieldInfo(fieldNumber);
				FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.Accept(fi.name);
				
				byte bits = fieldsStream.ReadByte();
				bool compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
				bool tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
				bool binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
				
				// TODO: Find an alternative approach here if this list continues to grow beyond the
				// list of 5 or 6 currently here.  See LUCENE-762 for discussion.
				if (acceptField.Equals(FieldSelectorResult.LOAD))
				{
					AddField(doc, fi, binary, compressed, tokenize);
				}
				else if (acceptField.Equals(FieldSelectorResult.LOAD_FOR_MERGE))
				{
					AddFieldForMerge(doc, fi, binary, compressed, tokenize);
				}
				else if (acceptField.Equals(FieldSelectorResult.LOAD_AND_BREAK))
				{
					AddField(doc, fi, binary, compressed, tokenize);
					break; // Get out of this loop
				}
				else if (acceptField.Equals(FieldSelectorResult.LAZY_LOAD))
				{
					AddFieldLazy(doc, fi, binary, compressed, tokenize);
				}
				else if (acceptField.Equals(FieldSelectorResult.SIZE))
				{
					SkipField(binary, compressed, AddFieldSize(doc, fi, binary, compressed));
				}
				else if (acceptField.Equals(FieldSelectorResult.SIZE_AND_BREAK))
				{
					AddFieldSize(doc, fi, binary, compressed);
					break;
				}
				else
				{
					SkipField(binary, compressed);
				}
			}
			
			return doc;
		}
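		
		// Caller-side sketch of driving Doc(int, FieldSelector) with a selector.
		// BodyOnlySelector and the "body" field name are hypothetical, and this
		// assumes FieldSelectorResult.NO_LOAD is available in this version; none
		// of it is part of this class:
		//
		//   class BodyOnlySelector : FieldSelector
		//   {
		//       public FieldSelectorResult Accept(System.String fieldName)
		//       {
		//           return fieldName.Equals("body")
		//               ? FieldSelectorResult.LAZY_LOAD // defer the disk read until the value is asked for
		//               : FieldSelectorResult.NO_LOAD;  // falls through to SkipField(...) above
		//       }
		//   }
		//
		//   Document doc = reader.Doc(n, new BodyOnlySelector());
		//   System.String body = doc.GetFieldable("body").StringValue(); // LazyField reads from disk here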
		
		/// <summary>Skip the field.  We still have to read some of the information about the field, but can skip past the actual content.
		/// This will have the most payoff on large fields.
		/// </summary>
		private void SkipField(bool binary, bool compressed)
		{
			SkipField(binary, compressed, fieldsStream.ReadVInt());
		}
		
		private void SkipField(bool binary, bool compressed, int toRead)
		{
			if (binary || compressed)
			{
				long pointer = fieldsStream.GetFilePointer();
				fieldsStream.Seek(pointer + toRead);
			}
			else
			{
				// We need to skip chars.  This will slow us down, but it is still better
				// than reading content we are not going to use.
				fieldsStream.SkipChars(toRead);
			}
		}
		
		private void AddFieldLazy(Document doc, FieldInfo fi, bool binary, bool compressed, bool tokenize)
		{
			if (binary == true)
			{
				int toRead = fieldsStream.ReadVInt();
				long pointer = fieldsStream.GetFilePointer();
				if (compressed)
				{
					//was: doc.add(new Fieldable(fi.name, uncompress(b), Fieldable.Store.COMPRESS));
					doc.Add(new LazyField(this, fi.name, Field.Store.COMPRESS, toRead, pointer));
				}
				else
				{
					//was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES));
					doc.Add(new LazyField(this, fi.name, Field.Store.YES, toRead, pointer));
				}
				// Need to move the pointer ahead by toRead positions
				fieldsStream.Seek(pointer + toRead);
			}
			else
			{
				Field.Store store = Field.Store.YES;
				Field.Index index = GetIndexType(fi, tokenize);
				Field.TermVector termVector = GetTermVectorType(fi);
				
				Fieldable f;
				if (compressed)
				{
					store = Field.Store.COMPRESS;
					int toRead = fieldsStream.ReadVInt();
					long pointer = fieldsStream.GetFilePointer();
					f = new LazyField(this, fi.name, store, toRead, pointer);
					// Skip over the part that we aren't loading
					fieldsStream.Seek(pointer + toRead);
					f.SetOmitNorms(fi.omitNorms);
				}
				else
				{
					int length = fieldsStream.ReadVInt();
					long pointer = fieldsStream.GetFilePointer();
					// Skip ahead of where we are by the length of what is stored
					fieldsStream.SkipChars(length);
					f = new LazyField(this, fi.name, store, index, termVector, length, pointer);
					f.SetOmitNorms(fi.omitNorms);
				}
				doc.Add(f);
			}
		}
		
		// In merge mode we don't uncompress the data of a compressed field
		private void AddFieldForMerge(Document doc, FieldInfo fi, bool binary, bool compressed, bool tokenize)
		{
			System.Object data;
			
			if (binary || compressed)
			{
				int toRead = fieldsStream.ReadVInt();
				byte[] b = new byte[toRead];
				fieldsStream.ReadBytes(b, 0, b.Length);
				data = b;
			}
			else
			{
				data = fieldsStream.ReadString();
			}
			
			doc.Add(new FieldForMerge(data, fi, binary, compressed, tokenize));
		}
		
		private void AddField(Document doc, FieldInfo fi, bool binary, bool compressed, bool tokenize)
		{
			// We have a binary stored field, and it may be compressed
			if (binary)
			{
				int toRead = fieldsStream.ReadVInt();
				byte[] b = new byte[toRead];
				fieldsStream.ReadBytes(b, 0, b.Length);
				if (compressed)
					doc.Add(new Field(fi.name, Uncompress(b), Field.Store.COMPRESS));
				else
					doc.Add(new Field(fi.name, b, Field.Store.YES));
			}
			else
			{
				Field.Store store = Field.Store.YES;
				Field.Index index = GetIndexType(fi, tokenize);
				Field.TermVector termVector = GetTermVectorType(fi);
				
				Fieldable f;
				if (compressed)
				{
					store = Field.Store.COMPRESS;
					int toRead = fieldsStream.ReadVInt();
					
					byte[] b = new byte[toRead];
					fieldsStream.ReadBytes(b, 0, b.Length);
					f = new Field(fi.name, System.Text.Encoding.GetEncoding("UTF-8").GetString(Uncompress(b)), store, index, termVector);
					f.SetOmitNorms(fi.omitNorms);
				}
				else
				{
					f = new Field(fi.name, fieldsStream.ReadString(), store, index, termVector);
					f.SetOmitNorms(fi.omitNorms);
				}
				doc.Add(f);
			}
		}
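		
		// Worked example (illustrative): for a stored 3-char string field,
		// AddFieldSize below records bytesize = 2 * 3 = 6 and adds it as the
		// big-endian byte[] { 0, 0, 0, 6 }.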
		// Add the size of the field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes).
		// Read just the size -- the caller must skip the field content to continue reading fields.
		// Return the size in bytes or chars, depending on the field type.
		private int AddFieldSize(Document doc, FieldInfo fi, bool binary, bool compressed)
		{
			int size = fieldsStream.ReadVInt(), bytesize = binary || compressed ? size : 2 * size;
			byte[] sizebytes = new byte[4];
			sizebytes[0] = (byte) (SupportClass.Number.URShift(bytesize, 24));
			sizebytes[1] = (byte) (SupportClass.Number.URShift(bytesize, 16));
			sizebytes[2] = (byte) (SupportClass.Number.URShift(bytesize, 8));
			sizebytes[3] = (byte) bytesize;
			doc.Add(new Field(fi.name, sizebytes, Field.Store.YES));
			return size;
		}
		
		private Field.TermVector GetTermVectorType(FieldInfo fi)
		{
			Field.TermVector termVector = null;
			if (fi.storeTermVector)
			{
				if (fi.storeOffsetWithTermVector)
				{
					if (fi.storePositionWithTermVector)
					{
						termVector = Field.TermVector.WITH_POSITIONS_OFFSETS;
					}
					else
					{
						termVector = Field.TermVector.WITH_OFFSETS;
					}
				}
				else if (fi.storePositionWithTermVector)
				{
					termVector = Field.TermVector.WITH_POSITIONS;
				}
				else
				{
					termVector = Field.TermVector.YES;
				}
			}
			else
			{
				termVector = Field.TermVector.NO;
			}
			return termVector;
		}
		
		private Field.Index GetIndexType(FieldInfo fi, bool tokenize)
		{
			Field.Index index;
			if (fi.isIndexed && tokenize)
				index = Field.Index.TOKENIZED;
			else if (fi.isIndexed && !tokenize)
				index = Field.Index.UN_TOKENIZED;
			else
				index = Field.Index.NO;
			return index;
		}
		
		/// <summary>A lazy implementation of Fieldable that defers loading of fields until asked for, instead of when the Document is
		/// loaded.
		/// </summary>
		[Serializable]
		private class LazyField : AbstractField, Fieldable
		{
			private void InitBlock(FieldsReader enclosingInstance)
			{
				this.enclosingInstance = enclosingInstance;
			}
			private FieldsReader enclosingInstance;
			public FieldsReader Enclosing_Instance
			{
				get { return enclosingInstance; }
			}
			
			private int toRead;
			private long pointer;
			
			public LazyField(FieldsReader enclosingInstance, System.String name, Field.Store store, int toRead, long pointer) : base(name, store, Field.Index.NO, Field.TermVector.NO)
			{
				InitBlock(enclosingInstance);
				this.toRead = toRead;
				this.pointer = pointer;
				lazy = true;
			}
			
			public LazyField(FieldsReader enclosingInstance, System.String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer) : base(name, store, index, termVector)
			{
				InitBlock(enclosingInstance);
				this.toRead = toRead;
				this.pointer = pointer;
				lazy = true;
			}
			
			// Each thread gets its own clone of the fields stream, so concurrent
			// lazy loads do not corrupt each other's file position.
			private IndexInput GetFieldStream()
			{
				IndexInput localFieldsStream = (IndexInput) System.Threading.Thread.GetData(Enclosing_Instance.fieldsStreamTL);
				if (localFieldsStream == null)
				{
					localFieldsStream = (IndexInput) Enclosing_Instance.cloneableFieldsStream.Clone();
					System.Threading.Thread.SetData(Enclosing_Instance.fieldsStreamTL, localFieldsStream);
				}
				return localFieldsStream;
			}
			
			/// <summary>The value of the field in Binary, or null.  If null, the Reader or
			/// String value is used.  Exactly one of StringValue(), ReaderValue() and
			/// BinaryValue() must be set.
			/// </summary>
			public override byte[] BinaryValue()
			{
				if (fieldsData == null)
				{
					byte[] b = new byte[toRead];
					IndexInput localFieldsStream = GetFieldStream();
					// Wrap the IOException in a FieldReaderException, since IndexReader.Document()
					// throws anyway; probably not that big of a change for people, since they are
					// already handling this exception when getting the document.
					try
					{
						localFieldsStream.Seek(pointer);
						localFieldsStream.ReadBytes(b, 0, b.Length);
						if (isCompressed == true)
						{
							fieldsData = Enclosing_Instance.Uncompress(b);
						}
						else
						{
							fieldsData = b;
						}
					}
					catch (System.IO.IOException e)
					{
						throw new FieldReaderException(e);
					}
				}
				return fieldsData is byte[] ? (byte[]) fieldsData : null;
			}
			
			/// <summary>The value of the field as a Reader, or null.  If null, the String value
			/// or binary value is used.  Exactly one of StringValue(), ReaderValue(),
			/// and BinaryValue() must be set.
			/// </summary>
			public override System.IO.TextReader ReaderValue()
			{
				return fieldsData is System.IO.TextReader ? (System.IO.TextReader) fieldsData : null;
			}
			
			/// <summary>The value of the field as a String, or null.  If null, the Reader value
			/// or binary value is used.  Exactly one of StringValue(), ReaderValue(), and
			/// BinaryValue() must be set.
			/// </summary>
			public override System.String StringValue()
			{
				if (fieldsData == null)
				{
					IndexInput localFieldsStream = GetFieldStream();
					try
					{
						localFieldsStream.Seek(pointer);
						if (isCompressed)
						{
							byte[] b = new byte[toRead];
							localFieldsStream.ReadBytes(b, 0, b.Length);
							fieldsData = System.Text.Encoding.GetEncoding("UTF-8").GetString(Enclosing_Instance.Uncompress(b));
						}
						else
						{
							// Read in chars because we already know the length we need to read
							char[] chars = new char[toRead];
							localFieldsStream.ReadChars(chars, 0, toRead);
							fieldsData = new System.String(chars);
						}
					}
					catch (System.IO.IOException e)
					{
						throw new FieldReaderException(e);
					}
				}
				return fieldsData is System.String ? (System.String) fieldsData : null;
			}
			
			public long GetPointer()
			{
				return pointer;
			}
			
			public void SetPointer(long pointer)
			{
				this.pointer = pointer;
			}
			
			public int GetToRead()
			{
				return toRead;
			}
			
			public void SetToRead(int toRead)
			{
				this.toRead = toRead;
			}
		}
		
		private byte[] Uncompress(byte[] input)
		{
			return SupportClass.CompressionSupport.Uncompress(input);
		}
		
		// Instances of this class hold field properties and data
		// for merge
		[Serializable]
		public sealed class FieldForMerge : AbstractField
		{
			public override System.String StringValue()
			{
				return (System.String) this.fieldsData;
			}
			
			public override System.IO.TextReader ReaderValue()
			{
				// Not needed for merge
				return null;
			}
			
			public override byte[] BinaryValue()
			{
				return (byte[]) this.fieldsData;
			}
			
			public FieldForMerge(System.Object value_Renamed, FieldInfo fi, bool binary, bool compressed, bool tokenize)
			{
				this.isStored = true;
				this.fieldsData = value_Renamed;
				this.isCompressed = compressed;
				this.isBinary = binary;
				this.isTokenized = tokenize;
				
				this.name = String.Intern(fi.name);
				this.isIndexed = fi.isIndexed;
				this.omitNorms = fi.omitNorms;
				this.storeOffsetWithTermVector = fi.storeOffsetWithTermVector;
				this.storePositionWithTermVector = fi.storePositionWithTermVector;
				this.storeTermVector = fi.storeTermVector;
			}
		}
	}
}