/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.IO;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using IndexWriter = Lucene.Net.Index.IndexWriter;
using StringHelper = Lucene.Net.Util.StringHelper;

namespace Lucene.Net.Documents
{
    /// <summary>
    /// A field is a section of a Document. Each field has two parts, a name and a
    /// value. Values may be free text, provided as a String or as a Reader, or they
    /// may be atomic keywords, which are not further processed. Such keywords may
    /// be used to represent dates, urls, etc. Fields are optionally stored in the
    /// index, so that they may be returned with hits on the document.
    /// </summary>
    [Serializable]
    public sealed class Field : AbstractField, IFieldable
    {
        /// <summary>Specifies whether and how a field should be stored.</summary>
        public enum Store
        {
            /// <summary>
            /// Store the original field value in the index. This is useful for short texts
            /// like a document's title which should be displayed with the results. The
            /// value is stored in its original form, i.e. no analyzer is used before it is
            /// stored.
            /// </summary>
            YES,

            /// <summary>Do not store the field value in the index.</summary>
            NO
        }

        /// <summary>Specifies whether and how a field should be indexed.</summary>
        public enum Index
        {
            /// <summary>
            /// Do not index the field value. This field can thus not be searched,
            /// but one can still access its contents provided it is stored.
            /// </summary>
            NO,

            /// <summary>
            /// Index the tokens produced by running the field's value through an
            /// Analyzer. This is useful for common text.
            /// </summary>
            ANALYZED,

            /// <summary>
            /// Index the field's value without using an Analyzer, so it can be searched.
            /// As no analyzer is used the value will be stored as a single term. This is
            /// useful for unique Ids like product numbers.
            /// </summary>
            NOT_ANALYZED,

            /// <summary>
            /// Expert: Index the field's value without an Analyzer, and also disable
            /// the storing of norms. Note that norms can also be enabled/disabled
            /// separately through the field's omit-norms setting. No norms means that
            /// index-time field and document boosting and field length normalization
            /// are disabled. The benefit is less memory usage as norms take up one
            /// byte of RAM per indexed field for every document in the index, during
            /// searching. Note that once you index a given field <i>with</i> norms
            /// enabled, disabling norms will have no effect. In other words, for this
            /// to have the above described effect on a field, all instances of that
            /// field must be indexed with NOT_ANALYZED_NO_NORMS from the beginning.
            /// </summary>
            NOT_ANALYZED_NO_NORMS,

            /// <summary>
            /// Expert: Index the tokens produced by running the field's value through
            /// an Analyzer, and also separately disable the storing of norms. See
            /// <see cref="NOT_ANALYZED_NO_NORMS"/> for what norms are and why you may
            /// want to disable them.
            /// </summary>
            ANALYZED_NO_NORMS,
        }

        /// <summary>Specifies whether and how a field should have term vectors.</summary>
        public enum TermVector
        {
            /// <summary>Do not store term vectors.</summary>
            NO,

            /// <summary>
            /// Store the term vectors of each document. A term vector is a list
            /// of the document's terms and their number of occurrences in that document.
            /// </summary>
            YES,

            /// <summary>Store the term vector + token position information.</summary>
            /// <seealso cref="YES"/>
            WITH_POSITIONS,

            /// <summary>Store the term vector + token offset information.</summary>
            /// <seealso cref="YES"/>
            WITH_OFFSETS,

            /// <summary>Store the term vector + token position and offset information.</summary>
            /// <seealso cref="YES"/>
            /// <seealso cref="WITH_POSITIONS"/>
            /// <seealso cref="WITH_OFFSETS"/>
            WITH_POSITIONS_OFFSETS,
        }

        /// <summary>
        /// The value of the field as a String, or null. If null, the Reader value or
        /// binary value is used. Exactly one of the string value, the reader value
        /// and the binary value must be set.
        /// </summary>
        public override string StringValue
        {
            get { return fieldsData as System.String; }
        }

        /// <summary>
        /// The value of the field as a Reader, or null. If null, the String value or
        /// binary value is used. Exactly one of the string value, the reader value
        /// and the binary value must be set.
        /// </summary>
        public override TextReader ReaderValue
        {
            get { return fieldsData as System.IO.TextReader; }
        }

        /// <summary>
        /// The TokenStream for this field to be used when indexing, or null. If null,
        /// the Reader value or String value is analyzed to produce the indexed tokens.
        /// </summary>
        public override TokenStream TokenStreamValue
        {
            get { return tokenStream; }
        }

        /// <summary>
        /// <p>Expert: change the value of this field. This can be used during indexing
        /// to re-use a single Field instance to improve indexing speed by avoiding GC
        /// cost of new'ing and reclaiming Field instances. Typically a single
        /// <see cref="Document"/> instance is re-used as well. This helps most on
        /// small documents.</p>
        ///
        /// <p>Each Field instance should only be used once within a single
        /// <see cref="Document"/> instance. See ImproveIndexingSpeed for details.</p>
        /// </summary>
        /// <param name="value">The new string value.</param>
        /// <exception cref="System.ArgumentException">if this is a binary field</exception>
        public void SetValue(System.String value)
        {
            if (internalIsBinary)
            {
                throw new System.ArgumentException("cannot set a String value on a binary field");
            }

            fieldsData = value;
        }

        /// <summary>Expert: change the value of this field. See <see cref="SetValue(System.String)"/>.</summary>
        /// <param name="value">The new reader value.</param>
        /// <exception cref="System.ArgumentException">if this is a binary field or a stored field</exception>
        public void SetValue(System.IO.TextReader value)
        {
            if (internalIsBinary)
            {
                throw new System.ArgumentException("cannot set a Reader value on a binary field");
            }

            if (internalIsStored)
            {
                throw new System.ArgumentException("cannot set a Reader value on a stored field");
            }

            fieldsData = value;
        }

        /// <summary>
        /// Expert: change the value of this field. See <see cref="SetValue(System.String)"/>.
        /// The whole array is used: the binary offset is reset to 0 and the binary
        /// length is set to the array length.
        /// </summary>
        /// <param name="value">The new binary value; must not be null.</param>
        /// <exception cref="System.ArgumentException">if this is not a binary field</exception>
        /// <exception cref="System.ArgumentNullException">if <paramref name="value"/> is null</exception>
        public void SetValue(byte[] value)
        {
            if (!internalIsBinary)
            {
                throw new System.ArgumentException("cannot set a byte[] value on a non-binary field");
            }

            // Validate before touching any state: dereferencing a null array after
            // fieldsData was already overwritten would leave the field half-updated.
            if (value == null)
            {
                throw new System.ArgumentNullException("value", "value cannot be null");
            }

            fieldsData = value;
            internalBinaryLength = value.Length;
            internalbinaryOffset = 0;
        }

        /// <summary>
        /// Expert: change the value of this field. See <see cref="SetValue(System.String)"/>.
        /// The offset and length are recorded as given; no bounds validation is
        /// performed here.
        /// </summary>
        /// <param name="value">The array holding the new binary value.</param>
        /// <param name="offset">Starting offset in <paramref name="value"/> of this field's bytes.</param>
        /// <param name="length">Number of bytes to use, starting at <paramref name="offset"/>.</param>
        /// <exception cref="System.ArgumentException">if this is not a binary field</exception>
        public void SetValue(byte[] value, int offset, int length)
        {
            if (!internalIsBinary)
            {
                throw new System.ArgumentException("cannot set a byte[] value on a non-binary field");
            }

            fieldsData = value;
            internalBinaryLength = length;
            internalbinaryOffset = offset;
        }

        /// <summary>
        /// Expert: sets the token stream to be used for indexing and marks this field
        /// as indexed and tokenized. May be combined with a stored string or binary
        /// value.
        /// </summary>
        /// <param name="tokenStream">The token stream to use when indexing.</param>
        public void SetTokenStream(TokenStream tokenStream)
        {
            this.internalIsIndexed = true;
            this.internalIsTokenized = true;
            this.tokenStream = tokenStream;
        }

        /// <summary>
        /// Create a field by specifying its name, value and how it will
        /// be saved in the index. Term vectors will not be stored in the index.
        /// </summary>
        /// <param name="name">The name of the field</param>
        /// <param name="value">The string to process</param>
        /// <param name="store">Whether <paramref name="value"/> should be stored in the index</param>
        /// <param name="index">Whether the field should be indexed, and if so, if it should
        /// be tokenized before indexing</param>
        /// <exception cref="System.NullReferenceException">if name or value is null</exception>
        /// <exception cref="System.ArgumentException">if the field is neither stored nor indexed</exception>
        public Field(System.String name, System.String value, Store store, Index index)
            : this(name, value, store, index, TermVector.NO)
        {
        }

        /// <summary>
        /// Create a field by specifying its name, value and how it will
        /// be saved in the index. The field name is interned.
        /// </summary>
        /// <param name="name">The name of the field</param>
        /// <param name="value">The string to process</param>
        /// <param name="store">Whether <paramref name="value"/> should be stored in the index</param>
        /// <param name="index">Whether the field should be indexed, and if so, if it should
        /// be tokenized before indexing</param>
        /// <param name="termVector">Whether term vector should be stored</param>
        /// <exception cref="System.NullReferenceException">if name or value is null</exception>
        /// <exception cref="System.ArgumentException">in any of the following situations:
        /// <list type="bullet">
        /// <item>name and value are both empty</item>
        /// <item>the field is neither stored nor indexed</item>
        /// <item>the field is not indexed but termVector is not <c>TermVector.NO</c></item>
        /// </list>
        /// </exception>
        public Field(System.String name, System.String value, Store store, Index index, TermVector termVector)
            : this(name, true, value, store, index, termVector)
        {
        }

        /// <summary>
        /// Create a field by specifying its name, value and how it will
        /// be saved in the index.
        /// </summary>
        /// <param name="name">The name of the field</param>
        /// <param name="internName">Whether to intern <paramref name="name"/> or not</param>
        /// <param name="value">The string to process</param>
        /// <param name="store">Whether <paramref name="value"/> should be stored in the index</param>
        /// <param name="index">Whether the field should be indexed, and if so, if it should
        /// be tokenized before indexing</param>
        /// <param name="termVector">Whether term vector should be stored</param>
        /// <exception cref="System.NullReferenceException">if name or value is null</exception>
        /// <exception cref="System.ArgumentException">in any of the following situations:
        /// <list type="bullet">
        /// <item>name and value are both empty</item>
        /// <item>the field is neither stored nor indexed</item>
        /// <item>the field is not indexed but termVector is not <c>TermVector.NO</c></item>
        /// </list>
        /// </exception>
        public Field(System.String name, bool internName, System.String value, Store store, Index index, TermVector termVector)
        {
            // NOTE: the exception types below are part of the observable contract of
            // this constructor and are intentionally left as they were.
            if (name == null)
            {
                throw new System.NullReferenceException("name cannot be null");
            }

            if (value == null)
            {
                throw new System.NullReferenceException("value cannot be null");
            }

            if (name.Length == 0 && value.Length == 0)
            {
                throw new System.ArgumentException("name and value cannot both be empty");
            }

            if (index == Index.NO && store == Store.NO)
            {
                throw new System.ArgumentException("it doesn't make sense to have a field that " + "is neither indexed nor stored");
            }

            if (index == Index.NO && termVector != TermVector.NO)
            {
                throw new System.ArgumentException("cannot store term vector information " + "for a field that is not indexed");
            }

            if (internName)
            {
                // field names are optionally interned
                name = StringHelper.Intern(name);
            }

            this.internalName = name;
            this.fieldsData = value;

            // Translate the enum arguments into the individual flags.
            this.internalIsStored = store.IsStored();
            this.internalIsIndexed = index.IsIndexed();
            this.internalIsTokenized = index.IsAnalyzed();
            this.internalOmitNorms = index.OmitNorms();

            if (index == Index.NO)
            {
                this.internalOmitTermFreqAndPositions = false;
            }

            this.internalIsBinary = false;

            SetStoreTermVector(termVector);
        }

        /// <summary>
        /// Create a tokenized and indexed field that is not stored. Term vectors will
        /// not be stored. The Reader is read only when the Document is added to the
        /// index, i.e. you may not close the Reader until the document has been added
        /// (see <see cref="IndexWriter"/>).
        /// </summary>
        /// <param name="name">The name of the field</param>
        /// <param name="reader">The reader with the content</param>
        /// <exception cref="System.NullReferenceException">if name or reader is null</exception>
        public Field(System.String name, System.IO.TextReader reader)
            : this(name, reader, TermVector.NO)
        {
        }

        /// <summary>
        /// Create a tokenized and indexed field that is not stored, optionally with
        /// storing term vectors. The Reader is read only when the Document is added to
        /// the index, i.e. you may not close the Reader until the document has been
        /// added (see <see cref="IndexWriter"/>).
        /// </summary>
        /// <param name="name">The name of the field</param>
        /// <param name="reader">The reader with the content</param>
        /// <param name="termVector">Whether term vector should be stored</param>
        /// <exception cref="System.NullReferenceException">if name or reader is null</exception>
        public Field(System.String name, System.IO.TextReader reader, TermVector termVector)
        {
            if (name == null)
            {
                throw new System.NullReferenceException("name cannot be null");
            }

            if (reader == null)
            {
                throw new System.NullReferenceException("reader cannot be null");
            }

            this.internalName = StringHelper.Intern(name); // field names are interned
            this.fieldsData = reader;

            this.internalIsStored = false;
            this.internalIsIndexed = true;
            this.internalIsTokenized = true;
            this.internalIsBinary = false;

            SetStoreTermVector(termVector);
        }

        /// <summary>
        /// Create a tokenized and indexed field that is not stored. Term vectors will
        /// not be stored. This is useful for pre-analyzed fields.
        /// The TokenStream is read only when the Document is added to the index,
        /// i.e. you may not close the TokenStream until the document has been added
        /// (see <see cref="IndexWriter"/>).
        /// </summary>
        /// <param name="name">The name of the field</param>
        /// <param name="tokenStream">The TokenStream with the content</param>
        /// <exception cref="System.NullReferenceException">if name or tokenStream is null</exception>
        public Field(System.String name, TokenStream tokenStream)
            : this(name, tokenStream, TermVector.NO)
        {
        }

        /// <summary>
        /// Create a tokenized and indexed field that is not stored, optionally with
        /// storing term vectors. This is useful for pre-analyzed fields.
        /// The TokenStream is read only when the Document is added to the index,
        /// i.e. you may not close the TokenStream until the document has been added
        /// (see <see cref="IndexWriter"/>).
        /// </summary>
        /// <param name="name">The name of the field</param>
        /// <param name="tokenStream">The TokenStream with the content</param>
        /// <param name="termVector">Whether term vector should be stored</param>
        /// <exception cref="System.NullReferenceException">if name or tokenStream is null</exception>
        public Field(System.String name, TokenStream tokenStream, TermVector termVector)
        {
            if (name == null)
            {
                throw new System.NullReferenceException("name cannot be null");
            }

            if (tokenStream == null)
            {
                throw new System.NullReferenceException("tokenStream cannot be null");
            }

            this.internalName = StringHelper.Intern(name); // field names are interned
            this.fieldsData = null;
            this.tokenStream = tokenStream;

            this.internalIsStored = false;
            this.internalIsIndexed = true;
            this.internalIsTokenized = true;
            this.internalIsBinary = false;

            SetStoreTermVector(termVector);
        }

        /// <summary>
        /// Create a stored field with binary value. The whole array is used.
        /// </summary>
        /// <param name="name">The name of the field</param>
        /// <param name="value_Renamed">The binary value</param>
        /// <param name="store">How the value should be stored; must not be <c>Store.NO</c></param>
        /// <exception cref="System.ArgumentException">if name or value is null, or if store is <c>Store.NO</c></exception>
        public Field(System.String name, byte[] value_Renamed, Store store)
            // The length is computed null-safely so that a null array reaches the
            // chained constructor's "value cannot be null" check instead of failing
            // right here with an implicit NullReferenceException.
            : this(name, value_Renamed, 0, value_Renamed == null ? 0 : value_Renamed.Length, store)
        {
        }

        /// <summary>
        /// Create a stored field with binary value taken from a slice of an array.
        /// The resulting field is not indexed, not tokenized, and stores no term vector.
        /// </summary>
        /// <param name="name">The name of the field</param>
        /// <param name="value_Renamed">The binary value</param>
        /// <param name="offset">Starting offset in value where this Field's bytes are</param>
        /// <param name="length">Number of bytes to use for this Field, starting at offset</param>
        /// <param name="store">How the value should be stored; must not be <c>Store.NO</c></param>
        /// <exception cref="System.ArgumentException">if name or value is null, or if store is <c>Store.NO</c></exception>
        public Field(System.String name, byte[] value_Renamed, int offset, int length, Store store)
        {
            if (name == null)
            {
                throw new System.ArgumentException("name cannot be null");
            }

            if (value_Renamed == null)
            {
                throw new System.ArgumentException("value cannot be null");
            }

            if (store == Store.NO)
            {
                throw new System.ArgumentException("binary values can't be unstored");
            }

            this.internalName = StringHelper.Intern(name); // field names are interned
            fieldsData = value_Renamed;

            internalIsStored = store.IsStored();
            internalIsIndexed = false;
            internalIsTokenized = false;
            internalOmitTermFreqAndPositions = false;
            internalOmitNorms = true;

            internalIsBinary = true;
            internalBinaryLength = length;
            internalbinaryOffset = offset;

            SetStoreTermVector(TermVector.NO);
        }
    }

    /// <summary>
    /// Helpers that translate the <see cref="Field.Store"/>, <see cref="Field.Index"/>
    /// and <see cref="Field.TermVector"/> enums to and from individual boolean flags.
    /// Every enum-to-flag method throws <see cref="ArgumentOutOfRangeException"/>
    /// for a value that is not a declared member of the enum.
    /// </summary>
    public static class FieldExtensions
    {
        /// <summary>Returns true when the store option keeps the value in the index.</summary>
        public static bool IsStored(this Field.Store store)
        {
            switch (store)
            {
                case Field.Store.YES:
                    return true;
                case Field.Store.NO:
                    return false;
                default:
                    throw new ArgumentOutOfRangeException("store", "Invalid value for Field.Store");
            }
        }

        /// <summary>Returns true for every index option except <c>Field.Index.NO</c>.</summary>
        public static bool IsIndexed(this Field.Index index)
        {
            switch (index)
            {
                case Field.Index.NO:
                    return false;
                case Field.Index.ANALYZED:
                case Field.Index.NOT_ANALYZED:
                case Field.Index.NOT_ANALYZED_NO_NORMS:
                case Field.Index.ANALYZED_NO_NORMS:
                    return true;
                default:
                    throw new ArgumentOutOfRangeException("index", "Invalid value for Field.Index");
            }
        }

        /// <summary>Returns true for the index options that run the value through an Analyzer.</summary>
        public static bool IsAnalyzed(this Field.Index index)
        {
            switch (index)
            {
                case Field.Index.NO:
                case Field.Index.NOT_ANALYZED:
                case Field.Index.NOT_ANALYZED_NO_NORMS:
                    return false;
                case Field.Index.ANALYZED:
                case Field.Index.ANALYZED_NO_NORMS:
                    return true;
                default:
                    throw new ArgumentOutOfRangeException("index", "Invalid value for Field.Index");
            }
        }

        /// <summary>
        /// Returns true for the index options that do not store norms; this includes
        /// <c>Field.Index.NO</c>.
        /// </summary>
        public static bool OmitNorms(this Field.Index index)
        {
            switch (index)
            {
                case Field.Index.ANALYZED:
                case Field.Index.NOT_ANALYZED:
                    return false;
                case Field.Index.NO:
                case Field.Index.NOT_ANALYZED_NO_NORMS:
                case Field.Index.ANALYZED_NO_NORMS:
                    return true;
                default:
                    throw new ArgumentOutOfRangeException("index", "Invalid value for Field.Index");
            }
        }

        /// <summary>Returns true for every term vector option except <c>Field.TermVector.NO</c>.</summary>
        public static bool IsStored(this Field.TermVector tv)
        {
            switch (tv)
            {
                case Field.TermVector.NO:
                    return false;
                case Field.TermVector.YES:
                case Field.TermVector.WITH_OFFSETS:
                case Field.TermVector.WITH_POSITIONS:
                case Field.TermVector.WITH_POSITIONS_OFFSETS:
                    return true;
                default:
                    throw new ArgumentOutOfRangeException("tv", "Invalid value for Field.TermVector");
            }
        }

        /// <summary>Returns true for the term vector options that include position information.</summary>
        public static bool WithPositions(this Field.TermVector tv)
        {
            switch (tv)
            {
                case Field.TermVector.NO:
                case Field.TermVector.YES:
                case Field.TermVector.WITH_OFFSETS:
                    return false;
                case Field.TermVector.WITH_POSITIONS:
                case Field.TermVector.WITH_POSITIONS_OFFSETS:
                    return true;
                default:
                    throw new ArgumentOutOfRangeException("tv", "Invalid value for Field.TermVector");
            }
        }

        /// <summary>Returns true for the term vector options that include offset information.</summary>
        public static bool WithOffsets(this Field.TermVector tv)
        {
            switch (tv)
            {
                case Field.TermVector.NO:
                case Field.TermVector.YES:
                case Field.TermVector.WITH_POSITIONS:
                    return false;
                case Field.TermVector.WITH_OFFSETS:
                case Field.TermVector.WITH_POSITIONS_OFFSETS:
                    return true;
                default:
                    throw new ArgumentOutOfRangeException("tv", "Invalid value for Field.TermVector");
            }
        }

        /// <summary>
        /// Get the <see cref="Field.Index"/> option for the given flags, with norms kept.
        /// </summary>
        /// <param name="indexed">Whether the field is indexed.</param>
        /// <param name="analyed">Whether the field is analyzed (parameter name kept as-is for
        /// compatibility with named-argument callers).</param>
        public static Field.Index ToIndex(bool indexed, bool analyed)
        {
            return ToIndex(indexed, analyed, false);
        }

        /// <summary>
        /// Get the <see cref="Field.Index"/> option for the given flags.
        /// </summary>
        /// <param name="indexed">Whether the field is indexed; when false the other flags are ignored.</param>
        /// <param name="analyzed">Whether the field is analyzed.</param>
        /// <param name="omitNorms">Whether norms are omitted.</param>
        public static Field.Index ToIndex(bool indexed, bool analyzed, bool omitNorms)
        {
            // If it is not indexed nothing else matters
            if (!indexed)
            {
                return Field.Index.NO;
            }

            // typical, non-expert
            if (!omitNorms)
            {
                if (analyzed)
                {
                    return Field.Index.ANALYZED;
                }

                return Field.Index.NOT_ANALYZED;
            }

            // Expert: Norms omitted
            if (analyzed)
            {
                return Field.Index.ANALYZED_NO_NORMS;
            }

            return Field.Index.NOT_ANALYZED_NO_NORMS;
        }

        /// <summary>
        /// Get the best representation of a TermVector given the flags.
        /// </summary>
        /// <param name="stored">Whether term vectors are stored; when false the other flags are ignored.</param>
        /// <param name="withOffsets">Whether token offsets are stored.</param>
        /// <param name="withPositions">Whether token positions are stored.</param>
        public static Field.TermVector ToTermVector(bool stored, bool withOffsets, bool withPositions)
        {
            // If it is not stored, nothing else matters.
            if (!stored)
            {
                return Field.TermVector.NO;
            }

            if (withOffsets)
            {
                if (withPositions)
                {
                    return Field.TermVector.WITH_POSITIONS_OFFSETS;
                }

                return Field.TermVector.WITH_OFFSETS;
            }

            if (withPositions)
            {
                return Field.TermVector.WITH_POSITIONS;
            }

            return Field.TermVector.YES;
        }
    }
}