/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ using System; using IndexReader = Lucene.Net.Index.IndexReader; using TermDocs = Lucene.Net.Index.TermDocs; using OpenBitSet = Lucene.Net.Util.OpenBitSet; namespace Lucene.Net.Search { /// A that only accepts documents whose single /// term value in the specified field is contained in the /// provided set of allowed terms. /// ///

/// /// This is the same functionality as TermsFilter (from /// contrib/queries), except this filter requires that the /// field contains only a single term for all documents. /// Because of drastically different implementations, they /// also have different performance characteristics, as /// described below. /// ///

/// /// The first invocation of this filter on a given field will /// be slower, since a must be /// created. Subsequent invocations using the same field /// will re-use this cache. However, as with all /// functionality based on , persistent RAM /// is consumed to hold the cache, and is not freed until the /// is closed. In contrast, TermsFilter /// has no persistent RAM consumption. /// /// ///

/// /// With each search, this filter translates the specified /// set of Terms into a private keyed by /// term number per unique (normally one /// reader per segment). Then, during matching, the term /// number for each docID is retrieved from the cache and /// then checked for inclusion using the . /// Since all testing is done using RAM resident data /// structures, performance should be very fast, most likely /// fast enough to not require further caching of the /// DocIdSet for each possible combination of terms. /// However, because docIDs are simply scanned linearly, an /// index with a great many small documents may find this /// linear scan too costly. /// ///

/// /// In contrast, TermsFilter builds up an , /// keyed by docID, every time it's created, by enumerating /// through all matching docs using to seek /// and scan through each term's docID list. While there is /// no linear scan of all docIDs, besides the allocation of /// the underlying array in the , this /// approach requires a number of "disk seeks" in proportion /// to the number of terms, which can be exceptionally costly /// when there are cache misses in the OS's IO cache. /// ///

/// /// Generally, this filter will be slower on the first /// invocation for a given field, but subsequent invocations, /// even if you change the allowed set of Terms, should be /// faster than TermsFilter, especially as the number of /// Terms being matched increases. If you are matching only /// a very small number of terms, and those terms in turn /// match a very small number of documents, TermsFilter may /// perform faster. /// ///

/// /// Which filter is best is very application dependent. ///

[Serializable] public class FieldCacheTermsFilter:Filter { private readonly string field; private readonly string[] terms; public FieldCacheTermsFilter(string field, params string[] terms) { this.field = field; this.terms = terms; } public virtual FieldCache FieldCache { get { return FieldCache_Fields.DEFAULT; } } public override DocIdSet GetDocIdSet(IndexReader reader) { return new FieldCacheTermsFilterDocIdSet(this, FieldCache.GetStringIndex(reader, field)); } protected internal class FieldCacheTermsFilterDocIdSet:DocIdSet { private void InitBlock(FieldCacheTermsFilter enclosingInstance) { this.enclosingInstance = enclosingInstance; } private FieldCacheTermsFilter enclosingInstance; public FieldCacheTermsFilter Enclosing_Instance { get { return enclosingInstance; } } private readonly Lucene.Net.Search.StringIndex fcsi; private readonly OpenBitSet openBitSet; public FieldCacheTermsFilterDocIdSet(FieldCacheTermsFilter enclosingInstance, StringIndex fcsi) { InitBlock(enclosingInstance); this.fcsi = fcsi; openBitSet = new OpenBitSet(this.fcsi.lookup.Length); foreach (string t in Enclosing_Instance.terms) { int termNumber = this.fcsi.BinarySearchLookup(t); if (termNumber > 0) { openBitSet.FastSet(termNumber); } } } public override DocIdSetIterator Iterator() { return new FieldCacheTermsFilterDocIdSetIterator(this); } /// This DocIdSet implementation is cacheable. public override bool IsCacheable { get { return true; } } protected internal class FieldCacheTermsFilterDocIdSetIterator:DocIdSetIterator { public FieldCacheTermsFilterDocIdSetIterator(FieldCacheTermsFilterDocIdSet enclosingInstance) { InitBlock(enclosingInstance); } private void InitBlock(FieldCacheTermsFilterDocIdSet enclosingInstance) { this.enclosingInstance = enclosingInstance; } private FieldCacheTermsFilterDocIdSet enclosingInstance; public FieldCacheTermsFilterDocIdSet Enclosing_Instance { get { return enclosingInstance; } } private int doc = - 1; public override int DocID() { return doc; } public override int NextDoc() { try { while (!Enclosing_Instance.openBitSet.FastGet(Enclosing_Instance.fcsi.order[++doc])) { } } catch (IndexOutOfRangeException) { doc = NO_MORE_DOCS; } return doc; } public override int Advance(int target) { try { doc = target; while (!Enclosing_Instance.openBitSet.FastGet(Enclosing_Instance.fcsi.order[doc])) { doc++; } } catch (IndexOutOfRangeException) { doc = NO_MORE_DOCS; } return doc; } } } } }