/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
*/

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Lucene.Net.Index;
using Lucene.Net.Support;
using Lucene.Net.Util;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis.Query
{
    /*
     * An {@link Analyzer} used primarily at query time to wrap another analyzer and provide a layer of protection
     * which prevents very common words from being passed into queries.
     * <p/>
     * For very large indexes the cost of reading TermDocs for a very common word can be high. This analyzer was
     * created after experience with a 38 million doc index which had a term in around 50% of docs and was causing
     * TermQueries for this term to take 2 seconds.
     * <p/>
     * Use the various "AddStopWords" methods in this class to automate the identification and addition of
     * stop words found in an already existing index.
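     * <p/>
     * A minimal usage sketch (the wrapped StandardAnalyzer, the "reader" IndexReader variable, and the
     * field name "description" below are illustrative assumptions, not requirements of this class):
     * <code>
     * Analyzer wrapped = new Lucene.Net.Analysis.Standard.StandardAnalyzer(Version.LUCENE_30);
     * QueryAutoStopWordAnalyzer analyzer = new QueryAutoStopWordAnalyzer(Version.LUCENE_30, wrapped);
     * // scan every indexed field; terms in more than 40% of docs (the default) become stop words
     * analyzer.AddStopWords(reader);
     * // or tune a single field: terms of "description" in more than 30% of docs become stop words
     * analyzer.AddStopWords(reader, "description", 0.3f);
     * // hand "analyzer" to a QueryParser so those very common terms never reach a TermQuery
     * </code>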
     */
    public class QueryAutoStopWordAnalyzer : Analyzer
    {
        Analyzer _delegate;
        HashMap<String, ISet<String>> stopWordsPerField = new HashMap<String, ISet<String>>();

        //The default maximum percentage (40%) of index documents which
        //can contain a term, after which the term is considered to be a stop word.
        public const float defaultMaxDocFreqPercent = 0.4f;

        private readonly Version matchVersion;

        /*
         * Initializes this analyzer with the Analyzer object that actually produces the tokens
         *
         * @param _delegate The choice of {@link Analyzer} that is used to produce the token stream which needs filtering
         */
        public QueryAutoStopWordAnalyzer(Version matchVersion, Analyzer _delegate)
        {
            this._delegate = _delegate;
            SetOverridesTokenStreamMethod<QueryAutoStopWordAnalyzer>();
            this.matchVersion = matchVersion;
        }

        /*
         * Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
         *
         * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
         *               exceed the required document frequency
         * @return The number of stop words identified.
         * @throws IOException
         */
        public int AddStopWords(IndexReader reader)
        {
            return AddStopWords(reader, defaultMaxDocFreqPercent);
        }

        /*
         * Automatically adds stop words for all fields with terms exceeding the given maxDocFreq
         *
         * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
         *               exceed the required document frequency
         * @param maxDocFreq The maximum number of index documents which can contain a term, after which
         *                   the term is considered to be a stop word
         * @return The number of stop words identified.
         * @throws IOException
         */
        public int AddStopWords(IndexReader reader, int maxDocFreq)
        {
            int numStopWords = 0;
            ICollection<String> fieldNames = reader.GetFieldNames(IndexReader.FieldOption.INDEXED);
            foreach (String fieldName in fieldNames)
            {
                numStopWords += AddStopWords(reader, fieldName, maxDocFreq);
            }
            return numStopWords;
        }

        /*
         * Automatically adds stop words for all fields with terms exceeding the given maxPercentDocs
         *
         * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
         *               exceed the required document frequency
         * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
         *                       contain a term, after which the word is considered to be a stop word.
         * @return The number of stop words identified.
         * @throws IOException
         */
        public int AddStopWords(IndexReader reader, float maxPercentDocs)
        {
            int numStopWords = 0;
            ICollection<String> fieldNames = reader.GetFieldNames(IndexReader.FieldOption.INDEXED);
            foreach (String fieldName in fieldNames)
            {
                numStopWords += AddStopWords(reader, fieldName, maxPercentDocs);
            }
            return numStopWords;
        }

        /*
         * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
         *
         * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
         *               exceed the required document frequency
         * @param fieldName The field for which stopwords will be added
         * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
         *                       contain a term, after which the word is considered to be a stop word.
         * @return The number of stop words identified.
         * @throws IOException
         */
        public int AddStopWords(IndexReader reader, String fieldName, float maxPercentDocs)
        {
            return AddStopWords(reader, fieldName, (int) (reader.NumDocs() * maxPercentDocs));
        }

        /*
         * Automatically adds stop words for the given field with terms exceeding the given maxDocFreq
         *
         * @param reader The {@link IndexReader} which will be consulted to identify potential stop words that
         *               exceed the required document frequency
         * @param fieldName The field for which stopwords will be added
         * @param maxDocFreq The maximum number of index documents which
         *                   can contain a term, after which the term is considered to be a stop word.
         * @return The number of stop words identified.
         * @throws IOException
         */
        public int AddStopWords(IndexReader reader, String fieldName, int maxDocFreq)
        {
            var stopWords = Support.Compatibility.SetFactory.CreateHashSet<String>();
            String internedFieldName = StringHelper.Intern(fieldName);
            TermEnum te = reader.Terms(new Term(fieldName));
            Term term = te.Term;
            while (term != null)
            {
                // term fields are interned; once the field no longer matches we have
                // walked past the last term of the requested field
                if (term.Field != internedFieldName)
                {
                    break;
                }
                if (te.DocFreq() > maxDocFreq)
                {
                    stopWords.Add(term.Text);
                }
                if (!te.Next())
                {
                    break;
                }
                term = te.Term;
            }
            stopWordsPerField.Add(fieldName, stopWords);

            /* if the stopwords for a field are changed,
             * then saved streams for that field are erased.
             */
            var streamMap = (IDictionary<String, SavedStreams>) PreviousTokenStream;
            if (streamMap != null)
                streamMap.Remove(fieldName);

            return stopWords.Count;
        }

        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream result;
            try
            {
                result = _delegate.ReusableTokenStream(fieldName, reader);
            }
            catch (IOException)
            {
                // the delegate could not supply a reusable stream; fall back to a fresh one
                result = _delegate.TokenStream(fieldName, reader);
            }
            var stopWords = stopWordsPerField[fieldName];
            if (stopWords != null)
            {
                result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                        result, stopWords);
            }
            return result;
        }

        private class SavedStreams
        {
            /* the underlying stream */
            protected internal TokenStream Wrapped;

            /*
             * when there are no stopwords for the field, refers to Wrapped.
             * if there are stopwords, it is a StopFilter around Wrapped.
             */
            protected internal TokenStream WithStopFilter;
        }

        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
        {
            if (overridesTokenStreamMethod)
            {
                // LUCENE-1678: force fallback to tokenStream() if we
                // have been subclassed and that subclass overrides
                // tokenStream but not reusableTokenStream
                return TokenStream(fieldName, reader);
            }

            /* map of SavedStreams for each field */
            var streamMap = (IDictionary<String, SavedStreams>) PreviousTokenStream;
            if (streamMap == null)
            {
                streamMap = new HashMap<String, SavedStreams>();
                PreviousTokenStream = streamMap;
            }

            SavedStreams streams = streamMap[fieldName];
            if (streams == null)
            {
                /* an entry for this field does not exist, create one */
                streams = new SavedStreams();
                streamMap.Add(fieldName, streams);
                streams.Wrapped = _delegate.ReusableTokenStream(fieldName, reader);

                /* if there are any stopwords for the field, save the stopfilter */
                var stopWords = stopWordsPerField[fieldName];
                if (stopWords != null)
                    streams.WithStopFilter = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                                            streams.Wrapped, stopWords);
                else
                    streams.WithStopFilter = streams.Wrapped;
            }
            else
            {
                /*
                 * an entry for this field exists, verify the wrapped stream has not
                 * changed. if it has not, reuse it, otherwise wrap the new stream.
                 */
                TokenStream result = _delegate.ReusableTokenStream(fieldName, reader);
                if (result == streams.Wrapped)
                {
                    /* the wrapped analyzer reused the stream */
                    streams.WithStopFilter.Reset();
                }
                else
                {
                    /*
                     * the wrapped analyzer did not. if there are any stopwords for the
                     * field, create a new StopFilter around the new stream
                     */
                    streams.Wrapped = result;
                    var stopWords = stopWordsPerField[fieldName];
                    if (stopWords != null)
                        streams.WithStopFilter = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                                                streams.Wrapped, stopWords);
                    else
                        streams.WithStopFilter = streams.Wrapped;
                }
            }

            return streams.WithStopFilter;
        }

        /*
         * Provides information on which stop words have been identified for a field
         *
         * @param fieldName The field for which stop words identified in "AddStopWords"
         *                  method calls will be returned
         * @return the stop words identified for a field
         */
        public String[] GetStopWords(String fieldName)
        {
            var stopWords = stopWordsPerField[fieldName];
            return stopWords != null ? stopWords.ToArray() : new String[0];
        }

        /*
         * Provides information on which stop words have been identified for all fields
         *
         * @return the stop words (as terms)
         */
        public Term[] GetStopWords()
        {
            var allStopWords = new List<Term>();
            foreach (var fieldName in stopWordsPerField.Keys)
            {
                var stopWords = stopWordsPerField[fieldName];
                foreach (var text in stopWords)
                {
                    allStopWords.Add(new Term(fieldName, text));
                }
            }
            return allStopWords.ToArray();
        }
    }
}