package org.apache.lucene.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.Set;
import java.util.List;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.queryParser.QueryParser; // for javadoc
import org.apache.lucene.util.Version;
/**
* Removes stop words from a token stream.
*/
public final class StopFilter extends TokenFilter {
private final CharArraySet stopWords;
private boolean enablePositionIncrements = false;
private TermAttribute termAtt;
private PositionIncrementAttribute posIncrAtt;
/**
* Construct a token stream filtering the given input.
* If stopWords is an instance of {@link CharArraySet} (true if
* makeStopSet() was used to construct the set) it will be directly used
* and ignoreCase will be ignored since CharArraySet
* directly controls case sensitivity.
*
stopWords is not an instance of {@link CharArraySet},
* a new CharArraySet will be constructed and ignoreCase will be
* used to specify the case sensitivity of that set.
*
* @param enablePositionIncrements true if token positions should record the removed stop words
* @param input Input TokenStream
* @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
* @param ignoreCase if true, all words are lower cased first
*/
public StopFilter(boolean enablePositionIncrements, TokenStream input, Set> stopWords, boolean ignoreCase)
{
super(input);
if (stopWords instanceof CharArraySet) {
this.stopWords = (CharArraySet)stopWords;
} else {
this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
this.stopWords.addAll(stopWords);
}
this.enablePositionIncrements = enablePositionIncrements;
termAtt = addAttribute(TermAttribute.class);
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
}
/**
* Constructs a filter which removes words from the input
* TokenStream that are named in the Set.
*
* @param enablePositionIncrements true if token positions should record the removed stop words
* @param in Input stream
* @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
* @see #makeStopSet(java.lang.String[])
*/
public StopFilter(boolean enablePositionIncrements, TokenStream in, Set> stopWords) {
this(enablePositionIncrements, in, stopWords, false);
}
/**
* Builds a Set from an array of stop words,
* appropriate for passing into the StopFilter constructor.
* This permits this stopWords construction to be cached once when
* an Analyzer is constructed.
*
* @see #makeStopSet(java.lang.String[], boolean) passing false to ignoreCase
*/
public static final Set