/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using System.IO;
using Version = Lucene.Net.Util.Version;
/**
* Analyzer for the Brazilian Portuguese language. Supports an external list of
* stopwords (words that will not be indexed at all) and an external list of
* exclusions (words that will not be stemmed, but are still indexed).
*
*/
namespace Lucene.Net.Analysis.BR
{
/// <summary>
/// Analyzer for Brazilian Portuguese text. Supports an external list of stopwords
/// (words that will not be indexed at all) and an external list of exclusions
/// (words that will not be stemmed, but are still indexed).
/// </summary>
public sealed class BrazilianAnalyzer : Analyzer
{
    /// <summary>
    /// List of typical Brazilian stopwords.
    /// </summary>
    // NOTE(review): "nas" and "pelas" each appear twice and "propios" looks like a
    // misspelling of "proprios"; possibly "pela" and "proprios" were intended. The
    // entries are left untouched because editing them changes which tokens get
    // dropped at index and query time - confirm before changing.
    //TODO: Make this private in 3.1
    public static string[] BRAZILIAN_STOP_WORDS = {
        "a", "ainda", "alem", "ambas", "ambos", "antes",
        "ao", "aonde", "aos", "apos", "aquele", "aqueles",
        "as", "assim", "com", "como", "contra", "contudo",
        "cuja", "cujas", "cujo", "cujos", "da", "das", "de",
        "dela", "dele", "deles", "demais", "depois", "desde",
        "desta", "deste", "dispoe", "dispoem", "diversa",
        "diversas", "diversos", "do", "dos", "durante", "e",
        "ela", "elas", "ele", "eles", "em", "entao", "entre",
        "essa", "essas", "esse", "esses", "esta", "estas",
        "este", "estes", "ha", "isso", "isto", "logo", "mais",
        "mas", "mediante", "menos", "mesma", "mesmas", "mesmo",
        "mesmos", "na", "nas", "nao", "nas", "nem", "nesse", "neste",
        "nos", "o", "os", "ou", "outra", "outras", "outro", "outros",
        "pelas", "pelas", "pelo", "pelos", "perante", "pois", "por",
        "porque", "portanto", "proprio", "propios", "quais", "qual",
        "qualquer", "quando", "quanto", "que", "quem", "quer", "se",
        "seja", "sem", "sendo", "seu", "seus", "sob", "sobre", "sua",
        "suas", "tal", "tambem", "teu", "teus", "toda", "todas",
        "todo",
        "todos", "tua", "tuas", "tudo", "um", "uma", "umas", "uns"
    };

    /// <summary>
    /// Returns an unmodifiable instance of the default stop-words set.
    /// </summary>
    /// <returns>The shared set held by <c>DefaultSetHolder</c>.</returns>
    public static ISet GetDefaultStopSet()
    {
        return DefaultSetHolder.DEFAULT_STOP_SET;
    }

    /// <summary>
    /// Holds the default stop set so that it is created by this nested type's
    /// static initializer rather than by the analyzer's own.
    /// </summary>
    private static class DefaultSetHolder
    {
        // Built once from BRAZILIAN_STOP_WORDS and wrapped as unmodifiable; the
        // reference itself is never reassigned, hence readonly.
        // NOTE(review): the 'false' argument is presumably CharArraySet's
        // ignore-case flag - confirm against CharArraySet.
        internal static readonly ISet DEFAULT_STOP_SET =
            CharArraySet.UnmodifiableSet(new CharArraySet(BRAZILIAN_STOP_WORDS, false));
    }

    /// <summary>
    /// Contains the stopwords used with the StopFilter.
    /// </summary>
    // Assigned exactly once, by the (Version, ISet) constructor that every other
    // constructor chains to, so no field initializer is needed.
    private readonly ISet stoptable;

    /// <summary>
    /// Compatibility version handed to the tokenizer and to the StopFilter
    /// position-increment default lookup.
    /// </summary>
    private readonly Version matchVersion;

    /// <summary>
    /// Contains words that should be indexed but not stemmed.
    /// </summary>
    // Not readonly: the SetStemExclusionTable overloads replace it.
    private ISet excltable = new HashSet();

    /// <summary>
    /// Builds an analyzer with the default stop words.
    /// </summary>
    /// <param name="matchVersion">lucene compatibility version</param>
    public BrazilianAnalyzer(Version matchVersion)
        : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <param name="matchVersion">lucene compatibility version</param>
    /// <param name="stopwords">a stopword set; it is copied and the copy is wrapped as unmodifiable</param>
    public BrazilianAnalyzer(Version matchVersion, ISet stopwords)
    {
        stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
        this.matchVersion = matchVersion;
    }

    /// <summary>
    /// Builds an analyzer with the given stop words and stemming exclusion words.
    /// </summary>
    /// <param name="matchVersion">lucene compatibility version</param>
    /// <param name="stopwords">a stopword set</param>
    /// <param name="stemExclusionSet">a set of words that are indexed but not stemmed; it is copied and the copy is wrapped as unmodifiable</param>
    public BrazilianAnalyzer(Version matchVersion, ISet stopwords,
                             ISet stemExclusionSet)
        : this(matchVersion, stopwords)
    {
        excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclusionSet));
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <remarks>Deprecated: use the <c>BrazilianAnalyzer(Version, ISet)</c> constructor instead.</remarks>
    /// <param name="matchVersion">lucene compatibility version</param>
    /// <param name="stopwords">the stop words, turned into a set via <c>StopFilter.MakeStopSet</c></param>
    public BrazilianAnalyzer(Version matchVersion, params string[] stopwords)
        : this(matchVersion, StopFilter.MakeStopSet(stopwords))
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <remarks>Deprecated: use the <c>BrazilianAnalyzer(Version, ISet)</c> constructor instead.</remarks>
    /// <param name="matchVersion">lucene compatibility version</param>
    /// <param name="stopwords">a dictionary whose keys are used as the stop words</param>
    public BrazilianAnalyzer(Version matchVersion, IDictionary stopwords)
        : this(matchVersion, stopwords.Keys.ToArray())
    {
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <remarks>Deprecated: use the <c>BrazilianAnalyzer(Version, ISet)</c> constructor instead.</remarks>
    /// <param name="matchVersion">lucene compatibility version</param>
    /// <param name="stopwords">a file that is loaded through <c>WordlistLoader.GetWordSet</c></param>
    public BrazilianAnalyzer(Version matchVersion, FileInfo stopwords)
        : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
    {
    }

    /// <summary>
    /// Builds an exclusion list from an array of strings.
    /// </summary>
    /// <remarks>Deprecated: use the <c>BrazilianAnalyzer(Version, ISet, ISet)</c> constructor instead.</remarks>
    /// <param name="exclusionlist">the words that must not be stemmed</param>
    public void SetStemExclusionTable(params string[] exclusionlist)
    {
        excltable = StopFilter.MakeStopSet(exclusionlist);
        PreviousTokenStream = null; // force a new stemmer to be created
    }

    /// <summary>
    /// Builds an exclusion list from the keys of a dictionary.
    /// </summary>
    /// <remarks>Deprecated: use the <c>BrazilianAnalyzer(Version, ISet, ISet)</c> constructor instead.</remarks>
    /// <param name="exclusionlist">a dictionary whose keys are the words that must not be stemmed</param>
    public void SetStemExclusionTable(IDictionary exclusionlist)
    {
        excltable = new HashSet(exclusionlist.Keys);
        PreviousTokenStream = null; // force a new stemmer to be created
    }

    /// <summary>
    /// Builds an exclusion list from the words contained in the given file.
    /// </summary>
    /// <remarks>Deprecated: use the <c>BrazilianAnalyzer(Version, ISet, ISet)</c> constructor instead.</remarks>
    /// <param name="exclusionlist">a file that is loaded through <c>WordlistLoader.GetWordSet</c></param>
    public void SetStemExclusionTable(FileInfo exclusionlist)
    {
        excltable = WordlistLoader.GetWordSet(exclusionlist);
        PreviousTokenStream = null; // force a new stemmer to be created
    }

    /// <summary>
    /// Creates a TokenStream which tokenizes all the text in the provided reader.
    /// </summary>
    /// <param name="fieldName">the field name; not used by this implementation</param>
    /// <param name="reader">the text source handed to the StandardTokenizer</param>
    /// <returns>
    /// A TokenStream built from a StandardTokenizer filtered with LowerCaseFilter,
    /// StandardFilter, StopFilter, and BrazilianStemFilter.
    /// </returns>
    public override TokenStream TokenStream(String fieldName, TextReader reader)
    {
        return BuildFilterChain(new StandardTokenizer(matchVersion, reader));
    }

    /// <summary>
    /// Pairs the tokenizer with the end of the filter chain built on top of it, so
    /// that a later call can reset the tokenizer and hand out the same chain again.
    /// </summary>
    private sealed class SavedStreams
    {
        internal Tokenizer source;
        internal TokenStream result;
    }

    /// <summary>
    /// Returns a (possibly reused) TokenStream which tokenizes all the text in the
    /// provided reader.
    /// </summary>
    /// <param name="fieldName">the field name; not used by this implementation</param>
    /// <param name="reader">the text source for the new or reset tokenizer</param>
    /// <returns>
    /// A TokenStream built from a StandardTokenizer filtered with LowerCaseFilter,
    /// StandardFilter, StopFilter, and BrazilianStemFilter.
    /// </returns>
    public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
    {
        SavedStreams streams = (SavedStreams) PreviousTokenStream;
        if (streams != null)
        {
            // A chain is already cached: only re-point its tokenizer at the new reader.
            streams.source.Reset(reader);
            return streams.result;
        }

        // Nothing cached (first use, or cleared by SetStemExclusionTable): build the
        // chain and remember it for the next call.
        Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
        streams = new SavedStreams();
        streams.source = tokenizer;
        streams.result = BuildFilterChain(tokenizer);
        PreviousTokenStream = streams;
        return streams.result;
    }

    /// <summary>
    /// Wraps the given tokenizer in the analyzer's filter chain. Shared by
    /// TokenStream and ReusableTokenStream so both always apply the same filters
    /// in the same order.
    /// </summary>
    /// <param name="source">the tokenizer producing the raw tokens</param>
    /// <returns>
    /// The outermost filter: LowerCaseFilter, then StandardFilter, then StopFilter
    /// (using the current stop set), then BrazilianStemFilter (using the current
    /// exclusion set).
    /// </returns>
    private TokenStream BuildFilterChain(Tokenizer source)
    {
        TokenStream result = new LowerCaseFilter(source);
        result = new StandardFilter(result);
        result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                result, stoptable);
        result = new BrazilianStemFilter(result, excltable);
        return result;
    }
}
}