/*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using LowerCaseFilter = Lucene.Net.Analysis.LowerCaseFilter;
using StopFilter = Lucene.Net.Analysis.StopFilter;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using StandardFilter = Lucene.Net.Analysis.Standard.StandardFilter;
using StandardTokenizer = Lucene.Net.Analysis.Standard.StandardTokenizer;
namespace Lucene.Net.Analysis.DE
{
/// <summary>
/// Analyzer for German language. Supports an external list of stopwords (words that
/// will not be indexed at all) and an external list of exclusions (words that will
/// not be stemmed, but indexed).
/// A default set of stopwords is used unless an alternative list is specified; the
/// exclusion list is empty by default.
/// </summary>
/// <author>Gerhard Schwarz</author>
/// <version>$Id: GermanAnalyzer.java,v 1.16 2004/05/30 20:24:20 otis Exp $</version>
public class GermanAnalyzer : Analyzer
{
    /// <summary>
    /// List of typical German stopwords, used by the parameterless constructor.
    /// Shared by all instances (static) so the array is allocated only once.
    /// </summary>
    /// <remarks>
    /// Every word appears exactly once. Repeated entries add nothing to a set and
    /// would be rejected as duplicate keys by <c>Hashtable.Add</c>.
    /// NOTE(review): confirm how <c>StopFilter.MakeStopSet</c> inserts its entries.
    /// </remarks>
    private static readonly System.String[] GERMAN_STOP_WORDS = new System.String[]
    {
        "einer", "eine", "eines", "einem", "einen", "der", "die",
        "das", "dass", "daß", "du", "er", "sie", "es", "was", "wer",
        "wie", "wir", "und", "oder", "ohne", "mit", "am", "im", "in",
        "aus", "auf", "ist", "sein", "war", "wird", "ihr", "ihre",
        "ihres", "als", "für", "von", "dich", "dir", "mich",
        "mir", "mein", "kein", "durch", "wegen"
    };

    /// <summary>
    /// Contains the stopwords used with the StopFilter. Assigned exactly once,
    /// by whichever constructor is used.
    /// </summary>
    private readonly System.Collections.Hashtable stopSet;

    /// <summary>
    /// Contains words that should be indexed but not stemmed. Empty until one of
    /// the <c>SetStemExclusionTable</c> overloads replaces it.
    /// </summary>
    private System.Collections.Hashtable exclusionSet = new System.Collections.Hashtable();

    /// <summary>
    /// Builds an analyzer with the default German stop words.
    /// </summary>
    public GermanAnalyzer()
    {
        stopSet = StopFilter.MakeStopSet(GERMAN_STOP_WORDS);
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <param name="stopwords">Array of words to use as the stop word set.</param>
    public GermanAnalyzer(System.String[] stopwords)
    {
        stopSet = StopFilter.MakeStopSet(stopwords);
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <param name="stopwords">
    /// Table of stop words. Its entries are copied, so later changes to the
    /// caller's table do not affect this analyzer.
    /// </param>
    /// <exception cref="System.ArgumentNullException">
    /// Thrown by the <c>Hashtable</c> copy constructor when <paramref name="stopwords"/> is null.
    /// </exception>
    public GermanAnalyzer(System.Collections.Hashtable stopwords)
    {
        // A single defensive copy is enough; copying the copy yields the same table.
        stopSet = new System.Collections.Hashtable(stopwords);
    }

    /// <summary>
    /// Builds an analyzer with the stop words loaded from the given file.
    /// </summary>
    /// <param name="stopwords">File that is handed to <c>WordlistLoader.GetWordSet</c>.</param>
    public GermanAnalyzer(System.IO.FileInfo stopwords)
    {
        stopSet = WordlistLoader.GetWordSet(stopwords);
    }

    /// <summary>
    /// Builds an exclusion list from an array of strings. The new list replaces the
    /// previous one and is used by subsequent calls to <c>TokenStream</c>.
    /// </summary>
    /// <param name="exclusionlist">Words that should be indexed but not stemmed.</param>
    public virtual void SetStemExclusionTable(System.String[] exclusionlist)
    {
        exclusionSet = StopFilter.MakeStopSet(exclusionlist);
    }

    /// <summary>
    /// Builds an exclusion list from a Hashtable. The new list replaces the
    /// previous one and is used by subsequent calls to <c>TokenStream</c>.
    /// </summary>
    /// <param name="exclusionlist">
    /// Table of words that should be indexed but not stemmed. Its entries are
    /// copied, so later changes to the caller's table do not affect this analyzer.
    /// </param>
    /// <exception cref="System.ArgumentNullException">
    /// Thrown by the <c>Hashtable</c> copy constructor when <paramref name="exclusionlist"/> is null.
    /// </exception>
    public virtual void SetStemExclusionTable(System.Collections.Hashtable exclusionlist)
    {
        // A single defensive copy is enough; copying the copy yields the same table.
        exclusionSet = new System.Collections.Hashtable(exclusionlist);
    }

    /// <summary>
    /// Builds an exclusion list from the words contained in the given file. The new
    /// list replaces the previous one and is used by subsequent calls to <c>TokenStream</c>.
    /// </summary>
    /// <param name="exclusionlist">File that is handed to <c>WordlistLoader.GetWordSet</c>.</param>
    public virtual void SetStemExclusionTable(System.IO.FileInfo exclusionlist)
    {
        exclusionSet = WordlistLoader.GetWordSet(exclusionlist);
    }

    /// <summary>
    /// Creates a TokenStream which tokenizes all the text in the provided reader.
    /// </summary>
    /// <param name="fieldName">Not used by this analyzer; the same filter chain is built for every field.</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>
    /// A TokenStream built from a StandardTokenizer filtered with
    /// StandardFilter, LowerCaseFilter, StopFilter and GermanStemFilter, in that order.
    /// </returns>
    public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
    {
        // Each filter wraps the previous stage, so the order below is the processing order.
        TokenStream result = new StandardTokenizer(reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        result = new StopFilter(result, stopSet);
        result = new GermanStemFilter(result, exclusionSet);
        return result;
    }
}
}