/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
using System;
using System.Collections.Generic;
using System.IO;
using System.Collections;
using System.Linq;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis;
using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Analysis.De
{
    /// <summary>
    /// Analyzer for German language. Supports an external list of stopwords (words that
    /// will not be indexed at all) and an external list of exclusions (words that will
    /// not be stemmed, but indexed).
    /// A default set of stopwords is used unless an alternative list is specified; the
    /// exclusion list is empty by default.
    /// </summary>
    public class GermanAnalyzer : Analyzer
    {
        /// <summary>
        /// List of typical German stopwords.
        /// </summary>
        private static readonly string[] GERMAN_STOP_WORDS =
        {
            "einer", "eine", "eines", "einem", "einen",
            "der", "die", "das", "dass", "daß",
            "du", "er", "sie", "es",
            "was", "wer", "wie", "wir",
            "und", "oder", "ohne", "mit",
            "am", "im", "in", "aus", "auf",
            "ist", "sein", "war", "wird",
            "ihr", "ihre", "ihres",
            "als", "für", "von",
            "dich", "dir", "mich", "mir",
            "mein", "kein",
            "durch", "wegen"
        };

        /// <summary>
        /// Returns a set of default German stopwords.
        /// </summary>
        /// <returns>The shared, unmodifiable default stop set.</returns>
        public static ISet GetDefaultStopSet()
        {
            return DefaultSetHolder.DEFAULT_SET;
        }

        /// <summary>
        /// Holds the shared default stop set, built once from <c>GERMAN_STOP_WORDS</c>
        /// and wrapped so that it cannot be modified.
        /// </summary>
        private static class DefaultSetHolder
        {
            // NOTE(review): the second constructor argument (false) is presumably the
            // ignore-case flag of CharArraySet; confirm against its declaration.
            internal static readonly ISet DEFAULT_SET = CharArraySet.UnmodifiableSet(new CharArraySet(
                (IEnumerable)GERMAN_STOP_WORDS,
                false));
        }

        /// <summary>
        /// Contains the stopwords used with the StopFilter.
        /// Assigned once in the constructor and never replaced afterwards.
        /// </summary>
        private readonly ISet stopSet;

        /// <summary>
        /// Contains words that should be indexed but not stemmed.
        /// </summary>
        // Cannot be readonly while the obsolete SetStemExclusionTable overloads
        // below still reassign it.
        private ISet exclusionSet;

        /// <summary>
        /// Lucene compatibility version passed to the tokenizer and used to pick the
        /// StopFilter position-increment default.
        /// </summary>
        private readonly Version matchVersion;

        /// <summary>
        /// Whether the DIN-5007-2 normalization is requested from the stem filter.
        /// </summary>
        private readonly bool _normalizeDin2;

        /// <summary>
        /// Builds an analyzer with the default stop words, using
        /// <c>Version.LUCENE_CURRENT</c> as the compatibility version.
        /// </summary>
        [Obsolete("Use GermanAnalyzer(Version) instead")]
        public GermanAnalyzer()
            : this(Version.LUCENE_CURRENT)
        {
        }

        /// <summary>
        /// Builds an analyzer with the default stop words.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        public GermanAnalyzer(Version matchVersion)
            : this(matchVersion, DefaultSetHolder.DEFAULT_SET)
        {
        }

        /// <summary>
        /// Builds an analyzer with the default stop words.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="normalizeDin2">Specifies if the DIN-5007-2 style stemmer should be used in addition to DIN1. This
        /// will cause words with 'ae', 'ue', or 'oe' in them (expanded umlauts) to be first converted to 'a', 'u', and 'o'
        /// respectively, before the DIN1 stemmer is invoked.</param>
        public GermanAnalyzer(Version matchVersion, bool normalizeDin2)
            : this(matchVersion, DefaultSetHolder.DEFAULT_SET, normalizeDin2)
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words, using the default DIN-5007-1 stemmer.
        /// The stem exclusion set is empty.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="stopwords">a stopword set</param>
        public GermanAnalyzer(Version matchVersion, ISet stopwords)
            : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words. The stem exclusion set is empty.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="stopwords">a stopword set</param>
        /// <param name="normalizeDin2">Specifies if the DIN-5007-2 style stemmer should be used in addition to DIN1. This
        /// will cause words with 'ae', 'ue', or 'oe' in them (expanded umlauts) to be first converted to 'a', 'u', and 'o'
        /// respectively, before the DIN1 stemmer is invoked.</param>
        public GermanAnalyzer(Version matchVersion, ISet stopwords, bool normalizeDin2)
            : this(matchVersion, stopwords, CharArraySet.EMPTY_SET, normalizeDin2)
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words, using the default DIN-5007-1 stemmer.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="stopwords">a stopword set</param>
        /// <param name="stemExclusionSet">a stemming exclusion set</param>
        public GermanAnalyzer(Version matchVersion, ISet stopwords, ISet stemExclusionSet)
            : this(matchVersion, stopwords, stemExclusionSet, false)
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words. All other constructors
        /// eventually delegate to this one.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="stopwords">a stopword set</param>
        /// <param name="stemExclusionSet">a stemming exclusion set</param>
        /// <param name="normalizeDin2">Specifies if the DIN-5007-2 style stemmer should be used in addition to DIN1. This
        /// will cause words with 'ae', 'ue', or 'oe' in them (expanded umlauts) to be first converted to 'a', 'u', and 'o'
        /// respectively, before the DIN1 stemmer is invoked.</param>
        public GermanAnalyzer(Version matchVersion, ISet stopwords, ISet stemExclusionSet, bool normalizeDin2)
        {
            // Both sets are copied and wrapped as unmodifiable, so the analyzer keeps
            // its own snapshot rather than the caller's instances.
            stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
            exclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclusionSet));
            this.matchVersion = matchVersion;
            _normalizeDin2 = normalizeDin2;
            SetOverridesTokenStreamMethod();
        }

        /// <summary>
        /// Builds an analyzer with the given stop words.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="stopwords">the stop words, converted to a set via StopFilter.MakeStopSet</param>
        [Obsolete("use GermanAnalyzer(Version, Set) instead")]
        public GermanAnalyzer(Version matchVersion, params string[] stopwords)
            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="stopwords">a dictionary whose keys are used as the stop words</param>
        [Obsolete("Use GermanAnalyzer(Version, ISet)")]
        public GermanAnalyzer(Version matchVersion, IDictionary stopwords)
            : this(matchVersion, stopwords.Keys.ToArray())
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="stopwords">a file from which the stop words are loaded via WordlistLoader.GetWordSet</param>
        [Obsolete("Use GermanAnalyzer(Version, ISet)")]
        public GermanAnalyzer(Version matchVersion, FileInfo stopwords)
            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
        {
        }

        /// <summary>
        /// Builds an exclusion list from an array of strings.
        /// </summary>
        /// <param name="exclusionlist">words that should be indexed but not stemmed</param>
        [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
        public void SetStemExclusionTable(string[] exclusionlist)
        {
            exclusionSet = StopFilter.MakeStopSet(exclusionlist);
            // Presumably drops a cached stream that was built with the previous
            // exclusion set; PreviousTokenStream is declared on the base class.
            PreviousTokenStream = null;
        }

        /// <summary>
        /// Builds an exclusion list from an IDictionary; its keys become the excluded words.
        /// </summary>
        /// <param name="exclusionlist">dictionary whose keys should be indexed but not stemmed</param>
        [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
        public void SetStemExclusionTable(IDictionary exclusionlist)
        {
            exclusionSet = Support.Compatibility.SetFactory.CreateHashSet(exclusionlist.Keys);
            // Presumably drops a cached stream that was built with the previous
            // exclusion set; PreviousTokenStream is declared on the base class.
            PreviousTokenStream = null;
        }

        /// <summary>
        /// Builds an exclusion list from the words contained in the given file.
        /// </summary>
        /// <param name="exclusionlist">file from which the excluded words are loaded</param>
        [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
        public void SetStemExclusionTable(FileInfo exclusionlist)
        {
            exclusionSet = WordlistLoader.GetWordSet(exclusionlist);
            // Presumably drops a cached stream that was built with the previous
            // exclusion set; PreviousTokenStream is declared on the base class.
            PreviousTokenStream = null;
        }

        /// <summary>
        /// Creates a TokenStream which tokenizes all the text in the provided TextReader.
        /// </summary>
        /// <param name="fieldName">name of the field being analyzed (not used by this implementation)</param>
        /// <param name="reader">source of the text to tokenize</param>
        /// <returns>
        /// A TokenStream built from a StandardTokenizer filtered with StandardFilter,
        /// LowerCaseFilter, StopFilter and GermanStemFilter, in that order.
        /// </returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream result = new StandardTokenizer(matchVersion, reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);

            // The position-increment setting follows the default for the configured version.
            bool enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion);
            result = new StopFilter(enablePositionIncrements, result, stopSet);

            result = new GermanStemFilter(result, exclusionSet, _normalizeDin2);
            return result;
        }
    }
}