/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 */

using System;
using System.Collections.Generic;
using System.IO;
using System.Collections;
using System.Linq;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis.De
{
    /// <summary>
    /// Analyzer for German language. Supports an external list of stopwords (words that
    /// will not be indexed at all) and an external list of exclusions (words that will
    /// not be stemmed, but indexed).
    /// A default set of stopwords is used unless an alternative list is specified; the
    /// exclusion list is empty by default.
    /// </summary>
    public class GermanAnalyzer : Analyzer
    {
        /// <summary>
        /// List of typical German stopwords.
        /// </summary>
        //TODO: make this private in 3.1
        private static readonly String[] GERMAN_STOP_WORDS =
        {
            "einer", "eine", "eines", "einem", "einen",
            "der", "die", "das", "dass", "daß",
            "du", "er", "sie", "es",
            "was", "wer", "wie", "wir",
            "und", "oder", "ohne", "mit",
            "am", "im", "in", "aus", "auf",
            "ist", "sein", "war", "wird",
            "ihr", "ihre", "ihres",
            "als", "für", "von",
            "dich", "dir", "mich", "mir",
            "mein", "kein",
            "durch", "wegen"
        };

        /// <summary>
        /// Returns the set of default German stopwords.
        /// </summary>
        /// <returns>The shared, unmodifiable default stop word set.</returns>
        public static ISet<string> GetDefaultStopSet()
        {
            return DefaultSetHolder.DEFAULT_SET;
        }

        /// <summary>
        /// Holds the shared default stop word set built from <see cref="GERMAN_STOP_WORDS"/>.
        /// </summary>
        private static class DefaultSetHolder
        {
            internal static readonly ISet<string> DEFAULT_SET = CharArraySet.UnmodifiableSet(new CharArraySet(
                (IEnumerable<string>)GERMAN_STOP_WORDS,
                false));
        }

        /// <summary>
        /// Contains the stopwords used with the StopFilter.
        /// </summary>
        //TODO: make this readonly in 3.1
        private ISet<string> stopSet;

        /// <summary>
        /// Contains words that should be indexed but not stemmed.
        /// </summary>
        //TODO: make this readonly in 3.1
        private ISet<string> exclusionSet;

        // Compatibility version handed to the tokenizer and used to pick the StopFilter default.
        private readonly Version matchVersion;

        // When true, the stem filter is asked to apply DIN-5007-2 style normalization as well.
        private readonly bool _normalizeDin2;

        /// <summary>
        /// Builds an analyzer with the default stop words:
        /// <see cref="GetDefaultStopSet"/>
        /// </summary>
        [Obsolete("Use GermanAnalyzer(Version) instead")]
        public GermanAnalyzer()
            : this(Version.LUCENE_CURRENT)
        {
        }

        /// <summary>
        /// Builds an analyzer with the default stop words:
        /// <see cref="GetDefaultStopSet"/>
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        public GermanAnalyzer(Version matchVersion)
            : this(matchVersion, DefaultSetHolder.DEFAULT_SET)
        {
        }

        /// <summary>
        /// Builds an analyzer with the default stop words:
        /// <see cref="GetDefaultStopSet"/>
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="normalizeDin2">Specifies if the DIN-5007-2 style stemmer should be used in addition to DIN1. This
        /// will cause words with 'ae', 'ue', or 'oe' in them (expanded umlauts) to be first converted to 'a', 'u', and 'o'
        /// respectively, before the DIN1 stemmer is invoked.</param>
        public GermanAnalyzer(Version matchVersion, bool normalizeDin2)
            : this(matchVersion, DefaultSetHolder.DEFAULT_SET, normalizeDin2)
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words, using the default DIN-5007-1 stemmer
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="stopwords">a stopword set</param>
        public GermanAnalyzer(Version matchVersion, ISet<string> stopwords)
            : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="stopwords">a stopword set</param>
        /// <param name="normalizeDin2">Specifies if the DIN-5007-2 style stemmer should be used in addition to DIN1. This
        /// will cause words with 'ae', 'ue', or 'oe' in them (expanded umlauts) to be first converted to 'a', 'u', and 'o'
        /// respectively, before the DIN1 stemmer is invoked.</param>
        public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, bool normalizeDin2)
            : this(matchVersion, stopwords, CharArraySet.EMPTY_SET, normalizeDin2)
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words, using the default DIN-5007-1 stemmer
        /// </summary>
        /// <param name="matchVersion">lucene compatibility version</param>
        /// <param name="stopwords">a stopword set</param>
        /// <param name="stemExclusionSet">a stemming exclusion set</param>
        public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionSet)
            : this(matchVersion, stopwords, stemExclusionSet, false)
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words. All other constructors chain to this one.
        /// </summary>
        /// <param name="matchVersion">lucene compatibility version</param>
        /// <param name="stopwords">a stopword set</param>
        /// <param name="stemExclusionSet">a stemming exclusion set</param>
        /// <param name="normalizeDin2">Specifies if the DIN-5007-2 style stemmer should be used in addition to DIN1. This
        /// will cause words with 'ae', 'ue', or 'oe' in them (expanded umlauts) to be first converted to 'a', 'u', and 'o'
        /// respectively, before the DIN1 stemmer is invoked.</param>
        public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionSet, bool normalizeDin2)
        {
            // Both sets are copied and wrapped so later changes to the caller's sets do not leak in.
            stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
            exclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclusionSet));
            this.matchVersion = matchVersion;
            _normalizeDin2 = normalizeDin2;
            // NOTE(review): type argument presumed to be this class — confirm against Analyzer's declaration.
            SetOverridesTokenStreamMethod<GermanAnalyzer>();
        }

        /// <summary>
        /// Builds an analyzer with the given stop words.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="stopwords">the stop words, converted to a set via StopFilter.MakeStopSet</param>
        [Obsolete("use GermanAnalyzer(Version, Set) instead")]
        public GermanAnalyzer(Version matchVersion, params string[] stopwords)
            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words. Only the dictionary keys are used.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="stopwords">dictionary whose keys are the stop words</param>
        // NOTE(review): the dictionary value type is presumed to be string — confirm against callers.
        [Obsolete("Use GermanAnalyzer(Version, ISet)")]
        public GermanAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
            : this(matchVersion, stopwords.Keys.ToArray())
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words, loaded through WordlistLoader.GetWordSet.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="stopwords">file containing the stop words</param>
        [Obsolete("Use GermanAnalyzer(Version, ISet)")]
        public GermanAnalyzer(Version matchVersion, FileInfo stopwords)
            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
        {
        }

        /// <summary>
        /// Builds an exclusionlist from an array of Strings.
        /// </summary>
        /// <param name="exclusionlist">words that should be indexed but not stemmed</param>
        [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
        public void SetStemExclusionTable(String[] exclusionlist)
        {
            exclusionSet = StopFilter.MakeStopSet(exclusionlist);
            // Reset the saved stream so it is not reused with the old exclusion set.
            PreviousTokenStream = null;
        }

        /// <summary>
        /// Builds an exclusionlist from a IDictionary. Only the dictionary keys are used.
        /// </summary>
        /// <param name="exclusionlist">dictionary whose keys should be indexed but not stemmed</param>
        // NOTE(review): the dictionary value type is presumed to be string — confirm against callers.
        [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
        public void SetStemExclusionTable(IDictionary<string, string> exclusionlist)
        {
            exclusionSet = Support.Compatibility.SetFactory.CreateHashSet(exclusionlist.Keys);
            // Reset the saved stream so it is not reused with the old exclusion set.
            PreviousTokenStream = null;
        }

        /// <summary>
        /// Builds an exclusionlist from the words contained in the given file.
        /// </summary>
        /// <param name="exclusionlist">file containing the words that should be indexed but not stemmed</param>
        [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
        public void SetStemExclusionTable(FileInfo exclusionlist)
        {
            exclusionSet = WordlistLoader.GetWordSet(exclusionlist);
            // Reset the saved stream so it is not reused with the old exclusion set.
            PreviousTokenStream = null;
        }

        /// <summary>
        /// Creates a TokenStream which tokenizes all the text in the provided TextReader.
        /// </summary>
        /// <param name="fieldName">name of the field being analyzed (not used by this implementation)</param>
        /// <param name="reader">source of the text to tokenize</param>
        /// <returns>A TokenStream built from a StandardTokenizer filtered with StandardFilter, LowerCaseFilter,
        /// StopFilter, GermanStemFilter</returns>
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream result = new StandardTokenizer(matchVersion, reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(
                StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                result,
                stopSet);
            result = new GermanStemFilter(result, exclusionSet, _normalizeDin2);
            return result;
        }
    }
}