/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ using System; using System.Collections.Generic; using System.Linq; using System.Text; namespace Lucene.Net.Analysis.Hunspell { /// /// HunspellStemmer uses the affix rules declared in the HunspellDictionary to generate one or /// more stems for a word. It conforms to the algorithm in the original hunspell algorithm, /// including recursive suffix stripping. /// /// Chris Male public class HunspellStemmer { private static Int32 RECURSION_CAP = 2; private readonly HunspellDictionary _dictionary; /// /// Constructs a new HunspellStemmer which will use the provided HunspellDictionary /// to create its stems. /// /// HunspellDictionary that will be used to create the stems. public HunspellStemmer(HunspellDictionary dictionary) { if (dictionary == null) throw new ArgumentNullException("dictionary"); _dictionary = dictionary; } /// /// Find the stem(s) of the provided word. /// /// Word to find the stems for. /// List of stems for the word. public IEnumerable Stem(String word) { if (word == null) throw new ArgumentNullException("word"); var stems = new List(); if (_dictionary.LookupWord(word) != null) stems.Add(new HunspellStem(word)); stems.AddRange(Stem(word, null, 0)); return stems; } /// /// Find the unique stem(s) of the provided word. /// /// Word to find the stems for. /// List of stems for the word. public IEnumerable UniqueStems(String word) { if (word == null) throw new ArgumentNullException("word"); var stems = new List(); var terms = new CharArraySet(8, false); if (_dictionary.LookupWord(word) != null) { stems.Add(new HunspellStem(word)); terms.Add(word); } var otherStems = Stem(word, null, 0); foreach (var s in otherStems) { if (!terms.Contains(s.Stem)) { stems.Add(s); terms.Add(s.Stem); } } return stems; } /// /// Generates a list of stems for the provided word. /// /// Word to generate the stems for. /// Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step. /// Level of recursion this stemming step is at. /// List of stems, pr an empty if no stems are found. private IEnumerable Stem(String word, Char[] flags, Int32 recursionDepth) { if (word == null) throw new ArgumentNullException("word"); var stems = new List(); var chars = word.ToCharArray(); var length = word.Length; for (var i = 0; i < length; i++) { var suffixes = _dictionary.LookupSuffix(chars, i, length - i); if (suffixes != null) { foreach (var suffix in suffixes) { if (HasCrossCheckedFlag(suffix.Flag, flags)) { var deAffixedLength = length - suffix.Append.Length; // TODO: can we do this in-place? var strippedWord = new StringBuilder() .Append(word, 0, deAffixedLength) .Append(suffix.Strip) .ToString(); var stemList = ApplyAffix(strippedWord, suffix, recursionDepth); foreach (var stem in stemList) { stem.AddSuffix(suffix); } stems.AddRange(stemList); } } } } for (var i = length - 1; i >= 0; i--) { var prefixes = _dictionary.LookupPrefix(chars, 0, i); if (prefixes != null) { foreach (var prefix in prefixes) { if (HasCrossCheckedFlag(prefix.Flag, flags)) { var deAffixedStart = prefix.Append.Length; var deAffixedLength = length - deAffixedStart; var strippedWord = new StringBuilder() .Append(prefix.Strip) .Append(word, deAffixedStart, deAffixedLength) .ToString(); var stemList = ApplyAffix(strippedWord, prefix, recursionDepth); foreach (var stem in stemList) { stem.AddPrefix(prefix); } stems.AddRange(stemList); } } } } return stems; } /// /// Applies the affix rule to the given word, producing a list of stems if any are found. /// /// Word the affix has been removed and the strip added. /// HunspellAffix representing the affix rule itself. /// Level of recursion this stemming step is at. /// List of stems for the word, or an empty list if none are found. public IEnumerable ApplyAffix(String strippedWord, HunspellAffix affix, Int32 recursionDepth) { if (strippedWord == null) throw new ArgumentNullException("strippedWord"); if (affix == null) throw new ArgumentNullException("affix"); if (!affix.CheckCondition(strippedWord)) { return new List(); } var words = _dictionary.LookupWord(strippedWord); if (words == null) { return new List(); } var stems = new List(); foreach (var hunspellWord in words) { if (hunspellWord.HasFlag(affix.Flag)) { if (affix.IsCrossProduct && recursionDepth < RECURSION_CAP) { var recursiveStems = Stem(strippedWord, affix.AppendFlags, ++recursionDepth); if (recursiveStems.Any()) { stems.AddRange(recursiveStems); } else { stems.Add(new HunspellStem(strippedWord)); } } else { stems.Add(new HunspellStem(strippedWord)); } } } return stems; } /// /// Checks if the given flag cross checks with the given array of flags. /// /// Flag to cross check with the array of flags. /// Array of flags to cross check against. Can be null. /// true if the flag is found in the array or the array is null, false otherwise. private static Boolean HasCrossCheckedFlag(Char flag, Char[] flags) { return flags == null || Array.BinarySearch(flags, flag) >= 0; } } }