/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Lucene.Net.Analysis.Hunspell {
///
/// HunspellStemmer uses the affix rules declared in the HunspellDictionary to generate one or
/// more stems for a word. It conforms to the algorithm in the original hunspell algorithm,
/// including recursive suffix stripping.
///
/// Chris Male
public class HunspellStemmer {
private static Int32 RECURSION_CAP = 2;
private readonly HunspellDictionary _dictionary;
///
/// Constructs a new HunspellStemmer which will use the provided HunspellDictionary
/// to create its stems.
///
/// HunspellDictionary that will be used to create the stems.
public HunspellStemmer(HunspellDictionary dictionary) {
if (dictionary == null) throw new ArgumentNullException("dictionary");
_dictionary = dictionary;
}
///
/// Find the stem(s) of the provided word.
///
/// Word to find the stems for.
/// List of stems for the word.
public IEnumerable Stem(String word) {
if (word == null) throw new ArgumentNullException("word");
var stems = new List();
if (_dictionary.LookupWord(word) != null)
stems.Add(new HunspellStem(word));
stems.AddRange(Stem(word, null, 0));
return stems;
}
///
/// Find the unique stem(s) of the provided word.
///
/// Word to find the stems for.
/// List of stems for the word.
public IEnumerable UniqueStems(String word) {
if (word == null) throw new ArgumentNullException("word");
var stems = new List();
var terms = new CharArraySet(8, false);
if (_dictionary.LookupWord(word) != null) {
stems.Add(new HunspellStem(word));
terms.Add(word);
}
var otherStems = Stem(word, null, 0);
foreach (var s in otherStems) {
if (!terms.Contains(s.Stem)) {
stems.Add(s);
terms.Add(s.Stem);
}
}
return stems;
}
///
/// Generates a list of stems for the provided word.
///
/// Word to generate the stems for.
/// Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step.
/// Level of recursion this stemming step is at.
/// List of stems, pr an empty if no stems are found.
private IEnumerable Stem(String word, Char[] flags, Int32 recursionDepth) {
if (word == null) throw new ArgumentNullException("word");
var stems = new List();
var chars = word.ToCharArray();
var length = word.Length;
for (var i = 0; i < length; i++) {
var suffixes = _dictionary.LookupSuffix(chars, i, length - i);
if (suffixes != null) {
foreach (var suffix in suffixes) {
if (HasCrossCheckedFlag(suffix.Flag, flags)) {
var deAffixedLength = length - suffix.Append.Length;
// TODO: can we do this in-place?
var strippedWord = new StringBuilder()
.Append(word, 0, deAffixedLength)
.Append(suffix.Strip)
.ToString();
var stemList = ApplyAffix(strippedWord, suffix, recursionDepth);
foreach (var stem in stemList) {
stem.AddSuffix(suffix);
}
stems.AddRange(stemList);
}
}
}
}
for (var i = length - 1; i >= 0; i--) {
var prefixes = _dictionary.LookupPrefix(chars, 0, i);
if (prefixes != null) {
foreach (var prefix in prefixes) {
if (HasCrossCheckedFlag(prefix.Flag, flags)) {
var deAffixedStart = prefix.Append.Length;
var deAffixedLength = length - deAffixedStart;
var strippedWord = new StringBuilder()
.Append(prefix.Strip)
.Append(word, deAffixedStart, deAffixedLength)
.ToString();
var stemList = ApplyAffix(strippedWord, prefix, recursionDepth);
foreach (var stem in stemList) {
stem.AddPrefix(prefix);
}
stems.AddRange(stemList);
}
}
}
}
return stems;
}
///
/// Applies the affix rule to the given word, producing a list of stems if any are found.
///
/// Word the affix has been removed and the strip added.
/// HunspellAffix representing the affix rule itself.
/// Level of recursion this stemming step is at.
/// List of stems for the word, or an empty list if none are found.
public IEnumerable ApplyAffix(String strippedWord, HunspellAffix affix, Int32 recursionDepth) {
if (strippedWord == null) throw new ArgumentNullException("strippedWord");
if (affix == null) throw new ArgumentNullException("affix");
if (!affix.CheckCondition(strippedWord)) {
return new List();
}
var words = _dictionary.LookupWord(strippedWord);
if (words == null) {
return new List();
}
var stems = new List();
foreach (var hunspellWord in words) {
if (hunspellWord.HasFlag(affix.Flag)) {
if (affix.IsCrossProduct && recursionDepth < RECURSION_CAP) {
var recursiveStems = Stem(strippedWord, affix.AppendFlags, ++recursionDepth);
if (recursiveStems.Any()) {
stems.AddRange(recursiveStems);
} else {
stems.Add(new HunspellStem(strippedWord));
}
} else {
stems.Add(new HunspellStem(strippedWord));
}
}
}
return stems;
}
///
/// Checks if the given flag cross checks with the given array of flags.
///
/// Flag to cross check with the array of flags.
/// Array of flags to cross check against. Can be null.
/// true if the flag is found in the array or the array is null, false otherwise.
private static Boolean HasCrossCheckedFlag(Char flag, Char[] flags) {
return flags == null || Array.BinarySearch(flags, flag) >= 0;
}
}
}