/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Collections.Generic;
using System.Linq;
using Lucene.Net.Analysis.Tokenattributes;
namespace Lucene.Net.Analysis.Hunspell {
/// <summary>
/// TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a
/// word having multiple stems, this filter can emit multiple tokens for each consumed token.
/// </summary>
public class HunspellStemFilter : TokenFilter {
    private readonly ITermAttribute _termAtt;
    private readonly IPositionIncrementAttribute _posIncAtt;
    private readonly HunspellStemmer _stemmer;

    // Stems of the current input token that have not been emitted yet.
    // NOTE(review): the element type is taken to be HunspellStem (the items expose Stem and
    // StemLength and come from HunspellStemmer.Stem/UniqueStems) - confirm against the stemmer.
    private readonly Queue<HunspellStem> _buffer = new Queue<HunspellStem>();

    // Attribute state captured after the first stem of a token was emitted; it is restored
    // before each additional stem of that same token is emitted.
    private State _savedState;

    private readonly Boolean _dedup;

    /// <summary>
    /// Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using
    /// affix rules in the provided HunspellDictionary.
    /// </summary>
    /// <param name="input">TokenStream whose tokens will be stemmed.</param>
    /// <param name="dictionary">HunspellDictionary containing the affix rules and words that will be used to stem the tokens.</param>
    /// <param name="dedup">true if only unique terms should be output.</param>
    public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, Boolean dedup = true)
        : base(input) {
        _posIncAtt = AddAttribute<IPositionIncrementAttribute>();
        _termAtt = AddAttribute<ITermAttribute>();
        _dedup = dedup;
        _stemmer = new HunspellStemmer(dictionary);
    }

    /// <summary>
    /// Advances to the next token. Stems still buffered from the previously consumed input token
    /// are emitted first, each with a position increment of 0; once the buffer is empty the next
    /// input token is consumed and stemmed.
    /// </summary>
    /// <returns>true if a token was produced; false if the input stream is exhausted.</returns>
    public override Boolean IncrementToken() {
        if (_buffer.Count > 0) {
            // Emit another stem of the token consumed earlier: start from the saved attribute
            // state, then overwrite the position increment and the term text.
            var nextStem = _buffer.Dequeue();
            RestoreState(_savedState);
            _posIncAtt.PositionIncrement = 0;
            _termAtt.SetTermBuffer(nextStem.Stem, 0, nextStem.StemLength);
            return true;
        }

        if (!input.IncrementToken()) {
            return false;
        }

        var term = _termAtt.Term;
        var newTerms = _dedup
            ? _stemmer.UniqueStems(term)
            : _stemmer.Stem(term);
        foreach (var newTerm in newTerms) {
            _buffer.Enqueue(newTerm);
        }

        if (_buffer.Count == 0) {
            // we do not know this word, return it unchanged
            return true;
        }

        var stem = _buffer.Dequeue();
        _termAtt.SetTermBuffer(stem.Stem, 0, stem.StemLength);

        // Only pay for a state capture when further stems will be emitted for this token.
        if (_buffer.Count > 0) {
            _savedState = CaptureState();
        }
        return true;
    }

    /// <summary>
    /// Resets the filter: resets the base stream state and discards any stems that are still
    /// buffered, together with the attribute state saved for them.
    /// </summary>
    public override void Reset() {
        base.Reset();
        _buffer.Clear();
        // The saved state is only meaningful while stems are buffered; drop the stale reference.
        _savedState = null;
    }
}
}