/*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
namespace Lucene.Net.Analysis.DE
{
/// <summary>
/// A stemmer for German words. The algorithm is based on the report
/// "A Fast and Simple Stemming Algorithm for German Words" by Jörg
/// Caumanns (joerg.caumanns@isst.fhg.de).
/// </summary>
/// <remarks>
/// Author: Gerhard Schwarz.
/// Ported from: $Id: GermanStemmer.java,v 1.11 2004/05/30 20:24:20 otis Exp $
/// An instance keeps its working buffer and substitution counter in fields
/// that every call to Stem overwrites.
/// </remarks>
public class GermanStemmer
{
    // Non-ASCII characters are written as \u escapes so that the code does not
    // depend on the encoding the source file is saved or compiled with.
    private const char UmlautA = '\u00E4'; // a with diaeresis
    private const char UmlautO = '\u00F6'; // o with diaeresis
    private const char UmlautU = '\u00FC'; // u with diaeresis
    private const char SharpS = '\u00DF';  // sharp s

    // Placeholder characters used while stemming. None of them is a letter, so
    // they can never collide with the characters of a stemmable term.
    private const char PairMark = '*';     // second char of a doubled character
    private const char SchToken = '$';     // "sch"
    private const char ChToken = '\u00A7'; // "ch" (section sign)
    private const char EiToken = '%';      // "ei"
    private const char IeToken = '&';      // "ie"
    private const char IgToken = '#';      // "ig"
    private const char StToken = '!';      // "st"

    /// <summary>Buffer for the term while it is being stemmed.</summary>
    private System.Text.StringBuilder sb = new System.Text.StringBuilder();

    /// <summary>Number of characters removed by Substitute() for the current term.</summary>
    private int substCount = 0;

    /// <summary>
    /// Stems the given term to a unique discriminator.
    /// </summary>
    /// <param name="term">The term that should be stemmed.</param>
    /// <returns>
    /// The discriminator for the term. A term containing any non-letter
    /// character is returned lower-cased but otherwise unchanged.
    /// </returns>
    protected internal virtual System.String Stem(System.String term)
    {
        // Lower-case with the invariant culture: culture-sensitive lower-casing
        // (e.g. Turkish 'I' to dotless 'i') would defeat the pattern matching below.
        term = term.ToLower(System.Globalization.CultureInfo.InvariantCulture);
        if (!IsStemmable(term))
        {
            return term;
        }

        // Reset the shared buffer and load the term.
        sb.Length = 0;
        sb.Append(term);

        // The order matters: Strip and Optimize work on the substituted form,
        // and the particle removal works on the resubstituted form.
        Substitute(sb);
        Strip(sb);
        Optimize(sb);
        Resubstitute(sb);
        RemoveParticleDenotion(sb);
        return sb.ToString();
    }

    /// <summary>
    /// Checks if a term could be stemmed.
    /// </summary>
    /// <param name="term">The term to check.</param>
    /// <returns>true if, and only if, the given term consists only of letters.</returns>
    private bool IsStemmable(System.String term)
    {
        for (int c = 0; c < term.Length; c++)
        {
            if (!System.Char.IsLetter(term[c]))
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Tells whether the buffer currently ends with the given suffix.
    /// </summary>
    private static bool EndsWith(System.Text.StringBuilder buffer, System.String suffix)
    {
        int offset = buffer.Length - suffix.Length;
        if (offset < 0)
        {
            return false;
        }
        for (int i = 0; i < suffix.Length; i++)
        {
            if (buffer[offset + i] != suffix[i])
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Suffix stripping (stemming) on the current term. The stripping is reduced
    /// to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd",
    /// from which all regular suffixes are built. The simplification causes
    /// some overstemming, and way more irregular stems, but still provides unique
    /// discriminators in most of those cases.
    /// The algorithm is context free, except for the length restrictions.
    /// </summary>
    private void Strip(System.Text.StringBuilder buffer)
    {
        bool doMore = true;
        while (doMore && buffer.Length > 3)
        {
            // substCount is added back so that the length restrictions refer to
            // the length the term had before Substitute() shortened it.
            int originalLength = buffer.Length + substCount;
            char last = buffer[buffer.Length - 1];

            if (originalLength > 5 && EndsWith(buffer, "nd"))
            {
                buffer.Remove(buffer.Length - 2, 2);
            }
            else if (originalLength > 4 && (EndsWith(buffer, "em") || EndsWith(buffer, "er")))
            {
                buffer.Remove(buffer.Length - 2, 2);
            }
            else if (last == 'e' || last == 's' || last == 'n' || last == 't')
            {
                // "t" occurs only as suffix of verbs.
                buffer.Remove(buffer.Length - 1, 1);
            }
            else
            {
                doMore = false;
            }
        }
    }

    /// <summary>
    /// Does some optimizations on the term. These optimizations are contextual.
    /// </summary>
    private void Optimize(System.Text.StringBuilder buffer)
    {
        // Additional step for female plurals of professions and inhabitants.
        if (buffer.Length > 5 && EndsWith(buffer, "erin*"))
        {
            buffer.Remove(buffer.Length - 1, 1);
            Strip(buffer);
        }
        // Additional step for irregular plural nouns like "Matrizen" to "Matrix".
        // The length check keeps an empty term from indexing before the buffer start.
        if (buffer.Length > 0 && buffer[buffer.Length - 1] == 'z')
        {
            buffer[buffer.Length - 1] = 'x';
        }
    }

    /// <summary>
    /// Removes a particle denotion ("ge") from a term: the first occurrence of
    /// "gege" is shortened to "ge". Terms of four characters or fewer are left alone.
    /// </summary>
    private void RemoveParticleDenotion(System.Text.StringBuilder buffer)
    {
        if (buffer.Length > 4)
        {
            int index = buffer.ToString().IndexOf("gege", System.StringComparison.Ordinal);
            if (index >= 0)
            {
                buffer.Remove(index, 2);
            }
        }
    }

    /// <summary>
    /// Does some substitutions for the term to reduce overstemming:
    /// umlauts are replaced by their corresponding vowel and a sharp s by "ss"
    /// (a sharp s in the final position is left untouched); the second character
    /// of a pair of equal characters is replaced by an asterisk; the common
    /// combinations sch, ch, ei, ie, ig and st are each replaced by a
    /// one-character token. Every character removed this way is counted in
    /// substCount.
    /// </summary>
    private void Substitute(System.Text.StringBuilder buffer)
    {
        substCount = 0;
        for (int c = 0; c < buffer.Length; c++)
        {
            // Replace the second char of a pair of equal characters with an asterisk.
            if (c > 0 && buffer[c] == buffer[c - 1])
            {
                buffer[c] = PairMark;
            }
            // Substitute umlauts.
            else if (buffer[c] == UmlautA)
            {
                buffer[c] = 'a';
            }
            else if (buffer[c] == UmlautO)
            {
                buffer[c] = 'o';
            }
            else if (buffer[c] == UmlautU)
            {
                buffer[c] = 'u';
            }

            // The remaining substitutions need at least one character after the current one.
            if (c < buffer.Length - 1)
            {
                if (buffer[c] == SharpS)
                {
                    buffer[c] = 's';
                    buffer.Insert(c + 1, 's');
                    substCount++;
                }
                // Mask several common character combinations with a token.
                else if ((c < buffer.Length - 2) && buffer[c] == 's' && buffer[c + 1] == 'c' && buffer[c + 2] == 'h')
                {
                    buffer[c] = SchToken;
                    buffer.Remove(c + 1, 2);
                    substCount += 2;
                }
                else if (buffer[c] == 'c' && buffer[c + 1] == 'h')
                {
                    buffer[c] = ChToken;
                    buffer.Remove(c + 1, 1);
                    substCount++;
                }
                else if (buffer[c] == 'e' && buffer[c + 1] == 'i')
                {
                    buffer[c] = EiToken;
                    buffer.Remove(c + 1, 1);
                    substCount++;
                }
                else if (buffer[c] == 'i' && buffer[c + 1] == 'e')
                {
                    buffer[c] = IeToken;
                    buffer.Remove(c + 1, 1);
                    substCount++;
                }
                else if (buffer[c] == 'i' && buffer[c + 1] == 'g')
                {
                    buffer[c] = IgToken;
                    buffer.Remove(c + 1, 1);
                    substCount++;
                }
                else if (buffer[c] == 's' && buffer[c + 1] == 't')
                {
                    buffer[c] = StToken;
                    buffer.Remove(c + 1, 1);
                    substCount++;
                }
            }
        }
    }

    /// <summary>
    /// Undoes the changes made by Substitute(), that is character pairs and
    /// character combinations. Umlauts remain as their corresponding vowel,
    /// and a substituted sharp s remains as "ss".
    /// </summary>
    private void Resubstitute(System.Text.StringBuilder buffer)
    {
        for (int c = 0; c < buffer.Length; c++)
        {
            switch (buffer[c])
            {
                case PairMark:
                    // Substitute() only ever marks positions after the first one.
                    buffer[c] = buffer[c - 1];
                    break;
                case SchToken:
                    buffer[c] = 's';
                    buffer.Insert(c + 1, "ch");
                    break;
                case ChToken:
                    buffer[c] = 'c';
                    buffer.Insert(c + 1, 'h');
                    break;
                case EiToken:
                    buffer[c] = 'e';
                    buffer.Insert(c + 1, 'i');
                    break;
                case IeToken:
                    buffer[c] = 'i';
                    buffer.Insert(c + 1, 'e');
                    break;
                case IgToken:
                    buffer[c] = 'i';
                    buffer.Insert(c + 1, 'g');
                    break;
                case StToken:
                    buffer[c] = 's';
                    buffer.Insert(c + 1, 't');
                    break;
            }
        }
    }
}
}