using System;
using System.IO;
using System.Text;
using System.Collections;

namespace Lucene.Net.Analysis.De
{
    /// <summary>
    /// A stemmer for German words. The algorithm is based on the report
    /// "A Fast and Simple Stemming Algorithm for German Words" by Jörg
    /// Caumanns (joerg.caumanns@isst.fhg.de).
    /// </summary>
    /// <remarks>
    /// An instance keeps its working buffer and substitution counter in fields
    /// that every call to <c>Stem</c> overwrites, so a single instance must not
    /// be used by several threads at the same time.
    /// </remarks>
    public class GermanStemmer
    {
        // Non-ASCII characters are written as escapes so that the code does not
        // depend on the encoding the source file is saved or read with.
        private const char SmallAUmlaut = '\u00E4'; // a with diaeresis
        private const char SmallOUmlaut = '\u00F6'; // o with diaeresis
        private const char SmallUUmlaut = '\u00FC'; // u with diaeresis
        private const char SharpS = '\u00DF';       // sharp s ("eszett")
        private const char ChToken = '\u00A7';      // section sign, stands in for "ch"

        /// <summary>
        /// Buffer for the term while it is being stemmed.
        /// </summary>
        private StringBuilder sb = new StringBuilder();

        /// <summary>
        /// Number of characters by which Substitute() changed the length of the
        /// term. Strip() adds it to the buffer length so that its length
        /// restrictions take the substitutions into account.
        /// </summary>
        private int substCount = 0;

        /// <summary>
        /// Stems the given term to a unique discriminator.
        /// </summary>
        /// <param name="term">The term that should be stemmed.</param>
        /// <returns>
        /// The discriminator for <paramref name="term"/>. A term that contains
        /// anything other than letters is returned lowercased but otherwise
        /// unchanged.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="term"/> is null.</exception>
        internal String Stem( String term )
        {
            if ( term == null )
            {
                throw new ArgumentNullException( "term" );
            }

            // Use lowercase for medium stemming. The invariant culture keeps the
            // result independent of the current thread culture (e.g. the Turkish
            // dotless i).
            term = term.ToLowerInvariant();
            if ( !IsStemmable( term ) )
            {
                return term;
            }

            // Reset the shared buffer and load the term.
            sb.Length = 0;
            sb.Append( term );

            // Stemming starts here...
            Substitute( sb );
            Strip( sb );
            Optimize( sb );
            Resubstitute( sb );
            RemoveParticleDenotion( sb );
            return sb.ToString();
        }

        /// <summary>
        /// Checks if a term could be stemmed.
        /// </summary>
        /// <param name="term">The term to check.</param>
        /// <returns>
        /// true if, and only if, every character of the term is a letter
        /// (which includes the empty term).
        /// </returns>
        private bool IsStemmable( String term )
        {
            foreach ( char ch in term )
            {
                if ( !Char.IsLetter( ch ) )
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// Tells whether the buffer ends with the given suffix, comparing
        /// character by character without copying the buffer.
        /// </summary>
        private static bool EndsWith( StringBuilder buffer, String suffix )
        {
            int offset = buffer.Length - suffix.Length;
            if ( offset < 0 )
            {
                return false;
            }
            for ( int i = 0; i < suffix.Length; i++ )
            {
                if ( buffer[offset + i] != suffix[i] )
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// Suffix stripping (stemming) on the current term. The stripping is reduced
        /// to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd",
        /// from which all regular suffixes are built. The simplification causes
        /// some overstemming, and way more irregular stems, but still provides unique
        /// discriminators in most of those cases.
        /// The algorithm is context free, except for the length restrictions.
        /// </summary>
        /// <param name="buffer">The term to strip; modified in place.</param>
        private void Strip( StringBuilder buffer )
        {
            bool doMore = true;
            while ( doMore && buffer.Length > 3 )
            {
                // Length of the term as it was before Substitute() shortened or
                // lengthened it.
                int effectiveLength = buffer.Length + substCount;
                char last = buffer[buffer.Length - 1];

                if ( effectiveLength > 5 && EndsWith( buffer, "nd" ) )
                {
                    buffer.Length -= 2;
                }
                else if ( effectiveLength > 4
                    && ( EndsWith( buffer, "em" ) || EndsWith( buffer, "er" ) ) )
                {
                    buffer.Length -= 2;
                }
                // "t" occurs only as suffix of verbs.
                else if ( last == 'e' || last == 's' || last == 'n' || last == 't' )
                {
                    buffer.Length -= 1;
                }
                else
                {
                    doMore = false;
                }
            }
        }

        /// <summary>
        /// Does some optimizations on the term. These optimizations are contextual.
        /// </summary>
        /// <param name="buffer">The term to optimize; modified in place.</param>
        private void Optimize( StringBuilder buffer )
        {
            // Additional step for female plurals of professions and inhabitants.
            // The asterisk is the doubled "n" as masked by Substitute().
            if ( buffer.Length > 5 && EndsWith( buffer, "erin*" ) )
            {
                buffer.Length -= 1;
                Strip( buffer );
            }

            // Additional step for irregular plural nouns like "Matrizen -> Matrix".
            // The length check keeps an empty term from indexing position -1.
            if ( buffer.Length > 0 && buffer[buffer.Length - 1] == 'z' )
            {
                buffer[buffer.Length - 1] = 'x';
            }
        }

        /// <summary>
        /// Removes a particle denotation ("ge") from a term: the first two
        /// characters of the first "gege" found in a term longer than four
        /// characters are dropped.
        /// </summary>
        /// <param name="buffer">The term to process; modified in place.</param>
        private void RemoveParticleDenotion( StringBuilder buffer )
        {
            if ( buffer.Length > 4 )
            {
                int index = buffer.ToString().IndexOf( "gege", StringComparison.Ordinal );
                if ( index >= 0 )
                {
                    buffer.Remove( index, 2 );
                }
            }
        }

        /// <summary>
        /// Replaces the character combination starting at <paramref name="index"/>
        /// with a single token and records how many characters were dropped.
        /// </summary>
        /// <param name="buffer">The term being substituted.</param>
        /// <param name="index">Position of the first character of the combination.</param>
        /// <param name="token">The token written at <paramref name="index"/>.</param>
        /// <param name="removed">Number of following characters to drop.</param>
        private void Mask( StringBuilder buffer, int index, char token, int removed )
        {
            buffer[index] = token;
            buffer.Remove( index + 1, removed );
            substCount += removed;
        }

        /// <summary>
        /// Do some substitutions for the term to reduce overstemming:
        ///
        /// - Substitute Umlauts with their corresponding vowel: äöü -> aou,
        ///   "ß" is substituted by "ss"
        /// - Substitute a second char of a pair of equal characters with
        ///   an asterisk: ?? -> ?*
        /// - Substitute some common character combinations with a token:
        ///   sch/ch/ei/ie/ig/st -> $/§/%/&amp;/#/!
        /// </summary>
        /// <param name="buffer">The term to substitute; modified in place.</param>
        private void Substitute( StringBuilder buffer )
        {
            substCount = 0;
            for ( int c = 0; c < buffer.Length; c++ )
            {
                char current = buffer[c];

                // Replace the second char of a pair of equal characters with an asterisk.
                if ( c > 0 && current == buffer[c - 1] )
                {
                    buffer[c] = '*';
                }
                // Substitute Umlauts.
                else if ( current == SmallAUmlaut )
                {
                    buffer[c] = 'a';
                }
                else if ( current == SmallOUmlaut )
                {
                    buffer[c] = 'o';
                }
                else if ( current == SmallUUmlaut )
                {
                    buffer[c] = 'u';
                }
                // Handled here rather than in the masking step below so that a
                // sharp s at the very end of a word is replaced as well.
                else if ( current == SharpS )
                {
                    buffer[c] = 's';
                    buffer.Insert( c + 1, 's' );
                    substCount++;
                }

                // Masking needs at least one character to the right of the current one.
                if ( c < buffer.Length - 1 )
                {
                    // Re-read both characters: the step above may have changed them.
                    char first = buffer[c];
                    char second = buffer[c + 1];

                    // Mask several common character combinations with a token.
                    if ( c < buffer.Length - 2 && first == 's' && second == 'c' && buffer[c + 2] == 'h' )
                    {
                        // Two characters are dropped, so the counter grows by two.
                        Mask( buffer, c, '$', 2 );
                    }
                    else if ( first == 'c' && second == 'h' )
                    {
                        Mask( buffer, c, ChToken, 1 );
                    }
                    else if ( first == 'e' && second == 'i' )
                    {
                        Mask( buffer, c, '%', 1 );
                    }
                    else if ( first == 'i' && second == 'e' )
                    {
                        Mask( buffer, c, '&', 1 );
                    }
                    else if ( first == 'i' && second == 'g' )
                    {
                        Mask( buffer, c, '#', 1 );
                    }
                    else if ( first == 's' && second == 't' )
                    {
                        Mask( buffer, c, '!', 1 );
                    }
                }
            }
        }

        /// <summary>
        /// Undoes the changes made by Substitute(), that is, character pairs and
        /// character combinations. Umlauts will remain as their corresponding vowel,
        /// as "ß" remains as "ss".
        /// </summary>
        /// <param name="buffer">The term to restore; modified in place.</param>
        private void Resubstitute( StringBuilder buffer )
        {
            for ( int c = 0; c < buffer.Length; c++ )
            {
                switch ( buffer[c] )
                {
                    case '*':
                        // Substitute() only writes an asterisk after another
                        // character, so there is always a predecessor to copy.
                        buffer[c] = buffer[c - 1];
                        break;
                    case '$':
                        buffer[c] = 's';
                        buffer.Insert( c + 1, "ch" );
                        break;
                    case ChToken:
                        buffer[c] = 'c';
                        buffer.Insert( c + 1, 'h' );
                        break;
                    case '%':
                        buffer[c] = 'e';
                        buffer.Insert( c + 1, 'i' );
                        break;
                    case '&':
                        buffer[c] = 'i';
                        buffer.Insert( c + 1, 'e' );
                        break;
                    case '#':
                        buffer[c] = 'i';
                        buffer.Insert( c + 1, 'g' );
                        break;
                    case '!':
                        buffer[c] = 's';
                        buffer.Insert( c + 1, 't' );
                        break;
                }
            }
        }
    }
}