/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
using System;
using System.Text;
namespace Lucene.Net.Analysis.Ru
{
/// <summary>
/// Russian stemming algorithm implementation (see http://snowball.sourceforge.net for a detailed description).
/// </summary>
public class RussianStemmer
{
/// <summary>
/// Lookup table that turns a letter index (one of the letter constants of this class,
/// or an entry of an ending table) into the character it is compared against in the
/// word being stemmed. Stays null until a constructor or SetCharset supplies it.
/// </summary>
private char[] charset;

/// <summary>
/// Start index of the RV region of the word last passed to MarkPositions;
/// 0 means the region was not found.
/// </summary>
private int RV;

/// <summary>
/// Start index of the R1 region of the word last passed to MarkPositions;
/// 0 means the region was not found.
/// </summary>
private int R1;

/// <summary>
/// Start index of the R2 region of the word last passed to MarkPositions;
/// 0 means the region was not found.
/// </summary>
private int R2;
// Indices into the charset table, one per Russian letter. The names look like Latin
// transliterations of the letters (presumably I_ = short i, SOFT/HARD = soft/hard sign,
// AE/IU/IA = the last three letters of the alphabet) - confirm against the charset
// tables supplied by callers.
// They are compile-time constants, so they are declared const rather than as mutable statics.
// Letters that no ending table uses are kept as comments so the numbering stays visible.
private const char A = (char)0;
//private const char B = (char)1;
private const char V = (char)2;
private const char G = (char)3;
//private const char D = (char)4;
private const char E = (char)5;
//private const char ZH = (char)6;
//private const char Z = (char)7;
private const char I = (char)8;
private const char I_ = (char)9;
//private const char K = (char)10;
private const char L = (char)11;
private const char M = (char)12;
private const char N = (char)13;
private const char O = (char)14;
//private const char P = (char)15;
//private const char R = (char)16;
private const char S = (char)17;
private const char T = (char)18;
private const char U = (char)19;
//private const char F = (char)20;
private const char X = (char)21;
//private const char TS = (char)22;
//private const char CH = (char)23;
private const char SH = (char)24;
private const char SHCH = (char)25;
//private const char HARD = (char)26;
private const char Y = (char)27;
private const char SOFT = (char)28;
private const char AE = (char)29;
private const char IU = (char)30;
private const char IA = (char)31;
// ---- stem definitions ----

/// <summary>
/// Letter indices that IsVowel treats as vowels. The table is never reassigned,
/// hence readonly.
/// </summary>
private static readonly char[] vowels = new char[] { A, E, I, O, U, Y, AE, IU, IA };
/// <summary>
/// Perfective gerund endings, group 1. PerfectiveGerund removes one of these only when
/// it directly follows a letter listed in perfectiveGerund1Predessors.
/// Entry order is significant: FindEnding tries the entries from last to first and
/// stops at the first match.
/// </summary>
private static readonly char[][] perfectiveGerundEndings1 = new char[][]
{
    new char[] { V },
    new char[] { V, SH, I },
    new char[] { V, SH, I, S, SOFT }
};

/// <summary>
/// Letters that must precede a group 1 perfective gerund ending.
/// </summary>
private static readonly char[][] perfectiveGerund1Predessors = new char[][]
{
    new char[] { A },
    new char[] { IA }
};

/// <summary>
/// Perfective gerund endings, group 2; removed without any predecessor check.
/// Entry order is significant: FindEnding tries the entries from last to first.
/// </summary>
private static readonly char[][] perfectiveGerundEndings2 = new char[][]
{
    new char[] { I, V },
    new char[] { Y, V },
    new char[] { I, V, SH, I },
    new char[] { Y, V, SH, I },
    new char[] { I, V, SH, I, S, SOFT },
    new char[] { Y, V, SH, I, S, SOFT }
};
/// <summary>
/// Adjective endings used by Adjectival. The table is never reassigned, hence readonly.
/// Entry order is significant: FindEnding tries the entries from last to first, so the
/// three-letter endings at the bottom are tested before the two-letter ones.
/// </summary>
private static readonly char[][] adjectiveEndings = new char[][]
{
    // two-letter endings
    new char[] { E, E },   new char[] { I, E },   new char[] { Y, E },   new char[] { O, E },
    new char[] { E, I_ },  new char[] { I, I_ },  new char[] { Y, I_ },  new char[] { O, I_ },
    new char[] { E, M },   new char[] { I, M },   new char[] { Y, M },   new char[] { O, M },
    new char[] { I, X },   new char[] { Y, X },
    new char[] { U, IU },  new char[] { IU, IU },
    new char[] { A, IA },  new char[] { IA, IA },
    new char[] { O, IU },  new char[] { E, IU },
    // three-letter endings
    new char[] { I, M, I }, new char[] { Y, M, I },
    new char[] { E, G, O }, new char[] { O, G, O },
    new char[] { E, M, U }, new char[] { O, M, U }
};
/// <summary>
/// Participle endings, group 1. Adjectival removes one of these only when it directly
/// follows a letter listed in participle1Predessors.
/// Entry order is significant: FindEnding tries the entries from last to first.
/// </summary>
private static readonly char[][] participleEndings1 = new char[][]
{
    new char[] { SHCH },
    new char[] { E, M },
    new char[] { N, N },
    new char[] { V, SH },
    new char[] { IU, SHCH }
};

/// <summary>
/// Participle endings, group 2; removed without any predecessor check.
/// </summary>
private static readonly char[][] participleEndings2 = new char[][]
{
    new char[] { I, V, SH },
    new char[] { Y, V, SH },
    new char[] { U, IU, SHCH }
};

/// <summary>
/// Letters that must precede a group 1 participle ending.
/// </summary>
private static readonly char[][] participle1Predessors = new char[][]
{
    new char[] { A },
    new char[] { IA }
};
/// <summary>
/// Reflexive endings removed by Reflexive. The table is never reassigned, hence readonly.
/// </summary>
private static readonly char[][] reflexiveEndings = new char[][]
{
    new char[] { S, IA },
    new char[] { S, SOFT }
};
/// <summary>
/// Verb endings, group 1. Verb removes one of these only when it directly follows a
/// letter listed in verb1Predessors.
/// Entry order is significant (and is not strictly by length): FindEnding tries the
/// entries from last to first and stops at the first match.
/// </summary>
private static readonly char[][] verbEndings1 = new char[][]
{
    new char[] { I_ },      new char[] { L },        new char[] { N },
    new char[] { L, O },    new char[] { N, O },
    new char[] { E, T },    new char[] { IU, T },
    new char[] { L, A },    new char[] { N, A },
    new char[] { L, I },    new char[] { E, M },     new char[] { N, Y },
    new char[] { E, T, E }, new char[] { I_, T, E },
    new char[] { T, SOFT },
    new char[] { E, SH, SOFT },
    new char[] { N, N, O }
};

/// <summary>
/// Verb endings, group 2; removed without any predecessor check.
/// Entry order is significant: FindEnding tries the entries from last to first.
/// </summary>
private static readonly char[][] verbEndings2 = new char[][]
{
    new char[] { IU },
    new char[] { U, IU },
    new char[] { E, N },       new char[] { E, I_ },
    new char[] { IA, T },      new char[] { U, I_ },
    new char[] { I, L },       new char[] { Y, L },
    new char[] { I, M },       new char[] { Y, M },
    new char[] { I, T },       new char[] { Y, T },
    new char[] { I, L, A },    new char[] { Y, L, A },
    new char[] { E, N, A },    new char[] { I, T, E },
    new char[] { I, L, I },    new char[] { Y, L, I },
    new char[] { I, L, O },    new char[] { Y, L, O },
    new char[] { E, N, O },
    new char[] { U, E, T },    new char[] { U, IU, T },
    new char[] { E, N, Y },
    new char[] { I, T, SOFT }, new char[] { Y, T, SOFT },
    new char[] { I, SH, SOFT },
    new char[] { E, I_, T, E }, new char[] { U, I_, T, E }
};

/// <summary>
/// Letters that must precede a group 1 verb ending.
/// </summary>
private static readonly char[][] verb1Predessors = new char[][]
{
    new char[] { A },
    new char[] { IA }
};
/// <summary>
/// Noun endings removed by Noun. The table is never reassigned, hence readonly.
/// Entry order is significant: FindEnding tries the entries from last to first and
/// stops at the first match, so longer endings are listed after their shorter suffixes.
/// </summary>
private static readonly char[][] nounEndings = new char[][]
{
    // one-letter endings
    new char[] { A },
    // Was a second { U } entry, which duplicated the one below and left the IU
    // ending out of the noun class altogether.
    new char[] { IU },
    new char[] { I_ },
    new char[] { O },
    new char[] { U },
    new char[] { E },
    new char[] { Y },
    new char[] { I },
    new char[] { SOFT },
    new char[] { IA },
    // two-letter endings
    new char[] { E, V },     new char[] { O, V },
    new char[] { I, E },     new char[] { SOFT, E },
    new char[] { IA, X },    new char[] { I, IU },
    new char[] { E, I },     new char[] { I, I },
    new char[] { E, I_ },    new char[] { O, I_ },
    new char[] { E, M },     new char[] { A, M },
    new char[] { O, M },     new char[] { A, X },
    new char[] { SOFT, IU }, new char[] { I, IA },
    new char[] { SOFT, IA }, new char[] { I, I_ },
    new char[] { IA, M },
    // three- and four-letter endings
    new char[] { IA, M, I }, new char[] { A, M, I },
    new char[] { I, E, I_ }, new char[] { I, IA, M },
    new char[] { I, E, M },  new char[] { I, IA, X },
    new char[] { I, IA, M, I }
};
/// <summary>
/// Superlative endings removed by Superlative. The table is never reassigned, hence readonly.
/// The longer ending comes last because FindEnding tries the entries from last to first.
/// </summary>
private static readonly char[][] superlativeEndings = new char[][]
{
    new char[] { E, I_, SH },
    new char[] { E, I_, SH, E }
};

/// <summary>
/// Derivational endings examined by Derivational. The table is never reassigned, hence readonly.
/// The longer ending comes last because FindEnding tries the entries from last to first.
/// </summary>
private static readonly char[][] derivationalEndings = new char[][]
{
    new char[] { O, S, T },
    new char[] { O, S, T, SOFT }
};
/// <summary>
/// Creates a stemmer without a character table. A table has to be supplied through
/// SetCharset before a non-empty word is stemmed, because the stemming code indexes
/// the table directly.
/// </summary>
public RussianStemmer()
{
}
/// <summary>
/// Creates a stemmer that uses the given character table.
/// </summary>
/// <param name="charset">Table indexed by the letter constants of this class; the array is stored by reference, not copied.</param>
public RussianStemmer(char[] charset)
{
this.charset = charset;
}
/// <summary>
/// Removes an adjectival ending: an adjective ending, optionally preceded by a
/// participle ending.
/// </summary>
/// <param name="stemmingZone">Buffer holding the part of the word being stemmed; truncated in place.</param>
/// <returns>true when an adjective ending was removed (whether or not a participle ending was also removed); false when no adjective ending was found.</returns>
private bool Adjectival(StringBuilder stemmingZone)
{
    bool adjectiveRemoved = FindAndRemoveEnding(stemmingZone, adjectiveEndings);
    if (!adjectiveRemoved)
    {
        return false;
    }

    // The participle part is optional: its outcome does not affect the result.
    // Group 2 is only tried when group 1 (which needs a predecessor letter) removed nothing.
    if (!FindAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors))
    {
        FindAndRemoveEnding(stemmingZone, participleEndings2);
    }
    return true;
}
/// <summary>
/// Removes a derivational ending, but only when the whole ending lies inside the R2 region.
/// </summary>
/// <param name="stemmingZone">Buffer holding the word from index RV onwards; truncated in place.</param>
/// <returns>true when an ending was removed; false when none was found, the word has no R2 region, or the ending starts before R2.</returns>
private bool Derivational(StringBuilder stemmingZone)
{
    int endingLength = FindEnding(stemmingZone, derivationalEndings);
    if (endingLength == 0)
    {
        // no derivational ending found
        return false;
    }

    // MarkPositions leaves R2 at 0 when the word has no R2 region. Without this guard
    // "R2 - RV" would be negative and the range test below would always succeed,
    // stripping an ending that cannot lie in a region that does not exist.
    if (R2 == 0)
    {
        return false;
    }

    // stemmingZone starts at RV, so R2 expressed in zone coordinates is R2 - RV.
    int endingStart = stemmingZone.Length - endingLength;
    if (R2 - RV > endingStart)
    {
        return false;
    }

    stemmingZone.Length = endingStart;
    return true;
}
/// <summary>
/// Looks for one of the given endings in the stemming zone, reading backwards from
/// <paramref name="startIndex"/>.
/// </summary>
/// <param name="stemmingZone">Buffer to examine; not modified.</param>
/// <param name="startIndex">Index of the last character that may belong to the ending.</param>
/// <param name="theEndingClass">Candidate endings, each given as letter indices into the charset table. Candidates are tried from the last entry to the first.</param>
/// <returns>Length of the first candidate that matches, or 0 when none matches.</returns>
private int FindEnding(StringBuilder stemmingZone, int startIndex, char[][] theEndingClass)
{
    for (int candidate = theEndingClass.Length; candidate-- > 0; )
    {
        char[] ending = theEndingClass[candidate];

        // A candidate longer than the text available up to startIndex cannot match.
        if (ending.Length - 1 > startIndex)
        {
            continue;
        }

        // Compare from the last letter of the ending backwards.
        int matched = 0;
        while (matched < ending.Length
            && stemmingZone[startIndex - matched] == charset[ending[ending.Length - 1 - matched]])
        {
            matched++;
        }

        if (matched == ending.Length)
        {
            return ending.Length;
        }
    }
    return 0;
}
/// <summary>
/// Looks for one of the given endings at the very end of the stemming zone.
/// </summary>
/// <param name="stemmingZone">Buffer to examine; not modified.</param>
/// <param name="theEndingClass">Candidate endings, given as letter indices into the charset table.</param>
/// <returns>Length of the ending found, or 0 when none matches.</returns>
private int FindEnding(StringBuilder stemmingZone, char[][] theEndingClass)
{
    int lastIndex = stemmingZone.Length - 1;
    return FindEnding(stemmingZone, lastIndex, theEndingClass);
}
/// <summary>
/// Looks for one of the given endings at the end of the stemming zone and cuts it off.
/// </summary>
/// <param name="stemmingZone">Buffer to examine; truncated in place when an ending is found.</param>
/// <param name="theEndingClass">Candidate endings, given as letter indices into the charset table.</param>
/// <returns>true when an ending was found and removed; otherwise false.</returns>
private bool FindAndRemoveEnding(StringBuilder stemmingZone, char[][] theEndingClass)
{
    int suffixLength = FindEnding(stemmingZone, theEndingClass);
    if (suffixLength != 0)
    {
        // cut the ending found
        stemmingZone.Length = stemmingZone.Length - suffixLength;
        return true;
    }
    return false;
}
/// <summary>
/// Looks for one of the given endings at the end of the stemming zone and cuts it off,
/// provided the text right before the ending matches one of the given predecessors.
/// The predecessor itself is kept.
/// </summary>
/// <param name="stemmingZone">Buffer to examine; truncated in place when the ending is removed.</param>
/// <param name="theEndingClass">Candidate endings, given as letter indices into the charset table.</param>
/// <param name="thePredessors">Letter sequences, one of which must directly precede the ending.</param>
/// <returns>true when an ending with a valid predecessor was removed; otherwise false.</returns>
private bool FindAndRemoveEnding(StringBuilder stemmingZone,
    char[][] theEndingClass, char[][] thePredessors)
{
    int suffixLength = FindEnding(stemmingZone, theEndingClass);
    if (suffixLength == 0)
    {
        // no ending found
        return false;
    }

    // Search for a predecessor that ends on the character just before the ending.
    int indexBeforeSuffix = stemmingZone.Length - suffixLength - 1;
    if (FindEnding(stemmingZone, indexBeforeSuffix, thePredessors) == 0)
    {
        return false;
    }

    // cut the ending found
    stemmingZone.Length = stemmingZone.Length - suffixLength;
    return true;
}
/// <summary>
/// Computes the start indices of the RV, R1 and R2 regions of a word and stores them
/// in the RV, R1 and R2 fields. All three are reset to 0 first; a region whose start
/// would fall outside the word keeps the value 0, as do all regions after it.
/// </summary>
/// <param name="word">Word to analyse.</param>
private void MarkPositions(String word)
{
    RV = 0;
    R1 = 0;
    R2 = 0;

    // Four scans run back to back, alternating between skipping non-vowels (even
    // phases) and skipping vowels (odd phases). After each scan the position moves
    // one character further; when that leaves the word, the remaining regions are empty.
    //   phase 0 -> RV: just past the first vowel
    //   phase 1 -> R1: just past the first non-vowel at or after RV
    //   phase 2 -> no region; steps over the first vowel at or after R1
    //   phase 3 -> R2: just past the first non-vowel after that vowel
    int position = 0;
    for (int phase = 0; phase < 4; phase++)
    {
        bool skippingVowels = (phase % 2) == 1;
        while (position < word.Length && IsVowel(word[position]) == skippingVowels)
        {
            position++;
        }

        position++;
        if (position >= word.Length)
        {
            return; // this region and all later ones are empty
        }

        switch (phase)
        {
            case 0:
                RV = position;
                break;
            case 1:
                R1 = position;
                break;
            case 3:
                R2 = position;
                break;
        }
    }
}
/// <summary>
/// Checks whether a character is one of the vowels, as encoded by the current charset table.
/// </summary>
/// <param name="letter">Character taken from the word being stemmed.</param>
/// <returns>true when the character equals the charset entry of any index in the vowels table.</returns>
private bool IsVowel(char letter)
{
    foreach (char vowelIndex in vowels)
    {
        if (charset[vowelIndex] == letter)
        {
            return true;
        }
    }
    return false;
}
/// <summary>
/// Removes a noun ending from the end of the stemming zone.
/// </summary>
/// <param name="stemmingZone">Buffer to examine; truncated in place when an ending is found.</param>
/// <returns>true when a noun ending was removed; otherwise false.</returns>
private bool Noun(StringBuilder stemmingZone)
{
    bool removed = FindAndRemoveEnding(stemmingZone, nounEndings);
    return removed;
}
/// <summary>
/// Removes a perfective gerund ending from the end of the stemming zone.
/// </summary>
/// <param name="stemmingZone">Buffer to examine; truncated in place when an ending is found.</param>
/// <returns>true when an ending from either group was removed; otherwise false.</returns>
private bool PerfectiveGerund(StringBuilder stemmingZone)
{
    // Group 1 endings only count when they follow one of the predecessor letters.
    if (FindAndRemoveEnding(stemmingZone, perfectiveGerundEndings1, perfectiveGerund1Predessors))
    {
        return true;
    }

    // Group 2 endings need no predecessor.
    return FindAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
}
/// <summary>
/// Removes a reflexive ending from the end of the stemming zone.
/// </summary>
/// <param name="stemmingZone">Buffer to examine; truncated in place when an ending is found.</param>
/// <returns>true when a reflexive ending was removed; otherwise false.</returns>
private bool Reflexive(StringBuilder stemmingZone)
{
    bool removed = FindAndRemoveEnding(stemmingZone, reflexiveEndings);
    return removed;
}
/// <summary>
/// Drops the last character of the stemming zone when it is the letter I of the
/// current charset.
/// </summary>
/// <param name="stemmingZone">Buffer to examine; shortened by one character on success.</param>
/// <returns>true when a character was removed; false when the zone is empty or ends in another character.</returns>
private bool RemoveI(StringBuilder stemmingZone)
{
    int lastIndex = stemmingZone.Length - 1;
    if (lastIndex < 0 || stemmingZone[lastIndex] != charset[I])
    {
        return false;
    }

    stemmingZone.Length = lastIndex;
    return true;
}
/// <summary>
/// Drops the last character of the stemming zone when it is the SOFT letter of the
/// current charset.
/// </summary>
/// <param name="stemmingZone">Buffer to examine; shortened by one character on success.</param>
/// <returns>true when a character was removed; false when the zone is empty or ends in another character.</returns>
private bool RemoveSoft(StringBuilder stemmingZone)
{
    int lastIndex = stemmingZone.Length - 1;
    if (lastIndex < 0 || stemmingZone[lastIndex] != charset[SOFT])
    {
        return false;
    }

    stemmingZone.Length = lastIndex;
    return true;
}
/// <summary>
/// Replaces the character table used by this stemmer.
/// </summary>
/// <param name="newCharset">Table indexed by the letter constants of this class; the array is stored by reference, not copied.</param>
public void SetCharset(char[] newCharset)
{
charset = newCharset;
}
// ///
// /// Set ending definition as in Russian stemming algorithm.
// /// Creation date: (16/03/2002 11:16:36 PM)
// ///
// private void SetEndings()
// {
// vowels = new char[] { A, E, I, O, U, Y, AE, IU, IA };
//
// perfectiveGerundEndings1 = new char[][] {
// new char[] { V }, new char[] { V, SH, I }, new char[] { V, SH, I, S, SOFT }
// };
//
// perfectiveGerund1Predessors = new char[][] {
// new char[] { A }, new char[] { IA }
// };
//
// perfectiveGerundEndings2 = new char[][] {
// new char[] { I, V },
// new char[] { Y, V },
// new char[] { I, V, SH, I },
// new char[] { Y, V, SH, I },
// new char[] { I, V, SH, I, S, SOFT },
// new char[] { Y, V, SH, I, S, SOFT }
// };
//
// adjectiveEndings = new char[][] {
// new char[] { E, E },
// new char[] { I, E },
// new char[] { Y, E },
// new char[] { O, E },
// new char[] { E, I_ },
// new char[] { I, I_ },
// new char[] { Y, I_ },
// new char[] { O, I_ },
// new char[] { E, M },
// new char[] { I, M },
// new char[] { Y, M },
// new char[] { O, M },
// new char[] { I, X },
// new char[] { Y, X },
// new char[] { U, IU },
// new char[] { IU, IU },
// new char[] { A, IA },
// new char[] { IA, IA },
// new char[] { O, IU },
// new char[] { E, IU },
// new char[] { I, M, I },
// new char[] { Y, M, I },
// new char[] { E, G, O },
// new char[] { O, G, O },
// new char[] { E, M, U },
// new char[] { O, M, U }
// };
//
// participleEndings1 = new char[][] {
// new char[] { SHCH },
// new char[] { E, M },
// new char[] { N, N },
// new char[] { V, SH },
// new char[] { IU, SHCH }
// };
//
// participleEndings2 = new char[][] {
// new char[] { I, V, SH },
// new char[] { Y, V, SH },
// new char[] { U, IU, SHCH }
// };
//
// participle1Predessors = new char[][] {
// new char[] { A },
// new char[] { IA }
// };
//
// reflexiveEndings = new char[][] {
// new char[] { S, IA },
// new char[] { S, SOFT }
// };
//
// verbEndings1 = new char[][] {
// new char[] { I_ },
// new char[] { L },
// new char[] { N },
// new char[] { L, O },
// new char[] { N, O },
// new char[] { E, T },
// new char[] { IU, T },
// new char[] { L, A },
// new char[] { N, A },
// new char[] { L, I },
// new char[] { E, M },
// new char[] { N, Y },
// new char[] { E, T, E },
// new char[] { I_, T, E },
// new char[] { T, SOFT },
// new char[] { E, SH, SOFT },
// new char[] { N, N, O }
// };
//
// verbEndings2 = new char[][] {
// new char[] { IU },
// new char[] { U, IU },
// new char[] { E, N },
// new char[] { E, I_ },
// new char[] { IA, T },
// new char[] { U, I_ },
// new char[] { I, L },
// new char[] { Y, L },
// new char[] { I, M },
// new char[] { Y, M },
// new char[] { I, T },
// new char[] { Y, T },
// new char[] { I, L, A },
// new char[] { Y, L, A },
// new char[] { E, N, A },
// new char[] { I, T, E },
// new char[] { I, L, I },
// new char[] { Y, L, I },
// new char[] { I, L, O },
// new char[] { Y, L, O },
// new char[] { E, N, O },
// new char[] { U, E, T },
// new char[] { U, IU, T },
// new char[] { E, N, Y },
// new char[] { I, T, SOFT },
// new char[] { Y, T, SOFT },
// new char[] { I, SH, SOFT },
// new char[] { E, I_, T, E },
// new char[] { U, I_, T, E }
// };
//
// verb1Predessors = new char[][] {
// new char[] { A },
// new char[] { IA }
// };
//
// nounEndings = new char[][] {
// new char[] { A },
// new char[] { IU },
// new char[] { I_ },
// new char[] { O },
// new char[] { U },
// new char[] { E },
// new char[] { Y },
// new char[] { I },
// new char[] { SOFT },
// new char[] { IA },
// new char[] { E, V },
// new char[] { O, V },
// new char[] { I, E },
// new char[] { SOFT, E },
// new char[] { IA, X },
// new char[] { I, IU },
// new char[] { E, I },
// new char[] { I, I },
// new char[] { E, I_ },
// new char[] { O, I_ },
// new char[] { E, M },
// new char[] { A, M },
// new char[] { O, M },
// new char[] { A, X },
// new char[] { SOFT, IU },
// new char[] { I, IA },
// new char[] { SOFT, IA },
// new char[] { I, I_ },
// new char[] { IA, M },
// new char[] { IA, M, I },
// new char[] { A, M, I },
// new char[] { I, E, I_ },
// new char[] { I, IA, M },
// new char[] { I, E, M },
// new char[] { I, IA, X },
// new char[] { I, IA, M, I }
// };
//
// superlativeEndings = new char[][] {
// new char[] { E, I_, SH },
// new char[] { E, I_, SH, E }
// };
//
// derivationalEndings = new char[][] {
// new char[] { O, S, T },
// new char[] { O, S, T, SOFT }
// };
// }
/// <summary>
/// Finds the stem of the given Russian word.
/// </summary>
/// <param name="input">Word to stem, written with the characters of the current charset table.</param>
/// <returns>The stemmed word, or <paramref name="input"/> itself when it is empty or has no RV region.</returns>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
/// <exception cref="InvalidOperationException">A non-empty word is passed before a charset table has been supplied.</exception>
public String Stem(String input)
{
    // Fail with a descriptive exception instead of a NullReferenceException from deep
    // inside MarkPositions / IsVowel.
    if (input == null)
    {
        throw new ArgumentNullException("input");
    }
    if (input.Length == 0)
    {
        return input; // nothing to stem; the charset is not needed either
    }
    if (charset == null)
    {
        throw new InvalidOperationException(
            "No charset has been set; pass one to the constructor or call SetCharset before stemming.");
    }

    MarkPositions(input);
    if (RV == 0)
    {
        return input; // RV wasn't detected, nothing to stem
    }

    // Stemming only ever touches the part of the word that starts at RV.
    StringBuilder stemmingZone = new StringBuilder(input.Substring(RV));

    // Step 1: a perfective gerund ending, or else an optional reflexive ending
    // followed by the first of adjectival / verb / noun ending that applies.
    if (!PerfectiveGerund(stemmingZone))
    {
        Reflexive(stemmingZone);
        if (!Adjectival(stemmingZone) && !Verb(stemmingZone))
        {
            Noun(stemmingZone);
        }
    }

    // Step 2
    RemoveI(stemmingZone);

    // Step 3
    Derivational(stemmingZone);

    // Step 4
    Superlative(stemmingZone);
    UndoubleN(stemmingZone);
    RemoveSoft(stemmingZone);

    // Re-attach the untouched prefix in front of the stemmed zone.
    return input.Substring(0, RV) + stemmingZone.ToString();
}
/// <summary>
/// Removes a superlative ending from the end of the stemming zone.
/// </summary>
/// <param name="stemmingZone">Buffer to examine; truncated in place when an ending is found.</param>
/// <returns>true when a superlative ending was removed; otherwise false.</returns>
private bool Superlative(StringBuilder stemmingZone)
{
    bool removed = FindAndRemoveEnding(stemmingZone, superlativeEndings);
    return removed;
}
/// <summary>
/// Undoubles N: when the stemming zone ends in two N letters, drops the last one.
/// </summary>
/// <param name="stemmingZone">Buffer to examine; shortened by one character on success.</param>
/// <returns>true when a character was removed; false when the zone does not end in a double N.</returns>
private bool UndoubleN(StringBuilder stemmingZone)
{
    // The two trailing characters are compared directly, so no temporary ending
    // table has to be allocated on every call.
    int length = stemmingZone.Length;
    if (length < 2)
    {
        return false;
    }

    if (stemmingZone[length - 1] == charset[N] && stemmingZone[length - 2] == charset[N])
    {
        stemmingZone.Length = length - 1;
        return true;
    }
    return false;
}
/// <summary>
/// Removes a verb ending from the end of the stemming zone.
/// </summary>
/// <param name="stemmingZone">Buffer to examine; truncated in place when an ending is found.</param>
/// <returns>true when an ending from either group was removed; otherwise false.</returns>
private bool Verb(StringBuilder stemmingZone)
{
    // Group 1 endings only count when they follow one of the predecessor letters.
    if (FindAndRemoveEnding(stemmingZone, verbEndings1, verb1Predessors))
    {
        return true;
    }

    // Group 2 endings need no predecessor.
    return FindAndRemoveEnding(stemmingZone, verbEndings2);
}
/// <summary>
/// Stems a word with a throw-away stemmer configured for the given charset table.
/// </summary>
/// <param name="theWord">Word to stem.</param>
/// <param name="charset">Character table indexed by the letter constants of this class.</param>
/// <returns>The result of the instance Stem method for <paramref name="theWord"/>.</returns>
public static String Stem(String theWord, char[] charset)
{
    return new RussianStemmer(charset).Stem(theWord);
}
}
}