/*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
namespace Lucene.Net.Analysis.RU
{
/// Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
///
///
/// Boris Okner, b.okner@rogers.com
///
/// $Id: RussianStemmer.java,v 1.5 2004/03/29 22:48:01 cutting Exp $
///
public class RussianStemmer
{
private char[] charset;
// positions of RV, R1 and R2 respectively
private int RV, R1, R2;
// letters
private static char A = (char) (0);
private static char B = (char) (1);
private static char V = (char) (2);
private static char G = (char) (3);
private static char D = (char) (4);
private static char E = (char) (5);
private static char ZH = (char) (6);
private static char Z = (char) (7);
private static char I = (char) (8);
private static char I_ = (char) (9);
private static char K = (char) (10);
private static char L = (char) (11);
private static char M = (char) (12);
private static char N = (char) (13);
private static char O = (char) (14);
private static char P = (char) (15);
private static char R = (char) (16);
private static char S = (char) (17);
private static char T = (char) (18);
private static char U = (char) (19);
private static char F = (char) (20);
private static char X = (char) (21);
private static char TS = (char) (22);
private static char CH = (char) (23);
private static char SH = (char) (24);
private static char SHCH = (char) (25);
private static char HARD = (char) (26);
private static char Y = (char) (27);
private static char SOFT = (char) (28);
private static char AE = (char) (29);
private static char IU = (char) (30);
private static char IA = (char) (31);
// stem definitions
private static char[] vowels = new char[]{A, E, I, O, U, Y, AE, IU, IA};
private static char[][] perfectiveGerundEndings1 = new char[][]{new char[]{V}, new char[]{V, SH, I}, new char[]{V, SH, I, S, SOFT}};
private static char[][] perfectiveGerund1Predessors = new char[][]{new char[]{A}, new char[]{IA}};
private static char[][] perfectiveGerundEndings2 = new char[][]{new char[]{I, V}, new char[]{Y, V}, new char[]{I, V, SH, I}, new char[]{Y, V, SH, I}, new char[]{I, V, SH, I, S, SOFT}, new char[]{Y, V, SH, I, S, SOFT}};
private static char[][] adjectiveEndings = new char[][]{new char[]{E, E}, new char[]{I, E}, new char[]{Y, E}, new char[]{O, E}, new char[]{E, I_}, new char[]{I, I_}, new char[]{Y, I_}, new char[]{O, I_}, new char[]{E, M}, new char[]{I, M}, new char[]{Y, M}, new char[]{O, M}, new char[]{I, X}, new char[]{Y, X}, new char[]{U, IU}, new char[]{IU, IU}, new char[]{A, IA}, new char[]{IA, IA}, new char[]{O, IU}, new char[]{E, IU}, new char[]{I, M, I}, new char[]{Y, M, I}, new char[]{E, G, O}, new char[]{O, G, O}, new char[]{E, M, U}, new char[]{O, M, U}};
private static char[][] participleEndings1 = new char[][]{new char[]{SHCH}, new char[]{E, M}, new char[]{N, N}, new char[]{V, SH}, new char[]{IU, SHCH}};
private static char[][] participleEndings2 = new char[][]{new char[]{I, V, SH}, new char[]{Y, V, SH}, new char[]{U, IU, SHCH}};
private static char[][] participle1Predessors = new char[][]{new char[]{A}, new char[]{IA}};
private static char[][] reflexiveEndings = new char[][]{new char[]{S, IA}, new char[]{S, SOFT}};
private static char[][] verbEndings1 = new char[][]{new char[]{I_}, new char[]{L}, new char[]{N}, new char[]{L, O}, new char[]{N, O}, new char[]{E, T}, new char[]{IU, T}, new char[]{L, A}, new char[]{N, A}, new char[]{L, I}, new char[]{E, M}, new char[]{N, Y}, new char[]{E, T, E}, new char[]{I_, T, E}, new char[]{T, SOFT}, new char[]{E, SH, SOFT}, new char[]{N, N, O}};
private static char[][] verbEndings2 = new char[][]{new char[]{IU}, new char[]{U, IU}, new char[]{E, N}, new char[]{E, I_}, new char[]{IA, T}, new char[]{U, I_}, new char[]{I, L}, new char[]{Y, L}, new char[]{I, M}, new char[]{Y, M}, new char[]{I, T}, new char[]{Y, T}, new char[]{I, L, A}, new char[]{Y, L, A}, new char[]{E, N, A}, new char[]{I, T, E}, new char[]{I, L, I}, new char[]{Y, L, I}, new char[]{I, L, O}, new char[]{Y, L, O}, new char[]{E, N, O}, new char[]{U, E, T}, new char[]{U, IU, T}, new char[]{E, N, Y}, new char[]{I, T, SOFT}, new char[]{Y, T, SOFT}, new char[]{I, SH, SOFT}, new char[]{E, I_, T, E}, new char[]{U, I_, T, E}};
private static char[][] verb1Predessors = new char[][]{new char[]{A}, new char[]{IA}};
private static char[][] nounEndings = new char[][]{new char[]{A}, new char[]{U}, new char[]{I_}, new char[]{O}, new char[]{U}, new char[]{E}, new char[]{Y}, new char[]{I}, new char[]{SOFT}, new char[]{IA}, new char[]{E, V}, new char[]{O, V}, new char[]{I, E}, new char[]{SOFT, E}, new char[]{IA, X}, new char[]{I, IU}, new char[]{E, I}, new char[]{I, I}, new char[]{E, I_}, new char[]{O, I_}, new char[]{E, M}, new char[]{A, M}, new char[]{O, M}, new char[]{A, X}, new char[]{SOFT, IU}, new char[]{I, IA}, new char[]{SOFT, IA}, new char[]{I, I_}, new char[]{IA, M}, new char[]{IA, M, I}, new char[]{A, M, I}, new char[]{I, E, I_}, new char[]{I, IA, M}, new char[]{I, E, M}, new char[]{I, IA, X}, new char[]{I, IA, M, I}};
private static char[][] superlativeEndings = new char[][]{new char[]{E, I_, SH}, new char[]{E, I_, SH, E}};
private static char[][] derivationalEndings = new char[][]{new char[]{O, S, T}, new char[]{O, S, T, SOFT}};
/// RussianStemmer constructor comment.
public RussianStemmer():base()
{
}
/// RussianStemmer constructor comment.
public RussianStemmer(char[] charset):base()
{
this.charset = charset;
}
/// Adjectival ending is an adjective ending,
/// optionally preceded by participle ending.
/// Creation date: (17/03/2002 12:14:58 AM)
///
/// java.lang.StringBuffer
///
private bool Adjectival(System.Text.StringBuilder stemmingZone)
{
// look for adjective ending in a stemming zone
if (!FindAndRemoveEnding(stemmingZone, adjectiveEndings))
return false;
// if adjective ending was found, try for participle ending
bool r = FindAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors) || FindAndRemoveEnding(stemmingZone, participleEndings2);
return true;
}
/// Derivational endings
/// Creation date: (17/03/2002 12:14:58 AM)
///
/// java.lang.StringBuffer
///
private bool Derivational(System.Text.StringBuilder stemmingZone)
{
int endingLength = FindEnding(stemmingZone, derivationalEndings);
if (endingLength == 0)
// no derivational ending found
return false;
else
{
// Ensure that the ending locates in R2
if (R2 - RV <= stemmingZone.Length - endingLength)
{
stemmingZone.Length -= endingLength;
return true;
}
else
{
return false;
}
}
}
/// Finds ending among given ending class and returns the length of ending found(0, if not found).
/// Creation date: (17/03/2002 8:18:34 PM)
///
private int FindEnding(System.Text.StringBuilder stemmingZone, int startIndex, char[][] theEndingClass)
{
bool match = false;
for (int i = theEndingClass.Length - 1; i >= 0; i--)
{
char[] theEnding = theEndingClass[i];
// check if the ending is bigger than stemming zone
if (startIndex < theEnding.Length - 1)
{
match = false;
continue;
}
match = true;
int stemmingIndex = startIndex;
for (int j = theEnding.Length - 1; j >= 0; j--)
{
if (stemmingZone[stemmingIndex--] != charset[theEnding[j]])
{
match = false;
break;
}
}
// check if ending was found
if (match)
{
return theEndingClass[i].Length; // cut ending
}
}
return 0;
}
private int FindEnding(System.Text.StringBuilder stemmingZone, char[][] theEndingClass)
{
return FindEnding(stemmingZone, stemmingZone.Length - 1, theEndingClass);
}
/// Finds the ending among the given class of endings and removes it from stemming zone.
/// Creation date: (17/03/2002 8:18:34 PM)
///
private bool FindAndRemoveEnding(System.Text.StringBuilder stemmingZone, char[][] theEndingClass)
{
int endingLength = FindEnding(stemmingZone, theEndingClass);
if (endingLength == 0)
// not found
return false;
else
{
stemmingZone.Length -= endingLength;
// cut the ending found
return true;
}
}
/// Finds the ending among the given class of endings, then checks if this ending was
/// preceded by any of given predessors, and if so, removes it from stemming zone.
/// Creation date: (17/03/2002 8:18:34 PM)
///
private bool FindAndRemoveEnding(System.Text.StringBuilder stemmingZone, char[][] theEndingClass, char[][] thePredessors)
{
int endingLength = FindEnding(stemmingZone, theEndingClass);
if (endingLength == 0)
// not found
return false;
else
{
int predessorLength = FindEnding(stemmingZone, stemmingZone.Length - endingLength - 1, thePredessors);
if (predessorLength == 0)
return false;
else
{
stemmingZone.Length -= endingLength;
// cut the ending found
return true;
}
}
}
/// Marks positions of RV, R1 and R2 in a given word.
/// Creation date: (16/03/2002 3:40:11 PM)
///
private void MarkPositions(System.String word)
{
RV = 0;
R1 = 0;
R2 = 0;
int i = 0;
// find RV
while (word.Length > i && !IsVowel(word[i]))
{
i++;
}
if (word.Length - 1 < ++i)
return ; // RV zone is empty
RV = i;
// find R1
while (word.Length > i && IsVowel(word[i]))
{
i++;
}
if (word.Length - 1 < ++i)
return ; // R1 zone is empty
R1 = i;
// find R2
while (word.Length > i && !IsVowel(word[i]))
{
i++;
}
if (word.Length - 1 < ++i)
return ; // R2 zone is empty
while (word.Length > i && IsVowel(word[i]))
{
i++;
}
if (word.Length - 1 < ++i)
return ; // R2 zone is empty
R2 = i;
}
/// Checks if character is a vowel..
/// Creation date: (16/03/2002 10:47:03 PM)
///
/// boolean
///
/// char
///
private bool IsVowel(char letter)
{
for (int i = 0; i < vowels.Length; i++)
{
if (letter == charset[vowels[i]])
return true;
}
return false;
}
/// Noun endings.
/// Creation date: (17/03/2002 12:14:58 AM)
///
/// java.lang.StringBuffer
///
private bool Noun(System.Text.StringBuilder stemmingZone)
{
return FindAndRemoveEnding(stemmingZone, nounEndings);
}
/// Perfective gerund endings.
/// Creation date: (17/03/2002 12:14:58 AM)
///
/// java.lang.StringBuffer
///
private bool PerfectiveGerund(System.Text.StringBuilder stemmingZone)
{
return FindAndRemoveEnding(stemmingZone, perfectiveGerundEndings1, perfectiveGerund1Predessors) || FindAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
}
/// Reflexive endings.
/// Creation date: (17/03/2002 12:14:58 AM)
///
/// java.lang.StringBuffer
///
private bool Reflexive(System.Text.StringBuilder stemmingZone)
{
return FindAndRemoveEnding(stemmingZone, reflexiveEndings);
}
/// Insert the method's description here.
/// Creation date: (17/03/2002 12:14:58 AM)
///
/// java.lang.StringBuffer
///
private bool RemoveI(System.Text.StringBuilder stemmingZone)
{
if (stemmingZone.Length > 0 && stemmingZone[stemmingZone.Length - 1] == charset[I])
{
stemmingZone.Length -= 1;
return true;
}
else
{
return false;
}
}
/// Insert the method's description here.
/// Creation date: (17/03/2002 12:14:58 AM)
///
/// java.lang.StringBuffer
///
private bool RemoveSoft(System.Text.StringBuilder stemmingZone)
{
if (stemmingZone.Length > 0 && stemmingZone[stemmingZone.Length - 1] == charset[SOFT])
{
stemmingZone.Length -= 1;
return true;
}
else
{
return false;
}
}
/// Insert the method's description here.
/// Creation date: (16/03/2002 10:58:42 PM)
///
/// char[]
///
public virtual void SetCharset(char[] newCharset)
{
charset = newCharset;
}
/// Set ending definition as in Russian stemming algorithm.
/// Creation date: (16/03/2002 11:16:36 PM)
///
private void SetEndings()
{
vowels = new char[]{A, E, I, O, U, Y, AE, IU, IA};
perfectiveGerundEndings1 = new char[][]{new char[]{V}, new char[]{V, SH, I}, new char[]{V, SH, I, S, SOFT}};
perfectiveGerund1Predessors = new char[][]{new char[]{A}, new char[]{IA}};
perfectiveGerundEndings2 = new char[][]{new char[]{I, V}, new char[]{Y, V}, new char[]{I, V, SH, I}, new char[]{Y, V, SH, I}, new char[]{I, V, SH, I, S, SOFT}, new char[]{Y, V, SH, I, S, SOFT}};
adjectiveEndings = new char[][]{new char[]{E, E}, new char[]{I, E}, new char[]{Y, E}, new char[]{O, E}, new char[]{E, I_}, new char[]{I, I_}, new char[]{Y, I_}, new char[]{O, I_}, new char[]{E, M}, new char[]{I, M}, new char[]{Y, M}, new char[]{O, M}, new char[]{I, X}, new char[]{Y, X}, new char[]{U, IU}, new char[]{IU, IU}, new char[]{A, IA}, new char[]{IA, IA}, new char[]{O, IU}, new char[]{E, IU}, new char[]{I, M, I}, new char[]{Y, M, I}, new char[]{E, G, O}, new char[]{O, G, O}, new char[]{E, M, U}, new char[]{O, M, U}};
participleEndings1 = new char[][]{new char[]{SHCH}, new char[]{E, M}, new char[]{N, N}, new char[]{V, SH}, new char[]{IU, SHCH}};
participleEndings2 = new char[][]{new char[]{I, V, SH}, new char[]{Y, V, SH}, new char[]{U, IU, SHCH}};
participle1Predessors = new char[][]{new char[]{A}, new char[]{IA}};
reflexiveEndings = new char[][]{new char[]{S, IA}, new char[]{S, SOFT}};
verbEndings1 = new char[][]{new char[]{I_}, new char[]{L}, new char[]{N}, new char[]{L, O}, new char[]{N, O}, new char[]{E, T}, new char[]{IU, T}, new char[]{L, A}, new char[]{N, A}, new char[]{L, I}, new char[]{E, M}, new char[]{N, Y}, new char[]{E, T, E}, new char[]{I_, T, E}, new char[]{T, SOFT}, new char[]{E, SH, SOFT}, new char[]{N, N, O}};
verbEndings2 = new char[][]{new char[]{IU}, new char[]{U, IU}, new char[]{E, N}, new char[]{E, I_}, new char[]{IA, T}, new char[]{U, I_}, new char[]{I, L}, new char[]{Y, L}, new char[]{I, M}, new char[]{Y, M}, new char[]{I, T}, new char[]{Y, T}, new char[]{I, L, A}, new char[]{Y, L, A}, new char[]{E, N, A}, new char[]{I, T, E}, new char[]{I, L, I}, new char[]{Y, L, I}, new char[]{I, L, O}, new char[]{Y, L, O}, new char[]{E, N, O}, new char[]{U, E, T}, new char[]{U, IU, T}, new char[]{E, N, Y}, new char[]{I, T, SOFT}, new char[]{Y, T, SOFT}, new char[]{I, SH, SOFT}, new char[]{E, I_, T, E}, new char[]{U, I_, T, E}};
verb1Predessors = new char[][]{new char[]{A}, new char[]{IA}};
nounEndings = new char[][]{new char[]{A}, new char[]{IU}, new char[]{I_}, new char[]{O}, new char[]{U}, new char[]{E}, new char[]{Y}, new char[]{I}, new char[]{SOFT}, new char[]{IA}, new char[]{E, V}, new char[]{O, V}, new char[]{I, E}, new char[]{SOFT, E}, new char[]{IA, X}, new char[]{I, IU}, new char[]{E, I}, new char[]{I, I}, new char[]{E, I_}, new char[]{O, I_}, new char[]{E, M}, new char[]{A, M}, new char[]{O, M}, new char[]{A, X}, new char[]{SOFT, IU}, new char[]{I, IA}, new char[]{SOFT, IA}, new char[]{I, I_}, new char[]{IA, M}, new char[]{IA, M, I}, new char[]{A, M, I}, new char[]{I, E, I_}, new char[]{I, IA, M}, new char[]{I, E, M}, new char[]{I, IA, X}, new char[]{I, IA, M, I}};
superlativeEndings = new char[][]{new char[]{E, I_, SH}, new char[]{E, I_, SH, E}};
derivationalEndings = new char[][]{new char[]{O, S, T}, new char[]{O, S, T, SOFT}};
}
/// Finds the stem for given Russian word.
/// Creation date: (16/03/2002 3:36:48 PM)
///
/// java.lang.String
///
/// java.lang.String
///
public virtual System.String Stem(System.String input)
{
MarkPositions(input);
if (RV == 0)
return input; //RV wasn't detected, nothing to stem
System.Text.StringBuilder stemmingZone = new System.Text.StringBuilder(input.Substring(RV));
// stemming goes on in RV
// Step 1
if (!PerfectiveGerund(stemmingZone))
{
Reflexive(stemmingZone);
bool r = Adjectival(stemmingZone) || Verb(stemmingZone) || Noun(stemmingZone);
}
// Step 2
RemoveI(stemmingZone);
// Step 3
Derivational(stemmingZone);
// Step 4
Superlative(stemmingZone);
UndoubleN(stemmingZone);
RemoveSoft(stemmingZone);
// return result
return input.Substring(0, (RV) - (0)) + stemmingZone.ToString();
}
/// Superlative endings.
/// Creation date: (17/03/2002 12:14:58 AM)
///
/// java.lang.StringBuffer
///
private bool Superlative(System.Text.StringBuilder stemmingZone)
{
return FindAndRemoveEnding(stemmingZone, superlativeEndings);
}
/// Undoubles N.
/// Creation date: (17/03/2002 12:14:58 AM)
///
/// java.lang.StringBuffer
///
private bool UndoubleN(System.Text.StringBuilder stemmingZone)
{
char[][] doubleN = new char[][]{new char[]{N, N}};
if (FindEnding(stemmingZone, doubleN) != 0)
{
stemmingZone.Length -= 1;
return true;
}
else
{
return false;
}
}
/// Verb endings.
/// Creation date: (17/03/2002 12:14:58 AM)
///
/// java.lang.StringBuffer
///
private bool Verb(System.Text.StringBuilder stemmingZone)
{
return FindAndRemoveEnding(stemmingZone, verbEndings1, verb1Predessors) || FindAndRemoveEnding(stemmingZone, verbEndings2);
}
/// Static method for stemming with different charsets
public static System.String Stem(System.String theWord, char[] charset)
{
RussianStemmer stemmer = new RussianStemmer();
stemmer.SetCharset(charset);
return stemmer.Stem(theWord);
}
}
}