using System; using System.Text; using System.IO; using System.Collections; using Lucene.Net.Analysis; namespace Lucene.Net.Analysis.Ru { ///

/// <summary>
/// Analyzer for the Russian language. Supports an external list of stopwords
/// (words that will not be indexed at all).
/// A default set of stopwords is used unless an alternative list is specified.
/// </summary>

public sealed class RussianAnalyzer : Analyzer
{
    // Letter positions: each constant is an index into a charset array, so the
    // stop words below can be expressed independently of the actual encoding.
    // Indices 20 (F), 22 (TS) and 26 (HARD) are not used by any stop word and
    // therefore have no constant here.
    private const char A = (char)0;
    private const char B = (char)1;
    private const char V = (char)2;
    private const char G = (char)3;
    private const char D = (char)4;
    private const char E = (char)5;
    private const char ZH = (char)6;
    private const char Z = (char)7;
    private const char I = (char)8;
    private const char I_ = (char)9;
    private const char K = (char)10;
    private const char L = (char)11;
    private const char M = (char)12;
    private const char N = (char)13;
    private const char O = (char)14;
    private const char P = (char)15;
    private const char R = (char)16;
    private const char S = (char)17;
    private const char T = (char)18;
    private const char U = (char)19;
    private const char X = (char)21;
    private const char CH = (char)23;
    private const char SH = (char)24;
    private const char SHCH = (char)25;
    private const char Y = (char)27;
    private const char SOFT = (char)28;
    private const char AE = (char)29;
    private const char IU = (char)30;
    private const char IA = (char)31;

    /// <summary>
    /// List of typical Russian stopwords, spelled as sequences of letter
    /// indices (see the constants above) rather than concrete characters.
    /// </summary>
    private static readonly char[][] RUSSIAN_STOP_WORDS = {
        new char[] {A},
        new char[] {B, E, Z},
        new char[] {B, O, L, E, E},
        new char[] {B, Y},
        new char[] {B, Y, L},
        new char[] {B, Y, L, A},
        new char[] {B, Y, L, I},
        new char[] {B, Y, L, O},
        new char[] {B, Y, T, SOFT},
        new char[] {V},
        new char[] {V, A, M},
        new char[] {V, A, S},
        new char[] {V, E, S, SOFT},
        new char[] {V, O},
        new char[] {V, O, T},
        new char[] {V, S, E},
        new char[] {V, S, E, G, O},
        new char[] {V, S, E, X},
        new char[] {V, Y},
        new char[] {G, D, E},
        new char[] {D, A},
        new char[] {D, A, ZH, E},
        new char[] {D, L, IA},
        new char[] {D, O},
        new char[] {E, G, O},
        new char[] {E, E},
        new char[] {E, I_},
        new char[] {E, IU},
        new char[] {E, S, L, I},
        new char[] {E, S, T, SOFT},
        new char[] {E, SHCH, E},
        new char[] {ZH, E},
        new char[] {Z, A},
        new char[] {Z, D, E, S, SOFT},
        new char[] {I},
        new char[] {I, Z},
        new char[] {I, L, I},
        new char[] {I, M},
        new char[] {I, X},
        new char[] {K},
        new char[] {K, A, K},
        new char[] {K, O},
        new char[] {K, O, G, D, A},
        new char[] {K, T, O},
        new char[] {L, I},
        new char[] {L, I, B, O},
        new char[] {M, N, E},
        new char[] {M, O, ZH, E, T},
        new char[] {M, Y},
        new char[] {N, A},
        new char[] {N, A, D, O},
        new char[] {N, A, SH},
        new char[] {N, E},
        new char[] {N, E, G, O},
        new char[] {N, E, E},
        new char[] {N, E, T},
        new char[] {N, I},
        new char[] {N, I, X},
        new char[] {N, O},
        new char[] {N, U},
        new char[] {O},
        new char[] {O, B},
        new char[] {O, D, N, A, K, O},
        new char[] {O, N},
        new char[] {O, N, A},
        new char[] {O, N, I},
        new char[] {O, N, O},
        new char[] {O, T},
        new char[] {O, CH, E, N, SOFT},
        new char[] {P, O},
        new char[] {P, O, D},
        new char[] {P, R, I},
        new char[] {S},
        new char[] {S, O},
        new char[] {T, A, K},
        new char[] {T, A, K, ZH, E},
        new char[] {T, A, K, O, I_},
        new char[] {T, A, M},
        new char[] {T, E},
        new char[] {T, E, M},
        new char[] {T, O},
        new char[] {T, O, G, O},
        new char[] {T, O, ZH, E},
        new char[] {T, O, I_},
        new char[] {T, O, L, SOFT, K, O},
        new char[] {T, O, M},
        new char[] {T, Y},
        new char[] {U},
        new char[] {U, ZH, E},
        new char[] {X, O, T, IA},
        new char[] {CH, E, G, O},
        new char[] {CH, E, I_},
        new char[] {CH, E, M},
        new char[] {CH, T, O},
        new char[] {CH, T, O, B, Y},
        new char[] {CH, SOFT, E},
        new char[] {CH, SOFT, IA},
        new char[] {AE, T, A},
        new char[] {AE, T, I},
        new char[] {AE, T, O},
        new char[] {IA}
    };

    /// <summary>
    /// Contains the stopwords used with the StopFilter.
    /// </summary>
    // Assigned by every constructor, so no throwaway default instance is created.
    private readonly Hashtable stoptable;

    /// <summary>
    /// Charset for Russian letters.
    /// Represents encoding for 32 lowercase Russian letters.
    /// Predefined charsets can be taken from the RussianCharsets class.
    /// </summary>
    private readonly char[] charset;

    /// <summary>
    /// Builds an analyzer that uses RussianCharsets.UnicodeRussian and the
    /// default stop words.
    /// </summary>
    public RussianAnalyzer() : this(RussianCharsets.UnicodeRussian)
    {
    }

    /// <summary>
    /// Builds an analyzer that uses the given charset and the default stop
    /// words translated into that charset.
    /// </summary>
    /// <param name="charset">Charset used to encode the Russian letters.</param>
    public RussianAnalyzer(char[] charset)
    {
        this.charset = charset;
        stoptable = StopFilter.MakeStopSet(MakeStopWords(charset));
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <param name="charset">Charset used to encode the Russian letters.</param>
    /// <param name="stopwords">Stop words, passed to StopFilter.MakeStopSet.</param>
    public RussianAnalyzer(char[] charset, String[] stopwords)
    {
        this.charset = charset;
        stoptable = StopFilter.MakeStopSet(stopwords);
    }

    /// <summary>
    /// Takes the Russian stop words and translates them to a String array,
    /// using the given charset.
    /// </summary>
    /// <param name="charset">Charset whose entries are looked up by letter index.</param>
    /// <returns>One string per entry of RUSSIAN_STOP_WORDS, in the same order.</returns>
    /// <exception cref="ArgumentNullException">charset is null.</exception>
    /// <exception cref="ArgumentException">
    /// charset has too few entries to be indexed by every letter constant.
    /// </exception>
    private static String[] MakeStopWords(char[] charset)
    {
        if (charset == null)
        {
            throw new ArgumentNullException("charset");
        }
        // IA is the highest letter index used by the stop words; a shorter
        // charset would otherwise fail below with an IndexOutOfRangeException.
        if (charset.Length <= IA)
        {
            throw new ArgumentException(
                "charset must contain at least " + (IA + 1) + " letters", "charset");
        }

        String[] translated = new String[RUSSIAN_STOP_WORDS.Length];
        for (int wordIndex = 0; wordIndex < translated.Length; wordIndex++)
        {
            char[] letterIndices = RUSSIAN_STOP_WORDS[wordIndex];

            // Map every letter index to the concrete character of the charset.
            StringBuilder word = new StringBuilder(letterIndices.Length);
            foreach (char letterIndex in letterIndices)
            {
                word.Append(charset[letterIndex]);
            }
            translated[wordIndex] = word.ToString();
        }
        return translated;
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <param name="charset">Charset used to encode the Russian letters.</param>
    /// <param name="stopwords">Stop word table, used as-is (not copied).</param>
    public RussianAnalyzer(char[] charset, Hashtable stopwords)
    {
        this.charset = charset;
        stoptable = stopwords;
    }

    /// <summary>
    /// Creates a TokenStream which tokenizes all the text in the provided TextReader.
    /// </summary>
    /// <param name="fieldName">Field name; not used by this implementation.</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>
    /// A TokenStream built from a RussianLetterTokenizer filtered with
    /// RussianLowerCaseFilter, StopFilter, and RussianStemFilter.
    /// </returns>
    public override TokenStream TokenStream(String fieldName, TextReader reader)
    {
        // Each stage wraps the previous one, so the order below is the processing order.
        TokenStream tokens = new RussianLetterTokenizer(reader, charset);
        tokens = new RussianLowerCaseFilter(tokens, charset);
        tokens = new StopFilter(tokens, stoptable);
        tokens = new RussianStemFilter(tokens, charset);
        return tokens;
    }
}
} // closes namespace Lucene.Net.Analysis.Ru