/*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using StopFilter = Lucene.Net.Analysis.StopFilter;
using TokenStream = Lucene.Net.Analysis.TokenStream;
namespace Lucene.Net.Analysis.RU
{
/// <summary>
/// Analyzer for Russian language. Supports an external list of stopwords (words that
/// will not be indexed at all).
/// A default set of stopwords is used unless an alternative list is specified.
/// </summary>
/// <author>Boris Okner, b.okner@rogers.com</author>
/// <version>$Id: RussianAnalyzer.java,v 1.7 2004/03/29 22:48:01 cutting Exp $</version>
public sealed class RussianAnalyzer : Analyzer
{
    // Letters. Each constant is an index into a charset array (see makeStopWords),
    // not a character code of its own.
    private const char A = (char) 0;
    private const char B = (char) 1;
    private const char V = (char) 2;
    private const char G = (char) 3;
    private const char D = (char) 4;
    private const char E = (char) 5;
    private const char ZH = (char) 6;
    private const char Z = (char) 7;
    private const char I = (char) 8;
    private const char I_ = (char) 9;
    private const char K = (char) 10;
    private const char L = (char) 11;
    private const char M = (char) 12;
    private const char N = (char) 13;
    private const char O = (char) 14;
    private const char P = (char) 15;
    private const char R = (char) 16;
    private const char S = (char) 17;
    private const char T = (char) 18;
    private const char U = (char) 19;
    private const char F = (char) 20;
    private const char X = (char) 21;
    private const char TS = (char) 22;
    private const char CH = (char) 23;
    private const char SH = (char) 24;
    private const char SHCH = (char) 25;
    private const char HARD = (char) 26;
    private const char Y = (char) 27;
    private const char SOFT = (char) 28;
    private const char AE = (char) 29;
    private const char IU = (char) 30;
    private const char IA = (char) 31;

    /// <summary>
    /// List of typical Russian stopwords. Every word is stored as a sequence of
    /// letter indices so that it can be rendered in whichever charset is in use.
    /// </summary>
    private static readonly char[][] RUSSIAN_STOP_WORDS = new char[][]
    {
        new char[]{A},
        new char[]{B, E, Z}, new char[]{B, O, L, E, E}, new char[]{B, Y}, new char[]{B, Y, L},
        new char[]{B, Y, L, A}, new char[]{B, Y, L, I}, new char[]{B, Y, L, O}, new char[]{B, Y, T, SOFT},
        new char[]{V}, new char[]{V, A, M}, new char[]{V, A, S}, new char[]{V, E, S, SOFT},
        new char[]{V, O}, new char[]{V, O, T}, new char[]{V, S, E}, new char[]{V, S, E, G, O},
        new char[]{V, S, E, X}, new char[]{V, Y},
        new char[]{G, D, E},
        new char[]{D, A}, new char[]{D, A, ZH, E}, new char[]{D, L, IA}, new char[]{D, O},
        new char[]{E, G, O}, new char[]{E, E}, new char[]{E, I_}, new char[]{E, IU},
        new char[]{E, S, L, I}, new char[]{E, S, T, SOFT}, new char[]{E, SHCH, E},
        new char[]{ZH, E},
        new char[]{Z, A}, new char[]{Z, D, E, S, SOFT},
        new char[]{I}, new char[]{I, Z}, new char[]{I, L, I}, new char[]{I, M}, new char[]{I, X},
        new char[]{K}, new char[]{K, A, K}, new char[]{K, O}, new char[]{K, O, G, D, A}, new char[]{K, T, O},
        new char[]{L, I}, new char[]{L, I, B, O},
        new char[]{M, N, E}, new char[]{M, O, ZH, E, T}, new char[]{M, Y},
        new char[]{N, A}, new char[]{N, A, D, O}, new char[]{N, A, SH}, new char[]{N, E},
        new char[]{N, E, G, O}, new char[]{N, E, E}, new char[]{N, E, T}, new char[]{N, I},
        new char[]{N, I, X}, new char[]{N, O}, new char[]{N, U},
        new char[]{O}, new char[]{O, B}, new char[]{O, D, N, A, K, O}, new char[]{O, N},
        new char[]{O, N, A}, new char[]{O, N, I}, new char[]{O, N, O}, new char[]{O, T},
        new char[]{O, CH, E, N, SOFT},
        new char[]{P, O}, new char[]{P, O, D}, new char[]{P, R, I},
        new char[]{S}, new char[]{S, O},
        new char[]{T, A, K}, new char[]{T, A, K, ZH, E}, new char[]{T, A, K, O, I_}, new char[]{T, A, M},
        new char[]{T, E}, new char[]{T, E, M}, new char[]{T, O}, new char[]{T, O, G, O},
        new char[]{T, O, ZH, E}, new char[]{T, O, I_}, new char[]{T, O, L, SOFT, K, O},
        new char[]{T, O, M}, new char[]{T, Y},
        new char[]{U}, new char[]{U, ZH, E},
        new char[]{X, O, T, IA},
        new char[]{CH, E, G, O}, new char[]{CH, E, I_}, new char[]{CH, E, M}, new char[]{CH, T, O},
        new char[]{CH, T, O, B, Y}, new char[]{CH, SOFT, E}, new char[]{CH, SOFT, IA},
        new char[]{AE, T, A}, new char[]{AE, T, I}, new char[]{AE, T, O},
        new char[]{IA}
    };

    /// <summary>
    /// Contains the stopwords used with the StopFilter.
    /// Assigned exactly once, by whichever constructor runs.
    /// </summary>
    private readonly System.Collections.Hashtable stopSet;

    /// <summary>
    /// Charset for Russian letters.
    /// Represents encoding for 32 lowercase Russian letters.
    /// Predefined charsets can be taken from the RussianCharsets class.
    /// </summary>
    private readonly char[] charset;

    /// <summary>
    /// Builds an analyzer that uses RussianCharsets.UnicodeRussian and the default
    /// stop words.
    /// </summary>
    public RussianAnalyzer() : this(RussianCharsets.UnicodeRussian)
    {
    }

    /// <summary>
    /// Builds an analyzer that uses the given charset and the default stop words
    /// rendered in that charset.
    /// </summary>
    /// <param name="charset">Charset used to render the stop words and handed to the tokenizer and filters.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="charset"/> is null.</exception>
    public RussianAnalyzer(char[] charset)
    {
        this.charset = charset;
        stopSet = StopFilter.MakeStopSet(makeStopWords(charset));
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// </summary>
    /// <param name="charset">Charset handed to the tokenizer and filters.</param>
    /// <param name="stopwords">Stop words, passed as-is to StopFilter.MakeStopSet.</param>
    public RussianAnalyzer(char[] charset, System.String[] stopwords)
    {
        this.charset = charset;
        stopSet = StopFilter.MakeStopSet(stopwords);
    }

    /// <summary>
    /// Builds an analyzer with the given stop words.
    /// Note carried over from the original source: create a Set version of this ctor.
    /// </summary>
    /// <param name="charset">Charset handed to the tokenizer and filters.</param>
    /// <param name="stopwords">Stop words; the table is copied, so later changes made by the caller are not seen by this analyzer.</param>
    public RussianAnalyzer(char[] charset, System.Collections.Hashtable stopwords)
    {
        this.charset = charset;
        // A single defensive copy is sufficient to decouple from the caller's table.
        stopSet = new System.Collections.Hashtable(stopwords);
    }

    /// <summary>
    /// Takes the Russian stop words and translates them to a String array, using
    /// the given charset.
    /// </summary>
    /// <param name="charset">Lookup table indexed by the letter constants of this class.</param>
    /// <returns>One string per entry of RUSSIAN_STOP_WORDS, in the same order.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="charset"/> is null.</exception>
    private static System.String[] makeStopWords(char[] charset)
    {
        if (charset == null)
        {
            // Fail with a descriptive exception instead of a NullReferenceException
            // raised from inside the translation loop.
            throw new System.ArgumentNullException("charset");
        }

        System.String[] res = new System.String[RUSSIAN_STOP_WORDS.Length];
        for (int i = 0; i < res.Length; i++)
        {
            char[] letterIndices = RUSSIAN_STOP_WORDS[i];
            // The translated word has exactly as many characters as the encoded one,
            // so a fixed-size buffer replaces the per-word StringBuilder.
            char[] translated = new char[letterIndices.Length];
            for (int j = 0; j < letterIndices.Length; j++)
            {
                translated[j] = charset[letterIndices[j]];
            }
            res[i] = new System.String(translated);
        }
        return res;
    }

    /// <summary>
    /// Creates a TokenStream which tokenizes all the text in the provided Reader.
    /// </summary>
    /// <param name="fieldName">Not used by this implementation.</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>
    /// A TokenStream built from a RussianLetterTokenizer filtered with
    /// RussianLowerCaseFilter, StopFilter, and RussianStemFilter.
    /// </returns>
    public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
    {
        // Order matters: lower-casing precedes stop-word removal, and stemming runs last.
        TokenStream result = new RussianLetterTokenizer(reader, charset);
        result = new RussianLowerCaseFilter(result, charset);
        result = new StopFilter(result, stopSet);
        result = new RussianStemFilter(result, charset);
        return result;
    }
}
}