/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
using System;
using System.Collections.Generic;
using System.IO;
using System.Collections;
using Lucene.Net.Analysis;
using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Analysis.CJK
{
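/// <summary>
/// An analyzer that tokenizes text with <see cref="CJKTokenizer"/> and filters the
/// resulting tokens with <see cref="StopFilter"/>.
/// </summary>
/// <remarks>Author: Che, Dong</remarks>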
public class CJKAnalyzer : Analyzer
{
    //~ Static fields/initializers ---------------------------------------------

    // TODO make this final in 3.1 -
    // this might be revised and merged with StopFilter stop words too
    /// <summary>
    /// Default stop words: common English words that are usually not useful for
    /// searching, plus the empty string and "www".
    /// </summary>
    /// <remarks>
    /// Obsolete; call <see cref="GetDefaultStopSet"/> instead. The array is still
    /// the source from which the default stop set is built.
    /// </remarks>
    [Obsolete("use GetDefaultStopSet() instead")]
    public static String[] STOP_WORDS =
    {
        "a", "and", "are", "as", "at", "be",
        "but", "by", "for", "if", "in",
        "into", "is", "it", "no", "not",
        "of", "on", "or", "s", "such", "t",
        "that", "the", "their", "then",
        "there", "these", "they", "this",
        "to", "was", "will", "with", "",
        "www"
    };

    /// <summary>
    /// Returns the shared default stop-word set.
    /// </summary>
    /// <returns>
    /// The set held by <c>DefaultSetHolder</c>, which is built from
    /// <c>STOP_WORDS</c> and wrapped with <c>CharArraySet.UnmodifiableSet</c>.
    /// </returns>
    public static ISet GetDefaultStopSet()
    {
        return DefaultSetHolder.DEFAULT_STOP_SET;
    }

    /// <summary>
    /// Holder for the default stop set, built once from <c>STOP_WORDS</c>.
    /// </summary>
    private static class DefaultSetHolder
    {
        // NOTE(review): ISet is written without a type argument throughout this
        // class; confirm the intended declaration (the elements are stop-word strings).
        // The reference is assigned exactly once, so it is readonly.
        internal static readonly ISet DEFAULT_STOP_SET =
            CharArraySet.UnmodifiableSet(new CharArraySet(STOP_WORDS, false));
    }

    //~ Instance fields --------------------------------------------------------

    /// <summary>
    /// Stop word set handed to every <c>StopFilter</c> this analyzer creates.
    /// Assigned only by the constructors.
    /// </summary>
    private readonly ISet stopTable;

    /// <summary>
    /// Version passed to <c>StopFilter.GetEnablePositionIncrementsVersionDefault</c>
    /// when a filter is created.
    /// </summary>
    private readonly Version matchVersion;

    //~ Constructors -----------------------------------------------------------

    /// <summary>
    /// Builds an analyzer that uses the default stop set.
    /// </summary>
    /// <param name="matchVersion">version stored for later filter creation</param>
    public CJKAnalyzer(Version matchVersion)
        : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
    {
    }

    /// <summary>
    /// Builds an analyzer that uses the given stop words. The analyzer stores the
    /// result of <c>CharArraySet.Copy</c> wrapped with
    /// <c>CharArraySet.UnmodifiableSet</c>, not the passed instance itself.
    /// </summary>
    /// <param name="matchVersion">version stored for later filter creation</param>
    /// <param name="stopWords">set of stop words</param>
    public CJKAnalyzer(Version matchVersion, ISet stopWords)
    {
        stopTable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopWords));
        this.matchVersion = matchVersion;
    }

    /// <summary>
    /// Builds an analyzer which removes words in the provided array. The stop set
    /// is created with <c>StopFilter.MakeStopSet</c>.
    /// </summary>
    /// <param name="matchVersion">version stored for later filter creation</param>
    /// <param name="stopWords">stop word array</param>
    public CJKAnalyzer(Version matchVersion, params string[] stopWords)
    {
        stopTable = StopFilter.MakeStopSet(stopWords);
        this.matchVersion = matchVersion;
    }

    //~ Methods ----------------------------------------------------------------

    /// <summary>
    /// Wraps <paramref name="source"/> in a <c>StopFilter</c> configured with this
    /// analyzer's version default and stop table. Shared by
    /// <c>TokenStream</c> and <c>ReusableTokenStream</c> so that both build the
    /// filter the same way.
    /// </summary>
    /// <param name="source">tokenizer whose output is filtered</param>
    /// <returns>the newly created filter</returns>
    private TokenStream CreateStopFilter(Tokenizer source)
    {
        return new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                              source, stopTable);
    }

    /// <summary>
    /// Creates a new token stream for the input: a <c>CJKTokenizer</c> over
    /// <paramref name="reader"/>, filtered with a <c>StopFilter</c>.
    /// </summary>
    /// <param name="fieldName">lucene field name (not used by this analyzer)</param>
    /// <param name="reader">input reader</param>
    /// <returns>the stop-filtered token stream</returns>
    public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
    {
        return CreateStopFilter(new CJKTokenizer(reader));
    }

    /// <summary>
    /// Tokenizer/filter pair stored in <c>PreviousTokenStream</c> so that
    /// <c>ReusableTokenStream</c> can reuse it on later calls.
    /// </summary>
    private sealed class SavedStreams
    {
        /// <summary>Tokenizer that is re-pointed at each new reader.</summary>
        internal Tokenizer source;

        /// <summary>Filter chain built on top of <c>source</c>; this is what callers receive.</summary>
        internal TokenStream result;
    }

    /// <summary>
    /// Returns a (possibly reused) token stream which tokenizes all the text in
    /// the provided reader.
    /// </summary>
    /// <remarks>
    /// On the first call (no saved streams in <c>PreviousTokenStream</c>) a new
    /// <c>CJKTokenizer</c> and <c>StopFilter</c> are created and saved. On later
    /// calls only the saved tokenizer is reset onto the new reader and the saved
    /// filter is returned again.
    /// </remarks>
    /// <param name="fieldName">lucene field name (not used by this analyzer)</param>
    /// <param name="reader">input reader</param>
    /// <returns>
    /// A token stream built from <c>CJKTokenizer</c>, filtered with <c>StopFilter</c>
    /// </returns>
    public override sealed TokenStream ReusableTokenStream(String fieldName, TextReader reader)
    {
        /* tokenStream() is final, no back compat issue */
        SavedStreams streams = (SavedStreams) PreviousTokenStream;
        if (streams == null)
        {
            streams = new SavedStreams();
            streams.source = new CJKTokenizer(reader);
            streams.result = CreateStopFilter(streams.source);
            PreviousTokenStream = streams;
        }
        else
        {
            // Reuse the saved chain: only the tokenizer needs the new reader.
            streams.source.Reset(reader);
        }
        return streams.result;
    }
}
}