/* * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * */ using System; using System.Collections.Generic; using System.IO; using System.Text.RegularExpressions; using Lucene.Net.Analysis.Tokenattributes; using Version = Lucene.Net.Util.Version; namespace Lucene.Net.Analysis.Miscellaneous { /** * Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a * {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Regex} * (with behaviour identical to {@link String#split(String)}), * and that combines the functionality of * {@link org.apache.lucene.analysis.LetterTokenizer}, * {@link org.apache.lucene.analysis.LowerCaseTokenizer}, * {@link org.apache.lucene.analysis.WhitespaceTokenizer}, * {@link org.apache.lucene.analysis.StopFilter} into a single efficient * multi-purpose class. *
* If you are unsure how exactly a regular expression should look like, consider * prototyping by simply trying various expressions on some test texts via * {@link String#split(String)}. Once you are satisfied, give that regex to * RegexAnalyzer. Also see Java Regular Expression Tutorial. *
* This class can be considerably faster than the "normal" Lucene tokenizers. * It can also serve as a building block in a compound Lucene * {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this * stemming example: *
* RegexAnalyzer pat = ... * TokenStream tokenStream = new SnowballFilter( * pat.tokenStream("content", "James is running round in the woods"), * "English")); ** */ public class PatternAnalyzer : Analyzer { /**
"\\W+"
; Divides text at non-letters (NOT char.IsLetter(c)) */
public static readonly Regex NON_WORD_PATTERN = new Regex("\\W+", RegexOptions.Compiled);
/** "\\s+"
; Divides text at whitespaces (char.IsWhitespace(c)) */
public static readonly Regex WHITESPACE_PATTERN = new Regex("\\s+", RegexOptions.Compiled);
private static readonly CharArraySet EXTENDED_ENGLISH_STOP_WORDS =
CharArraySet.UnmodifiableSet(new CharArraySet((IEnumerabletrue
returns tokens after applying
* String.toLowerCase()
* @param stopWords
* if non-null, ignores all tokens that are contained in the
* given stop set (after previously having applied toLowerCase()
* if applicable). For example, created via
* {@link StopFilter#makeStopSet(String[])}and/or
* {@link org.apache.lucene.analysis.WordlistLoader}as in
* WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt")
* or other stop word lists.
*/
public PatternAnalyzer(Version matchVersion, Regex Regex, bool toLowerCase, ISettokenStream(String, String)
and is
* less efficient than tokenStream(String, String)
.
*
* @param fieldName
* the name of the field to tokenize (currently ignored).
* @param reader
* the reader delivering the text
* @return a new token stream
*/
public override TokenStream TokenStream(String fieldName, TextReader reader)
{
    // Fast path: a FastStringReader already holds the full text as a string,
    // so no buffered reading is necessary.
    FastStringReader fastReader = reader as FastStringReader;
    if (fastReader != null)
    {
        return TokenStream(fieldName, fastReader.GetString());
    }

    try
    {
        // Drain the reader completely (ToString also closes it), then
        // delegate to the string-based overload.
        return TokenStream(fieldName, ToString(reader));
    }
    catch (IOException e)
    {
        // NOTE(review): wrapping in bare System.Exception is an anti-pattern,
        // but the exception type is part of the observable contract for callers.
        throw new Exception("Wrapped Exception", e);
    }
}
/**
 * Indicates whether some other object is "equal to" this one.
 * Two analyzers are equal if they use the same lower-casing flag, an
 * equivalent regex (same pattern text and options) and equal stop sets.
 * The two prebuilt singletons are explicitly treated as distinct.
 *
 * @param other
 *            the reference object with which to compare.
 * @return true if equal, false otherwise
 */
public override bool Equals(Object other)
{
    if (this == other) return true;

    // The prebuilt DEFAULT/EXTENDED singletons must never compare equal
    // to each other, regardless of their field values.
    if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false;
    if (this == EXTENDED_ANALYZER && other == DEFAULT_ANALYZER) return false;

    PatternAnalyzer that = other as PatternAnalyzer;
    if (that == null) return false;

    return toLowerCase == that.toLowerCase
        && EqRegex(Regex, that.Regex)
        && Eq(stopWords, that.stopWords);
}
/**
 * Returns a hash code value for the object, combining the regex pattern,
 * regex options, lower-casing flag and stop set (consistent with Equals).
 *
 * @return the hash code.
 */
public override int GetHashCode()
{
    // Precomputed values for the two prebuilt singletons (fast path).
    if (this == DEFAULT_ANALYZER) return -1218418418;
    if (this == EXTENDED_ANALYZER) return 1303507063;

    unchecked // hash arithmetic is allowed to wrap (C# default, made explicit)
    {
        int hash = 1;
        hash = hash * 31 + Regex.GetHashCode();
        hash = hash * 31 + (int)Regex.Options;
        hash = hash * 31 + (toLowerCase ? 1231 : 1237);
        hash = hash * 31 + (stopWords == null ? 0 : stopWords.GetHashCode());
        return hash;
    }
}
/** Null-tolerant equality: true if both are the same reference (including
 *  both null) or o1.Equals(o2) holds. */
private static bool Eq(Object o1, Object o2)
{
    if (ReferenceEquals(o1, o2)) return true;
    return o1 != null && o1.Equals(o2);
}
/** Regex equivalence: same reference, or same options and same pattern text.
 *  Assumes p1 and p2 are not null. */
private static bool EqRegex(Regex p1, Regex p2)
{
    if (ReferenceEquals(p1, p2)) return true;
    // Regex.ToString() returns the original pattern string.
    return p1.Options == p2.Options
        && string.Equals(p1.ToString(), p2.ToString(), StringComparison.Ordinal);
}
/**
 * Reads until end-of-stream and returns all read chars, finally closes the stream.
 *
 * @param input the input stream
 * @throws IOException if an I/O error occurs while reading the stream
 */
private static String ToString(TextReader input)
{
    try
    {
        // TextReader.ReadToEnd() performs the same buffered read-until-EOF
        // as the previous hand-rolled growing-buffer loop; prefer the
        // framework implementation over a manual re-implementation.
        return input.ReadToEnd();
    }
    finally
    {
        // Always release the reader, even if reading failed, preserving the
        // documented "finally closes the stream" contract.
        if (input != null) input.Dispose();
    }
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
/**
* The work horse; performance isn't fantastic, but it's not nearly as bad
* as one might think - kudos to the Sun regex developers.
*/
private sealed class RegexTokenizer : TokenStream
{
private readonly String str;
private readonly bool toLowerCase;
private Match matcher;
private int pos = 0;
private static readonly System.Globalization.CultureInfo locale = System.Globalization.CultureInfo.CurrentCulture;
private ITermAttribute termAtt;
private IOffsetAttribute offsetAtt;
public RegexTokenizer(String str, Regex regex, bool toLowerCase)
{
this.str = str;
this.matcher = regex.Match(str);
this.toLowerCase = toLowerCase;
this.termAtt = AddAttribute