/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using Fieldable = Lucene.Net.Documents.Fieldable;
using AlreadyClosedException = Lucene.Net.Store.AlreadyClosedException;
using Lucene.Net.Util;
using System.Reflection;
using System.IO;
namespace Lucene.Net.Analysis
{
// JAVA: src/java/org/apache/lucene/analysis/Analyzer.java
/// <summary>
/// An <see cref="Analyzer"/> represents a policy for extracting terms that are
/// indexed from text. The <see cref="Analyzer"/> builds <see cref="TokenStream"/>s,
/// which break down text into tokens.
/// </summary>
/// <remarks>
/// <para>
/// A typical <see cref="Analyzer"/> implementation will first build a <see cref="Tokenizer"/>.
/// The <see cref="Tokenizer"/> will break down the stream of characters from the
/// <see cref="System.IO.TextReader"/> into raw <see cref="Token"/>s. One or
/// more <see cref="TokenFilter"/>s may then be applied to the output of the
/// <see cref="Tokenizer"/>.
/// </para>
/// </remarks>
public abstract class Analyzer : IDisposable
{
    // Per-thread storage used by ReusableTokenStream implementations to cache a
    // previously built TokenStream. Set to null by Close()/Dispose(); the
    // Get/SetPreviousTokenStream methods use that null to detect a closed analyzer.
    private CloseableThreadLocal tokenStreams = new CloseableThreadLocal();

    /// <summary>
    /// Gets or sets whether this class overrides the
    /// <see cref="TokenStream(string, System.IO.TextReader)"/> method.
    /// </summary>
    protected internal bool overridesTokenStreamMethod;

    /// <summary>
    /// Creates a <see cref="TokenStream"/> which tokenizes all the text in
    /// the provided <see cref="System.IO.TextReader"/>.
    /// </summary>
    /// <param name="fieldName">The name of the field. The fieldName can be <c>null</c>.</param>
    /// <param name="reader">The text reader.</param>
    /// <returns>A <see cref="TokenStream"/>.</returns>
    public abstract TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader);

    /// <summary>
    /// Creates a re-useable <see cref="TokenStream"/> previously saved inside the
    /// same thread that called this method. Callers that do not need to use more
    /// than one TokenStream at the same time from this analyzer should use this
    /// method for better performance.
    /// </summary>
    /// <remarks>
    /// <para>
    /// This method defaults to invoking
    /// <see cref="TokenStream(string, System.IO.TextReader)"/>; subclasses that
    /// want re-use must override it and manage the cached stream via
    /// <see cref="GetPreviousTokenStream"/> / <see cref="SetPreviousTokenStream"/>.
    /// </para>
    /// </remarks>
    /// <param name="fieldName">The name of the field. The fieldName can be <c>null</c>.</param>
    /// <param name="reader">The text reader.</param>
    /// <returns>A <see cref="TokenStream"/>.</returns>
    public virtual TokenStream ReusableTokenStream(String fieldName, TextReader reader)
    {
        return TokenStream(fieldName, reader);
    }

    /// <summary>
    /// Gets the previous <see cref="TokenStream"/> used by Analyzers that implement (override)
    /// <see cref="ReusableTokenStream(string, System.IO.TextReader)"/> to retrieve a
    /// previously saved <see cref="TokenStream"/> for re-use by the same thread.
    /// </summary>
    /// <remarks>
    /// <para>
    /// This method uses a <see cref="CloseableThreadLocal"/> to store and retrieve
    /// the per-thread stream.
    /// </para>
    /// </remarks>
    /// <exception cref="AlreadyClosedException">
    /// Throws when there is a null reference exception and the analyzer is closed.
    /// </exception>
    /// <exception cref="System.NullReferenceException">
    /// Throws when there is a null reference to <see cref="CloseableThreadLocal"/> and the
    /// analyzer is still open.
    /// </exception>
    // REFACTOR: turn into a property.
    protected internal virtual System.Object GetPreviousTokenStream()
    {
        try
        {
            return tokenStreams.Get();
        }
        catch (System.NullReferenceException ex)
        {
            // tokenStreams is nulled out by Close(); translate the NRE into the
            // exception type callers expect for a closed analyzer.
            // GLOBALIZATION: get exception message from resource file.
            if (tokenStreams == null)
                throw new AlreadyClosedException("this Analyzer is closed", ex);

            // default to re-throw, keeping the stack trace intact.
            throw;
        }
    }

    /// <summary>
    /// Sets the <see cref="TokenStream"/> used by Analyzers that implement (override)
    /// <see cref="ReusableTokenStream(string, System.IO.TextReader)"/>
    /// to save a <see cref="TokenStream"/> for later re-use by the same thread.
    /// </summary>
    /// <param name="obj">The previous <see cref="TokenStream"/>.</param>
    /// <exception cref="AlreadyClosedException">
    /// Throws when the analyzer has already been closed.
    /// </exception>
    protected internal virtual void SetPreviousTokenStream(System.Object obj)
    {
        try
        {
            tokenStreams.Set(obj);
        }
        catch (System.NullReferenceException ex)
        {
            // tokenStreams is nulled out by Close(); translate the NRE into the
            // exception type callers expect for a closed analyzer.
            // GLOBALIZATION: get exception message from resource file.
            if (tokenStreams == null)
                throw new AlreadyClosedException("this Analyzer is closed", ex);

            // default to re-throw, keeping the stack trace intact.
            throw;
        }
    }

    /// <summary>
    /// This is only present to preserve
    /// back-compat of classes that subclass a core analyzer
    /// and override tokenStream but not reusableTokenStream.
    /// </summary>
    /// <param name="baseClass">The base class type.</param>
    [Obsolete("This is only present to preserve backwards compatibility of classes that subclass a core analyzer and override tokenStream but not reusableTokenStream ")]
    protected internal virtual void SetOverridesTokenStreamMethod(System.Type baseClass)
    {
        try
        {
            // Look up the public TokenStream(string, TextReader) method on the
            // runtime type; if it is declared on a type other than baseClass,
            // a subclass has overridden it.
            Type[] types = new Type[] { typeof(String), typeof(TextReader) };
            MethodInfo method = this.GetType().GetMethod("TokenStream", types);
            overridesTokenStreamMethod = (method != null && method.DeclaringType != baseClass);
        }
        catch
        {
            // Best-effort reflection probe: on any failure, conservatively
            // assume the method is not overridden.
            overridesTokenStreamMethod = false;
        }
    }

    /// <summary>
    /// Gets the position of the increment gap between two
    /// <see cref="Lucene.Net.Documents.Field"/>s using the same name. This
    /// is called before indexing a <see cref="Fieldable"/> instance if terms
    /// have already been added to that field.
    /// </summary>
    /// <remarks>
    /// <para>
    /// Specifying the position of the increment gap allows custom
    /// <see cref="Analyzer"/>s to place an automatic position increment gap between
    /// <see cref="Fieldable"/> instances using the same field name.
    /// </para>
    /// <para>
    /// The default value position increment gap is 0.
    /// </para>
    /// <para>
    /// Position Increment Gap - The value that controls the
    /// virtual space between the last <see cref="Token"/> of one
    /// <see cref="Fieldable"/> instance and the first <see cref="Token"/> of the next instance.
    /// Both fields share the same name.
    /// </para>
    /// <para>
    /// Suppose a document has a multi-valued "author" field. Like this:
    /// </para>
    /// <para>
    /// author: John Doe<br/>
    /// author: Bob Smith
    /// </para>
    /// <para>
    /// With a position increment gap of 0, a phrase query of "doe bob" would
    /// be a match. With a gap of 100, a phrase query of "doe bob" would not
    /// match. The gap of 100 would prevent the phrase queries from matching
    /// even with a modest slop factor.
    /// </para>
    /// <para>
    /// This explanation of the position increment gap was pulled from an entry by Erik Hatcher on the
    /// lucene-solr-user list.
    /// This was a better explanation than the one found in the code comments from the Lucene-Solr project.
    /// </para>
    /// </remarks>
    /// <param name="fieldName">The name of the field being indexed.</param>
    /// <returns>
    /// The position of the increment gap added to the next token emitted
    /// from <see cref="TokenStream(string, System.IO.TextReader)"/>.
    /// </returns>
    public virtual int GetPositionIncrementGap(System.String fieldName)
    {
        return 0;
    }

    /// <summary>
    /// Gets the offset gap for a token in the specified field. By default this method
    /// returns 1 for tokenized fields and 0 if the field is untokenized.
    /// </summary>
    /// <remarks>
    /// <para>
    /// This method is similar to <see cref="GetPositionIncrementGap(string)"/>
    /// and is only called if the field produced at least one token for indexing.
    /// </para>
    /// </remarks>
    /// <param name="field">the field that was just analyzed</param>
    /// <returns>
    /// The offset gap, added to the next token emitted
    /// from <see cref="TokenStream(string, System.IO.TextReader)"/>.
    /// </returns>
    public virtual int GetOffsetGap(Fieldable field)
    {
        if (field.IsTokenized())
            return 1;
        else
            return 0;
    }

    /// <summary>
    /// Frees persistent resources used by the <see cref="Analyzer"/>.
    /// </summary>
    /// <remarks>
    /// <para>
    /// The default implementation closes the internal <see cref="TokenStream"/>s
    /// used by the analyzer. After this call, <see cref="GetPreviousTokenStream"/>
    /// and <see cref="SetPreviousTokenStream"/> throw <see cref="AlreadyClosedException"/>.
    /// </para>
    /// </remarks>
    public virtual void Close()
    {
        if (tokenStreams != null)
            tokenStreams.Close();
        tokenStreams = null;
    }

    /// <summary>
    /// Releases the resources held by this analyzer by delegating to <see cref="Close"/>.
    /// </summary>
    public virtual void Dispose()
    {
        Close();
    }
}
}