/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.IO;
using System.Reflection;
using Lucene.Net.Util;
using AlreadyClosedException = Lucene.Net.Store.AlreadyClosedException;
using Fieldable = Lucene.Net.Documents.Fieldable;

namespace Lucene.Net.Analysis
{
    // JAVA: src/java/org/apache/lucene/analysis/Analyzer.java

    /// <summary>
    /// An <c>Analyzer</c> represents a policy for extracting terms that are
    /// indexed from text.  The <c>Analyzer</c> builds <see cref="TokenStream"/>s,
    /// which break down text into tokens.
    /// </summary>
    /// <remarks>
    /// <para>
    /// A typical implementation will first build a <c>Tokenizer</c>.  The
    /// <c>Tokenizer</c> will break down the stream of characters from the
    /// <see cref="System.IO.TextReader"/> into raw tokens.  One or more
    /// <c>TokenFilter</c>s may then be applied to the output of the
    /// <c>Tokenizer</c>.
    /// </para>
    /// </remarks>
    // REFACTOR: determine if this class should use IDisposable since it has a Close() method.
    public abstract class Analyzer : IDisposable
    {
        // Per-thread storage for a previously created TokenStream, so that
        // ReusableTokenStream implementations can avoid re-allocating analysis
        // chains on every call.  Set to null once the analyzer is closed.
        private CloseableThreadLocal tokenStreams = new CloseableThreadLocal();

        /// <summary>
        /// Gets or sets whether a subclass overrides the
        /// <see cref="TokenStream(string, TextReader)"/> method.
        /// </summary>
        protected internal bool overridesTokenStreamMethod;

        /// <summary>
        /// Creates a <see cref="TokenStream"/> which tokenizes all the text in
        /// the provided <see cref="TextReader"/>.
        /// </summary>
        /// <param name="fieldName">The name of the field being analyzed; may be null.</param>
        /// <param name="reader">The text reader supplying the character stream.</param>
        /// <returns>A <see cref="TokenStream"/> over the reader's text.</returns>
        public abstract TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader);

        /// <summary>
        /// Creates a re-useable <see cref="TokenStream"/> previously saved inside the
        /// same thread that called this method.  Callers that do not need to use more
        /// than one TokenStream at the same time from this analyzer should use this
        /// method for better performance.
        /// </summary>
        /// <remarks>
        /// This default implementation simply delegates to
        /// <see cref="TokenStream(string, TextReader)"/> and performs no re-use.
        /// </remarks>
        /// <param name="fieldName">The name of the field being analyzed; may be null.</param>
        /// <param name="reader">The text reader supplying the character stream.</param>
        /// <returns>A <see cref="TokenStream"/> over the reader's text.</returns>
        public virtual TokenStream ReusableTokenStream(String fieldName, TextReader reader)
        {
            return TokenStream(fieldName, reader);
        }

        /// <summary>
        /// Gets the <see cref="TokenStream"/> previously saved by the same thread,
        /// for use by Analyzers that override
        /// <see cref="ReusableTokenStream(string, TextReader)"/>.
        /// </summary>
        /// <returns>The object previously stored via
        /// <see cref="SetPreviousTokenStream(object)"/>, or null if none was stored.</returns>
        /// <exception cref="AlreadyClosedException">
        /// Thrown when this analyzer has already been closed.
        /// </exception>
        // REFACTOR: turn into a property.
        protected internal virtual System.Object GetPreviousTokenStream()
        {
            try
            {
                return tokenStreams.Get();
            }
            catch (System.NullReferenceException ex)
            {
                // Close() nulls out tokenStreams; translate the resulting NRE
                // into the exception callers expect for a closed analyzer.
                // GLOBALIZATION: get exception message from resource file.
                if (tokenStreams == null)
                    throw new AlreadyClosedException("this Analyzer is closed", ex);

                // default to re-throw keep stack trace intact.
                throw;
            }
        }

        /// <summary>
        /// Saves a <see cref="TokenStream"/> for later re-use by the same thread,
        /// for use by Analyzers that override
        /// <see cref="ReusableTokenStream(string, TextReader)"/>.
        /// </summary>
        /// <param name="obj">The <see cref="TokenStream"/> (or analysis state) to save.</param>
        /// <exception cref="AlreadyClosedException">
        /// Thrown when this analyzer has already been closed.
        /// </exception>
        protected internal virtual void SetPreviousTokenStream(System.Object obj)
        {
            try
            {
                tokenStreams.Set(obj);
            }
            catch (System.NullReferenceException ex)
            {
                // Close() nulls out tokenStreams; translate the resulting NRE
                // into the exception callers expect for a closed analyzer.
                // GLOBALIZATION: get exception message from resource file.
                if (tokenStreams == null)
                    throw new AlreadyClosedException("this Analyzer is closed", ex);

                // default to re-throw keep stack trace intact.
                throw;
            }
        }

        /// <summary>
        /// Records whether a subclass overrides
        /// <see cref="TokenStream(string, TextReader)"/>.  This is only present to
        /// preserve back-compat of classes that subclass a core analyzer and
        /// override tokenStream but not reusableTokenStream.
        /// </summary>
        /// <param name="baseClass">The base class type to compare the declaring type against.</param>
        [Obsolete("This is only present to preserve backwards compatibility of classes that subclass a core analyzer and override tokenStream but not reusableTokenStream ")]
        protected internal virtual void SetOverridesTokenStreamMethod(System.Type baseClass)
        {
            Type[] signature = { typeof(String), typeof(TextReader) };
            try
            {
                MethodInfo method = this.GetType().GetMethod("TokenStream", signature);
                // The method is "overridden" when it is declared somewhere other
                // than the supplied base class.
                overridesTokenStreamMethod = method != null && method.DeclaringType != baseClass;
            }
            catch
            {
                // Best-effort reflection probe: on any failure, conservatively
                // assume the method is not overridden.
                overridesTokenStreamMethod = false;
            }
        }

        /// <summary>
        /// Gets the position increment gap between two <see cref="Fieldable"/>
        /// instances using the same field name.  This is called before indexing a
        /// field instance if terms have already been added to that field.
        /// </summary>
        /// <remarks>
        /// <para>
        /// Specifying the position increment gap allows custom Analyzers to place
        /// an automatic position increment gap between field instances using the
        /// same field name.  The default value is 0.
        /// </para>
        /// <para>
        /// Suppose a document has a multi-valued "author" field, e.g.
        /// "author: John Doe" and "author: Bob Smith".  With a position increment
        /// gap of 0, a phrase query of "doe bob" would be a match.  With a gap of
        /// 100, a phrase query of "doe bob" would not match, even with a modest
        /// slop factor, because of the virtual space between the last token of one
        /// instance and the first token of the next.
        /// </para>
        /// <para>
        /// This explanation of the position increment gap was pulled from an entry
        /// by Erik Hatcher on the lucene-solr-user list.
        /// </para>
        /// </remarks>
        /// <param name="fieldName">The name of the field being indexed.</param>
        /// <returns>
        /// The position increment gap added to the next token emitted from
        /// <see cref="TokenStream(string, TextReader)"/>.
        /// </returns>
        public virtual int GetPositionIncrementGap(System.String fieldName)
        {
            return 0;
        }

        /// <summary>
        /// Gets the offset gap for a token in the specified field.  By default this
        /// method returns 1 for tokenized fields and 0 if the field is untokenized.
        /// </summary>
        /// <remarks>
        /// This method is similar to <see cref="GetPositionIncrementGap(string)"/>
        /// and is only called if the field produced at least one token for indexing.
        /// </remarks>
        /// <param name="field">The field that was just analyzed.</param>
        /// <returns>
        /// The offset gap added to the next token emitted from
        /// <see cref="TokenStream(string, TextReader)"/>.
        /// </returns>
        public virtual int GetOffsetGap(Fieldable field)
        {
            return field.IsTokenized() ? 1 : 0;
        }

        /// <summary>
        /// Frees persistent resources used by the analyzer.
        /// </summary>
        /// <remarks>
        /// The default implementation closes the internal per-thread
        /// <see cref="TokenStream"/> storage used by the analyzer.  After this
        /// call, <see cref="GetPreviousTokenStream"/> and
        /// <see cref="SetPreviousTokenStream(object)"/> throw
        /// <see cref="AlreadyClosedException"/>.
        /// </remarks>
        public virtual void Close()
        {
            if (tokenStreams != null)
                tokenStreams.Close();

            tokenStreams = null;
        }

        /// <summary>
        /// Disposes the analyzer by delegating to <see cref="Close"/>.
        /// </summary>
        public virtual void Dispose()
        {
            Close();
            // Standard dispose pattern: no finalizer is declared here, but
            // subclasses may add one; suppress finalization once disposed.
            GC.SuppressFinalize(this);
        }
    }
}