/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using CharReader = Lucene.Net.Analysis.CharReader;
using Token = Lucene.Net.Analysis.Token;
using Tokenizer = Lucene.Net.Analysis.Tokenizer;
using OffsetAttribute = Lucene.Net.Analysis.Tokenattributes.OffsetAttribute;
using PositionIncrementAttribute = Lucene.Net.Analysis.Tokenattributes.PositionIncrementAttribute;
using TermAttribute = Lucene.Net.Analysis.Tokenattributes.TermAttribute;
using TypeAttribute = Lucene.Net.Analysis.Tokenattributes.TypeAttribute;
using AttributeSource = Lucene.Net.Util.AttributeSource;
using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Analysis.Standard
{
/// A grammar-based tokenizer constructed with JFlex
///
/// This should be a good tokenizer for most European-language documents:
///
///
/// - Splits words at punctuation characters, removing punctuation. However, a
/// dot that's not followed by whitespace is considered part of a token.
/// - Splits words at hyphens, unless there's a number in the token, in which case
/// the whole token is interpreted as a product number and is not split.
/// - Recognizes email addresses and internet hostnames as one token.
///
///
/// Many applications have specific tokenizer needs. If this tokenizer does
/// not suit your application, please consider copying this source code
/// directory to your project and maintaining your own grammar-based tokenizer.
///
///
///
/// You must specify the required {@link Version} compatibility when creating
/// StandardAnalyzer:
///
/// - As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
/// LUCENE-1608
///
///
public class StandardTokenizer:Tokenizer
{
private void InitBlock()
{
maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
}
/// A private instance of the JFlex-constructed scanner
private StandardTokenizerImpl scanner;
public const int ALPHANUM = 0;
public const int APOSTROPHE = 1;
public const int ACRONYM = 2;
public const int COMPANY = 3;
public const int EMAIL = 4;
public const int HOST = 5;
public const int NUM = 6;
public const int CJ = 7;
/// this solves a bug where HOSTs that end with '.' are identified
/// as ACRONYMs. It is deprecated and will be removed in the next
/// release.
///
[Obsolete("this solves a bug where HOSTs that end with '.' are identified as ACRONYMs. It is deprecated and will be removed in the next release.")]
public const int ACRONYM_DEP = 8;
/// String token types that correspond to token type int constants
public static readonly System.String[] TOKEN_TYPES = new System.String[]{"", "", "", "", "", "", "", "", ""};
/// Please use {@link #TOKEN_TYPES} instead
///
[Obsolete("Please use TOKEN_TYPES instead")]
public static readonly System.String[] tokenImage = TOKEN_TYPES;
/// Specifies whether deprecated acronyms should be replaced with HOST type.
/// This is false by default to support backward compatibility.
///
/// See http://issues.apache.org/jira/browse/LUCENE-1068
///
///
/// this should be removed in the next release (3.0).
///
[Obsolete("this should be removed in the next release (3.0).")]
private bool replaceInvalidAcronym;
private int maxTokenLength;
/// Set the max allowed token length. Any token longer
/// than this is skipped.
///
public virtual void SetMaxTokenLength(int length)
{
this.maxTokenLength = length;
}
///
///
public virtual int GetMaxTokenLength()
{
return maxTokenLength;
}
/// Creates a new instance of the {@link StandardTokenizer}. Attaches the
/// input
to a newly created JFlex scanner.
///
/// Use {@link #StandardTokenizer(Version, Reader)} instead
///
[Obsolete("Use StandardTokenizer(Version, Reader) instead")]
public StandardTokenizer(System.IO.TextReader input):this(Version.LUCENE_24, input)
{
}
/// Creates a new instance of the {@link Lucene.Net.Analysis.Standard.StandardTokenizer}. Attaches
/// the input
to the newly created JFlex scanner.
///
///
/// The input reader
///
/// Set to true to replace mischaracterized acronyms with HOST.
///
/// See http://issues.apache.org/jira/browse/LUCENE-1068
///
/// Use {@link #StandardTokenizer(Version, Reader)} instead
///
[Obsolete("Use StandardTokenizer(Version, Reader) instead")]
public StandardTokenizer(System.IO.TextReader input, bool replaceInvalidAcronym):base()
{
InitBlock();
this.scanner = new StandardTokenizerImpl(input);
Init(input, replaceInvalidAcronym);
}
/// Creates a new instance of the
/// {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches
/// the input
to the newly created JFlex scanner.
///
///
/// The input reader
///
/// See http://issues.apache.org/jira/browse/LUCENE-1068
///
public StandardTokenizer(Version matchVersion, System.IO.TextReader input):base()
{
InitBlock();
this.scanner = new StandardTokenizerImpl(input);
Init(input, matchVersion);
}
/// Creates a new StandardTokenizer with a given {@link AttributeSource}.
/// Use
/// {@link #StandardTokenizer(Version, AttributeSource, Reader)}
/// instead
///
[Obsolete("Use StandardTokenizer(Version, AttributeSource, Reader) instead")]
public StandardTokenizer(AttributeSource source, System.IO.TextReader input, bool replaceInvalidAcronym):base(source)
{
InitBlock();
this.scanner = new StandardTokenizerImpl(input);
Init(input, replaceInvalidAcronym);
}
/// Creates a new StandardTokenizer with a given {@link AttributeSource}.
public StandardTokenizer(Version matchVersion, AttributeSource source, System.IO.TextReader input):base(source)
{
InitBlock();
this.scanner = new StandardTokenizerImpl(input);
Init(input, matchVersion);
}
/// Creates a new StandardTokenizer with a given {@link Lucene.Net.Util.AttributeSource.AttributeFactory}
/// Use
/// {@link #StandardTokenizer(Version, org.apache.lucene.util.AttributeSource.AttributeFactory, Reader)}
/// instead
///
[Obsolete("Use StandardTokenizer(Version, Lucene.Net.Util.AttributeSource.AttributeFactory, Reader) instead")]
public StandardTokenizer(AttributeFactory factory, System.IO.TextReader input, bool replaceInvalidAcronym):base(factory)
{
InitBlock();
this.scanner = new StandardTokenizerImpl(input);
Init(input, replaceInvalidAcronym);
}
/// Creates a new StandardTokenizer with a given
/// {@link org.apache.lucene.util.AttributeSource.AttributeFactory}
///
public StandardTokenizer(Version matchVersion, AttributeFactory factory, System.IO.TextReader input):base(factory)
{
InitBlock();
this.scanner = new StandardTokenizerImpl(input);
Init(input, matchVersion);
}
private void Init(System.IO.TextReader input, bool replaceInvalidAcronym)
{
this.replaceInvalidAcronym = replaceInvalidAcronym;
this.input = input;
termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute));
offsetAtt = (OffsetAttribute) AddAttribute(typeof(OffsetAttribute));
posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof(PositionIncrementAttribute));
typeAtt = (TypeAttribute) AddAttribute(typeof(TypeAttribute));
}
private void Init(System.IO.TextReader input, Version matchVersion)
{
if (matchVersion.OnOrAfter(Version.LUCENE_24))
{
Init(input, true);
}
else
{
Init(input, false);
}
}
// this tokenizer generates three attributes:
// offset, positionIncrement and type
private TermAttribute termAtt;
private OffsetAttribute offsetAtt;
private PositionIncrementAttribute posIncrAtt;
private TypeAttribute typeAtt;
/*
* (non-Javadoc)
*
* @see Lucene.Net.Analysis.TokenStream#next()
*/
public override bool IncrementToken()
{
ClearAttributes();
int posIncr = 1;
while (true)
{
int tokenType = scanner.GetNextToken();
if (tokenType == StandardTokenizerImpl.YYEOF)
{
return false;
}
if (scanner.Yylength() <= maxTokenLength)
{
posIncrAtt.SetPositionIncrement(posIncr);
scanner.GetText(termAtt);
int start = scanner.Yychar();
offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.TermLength()));
// This 'if' should be removed in the next release. For now, it converts
// invalid acronyms to HOST. When removed, only the 'else' part should
// remain.
if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
{
if (replaceInvalidAcronym)
{
typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
termAtt.SetTermLength(termAtt.TermLength() - 1); // remove extra '.'
}
else
{
typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
}
}
else
{
typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
}
return true;
}
// When we skip a too-long term, we still increment the
// position increment
else
posIncr++;
}
}
public override void End()
{
// set final offset
int finalOffset = CorrectOffset(scanner.Yychar() + scanner.Yylength());
offsetAtt.SetOffset(finalOffset, finalOffset);
}
/// Will be removed in Lucene 3.0. This method is final, as it should
/// not be overridden. Delegates to the backwards compatibility layer.
///
[Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer. ")]
public override Token Next(Token reusableToken)
{
return base.Next(reusableToken);
}
/// Will be removed in Lucene 3.0. This method is final, as it should
/// not be overridden. Delegates to the backwards compatibility layer.
///
[Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer. ")]
public override Token Next()
{
return base.Next();
}
/*
* (non-Javadoc)
*
* @see Lucene.Net.Analysis.TokenStream#reset()
*/
public override void Reset()
{
base.Reset();
scanner.Yyreset(input);
}
public override void Reset(System.IO.TextReader reader)
{
base.Reset(reader);
Reset();
}
/// Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized as acronyms tokens like www.abc.com
/// when they should have been labeled as hosts instead.
///
/// true if StandardTokenizer now returns these tokens as Hosts, otherwise false
///
///
/// Remove in 3.X and make true the only valid value
///
[Obsolete("Remove in 3.X and make true the only valid value")]
public virtual bool IsReplaceInvalidAcronym()
{
return replaceInvalidAcronym;
}
///
/// Set to true to replace mischaracterized acronyms as HOST.
///
/// Remove in 3.X and make true the only valid value
///
/// See https://issues.apache.org/jira/browse/LUCENE-1068
///
[Obsolete("Remove in 3.X and make true the only valid value. See https://issues.apache.org/jira/browse/LUCENE-1068")]
public virtual void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
{
this.replaceInvalidAcronym = replaceInvalidAcronym;
}
}
}