/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;
using CharReader = Lucene.Net.Analysis.CharReader;
using Token = Lucene.Net.Analysis.Token;
using Tokenizer = Lucene.Net.Analysis.Tokenizer;
using AttributeSource = Lucene.Net.Util.AttributeSource;
using Version = Lucene.Net.Util.Version;
namespace Lucene.Net.Analysis.Standard
{
/// <summary>
/// A grammar-based tokenizer constructed with JFlex.
/// <para>
/// This should be a good tokenizer for most European-language documents:
/// </para>
/// <list type="bullet">
/// <item><description>Splits words at punctuation characters, removing punctuation. However, a
/// dot that's not followed by whitespace is considered part of a token.</description></item>
/// <item><description>Splits words at hyphens, unless there's a number in the token, in which case
/// the whole token is interpreted as a product number and is not split.</description></item>
/// <item><description>Recognizes email addresses and internet hostnames as one token.</description></item>
/// </list>
/// <para>
/// Many applications have specific tokenizer needs. If this tokenizer does
/// not suit your application, please consider copying this source code
/// directory to your project and maintaining your own grammar-based tokenizer.
/// </para>
/// <para>
/// You must specify the required <see cref="Version" /> compatibility when creating
/// a StandardTokenizer:
/// </para>
/// <list type="bullet">
/// <item><description>As of 2.4, tokens incorrectly identified as acronyms are corrected (see
/// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>).</description></item>
/// </list>
/// </summary>
public sealed class StandardTokenizer : Tokenizer
{
    /// <summary>A private instance of the JFlex-constructed scanner.</summary>
    private StandardTokenizerImpl scanner;

    // Integer token type codes; each one is also the index of its label in TOKEN_TYPES.
    public const int ALPHANUM = 0;
    public const int APOSTROPHE = 1;
    public const int ACRONYM = 2;
    public const int COMPANY = 3;
    public const int EMAIL = 4;
    public const int HOST = 5;
    public const int NUM = 6;
    public const int CJ = 7;

    /// <summary>
    /// Token type kept only for backward compatibility: it solves a bug where
    /// HOSTs that end with '.' are identified as ACRONYMs.
    /// </summary>
    [Obsolete("this solves a bug where HOSTs that end with '.' are identified as ACRONYMs.")]
    public const int ACRONYM_DEP = 8;

    /// <summary>
    /// String token types that correspond to token type int constants:
    /// the entry at index <c>i</c> is the label for the constant whose value is <c>i</c>.
    /// </summary>
    public static readonly System.String[] TOKEN_TYPES = new System.String[]
    {
        "<ALPHANUM>",
        "<APOSTROPHE>",
        "<ACRONYM>",
        "<COMPANY>",
        "<EMAIL>",
        "<HOST>",
        "<NUM>",
        "<CJ>",
        "<ACRONYM_DEP>"
    };

    /// <summary>When true, tokens scanned as ACRONYM_DEP are emitted as HOST instead of ACRONYM.</summary>
    private bool replaceInvalidAcronym;

    /// <summary>Longest token (as reported by the scanner) that is still emitted.</summary>
    private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;

    // This tokenizer populates four attributes:
    // term, offset, positionIncrement and type.
    private ITermAttribute termAtt;
    private IOffsetAttribute offsetAtt;
    private IPositionIncrementAttribute posIncrAtt;
    private ITypeAttribute typeAtt;

    /// <summary>
    /// Set the max allowed token length. Any token longer
    /// than this is skipped.
    /// </summary>
    public int MaxTokenLength
    {
        get { return maxTokenLength; }
        set { maxTokenLength = value; }
    }

    /// <summary>
    /// Creates a new instance of the <see cref="StandardTokenizer" />. Attaches
    /// the <c>input</c> to the newly created JFlex scanner.
    /// </summary>
    /// <param name="matchVersion">
    /// Compatibility version; on or after <c>LUCENE_24</c> mischaracterized acronyms
    /// are replaced by HOST. See http://issues.apache.org/jira/browse/LUCENE-1068
    /// </param>
    /// <param name="input">The input reader</param>
    public StandardTokenizer(Version matchVersion, System.IO.TextReader input) : base()
    {
        Init(input, matchVersion);
    }

    /// <summary>
    /// Creates a new StandardTokenizer with a given <see cref="AttributeSource" />.
    /// </summary>
    /// <param name="matchVersion">Compatibility version (see the two-argument constructor).</param>
    /// <param name="source">Attribute source passed to the base tokenizer.</param>
    /// <param name="input">The input reader</param>
    public StandardTokenizer(Version matchVersion, AttributeSource source, System.IO.TextReader input) : base(source)
    {
        Init(input, matchVersion);
    }

    /// <summary>
    /// Creates a new StandardTokenizer with a given
    /// <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />.
    /// </summary>
    /// <param name="matchVersion">Compatibility version (see the two-argument constructor).</param>
    /// <param name="factory">Attribute factory passed to the base tokenizer.</param>
    /// <param name="input">The input reader</param>
    public StandardTokenizer(Version matchVersion, AttributeFactory factory, System.IO.TextReader input) : base(factory)
    {
        Init(input, matchVersion);
    }

    /// <summary>
    /// Shared constructor logic: creates the scanner over <paramref name="input" />,
    /// derives the acronym-replacement flag from <paramref name="matchVersion" />
    /// and registers the attributes this tokenizer fills in.
    /// </summary>
    private void Init(System.IO.TextReader input, Version matchVersion)
    {
        scanner = new StandardTokenizerImpl(input);
        replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24);
        this.input = input;

        // The attribute type must be given explicitly: it cannot be inferred
        // from the assignment target.
        termAtt = AddAttribute<ITermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
        posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        typeAtt = AddAttribute<ITypeAttribute>();
    }

    /// <summary>
    /// Advances to the next token whose length does not exceed
    /// <see cref="MaxTokenLength" />, filling in the term, offset,
    /// position increment and type attributes.
    /// </summary>
    /// <returns><c>true</c> if a token was produced; <c>false</c> once the scanner reports end of input.</returns>
    public override bool IncrementToken()
    {
        ClearAttributes();
        int posIncr = 1;

        while (true)
        {
            int tokenType = scanner.GetNextToken();
            if (tokenType == StandardTokenizerImpl.YYEOF)
            {
                return false;
            }

            if (scanner.Yylength() > maxTokenLength)
            {
                // When we skip a too-long term, we still increment the
                // position increment.
                posIncr++;
                continue;
            }

            posIncrAtt.PositionIncrement = posIncr;
            scanner.GetText(termAtt);
            int start = scanner.Yychar();
            // Offsets are taken from the term as scanned, i.e. before any
            // trailing '.' is trimmed below.
            offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.TermLength()));

            // This 'if' should be removed in the next release. For now, it converts
            // invalid acronyms to HOST. When removed, only the 'else' part should
            // remain.
            if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
            {
                if (replaceInvalidAcronym)
                {
                    typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST];
                    termAtt.SetTermLength(termAtt.TermLength() - 1); // remove extra '.'
                }
                else
                {
                    typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
                }
            }
            else
            {
                typeAtt.Type = StandardTokenizerImpl.TOKEN_TYPES[tokenType];
            }

            return true;
        }
    }

    /// <summary>
    /// Sets the final offset: both start and end are set to the corrected
    /// position just past the scanner's last match (<c>Yychar() + Yylength()</c>).
    /// </summary>
    public override void End()
    {
        int finalOffset = CorrectOffset(scanner.Yychar() + scanner.Yylength());
        offsetAtt.SetOffset(finalOffset, finalOffset);
    }

    /// <summary>
    /// Resets the base tokenizer and then the scanner so that both read from <paramref name="reader" />.
    /// </summary>
    /// <param name="reader">The new input reader</param>
    public override void Reset(System.IO.TextReader reader)
    {
        base.Reset(reader);
        scanner.Reset(reader);
    }

    /// <summary>
    /// Remove in 3.X and make true the only valid value.
    /// See https://issues.apache.org/jira/browse/LUCENE-1068
    /// </summary>
    /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms as HOST.</param>
    [Obsolete("Remove in 3.X and make true the only valid value. See https://issues.apache.org/jira/browse/LUCENE-1068")]
    public void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
    {
        this.replaceInvalidAcronym = replaceInvalidAcronym;
    }
}
}