/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;

using Token = Lucene.Net.Analysis.Token;
using Tokenizer = Lucene.Net.Analysis.Tokenizer;

namespace Lucene.Net.Analysis.Standard
{
    /// <summary>
    /// A grammar-based tokenizer constructed with JFlex.
    /// <para>
    /// This should be a good tokenizer for most European-language documents:
    /// </para>
    /// <list type="bullet">
    /// <item><description>Splits words at punctuation characters, removing punctuation. However, a
    /// dot that's not followed by whitespace is considered part of a token.</description></item>
    /// <item><description>Splits words at hyphens, unless there's a number in the token, in which case
    /// the whole token is interpreted as a product number and is not split.</description></item>
    /// <item><description>Recognizes email addresses and internet hostnames as one token.</description></item>
    /// </list>
    /// <para>
    /// Many applications have specific tokenizer needs. If this tokenizer does
    /// not suit your application, please consider copying this source code
    /// directory to your project and maintaining your own grammar-based tokenizer.
    /// </para>
    /// </summary>
    public class StandardTokenizer : Tokenizer
    {
        /// <summary>A private instance of the JFlex-constructed scanner.</summary>
        private StandardTokenizerImpl scanner;

        /// <summary>
        /// Specifies whether deprecated acronyms should be replaced with HOST type.
        /// This is false by default to support backward compatibility.
        /// See http://issues.apache.org/jira/browse/LUCENE-1068
        /// </summary>
        /// <remarks>Deprecated: this should be removed in the next release (3.0).</remarks>
        private bool replaceInvalidAcronym = false;

        /// <summary>
        /// Longest token, measured by the scanner's <c>Yylength()</c>, that
        /// <see cref="Next(Token)"/> will return; longer tokens are skipped.
        /// </summary>
        private int maxTokenLength;

        /// <summary>
        /// Creates a new instance of the <see cref="StandardTokenizer"/>. Attaches the
        /// input to a newly created JFlex scanner.
        /// </summary>
        /// <param name="input">The input reader.</param>
        public StandardTokenizer(System.IO.TextReader input)
            : this(input, false)
        {
        }

        /// <summary>
        /// Creates a new instance of the <see cref="StandardTokenizer"/>. Attaches
        /// the input to the newly created JFlex scanner.
        /// See http://issues.apache.org/jira/browse/LUCENE-1068
        /// </summary>
        /// <param name="input">The input reader.</param>
        /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms with HOST.</param>
        public StandardTokenizer(System.IO.TextReader input, bool replaceInvalidAcronym)
        {
            // Assigned in the constructor body (not as a field initializer) so it
            // still happens after the base-class constructor has run.
            this.maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
            this.replaceInvalidAcronym = replaceInvalidAcronym;
            this.input = input;
            this.scanner = new StandardTokenizerImpl(input);
        }

        /// <summary>
        /// Stores <paramref name="reader"/> as this tokenizer's input reader.
        /// Only the field is assigned here; the scanner is not touched by this method.
        /// </summary>
        /// <param name="reader">The reader to store.</param>
        internal virtual void SetInput(System.IO.TextReader reader)
        {
            this.input = reader;
        }

        /// <summary>
        /// Set the max allowed token length. Any token longer
        /// than this is skipped.
        /// </summary>
        /// <param name="length">The new maximum token length.</param>
        public virtual void SetMaxTokenLength(int length)
        {
            this.maxTokenLength = length;
        }

        /// <summary>Returns the current max allowed token length.</summary>
        /// <returns>The value last set through <see cref="SetMaxTokenLength(int)"/>, or the
        /// <c>StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH</c> assigned at construction.</returns>
        public virtual int GetMaxTokenLength()
        {
            return maxTokenLength;
        }

        /// <summary>
        /// Pulls tokens from the scanner until one no longer than the max token length
        /// is found, fills <paramref name="result"/> with it and returns it.
        /// </summary>
        /// <param name="result">Token instance that is cleared and then populated with the
        /// position increment, term text, offsets and type of the token found.</param>
        /// <returns><paramref name="result"/>, or null once the scanner reports
        /// <c>StandardTokenizerImpl.YYEOF</c>.</returns>
        public override Token Next(Token result)
        {
            // Starts at 1 and grows by one for each too-long token that is skipped.
            int positionIncrement = 1;

            for (;;)
            {
                int tokenType = scanner.GetNextToken();
                if (tokenType == StandardTokenizerImpl.YYEOF)
                {
                    return null;
                }

                if (scanner.Yylength() > maxTokenLength)
                {
                    // When we skip a too-long term, we still increment the
                    // position increment.
                    positionIncrement++;
                    continue;
                }

                result.Clear();
                result.SetPositionIncrement(positionIncrement);
                scanner.GetText(result);

                int startOffset = scanner.Yychar();
                result.SetStartOffset(startOffset);
                result.SetEndOffset(startOffset + result.TermLength());

                // The ACRONYM_DEP handling should be removed in the next release. For now,
                // it converts invalid acronyms to HOST. When removed, only the plain
                // TOKEN_TYPES[tokenType] assignment should remain.
                if (tokenType != StandardTokenizerImpl.ACRONYM_DEP)
                {
                    result.SetType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
                }
                else if (replaceInvalidAcronym)
                {
                    result.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
                    // Remove extra '.'; the end offset set above is left as it was.
                    result.SetTermLength(result.TermLength() - 1);
                }
                else
                {
                    result.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
                }

                return result;
            }
        }

        /// <summary>
        /// Runs the base-class reset and then resets the scanner onto the current input reader.
        /// </summary>
        public override void Reset()
        {
            base.Reset();
            scanner.Yyreset(input);
        }

        /// <summary>
        /// Switches this tokenizer to <paramref name="reader"/> and then performs
        /// the parameterless <see cref="Reset()"/>.
        /// </summary>
        /// <param name="reader">The new input reader.</param>
        public override void Reset(System.IO.TextReader reader)
        {
            input = reader;
            Reset();
        }

        /// <summary>
        /// Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer
        /// mischaracterized as acronyms tokens like www.abc.com
        /// when they should have been labeled as hosts instead.
        /// </summary>
        /// <returns>true if StandardTokenizer now returns these tokens as Hosts, otherwise false.</returns>
        /// <remarks>Deprecated: remove in 3.X and make true the only valid value.</remarks>
        public virtual bool IsReplaceInvalidAcronym()
        {
            return replaceInvalidAcronym;
        }

        /// <summary>
        /// Sets whether mischaracterized acronyms are replaced as HOST.
        /// See https://issues.apache.org/jira/browse/LUCENE-1068
        /// </summary>
        /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms as HOST.</param>
        /// <remarks>Deprecated: remove in 3.X and make true the only valid value.</remarks>
        public virtual void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
        {
            this.replaceInvalidAcronym = replaceInvalidAcronym;
        }
    }
}