/// A grammar-based tokenizer constructed with JFlex
///
/// This should be a good tokenizer for most European-language documents:
///
///
/// - Splits words at punctuation characters, removing punctuation. However, a
/// dot that's not followed by whitespace is considered part of a token.
///
/// - Splits words at hyphens, unless there's a number in the token, in which case
/// the whole token is interpreted as a product number and is not split.
///
/// - Recognizes email addresses and internet hostnames as one token.
///
///
/// Many applications have specific tokenizer needs. If this tokenizer does
/// not suit your application, please consider copying this source code
/// directory to your project and maintaining your own grammar-based tokenizer.
///
public class StandardTokenizer : Tokenizer
{
// Initializes instance state that cannot be set via field initializers in this
// port's constructor-chaining pattern: seeds maxTokenLength (declared elsewhere
// in this class) with the analyzer-wide default.
// NOTE(review): presumably called from every constructor overload — verify,
// since skipping it would leave maxTokenLength at its field default.
private void InitBlock()
{
maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
}
///