/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
options {
STATIC = false;
//IGNORE_CASE = true;
//BUILD_PARSER = false;
UNICODE_INPUT = true;
USER_CHAR_STREAM = true;
OPTIMIZE_TOKEN_MANAGER = true;
//DEBUG_TOKEN_MANAGER = true;
}
PARSER_BEGIN(StandardTokenizer)
package Lucene.Net.Analysis.Standard;
import java.io.*;
/** A grammar-based tokenizer constructed with JavaCC.
*
*
This should be a good tokenizer for most European-language documents:
*
*
* - Splits words at punctuation characters, removing punctuation. However, a
* dot that's not followed by whitespace is considered part of a token.
*
- Splits words at hyphens, unless there's a number in the token, in which case
* the whole token is interpreted as a product number and is not split.
*
- Recognizes email addresses and internet hostnames as one token.
*
*
* Many applications have specific tokenizer needs. If this tokenizer does
* not suit your application, please consider copying this source code
* directory to your project and maintaining your own grammar-based tokenizer.
*/
public class StandardTokenizer extends Lucene.Net.Analysis.Tokenizer {
/** Constructs a tokenizer for this Reader. */
public StandardTokenizer(Reader reader) {
this(new FastCharStream(reader));
this.input = reader;
}
}
PARSER_END(StandardTokenizer)
TOKEN : { // token patterns
// basic word: a sequence of digits & letters
||)+ >
// internal apostrophes: O'Reilly, you're, O'Reilly's
// use a post-filter to remove possesives
| ("'" )+ >
// acronyms: U.S.A., I.B.M., etc.
// use a post-filter to remove dots
| "." ( ".")+ >
// company names like AT&T and Excite@Home.
| ("&"|"@") >
// email addresses
| (("."|"-"|"_") )* "@" (("."|"-") )+ >
// hostname
| ("." )+ >
// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
|
|
| ( )+
| ( )+
| ( )+
| ( )+
)
>
| <#P: ("_"|"-"|"/"|"."|",") >
| <#HAS_DIGIT: // at least one digit
(|)*
(|)*
>
| < #ALPHA: ()+>
| < #LETTER: // unicode letters
[
"\u0041"-"\u005a",
"\u0061"-"\u007a",
"\u00c0"-"\u00d6",
"\u00d8"-"\u00f6",
"\u00f8"-"\u00ff",
"\u0100"-"\u1fff",
"\uffa0"-"\uffdc"
]
>
| < CJ: // Chinese, Japanese
[
"\u3040"-"\u318f",
"\u3100"-"\u312f", // BaPoMoFo (aka ZhuYin)
"\u3040"-"\u309F", // Japanese: Hiragana
"\u30A0"-"\u30FF", // Japanese: Katakana
"\u31F0"-"\u31FF", // Japanese: Katakana Phonetic Extensions
"\u3300"-"\u337f",
"\u3400"-"\u4dbf", // CJK Unified Ideographs Ext. A
"\u4e00"-"\u9fff",
"\uf900"-"\ufaff",
"\uff65"-"\uff9f"
// Otis: consider adding these, too
//
// 2E80-2EFF: CJK Radicals Supplement
// 2F00-2FDF: Kangxi Radicals
// 3190-319F: Kanbun
// 31C0-31EF: CJK Strokes
// 4E00-9FBF: CJK Unified
// F900-FAFF: CJK Compatibility Ideographs
]
>
| < KOREAN: // Korean
[
"\uac00"-"\ud7af", // Hangul Syllables
"\u1100"-"\u11ff" // Hangul Jamo
// "\uac00"-"\ud7a3"
]
>
| < #DIGIT: // unicode digits
[
"\u0030"-"\u0039",
"\u0660"-"\u0669",
"\u06f0"-"\u06f9",
"\u0966"-"\u096f",
"\u09e6"-"\u09ef",
"\u0a66"-"\u0a6f",
"\u0ae6"-"\u0aef",
"\u0b66"-"\u0b6f",
"\u0be7"-"\u0bef",
"\u0c66"-"\u0c6f",
"\u0ce6"-"\u0cef",
"\u0d66"-"\u0d6f",
"\u0e50"-"\u0e59",
"\u0ed0"-"\u0ed9",
"\u1040"-"\u1049"
]
>
}
SKIP : { // skip unrecognized chars
}
/** Returns the next token in the stream, or null at EOS.
* The returned token's type is set to an element of {@link
* StandardTokenizerConstants#tokenImage}.
*/
Lucene.Net.Analysis.Token next() throws IOException :
{
Token token = null;
}
{
( token = |
token = |
token = |
token = |
token = |
token = |
token = |
token = |
token =
)
{
if (token.kind == EOF) {
return null;
} else {
return
new Lucene.Net.Analysis.Token(token.image,
token.beginColumn,token.endColumn,
tokenImage[token.kind]);
}
}
}