package org.apache.lucene.analysis.standard;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*

WARNING: if you change StandardTokenizerImpl.jflex and need to regenerate
      the tokenizer, only use Java 1.4 !!!
      This grammar currently uses constructs (eg :digit:, :letter:) whose
      meaning can vary according to the JRE used to run jflex. See
      https://issues.apache.org/jira/browse/LUCENE-1126 for details.
      For current backwards compatibility it is needed to support
      only Java 1.4 - this will change in Lucene 3.1.

*/

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

%%

// Scanner options: the generated class is StandardTokenizerImpl and its
// scanning method is getNextToken, which returns the int token-type codes
// produced by the rules at the bottom of this file. %char makes the scanner
// maintain the yychar counter exposed through yychar() below.
%class StandardTokenizerImpl
%unicode
%integer
%function getNextToken
%pack
%char

%{

/* Token-type codes returned by the lexical rules; each one mirrors the
 * constant of the same name on StandardTokenizer. */
public static final int ALPHANUM          = StandardTokenizer.ALPHANUM;
public static final int APOSTROPHE        = StandardTokenizer.APOSTROPHE;
public static final int ACRONYM           = StandardTokenizer.ACRONYM;
public static final int COMPANY           = StandardTokenizer.COMPANY;
public static final int EMAIL             = StandardTokenizer.EMAIL;
public static final int HOST              = StandardTokenizer.HOST;
public static final int NUM               = StandardTokenizer.NUM;
public static final int CJ                = StandardTokenizer.CJ;
/*
 * @deprecated this solves a bug where HOSTs that end with '.' are identified
 *             as ACRONYMs.
 */
public static final int ACRONYM_DEP       = StandardTokenizer.ACRONYM_DEP;

/* Alias of StandardTokenizer.TOKEN_TYPES (same array reference, not a copy). */
public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;

/*
 * Returns the scanner's current yychar value. Per the JFlex manual, with
 * %char enabled this is the number of characters from the start of input
 * to the start of the current match.
 */
public final int yychar()
{
    return yychar;
}

/*
 * Resets the Tokenizer to a new Reader.
 * If the internal buffer has grown beyond ZZ_BUFFERSIZE it is replaced by a
 * fresh buffer of the default size before yyreset(r) is called.
 */
final void reset(java.io.Reader r) {
  // reset to default buffer size, if buffer has grown
  if (zzBuffer.length > ZZ_BUFFERSIZE) {
    zzBuffer = new char[ZZ_BUFFERSIZE];
  }
  yyreset(r);
}

/*
 * Fills Lucene token with the current token text.
 * The text is the zzBuffer region [zzStartRead, zzMarkedPos), handed to
 * Token.setTermBuffer as (buffer, offset, length).
 */
final void getText(Token t) {
  t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}

/*
 * Fills TermAttribute with the current token text.
 * Same buffer region as the Token overload: [zzStartRead, zzMarkedPos).
 */
final void getText(TermAttribute t) {
  t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
%}

// Thai range used by ALPHANUM: code points U+0E00 through U+0E59
THAI       = [\u0E00-\u0E59]

// basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
ALPHANUM   = ({LETTER}|{THAI}|[:digit:])+

// internal apostrophes: O'Reilly, you're, O'Reilly's
// use a post-filter to remove possessives
APOSTROPHE =  {ALPHA} ("'" {ALPHA})+

// acronyms: U.S.A., I.B.M., etc.
// use a post-filter to remove dots
ACRONYM    =  {LETTER} "." ({LETTER} ".")+

// deprecated acronym form: dot-terminated ALPHANUM segments (at least two),
// e.g. a host name followed by a trailing '.'
ACRONYM_DEP	= {ALPHANUM} "." ({ALPHANUM} ".")+

// company names like AT&T and Excite@Home.
COMPANY    =  {ALPHA} ("&"|"@") {ALPHA}

// email addresses
EMAIL      =  {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+

// hostname
HOST       =  {ALPHANUM} ((".") {ALPHANUM})+

// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
NUM        = ({ALPHANUM} {P} {HAS_DIGIT}
           | {HAS_DIGIT} {P} {ALPHANUM}
           | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
           | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
           | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
           | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)

// punctuation
P	         = ("_"|"-"|"/"|"."|",")

// at least one digit
HAS_DIGIT  = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*

// one or more LETTERs (no digits, no Thai)
ALPHA      = ({LETTER})+

// From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
// Here <a> is [:letter:] and <b> is {CJ}: any letter that is not a CJ character.
LETTER     = !(![:letter:]|{CJ})

// Chinese and Japanese (but NOT Korean, which is included in [:letter:])
CJ         = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]

// a CR LF pair, or a single space, CR, LF, tab or form feed
WHITESPACE = \r\n | [ \r\n\t\f]

%%

/* Rule order is significant: per the JFlex manual the longest match wins and,
   among rules matching the same length, the one listed first is chosen.
   Do not reorder without regenerating and re-testing the tokenizer. */
{ALPHANUM}                                                     { return ALPHANUM; }
{APOSTROPHE}                                                   { return APOSTROPHE; }
{ACRONYM}                                                      { return ACRONYM; }
{COMPANY}                                                      { return COMPANY; }
{EMAIL}                                                        { return EMAIL; }
{HOST}                                                         { return HOST; }
{NUM}                                                          { return NUM; }
{CJ}                                                           { return CJ; }
{ACRONYM_DEP}                                                  { return ACRONYM_DEP; }

/* Ignore the rest */
. | {WHITESPACE}                                               { /* ignore */ }