/* * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * */ using System; using System.IO; using System.Text; using Lucene.Net.Analysis; namespace Lucene.Net.Analysis.CJK { /* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2004 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * 3. The end-user documentation included with the redistribution, * if any, must include the following acknowledgment: * "This product includes software developed by the * Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, * if and wherever such third-party acknowledgments normally appear. * * 4. 
The names "Apache" and "Apache Software Foundation" and * "Apache Lucene" must not be used to endorse or promote products * derived from this software without prior written permission. For * written permission, please contact apache@apache.org. * * 5. Products derived from this software may not be called "Apache", * "Apache Lucene", nor may "Apache" appear in their name, without * prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation. For more * information on the Apache Software Foundation, please see * . */ /// ///

/// CJKTokenizer was modified from StopTokenizer, which does a decent job for
/// most European languages. It uses a different tokenization method for
/// double-byte characters: a token is returned for every two characters,
/// with overlapping matches.
/// Example: "java C1C2C3C4" is segmented into: "java" "C1C2" "C2C3" "C3C4".
/// Consumers should also filter out any zero-length token ("").
/// Digits, '_', '+' and '#' are tokenized as letters.
/// For more information on Asian language (Chinese, Japanese, Korean) text
/// segmentation, please search Google.

/// /// @author Che, Dong /// @version $Id: CJKTokenizer.java,v 1.3 2003/01/22 20:54:47 otis Exp $ ///
public sealed class CJKTokenizer : Tokenizer
{
    //~ Static fields/initializers ---------------------------------------------

    /// <summary>
    /// Maximum number of characters collected into a single token.
    /// </summary>
    private const int MAX_WORD_LEN = 255;

    /// <summary>
    /// Size of the buffer used for block reads from the input.
    /// </summary>
    private const int IO_BUFFER_SIZE = 256;

    /// <summary>
    /// Token type a freshly constructed tokenizer starts with.
    /// </summary>
    private const String WordTokenType = "word";

    /// <summary>
    /// Token type of terms built from the ASCII / halfwidth-fullwidth path.
    /// </summary>
    private const String SingleTokenType = "single";

    /// <summary>
    /// Token type of terms built from non-ASCII letters (one or two characters).
    /// </summary>
    private const String DoubleTokenType = "double";

    /// <summary>
    /// First fullwidth form (U+FF01) that has a Basic Latin counterpart.
    /// </summary>
    private const char FullwidthFirst = '\uFF01';

    /// <summary>
    /// Last fullwidth form (U+FF5E) that has a Basic Latin counterpart.
    /// </summary>
    private const char FullwidthLast = '\uFF5E';

    /// <summary>
    /// Distance between a fullwidth form and its Basic Latin counterpart
    /// (0xFEE0).
    /// </summary>
    private const int FullwidthToAsciiShift = 65248;

    //~ Instance fields --------------------------------------------------------

    /// <summary>
    /// Word offset: the number of input characters consumed so far. Used to
    /// compute the start offset of each token.
    /// </summary>
    private int offset = 0;

    /// <summary>
    /// Index of the next unread character in <see cref="ioBuffer"/>.
    /// </summary>
    private int bufferIndex = 0;

    /// <summary>
    /// Number of valid characters in <see cref="ioBuffer"/>; 0 once the
    /// reader reported end of stream.
    /// </summary>
    private int dataLen = 0;

    /// <summary>
    /// Character buffer holding the characters that compose the token being
    /// built.
    /// </summary>
    private char[] buffer = new char[MAX_WORD_LEN];

    /// <summary>
    /// I/O buffer holding the most recent block read from the input reader.
    /// </summary>
    private char[] ioBuffer = new char[IO_BUFFER_SIZE];

    /// <summary>
    /// Type of the token being built: "single" for ASCII, "double" for
    /// non-ASCII, "word" before anything has been read.
    /// </summary>
    private String tokenType = WordTokenType;

    /// <summary>
    /// True when the previous token was a two-character pair whose second
    /// character was pushed back so that it can start the next overlapping
    /// pair: "C1C2C3C4" yields C1C2, C2C3, C3C4. If the pushed-back character
    /// turns out to have no successor, it has already been emitted and is
    /// discarded.
    /// </summary>
    private bool preIsTokened = false;

    //~ Constructors -----------------------------------------------------------

    /// <summary>
    /// Construct a token stream processing the given input.
    /// </summary>
    /// <param name="_in">I/O reader</param>
    public CJKTokenizer(TextReader _in)
    {
        input = _in;
    }

    //~ Methods ----------------------------------------------------------------

    /// <summary>
    /// Returns the next token in the stream, or null at EOS.
    /// </summary>
    /// <remarks>
    /// Zero-length terms (produced when a pushed-back overlap character is
    /// discarded) are skipped rather than returned as empty tokens.
    /// </remarks>
    /// <returns>Token</returns>
    public override Token Next()
    {
        while (true)
        {
            int start;
            int length = ScanTerm(out start);

            if (length > 0)
            {
                return new Token(new String(buffer, 0, length), start, start + length, tokenType);
            }

            if (length < 0)
            {
                // End of stream with nothing buffered.
                return null;
            }

            // length == 0: the scanned term was discarded; try the next one
            // instead of handing an empty token to the caller.
        }
    }

    /// <summary>
    /// Scans the input for the next term and stores its characters in
    /// <see cref="buffer"/>.
    /// </summary>
    /// <param name="start">Receives the offset of the term's first character.</param>
    /// <returns>
    /// The number of characters stored in <see cref="buffer"/>; 0 when the
    /// scanned term was discarded; -1 when the end of the stream was reached
    /// with nothing buffered.
    /// </returns>
    private int ScanTerm(out int start)
    {
        // How many characters have been stored in buffer.
        int length = 0;

        // The position used to create the Token.
        start = offset;

        while (true)
        {
            offset++;

            if (bufferIndex >= dataLen)
            {
                dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                bufferIndex = 0;
            }

            if (dataLen == 0)
            {
                // End of stream.
                if (length == 0)
                {
                    return -1;
                }

                if (preIsTokened)
                {
                    // The only buffered character was already emitted as the
                    // tail of the previous pair.
                    length = 0;
                    preIsTokened = false;
                }

                return length;
            }

            char c = ioBuffer[bufferIndex++];

            bool isBasicLatin = c <= '\u007F';
            bool isHalfOrFullwidthForm = '\uFF00' <= c && c <= '\uFFEF';

            if (isBasicLatin || isHalfOrFullwidthForm)
            {
                // Only U+FF01..U+FF5E are shifted copies of ASCII. Other
                // characters of the block (e.g. halfwidth Katakana) must be
                // left untouched, otherwise they turn into control characters.
                if (FullwidthFirst <= c && c <= FullwidthLast)
                {
                    c = (char) (c - FullwidthToAsciiShift);
                }

                // Letters, digits and '_' '+' '#' are all token characters.
                if (Char.IsLetterOrDigit(c) || c == '_' || c == '+' || c == '#')
                {
                    if (length == 0)
                    {
                        // "javaC1C2C3C4linux"
                        //  ^--: the current character begins an ASCII term
                        start = offset - 1;
                    }
                    else if (tokenType == DoubleTokenType)
                    {
                        // "javaC1C2C3C4linux"
                        //             ^--: the previous character is non-ASCII;
                        // push the current character back and finish the
                        // pending term. tokenType stays "double" so that the
                        // returned token is labelled correctly.
                        offset--;
                        bufferIndex--;

                        if (preIsTokened)
                        {
                            // Only one non-ASCII character is buffered and it
                            // was already emitted with the previous pair.
                            length = 0;
                            preIsTokened = false;
                        }

                        return length;
                    }

                    // Culture-invariant lower-casing keeps terms identical
                    // regardless of the current culture.
                    buffer[length++] = Char.ToLowerInvariant(c);
                    tokenType = SingleTokenType;

                    // Stop before the term buffer overflows.
                    if (length == MAX_WORD_LEN)
                    {
                        return length;
                    }
                }
                else if (length > 0)
                {
                    if (preIsTokened)
                    {
                        // Drop the already-emitted character and keep scanning.
                        length = 0;
                        preIsTokened = false;
                    }
                    else
                    {
                        return length;
                    }
                }
            }
            else
            {
                // Non-ASCII letter, e.g. "C1C2C3C4".
                if (Char.IsLetter(c))
                {
                    if (length == 0)
                    {
                        start = offset - 1;
                        buffer[length++] = c;
                        tokenType = DoubleTokenType;
                    }
                    else if (tokenType == SingleTokenType)
                    {
                        // Push the character back and return the preceding
                        // ASCII term.
                        offset--;
                        bufferIndex--;
                        return length;
                    }
                    else
                    {
                        buffer[length++] = c;
                        tokenType = DoubleTokenType;

                        if (length == 2)
                        {
                            // Push the second character back so that it also
                            // starts the next overlapping pair.
                            offset--;
                            bufferIndex--;
                            preIsTokened = true;
                            return length;
                        }
                    }
                }
                else if (length > 0)
                {
                    if (preIsTokened)
                    {
                        // Empty the buffer and keep scanning.
                        length = 0;
                        preIsTokened = false;
                    }
                    else
                    {
                        return length;
                    }
                }
            }
        }
    }
}
} // namespace Lucene.Net.Analysis.CJK