/* * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * */ using System; using System.Globalization; using System.IO; using System.Text; using System.Text.RegularExpressions; using Lucene.Net.Analysis; using Lucene.Net.Analysis.Tokenattributes; using Lucene.Net.Util; namespace Lucene.Net.Analysis.CJK { /// ///

/// CJKTokenizer was modified from StopTokenizer, which does a decent job for
/// most European languages. It applies a different tokenization method to double-byte
/// characters: a token is returned for every two characters, with overlapping matches.
/// Example: "java C1C2C3C4" will be segmented into: "java" "C1C2" "C2C3" "C3C4". It
/// also needs to filter out zero-length tokens ("").
/// For digits: a digit, '+' and '#' are tokenized as letters.
/// For more information on Asian language (Chinese, Japanese, Korean) text segmentation,
/// please search Google.
///

/// /// @author Che, Dong /// @version $Id: CJKTokenizer.java,v 1.3 2003/01/22 20:54:47 otis Exp $ ///
public sealed class CJKTokenizer : Tokenizer
{
    //~ Static fields/initializers ---------------------------------------------

    /// <summary>
    /// Word token type (the value <c>tokenType</c> holds before any character is classified).
    /// </summary>
    internal static readonly int WORD_TYPE = 0;

    /// <summary>
    /// Single byte token type (Basic Latin / converted full-width characters).
    /// </summary>
    internal static readonly int SINGLE_TOKEN_TYPE = 1;

    /// <summary>
    /// Double byte token type (any other letter).
    /// </summary>
    internal static readonly int DOUBLE_TOKEN_TYPE = 2;

    /// <summary>
    /// Names for token types, indexed by <c>tokenType</c>.
    /// </summary>
    internal static readonly String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };

    /// <summary>
    /// Max word length; a single-byte token is cut once it reaches this many characters.
    /// </summary>
    internal static readonly int MAX_WORD_LEN = 255;

    /// <summary>
    /// I/O buffer size.
    /// </summary>
    internal static readonly int IO_BUFFER_SIZE = 256;

    /// <summary>
    /// First code point (U+FF01) of the full-width range that is folded onto Basic Latin.
    /// </summary>
    private const int FullWidthFoldFirst = 65281;

    /// <summary>
    /// Last code point (U+FF5E) of the full-width range that is folded onto Basic Latin.
    /// </summary>
    private const int FullWidthFoldLast = 65374;

    /// <summary>
    /// Distance (0xFEE0) subtracted from a full-width code point to obtain its Basic Latin counterpart.
    /// </summary>
    private const int FullWidthFoldDelta = 65248;

    //~ Instance fields --------------------------------------------------------

    /// <summary>
    /// Word offset: the number of input characters consumed so far.
    /// </summary>
    private int offset = 0;

    /// <summary>
    /// The index used only for <c>ioBuffer</c>.
    /// </summary>
    private int bufferIndex = 0;

    /// <summary>
    /// Number of valid characters in <c>ioBuffer</c>; 0 once the reader is exhausted.
    /// </summary>
    private int dataLen = 0;

    /// <summary>
    /// Character buffer, stores the characters which are used to compose the returned token.
    /// </summary>
    private char[] buffer = new char[MAX_WORD_LEN];

    /// <summary>
    /// I/O buffer, used to store the content read from the input reader.
    /// </summary>
    private char[] ioBuffer = new char[IO_BUFFER_SIZE];

    /// <summary>
    /// Type of the token being built: one of <c>WORD_TYPE</c>, <c>SINGLE_TOKEN_TYPE</c>
    /// or <c>DOUBLE_TOKEN_TYPE</c>.
    /// </summary>
    private int tokenType = WORD_TYPE;

    /// <summary>
    /// Set when a two-character token has just been completed and its second character
    /// was pushed back so that it also starts the next token:
    /// "C1C2C3C4" gives C1C2, then C2C3, then C3C4.
    /// If nothing can be paired with the pushed-back character, it is dropped instead of
    /// being emitted again on its own.
    /// </summary>
    private bool preIsTokened = false;

    private ITermAttribute termAtt;
    private IOffsetAttribute offsetAtt;
    private ITypeAttribute typeAtt;

    //~ Constructors -----------------------------------------------------------

    /// <summary>
    /// Construct a token stream processing the given input.
    /// </summary>
    /// <param name="_in">I/O reader</param>
    public CJKTokenizer(TextReader _in)
        : base(_in)
    {
        Init();
    }

    /// <summary>
    /// Construct a token stream processing the given input, using the given attribute source.
    /// </summary>
    /// <param name="source">attribute source passed to the base tokenizer</param>
    /// <param name="_in">I/O reader</param>
    public CJKTokenizer(AttributeSource source, TextReader _in)
        : base(source, _in)
    {
        Init();
    }

    /// <summary>
    /// Construct a token stream processing the given input, using the given attribute factory.
    /// </summary>
    /// <param name="factory">attribute factory passed to the base tokenizer</param>
    /// <param name="_in">I/O reader</param>
    public CJKTokenizer(AttributeFactory factory, TextReader _in)
        : base(factory, _in)
    {
        Init();
    }

    /// <summary>
    /// Registers the term, offset and type attributes this tokenizer fills in.
    /// </summary>
    private void Init()
    {
        termAtt = AddAttribute<ITermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
        typeAtt = AddAttribute<ITypeAttribute>();
    }

    //~ Methods ----------------------------------------------------------------

    /// <summary>
    /// Returns true when <paramref name="c"/> lies in the Unicode "Basic Latin" block
    /// (U+0000..U+007F), i.e. the range matched by the regex class \p{IsBasicLatin}.
    /// </summary>
    private static bool IsBasicLatin(char c)
    {
        return c <= '\u007F';
    }

    /// <summary>
    /// Returns true when <paramref name="c"/> lies in the Unicode "Halfwidth and Fullwidth
    /// Forms" block (U+FF00..U+FFEF), i.e. the range matched by the regex class
    /// \p{IsHalfwidthandFullwidthForms}.
    /// </summary>
    private static bool IsHalfwidthOrFullwidthForm(char c)
    {
        return c >= '\uFF00' && c <= '\uFFEF';
    }

    /// <summary>
    /// Advances to the next token in the stream.
    /// Runs of Basic Latin (and folded full-width) letters, digits, '_', '+' and '#' are
    /// returned lower-cased as one "single" token; other letters are returned as
    /// overlapping two-character "double" tokens.
    /// </summary>
    /// <returns>false for end of stream, true otherwise</returns>
    public override bool IncrementToken()
    {
        ClearAttributes();

        while (true) // loop until we find a non-empty token
        {
            /* how many character(s) have been stored in buffer */
            int length = 0;

            /* the position used to create the token */
            int start = offset;

            while (true) // loop until we've found a full token
            {
                /* current character */
                char c;

                offset++;

                if (bufferIndex >= dataLen)
                {
                    dataLen = input.Read(ioBuffer, 0, ioBuffer.Length);
                    bufferIndex = 0;
                }

                if (dataLen == 0) // input.Read returns 0 when it is empty, not -1 as in Java
                {
                    if (length > 0)
                    {
                        if (preIsTokened)
                        {
                            // the only buffered character was already emitted as the
                            // second half of the previous token: drop it
                            length = 0;
                            preIsTokened = false;
                        }
                        else
                        {
                            offset--;
                        }
                        break;
                    }

                    offset--;
                    return false;
                }

                // get current character
                c = ioBuffer[bufferIndex++];

                // Block membership is a plain code-point range test; this avoids a string
                // allocation and a regex match for every input character.
                bool isHalfFullForm = IsHalfwidthOrFullwidthForm(c);

                // if the current character is ASCII or Extend ASCII
                if (IsBasicLatin(c) || isHalfFullForm)
                {
                    if (isHalfFullForm)
                    {
                        int i = (int) c;
                        if (i >= FullWidthFoldFirst && i <= FullWidthFoldLast)
                        {
                            // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
                            c = (char) (i - FullWidthFoldDelta);
                        }
                    }

                    // if the current character is a letter, a digit or "_" "+" "#"
                    if (char.IsLetterOrDigit(c) || c == '_' || c == '+' || c == '#')
                    {
                        if (length == 0)
                        {
                            // "javaC1C2C3C4linux"
                            //  ^--: the current character begins an ASCII token
                            start = offset - 1;
                        }
                        else if (tokenType == DOUBLE_TOKEN_TYPE)
                        {
                            // "javaC1C2C3C4linux"
                            //              ^--: the previous character is non-ASCII,
                            //                   the current one is ASCII: push it back
                            offset--;
                            bufferIndex--;

                            if (preIsTokened)
                            {
                                // only one non-ASCII character is stored and it has
                                // already been emitted: drop it
                                length = 0;
                                preIsTokened = false;
                            }
                            break;
                        }

                        // Store the lower-cased character. The invariant culture is used so
                        // that the produced terms do not depend on the current thread culture.
                        buffer[length++] = char.ToLowerInvariant(c);
                        tokenType = SINGLE_TOKEN_TYPE;

                        // break the procedure if the buffer is full
                        if (length == MAX_WORD_LEN)
                        {
                            break;
                        }
                    }
                    else if (length > 0)
                    {
                        if (preIsTokened)
                        {
                            length = 0;
                            preIsTokened = false;
                        }
                        else
                        {
                            break;
                        }
                    }
                }
                else
                {
                    // non-ASCII letter, e.g. "C1C2C3C4"
                    if (char.IsLetter(c))
                    {
                        if (length == 0)
                        {
                            start = offset - 1;
                            buffer[length++] = c;
                            tokenType = DOUBLE_TOKEN_TYPE;
                        }
                        else if (tokenType == SINGLE_TOKEN_TYPE)
                        {
                            // push the character back and return the previous ASCII characters
                            offset--;
                            bufferIndex--;
                            break;
                        }
                        else
                        {
                            buffer[length++] = c;
                            tokenType = DOUBLE_TOKEN_TYPE;

                            if (length == 2)
                            {
                                // push the second character back so that it also starts
                                // the next (overlapping) token
                                offset--;
                                bufferIndex--;
                                preIsTokened = true;
                                break;
                            }
                        }
                    }
                    else if (length > 0)
                    {
                        if (preIsTokened)
                        {
                            // empty the buffer
                            length = 0;
                            preIsTokened = false;
                        }
                        else
                        {
                            break;
                        }
                    }
                }
            }

            if (length > 0)
            {
                termAtt.SetTermBuffer(buffer, 0, length);
                offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + length));
                typeAtt.Type = TOKEN_TYPE_NAMES[tokenType];
                return true;
            }
            else if (dataLen == 0)
            {
                offset--;
                return false;
            }

            // Cycle back and try for the next token (don't
            // return an empty string)
        }
    }

    /// <summary>
    /// Sets the final offset: both start and end are set to the corrected value of
    /// the current <c>offset</c>.
    /// </summary>
    public override void End()
    {
        // set final offset
        int finalOffset = CorrectOffset(offset);
        this.offsetAtt.SetOffset(finalOffset, finalOffset);
    }

    /// <summary>
    /// Resets the base tokenizer and clears all read positions and token state.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        offset = bufferIndex = dataLen = 0;
        preIsTokened = false;
        tokenType = WORD_TYPE;
    }

    /// <summary>
    /// Resets the base tokenizer onto a new reader, then clears all read positions and token state.
    /// </summary>
    /// <param name="reader">the new input reader</param>
    public override void Reset(TextReader reader)
    {
        base.Reset(reader);
        Reset();
    }
}
}