/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.IO;
using System.Collections;

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;

namespace Lucene.Net.Analysis.NGram
{
    /// <summary>
    /// Conversions between <see cref="Side"/> values and their textual labels
    /// ("front" and "back").
    /// </summary>
    public static class SideExtensions
    {
        /// <summary>
        /// Returns the textual label of a <see cref="Side"/> value.
        /// </summary>
        /// <param name="theSide">the side to get the label for</param>
        /// <returns>"front" for <see cref="Side.FRONT"/>, "back" for <see cref="Side.BACK"/></returns>
        /// <exception cref="ArgumentException">
        /// if <paramref name="theSide"/> is neither FRONT nor BACK
        /// </exception>
        public static string GetLabel(this Side theSide)
        {
            switch (theSide)
            {
                case Side.FRONT:
                    return "front";

                case Side.BACK:
                    return "back";

                default:
                    throw new ArgumentException(string.Format("{0} is not a valid value for EdgeNGramTokenFilter.Side", theSide));
            }
        }

        /// <summary>
        /// Looks up the <see cref="Side"/> whose label equals <paramref name="sideName"/>.
        /// The comparison is case-sensitive.
        /// </summary>
        /// <param name="sideName">the label to look up; may be null</param>
        /// <returns>
        /// the matching side, or the out-of-range value <c>(Side)(-1)</c> when
        /// no label matches (including when <paramref name="sideName"/> is null)
        /// </returns>
        public static Side GetSide(string sideName)
        {
            if (Side.FRONT.GetLabel() == sideName)
            {
                return Side.FRONT;
            }

            if (Side.BACK.GetLabel() == sideName)
            {
                return Side.BACK;
            }

            // Side is an enum and cannot be null, so an out-of-range value is
            // used as the "unknown label" sentinel.
            // TODO: Should an exception be thrown instead?
            return (Side)(-1);
        }
    }

    /// <summary>
    /// Specifies which side of the input the n-gram should be generated from
    /// </summary>
    public enum Side
    {
        FRONT,
        BACK
    }

    /*
     * Tokenizes the given token into n-grams of given size(s).
     *

* This creates n-grams from the beginning edge or ending edge of an input token. *

*/
    public sealed class EdgeNGramTokenFilter : TokenFilter
    {
        public static Side DEFAULT_SIDE = Side.FRONT;
        public static int DEFAULT_MAX_GRAM_SIZE = 1;
        public static int DEFAULT_MIN_GRAM_SIZE = 1;

        // Configuration, fixed at construction time.
        private readonly int minGram;
        private readonly int maxGram;
        private readonly Side side;

        // State for the input token currently being expanded into n-grams.
        // curTermBuffer == null means "no current token; pull the next one".
        private char[] curTermBuffer;
        private int curTermLength;
        private int curGramSize;
        private int tokStart;

        private readonly ITermAttribute termAtt;
        private readonly IOffsetAttribute offsetAtt;

        /// <summary>
        /// Registers the term and offset attributes only. This constructor does
        /// not assign minGram, maxGram or side, so they keep their default values.
        /// </summary>
        /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
        // NOTE(review): a protected constructor on a sealed class is reachable
        // only from inside this class; kept as-is to leave the declared API untouched.
        protected EdgeNGramTokenFilter(TokenStream input)
            : base(input)
        {
            this.termAtt = AddAttribute<ITermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
        }

        /// <summary>
        /// Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
        /// </summary>
        /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
        /// <param name="side">the <see cref="Side"/> from which to chop off an n-gram</param>
        /// <param name="minGram">the smallest n-gram to generate</param>
        /// <param name="maxGram">the largest n-gram to generate</param>
        /// <exception cref="ArgumentException">
        /// if <paramref name="side"/> is neither FRONT nor BACK, if
        /// <paramref name="minGram"/> is less than 1, or if
        /// <paramref name="minGram"/> is greater than <paramref name="maxGram"/>
        /// </exception>
        public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram)
            : base(input)
        {
            if (side != Side.FRONT && side != Side.BACK)
            {
                throw new ArgumentException("sideLabel must be either front or back");
            }

            if (minGram < 1)
            {
                throw new ArgumentException("minGram must be greater than zero");
            }

            if (minGram > maxGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }

            this.minGram = minGram;
            this.maxGram = maxGram;
            this.side = side;
            this.termAtt = AddAttribute<ITermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
        }

        /// <summary>
        /// Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
        /// </summary>
        /// <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param>
        /// <param name="sideLabel">the name of the <see cref="Side"/> from which to chop off an n-gram</param>
        /// <param name="minGram">the smallest n-gram to generate</param>
        /// <param name="maxGram">the largest n-gram to generate</param>
        /// <exception cref="ArgumentException">
        /// if <paramref name="sideLabel"/> does not name a side, or if the gram
        /// sizes are rejected by the <see cref="Side"/>-based constructor
        /// </exception>
        public EdgeNGramTokenFilter(TokenStream input, string sideLabel, int minGram, int maxGram)
            : this(input, SideExtensions.GetSide(sideLabel), minGram, maxGram)
        {
        }

        /// <summary>
        /// Advances to the next edge n-gram. For each input token, grams of size
        /// minGram, minGram + 1, ... are emitted until the size exceeds either
        /// maxGram or the token length; then the next input token is pulled.
        /// A token shorter than minGram produces no output.
        /// </summary>
        /// <returns>true if a gram was produced; false once the input stream is exhausted</returns>
        public override bool IncrementToken()
        {
            while (true)
            {
                if (curTermBuffer == null)
                {
                    if (!input.IncrementToken())
                    {
                        return false;
                    }

                    // Snapshot the token: termAtt and offsetAtt are overwritten
                    // below each time a gram is emitted.
                    curTermBuffer = (char[])termAtt.TermBuffer().Clone();
                    curTermLength = termAtt.TermLength();
                    curGramSize = minGram;
                    tokStart = offsetAtt.StartOffset;
                }

                // Emit a gram only while we are inside the configured size range
                // and the token is long enough to supply that many characters.
                if (curGramSize <= maxGram && curGramSize <= curTermLength)
                {
                    // grab gramSize chars from front or back
                    int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
                    int end = start + curGramSize;

                    ClearAttributes();
                    offsetAtt.SetOffset(tokStart + start, tokStart + end);
                    termAtt.SetTermBuffer(curTermBuffer, start, curGramSize);
                    curGramSize++;
                    return true;
                }

                // Done with this token; loop around and fetch the next one.
                curTermBuffer = null;
            }
        }

        /// <summary>
        /// Resets the underlying stream and discards any partially expanded token.
        /// </summary>
        public override void Reset()
        {
            base.Reset();
            curTermBuffer = null;
        }
    }
}