/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.IO;

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;

namespace Lucene.Net.Analysis.Ext
{
    /// <summary>
    /// This analyzer targets short fields where *word* like searches are required.
    /// Every letter or digit becomes its own single-character token.
    /// [SomeUser@GMAIL.com 1234567890] will be tokenized as
    /// [s.o.m.e.u.s.e.r..g.m.a.i.l..c.o.m..1.2.3.4.5.6.7.8.9.0] (read .'s as blank)
    ///
    /// Usage:
    /// <code>
    /// QueryParser p = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, "fieldName", new SingleCharTokenAnalyzer());
    /// p.SetDefaultOperator(QueryParser.Operator.AND);
    /// p.SetEnablePositionIncrements(true);
    ///
    /// TopDocs td = src.Search(p.Parse("678"), 10);
    /// or
    /// TopDocs td = src.Search(p.Parse("\"gmail.com 1234\""), 10);
    /// </code>
    /// </summary>
    public class SingleCharTokenAnalyzer : Analyzer
    {
        /// <summary>
        /// Builds the analysis chain: <see cref="LetterOrDigitTokenizer"/>, then
        /// <see cref="LowerCaseFilter"/>, then <see cref="ASCIIFoldingFilter"/>, and finally
        /// a stage that emits each character of every resulting term as its own token.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (not used by this analyzer).</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>The token stream producing single-character tokens.</returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream t = new LetterOrDigitTokenizer(reader);
            t = new LowerCaseFilter(t);
            t = new ASCIIFoldingFilter(t);
            t = new SingleCharTokenizer(t);

            return t;
        }

        /// <summary>
        /// Splits every term delivered by the wrapped stream into one token per character.
        /// </summary>
        private sealed class SingleCharTokenizer : Tokenizer
        {
            // Upstream stream whose terms are split into characters.
            readonly TokenStream _input = null;

            readonly ITermAttribute _termAttribute = null;
            readonly IOffsetAttribute _offsetAttribute = null;
            readonly IPositionIncrementAttribute _positionIncrementAttribute = null;

            // Characters of the upstream term currently being split, and the read cursor into it.
            char[] _buffer = null;
            int _offset = -1;
            int _length = -1;

            // Running counter used to synthesize offsets: advanced once per emitted
            // character and once more each time a new upstream term is fetched.
            int _offsetInStream = -1;

            /// <summary>
            /// Wraps <paramref name="input"/> and registers the attributes this stage reads and writes.
            /// </summary>
            /// <param name="input">The stream whose terms are split into single characters.</param>
            public SingleCharTokenizer(TokenStream input) : base(input)
            {
                _input = input;

                // The type argument cannot be inferred (the method takes no arguments),
                // so it has to be spelled out for each attribute.
                _termAttribute = AddAttribute<ITermAttribute>();
                _offsetAttribute = AddAttribute<IOffsetAttribute>();
                _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>();
            }

            /// <summary>
            /// Emits the next character of the current upstream term as a token, fetching the
            /// next upstream term first when the current one is exhausted.
            /// </summary>
            /// <returns><c>false</c> once the wrapped stream has no more terms; otherwise <c>true</c>.</returns>
            public override bool IncrementToken()
            {
                int positionIncrement = 0;

                // Nothing buffered yet, or every character of the buffered term was already emitted.
                if (_buffer == null || _offset >= _length)
                {
                    if (!_input.IncrementToken())
                    {
                        return false;
                    }

                    // This stage reads the upstream term through its own term attribute.
                    _offset = 0;
                    _buffer = _termAttribute.TermBuffer();
                    _length = _termAttribute.TermLength();

                    // A term boundary costs one extra position and one extra offset unit, so the
                    // first character of each term ends up with a position increment of 2.
                    positionIncrement++;
                    _offsetInStream++;
                }

                // Each emitted character spans exactly one offset unit.
                _offsetAttribute.SetOffset(_offsetInStream, _offsetInStream + 1);
                _offsetInStream++;

                positionIncrement++;
                _positionIncrementAttribute.PositionIncrement = positionIncrement;

                // NOTE(review): _buffer presumably aliases the attribute's own storage; the character
                // is read from _buffer before SetTermBuffer is invoked, keep that order - confirm.
                _termAttribute.SetTermLength(1);
                _termAttribute.SetTermBuffer(_buffer[_offset++].ToString());

                return true;
            }

            /// <summary>
            /// Returns the splitting state to its initial values and then resets the base class.
            /// </summary>
            public override void Reset()
            {
                _buffer = null;
                _offset = -1;
                _length = -1;
                _offsetInStream = -1;

                base.Reset();
            }

            /// <summary>
            /// Closes the wrapped stream and then disposes the base class.
            /// </summary>
            /// <param name="disposing">
            /// <c>true</c> when called from an explicit dispose; the wrapped stream is only
            /// closed in that case, because other managed objects must not be touched otherwise.
            /// </param>
            protected override void Dispose(bool disposing)
            {
                if (disposing)
                {
                    _input.Close();
                }

                base.Dispose(disposing);
            }
        }
    }

    /// <summary>
    /// Another Analyzer. Every char which is not a letter or digit is treated as a word separator.
    /// [Name.Surname@gmail.com 123.456 ğüşıöç%ĞÜŞİÖÇ$ΑΒΓΔΕΖ#АБВГДЕ SSß] will be tokenized as
    /// [name surname gmail com 123 456 gusioc gusioc αβγδεζ абвгде ssss]
    ///
    /// No problem with searches like someuser@gmail or 123.456 since they are
    /// converted to phrase-query as "someuser gmail" or "123 456".
    /// </summary>
    public class UnaccentedWordAnalyzer : Analyzer
    {
        /// <summary>
        /// Builds the analysis chain: <see cref="LetterOrDigitTokenizer"/>, then
        /// <see cref="LowerCaseFilter"/>, then <see cref="ASCIIFoldingFilter"/>.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (not used by this analyzer).</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>The token stream producing lower-cased, ASCII-folded words.</returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream t = new LetterOrDigitTokenizer(reader);
            t = new LowerCaseFilter(t);
            t = new ASCIIFoldingFilter(t);

            return t;
        }
    }

    /// <summary>
    /// if a char is not a letter or digit, it is a word separator
    /// </summary>
    public class LetterOrDigitTokenizer : CharTokenizer
    {
        /// <summary>
        /// Creates a tokenizer reading from <paramref name="reader"/>.
        /// </summary>
        /// <param name="reader">Source of the text to tokenize.</param>
        public LetterOrDigitTokenizer(TextReader reader) : base(reader)
        {
        }

        /// <summary>
        /// Decides whether <paramref name="c"/> belongs to a token.
        /// </summary>
        /// <param name="c">The character to classify.</param>
        /// <returns><c>true</c> for letters and digits; <c>false</c> for everything else.</returns>
        protected override bool IsTokenChar(char c)
        {
            return char.IsLetterOrDigit(c);
        }
    }
}