/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;
namespace Lucene.Net.Analysis.Ext
{
/// <summary>
/// Analyzer for short fields where <c>*word*</c>-like ("contains") searches are required.
/// The text is split on every character that is not a letter or digit, passed through
/// <see cref="LowerCaseFilter"/> and <see cref="ASCIIFoldingFilter"/>, and finally every
/// remaining character is emitted as its own one-character token.
/// <para>
/// <c>[SomeUser@GMAIL.com 1234567890]</c> is tokenized as
/// <c>[s.o.m.e.u.s.e.r..g.m.a.i.l..c.o.m..1.2.3.4.5.6.7.8.9.0]</c>
/// (read each '.' as a blank; a double '.' marks the position gap between two words).
/// </para>
/// </summary>
/// <example>
/// <code>
/// QueryParser p = new QueryParser(Lucene.Net.Util.Version.LUCENE_29, "fieldName", new SingleCharTokenAnalyzer());
/// p.SetDefaultOperator(QueryParser.Operator.AND);
/// p.SetEnablePositionIncrements(true);
///
/// TopDocs td = src.Search(p.Parse("678"), 10);
/// // or
/// TopDocs td = src.Search(p.Parse("\"gmail.com 1234\""), 10);
/// </code>
/// </example>
public class SingleCharTokenAnalyzer : Analyzer
{
    /// <summary>
    /// Builds the analysis chain: <see cref="LetterOrDigitTokenizer"/>, then
    /// <see cref="LowerCaseFilter"/>, then <see cref="ASCIIFoldingFilter"/>, then the
    /// single-character splitter.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used by this analyzer).</param>
    /// <param name="reader">Source of the text to analyze.</param>
    /// <returns>A stream that yields one token per letter or digit.</returns>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        TokenStream stream = new LetterOrDigitTokenizer(reader);
        stream = new LowerCaseFilter(stream);
        stream = new ASCIIFoldingFilter(stream);
        return new SingleCharTokenizer(stream);
    }

    /// <summary>
    /// Splits every token of the wrapped stream into one-character tokens.
    /// It shares its attributes with the wrapped stream (the stream is passed to the base
    /// constructor as the attribute source), so the term read from upstream and the term
    /// written for the consumer live in the same attribute instance.
    /// </summary>
    private sealed class SingleCharTokenizer : Tokenizer
    {
        private readonly TokenStream _input;
        private readonly ITermAttribute _termAttribute;
        private readonly IOffsetAttribute _offsetAttribute;
        private readonly IPositionIncrementAttribute _positionIncrementAttribute;

        // Private copy of the upstream term that is currently being split.
        private char[] _buffer;
        // Index of the next character of _buffer to emit.
        private int _offset = -1;
        // Number of valid characters in _buffer; -1 until the first upstream token is read.
        private int _length = -1;
        // Running counter used for the emitted offsets (see IncrementToken).
        private int _offsetInStream = -1;

        /// <summary>Wraps <paramref name="input"/> and shares its attributes.</summary>
        /// <param name="input">Stream whose tokens are split into single characters.</param>
        public SingleCharTokenizer(TokenStream input) : base(input)
        {
            _input = input;
            _termAttribute = AddAttribute<ITermAttribute>();
            _offsetAttribute = AddAttribute<IOffsetAttribute>();
            _positionIncrementAttribute = AddAttribute<IPositionIncrementAttribute>();
        }

        /// <summary>
        /// Emits the next single character as a token.
        /// The first character of every upstream token gets a position increment of 2,
        /// every following character an increment of 1.
        /// Offsets come from a running counter that advances by one per emitted character
        /// plus one extra step per upstream token; they are not taken from the upstream
        /// token's own offsets.
        /// </summary>
        /// <returns><c>false</c> once the wrapped stream is exhausted, otherwise <c>true</c>.</returns>
        public override bool IncrementToken()
        {
            int positionIncrement = 1;

            if (_offset >= _length)
            {
                // Current word is used up (or nothing has been read yet): fetch the next
                // upstream token, skipping empty terms so a stale character is never emitted.
                do
                {
                    if (!_input.IncrementToken())
                    {
                        return false;
                    }
                    _length = _termAttribute.TermLength();
                }
                while (_length == 0);

                // Copy the term: the shared term attribute is overwritten below, so the
                // characters still to be emitted must not live in its internal buffer.
                if (_buffer == null || _buffer.Length < _length)
                {
                    _buffer = new char[_length];
                }
                Array.Copy(_termAttribute.TermBuffer(), 0, _buffer, 0, _length);

                _offset = 0;
                positionIncrement = 2; // leave a one-position gap between words
                _offsetInStream++;     // and one extra offset step between words
            }

            _offsetAttribute.SetOffset(_offsetInStream, _offsetInStream + 1);
            _offsetInStream++;

            _positionIncrementAttribute.PositionIncrement = positionIncrement;

            // Sets both the term text and its length (1).
            _termAttribute.SetTermBuffer(_buffer, _offset, 1);
            _offset++;

            return true;
        }

        /// <summary>
        /// Forgets the partially consumed word, restarts the offset counter and resets
        /// the wrapped stream.
        /// </summary>
        public override void Reset()
        {
            _offset = -1;
            _length = -1;
            _offsetInStream = -1;
            _input.Reset();
            base.Reset();
        }

        /// <summary>Closes the wrapped stream when disposed explicitly.</summary>
        /// <param name="disposing">
        /// <c>true</c> when called from Dispose/Close; <c>false</c> when called from a finalizer,
        /// in which case other managed objects must not be touched.
        /// </param>
        protected override void Dispose(bool disposing)
        {
            if (disposing)
            {
                _input.Close();
            }
            base.Dispose(disposing);
        }
    }
}
/// <summary>
/// Word-level analyzer: any character that is not a letter or digit separates words, and
/// the resulting words are passed through <see cref="LowerCaseFilter"/> and
/// <see cref="ASCIIFoldingFilter"/>.
/// <para>
/// <c>[Name.Surname@gmail.com 123.456 ğüşıöç%ĞÜŞİÖÇ$ΑΒΓΔΕΖ#АБВГДЕ SSß]</c> is tokenized as
/// <c>[name surname gmail com 123 456 gusioc gusioc αβγδεζ абвгде ssss]</c>.
/// </para>
/// <para>
/// Searches such as <c>someuser@gmail</c> or <c>123.456</c> are not a problem, since they
/// are converted to the phrase queries <c>"someuser gmail"</c> and <c>"123 456"</c>.
/// </para>
/// </summary>
public class UnaccentedWordAnalyzer : Analyzer
{
    /// <summary>
    /// Creates the token stream for a field: <see cref="LetterOrDigitTokenizer"/> wrapped in
    /// <see cref="LowerCaseFilter"/>, wrapped in <see cref="ASCIIFoldingFilter"/>.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used by this analyzer).</param>
    /// <param name="reader">Source of the text to analyze.</param>
    /// <returns>The outermost filter of the chain.</returns>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return new ASCIIFoldingFilter(
            new LowerCaseFilter(
                new LetterOrDigitTokenizer(reader)));
    }
}
/// <summary>
/// Tokenizer that treats every character which is neither a letter nor a digit as a
/// word separator.
/// </summary>
public class LetterOrDigitTokenizer : CharTokenizer
{
    /// <summary>Creates a tokenizer that reads its text from <paramref name="reader"/>.</summary>
    /// <param name="reader">Source of the text to tokenize.</param>
    public LetterOrDigitTokenizer(TextReader reader)
        : base(reader)
    {
    }

    /// <summary>Decides whether a character belongs to a token.</summary>
    /// <param name="c">Character to classify.</param>
    /// <returns>
    /// <c>true</c> when <paramref name="c"/> is a letter or a digit according to
    /// <see cref="Char.IsLetterOrDigit(char)"/>; otherwise <c>false</c>.
    /// </returns>
    protected override bool IsTokenChar(char c)
    {
        bool belongsToWord = Char.IsLetterOrDigit(c);
        return belongsToWord;
    }
}
}