/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using TermAttribute = Lucene.Net.Analysis.Tokenattributes.TermAttribute;
namespace Lucene.Net.Analysis
{
/// <summary>
/// A token filter that replaces accented characters of the ISO Latin 1
/// character set (ISO-8859-1) with their unaccented equivalents. The case of
/// a character is never altered.
/// <para>
/// For instance, U+00E0 (a with grave) is replaced by 'a' and U+00C0
/// (A with grave) is replaced by 'A'.
/// </para>
/// </summary>
/// <remarks>
/// Deprecated in favor of <c>ASCIIFoldingFilter</c>, which covers a superset
/// of Latin 1. This class will be removed in Lucene 3.0.
/// </remarks>
[Obsolete("in favor of ASCIIFoldingFilter which covers a superset of Latin 1. This class will be removed in Lucene 3.0.")]
public class ISOLatin1AccentFilter : TokenFilter
{
    // Scratch buffer receiving the folded characters; RemoveAccents grows it on demand.
    private char[] output = new char[256];

    // Number of characters written to output by the most recent RemoveAccents call.
    private int outputPos;

    // Term attribute through which the current token's text is read and rewritten.
    private TermAttribute termAtt;

    /// <summary>
    /// Creates a filter over <paramref name="input"/> and registers the term
    /// attribute used to access each token's text.
    /// </summary>
    /// <param name="input">The token stream to wrap.</param>
    public ISOLatin1AccentFilter(TokenStream input) : base(input)
    {
        termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute));
    }

    /// <summary>
    /// Advances the wrapped stream by one token and, when the token contains
    /// at least one character in the range handled by this filter, replaces
    /// the term text with its folded form.
    /// </summary>
    /// <returns>
    /// <c>true</c> if a token was produced; <c>false</c> once the wrapped
    /// stream reports that it has no more tokens.
    /// </returns>
    public override bool IncrementToken()
    {
        if (!input.IncrementToken())
        {
            return false;
        }

        char[] termChars = termAtt.TermBuffer();
        int termLength = termAtt.TermLength();

        // Tokens without any candidate character are passed through untouched,
        // which avoids the copy into the scratch buffer.
        if (ContainsFoldableChar(termChars, termLength))
        {
            RemoveAccents(termChars, termLength);
            termAtt.SetTermBuffer(output, 0, outputPos);
        }

        return true;
    }

    /// <summary>
    /// Will be removed in Lucene 3.0. This method is not meant to be
    /// overridden; it only forwards to the base class implementation
    /// (the backwards compatibility layer).
    /// </summary>
    [Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer. ")]
    public override Token Next(Token reusableToken)
    {
        return base.Next(reusableToken);
    }

    /// <summary>
    /// Will be removed in Lucene 3.0. This method is not meant to be
    /// overridden; it only forwards to the base class implementation
    /// (the backwards compatibility layer).
    /// </summary>
    [Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer. ")]
    public override Token Next()
    {
        return base.Next();
    }

    /// <summary>
    /// Folds the first <paramref name="length"/> characters of
    /// <paramref name="input"/> into the internal scratch buffer, replacing
    /// accented characters and ligatures by unaccented equivalents. The
    /// input array itself is not modified.
    /// </summary>
    /// <param name="input">Characters to fold.</param>
    /// <param name="length">Number of characters of <paramref name="input"/> to process.</param>
    public void RemoveAccents(char[] input, int length)
    {
        // A single source character expands to at most two characters, so
        // twice the input length is the worst case. The buffer only ever
        // grows, by doubling, and is reused across calls.
        int required = 2 * length;
        int capacity = output.Length;
        while (capacity < required)
        {
            capacity *= 2;
        }
        if (capacity != output.Length)
        {
            output = new char[capacity];
        }

        outputPos = 0;
        for (int index = 0; index < length; index++)
        {
            char c = input[index];
            if (IsInFoldingRange(c))
            {
                WriteFolded(c);
            }
            else
            {
                // Quick path: outside the handled range, keep the character.
                WriteChar(c);
            }
        }
    }

    // True when c lies in the span of code points that may have a replacement.
    private static bool IsInFoldingRange(char c)
    {
        return c >= '\u00c0' && c <= '\uFB06';
    }

    // True when any of the first `length` characters may need to be folded.
    private static bool ContainsFoldableChar(char[] chars, int length)
    {
        for (int index = 0; index < length; index++)
        {
            if (IsInFoldingRange(chars[index]))
            {
                return true;
            }
        }
        return false;
    }

    // Appends one character to the scratch buffer.
    private void WriteChar(char single)
    {
        output[outputPos++] = single;
    }

    // Appends a two-character expansion to the scratch buffer.
    private void WritePair(char first, char second)
    {
        output[outputPos++] = first;
        output[outputPos++] = second;
    }

    // Appends the replacement for c, or c itself when no replacement is defined.
    private void WriteFolded(char c)
    {
        switch (c)
        {
            case '\u00C0': // A with grave
            case '\u00C1': // A with acute
            case '\u00C2': // A with circumflex
            case '\u00C3': // A with tilde
            case '\u00C4': // A with diaeresis
            case '\u00C5': // A with ring above
                WriteChar('A');
                break;
            case '\u00C6': // ligature AE
                WritePair('A', 'E');
                break;
            case '\u00C7': // C with cedilla
                WriteChar('C');
                break;
            case '\u00C8': // E with grave
            case '\u00C9': // E with acute
            case '\u00CA': // E with circumflex
            case '\u00CB': // E with diaeresis
                WriteChar('E');
                break;
            case '\u00CC': // I with grave
            case '\u00CD': // I with acute
            case '\u00CE': // I with circumflex
            case '\u00CF': // I with diaeresis
                WriteChar('I');
                break;
            case '\u0132': // ligature IJ
                WritePair('I', 'J');
                break;
            case '\u00D0': // capital eth
                WriteChar('D');
                break;
            case '\u00D1': // N with tilde
                WriteChar('N');
                break;
            case '\u00D2': // O with grave
            case '\u00D3': // O with acute
            case '\u00D4': // O with circumflex
            case '\u00D5': // O with tilde
            case '\u00D6': // O with diaeresis
            case '\u00D8': // O with stroke
                WriteChar('O');
                break;
            case '\u0152': // ligature OE
                WritePair('O', 'E');
                break;
            case '\u00DE': // capital thorn
                WritePair('T', 'H');
                break;
            case '\u00D9': // U with grave
            case '\u00DA': // U with acute
            case '\u00DB': // U with circumflex
            case '\u00DC': // U with diaeresis
                WriteChar('U');
                break;
            case '\u00DD': // Y with acute
            case '\u0178': // Y with diaeresis
                WriteChar('Y');
                break;
            case '\u00E0': // a with grave
            case '\u00E1': // a with acute
            case '\u00E2': // a with circumflex
            case '\u00E3': // a with tilde
            case '\u00E4': // a with diaeresis
            case '\u00E5': // a with ring above
                WriteChar('a');
                break;
            case '\u00E6': // ligature ae
                WritePair('a', 'e');
                break;
            case '\u00E7': // c with cedilla
                WriteChar('c');
                break;
            case '\u00E8': // e with grave
            case '\u00E9': // e with acute
            case '\u00EA': // e with circumflex
            case '\u00EB': // e with diaeresis
                WriteChar('e');
                break;
            case '\u00EC': // i with grave
            case '\u00ED': // i with acute
            case '\u00EE': // i with circumflex
            case '\u00EF': // i with diaeresis
                WriteChar('i');
                break;
            case '\u0133': // ligature ij
                WritePair('i', 'j');
                break;
            case '\u00F0': // small eth
                WriteChar('d');
                break;
            case '\u00F1': // n with tilde
                WriteChar('n');
                break;
            case '\u00F2': // o with grave
            case '\u00F3': // o with acute
            case '\u00F4': // o with circumflex
            case '\u00F5': // o with tilde
            case '\u00F6': // o with diaeresis
            case '\u00F8': // o with stroke
                WriteChar('o');
                break;
            case '\u0153': // ligature oe
                WritePair('o', 'e');
                break;
            case '\u00DF': // sharp s
                WritePair('s', 's');
                break;
            case '\u00FE': // small thorn
                WritePair('t', 'h');
                break;
            case '\u00F9': // u with grave
            case '\u00FA': // u with acute
            case '\u00FB': // u with circumflex
            case '\u00FC': // u with diaeresis
                WriteChar('u');
                break;
            case '\u00FD': // y with acute
            case '\u00FF': // y with diaeresis
                WriteChar('y');
                break;
            case '\uFB00': // ligature ff
                WritePair('f', 'f');
                break;
            case '\uFB01': // ligature fi
                WritePair('f', 'i');
                break;
            case '\uFB02': // ligature fl
                WritePair('f', 'l');
                break;
            // U+FB03 (ffi) and U+FB04 (ffl) are deliberately not expanded:
            // they would need three output characters, which breaks the
            // 2 * length sizing in RemoveAccents (and sizing for 3 * length
            // could be expensive). They are copied through unchanged by the
            // default branch.
            case '\uFB05': // ligature long s + t
                WritePair('f', 't');
                break;
            case '\uFB06': // ligature st
                WritePair('s', 't');
                break;
            default:
                // In range but without a defined replacement: keep as-is.
                WriteChar(c);
                break;
        }
    }
}
}