/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ using System; using Lucene.Net.Analysis.Tokenattributes; namespace Lucene.Net.Analysis { /// A filter that replaces accented characters in the ISO Latin 1 character set /// (ISO-8859-1) by their unaccented equivalent. The case will not be altered. ///

/// For instance, 'À' will be replaced by 'A'.

/// ///

/// If you build a new index, use ASCIIFoldingFilter, which covers a superset of Latin 1.
/// This class is included for use with existing indexes and will be removed
/// in a future release (possibly Lucene 4.0).
    [Obsolete("If you build a new index, use ASCIIFoldingFilter which covers a superset of Latin 1. This class is included for use with existing indexes and will be removed in a future release (possible Lucene 4.0).")]
    public class ISOLatin1AccentFilter : TokenFilter
    {
        /// <summary>
        /// Scratch buffer that receives the folded term. It is reused across
        /// tokens and only reallocated when a term needs more room.
        /// </summary>
        private char[] output = new char[256];

        /// <summary>Number of valid characters currently held in <c>output</c>.</summary>
        private int outputPos;

        /// <summary>Term attribute whose buffer is inspected and rewritten for each token.</summary>
        private readonly ITermAttribute termAtt;

        /// <summary>Creates a filter that folds accented characters of the tokens produced by <paramref name="input"/>.</summary>
        /// <param name="input">The token stream to wrap; it is handed to the base class constructor.</param>
        public ISOLatin1AccentFilter(TokenStream input)
            : base(input)
        {
            // The type argument must be explicit: C# does not infer a generic
            // method's type parameter from the assignment target.
            termAtt = AddAttribute<ITermAttribute>();
        }

        /// <summary>
        /// Advances the wrapped stream by one token and, when the term contains
        /// at least one character in the range U+00C0..U+FB06, replaces the term
        /// with its folded form.
        /// </summary>
        /// <returns>
        /// <c>true</c> when the wrapped stream produced a token, <c>false</c> otherwise.
        /// </returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] buffer = termAtt.TermBuffer();
            int length = termAtt.TermLength();

            // If no character can possibly require rewriting, the token is
            // returned as-is without touching the scratch buffer.
            for (int i = 0; i < length; i++)
            {
                char c = buffer[i];
                if (c >= '\u00c0' && c <= '\uFB06')
                {
                    // One candidate is enough: fold the whole term once and stop scanning.
                    RemoveAccents(buffer, length);
                    termAtt.SetTermBuffer(output, 0, outputPos);
                    break;
                }
            }

            return true;
        }

        /// <summary>
        /// Replaces accented characters and ligatures in the first
        /// <paramref name="length"/> characters of <paramref name="input"/> by
        /// their unaccented equivalents. The result is not returned; it is
        /// written to the internal <c>output</c> buffer and its length is stored
        /// in <c>outputPos</c>. Characters without a mapping are copied unchanged.
        /// </summary>
        /// <param name="input">Characters to fold; this array is only read.</param>
        /// <param name="length">Number of leading characters of <paramref name="input"/> to process.</param>
        public void RemoveAccents(char[] input, int length)
        {
            // Worst case: every character expands to two characters.
            int maxSizeNeeded = 2 * length;

            int size = output.Length;
            while (size < maxSizeNeeded)
            {
                size *= 2;
            }

            if (size != output.Length)
            {
                output = new char[size];
            }

            outputPos = 0;

            for (int i = 0; i < length; i++)
            {
                char c = input[i];

                // Quick test: anything outside the candidate range is copied verbatim.
                if (c < '\u00c0' || c > '\uFB06')
                {
                    output[outputPos++] = c;
                    continue;
                }

                switch (c)
                {
                    case '\u00C0': // À
                    case '\u00C1': // Á
                    case '\u00C2': // Â
                    case '\u00C3': // Ã
                    case '\u00C4': // Ä
                    case '\u00C5': // Å
                        output[outputPos++] = 'A';
                        break;

                    case '\u00C6': // Æ
                        output[outputPos++] = 'A';
                        output[outputPos++] = 'E';
                        break;

                    case '\u00C7': // Ç
                        output[outputPos++] = 'C';
                        break;

                    case '\u00C8': // È
                    case '\u00C9': // É
                    case '\u00CA': // Ê
                    case '\u00CB': // Ë
                        output[outputPos++] = 'E';
                        break;

                    case '\u00CC': // Ì
                    case '\u00CD': // Í
                    case '\u00CE': // Î
                    case '\u00CF': // Ï
                        output[outputPos++] = 'I';
                        break;

                    case '\u0132': // Ĳ
                        output[outputPos++] = 'I';
                        output[outputPos++] = 'J';
                        break;

                    case '\u00D0': // Ð
                        output[outputPos++] = 'D';
                        break;

                    case '\u00D1': // Ñ
                        output[outputPos++] = 'N';
                        break;

                    case '\u00D2': // Ò
                    case '\u00D3': // Ó
                    case '\u00D4': // Ô
                    case '\u00D5': // Õ
                    case '\u00D6': // Ö
                    case '\u00D8': // Ø
                        output[outputPos++] = 'O';
                        break;

                    case '\u0152': // Œ
                        output[outputPos++] = 'O';
                        output[outputPos++] = 'E';
                        break;

                    case '\u00DE': // Þ
                        output[outputPos++] = 'T';
                        output[outputPos++] = 'H';
                        break;

                    case '\u00D9': // Ù
                    case '\u00DA': // Ú
                    case '\u00DB': // Û
                    case '\u00DC': // Ü
                        output[outputPos++] = 'U';
                        break;

                    case '\u00DD': // Ý
                    case '\u0178': // Ÿ
                        output[outputPos++] = 'Y';
                        break;

                    case '\u00E0': // à
                    case '\u00E1': // á
                    case '\u00E2': // â
                    case '\u00E3': // ã
                    case '\u00E4': // ä
                    case '\u00E5': // å
                        output[outputPos++] = 'a';
                        break;

                    case '\u00E6': // æ
                        output[outputPos++] = 'a';
                        output[outputPos++] = 'e';
                        break;

                    case '\u00E7': // ç
                        output[outputPos++] = 'c';
                        break;

                    case '\u00E8': // è
                    case '\u00E9': // é
                    case '\u00EA': // ê
                    case '\u00EB': // ë
                        output[outputPos++] = 'e';
                        break;

                    case '\u00EC': // ì
                    case '\u00ED': // í
                    case '\u00EE': // î
                    case '\u00EF': // ï
                        output[outputPos++] = 'i';
                        break;

                    case '\u0133': // ĳ
                        output[outputPos++] = 'i';
                        output[outputPos++] = 'j';
                        break;

                    case '\u00F0': // ð
                        output[outputPos++] = 'd';
                        break;

                    case '\u00F1': // ñ
                        output[outputPos++] = 'n';
                        break;

                    case '\u00F2': // ò
                    case '\u00F3': // ó
                    case '\u00F4': // ô
                    case '\u00F5': // õ
                    case '\u00F6': // ö
                    case '\u00F8': // ø
                        output[outputPos++] = 'o';
                        break;

                    case '\u0153': // œ
                        output[outputPos++] = 'o';
                        output[outputPos++] = 'e';
                        break;

                    case '\u00DF': // ß
                        output[outputPos++] = 's';
                        output[outputPos++] = 's';
                        break;

                    case '\u00FE': // þ
                        output[outputPos++] = 't';
                        output[outputPos++] = 'h';
                        break;

                    case '\u00F9': // ù
                    case '\u00FA': // ú
                    case '\u00FB': // û
                    case '\u00FC': // ü
                        output[outputPos++] = 'u';
                        break;

                    case '\u00FD': // ý
                    case '\u00FF': // ÿ
                        output[outputPos++] = 'y';
                        break;

                    case '\uFB00': // ﬀ
                        output[outputPos++] = 'f';
                        output[outputPos++] = 'f';
                        break;

                    case '\uFB01': // ﬁ
                        output[outputPos++] = 'f';
                        output[outputPos++] = 'i';
                        break;

                    case '\uFB02': // ﬂ
                        output[outputPos++] = 'f';
                        output[outputPos++] = 'l';
                        break;

                    // U+FB03 (ffi) and U+FB04 (ffl) are deliberately not folded:
                    // each would expand to three characters and break the
                    // 2 * length bound used to size the output buffer. They fall
                    // through to the default case and are copied unchanged.

                    case '\uFB05': // ﬅ (long s + t), emitted as "ft"
                        output[outputPos++] = 'f';
                        output[outputPos++] = 't';
                        break;

                    case '\uFB06': // ﬆ
                        output[outputPos++] = 's';
                        output[outputPos++] = 't';
                        break;

                    default:
                        // In range but without a mapping: keep the character.
                        output[outputPos++] = c;
                        break;
                }
            }
        }
    }
}