/* * Copyright 2004-2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ using System; namespace Lucene.Net.Analysis { /// A filter that replaces accented characters in the ISO Latin 1 character set /// (ISO-8859-1) by their unaccented equivalent. The case will not be altered. ///

/// For instance, 'à' (U+00E0) will be replaced by 'a'.

///

public class ISOLatin1AccentFilter : TokenFilter
{
    /// <summary>
    /// Creates a filter that reads tokens from <paramref name="input"/>.
    /// </summary>
    /// <param name="input">The token stream whose terms are to be filtered.</param>
    public ISOLatin1AccentFilter(TokenStream input) : base(input)
    {
    }

    /// <summary>
    /// Fetches the next token from the wrapped stream and returns a token whose
    /// term text has been passed through <see cref="RemoveAccents"/>.
    /// </summary>
    /// <returns>
    /// A new token carrying the filtered text together with the source token's
    /// offsets, type and position increment, or <c>null</c> when the wrapped
    /// stream returns <c>null</c>.
    /// </returns>
    public override Token Next()
    {
        Token t = input.Next();
        if (t == null)
        {
            return null;
        }

        // Return a token with filtered characters.
        Token filtered = new Token(RemoveAccents(t.TermText()), t.StartOffset(), t.EndOffset(), t.Type());

        // The four-argument constructor takes no position increment, so copy it
        // across explicitly; otherwise the value set on the source token is lost.
        // NOTE(review): relies on Token.GetPositionIncrement/SetPositionIncrement
        // from the Lucene.Net Token API - confirm against the Token class in this tree.
        filtered.SetPositionIncrement(t.GetPositionIncrement());
        return filtered;
    }

    /// <summary>
    /// Replaces accented characters in a string by unaccented equivalents.
    /// Case is kept; ligatures and similar characters expand to two letters
    /// (for example U+00C6 becomes "AE" and U+00DF becomes "ss"). Characters
    /// without a mapping are copied unchanged.
    /// </summary>
    /// <param name="input">The text to filter.</param>
    /// <returns>A new string with the mapped characters replaced.</returns>
    public static System.String RemoveAccents(System.String input)
    {
        // The result is never shorter than the input, so the input length is a
        // good starting capacity; the builder grows on two-letter expansions.
        System.Text.StringBuilder output = new System.Text.StringBuilder(input.Length);

        for (int i = 0; i < input.Length; i++)
        {
            char c = input[i];
            switch (c)
            {
                case '\u00C0': // À
                case '\u00C1': // Á
                case '\u00C2': // Â
                case '\u00C3': // Ã
                case '\u00C4': // Ä
                case '\u00C5': // Å
                    output.Append("A");
                    break;

                case '\u00C6': // Æ
                    output.Append("AE");
                    break;

                case '\u00C7': // Ç
                    output.Append("C");
                    break;

                case '\u00C8': // È
                case '\u00C9': // É
                case '\u00CA': // Ê
                case '\u00CB': // Ë
                    output.Append("E");
                    break;

                case '\u00CC': // Ì
                case '\u00CD': // Í
                case '\u00CE': // Î
                case '\u00CF': // Ï
                    output.Append("I");
                    break;

                case '\u00D0': // Ð
                    output.Append("D");
                    break;

                case '\u00D1': // Ñ
                    output.Append("N");
                    break;

                case '\u00D2': // Ò
                case '\u00D3': // Ó
                case '\u00D4': // Ô
                case '\u00D5': // Õ
                case '\u00D6': // Ö
                case '\u00D8': // Ø
                    output.Append("O");
                    break;

                case '\u0152': // Œ
                    output.Append("OE");
                    break;

                case '\u00DE': // Þ
                    output.Append("TH");
                    break;

                case '\u00D9': // Ù
                case '\u00DA': // Ú
                case '\u00DB': // Û
                case '\u00DC': // Ü
                    output.Append("U");
                    break;

                case '\u00DD': // Ý
                case '\u0178': // Ÿ
                    output.Append("Y");
                    break;

                case '\u00E0': // à
                case '\u00E1': // á
                case '\u00E2': // â
                case '\u00E3': // ã
                case '\u00E4': // ä
                case '\u00E5': // å
                    output.Append("a");
                    break;

                case '\u00E6': // æ
                    output.Append("ae");
                    break;

                case '\u00E7': // ç
                    output.Append("c");
                    break;

                case '\u00E8': // è
                case '\u00E9': // é
                case '\u00EA': // ê
                case '\u00EB': // ë
                    output.Append("e");
                    break;

                case '\u00EC': // ì
                case '\u00ED': // í
                case '\u00EE': // î
                case '\u00EF': // ï
                    output.Append("i");
                    break;

                case '\u00F0': // ð
                    output.Append("d");
                    break;

                case '\u00F1': // ñ
                    output.Append("n");
                    break;

                case '\u00F2': // ò
                case '\u00F3': // ó
                case '\u00F4': // ô
                case '\u00F5': // õ
                case '\u00F6': // ö
                case '\u00F8': // ø
                    output.Append("o");
                    break;

                case '\u0153': // œ
                    output.Append("oe");
                    break;

                case '\u00DF': // ß
                    output.Append("ss");
                    break;

                case '\u00FE': // þ
                    output.Append("th");
                    break;

                case '\u00F9': // ù
                case '\u00FA': // ú
                case '\u00FB': // û
                case '\u00FC': // ü
                    output.Append("u");
                    break;

                case '\u00FD': // ý
                case '\u00FF': // ÿ
                    output.Append("y");
                    break;

                default:
                    // No mapping for this character: keep it as-is.
                    output.Append(c);
                    break;
            }
        }

        return output.ToString();
    }
}
} // namespace Lucene.Net.Analysis