Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
RefinedSoundex |
|
| 2.375;2.375 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.commons.codec.language; | |
19 | ||
20 | import org.apache.commons.codec.EncoderException; | |
21 | import org.apache.commons.codec.StringEncoder; | |
22 | ||
23 | /** | |
24 | * Encodes a string into a Refined Soundex value. A refined soundex code is | |
25 | * optimized for spell checking words. Soundex method originally developed by | |
26 | * <CITE>Margaret Odell</CITE> and <CITE>Robert Russell</CITE>. | |
27 | * | |
28 | * <p>This class is immutable and thread-safe.</p> | |
29 | * | |
30 | * @version $Id$ | |
31 | */ | |
32 | public class RefinedSoundex implements StringEncoder { | |
33 | ||
34 | /** | |
35 | * @since 1.4 | |
36 | */ | |
37 | public static final String US_ENGLISH_MAPPING_STRING = "01360240043788015936020505"; | |
38 | ||
39 | /** | |
40 | * RefinedSoundex is *refined* for a number of reasons one being that the | |
41 | * mappings have been altered. This implementation contains default | |
42 | * mappings for US English. | |
43 | */ | |
44 | 1 | private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray(); |
45 | ||
46 | /** | |
47 | * Every letter of the alphabet is "mapped" to a numerical value. This char | |
48 | * array holds the values to which each letter is mapped. This | |
49 | * implementation contains a default map for US_ENGLISH | |
50 | */ | |
51 | private final char[] soundexMapping; | |
52 | ||
53 | /** | |
54 | * This static variable contains an instance of the RefinedSoundex using | |
55 | * the US_ENGLISH mapping. | |
56 | */ | |
57 | 1 | public static final RefinedSoundex US_ENGLISH = new RefinedSoundex(); |
58 | ||
59 | /** | |
60 | * Creates an instance of the RefinedSoundex object using the default US | |
61 | * English mapping. | |
62 | */ | |
63 | 12 | public RefinedSoundex() { |
64 | 12 | this.soundexMapping = US_ENGLISH_MAPPING; |
65 | 12 | } |
66 | ||
67 | /** | |
68 | * Creates a refined soundex instance using a custom mapping. This | |
69 | * constructor can be used to customize the mapping, and/or possibly | |
70 | * provide an internationalized mapping for a non-Western character set. | |
71 | * | |
72 | * @param mapping | |
73 | * Mapping array to use when finding the corresponding code for | |
74 | * a given character | |
75 | */ | |
76 | 1 | public RefinedSoundex(char[] mapping) { |
77 | 1 | this.soundexMapping = new char[mapping.length]; |
78 | 1 | System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length); |
79 | 1 | } |
80 | ||
81 | /** | |
82 | * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping, | |
83 | * and/or possibly provide an internationalized mapping for a non-Western character set. | |
84 | * | |
85 | * @param mapping | |
86 | * Mapping string to use when finding the corresponding code for a given character | |
87 | * @since 1.4 | |
88 | */ | |
89 | 1 | public RefinedSoundex(String mapping) { |
90 | 1 | this.soundexMapping = mapping.toCharArray(); |
91 | 1 | } |
92 | ||
93 | /** | |
94 | * Returns the number of characters in the two encoded Strings that are the | |
95 | * same. This return value ranges from 0 to the length of the shortest | |
96 | * encoded String: 0 indicates little or no similarity, and 4 out of 4 (for | |
97 | * example) indicates strong similarity or identical values. For refined | |
98 | * Soundex, the return value can be greater than 4. | |
99 | * | |
100 | * @param s1 | |
101 | * A String that will be encoded and compared. | |
102 | * @param s2 | |
103 | * A String that will be encoded and compared. | |
104 | * @return The number of characters in the two encoded Strings that are the | |
105 | * same from 0 to to the length of the shortest encoded String. | |
106 | * | |
107 | * @see SoundexUtils#difference(StringEncoder,String,String) | |
108 | * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> | |
109 | * MS T-SQL DIFFERENCE</a> | |
110 | * | |
111 | * @throws EncoderException | |
112 | * if an error occurs encoding one of the strings | |
113 | * @since 1.3 | |
114 | */ | |
115 | public int difference(String s1, String s2) throws EncoderException { | |
116 | 12 | return SoundexUtils.difference(this, s1, s2); |
117 | } | |
118 | ||
119 | /** | |
120 | * Encodes an Object using the refined soundex algorithm. This method is | |
121 | * provided in order to satisfy the requirements of the Encoder interface, | |
122 | * and will throw an EncoderException if the supplied object is not of type | |
123 | * java.lang.String. | |
124 | * | |
125 | * @param obj | |
126 | * Object to encode | |
127 | * @return An object (or type java.lang.String) containing the refined | |
128 | * soundex code which corresponds to the String supplied. | |
129 | * @throws EncoderException | |
130 | * if the parameter supplied is not of type java.lang.String | |
131 | */ | |
132 | @Override | |
133 | public Object encode(Object obj) throws EncoderException { | |
134 | 4 | if (!(obj instanceof String)) { |
135 | 1 | throw new EncoderException("Parameter supplied to RefinedSoundex encode is not of type java.lang.String"); |
136 | } | |
137 | 3 | return soundex((String) obj); |
138 | } | |
139 | ||
140 | /** | |
141 | * Encodes a String using the refined soundex algorithm. | |
142 | * | |
143 | * @param str | |
144 | * A String object to encode | |
145 | * @return A Soundex code corresponding to the String supplied | |
146 | */ | |
147 | @Override | |
148 | public String encode(String str) { | |
149 | 43 | return soundex(str); |
150 | } | |
151 | ||
152 | /** | |
153 | * Returns the mapping code for a given character. The mapping codes are | |
154 | * maintained in an internal char array named soundexMapping, and the | |
155 | * default values of these mappings are US English. | |
156 | * | |
157 | * @param c | |
158 | * char to get mapping for | |
159 | * @return A character (really a numeral) to return for the given char | |
160 | */ | |
161 | char getMappingCode(char c) { | |
162 | 194 | if (!Character.isLetter(c)) { |
163 | 1 | return 0; |
164 | } | |
165 | 193 | return this.soundexMapping[Character.toUpperCase(c) - 'A']; |
166 | } | |
167 | ||
168 | /** | |
169 | * Retrieves the Refined Soundex code for a given String object. | |
170 | * | |
171 | * @param str | |
172 | * String to encode using the Refined Soundex algorithm | |
173 | * @return A soundex code for the String supplied | |
174 | */ | |
175 | public String soundex(String str) { | |
176 | 49 | if (str == null) { |
177 | 3 | return null; |
178 | } | |
179 | 46 | str = SoundexUtils.clean(str); |
180 | 46 | if (str.length() == 0) { |
181 | 7 | return str; |
182 | } | |
183 | ||
184 | 39 | StringBuilder sBuf = new StringBuilder(); |
185 | 39 | sBuf.append(str.charAt(0)); |
186 | ||
187 | char last, current; | |
188 | 39 | last = '*'; |
189 | ||
190 | 232 | for (int i = 0; i < str.length(); i++) { |
191 | ||
192 | 193 | current = getMappingCode(str.charAt(i)); |
193 | 193 | if (current == last) { |
194 | 20 | continue; |
195 | 173 | } else if (current != 0) { |
196 | 173 | sBuf.append(current); |
197 | } | |
198 | ||
199 | 173 | last = current; |
200 | ||
201 | } | |
202 | ||
203 | 39 | return sBuf.toString(); |
204 | } | |
205 | } |