Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
Soundex |
|
| 2.8333333333333335;2.833 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.commons.codec.language; | |
19 | ||
20 | import org.apache.commons.codec.EncoderException; | |
21 | import org.apache.commons.codec.StringEncoder; | |
22 | ||
23 | /** | |
24 | * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a | |
25 | * general purpose scheme to find word with similar phonemes. | |
26 | * | |
27 | * This class is thread-safe. | |
28 | * Although not strictly immutable, the {@link #maxLength} field is not actually used. | |
29 | * | |
30 | * @version $Id$ | |
31 | */ | |
32 | public class Soundex implements StringEncoder { | |
33 | ||
34 | /** | |
35 | * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position | |
36 | * means do not encode. | |
37 | * <p> | |
38 | * (This constant is provided as both an implementation convenience and to allow Javadoc to pick | |
39 | * up the value for the constant values page.) | |
40 | * </p> | |
41 | * | |
42 | * @see #US_ENGLISH_MAPPING | |
43 | */ | |
44 | public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202"; | |
45 | ||
46 | /** | |
47 | * This is a default mapping of the 26 letters used in US English. A value of <code>0</code> for a letter position | |
48 | * means do not encode. | |
49 | * | |
50 | * @see Soundex#Soundex(char[]) | |
51 | */ | |
52 | 1 | private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray(); |
53 | ||
54 | /** | |
55 | * An instance of Soundex using the US_ENGLISH_MAPPING mapping. | |
56 | * | |
57 | * @see #US_ENGLISH_MAPPING | |
58 | */ | |
59 | 1 | public static final Soundex US_ENGLISH = new Soundex(); |
60 | ||
61 | /** | |
62 | * The maximum length of a Soundex code - Soundex codes are only four characters by definition. | |
63 | * | |
64 | * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. | |
65 | */ | |
66 | 33 | @Deprecated |
67 | private int maxLength = 4; | |
68 | ||
69 | /** | |
70 | * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each | |
71 | * letter is mapped. This implementation contains a default map for US_ENGLISH | |
72 | */ | |
73 | private final char[] soundexMapping; | |
74 | ||
75 | /** | |
76 | * Creates an instance using US_ENGLISH_MAPPING | |
77 | * | |
78 | * @see Soundex#Soundex(char[]) | |
79 | * @see Soundex#US_ENGLISH_MAPPING | |
80 | */ | |
81 | 31 | public Soundex() { |
82 | 31 | this.soundexMapping = US_ENGLISH_MAPPING; |
83 | 31 | } |
84 | ||
85 | /** | |
86 | * Creates a soundex instance using the given mapping. This constructor can be used to provide an internationalized | |
87 | * mapping for a non-Western character set. | |
88 | * | |
89 | * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each | |
90 | * letter is mapped. This implementation contains a default map for US_ENGLISH | |
91 | * | |
92 | * @param mapping | |
93 | * Mapping array to use when finding the corresponding code for a given character | |
94 | */ | |
95 | 1 | public Soundex(char[] mapping) { |
96 | 1 | this.soundexMapping = new char[mapping.length]; |
97 | 1 | System.arraycopy(mapping, 0, this.soundexMapping, 0, mapping.length); |
98 | 1 | } |
99 | ||
100 | /** | |
101 | * Creates a refined soundex instance using a custom mapping. This constructor can be used to customize the mapping, | |
102 | * and/or possibly provide an internationalized mapping for a non-Western character set. | |
103 | * | |
104 | * @param mapping | |
105 | * Mapping string to use when finding the corresponding code for a given character | |
106 | * @since 1.4 | |
107 | */ | |
108 | 1 | public Soundex(String mapping) { |
109 | 1 | this.soundexMapping = mapping.toCharArray(); |
110 | 1 | } |
111 | ||
112 | /** | |
113 | * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This | |
114 | * return value ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates strong similarity or | |
115 | * identical values. | |
116 | * | |
117 | * @param s1 | |
118 | * A String that will be encoded and compared. | |
119 | * @param s2 | |
120 | * A String that will be encoded and compared. | |
121 | * @return The number of characters in the two encoded Strings that are the same from 0 to 4. | |
122 | * | |
123 | * @see SoundexUtils#difference(StringEncoder,String,String) | |
124 | * @see <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS | |
125 | * T-SQL DIFFERENCE </a> | |
126 | * | |
127 | * @throws EncoderException | |
128 | * if an error occurs encoding one of the strings | |
129 | * @since 1.3 | |
130 | */ | |
131 | public int difference(String s1, String s2) throws EncoderException { | |
132 | 12 | return SoundexUtils.difference(this, s1, s2); |
133 | } | |
134 | ||
135 | /** | |
136 | * Encodes an Object using the soundex algorithm. This method is provided in order to satisfy the requirements of | |
137 | * the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String. | |
138 | * | |
139 | * @param obj | |
140 | * Object to encode | |
141 | * @return An object (or type java.lang.String) containing the soundex code which corresponds to the String | |
142 | * supplied. | |
143 | * @throws EncoderException | |
144 | * if the parameter supplied is not of type java.lang.String | |
145 | * @throws IllegalArgumentException | |
146 | * if a character is not mapped | |
147 | */ | |
148 | @Override | |
149 | public Object encode(Object obj) throws EncoderException { | |
150 | 6 | if (!(obj instanceof String)) { |
151 | 1 | throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String"); |
152 | } | |
153 | 5 | return soundex((String) obj); |
154 | } | |
155 | ||
156 | /** | |
157 | * Encodes a String using the soundex algorithm. | |
158 | * | |
159 | * @param str | |
160 | * A String object to encode | |
161 | * @return A Soundex code corresponding to the String supplied | |
162 | * @throws IllegalArgumentException | |
163 | * if a character is not mapped | |
164 | */ | |
165 | @Override | |
166 | public String encode(String str) { | |
167 | 175 | return soundex(str); |
168 | } | |
169 | ||
170 | /** | |
171 | * Used internally by the SoundEx algorithm. | |
172 | * | |
173 | * Consonants from the same code group separated by W or H are treated as one. | |
174 | * | |
175 | * @param str | |
176 | * the cleaned working string to encode (in upper case). | |
177 | * @param index | |
178 | * the character position to encode | |
179 | * @return Mapping code for a particular character | |
180 | * @throws IllegalArgumentException | |
181 | * if the character is not mapped | |
182 | */ | |
183 | private char getMappingCode(String str, int index) { | |
184 | // map() throws IllegalArgumentException | |
185 | 971 | char mappedChar = this.map(str.charAt(index)); |
186 | // HW rule check | |
187 | 969 | if (index > 1 && mappedChar != '0') { |
188 | 367 | char hwChar = str.charAt(index - 1); |
189 | 367 | if ('H' == hwChar || 'W' == hwChar) { |
190 | 10 | char preHWChar = str.charAt(index - 2); |
191 | 10 | char firstCode = this.map(preHWChar); |
192 | 10 | if (firstCode == mappedChar || 'H' == preHWChar || 'W' == preHWChar) { |
193 | 4 | return 0; |
194 | } | |
195 | } | |
196 | } | |
197 | 965 | return mappedChar; |
198 | } | |
199 | ||
200 | /** | |
201 | * Returns the maxLength. Standard Soundex | |
202 | * | |
203 | * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. | |
204 | * @return int | |
205 | */ | |
206 | @Deprecated | |
207 | public int getMaxLength() { | |
208 | 0 | return this.maxLength; |
209 | } | |
210 | ||
211 | /** | |
212 | * Returns the soundex mapping. | |
213 | * | |
214 | * @return soundexMapping. | |
215 | */ | |
216 | private char[] getSoundexMapping() { | |
217 | 1960 | return this.soundexMapping; |
218 | } | |
219 | ||
220 | /** | |
221 | * Maps the given upper-case character to its Soundex code. | |
222 | * | |
223 | * @param ch | |
224 | * An upper-case character. | |
225 | * @return A Soundex code. | |
226 | * @throws IllegalArgumentException | |
227 | * Thrown if <code>ch</code> is not mapped. | |
228 | */ | |
229 | private char map(char ch) { | |
230 | 981 | int index = ch - 'A'; |
231 | 981 | if (index < 0 || index >= this.getSoundexMapping().length) { |
232 | 2 | throw new IllegalArgumentException("The character is not mapped: " + ch); |
233 | } | |
234 | 979 | return this.getSoundexMapping()[index]; |
235 | } | |
236 | ||
237 | /** | |
238 | * Sets the maxLength. | |
239 | * | |
240 | * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0. | |
241 | * @param maxLength | |
242 | * The maxLength to set | |
243 | */ | |
244 | @Deprecated | |
245 | public void setMaxLength(int maxLength) { | |
246 | 0 | this.maxLength = maxLength; |
247 | 0 | } |
248 | ||
249 | /** | |
250 | * Retrieves the Soundex code for a given String object. | |
251 | * | |
252 | * @param str | |
253 | * String to encode using the Soundex algorithm | |
254 | * @return A soundex code for the String supplied | |
255 | * @throws IllegalArgumentException | |
256 | * if a character is not mapped | |
257 | */ | |
258 | public String soundex(String str) { | |
259 | 184 | if (str == null) { |
260 | 3 | return null; |
261 | } | |
262 | 181 | str = SoundexUtils.clean(str); |
263 | 181 | if (str.length() == 0) { |
264 | 7 | return str; |
265 | } | |
266 | 174 | char out[] = {'0', '0', '0', '0'}; |
267 | char last, mapped; | |
268 | 174 | int incount = 1, count = 1; |
269 | 174 | out[0] = str.charAt(0); |
270 | // getMappingCode() throws IllegalArgumentException | |
271 | 174 | last = getMappingCode(str, 0); |
272 | 969 | while (incount < str.length() && count < out.length) { |
273 | 797 | mapped = getMappingCode(str, incount++); |
274 | 797 | if (mapped != 0) { |
275 | 793 | if (mapped != '0' && mapped != last) { |
276 | 383 | out[count++] = mapped; |
277 | } | |
278 | 793 | last = mapped; |
279 | } | |
280 | } | |
281 | 172 | return new String(out); |
282 | } | |
283 | ||
284 | } |