Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
Metaphone |
|
| 9.333333333333334;9.333 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.commons.codec.language; | |
19 | ||
20 | import org.apache.commons.codec.EncoderException; | |
21 | import org.apache.commons.codec.StringEncoder; | |
22 | ||
23 | /** | |
24 | * Encodes a string into a Metaphone value. | |
25 | * <p> | |
26 | * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. | |
27 | * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. | |
28 | * <p> | |
29 | * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, | |
30 | * p 39.</CITE> | |
31 | * <p> | |
32 | * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl | |
33 | * <a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>. | |
34 | * They have had undocumented changes from the originally published algorithm. | |
35 | * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. | |
36 | * <p> | |
37 | * This class is conditionally thread-safe. | |
38 | * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)} | |
39 | * but is not volatile, and accesses are not synchronised. | |
40 | * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronisation | |
41 | * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} | |
42 | * after initial setup. | |
43 | * | |
44 | * @version $Id$ | |
45 | */ | |
46 | public class Metaphone implements StringEncoder { | |
47 | ||
48 | /** | |
49 | * Five values in the English language | |
50 | */ | |
51 | private static final String VOWELS = "AEIOU"; | |
52 | ||
53 | /** | |
54 | * Variable used in Metaphone algorithm | |
55 | */ | |
56 | private static final String FRONTV = "EIY"; | |
57 | ||
58 | /** | |
59 | * Variable used in Metaphone algorithm | |
60 | */ | |
61 | private static final String VARSON = "CSPTG"; | |
62 | ||
63 | /** | |
64 | * The max code length for metaphone is 4 | |
65 | */ | |
66 | 34 | private int maxCodeLen = 4; |
67 | ||
68 | /** | |
69 | * Creates an instance of the Metaphone encoder | |
70 | */ | |
71 | public Metaphone() { | |
72 | 34 | super(); |
73 | 34 | } |
74 | ||
75 | /** | |
76 | * Find the metaphone value of a String. This is similar to the | |
77 | * soundex algorithm, but better at finding similar sounding words. | |
78 | * All input is converted to upper case. | |
79 | * Limitations: Input format is expected to be a single ASCII word | |
80 | * with only characters in the A - Z range, no punctuation or numbers. | |
81 | * | |
82 | * @param txt String to find the metaphone code for | |
83 | * @return A metaphone code corresponding to the String supplied | |
84 | */ | |
85 | public String metaphone(String txt) { | |
86 | 13774 | boolean hard = false; |
87 | 13774 | if (txt == null || txt.length() == 0) { |
88 | 2 | return ""; |
89 | } | |
90 | // single character is itself | |
91 | 13772 | if (txt.length() == 1) { |
92 | 8 | return txt.toUpperCase(java.util.Locale.ENGLISH); |
93 | } | |
94 | ||
95 | 13764 | char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray(); |
96 | ||
97 | 13764 | StringBuilder local = new StringBuilder(40); // manipulate |
98 | 13764 | StringBuilder code = new StringBuilder(10); // output |
99 | // handle initial 2 characters exceptions | |
100 | 13764 | switch(inwd[0]) { |
101 | case 'K': | |
102 | case 'G': | |
103 | case 'P': /* looking for KN, etc*/ | |
104 | 3901 | if (inwd[1] == 'N') { |
105 | 17 | local.append(inwd, 1, inwd.length - 1); |
106 | } else { | |
107 | 3884 | local.append(inwd); |
108 | } | |
109 | 3884 | break; |
110 | case 'A': /* looking for AE */ | |
111 | 64 | if (inwd[1] == 'E') { |
112 | 1 | local.append(inwd, 1, inwd.length - 1); |
113 | } else { | |
114 | 63 | local.append(inwd); |
115 | } | |
116 | 63 | break; |
117 | case 'W': /* looking for WR or WH */ | |
118 | 318 | if (inwd[1] == 'R') { // WR -> R |
119 | 3 | local.append(inwd, 1, inwd.length - 1); |
120 | 3 | break; |
121 | } | |
122 | 315 | if (inwd[1] == 'H') { |
123 | 38 | local.append(inwd, 1, inwd.length - 1); |
124 | 38 | local.setCharAt(0, 'W'); // WH -> W |
125 | } else { | |
126 | 277 | local.append(inwd); |
127 | } | |
128 | 277 | break; |
129 | case 'X': /* initial X becomes S */ | |
130 | 28 | inwd[0] = 'S'; |
131 | 28 | local.append(inwd); |
132 | 28 | break; |
133 | default: | |
134 | 9453 | local.append(inwd); |
135 | } // now local has working string with initials fixed | |
136 | ||
137 | 13764 | int wdsz = local.length(); |
138 | 13764 | int n = 0; |
139 | ||
140 | while (code.length() < this.getMaxCodeLen() && | |
141 | 81745 | n < wdsz ) { // max code size of 4 works well |
142 | 67981 | char symb = local.charAt(n); |
143 | // remove duplicate letters except C | |
144 | 67981 | if (symb != 'C' && isPreviousChar( local, n, symb ) ) { |
145 | 5607 | n++; |
146 | } else { // not dup | |
147 | 62374 | switch(symb) { |
148 | case 'A': | |
149 | case 'E': | |
150 | case 'I': | |
151 | case 'O': | |
152 | case 'U': | |
153 | 30521 | if (n == 0) { |
154 | 70 | code.append(symb); |
155 | } | |
156 | break; // only use vowel if leading char | |
157 | case 'B': | |
158 | 65 | if ( isPreviousChar(local, n, 'M') && |
159 | isLastChar(wdsz, n) ) { // B is silent if word ends in MB | |
160 | 3 | break; |
161 | } | |
162 | 62 | code.append(symb); |
163 | 62 | break; |
164 | case 'C': // lots of C special cases | |
165 | /* discard if SCI, SCE or SCY */ | |
166 | 1982 | if ( isPreviousChar(local, n, 'S') && |
167 | !isLastChar(wdsz, n) && | |
168 | FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) { | |
169 | 3 | break; |
170 | } | |
171 | 1979 | if (regionMatch(local, n, "CIA")) { // "CIA" -> X |
172 | 1 | code.append('X'); |
173 | 1 | break; |
174 | } | |
175 | 1978 | if (!isLastChar(wdsz, n) && |
176 | FRONTV.indexOf(local.charAt(n + 1)) >= 0) { | |
177 | 82 | code.append('S'); |
178 | 82 | break; // CI,CE,CY -> S |
179 | } | |
180 | 1896 | if (isPreviousChar(local, n, 'S') && |
181 | isNextChar(local, n, 'H') ) { // SCH->sk | |
182 | 2 | code.append('K'); |
183 | 2 | break; |
184 | } | |
185 | 1894 | if (isNextChar(local, n, 'H')) { // detect CH |
186 | 4 | if (n == 0 && |
187 | wdsz >= 3 && | |
188 | isVowel(local,2) ) { // CH consonant -> K consonant | |
189 | 1 | code.append('K'); |
190 | } else { | |
191 | 3 | code.append('X'); // CHvowel -> X |
192 | } | |
193 | } else { | |
194 | 1890 | code.append('K'); |
195 | } | |
196 | 1890 | break; |
197 | case 'D': | |
198 | 445 | if (!isLastChar(wdsz, n + 1) && |
199 | isNextChar(local, n, 'G') && | |
200 | FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J | |
201 | 3 | code.append('J'); n += 2; |
202 | } else { | |
203 | 442 | code.append('T'); |
204 | } | |
205 | 442 | break; |
206 | case 'G': // GH silent at end or before consonant | |
207 | 1705 | if (isLastChar(wdsz, n + 1) && |
208 | isNextChar(local, n, 'H')) { | |
209 | 1 | break; |
210 | } | |
211 | 1704 | if (!isLastChar(wdsz, n + 1) && |
212 | isNextChar(local,n,'H') && | |
213 | !isVowel(local,n+2)) { | |
214 | 19 | break; |
215 | } | |
216 | 1685 | if (n > 0 && |
217 | ( regionMatch(local, n, "GN") || | |
218 | regionMatch(local, n, "GNED") ) ) { | |
219 | 0 | break; // silent G |
220 | } | |
221 | 1684 | if (isPreviousChar(local, n, 'G')) { |
222 | // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true | |
223 | 0 | hard = true; |
224 | } else { | |
225 | 1684 | hard = false; |
226 | } | |
227 | 1684 | if (!isLastChar(wdsz, n) && |
228 | FRONTV.indexOf(local.charAt(n + 1)) >= 0 && | |
229 | !hard) { | |
230 | 1547 | code.append('J'); |
231 | } else { | |
232 | 137 | code.append('K'); |
233 | } | |
234 | 137 | break; |
235 | case 'H': | |
236 | 654 | if (isLastChar(wdsz, n)) { |
237 | 205 | break; // terminal H |
238 | } | |
239 | 449 | if (n > 0 && |
240 | VARSON.indexOf(local.charAt(n - 1)) >= 0) { | |
241 | 27 | break; |
242 | } | |
243 | 422 | if (isVowel(local,n+1)) { |
244 | 1 | code.append('H'); // Hvowel |
245 | } | |
246 | break; | |
247 | case 'F': | |
248 | case 'J': | |
249 | case 'L': | |
250 | case 'M': | |
251 | case 'N': | |
252 | case 'R': | |
253 | 20128 | code.append(symb); |
254 | 20128 | break; |
255 | case 'K': | |
256 | 1963 | if (n > 0) { // not initial |
257 | 5 | if (!isPreviousChar(local, n, 'C')) { |
258 | 2 | code.append(symb); |
259 | } | |
260 | } else { | |
261 | 1958 | code.append(symb); // initial K |
262 | } | |
263 | 1958 | break; |
264 | case 'P': | |
265 | 245 | if (isNextChar(local,n,'H')) { |
266 | // PH -> F | |
267 | 1 | code.append('F'); |
268 | } else { | |
269 | 244 | code.append(symb); |
270 | } | |
271 | 244 | break; |
272 | case 'Q': | |
273 | 3 | code.append('K'); |
274 | 3 | break; |
275 | case 'S': | |
276 | 673 | if (regionMatch(local,n,"SH") || |
277 | regionMatch(local,n,"SIO") || | |
278 | regionMatch(local,n,"SIA")) { | |
279 | 4 | code.append('X'); |
280 | } else { | |
281 | 669 | code.append('S'); |
282 | } | |
283 | 669 | break; |
284 | case 'T': | |
285 | 640 | if (regionMatch(local,n,"TIA") || |
286 | regionMatch(local,n,"TIO")) { | |
287 | 2 | code.append('X'); |
288 | 2 | break; |
289 | } | |
290 | 638 | if (regionMatch(local,n,"TCH")) { |
291 | // Silent if in "TCH" | |
292 | 2 | break; |
293 | } | |
294 | // substitute numeral 0 for TH (resembles theta after all) | |
295 | 636 | if (regionMatch(local,n,"TH")) { |
296 | 2 | code.append('0'); |
297 | } else { | |
298 | 634 | code.append('T'); |
299 | } | |
300 | 634 | break; |
301 | case 'V': | |
302 | 1 | code.append('F'); break; |
303 | case 'W': | |
304 | case 'Y': // silent if not followed by vowel | |
305 | 2847 | if (!isLastChar(wdsz,n) && |
306 | isVowel(local,n+1)) { | |
307 | 314 | code.append(symb); |
308 | } | |
309 | break; | |
310 | case 'X': | |
311 | 6 | code.append('K'); |
312 | 6 | code.append('S'); |
313 | 6 | break; |
314 | case 'Z': | |
315 | 139 | code.append('S'); |
316 | 139 | break; |
317 | default: | |
318 | // do nothing | |
319 | break; | |
320 | } // end switch | |
321 | 62374 | n++; |
322 | } // end else from symb != 'C' | |
323 | 67981 | if (code.length() > this.getMaxCodeLen()) { |
324 | 2 | code.setLength(this.getMaxCodeLen()); |
325 | } | |
326 | 67981 | } |
327 | 13764 | return code.toString(); |
328 | } | |
329 | ||
330 | private boolean isVowel(StringBuilder string, int index) { | |
331 | 1132 | return VOWELS.indexOf(string.charAt(index)) >= 0; |
332 | } | |
333 | ||
334 | private boolean isPreviousChar(StringBuilder string, int index, char c) { | |
335 | 71631 | boolean matches = false; |
336 | 71631 | if( index > 0 && |
337 | index < string.length() ) { | |
338 | 54295 | matches = string.charAt(index - 1) == c; |
339 | } | |
340 | 71631 | return matches; |
341 | } | |
342 | ||
343 | private boolean isNextChar(StringBuilder string, int index, char c) { | |
344 | 4062 | boolean matches = false; |
345 | 4062 | if( index >= 0 && |
346 | index < string.length() - 1 ) { | |
347 | 4035 | matches = string.charAt(index + 1) == c; |
348 | } | |
349 | 4062 | return matches; |
350 | } | |
351 | ||
352 | private boolean regionMatch(StringBuilder string, int index, String test) { | |
353 | 6549 | boolean matches = false; |
354 | 6549 | if( index >= 0 && |
355 | index + test.length() - 1 < string.length() ) { | |
356 | 5282 | String substring = string.substring( index, index + test.length()); |
357 | 5282 | matches = substring.equals( test ); |
358 | } | |
359 | 6549 | return matches; |
360 | } | |
361 | ||
362 | private boolean isLastChar(int wdsz, int n) { | |
363 | 11025 | return n + 1 == wdsz; |
364 | } | |
365 | ||
366 | ||
367 | /** | |
368 | * Encodes an Object using the metaphone algorithm. This method | |
369 | * is provided in order to satisfy the requirements of the | |
370 | * Encoder interface, and will throw an EncoderException if the | |
371 | * supplied object is not of type java.lang.String. | |
372 | * | |
373 | * @param obj Object to encode | |
374 | * @return An object (or type java.lang.String) containing the | |
375 | * metaphone code which corresponds to the String supplied. | |
376 | * @throws EncoderException if the parameter supplied is not | |
377 | * of type java.lang.String | |
378 | */ | |
379 | @Override | |
380 | public Object encode(Object obj) throws EncoderException { | |
381 | 4 | if (!(obj instanceof String)) { |
382 | 1 | throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); |
383 | } | |
384 | 3 | return metaphone((String) obj); |
385 | } | |
386 | ||
387 | /** | |
388 | * Encodes a String using the Metaphone algorithm. | |
389 | * | |
390 | * @param str String object to encode | |
391 | * @return The metaphone code corresponding to the String supplied | |
392 | */ | |
393 | @Override | |
394 | public String encode(String str) { | |
395 | 7 | return metaphone(str); |
396 | } | |
397 | ||
398 | /** | |
399 | * Tests is the metaphones of two strings are identical. | |
400 | * | |
401 | * @param str1 First of two strings to compare | |
402 | * @param str2 Second of two strings to compare | |
403 | * @return {@code true} if the metaphones of these strings are identical, | |
404 | * {@code false} otherwise. | |
405 | */ | |
406 | public boolean isMetaphoneEqual(String str1, String str2) { | |
407 | 6862 | return metaphone(str1).equals(metaphone(str2)); |
408 | } | |
409 | ||
410 | /** | |
411 | * Returns the maxCodeLen. | |
412 | * @return int | |
413 | */ | |
414 | 149728 | public int getMaxCodeLen() { return this.maxCodeLen; } |
415 | ||
416 | /** | |
417 | * Sets the maxCodeLen. | |
418 | * @param maxCodeLen The maxCodeLen to set | |
419 | */ | |
420 | 1 | public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; } |
421 | ||
422 | } |