1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.commons.codec.language; |
19 | |
|
20 | |
import org.apache.commons.codec.EncoderException; |
21 | |
import org.apache.commons.codec.StringEncoder; |
22 | |
|
23 | |
|
24 | |
|
25 | |
|
26 | |
|
27 | |
|
28 | |
|
29 | |
|
30 | |
|
31 | |
|
32 | |
|
33 | |
|
34 | |
|
35 | |
|
36 | |
|
37 | |
public class DoubleMetaphone implements StringEncoder { |
38 | |
|
39 | |
|
40 | |
|
41 | |
|
42 | |
private static final String VOWELS = "AEIOUY"; |
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | 1 | private static final String[] SILENT_START = |
48 | |
{ "GN", "KN", "PN", "WR", "PS" }; |
49 | 1 | private static final String[] L_R_N_M_B_H_F_V_W_SPACE = |
50 | |
{ "L", "R", "N", "M", "B", "H", "F", "V", "W", " " }; |
51 | 1 | private static final String[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER = |
52 | |
{ "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" }; |
53 | 1 | private static final String[] L_T_K_S_N_M_B_Z = |
54 | |
{ "L", "T", "K", "S", "N", "M", "B", "Z" }; |
55 | |
|
56 | |
|
57 | |
|
58 | |
|
59 | 24 | private int maxCodeLen = 4; |
60 | |
|
61 | |
|
62 | |
|
63 | |
|
64 | |
public DoubleMetaphone() { |
65 | 24 | super(); |
66 | 24 | } |
67 | |
|
68 | |
|
69 | |
|
70 | |
|
71 | |
|
72 | |
|
73 | |
|
74 | |
public String doubleMetaphone(final String value) { |
75 | 82 | return doubleMetaphone(value, false); |
76 | |
} |
77 | |
|
78 | |
|
79 | |
|
80 | |
|
81 | |
|
82 | |
|
83 | |
|
84 | |
|
85 | |
public String doubleMetaphone(String value, final boolean alternate) { |
86 | 6504 | value = cleanInput(value); |
87 | 6504 | if (value == null) { |
88 | 12 | return null; |
89 | |
} |
90 | |
|
91 | 6492 | final boolean slavoGermanic = isSlavoGermanic(value); |
92 | 6492 | int index = isSilentStart(value) ? 1 : 0; |
93 | |
|
94 | 6492 | final DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.getMaxCodeLen()); |
95 | |
|
96 | 42088 | while (!result.isComplete() && index <= value.length() - 1) { |
97 | 35596 | switch (value.charAt(index)) { |
98 | |
case 'A': |
99 | |
case 'E': |
100 | |
case 'I': |
101 | |
case 'O': |
102 | |
case 'U': |
103 | |
case 'Y': |
104 | 13681 | index = handleAEIOUY(result, index); |
105 | 13681 | break; |
106 | |
case 'B': |
107 | 884 | result.append('P'); |
108 | 884 | index = charAt(value, index + 1) == 'B' ? index + 2 : index + 1; |
109 | 884 | break; |
110 | |
case '\u00C7': |
111 | |
|
112 | 1 | result.append('S'); |
113 | 1 | index++; |
114 | 1 | break; |
115 | |
case 'C': |
116 | 1680 | index = handleC(value, result, index); |
117 | 1680 | break; |
118 | |
case 'D': |
119 | 1238 | index = handleD(value, result, index); |
120 | 1238 | break; |
121 | |
case 'F': |
122 | 646 | result.append('F'); |
123 | 646 | index = charAt(value, index + 1) == 'F' ? index + 2 : index + 1; |
124 | 646 | break; |
125 | |
case 'G': |
126 | 801 | index = handleG(value, result, index, slavoGermanic); |
127 | 801 | break; |
128 | |
case 'H': |
129 | 521 | index = handleH(value, result, index); |
130 | 521 | break; |
131 | |
case 'J': |
132 | 87 | index = handleJ(value, result, index, slavoGermanic); |
133 | 87 | break; |
134 | |
case 'K': |
135 | 324 | result.append('K'); |
136 | 324 | index = charAt(value, index + 1) == 'K' ? index + 2 : index + 1; |
137 | 324 | break; |
138 | |
case 'L': |
139 | 1797 | index = handleL(value, result, index); |
140 | 1797 | break; |
141 | |
case 'M': |
142 | 1241 | result.append('M'); |
143 | 1241 | index = conditionM0(value, index) ? index + 2 : index + 1; |
144 | 1241 | break; |
145 | |
case 'N': |
146 | 2777 | result.append('N'); |
147 | 2777 | index = charAt(value, index + 1) == 'N' ? index + 2 : index + 1; |
148 | 2777 | break; |
149 | |
case '\u00D1': |
150 | |
|
151 | 1 | result.append('N'); |
152 | 1 | index++; |
153 | 1 | break; |
154 | |
case 'P': |
155 | 1144 | index = handleP(value, result, index); |
156 | 1144 | break; |
157 | |
case 'Q': |
158 | 80 | result.append('K'); |
159 | 80 | index = charAt(value, index + 1) == 'Q' ? index + 2 : index + 1; |
160 | 80 | break; |
161 | |
case 'R': |
162 | 2737 | index = handleR(value, result, index, slavoGermanic); |
163 | 2737 | break; |
164 | |
case 'S': |
165 | 2151 | index = handleS(value, result, index, slavoGermanic); |
166 | 2151 | break; |
167 | |
case 'T': |
168 | 2224 | index = handleT(value, result, index); |
169 | 2224 | break; |
170 | |
case 'V': |
171 | 406 | result.append('F'); |
172 | 406 | index = charAt(value, index + 1) == 'V' ? index + 2 : index + 1; |
173 | 406 | break; |
174 | |
case 'W': |
175 | 519 | index = handleW(value, result, index); |
176 | 519 | break; |
177 | |
case 'X': |
178 | 152 | index = handleX(value, result, index); |
179 | 152 | break; |
180 | |
case 'Z': |
181 | 97 | index = handleZ(value, result, index, slavoGermanic); |
182 | 97 | break; |
183 | |
default: |
184 | 407 | index++; |
185 | 407 | break; |
186 | |
} |
187 | |
} |
188 | |
|
189 | 6492 | return alternate ? result.getAlternate() : result.getPrimary(); |
190 | |
} |
191 | |
|
192 | |
|
193 | |
|
194 | |
|
195 | |
|
196 | |
|
197 | |
|
198 | |
|
199 | |
|
200 | |
@Override |
201 | |
public Object encode(final Object obj) throws EncoderException { |
202 | 35 | if (!(obj instanceof String)) { |
203 | 3 | throw new EncoderException("DoubleMetaphone encode parameter is not of type String"); |
204 | |
} |
205 | 32 | return doubleMetaphone((String) obj); |
206 | |
} |
207 | |
|
208 | |
|
209 | |
|
210 | |
|
211 | |
|
212 | |
|
213 | |
|
214 | |
@Override |
215 | |
public String encode(final String value) { |
216 | 30 | return doubleMetaphone(value); |
217 | |
} |
218 | |
|
219 | |
|
220 | |
|
221 | |
|
222 | |
|
223 | |
|
224 | |
|
225 | |
|
226 | |
|
227 | |
|
228 | |
|
229 | |
public boolean isDoubleMetaphoneEqual(final String value1, final String value2) { |
230 | 22 | return isDoubleMetaphoneEqual(value1, value2, false); |
231 | |
} |
232 | |
|
233 | |
|
234 | |
|
235 | |
|
236 | |
|
237 | |
|
238 | |
|
239 | |
|
240 | |
|
241 | |
|
242 | |
|
243 | |
public boolean isDoubleMetaphoneEqual(final String value1, final String value2, final boolean alternate) { |
244 | 1970 | return doubleMetaphone(value1, alternate).equals(doubleMetaphone(value2, alternate)); |
245 | |
} |
246 | |
|
247 | |
|
248 | |
|
249 | |
|
250 | |
|
251 | |
public int getMaxCodeLen() { |
252 | 19478 | return this.maxCodeLen; |
253 | |
} |
254 | |
|
255 | |
|
256 | |
|
257 | |
|
258 | |
|
259 | |
public void setMaxCodeLen(final int maxCodeLen) { |
260 | 1 | this.maxCodeLen = maxCodeLen; |
261 | 1 | } |
262 | |
|
263 | |
|
264 | |
|
265 | |
|
266 | |
|
267 | |
|
268 | |
private int handleAEIOUY(final DoubleMetaphoneResult result, final int index) { |
269 | 13681 | if (index == 0) { |
270 | 1515 | result.append('A'); |
271 | |
} |
272 | 13681 | return index + 1; |
273 | |
} |
274 | |
|
275 | |
|
276 | |
|
277 | |
|
278 | |
private int handleC(final String value, final DoubleMetaphoneResult result, int index) { |
279 | 1680 | if (conditionC0(value, index)) { |
280 | 16 | result.append('K'); |
281 | 16 | index += 2; |
282 | 1664 | } else if (index == 0 && contains(value, index, 6, "CAESAR")) { |
283 | 6 | result.append('S'); |
284 | 6 | index += 2; |
285 | 1658 | } else if (contains(value, index, 2, "CH")) { |
286 | 156 | index = handleCH(value, result, index); |
287 | 1502 | } else if (contains(value, index, 2, "CZ") && |
288 | |
!contains(value, index - 2, 4, "WICZ")) { |
289 | |
|
290 | 7 | result.append('S', 'X'); |
291 | 7 | index += 2; |
292 | 1495 | } else if (contains(value, index + 1, 3, "CIA")) { |
293 | |
|
294 | 2 | result.append('X'); |
295 | 2 | index += 3; |
296 | 1493 | } else if (contains(value, index, 2, "CC") && |
297 | |
!(index == 1 && charAt(value, 0) == 'M')) { |
298 | |
|
299 | 109 | return handleCC(value, result, index); |
300 | 1384 | } else if (contains(value, index, 2, "CK", "CG", "CQ")) { |
301 | 111 | result.append('K'); |
302 | 111 | index += 2; |
303 | 1273 | } else if (contains(value, index, 2, "CI", "CE", "CY")) { |
304 | |
|
305 | 286 | if (contains(value, index, 3, "CIO", "CIE", "CIA")) { |
306 | 46 | result.append('S', 'X'); |
307 | |
} else { |
308 | 240 | result.append('S'); |
309 | |
} |
310 | 286 | index += 2; |
311 | |
} else { |
312 | 987 | result.append('K'); |
313 | 987 | if (contains(value, index + 1, 2, " C", " Q", " G")) { |
314 | |
|
315 | 4 | index += 3; |
316 | 983 | } else if (contains(value, index + 1, 1, "C", "K", "Q") && |
317 | |
!contains(value, index + 1, 2, "CE", "CI")) { |
318 | 9 | index += 2; |
319 | |
} else { |
320 | 974 | index++; |
321 | |
} |
322 | |
} |
323 | |
|
324 | 1571 | return index; |
325 | |
} |
326 | |
|
327 | |
|
328 | |
|
329 | |
|
330 | |
private int handleCC(final String value, final DoubleMetaphoneResult result, int index) { |
331 | 109 | if (contains(value, index + 2, 1, "I", "E", "H") && |
332 | |
!contains(value, index + 2, 2, "HU")) { |
333 | |
|
334 | 22 | if ((index == 1 && charAt(value, index - 1) == 'A') || |
335 | |
contains(value, index - 1, 5, "UCCEE", "UCCES")) { |
336 | |
|
337 | 14 | result.append("KS"); |
338 | |
} else { |
339 | |
|
340 | 8 | result.append('X'); |
341 | |
} |
342 | 22 | index += 3; |
343 | |
} else { |
344 | 87 | result.append('K'); |
345 | 87 | index += 2; |
346 | |
} |
347 | |
|
348 | 109 | return index; |
349 | |
} |
350 | |
|
351 | |
|
352 | |
|
353 | |
|
354 | |
private int handleCH(final String value, final DoubleMetaphoneResult result, final int index) { |
355 | 156 | if (index > 0 && contains(value, index, 4, "CHAE")) { |
356 | 0 | result.append('K', 'X'); |
357 | 0 | return index + 2; |
358 | 156 | } else if (conditionCH0(value, index)) { |
359 | |
|
360 | 4 | result.append('K'); |
361 | 4 | return index + 2; |
362 | 152 | } else if (conditionCH1(value, index)) { |
363 | |
|
364 | 34 | result.append('K'); |
365 | 34 | return index + 2; |
366 | |
} else { |
367 | 118 | if (index > 0) { |
368 | 82 | if (contains(value, 0, 2, "MC")) { |
369 | 2 | result.append('K'); |
370 | |
} else { |
371 | 80 | result.append('X', 'K'); |
372 | |
} |
373 | |
} else { |
374 | 36 | result.append('X'); |
375 | |
} |
376 | 118 | return index + 2; |
377 | |
} |
378 | |
} |
379 | |
|
380 | |
|
381 | |
|
382 | |
|
383 | |
private int handleD(final String value, final DoubleMetaphoneResult result, int index) { |
384 | 1238 | if (contains(value, index, 2, "DG")) { |
385 | |
|
386 | 10 | if (contains(value, index + 2, 1, "I", "E", "Y")) { |
387 | 4 | result.append('J'); |
388 | 4 | index += 3; |
389 | |
|
390 | |
} else { |
391 | 6 | result.append("TK"); |
392 | 6 | index += 2; |
393 | |
} |
394 | 1228 | } else if (contains(value, index, 2, "DT", "DD")) { |
395 | 38 | result.append('T'); |
396 | 38 | index += 2; |
397 | |
} else { |
398 | 1190 | result.append('T'); |
399 | 1190 | index++; |
400 | |
} |
401 | 1238 | return index; |
402 | |
} |
403 | |
|
404 | |
|
405 | |
|
406 | |
|
407 | |
private int handleG(final String value, final DoubleMetaphoneResult result, int index, |
408 | |
final boolean slavoGermanic) { |
409 | 801 | if (charAt(value, index + 1) == 'H') { |
410 | 106 | index = handleGH(value, result, index); |
411 | 695 | } else if (charAt(value, index + 1) == 'N') { |
412 | 26 | if (index == 1 && isVowel(charAt(value, 0)) && !slavoGermanic) { |
413 | 0 | result.append("KN", "N"); |
414 | 26 | } else if (!contains(value, index + 2, 2, "EY") && |
415 | |
charAt(value, index + 1) != 'Y' && !slavoGermanic) { |
416 | 22 | result.append("N", "KN"); |
417 | |
} else { |
418 | 4 | result.append("KN"); |
419 | |
} |
420 | 26 | index = index + 2; |
421 | 669 | } else if (contains(value, index + 1, 2, "LI") && !slavoGermanic) { |
422 | 4 | result.append("KL", "L"); |
423 | 4 | index += 2; |
424 | 665 | } else if (index == 0 && |
425 | |
(charAt(value, index + 1) == 'Y' || |
426 | |
contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER))) { |
427 | |
|
428 | 16 | result.append('K', 'J'); |
429 | 16 | index += 2; |
430 | 649 | } else if ((contains(value, index + 1, 2, "ER") || |
431 | |
charAt(value, index + 1) == 'Y') && |
432 | |
!contains(value, 0, 6, "DANGER", "RANGER", "MANGER") && |
433 | |
!contains(value, index - 1, 1, "E", "I") && |
434 | |
!contains(value, index - 1, 3, "RGY", "OGY")) { |
435 | |
|
436 | 22 | result.append('K', 'J'); |
437 | 22 | index += 2; |
438 | 627 | } else if (contains(value, index + 1, 1, "E", "I", "Y") || |
439 | |
contains(value, index - 1, 4, "AGGI", "OGGI")) { |
440 | |
|
441 | 182 | if (contains(value, 0 ,4, "VAN ", "VON ") || |
442 | |
contains(value, 0, 3, "SCH") || |
443 | |
contains(value, index + 1, 2, "ET")) { |
444 | |
|
445 | 2 | result.append('K'); |
446 | 180 | } else if (contains(value, index + 1, 3, "IER")) { |
447 | 4 | result.append('J'); |
448 | |
} else { |
449 | 176 | result.append('J', 'K'); |
450 | |
} |
451 | 182 | index += 2; |
452 | 445 | } else if (charAt(value, index + 1) == 'G') { |
453 | 34 | index += 2; |
454 | 34 | result.append('K'); |
455 | |
} else { |
456 | 411 | index++; |
457 | 411 | result.append('K'); |
458 | |
} |
459 | 801 | return index; |
460 | |
} |
461 | |
|
462 | |
|
463 | |
|
464 | |
|
465 | |
private int handleGH(final String value, final DoubleMetaphoneResult result, int index) { |
466 | 106 | if (index > 0 && !isVowel(charAt(value, index - 1))) { |
467 | 4 | result.append('K'); |
468 | 4 | index += 2; |
469 | 102 | } else if (index == 0) { |
470 | 8 | if (charAt(value, index + 2) == 'I') { |
471 | 4 | result.append('J'); |
472 | |
} else { |
473 | 4 | result.append('K'); |
474 | |
} |
475 | 8 | index += 2; |
476 | 94 | } else if ((index > 1 && contains(value, index - 2, 1, "B", "H", "D")) || |
477 | |
(index > 2 && contains(value, index - 3, 1, "B", "H", "D")) || |
478 | |
(index > 3 && contains(value, index - 4, 1, "B", "H"))) { |
479 | |
|
480 | 28 | index += 2; |
481 | |
} else { |
482 | 66 | if (index > 2 && charAt(value, index - 1) == 'U' && |
483 | |
contains(value, index - 3, 1, "C", "G", "L", "R", "T")) { |
484 | |
|
485 | 22 | result.append('F'); |
486 | 44 | } else if (index > 0 && charAt(value, index - 1) != 'I') { |
487 | 8 | result.append('K'); |
488 | |
} |
489 | 66 | index += 2; |
490 | |
} |
491 | 106 | return index; |
492 | |
} |
493 | |
|
494 | |
|
495 | |
|
496 | |
|
497 | |
private int handleH(final String value, final DoubleMetaphoneResult result, int index) { |
498 | |
|
499 | 521 | if ((index == 0 || isVowel(charAt(value, index - 1))) && |
500 | |
isVowel(charAt(value, index + 1))) { |
501 | 387 | result.append('H'); |
502 | 387 | index += 2; |
503 | |
|
504 | |
} else { |
505 | 134 | index++; |
506 | |
} |
507 | 521 | return index; |
508 | |
} |
509 | |
|
510 | |
|
511 | |
|
512 | |
|
513 | |
private int handleJ(final String value, final DoubleMetaphoneResult result, int index, |
514 | |
final boolean slavoGermanic) { |
515 | 87 | if (contains(value, index, 4, "JOSE") || contains(value, 0, 4, "SAN ")) { |
516 | |
|
517 | 11 | if ((index == 0 && (charAt(value, index + 4) == ' ') || |
518 | |
value.length() == 4) || contains(value, 0, 4, "SAN ")) { |
519 | 9 | result.append('H'); |
520 | |
} else { |
521 | 2 | result.append('J', 'H'); |
522 | |
} |
523 | 11 | index++; |
524 | |
} else { |
525 | 76 | if (index == 0 && !contains(value, index, 4, "JOSE")) { |
526 | 48 | result.append('J', 'A'); |
527 | 28 | } else if (isVowel(charAt(value, index - 1)) && !slavoGermanic && |
528 | |
(charAt(value, index + 1) == 'A' || charAt(value, index + 1) == 'O')) { |
529 | 10 | result.append('J', 'H'); |
530 | 18 | } else if (index == value.length() - 1) { |
531 | 0 | result.append('J', ' '); |
532 | 18 | } else if (!contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) && |
533 | |
!contains(value, index - 1, 1, "S", "K", "L")) { |
534 | 18 | result.append('J'); |
535 | |
} |
536 | |
|
537 | 76 | if (charAt(value, index + 1) == 'J') { |
538 | 0 | index += 2; |
539 | |
} else { |
540 | 76 | index++; |
541 | |
} |
542 | |
} |
543 | 87 | return index; |
544 | |
} |
545 | |
|
546 | |
|
547 | |
|
548 | |
|
549 | |
private int handleL(final String value, final DoubleMetaphoneResult result, int index) { |
550 | 1797 | if (charAt(value, index + 1) == 'L') { |
551 | 353 | if (conditionL0(value, index)) { |
552 | 4 | result.appendPrimary('L'); |
553 | |
} else { |
554 | 349 | result.append('L'); |
555 | |
} |
556 | 353 | index += 2; |
557 | |
} else { |
558 | 1444 | index++; |
559 | 1444 | result.append('L'); |
560 | |
} |
561 | 1797 | return index; |
562 | |
} |
563 | |
|
564 | |
|
565 | |
|
566 | |
|
567 | |
private int handleP(final String value, final DoubleMetaphoneResult result, int index) { |
568 | 1144 | if (charAt(value, index + 1) == 'H') { |
569 | 82 | result.append('F'); |
570 | 82 | index += 2; |
571 | |
} else { |
572 | 1062 | result.append('P'); |
573 | 1062 | index = contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1; |
574 | |
} |
575 | 1144 | return index; |
576 | |
} |
577 | |
|
578 | |
|
579 | |
|
580 | |
|
581 | |
private int handleR(final String value, final DoubleMetaphoneResult result, final int index, |
582 | |
final boolean slavoGermanic) { |
583 | 2737 | if (index == value.length() - 1 && !slavoGermanic && |
584 | |
contains(value, index - 2, 2, "IE") && |
585 | |
!contains(value, index - 4, 2, "ME", "MA")) { |
586 | 12 | result.appendAlternate('R'); |
587 | |
} else { |
588 | 2725 | result.append('R'); |
589 | |
} |
590 | 2737 | return charAt(value, index + 1) == 'R' ? index + 2 : index + 1; |
591 | |
} |
592 | |
|
593 | |
|
594 | |
|
595 | |
|
596 | |
private int handleS(final String value, final DoubleMetaphoneResult result, int index, |
597 | |
final boolean slavoGermanic) { |
598 | 2151 | if (contains(value, index - 1, 3, "ISL", "YSL")) { |
599 | |
|
600 | 12 | index++; |
601 | 2139 | } else if (index == 0 && contains(value, index, 5, "SUGAR")) { |
602 | |
|
603 | 4 | result.append('X', 'S'); |
604 | 4 | index++; |
605 | 2135 | } else if (contains(value, index, 2, "SH")) { |
606 | 78 | if (contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ")) { |
607 | |
|
608 | 6 | result.append('S'); |
609 | |
} else { |
610 | 72 | result.append('X'); |
611 | |
} |
612 | 78 | index += 2; |
613 | 2057 | } else if (contains(value, index, 3, "SIO", "SIA") || contains(value, index, 4, "SIAN")) { |
614 | |
|
615 | 50 | if (slavoGermanic) { |
616 | 0 | result.append('S'); |
617 | |
} else { |
618 | 50 | result.append('S', 'X'); |
619 | |
} |
620 | 50 | index += 3; |
621 | 2007 | } else if ((index == 0 && contains(value, index + 1, 1, "M", "N", "L", "W")) || |
622 | |
contains(value, index + 1, 1, "Z")) { |
623 | |
|
624 | |
|
625 | |
|
626 | |
|
627 | 48 | result.append('S', 'X'); |
628 | 48 | index = contains(value, index + 1, 1, "Z") ? index + 2 : index + 1; |
629 | 1959 | } else if (contains(value, index, 2, "SC")) { |
630 | 114 | index = handleSC(value, result, index); |
631 | |
} else { |
632 | 1845 | if (index == value.length() - 1 && contains(value, index - 2, 2, "AI", "OI")) { |
633 | |
|
634 | 4 | result.appendAlternate('S'); |
635 | |
} else { |
636 | 1841 | result.append('S'); |
637 | |
} |
638 | 1845 | index = contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1; |
639 | |
} |
640 | 2151 | return index; |
641 | |
} |
642 | |
|
643 | |
|
644 | |
|
645 | |
|
646 | |
private int handleSC(final String value, final DoubleMetaphoneResult result, final int index) { |
647 | 114 | if (charAt(value, index + 2) == 'H') { |
648 | |
|
649 | 38 | if (contains(value, index + 3, 2, "OO", "ER", "EN", "UY", "ED", "EM")) { |
650 | |
|
651 | 10 | if (contains(value, index + 3, 2, "ER", "EN")) { |
652 | |
|
653 | 6 | result.append("X", "SK"); |
654 | |
} else { |
655 | 4 | result.append("SK"); |
656 | |
} |
657 | |
} else { |
658 | 28 | if (index == 0 && !isVowel(charAt(value, 3)) && charAt(value, 3) != 'W') { |
659 | 10 | result.append('X', 'S'); |
660 | |
} else { |
661 | 18 | result.append('X'); |
662 | |
} |
663 | |
} |
664 | 76 | } else if (contains(value, index + 2, 1, "I", "E", "Y")) { |
665 | 12 | result.append('S'); |
666 | |
} else { |
667 | 64 | result.append("SK"); |
668 | |
} |
669 | 114 | return index + 3; |
670 | |
} |
671 | |
|
672 | |
|
673 | |
|
674 | |
|
675 | |
private int handleT(final String value, final DoubleMetaphoneResult result, int index) { |
676 | 2224 | if (contains(value, index, 4, "TION")) { |
677 | 52 | result.append('X'); |
678 | 52 | index += 3; |
679 | 2172 | } else if (contains(value, index, 3, "TIA", "TCH")) { |
680 | 17 | result.append('X'); |
681 | 17 | index += 3; |
682 | 2155 | } else if (contains(value, index, 2, "TH") || contains(value, index, 3, "TTH")) { |
683 | 168 | if (contains(value, index + 2, 2, "OM", "AM") || |
684 | |
|
685 | |
contains(value, 0, 4, "VAN ", "VON ") || |
686 | |
contains(value, 0, 3, "SCH")) { |
687 | 10 | result.append('T'); |
688 | |
} else { |
689 | 158 | result.append('0', 'T'); |
690 | |
} |
691 | 168 | index += 2; |
692 | |
} else { |
693 | 1987 | result.append('T'); |
694 | 1987 | index = contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1; |
695 | |
} |
696 | 2224 | return index; |
697 | |
} |
698 | |
|
699 | |
|
700 | |
|
701 | |
|
702 | |
private int handleW(final String value, final DoubleMetaphoneResult result, int index) { |
703 | 519 | if (contains(value, index, 2, "WR")) { |
704 | |
|
705 | 12 | result.append('R'); |
706 | 12 | index += 2; |
707 | |
} else { |
708 | 507 | if (index == 0 && (isVowel(charAt(value, index + 1)) || |
709 | |
contains(value, index, 2, "WH"))) { |
710 | 216 | if (isVowel(charAt(value, index + 1))) { |
711 | |
|
712 | 186 | result.append('A', 'F'); |
713 | |
} else { |
714 | |
|
715 | 30 | result.append('A'); |
716 | |
} |
717 | 216 | index++; |
718 | 291 | } else if ((index == value.length() - 1 && isVowel(charAt(value, index - 1))) || |
719 | |
contains(value, index - 1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") || |
720 | |
contains(value, 0, 3, "SCH")) { |
721 | |
|
722 | 46 | result.appendAlternate('F'); |
723 | 46 | index++; |
724 | 245 | } else if (contains(value, index, 4, "WICZ", "WITZ")) { |
725 | |
|
726 | 16 | result.append("TS", "FX"); |
727 | 16 | index += 4; |
728 | |
} else { |
729 | 229 | index++; |
730 | |
} |
731 | |
} |
732 | 519 | return index; |
733 | |
} |
734 | |
|
735 | |
|
736 | |
|
737 | |
|
738 | |
private int handleX(final String value, final DoubleMetaphoneResult result, int index) { |
739 | 152 | if (index == 0) { |
740 | 5 | result.append('S'); |
741 | 5 | index++; |
742 | |
} else { |
743 | 147 | if (!((index == value.length() - 1) && |
744 | |
(contains(value, index - 3, 3, "IAU", "EAU") || |
745 | |
contains(value, index - 2, 2, "AU", "OU")))) { |
746 | |
|
747 | 141 | result.append("KS"); |
748 | |
} |
749 | 147 | index = contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1; |
750 | |
} |
751 | 152 | return index; |
752 | |
} |
753 | |
|
754 | |
|
755 | |
|
756 | |
|
757 | |
private int handleZ(final String value, final DoubleMetaphoneResult result, int index, |
758 | |
final boolean slavoGermanic) { |
759 | 97 | if (charAt(value, index + 1) == 'H') { |
760 | |
|
761 | 2 | result.append('J'); |
762 | 2 | index += 2; |
763 | |
} else { |
764 | 95 | if (contains(value, index + 1, 2, "ZO", "ZI", "ZA") || |
765 | |
(slavoGermanic && (index > 0 && charAt(value, index - 1) != 'T'))) { |
766 | 12 | result.append("S", "TS"); |
767 | |
} else { |
768 | 83 | result.append('S'); |
769 | |
} |
770 | 95 | index = charAt(value, index + 1) == 'Z' ? index + 2 : index + 1; |
771 | |
} |
772 | 97 | return index; |
773 | |
} |
774 | |
|
775 | |
|
776 | |
|
777 | |
|
778 | |
|
779 | |
|
780 | |
private boolean conditionC0(final String value, final int index) { |
781 | 1680 | if (contains(value, index, 4, "CHIA")) { |
782 | 2 | return true; |
783 | 1678 | } else if (index <= 1) { |
784 | 680 | return false; |
785 | 998 | } else if (isVowel(charAt(value, index - 2))) { |
786 | 357 | return false; |
787 | 641 | } else if (!contains(value, index - 1, 3, "ACH")) { |
788 | 621 | return false; |
789 | |
} else { |
790 | 20 | final char c = charAt(value, index + 2); |
791 | 20 | return (c != 'I' && c != 'E') || |
792 | |
contains(value, index - 2, 6, "BACHER", "MACHER"); |
793 | |
} |
794 | |
} |
795 | |
|
796 | |
|
797 | |
|
798 | |
|
799 | |
private boolean conditionCH0(final String value, final int index) { |
800 | 156 | if (index != 0) { |
801 | 114 | return false; |
802 | 42 | } else if (!contains(value, index + 1, 5, "HARAC", "HARIS") && |
803 | |
!contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM")) { |
804 | 38 | return false; |
805 | 4 | } else if (contains(value, 0, 5, "CHORE")) { |
806 | 0 | return false; |
807 | |
} else { |
808 | 4 | return true; |
809 | |
} |
810 | |
} |
811 | |
|
812 | |
|
813 | |
|
814 | |
|
815 | |
private boolean conditionCH1(final String value, final int index) { |
816 | 152 | return ((contains(value, 0, 4, "VAN ", "VON ") || contains(value, 0, 3, "SCH")) || |
817 | |
contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") || |
818 | |
contains(value, index + 2, 1, "T", "S") || |
819 | |
((contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) && |
820 | |
(contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.length() - 1))); |
821 | |
} |
822 | |
|
823 | |
|
824 | |
|
825 | |
|
826 | |
private boolean conditionL0(final String value, final int index) { |
827 | 353 | if (index == value.length() - 3 && |
828 | |
contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE")) { |
829 | 2 | return true; |
830 | 351 | } else if ((contains(value, value.length() - 2, 2, "AS", "OS") || |
831 | |
contains(value, value.length() - 1, 1, "A", "O")) && |
832 | |
contains(value, index - 1, 4, "ALLE")) { |
833 | 2 | return true; |
834 | |
} else { |
835 | 349 | return false; |
836 | |
} |
837 | |
} |
838 | |
|
839 | |
|
840 | |
|
841 | |
|
842 | |
private boolean conditionM0(final String value, final int index) { |
843 | 1241 | if (charAt(value, index + 1) == 'M') { |
844 | 100 | return true; |
845 | |
} |
846 | 1141 | return contains(value, index - 1, 3, "UMB") && |
847 | |
((index + 1) == value.length() - 1 || contains(value, index + 2, 2, "ER")); |
848 | |
} |
849 | |
|
850 | |
|
851 | |
|
852 | |
|
853 | |
|
854 | |
|
855 | |
|
856 | |
private boolean isSlavoGermanic(final String value) { |
857 | 6492 | return value.indexOf('W') > -1 || value.indexOf('K') > -1 || |
858 | |
value.indexOf("CZ") > -1 || value.indexOf("WITZ") > -1; |
859 | |
} |
860 | |
|
861 | |
|
862 | |
|
863 | |
|
864 | |
private boolean isVowel(final char ch) { |
865 | 2201 | return VOWELS.indexOf(ch) != -1; |
866 | |
} |
867 | |
|
868 | |
|
869 | |
|
870 | |
|
871 | |
|
872 | |
|
873 | |
private boolean isSilentStart(final String value) { |
874 | 6492 | boolean result = false; |
875 | 38804 | for (final String element : SILENT_START) { |
876 | 32380 | if (value.startsWith(element)) { |
877 | 68 | result = true; |
878 | 68 | break; |
879 | |
} |
880 | |
} |
881 | 6492 | return result; |
882 | |
} |
883 | |
|
884 | |
|
885 | |
|
886 | |
|
887 | |
private String cleanInput(String input) { |
888 | 6504 | if (input == null) { |
889 | 3 | return null; |
890 | |
} |
891 | 6501 | input = input.trim(); |
892 | 6501 | if (input.length() == 0) { |
893 | 9 | return null; |
894 | |
} |
895 | 6492 | return input.toUpperCase(java.util.Locale.ENGLISH); |
896 | |
} |
897 | |
|
898 | |
|
899 | |
|
900 | |
|
901 | |
|
902 | |
|
903 | |
protected char charAt(final String value, final int index) { |
904 | 17628 | if (index < 0 || index >= value.length()) { |
905 | 1502 | return Character.MIN_VALUE; |
906 | |
} |
907 | 16126 | return value.charAt(index); |
908 | |
} |
909 | |
|
910 | |
|
911 | |
|
912 | |
|
913 | |
|
914 | |
protected static boolean contains(final String value, final int start, final int length, |
915 | |
final String... criteria) { |
916 | 53209 | boolean result = false; |
917 | 53209 | if (start >= 0 && start + length <= value.length()) { |
918 | 45038 | final String target = value.substring(start, start + length); |
919 | |
|
920 | 115116 | for (final String element : criteria) { |
921 | 72502 | if (target.equals(element)) { |
922 | 2424 | result = true; |
923 | 2424 | break; |
924 | |
} |
925 | |
} |
926 | |
} |
927 | 53209 | return result; |
928 | |
} |
929 | |
|
930 | |
|
931 | |
|
932 | |
|
933 | |
|
934 | |
|
935 | |
public class DoubleMetaphoneResult { |
936 | |
|
937 | 6492 | private final StringBuilder primary = new StringBuilder(getMaxCodeLen()); |
938 | 6492 | private final StringBuilder alternate = new StringBuilder(getMaxCodeLen()); |
939 | |
private final int maxLength; |
940 | |
|
941 | 6492 | public DoubleMetaphoneResult(final int maxLength) { |
942 | 6492 | this.maxLength = maxLength; |
943 | 6492 | } |
944 | |
|
945 | |
public void append(final char value) { |
946 | 21356 | appendPrimary(value); |
947 | 21356 | appendAlternate(value); |
948 | 21356 | } |
949 | |
|
950 | |
public void append(final char primary, final char alternate) { |
951 | 863 | appendPrimary(primary); |
952 | 863 | appendAlternate(alternate); |
953 | 863 | } |
954 | |
|
955 | |
public void appendPrimary(final char value) { |
956 | 22223 | if (this.primary.length() < this.maxLength) { |
957 | 22221 | this.primary.append(value); |
958 | |
} |
959 | 22223 | } |
960 | |
|
961 | |
public void appendAlternate(final char value) { |
962 | 22281 | if (this.alternate.length() < this.maxLength) { |
963 | 22263 | this.alternate.append(value); |
964 | |
} |
965 | 22281 | } |
966 | |
|
967 | |
public void append(final String value) { |
968 | 233 | appendPrimary(value); |
969 | 233 | appendAlternate(value); |
970 | 233 | } |
971 | |
|
972 | |
public void append(final String primary, final String alternate) { |
973 | 60 | appendPrimary(primary); |
974 | 60 | appendAlternate(alternate); |
975 | 60 | } |
976 | |
|
977 | |
public void appendPrimary(final String value) { |
978 | 293 | final int addChars = this.maxLength - this.primary.length(); |
979 | 293 | if (value.length() <= addChars) { |
980 | 249 | this.primary.append(value); |
981 | |
} else { |
982 | 44 | this.primary.append(value.substring(0, addChars)); |
983 | |
} |
984 | 293 | } |
985 | |
|
986 | |
public void appendAlternate(final String value) { |
987 | 293 | final int addChars = this.maxLength - this.alternate.length(); |
988 | 293 | if (value.length() <= addChars) { |
989 | 237 | this.alternate.append(value); |
990 | |
} else { |
991 | 56 | this.alternate.append(value.substring(0, addChars)); |
992 | |
} |
993 | 293 | } |
994 | |
|
995 | |
public String getPrimary() { |
996 | 3299 | return this.primary.toString(); |
997 | |
} |
998 | |
|
999 | |
public String getAlternate() { |
1000 | 3193 | return this.alternate.toString(); |
1001 | |
} |
1002 | |
|
1003 | |
public boolean isComplete() { |
1004 | 42088 | return this.primary.length() >= this.maxLength && |
1005 | |
this.alternate.length() >= this.maxLength; |
1006 | |
} |
1007 | |
} |
1008 | |
} |