1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.commons.codec.language.bm; |
19 | |
|
20 | |
import java.util.ArrayList; |
21 | |
import java.util.Arrays; |
22 | |
import java.util.Collections; |
23 | |
import java.util.EnumMap; |
24 | |
import java.util.HashSet; |
25 | |
import java.util.Iterator; |
26 | |
import java.util.LinkedHashSet; |
27 | |
import java.util.List; |
28 | |
import java.util.Locale; |
29 | |
import java.util.Map; |
30 | |
import java.util.Set; |
31 | |
import java.util.TreeSet; |
32 | |
|
33 | |
|
34 | |
|
35 | |
|
36 | |
|
37 | |
|
38 | |
|
39 | |
|
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
|
48 | |
|
49 | |
|
50 | |
public class PhoneticEngine { |
51 | |
|
52 | |
|
53 | |
|
54 | |
|
55 | |
|
56 | |
|
57 | |
|
58 | 134301 | static final class PhonemeBuilder { |
59 | |
|
60 | |
|
61 | |
|
62 | |
|
63 | |
|
64 | |
|
65 | |
|
66 | |
|
67 | |
|
68 | |
public static PhonemeBuilder empty(final Languages.LanguageSet languages) { |
69 | 208023 | return new PhonemeBuilder(Collections.singleton(new Rule.Phoneme("", languages))); |
70 | |
} |
71 | |
|
72 | |
private final Set<Rule.Phoneme> phonemes; |
73 | |
|
74 | 449897 | private PhonemeBuilder(final Set<Rule.Phoneme> phonemes) { |
75 | 449897 | this.phonemes = phonemes; |
76 | 449897 | } |
77 | |
|
78 | |
|
79 | |
|
80 | |
|
81 | |
|
82 | |
|
83 | |
|
84 | |
public PhonemeBuilder append(final CharSequence str) { |
85 | 75837 | final Set<Rule.Phoneme> newPhonemes = new LinkedHashSet<Rule.Phoneme>(); |
86 | |
|
87 | 75837 | for (final Rule.Phoneme ph : this.phonemes) { |
88 | 152223 | newPhonemes.add(ph.append(str)); |
89 | |
} |
90 | |
|
91 | 75837 | return new PhonemeBuilder(newPhonemes); |
92 | |
} |
93 | |
|
94 | |
|
95 | |
|
96 | |
|
97 | |
|
98 | |
|
99 | |
|
100 | |
|
101 | |
|
102 | |
|
103 | |
|
104 | |
|
105 | |
public PhonemeBuilder apply(final Rule.PhonemeExpr phonemeExpr, final int maxPhonemes) { |
106 | 31736 | final Set<Rule.Phoneme> newPhonemes = new LinkedHashSet<Rule.Phoneme>(); |
107 | |
|
108 | 31736 | EXPR: for (final Rule.Phoneme left : this.phonemes) { |
109 | 101477 | for (final Rule.Phoneme right : phonemeExpr.getPhonemes()) { |
110 | 175837 | final Rule.Phoneme join = left.join(right); |
111 | 175837 | if (!join.getLanguages().isEmpty()) { |
112 | 127487 | if (newPhonemes.size() < maxPhonemes) { |
113 | 126216 | newPhonemes.add(join); |
114 | |
} else { |
115 | |
break EXPR; |
116 | |
} |
117 | |
} |
118 | 174566 | } |
119 | |
} |
120 | |
|
121 | 31736 | return new PhonemeBuilder(newPhonemes); |
122 | |
} |
123 | |
|
124 | |
|
125 | |
|
126 | |
|
127 | |
|
128 | |
|
129 | |
public Set<Rule.Phoneme> getPhonemes() { |
130 | 275162 | return this.phonemes; |
131 | |
} |
132 | |
|
133 | |
|
134 | |
|
135 | |
|
136 | |
|
137 | |
|
138 | |
|
139 | |
|
140 | |
public String makeString() { |
141 | 67162 | final StringBuilder sb = new StringBuilder(); |
142 | |
|
143 | 67162 | for (final Rule.Phoneme ph : this.phonemes) { |
144 | 79365 | if (sb.length() > 0) { |
145 | 12135 | sb.append("|"); |
146 | |
} |
147 | 79365 | sb.append(ph.getPhonemeText()); |
148 | |
} |
149 | |
|
150 | 67162 | return sb.toString(); |
151 | |
} |
152 | |
} |
153 | |
|
154 | |
|
155 | |
|
156 | |
|
157 | |
|
158 | |
|
159 | |
|
160 | |
|
161 | |
|
162 | |
|
163 | |
|
164 | |
|
165 | |
|
166 | |
private static final class RulesApplication { |
167 | |
private final List<Rule> finalRules; |
168 | |
private final CharSequence input; |
169 | |
|
170 | |
private PhonemeBuilder phonemeBuilder; |
171 | |
private int i; |
172 | |
private final int maxPhonemes; |
173 | |
private boolean found; |
174 | |
|
175 | |
public RulesApplication(final List<Rule> finalRules, final CharSequence input, |
176 | 172993 | final PhonemeBuilder phonemeBuilder, final int i, final int maxPhonemes) { |
177 | 172993 | if (finalRules == null) { |
178 | 0 | throw new NullPointerException("The finalRules argument must not be null"); |
179 | |
} |
180 | 172993 | this.finalRules = finalRules; |
181 | 172993 | this.phonemeBuilder = phonemeBuilder; |
182 | 172993 | this.input = input; |
183 | 172993 | this.i = i; |
184 | 172993 | this.maxPhonemes = maxPhonemes; |
185 | 172993 | } |
186 | |
|
187 | |
public int getI() { |
188 | 172993 | return this.i; |
189 | |
} |
190 | |
|
191 | |
public PhonemeBuilder getPhonemeBuilder() { |
192 | 172993 | return this.phonemeBuilder; |
193 | |
} |
194 | |
|
195 | |
|
196 | |
|
197 | |
|
198 | |
|
199 | |
|
200 | |
|
201 | |
|
202 | |
public RulesApplication invoke() { |
203 | 172993 | this.found = false; |
204 | 172993 | int patternLength = 0; |
205 | 172993 | for (final Rule rule : this.finalRules) { |
206 | 32427496 | final String pattern = rule.getPattern(); |
207 | 32427496 | patternLength = pattern.length(); |
208 | |
|
209 | 32427496 | if (!rule.patternAndContextMatches(this.input, this.i)) { |
210 | 32395760 | continue; |
211 | |
} |
212 | |
|
213 | 31736 | this.phonemeBuilder = this.phonemeBuilder.apply(rule.getPhoneme(), maxPhonemes); |
214 | 31736 | this.found = true; |
215 | 31736 | break; |
216 | |
} |
217 | |
|
218 | 172993 | if (!this.found) { |
219 | 141257 | patternLength = 1; |
220 | |
} |
221 | |
|
222 | 172993 | this.i += patternLength; |
223 | 172993 | return this; |
224 | |
} |
225 | |
|
226 | |
public boolean isFound() { |
227 | 101508 | return this.found; |
228 | |
} |
229 | |
} |
230 | |
|
/**
 * Name prefixes per name type. Each value is an unmodifiable set.
 * <p>
 * {@code encode} consults this map: for {@link NameType#GENERIC} a leading prefix followed by a
 * space triggers the two-part encoding, while for {@link NameType#SEPHARDIC} and
 * {@link NameType#ASHKENAZI} words equal to a prefix are removed from the input.
 */
private static final Map<NameType, Set<String>> NAME_PREFIXES =
        new EnumMap<NameType, Set<String>>(NameType.class);

static {
    final Set<String> ashkenazi = new HashSet<String>(
            Arrays.asList("bar", "ben", "da", "de", "van", "von"));
    final Set<String> sephardic = new HashSet<String>(
            Arrays.asList("al", "el", "da", "dal", "de", "del", "dela", "de la",
                          "della", "des", "di", "do", "dos", "du", "van", "von"));
    final Set<String> generic = new HashSet<String>(
            Arrays.asList("da", "dal", "de", "del", "dela", "de la", "della",
                          "des", "di", "do", "dos", "du", "van", "von"));

    NAME_PREFIXES.put(NameType.ASHKENAZI, Collections.unmodifiableSet(ashkenazi));
    NAME_PREFIXES.put(NameType.SEPHARDIC, Collections.unmodifiableSet(sephardic));
    NAME_PREFIXES.put(NameType.GENERIC, Collections.unmodifiableSet(generic));
}
246 | |
|
247 | |
|
248 | |
|
249 | |
|
250 | |
|
251 | |
|
252 | |
|
253 | |
private static CharSequence cacheSubSequence(final CharSequence cached) { |
254 | |
|
255 | 208023 | final CharSequence[][] cache = new CharSequence[cached.length()][cached.length()]; |
256 | 208023 | return new CharSequence() { |
257 | |
@Override |
258 | |
public char charAt(final int index) { |
259 | 0 | return cached.charAt(index); |
260 | |
} |
261 | |
|
262 | |
@Override |
263 | |
public int length() { |
264 | 33010001 | return cached.length(); |
265 | |
} |
266 | |
|
267 | |
@Override |
268 | |
public CharSequence subSequence(final int start, final int end) { |
269 | 19584741 | if (start == end) { |
270 | 25935 | return ""; |
271 | |
} |
272 | |
|
273 | 19558806 | CharSequence res = cache[start][end - 1]; |
274 | 19558806 | if (res == null) { |
275 | 480035 | res = cached.subSequence(start, end); |
276 | 480035 | cache[start][end - 1] = res; |
277 | |
} |
278 | 19558806 | return res; |
279 | |
} |
280 | |
}; |
281 | |
} |
282 | |
|
283 | |
|
284 | |
|
285 | |
|
286 | |
|
287 | |
|
288 | |
|
289 | |
private static String join(final Iterable<String> strings, final String sep) { |
290 | 67128 | final StringBuilder sb = new StringBuilder(); |
291 | 67128 | final Iterator<String> si = strings.iterator(); |
292 | 67128 | if (si.hasNext()) { |
293 | 67128 | sb.append(si.next()); |
294 | |
} |
295 | 67149 | while (si.hasNext()) { |
296 | 21 | sb.append(sep).append(si.next()); |
297 | |
} |
298 | |
|
299 | 67128 | return sb.toString(); |
300 | |
} |
301 | |
|
302 | |
private static final int DEFAULT_MAX_PHONEMES = 20; |
303 | |
|
304 | |
private final Lang lang; |
305 | |
|
306 | |
private final NameType nameType; |
307 | |
|
308 | |
private final RuleType ruleType; |
309 | |
|
310 | |
private final boolean concat; |
311 | |
|
312 | |
private final int maxPhonemes; |
313 | |
|
314 | |
|
315 | |
|
316 | |
|
317 | |
|
318 | |
|
319 | |
|
320 | |
|
321 | |
|
322 | |
|
323 | |
|
324 | |
/**
 * Creates an engine that uses the default phoneme limit ({@code DEFAULT_MAX_PHONEMES}).
 *
 * @param nameType the type of names to encode
 * @param ruleType the type of final rules to apply; must not be {@link RuleType#RULES}
 * @param concat whether multi-word names are joined and encoded as a single input
 * @throws IllegalArgumentException if {@code ruleType} is {@link RuleType#RULES}
 */
public PhoneticEngine(final NameType nameType, final RuleType ruleType, final boolean concat) {
    this(nameType, ruleType, concat, DEFAULT_MAX_PHONEMES);
}
327 | |
|
328 | |
|
329 | |
|
330 | |
|
331 | |
|
332 | |
|
333 | |
|
334 | |
|
335 | |
|
336 | |
|
337 | |
|
338 | |
|
339 | |
|
340 | |
|
341 | |
/**
 * Creates an engine with an explicit phoneme limit.
 *
 * @param nameType the type of names to encode; also used to obtain the language guesser
 * @param ruleType the type of final rules to apply; must not be {@link RuleType#RULES}
 * @param concat whether multi-word names are joined and encoded as a single input
 * @param maxPhonemes the maximum number of phonemes a single rule application may produce
 * @throws IllegalArgumentException if {@code ruleType} is {@link RuleType#RULES}
 */
public PhoneticEngine(final NameType nameType, final RuleType ruleType, final boolean concat,
                      final int maxPhonemes) {
    if (RuleType.RULES == ruleType) {
        throw new IllegalArgumentException("ruleType must not be " + RuleType.RULES);
    }
    this.lang = Lang.instance(nameType);
    this.nameType = nameType;
    this.ruleType = ruleType;
    this.concat = concat;
    this.maxPhonemes = maxPhonemes;
}
352 | |
|
353 | |
|
354 | |
|
355 | |
|
356 | |
|
357 | |
|
358 | |
|
359 | |
|
360 | |
|
361 | |
private PhonemeBuilder applyFinalRules(final PhonemeBuilder phonemeBuilder, final List<Rule> finalRules) { |
362 | 134324 | if (finalRules == null) { |
363 | 0 | throw new NullPointerException("finalRules can not be null"); |
364 | |
} |
365 | 134324 | if (finalRules.isEmpty()) { |
366 | 23 | return phonemeBuilder; |
367 | |
} |
368 | |
|
369 | 134301 | final Set<Rule.Phoneme> phonemes = new TreeSet<Rule.Phoneme>(Rule.Phoneme.COMPARATOR); |
370 | |
|
371 | 134301 | for (final Rule.Phoneme phoneme : phonemeBuilder.getPhonemes()) { |
372 | 140861 | PhonemeBuilder subBuilder = PhonemeBuilder.empty(phoneme.getLanguages()); |
373 | 140861 | final CharSequence phonemeText = cacheSubSequence(phoneme.getPhonemeText()); |
374 | |
|
375 | 140861 | for (int i = 0; i < phonemeText.length();) { |
376 | 101508 | final RulesApplication rulesApplication = |
377 | |
new RulesApplication(finalRules, phonemeText, subBuilder, i, maxPhonemes).invoke(); |
378 | 101508 | final boolean found = rulesApplication.isFound(); |
379 | 101508 | subBuilder = rulesApplication.getPhonemeBuilder(); |
380 | |
|
381 | 101508 | if (!found) { |
382 | |
|
383 | 75837 | subBuilder = subBuilder.append(phonemeText.subSequence(i, i + 1)); |
384 | |
} |
385 | |
|
386 | 101508 | i = rulesApplication.getI(); |
387 | 101508 | } |
388 | |
|
389 | 140861 | phonemes.addAll(subBuilder.getPhonemes()); |
390 | 140861 | } |
391 | |
|
392 | 134301 | return new PhonemeBuilder(phonemes); |
393 | |
} |
394 | |
|
395 | |
|
396 | |
|
397 | |
|
398 | |
|
399 | |
|
400 | |
|
401 | |
|
402 | |
public String encode(final String input) { |
403 | 67144 | final Languages.LanguageSet languageSet = this.lang.guessLanguages(input); |
404 | 67144 | return encode(input, languageSet); |
405 | |
} |
406 | |
|
407 | |
|
408 | |
|
409 | |
|
410 | |
|
411 | |
|
412 | |
|
413 | |
|
414 | |
|
415 | |
|
416 | |
public String encode(String input, final Languages.LanguageSet languageSet) { |
417 | 67168 | final List<Rule> rules = Rule.getInstance(this.nameType, RuleType.RULES, languageSet); |
418 | |
|
419 | 67168 | final List<Rule> finalRules1 = Rule.getInstance(this.nameType, this.ruleType, "common"); |
420 | |
|
421 | 67168 | final List<Rule> finalRules2 = Rule.getInstance(this.nameType, this.ruleType, languageSet); |
422 | |
|
423 | |
|
424 | |
|
425 | 67168 | input = input.toLowerCase(Locale.ENGLISH).replace('-', ' ').trim(); |
426 | |
|
427 | 67168 | if (this.nameType == NameType.GENERIC) { |
428 | 67125 | if (input.length() >= 2 && input.substring(0, 2).equals("d'")) { |
429 | 5 | final String remainder = input.substring(2); |
430 | 5 | final String combined = "d" + remainder; |
431 | 5 | return "(" + encode(remainder) + ")-(" + encode(combined) + ")"; |
432 | |
} |
433 | 67120 | for (final String l : NAME_PREFIXES.get(this.nameType)) { |
434 | |
|
435 | 939668 | if (input.startsWith(l + " ")) { |
436 | |
|
437 | 1 | final String remainder = input.substring(l.length() + 1); |
438 | 1 | final String combined = l + remainder; |
439 | 1 | return "(" + encode(remainder) + ")-(" + encode(combined) + ")"; |
440 | |
} |
441 | |
} |
442 | |
} |
443 | |
|
444 | 67162 | final List<String> words = Arrays.asList(input.split("\\s+")); |
445 | 67162 | final List<String> words2 = new ArrayList<String>(); |
446 | |
|
447 | |
|
448 | 67162 | switch (this.nameType) { |
449 | |
case SEPHARDIC: |
450 | 21 | for (final String aWord : words) { |
451 | 21 | final String[] parts = aWord.split("'"); |
452 | 21 | final String lastPart = parts[parts.length - 1]; |
453 | 21 | words2.add(lastPart); |
454 | 21 | } |
455 | 21 | words2.removeAll(NAME_PREFIXES.get(this.nameType)); |
456 | 21 | break; |
457 | |
case ASHKENAZI: |
458 | 22 | words2.addAll(words); |
459 | 22 | words2.removeAll(NAME_PREFIXES.get(this.nameType)); |
460 | 22 | break; |
461 | |
case GENERIC: |
462 | 67119 | words2.addAll(words); |
463 | 67119 | break; |
464 | |
default: |
465 | 0 | throw new IllegalStateException("Unreachable case: " + this.nameType); |
466 | |
} |
467 | |
|
468 | 67162 | if (this.concat) { |
469 | |
|
470 | 67128 | input = join(words2, " "); |
471 | 34 | } else if (words2.size() == 1) { |
472 | |
|
473 | 34 | input = words.iterator().next(); |
474 | |
} else { |
475 | |
|
476 | 0 | final StringBuilder result = new StringBuilder(); |
477 | 0 | for (final String word : words2) { |
478 | 0 | result.append("-").append(encode(word)); |
479 | |
} |
480 | |
|
481 | 0 | return result.substring(1); |
482 | |
} |
483 | |
|
484 | 67162 | PhonemeBuilder phonemeBuilder = PhonemeBuilder.empty(languageSet); |
485 | |
|
486 | |
|
487 | 67162 | final CharSequence inputCache = cacheSubSequence(input); |
488 | 67162 | for (int i = 0; i < inputCache.length();) { |
489 | 71485 | final RulesApplication rulesApplication = |
490 | |
new RulesApplication(rules, inputCache, phonemeBuilder, i, maxPhonemes).invoke(); |
491 | 71485 | i = rulesApplication.getI(); |
492 | 71485 | phonemeBuilder = rulesApplication.getPhonemeBuilder(); |
493 | 71485 | } |
494 | |
|
495 | |
|
496 | 67162 | phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules1); |
497 | |
|
498 | 67162 | phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules2); |
499 | |
|
500 | 67162 | return phonemeBuilder.makeString(); |
501 | |
} |
502 | |
|
503 | |
|
504 | |
|
505 | |
|
506 | |
|
507 | |
|
508 | |
public Lang getLang() { |
509 | 0 | return this.lang; |
510 | |
} |
511 | |
|
512 | |
|
513 | |
|
514 | |
|
515 | |
|
516 | |
|
517 | |
public NameType getNameType() { |
518 | 15 | return this.nameType; |
519 | |
} |
520 | |
|
521 | |
|
522 | |
|
523 | |
|
524 | |
|
525 | |
|
526 | |
public RuleType getRuleType() { |
527 | 14 | return this.ruleType; |
528 | |
} |
529 | |
|
530 | |
|
531 | |
|
532 | |
|
533 | |
|
534 | |
|
535 | |
public boolean isConcat() { |
536 | 25 | return this.concat; |
537 | |
} |
538 | |
|
539 | |
|
540 | |
|
541 | |
|
542 | |
|
543 | |
|
544 | |
|
545 | |
public int getMaxPhonemes() { |
546 | 24 | return this.maxPhonemes; |
547 | |
} |
548 | |
} |