1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.commons.codec.language.bm; |
19 | |
|
20 | |
import java.util.ArrayList; |
21 | |
import java.util.Arrays; |
22 | |
import java.util.Collections; |
23 | |
import java.util.EnumMap; |
24 | |
import java.util.HashSet; |
25 | |
import java.util.Iterator; |
26 | |
import java.util.LinkedHashSet; |
27 | |
import java.util.List; |
28 | |
import java.util.Locale; |
29 | |
import java.util.Map; |
30 | |
import java.util.Set; |
31 | |
import java.util.TreeSet; |
32 | |
|
33 | |
|
34 | |
|
35 | |
|
36 | |
|
37 | |
|
38 | |
|
39 | |
|
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
|
48 | |
|
49 | |
public class PhoneticEngine { |
50 | |
|
51 | |
|
52 | |
|
53 | |
|
54 | |
|
55 | |
|
56 | |
|
57 | 134301 | static final class PhonemeBuilder { |
58 | |
|
59 | |
|
60 | |
|
61 | |
|
62 | |
|
63 | |
|
64 | |
|
65 | |
|
66 | |
|
67 | |
public static PhonemeBuilder empty(Languages.LanguageSet languages) { |
68 | 208023 | return new PhonemeBuilder(Collections.singleton(new Rule.Phoneme("", languages))); |
69 | |
} |
70 | |
|
71 | |
private final Set<Rule.Phoneme> phonemes; |
72 | |
|
73 | 449897 | private PhonemeBuilder(Set<Rule.Phoneme> phonemes) { |
74 | 449897 | this.phonemes = phonemes; |
75 | 449897 | } |
76 | |
|
77 | |
|
78 | |
|
79 | |
|
80 | |
|
81 | |
|
82 | |
|
83 | |
public PhonemeBuilder append(CharSequence str) { |
84 | 75837 | Set<Rule.Phoneme> newPhonemes = new LinkedHashSet<Rule.Phoneme>(); |
85 | |
|
86 | 75837 | for (Rule.Phoneme ph : this.phonemes) { |
87 | 152223 | newPhonemes.add(ph.append(str)); |
88 | |
} |
89 | |
|
90 | 75837 | return new PhonemeBuilder(newPhonemes); |
91 | |
} |
92 | |
|
93 | |
|
94 | |
|
95 | |
|
96 | |
|
97 | |
|
98 | |
|
99 | |
|
100 | |
|
101 | |
|
102 | |
|
103 | |
|
104 | |
public PhonemeBuilder apply(Rule.PhonemeExpr phonemeExpr, int maxPhonemes) { |
105 | 31736 | Set<Rule.Phoneme> newPhonemes = new LinkedHashSet<Rule.Phoneme>(); |
106 | |
|
107 | 31736 | EXPR: for (Rule.Phoneme left : this.phonemes) { |
108 | 101477 | for (Rule.Phoneme right : phonemeExpr.getPhonemes()) { |
109 | 175837 | Rule.Phoneme join = left.join(right); |
110 | 175837 | if (!join.getLanguages().isEmpty()) { |
111 | 127487 | if (newPhonemes.size() < maxPhonemes) { |
112 | 126216 | newPhonemes.add(join); |
113 | |
} else { |
114 | |
break EXPR; |
115 | |
} |
116 | |
} |
117 | 174566 | } |
118 | |
} |
119 | |
|
120 | 31736 | return new PhonemeBuilder(newPhonemes); |
121 | |
} |
122 | |
|
123 | |
|
124 | |
|
125 | |
|
126 | |
|
127 | |
|
128 | |
public Set<Rule.Phoneme> getPhonemes() { |
129 | 275162 | return this.phonemes; |
130 | |
} |
131 | |
|
132 | |
|
133 | |
|
134 | |
|
135 | |
|
136 | |
|
137 | |
|
138 | |
|
139 | |
public String makeString() { |
140 | 67162 | final StringBuilder sb = new StringBuilder(); |
141 | |
|
142 | 67162 | for (Rule.Phoneme ph : this.phonemes) { |
143 | 79365 | if (sb.length() > 0) { |
144 | 12135 | sb.append("|"); |
145 | |
} |
146 | 79365 | sb.append(ph.getPhonemeText()); |
147 | |
} |
148 | |
|
149 | 67162 | return sb.toString(); |
150 | |
} |
151 | |
} |
152 | |
|
153 | |
|
154 | |
|
155 | |
|
156 | |
|
157 | |
|
158 | |
|
159 | |
|
160 | |
|
161 | |
|
162 | |
|
163 | |
|
164 | |
|
165 | |
private static final class RulesApplication { |
166 | |
private final List<Rule> finalRules; |
167 | |
private final CharSequence input; |
168 | |
|
169 | |
private PhonemeBuilder phonemeBuilder; |
170 | |
private int i; |
171 | |
private int maxPhonemes; |
172 | |
private boolean found; |
173 | |
|
174 | |
public RulesApplication(List<Rule> finalRules, CharSequence input, |
175 | 172993 | PhonemeBuilder phonemeBuilder, int i, int maxPhonemes) { |
176 | 172993 | if (finalRules == null) { |
177 | 0 | throw new NullPointerException("The finalRules argument must not be null"); |
178 | |
} |
179 | 172993 | this.finalRules = finalRules; |
180 | 172993 | this.phonemeBuilder = phonemeBuilder; |
181 | 172993 | this.input = input; |
182 | 172993 | this.i = i; |
183 | 172993 | this.maxPhonemes = maxPhonemes; |
184 | 172993 | } |
185 | |
|
186 | |
public int getI() { |
187 | 172993 | return this.i; |
188 | |
} |
189 | |
|
190 | |
public PhonemeBuilder getPhonemeBuilder() { |
191 | 172993 | return this.phonemeBuilder; |
192 | |
} |
193 | |
|
194 | |
|
195 | |
|
196 | |
|
197 | |
|
198 | |
|
199 | |
|
200 | |
|
201 | |
public RulesApplication invoke() { |
202 | 172993 | this.found = false; |
203 | 172993 | int patternLength = 0; |
204 | 172993 | for (Rule rule : this.finalRules) { |
205 | 32427496 | String pattern = rule.getPattern(); |
206 | 32427496 | patternLength = pattern.length(); |
207 | |
|
208 | 32427496 | if (!rule.patternAndContextMatches(this.input, this.i)) { |
209 | 32395760 | continue; |
210 | |
} |
211 | |
|
212 | 31736 | this.phonemeBuilder = this.phonemeBuilder.apply(rule.getPhoneme(), maxPhonemes); |
213 | 31736 | this.found = true; |
214 | 31736 | break; |
215 | |
} |
216 | |
|
217 | 172993 | if (!this.found) { |
218 | 141257 | patternLength = 1; |
219 | |
} |
220 | |
|
221 | 172993 | this.i += patternLength; |
222 | 172993 | return this; |
223 | |
} |
224 | |
|
225 | |
public boolean isFound() { |
226 | 101508 | return this.found; |
227 | |
} |
228 | |
} |
229 | |
|
/** Name prefixes for each name type; each value is an unmodifiable set. */
private static final Map<NameType, Set<String>> NAME_PREFIXES = new EnumMap<NameType, Set<String>>(NameType.class);

static {
    final Set<String> ashkenaziPrefixes =
            new HashSet<String>(Arrays.asList("bar", "ben", "da", "de", "van", "von"));
    final Set<String> sephardicPrefixes =
            new HashSet<String>(Arrays.asList("al", "el", "da", "dal", "de", "del", "dela", "de la",
                    "della", "des", "di", "do", "dos", "du", "van", "von"));
    final Set<String> genericPrefixes =
            new HashSet<String>(Arrays.asList("da", "dal", "de", "del", "dela", "de la", "della",
                    "des", "di", "do", "dos", "du", "van", "von"));

    NAME_PREFIXES.put(NameType.ASHKENAZI, Collections.unmodifiableSet(ashkenaziPrefixes));
    NAME_PREFIXES.put(NameType.SEPHARDIC, Collections.unmodifiableSet(sephardicPrefixes));
    NAME_PREFIXES.put(NameType.GENERIC, Collections.unmodifiableSet(genericPrefixes));
}
245 | |
|
246 | |
|
247 | |
|
248 | |
|
249 | |
|
250 | |
|
251 | |
|
252 | |
private static CharSequence cacheSubSequence(final CharSequence cached) { |
253 | |
|
254 | 208023 | final CharSequence[][] cache = new CharSequence[cached.length()][cached.length()]; |
255 | 208023 | return new CharSequence() { |
256 | |
@Override |
257 | |
public char charAt(int index) { |
258 | 0 | return cached.charAt(index); |
259 | |
} |
260 | |
|
261 | |
@Override |
262 | |
public int length() { |
263 | 33010001 | return cached.length(); |
264 | |
} |
265 | |
|
266 | |
@Override |
267 | |
public CharSequence subSequence(int start, int end) { |
268 | 19584741 | if (start == end) { |
269 | 25935 | return ""; |
270 | |
} |
271 | |
|
272 | 19558806 | CharSequence res = cache[start][end - 1]; |
273 | 19558806 | if (res == null) { |
274 | 480035 | res = cached.subSequence(start, end); |
275 | 480035 | cache[start][end - 1] = res; |
276 | |
} |
277 | 19558806 | return res; |
278 | |
} |
279 | |
}; |
280 | |
} |
281 | |
|
282 | |
|
283 | |
|
284 | |
|
285 | |
|
286 | |
|
287 | |
|
288 | |
private static String join(Iterable<String> strings, String sep) { |
289 | 67128 | StringBuilder sb = new StringBuilder(); |
290 | 67128 | Iterator<String> si = strings.iterator(); |
291 | 67128 | if (si.hasNext()) { |
292 | 67128 | sb.append(si.next()); |
293 | |
} |
294 | 67149 | while (si.hasNext()) { |
295 | 21 | sb.append(sep).append(si.next()); |
296 | |
} |
297 | |
|
298 | 67128 | return sb.toString(); |
299 | |
} |
300 | |
|
301 | |
private static final int DEFAULT_MAX_PHONEMES = 20; |
302 | |
|
303 | |
private final Lang lang; |
304 | |
|
305 | |
private final NameType nameType; |
306 | |
|
307 | |
private final RuleType ruleType; |
308 | |
|
309 | |
private final boolean concat; |
310 | |
|
311 | |
private final int maxPhonemes; |
312 | |
|
313 | |
|
314 | |
|
315 | |
|
316 | |
|
317 | |
|
318 | |
|
319 | |
|
320 | |
|
321 | |
|
322 | |
|
323 | |
/**
 * Creates an engine that uses the default phoneme cap ({@code DEFAULT_MAX_PHONEMES}).
 *
 * @param nameType the type of names to encode
 * @param ruleType the type of final rules to apply; must not be {@code RuleType.RULES}
 * @param concat   whether multi-word names are joined and encoded as a single input
 * @throws IllegalArgumentException if {@code ruleType} is {@code RuleType.RULES}
 */
public PhoneticEngine(NameType nameType, RuleType ruleType, boolean concat) {
    this(nameType, ruleType, concat, DEFAULT_MAX_PHONEMES);
}
326 | |
|
327 | |
|
328 | |
|
329 | |
|
330 | |
|
331 | |
|
332 | |
|
333 | |
|
334 | |
|
335 | |
|
336 | |
|
337 | |
|
338 | |
|
339 | |
|
340 | 128 | public PhoneticEngine(NameType nameType, RuleType ruleType, boolean concat, int maxPhonemes) { |
341 | 128 | if (ruleType == RuleType.RULES) { |
342 | 1 | throw new IllegalArgumentException("ruleType must not be " + RuleType.RULES); |
343 | |
} |
344 | 127 | this.nameType = nameType; |
345 | 127 | this.ruleType = ruleType; |
346 | 127 | this.concat = concat; |
347 | 127 | this.lang = Lang.instance(nameType); |
348 | 127 | this.maxPhonemes = maxPhonemes; |
349 | 127 | } |
350 | |
|
351 | |
|
352 | |
|
353 | |
|
354 | |
|
355 | |
|
356 | |
|
357 | |
|
358 | |
|
359 | |
private PhonemeBuilder applyFinalRules(PhonemeBuilder phonemeBuilder, List<Rule> finalRules) { |
360 | 134324 | if (finalRules == null) { |
361 | 0 | throw new NullPointerException("finalRules can not be null"); |
362 | |
} |
363 | 134324 | if (finalRules.isEmpty()) { |
364 | 23 | return phonemeBuilder; |
365 | |
} |
366 | |
|
367 | 134301 | Set<Rule.Phoneme> phonemes = new TreeSet<Rule.Phoneme>(Rule.Phoneme.COMPARATOR); |
368 | |
|
369 | 134301 | for (Rule.Phoneme phoneme : phonemeBuilder.getPhonemes()) { |
370 | 140861 | PhonemeBuilder subBuilder = PhonemeBuilder.empty(phoneme.getLanguages()); |
371 | 140861 | CharSequence phonemeText = cacheSubSequence(phoneme.getPhonemeText()); |
372 | |
|
373 | 140861 | for (int i = 0; i < phonemeText.length();) { |
374 | 101508 | RulesApplication rulesApplication = |
375 | |
new RulesApplication(finalRules, phonemeText, subBuilder, i, maxPhonemes).invoke(); |
376 | 101508 | boolean found = rulesApplication.isFound(); |
377 | 101508 | subBuilder = rulesApplication.getPhonemeBuilder(); |
378 | |
|
379 | 101508 | if (!found) { |
380 | |
|
381 | 75837 | subBuilder = subBuilder.append(phonemeText.subSequence(i, i + 1)); |
382 | |
} |
383 | |
|
384 | 101508 | i = rulesApplication.getI(); |
385 | 101508 | } |
386 | |
|
387 | 140861 | phonemes.addAll(subBuilder.getPhonemes()); |
388 | 140861 | } |
389 | |
|
390 | 134301 | return new PhonemeBuilder(phonemes); |
391 | |
} |
392 | |
|
393 | |
|
394 | |
|
395 | |
|
396 | |
|
397 | |
|
398 | |
|
399 | |
|
400 | |
public String encode(String input) { |
401 | 67144 | Languages.LanguageSet languageSet = this.lang.guessLanguages(input); |
402 | 67144 | return encode(input, languageSet); |
403 | |
} |
404 | |
|
405 | |
|
406 | |
|
407 | |
|
408 | |
|
409 | |
|
410 | |
|
411 | |
|
412 | |
|
413 | |
|
414 | |
public String encode(String input, final Languages.LanguageSet languageSet) { |
415 | 67168 | final List<Rule> rules = Rule.getInstance(this.nameType, RuleType.RULES, languageSet); |
416 | |
|
417 | 67168 | final List<Rule> finalRules1 = Rule.getInstance(this.nameType, this.ruleType, "common"); |
418 | |
|
419 | 67168 | final List<Rule> finalRules2 = Rule.getInstance(this.nameType, this.ruleType, languageSet); |
420 | |
|
421 | |
|
422 | |
|
423 | 67168 | input = input.toLowerCase(Locale.ENGLISH).replace('-', ' ').trim(); |
424 | |
|
425 | 67168 | if (this.nameType == NameType.GENERIC) { |
426 | 67125 | if (input.length() >= 2 && input.substring(0, 2).equals("d'")) { |
427 | 5 | String remainder = input.substring(2); |
428 | 5 | String combined = "d" + remainder; |
429 | 5 | return "(" + encode(remainder) + ")-(" + encode(combined) + ")"; |
430 | |
} |
431 | 67120 | for (String l : NAME_PREFIXES.get(this.nameType)) { |
432 | |
|
433 | 939668 | if (input.startsWith(l + " ")) { |
434 | |
|
435 | 1 | String remainder = input.substring(l.length() + 1); |
436 | 1 | String combined = l + remainder; |
437 | 1 | return "(" + encode(remainder) + ")-(" + encode(combined) + ")"; |
438 | |
} |
439 | |
} |
440 | |
} |
441 | |
|
442 | 67162 | final List<String> words = Arrays.asList(input.split("\\s+")); |
443 | 67162 | final List<String> words2 = new ArrayList<String>(); |
444 | |
|
445 | |
|
446 | 67162 | switch (this.nameType) { |
447 | |
case SEPHARDIC: |
448 | 21 | for (String aWord : words) { |
449 | 21 | String[] parts = aWord.split("'"); |
450 | 21 | String lastPart = parts[parts.length - 1]; |
451 | 21 | words2.add(lastPart); |
452 | 21 | } |
453 | 21 | words2.removeAll(NAME_PREFIXES.get(this.nameType)); |
454 | 21 | break; |
455 | |
case ASHKENAZI: |
456 | 22 | words2.addAll(words); |
457 | 22 | words2.removeAll(NAME_PREFIXES.get(this.nameType)); |
458 | 22 | break; |
459 | |
case GENERIC: |
460 | 67119 | words2.addAll(words); |
461 | 67119 | break; |
462 | |
default: |
463 | 0 | throw new IllegalStateException("Unreachable case: " + this.nameType); |
464 | |
} |
465 | |
|
466 | 67162 | if (this.concat) { |
467 | |
|
468 | 67128 | input = join(words2, " "); |
469 | 34 | } else if (words2.size() == 1) { |
470 | |
|
471 | 34 | input = words.iterator().next(); |
472 | |
} else { |
473 | |
|
474 | 0 | StringBuilder result = new StringBuilder(); |
475 | 0 | for (String word : words2) { |
476 | 0 | result.append("-").append(encode(word)); |
477 | |
} |
478 | |
|
479 | 0 | return result.substring(1); |
480 | |
} |
481 | |
|
482 | 67162 | PhonemeBuilder phonemeBuilder = PhonemeBuilder.empty(languageSet); |
483 | |
|
484 | |
|
485 | 67162 | CharSequence inputCache = cacheSubSequence(input); |
486 | 67162 | for (int i = 0; i < inputCache.length();) { |
487 | 71485 | RulesApplication rulesApplication = |
488 | |
new RulesApplication(rules, inputCache, phonemeBuilder, i, maxPhonemes).invoke(); |
489 | 71485 | i = rulesApplication.getI(); |
490 | 71485 | phonemeBuilder = rulesApplication.getPhonemeBuilder(); |
491 | 71485 | } |
492 | |
|
493 | |
|
494 | 67162 | phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules1); |
495 | |
|
496 | 67162 | phonemeBuilder = applyFinalRules(phonemeBuilder, finalRules2); |
497 | |
|
498 | 67162 | return phonemeBuilder.makeString(); |
499 | |
} |
500 | |
|
501 | |
|
502 | |
|
503 | |
|
504 | |
|
505 | |
|
506 | |
public Lang getLang() { |
507 | 0 | return this.lang; |
508 | |
} |
509 | |
|
510 | |
|
511 | |
|
512 | |
|
513 | |
|
514 | |
|
515 | |
public NameType getNameType() { |
516 | 15 | return this.nameType; |
517 | |
} |
518 | |
|
519 | |
|
520 | |
|
521 | |
|
522 | |
|
523 | |
|
524 | |
public RuleType getRuleType() { |
525 | 14 | return this.ruleType; |
526 | |
} |
527 | |
|
528 | |
|
529 | |
|
530 | |
|
531 | |
|
532 | |
|
533 | |
public boolean isConcat() { |
534 | 25 | return this.concat; |
535 | |
} |
536 | |
|
537 | |
|
538 | |
|
539 | |
|
540 | |
|
541 | |
|
542 | |
|
543 | |
public int getMaxPhonemes() { |
544 | 24 | return this.maxPhonemes; |
545 | |
} |
546 | |
} |