1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.commons.codec.language.bm; |
19 | |
|
20 | |
import java.io.InputStream; |
21 | |
import java.util.ArrayList; |
22 | |
import java.util.Arrays; |
23 | |
import java.util.Collections; |
24 | |
import java.util.Comparator; |
25 | |
import java.util.EnumMap; |
26 | |
import java.util.HashMap; |
27 | |
import java.util.HashSet; |
28 | |
import java.util.List; |
29 | |
import java.util.Map; |
30 | |
import java.util.Scanner; |
31 | |
import java.util.Set; |
32 | |
import java.util.regex.Matcher; |
33 | |
import java.util.regex.Pattern; |
34 | |
|
35 | |
|
36 | |
|
37 | |
|
38 | |
|
39 | |
|
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
|
48 | |
|
49 | |
|
50 | |
|
51 | |
|
52 | |
|
53 | |
|
54 | |
|
55 | |
|
56 | |
|
57 | |
|
58 | |
|
59 | |
|
60 | |
|
61 | |
|
62 | |
|
63 | |
|
64 | |
|
65 | |
|
66 | |
|
67 | |
|
68 | |
|
69 | |
|
70 | |
|
71 | |
|
72 | |
|
73 | |
|
74 | |
|
75 | |
|
76 | |
|
77 | |
|
78 | |
|
79 | |
|
80 | 106145 | public class Rule { |
81 | |
|
82 | 4876451 | public static final class Phoneme implements PhonemeExpr { |
83 | 92232 | public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() { |
84 | |
@Override |
85 | |
public int compare(Phoneme o1, Phoneme o2) { |
86 | 1220580 | for (int i = 0; i < o1.phonemeText.length(); i++) { |
87 | 1216151 | if (i >= o2.phonemeText.length()) { |
88 | 720 | return +1; |
89 | |
} |
90 | 1215431 | int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i); |
91 | 1215431 | if (c != 0) { |
92 | 87082 | return c; |
93 | |
} |
94 | |
} |
95 | |
|
96 | 4429 | if (o1.phonemeText.length() < o2.phonemeText.length()) { |
97 | 1020 | return -1; |
98 | |
} |
99 | |
|
100 | 3409 | return 0; |
101 | |
} |
102 | |
}; |
103 | |
|
104 | |
private final CharSequence phonemeText; |
105 | |
private final Languages.LanguageSet languages; |
106 | |
|
107 | 542556 | public Phoneme(CharSequence phonemeText, Languages.LanguageSet languages) { |
108 | 542556 | this.phonemeText = phonemeText; |
109 | 542556 | this.languages = languages; |
110 | 542556 | } |
111 | |
|
112 | |
public Phoneme append(CharSequence str) { |
113 | 152227 | return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages); |
114 | |
} |
115 | |
|
116 | |
public Languages.LanguageSet getLanguages() { |
117 | 316698 | return this.languages; |
118 | |
} |
119 | |
|
120 | |
@Override |
121 | |
public Iterable<Phoneme> getPhonemes() { |
122 | 37869 | return Collections.singleton(this); |
123 | |
} |
124 | |
|
125 | |
public CharSequence getPhonemeText() { |
126 | 220364 | return this.phonemeText; |
127 | |
} |
128 | |
|
129 | |
public Phoneme join(Phoneme right) { |
130 | 175837 | return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(), |
131 | |
this.languages.restrictTo(right.languages)); |
132 | |
} |
133 | |
} |
134 | |
|
135 | |
public interface PhonemeExpr { |
136 | |
Iterable<Phoneme> getPhonemes(); |
137 | |
} |
138 | |
|
139 | 63608 | public static final class PhonemeList implements PhonemeExpr { |
140 | |
private final List<Phoneme> phonemes; |
141 | |
|
142 | 1438 | public PhonemeList(List<Phoneme> phonemes) { |
143 | 1438 | this.phonemes = phonemes; |
144 | 1438 | } |
145 | |
|
146 | |
@Override |
147 | |
public List<Phoneme> getPhonemes() { |
148 | 63608 | return this.phonemes; |
149 | |
} |
150 | |
} |
151 | |
|
152 | |
|
153 | |
|
154 | |
|
/**
 * A minimal matcher abstraction used for rule contexts.
 */
public interface RPattern {
    /**
     * @param input the text to test
     * @return {@code true} if this pattern accepts {@code input}
     */
    boolean isMatch(CharSequence input);
}
158 | |
|
159 | 1 | public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() { |
160 | |
@Override |
161 | |
public boolean isMatch(CharSequence input) { |
162 | 66010 | return true; |
163 | |
} |
164 | |
}; |
165 | |
|
166 | |
public static final String ALL = "ALL"; |
167 | |
|
168 | |
private static final String DOUBLE_QUOTE = "\""; |
169 | |
|
170 | |
private static final String HASH_INCLUDE = "#include"; |
171 | |
|
172 | 1 | private static final Map<NameType, Map<RuleType, Map<String, List<Rule>>>> RULES = |
173 | |
new EnumMap<NameType, Map<RuleType, Map<String, List<Rule>>>>(NameType.class); |
174 | |
|
175 | |
static { |
176 | 4 | for (NameType s : NameType.values()) { |
177 | 3 | Map<RuleType, Map<String, List<Rule>>> rts = new EnumMap<RuleType, Map<String, List<Rule>>>(RuleType.class); |
178 | |
|
179 | 12 | for (RuleType rt : RuleType.values()) { |
180 | 9 | Map<String, List<Rule>> rs = new HashMap<String, List<Rule>>(); |
181 | |
|
182 | 9 | Languages ls = Languages.getInstance(s); |
183 | 9 | for (String l : ls.getLanguages()) { |
184 | |
try { |
185 | 108 | rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l))); |
186 | 0 | } catch (IllegalStateException e) { |
187 | 0 | throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e); |
188 | 108 | } |
189 | |
} |
190 | 9 | if (!rt.equals(RuleType.RULES)) { |
191 | 6 | rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common"))); |
192 | |
} |
193 | |
|
194 | 9 | rts.put(rt, Collections.unmodifiableMap(rs)); |
195 | |
} |
196 | |
|
197 | 3 | RULES.put(s, Collections.unmodifiableMap(rts)); |
198 | |
} |
199 | 1 | } |
200 | |
|
201 | |
private static boolean contains(CharSequence chars, char input) { |
202 | 312230 | for (int i = 0; i < chars.length(); i++) { |
203 | 270802 | if (chars.charAt(i) == input) { |
204 | 10453 | return true; |
205 | |
} |
206 | |
} |
207 | 41428 | return false; |
208 | |
} |
209 | |
|
210 | |
private static String createResourceName(NameType nameType, RuleType rt, String lang) { |
211 | 228 | return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt", |
212 | |
nameType.getName(), rt.getName(), lang); |
213 | |
} |
214 | |
|
215 | |
private static Scanner createScanner(NameType nameType, RuleType rt, String lang) { |
216 | 114 | String resName = createResourceName(nameType, rt, lang); |
217 | 114 | InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); |
218 | |
|
219 | 114 | if (rulesIS == null) { |
220 | 0 | throw new IllegalArgumentException("Unable to load resource: " + resName); |
221 | |
} |
222 | |
|
223 | 114 | return new Scanner(rulesIS, ResourceConstants.ENCODING); |
224 | |
} |
225 | |
|
226 | |
private static Scanner createScanner(String lang) { |
227 | 35 | String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang); |
228 | 35 | InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); |
229 | |
|
230 | 35 | if (rulesIS == null) { |
231 | 0 | throw new IllegalArgumentException("Unable to load resource: " + resName); |
232 | |
} |
233 | |
|
234 | 35 | return new Scanner(rulesIS, ResourceConstants.ENCODING); |
235 | |
} |
236 | |
|
237 | |
private static boolean endsWith(CharSequence input, CharSequence suffix) { |
238 | 894 | if (suffix.length() > input.length()) { |
239 | 124 | return false; |
240 | |
} |
241 | 783 | for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) { |
242 | 770 | if (input.charAt(i) != suffix.charAt(j)) { |
243 | 757 | return false; |
244 | |
} |
245 | |
} |
246 | 13 | return true; |
247 | |
} |
248 | |
|
249 | |
|
250 | |
|
251 | |
|
252 | |
|
253 | |
|
254 | |
|
255 | |
|
256 | |
|
257 | |
|
258 | |
|
259 | |
|
260 | |
public static List<Rule> getInstance(NameType nameType, RuleType rt, Languages.LanguageSet langs) { |
261 | 134336 | return langs.isSingleton() ? getInstance(nameType, rt, langs.getAny()) : |
262 | |
getInstance(nameType, rt, Languages.ANY); |
263 | |
} |
264 | |
|
265 | |
|
266 | |
|
267 | |
|
268 | |
|
269 | |
|
270 | |
|
271 | |
|
272 | |
|
273 | |
|
274 | |
|
275 | |
|
276 | |
public static List<Rule> getInstance(NameType nameType, RuleType rt, String lang) { |
277 | 201505 | List<Rule> rules = RULES.get(nameType).get(rt).get(lang); |
278 | |
|
279 | 201505 | if (rules == null) { |
280 | 1 | throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.", |
281 | |
nameType.getName(), rt.getName(), lang)); |
282 | |
} |
283 | |
|
284 | 201504 | return rules; |
285 | |
} |
286 | |
|
287 | |
private static Phoneme parsePhoneme(String ph) { |
288 | 6388 | int open = ph.indexOf("["); |
289 | 6388 | if (open >= 0) { |
290 | 597 | if (!ph.endsWith("]")) { |
291 | 0 | throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'"); |
292 | |
} |
293 | 597 | String before = ph.substring(0, open); |
294 | 597 | String in = ph.substring(open + 1, ph.length() - 1); |
295 | 597 | Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]"))); |
296 | |
|
297 | 597 | return new Phoneme(before, Languages.LanguageSet.from(langs)); |
298 | |
} else { |
299 | 5791 | return new Phoneme(ph, Languages.ANY_LANGUAGE); |
300 | |
} |
301 | |
} |
302 | |
|
303 | |
private static PhonemeExpr parsePhonemeExpr(String ph) { |
304 | 4507 | if (ph.startsWith("(")) { |
305 | 1438 | if (!ph.endsWith(")")) { |
306 | 0 | throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'"); |
307 | |
} |
308 | |
|
309 | 1438 | List<Phoneme> phs = new ArrayList<Phoneme>(); |
310 | 1438 | String body = ph.substring(1, ph.length() - 1); |
311 | 4757 | for (String part : body.split("[|]")) { |
312 | 3319 | phs.add(parsePhoneme(part)); |
313 | |
} |
314 | 1438 | if (body.startsWith("|") || body.endsWith("|")) { |
315 | 48 | phs.add(new Phoneme("", Languages.ANY_LANGUAGE)); |
316 | |
} |
317 | |
|
318 | 1438 | return new PhonemeList(phs); |
319 | |
} else { |
320 | 3069 | return parsePhoneme(ph); |
321 | |
} |
322 | |
} |
323 | |
|
324 | |
private static List<Rule> parseRules(final Scanner scanner, final String location) { |
325 | 149 | List<Rule> lines = new ArrayList<Rule>(); |
326 | 149 | int currentLine = 0; |
327 | |
|
328 | 149 | boolean inMultilineComment = false; |
329 | 8245 | while (scanner.hasNextLine()) { |
330 | 8096 | currentLine++; |
331 | 8096 | String rawLine = scanner.nextLine(); |
332 | 8096 | String line = rawLine; |
333 | |
|
334 | 8096 | if (inMultilineComment) { |
335 | 2235 | if (line.endsWith(ResourceConstants.EXT_CMT_END)) { |
336 | 149 | inMultilineComment = false; |
337 | |
} |
338 | |
} else { |
339 | 5861 | if (line.startsWith(ResourceConstants.EXT_CMT_START)) { |
340 | 149 | inMultilineComment = true; |
341 | |
} else { |
342 | |
|
343 | 5712 | int cmtI = line.indexOf(ResourceConstants.CMT); |
344 | 5712 | if (cmtI >= 0) { |
345 | 857 | line = line.substring(0, cmtI); |
346 | |
} |
347 | |
|
348 | |
|
349 | 5712 | line = line.trim(); |
350 | |
|
351 | 5712 | if (line.length() == 0) { |
352 | 1170 | continue; |
353 | |
} |
354 | |
|
355 | 4542 | if (line.startsWith(HASH_INCLUDE)) { |
356 | |
|
357 | 35 | String incl = line.substring(HASH_INCLUDE.length()).trim(); |
358 | 35 | if (incl.contains(" ")) { |
359 | 0 | throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + |
360 | |
location); |
361 | |
} else { |
362 | 35 | lines.addAll(parseRules(createScanner(incl), location + "->" + incl)); |
363 | |
} |
364 | 35 | } else { |
365 | |
|
366 | 4507 | String[] parts = line.split("\\s+"); |
367 | 4507 | if (parts.length != 4) { |
368 | 0 | throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + |
369 | |
" parts: " + rawLine + " in " + location); |
370 | |
} else { |
371 | |
try { |
372 | 4507 | String pat = stripQuotes(parts[0]); |
373 | 4507 | String lCon = stripQuotes(parts[1]); |
374 | 4507 | String rCon = stripQuotes(parts[2]); |
375 | 4507 | PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3])); |
376 | 4507 | final int cLine = currentLine; |
377 | 4507 | Rule r = new Rule(pat, lCon, rCon, ph) { |
378 | 4507 | private final int myLine = cLine; |
379 | 4507 | private final String loc = location; |
380 | |
|
381 | |
@Override |
382 | |
public String toString() { |
383 | 0 | final StringBuilder sb = new StringBuilder(); |
384 | 0 | sb.append("Rule"); |
385 | 0 | sb.append("{line=").append(myLine); |
386 | 0 | sb.append(", loc='").append(loc).append('\''); |
387 | 0 | sb.append('}'); |
388 | 0 | return sb.toString(); |
389 | |
} |
390 | |
}; |
391 | 4507 | lines.add(r); |
392 | 0 | } catch (IllegalArgumentException e) { |
393 | 0 | throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + |
394 | |
location, e); |
395 | 4507 | } |
396 | |
} |
397 | |
} |
398 | |
} |
399 | |
} |
400 | 6926 | } |
401 | |
|
402 | 149 | return lines; |
403 | |
} |
404 | |
|
405 | |
|
406 | |
|
407 | |
|
408 | |
|
409 | |
|
410 | |
|
411 | |
|
412 | |
private static RPattern pattern(final String regex) { |
413 | 9016 | boolean startsWith = regex.startsWith("^"); |
414 | 9016 | boolean endsWith = regex.endsWith("$"); |
415 | 9016 | final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); |
416 | 9016 | boolean boxes = content.contains("["); |
417 | |
|
418 | 9016 | if (!boxes) { |
419 | 8046 | if (startsWith && endsWith) { |
420 | |
|
421 | 633 | if (content.length() == 0) { |
422 | |
|
423 | 610 | return new RPattern() { |
424 | |
@Override |
425 | |
public boolean isMatch(CharSequence input) { |
426 | 34729 | return input.length() == 0; |
427 | |
} |
428 | |
}; |
429 | |
} else { |
430 | 23 | return new RPattern() { |
431 | |
@Override |
432 | |
public boolean isMatch(CharSequence input) { |
433 | 6720 | return input.equals(content); |
434 | |
} |
435 | |
}; |
436 | |
} |
437 | 7413 | } else if ((startsWith || endsWith) && content.length() == 0) { |
438 | |
|
439 | 7191 | return ALL_STRINGS_RMATCHER; |
440 | 222 | } else if (startsWith) { |
441 | |
|
442 | 188 | return new RPattern() { |
443 | |
@Override |
444 | |
public boolean isMatch(CharSequence input) { |
445 | 53370 | return startsWith(input, content); |
446 | |
} |
447 | |
}; |
448 | 34 | } else if (endsWith) { |
449 | |
|
450 | 34 | return new RPattern() { |
451 | |
@Override |
452 | |
public boolean isMatch(CharSequence input) { |
453 | 894 | return endsWith(input, content); |
454 | |
} |
455 | |
}; |
456 | |
} |
457 | |
} else { |
458 | 970 | boolean startsWithBox = content.startsWith("["); |
459 | 970 | boolean endsWithBox = content.endsWith("]"); |
460 | |
|
461 | 970 | if (startsWithBox && endsWithBox) { |
462 | 946 | String boxContent = content.substring(1, content.length() - 1); |
463 | 946 | if (!boxContent.contains("[")) { |
464 | |
|
465 | 933 | boolean negate = boxContent.startsWith("^"); |
466 | 933 | if (negate) { |
467 | 28 | boxContent = boxContent.substring(1); |
468 | |
} |
469 | 933 | final String bContent = boxContent; |
470 | 933 | final boolean shouldMatch = !negate; |
471 | |
|
472 | 933 | if (startsWith && endsWith) { |
473 | |
|
474 | 55 | return new RPattern() { |
475 | |
@Override |
476 | |
public boolean isMatch(CharSequence input) { |
477 | 15896 | return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch; |
478 | |
} |
479 | |
}; |
480 | 878 | } else if (startsWith) { |
481 | |
|
482 | 650 | return new RPattern() { |
483 | |
@Override |
484 | |
public boolean isMatch(CharSequence input) { |
485 | 43307 | return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch; |
486 | |
} |
487 | |
}; |
488 | 228 | } else if (endsWith) { |
489 | |
|
490 | 228 | return new RPattern() { |
491 | |
@Override |
492 | |
public boolean isMatch(CharSequence input) { |
493 | 13986 | return input.length() > 0 && |
494 | |
contains(bContent, input.charAt(input.length() - 1)) == shouldMatch; |
495 | |
} |
496 | |
}; |
497 | |
} |
498 | |
} |
499 | |
} |
500 | |
} |
501 | |
|
502 | 37 | return new RPattern() { |
503 | 37 | Pattern pattern = Pattern.compile(regex); |
504 | |
|
505 | |
@Override |
506 | |
public boolean isMatch(CharSequence input) { |
507 | 16196 | Matcher matcher = pattern.matcher(input); |
508 | 16196 | return matcher.find(); |
509 | |
} |
510 | |
}; |
511 | |
} |
512 | |
|
513 | |
private static boolean startsWith(CharSequence input, CharSequence prefix) { |
514 | 53370 | if (prefix.length() > input.length()) { |
515 | 4830 | return false; |
516 | |
} |
517 | 50012 | for (int i = 0; i < prefix.length(); i++) { |
518 | 49856 | if (input.charAt(i) != prefix.charAt(i)) { |
519 | 48384 | return false; |
520 | |
} |
521 | |
} |
522 | 156 | return true; |
523 | |
} |
524 | |
|
525 | |
private static String stripQuotes(String str) { |
526 | 18028 | if (str.startsWith(DOUBLE_QUOTE)) { |
527 | 18028 | str = str.substring(1); |
528 | |
} |
529 | |
|
530 | 18028 | if (str.endsWith(DOUBLE_QUOTE)) { |
531 | 18019 | str = str.substring(0, str.length() - 1); |
532 | |
} |
533 | |
|
534 | 18028 | return str; |
535 | |
} |
536 | |
|
537 | |
private final RPattern lContext; |
538 | |
|
539 | |
private final String pattern; |
540 | |
|
541 | |
private final PhonemeExpr phoneme; |
542 | |
|
543 | |
private final RPattern rContext; |
544 | |
|
545 | |
|
546 | |
|
547 | |
|
548 | |
|
549 | |
|
550 | |
|
551 | |
|
552 | |
|
553 | |
|
554 | |
|
555 | |
|
556 | |
|
557 | 4508 | public Rule(String pattern, String lContext, String rContext, PhonemeExpr phoneme) { |
558 | 4508 | this.pattern = pattern; |
559 | 4508 | this.lContext = pattern(lContext + "$"); |
560 | 4508 | this.rContext = pattern("^" + rContext); |
561 | 4508 | this.phoneme = phoneme; |
562 | 4508 | } |
563 | |
|
564 | |
|
565 | |
|
566 | |
|
567 | |
|
568 | |
|
569 | |
public RPattern getLContext() { |
570 | 0 | return this.lContext; |
571 | |
} |
572 | |
|
573 | |
|
574 | |
|
575 | |
|
576 | |
|
577 | |
|
578 | |
public String getPattern() { |
579 | 32427496 | return this.pattern; |
580 | |
} |
581 | |
|
582 | |
|
583 | |
|
584 | |
|
585 | |
|
586 | |
|
587 | |
public PhonemeExpr getPhoneme() { |
588 | 31736 | return this.phoneme; |
589 | |
} |
590 | |
|
591 | |
|
592 | |
|
593 | |
|
594 | |
|
595 | |
|
596 | |
public RPattern getRContext() { |
597 | 0 | return this.rContext; |
598 | |
} |
599 | |
|
600 | |
|
601 | |
|
602 | |
|
603 | |
|
604 | |
|
605 | |
|
606 | |
|
607 | |
|
608 | |
|
609 | |
|
610 | |
|
611 | |
public boolean patternAndContextMatches(CharSequence input, int i) { |
612 | 32427497 | if (i < 0) { |
613 | 1 | throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); |
614 | |
} |
615 | |
|
616 | 32427496 | int patternLength = this.pattern.length(); |
617 | 32427496 | int ipl = i + patternLength; |
618 | |
|
619 | 32427496 | if (ipl > input.length()) { |
620 | |
|
621 | 13169700 | return false; |
622 | |
} |
623 | |
|
624 | |
|
625 | |
|
626 | 19257796 | if (!input.subSequence(i, ipl).equals(this.pattern)) { |
627 | 19056307 | return false; |
628 | 201489 | } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) { |
629 | 151870 | return false; |
630 | |
} |
631 | 49619 | return this.lContext.isMatch(input.subSequence(0, i)); |
632 | |
} |
633 | |
} |