1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.commons.codec.language.bm; |
19 | |
|
20 | |
import java.io.InputStream; |
21 | |
import java.util.ArrayList; |
22 | |
import java.util.Arrays; |
23 | |
import java.util.Collections; |
24 | |
import java.util.Comparator; |
25 | |
import java.util.EnumMap; |
26 | |
import java.util.HashMap; |
27 | |
import java.util.HashSet; |
28 | |
import java.util.List; |
29 | |
import java.util.Map; |
30 | |
import java.util.Scanner; |
31 | |
import java.util.Set; |
32 | |
import java.util.regex.Matcher; |
33 | |
import java.util.regex.Pattern; |
34 | |
|
35 | |
|
36 | |
|
37 | |
|
38 | |
|
39 | |
|
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
|
48 | |
|
49 | |
|
50 | |
|
51 | |
|
52 | |
|
53 | |
|
54 | |
|
55 | |
|
56 | |
|
57 | |
|
58 | |
|
59 | |
|
60 | |
|
61 | |
|
62 | |
|
63 | |
|
64 | |
|
65 | |
|
66 | |
|
67 | |
|
68 | |
|
69 | |
|
70 | |
|
71 | |
|
72 | |
|
73 | |
|
74 | |
|
75 | |
|
76 | |
|
77 | |
|
78 | |
|
79 | |
|
80 | 106145 | public class Rule { |
81 | |
|
82 | 4876451 | public static final class Phoneme implements PhonemeExpr { |
83 | 92232 | public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() { |
84 | |
@Override |
85 | |
public int compare(final Phoneme o1, final Phoneme o2) { |
86 | 1220580 | for (int i = 0; i < o1.phonemeText.length(); i++) { |
87 | 1216151 | if (i >= o2.phonemeText.length()) { |
88 | 720 | return +1; |
89 | |
} |
90 | 1215431 | final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i); |
91 | 1215431 | if (c != 0) { |
92 | 87082 | return c; |
93 | |
} |
94 | |
} |
95 | |
|
96 | 4429 | if (o1.phonemeText.length() < o2.phonemeText.length()) { |
97 | 1020 | return -1; |
98 | |
} |
99 | |
|
100 | 3409 | return 0; |
101 | |
} |
102 | |
}; |
103 | |
|
104 | |
private final CharSequence phonemeText; |
105 | |
private final Languages.LanguageSet languages; |
106 | |
|
107 | 542556 | public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) { |
108 | 542556 | this.phonemeText = phonemeText; |
109 | 542556 | this.languages = languages; |
110 | 542556 | } |
111 | |
|
112 | |
public Phoneme append(final CharSequence str) { |
113 | 152227 | return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages); |
114 | |
} |
115 | |
|
116 | |
public Languages.LanguageSet getLanguages() { |
117 | 316698 | return this.languages; |
118 | |
} |
119 | |
|
120 | |
@Override |
121 | |
public Iterable<Phoneme> getPhonemes() { |
122 | 37869 | return Collections.singleton(this); |
123 | |
} |
124 | |
|
125 | |
public CharSequence getPhonemeText() { |
126 | 220364 | return this.phonemeText; |
127 | |
} |
128 | |
|
129 | |
public Phoneme join(final Phoneme right) { |
130 | 175837 | return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(), |
131 | |
this.languages.restrictTo(right.languages)); |
132 | |
} |
133 | |
} |
134 | |
|
135 | |
public interface PhonemeExpr { |
136 | |
Iterable<Phoneme> getPhonemes(); |
137 | |
} |
138 | |
|
139 | 63608 | public static final class PhonemeList implements PhonemeExpr { |
140 | |
private final List<Phoneme> phonemes; |
141 | |
|
142 | 1438 | public PhonemeList(final List<Phoneme> phonemes) { |
143 | 1438 | this.phonemes = phonemes; |
144 | 1438 | } |
145 | |
|
146 | |
@Override |
147 | |
public List<Phoneme> getPhonemes() { |
148 | 63608 | return this.phonemes; |
149 | |
} |
150 | |
} |
151 | |
|
152 | |
|
153 | |
|
154 | |
|
/**
 * A minimal matcher abstraction: decides whether a character sequence is accepted.
 */
public interface RPattern {
    /**
     * @param input the text to test
     * @return {@code true} if this pattern accepts {@code input}
     */
    boolean isMatch(CharSequence input);
}
158 | |
|
159 | 1 | public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() { |
160 | |
@Override |
161 | |
public boolean isMatch(final CharSequence input) { |
162 | 66010 | return true; |
163 | |
} |
164 | |
}; |
165 | |
|
/** The literal string {@code "ALL"}. */
public static final String ALL = "ALL";

/** Quote character removed from both ends of rule fields by {@code stripQuotes}. */
private static final String DOUBLE_QUOTE = "\"";

/** Prefix that marks an include directive inside a rule resource. */
private static final String HASH_INCLUDE = "#include";

/**
 * All rules loaded at class initialization, keyed by name type, then rule type, then language
 * name. For every rule type other than {@code RuleType.RULES} there is an additional
 * {@code "common"} entry. The two inner map levels are unmodifiable.
 */
private static final Map<NameType, Map<RuleType, Map<String, List<Rule>>>> RULES =
        new EnumMap<NameType, Map<RuleType, Map<String, List<Rule>>>>(NameType.class);

static {
    for (final NameType s : NameType.values()) {
        final Map<RuleType, Map<String, List<Rule>>> rts =
                new EnumMap<RuleType, Map<String, List<Rule>>>(RuleType.class);

        // The language list depends only on the name type, so look it up once per name type.
        final Languages ls = Languages.getInstance(s);

        for (final RuleType rt : RuleType.values()) {
            final Map<String, List<Rule>> rs = new HashMap<String, List<Rule>>();

            for (final String l : ls.getLanguages()) {
                // Opened outside the try so that a missing resource still surfaces as the
                // IllegalArgumentException thrown by createScanner, exactly as before.
                final Scanner scanner = createScanner(s, rt, l);
                try {
                    rs.put(l, parseRules(scanner, createResourceName(s, rt, l)));
                } catch (final IllegalStateException e) {
                    throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
                } finally {
                    // Release the underlying resource stream whether or not parsing succeeded.
                    scanner.close();
                }
            }
            if (!rt.equals(RuleType.RULES)) {
                final Scanner scanner = createScanner(s, rt, "common");
                try {
                    rs.put("common", parseRules(scanner, createResourceName(s, rt, "common")));
                } finally {
                    scanner.close();
                }
            }

            rts.put(rt, Collections.unmodifiableMap(rs));
        }

        RULES.put(s, Collections.unmodifiableMap(rts));
    }
}
201 | |
|
202 | |
private static boolean contains(final CharSequence chars, final char input) { |
203 | 312230 | for (int i = 0; i < chars.length(); i++) { |
204 | 270802 | if (chars.charAt(i) == input) { |
205 | 10453 | return true; |
206 | |
} |
207 | |
} |
208 | 41428 | return false; |
209 | |
} |
210 | |
|
211 | |
private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) { |
212 | 228 | return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt", |
213 | |
nameType.getName(), rt.getName(), lang); |
214 | |
} |
215 | |
|
216 | |
private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) { |
217 | 114 | final String resName = createResourceName(nameType, rt, lang); |
218 | 114 | final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); |
219 | |
|
220 | 114 | if (rulesIS == null) { |
221 | 0 | throw new IllegalArgumentException("Unable to load resource: " + resName); |
222 | |
} |
223 | |
|
224 | 114 | return new Scanner(rulesIS, ResourceConstants.ENCODING); |
225 | |
} |
226 | |
|
227 | |
private static Scanner createScanner(final String lang) { |
228 | 35 | final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang); |
229 | 35 | final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName); |
230 | |
|
231 | 35 | if (rulesIS == null) { |
232 | 0 | throw new IllegalArgumentException("Unable to load resource: " + resName); |
233 | |
} |
234 | |
|
235 | 35 | return new Scanner(rulesIS, ResourceConstants.ENCODING); |
236 | |
} |
237 | |
|
238 | |
private static boolean endsWith(final CharSequence input, final CharSequence suffix) { |
239 | 894 | if (suffix.length() > input.length()) { |
240 | 124 | return false; |
241 | |
} |
242 | 783 | for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) { |
243 | 770 | if (input.charAt(i) != suffix.charAt(j)) { |
244 | 757 | return false; |
245 | |
} |
246 | |
} |
247 | 13 | return true; |
248 | |
} |
249 | |
|
250 | |
|
251 | |
|
252 | |
|
253 | |
|
254 | |
|
255 | |
|
256 | |
|
257 | |
|
258 | |
|
259 | |
|
260 | |
|
261 | |
public static List<Rule> getInstance(final NameType nameType, final RuleType rt, |
262 | |
final Languages.LanguageSet langs) { |
263 | 134336 | return langs.isSingleton() ? getInstance(nameType, rt, langs.getAny()) : |
264 | |
getInstance(nameType, rt, Languages.ANY); |
265 | |
} |
266 | |
|
267 | |
|
268 | |
|
269 | |
|
270 | |
|
271 | |
|
272 | |
|
273 | |
|
274 | |
|
275 | |
|
276 | |
|
277 | |
|
278 | |
public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) { |
279 | 201505 | final List<Rule> rules = RULES.get(nameType).get(rt).get(lang); |
280 | |
|
281 | 201505 | if (rules == null) { |
282 | 1 | throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.", |
283 | |
nameType.getName(), rt.getName(), lang)); |
284 | |
} |
285 | |
|
286 | 201504 | return rules; |
287 | |
} |
288 | |
|
289 | |
private static Phoneme parsePhoneme(final String ph) { |
290 | 6388 | final int open = ph.indexOf("["); |
291 | 6388 | if (open >= 0) { |
292 | 597 | if (!ph.endsWith("]")) { |
293 | 0 | throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'"); |
294 | |
} |
295 | 597 | final String before = ph.substring(0, open); |
296 | 597 | final String in = ph.substring(open + 1, ph.length() - 1); |
297 | 597 | final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]"))); |
298 | |
|
299 | 597 | return new Phoneme(before, Languages.LanguageSet.from(langs)); |
300 | |
} else { |
301 | 5791 | return new Phoneme(ph, Languages.ANY_LANGUAGE); |
302 | |
} |
303 | |
} |
304 | |
|
305 | |
private static PhonemeExpr parsePhonemeExpr(final String ph) { |
306 | 4507 | if (ph.startsWith("(")) { |
307 | 1438 | if (!ph.endsWith(")")) { |
308 | 0 | throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'"); |
309 | |
} |
310 | |
|
311 | 1438 | final List<Phoneme> phs = new ArrayList<Phoneme>(); |
312 | 1438 | final String body = ph.substring(1, ph.length() - 1); |
313 | 4757 | for (final String part : body.split("[|]")) { |
314 | 3319 | phs.add(parsePhoneme(part)); |
315 | |
} |
316 | 1438 | if (body.startsWith("|") || body.endsWith("|")) { |
317 | 48 | phs.add(new Phoneme("", Languages.ANY_LANGUAGE)); |
318 | |
} |
319 | |
|
320 | 1438 | return new PhonemeList(phs); |
321 | |
} else { |
322 | 3069 | return parsePhoneme(ph); |
323 | |
} |
324 | |
} |
325 | |
|
326 | |
private static List<Rule> parseRules(final Scanner scanner, final String location) { |
327 | 149 | final List<Rule> lines = new ArrayList<Rule>(); |
328 | 149 | int currentLine = 0; |
329 | |
|
330 | 149 | boolean inMultilineComment = false; |
331 | 8245 | while (scanner.hasNextLine()) { |
332 | 8096 | currentLine++; |
333 | 8096 | final String rawLine = scanner.nextLine(); |
334 | 8096 | String line = rawLine; |
335 | |
|
336 | 8096 | if (inMultilineComment) { |
337 | 2235 | if (line.endsWith(ResourceConstants.EXT_CMT_END)) { |
338 | 149 | inMultilineComment = false; |
339 | |
} |
340 | |
} else { |
341 | 5861 | if (line.startsWith(ResourceConstants.EXT_CMT_START)) { |
342 | 149 | inMultilineComment = true; |
343 | |
} else { |
344 | |
|
345 | 5712 | final int cmtI = line.indexOf(ResourceConstants.CMT); |
346 | 5712 | if (cmtI >= 0) { |
347 | 857 | line = line.substring(0, cmtI); |
348 | |
} |
349 | |
|
350 | |
|
351 | 5712 | line = line.trim(); |
352 | |
|
353 | 5712 | if (line.length() == 0) { |
354 | 1170 | continue; |
355 | |
} |
356 | |
|
357 | 4542 | if (line.startsWith(HASH_INCLUDE)) { |
358 | |
|
359 | 35 | final String incl = line.substring(HASH_INCLUDE.length()).trim(); |
360 | 35 | if (incl.contains(" ")) { |
361 | 0 | throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " + |
362 | |
location); |
363 | |
} else { |
364 | 35 | lines.addAll(parseRules(createScanner(incl), location + "->" + incl)); |
365 | |
} |
366 | 35 | } else { |
367 | |
|
368 | 4507 | final String[] parts = line.split("\\s+"); |
369 | 4507 | if (parts.length != 4) { |
370 | 0 | throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + |
371 | |
" parts: " + rawLine + " in " + location); |
372 | |
} else { |
373 | |
try { |
374 | 4507 | final String pat = stripQuotes(parts[0]); |
375 | 4507 | final String lCon = stripQuotes(parts[1]); |
376 | 4507 | final String rCon = stripQuotes(parts[2]); |
377 | 4507 | final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3])); |
378 | 4507 | final int cLine = currentLine; |
379 | 4507 | final Rule r = new Rule(pat, lCon, rCon, ph) { |
380 | 4507 | private final int myLine = cLine; |
381 | 4507 | private final String loc = location; |
382 | |
|
383 | |
@Override |
384 | |
public String toString() { |
385 | 0 | final StringBuilder sb = new StringBuilder(); |
386 | 0 | sb.append("Rule"); |
387 | 0 | sb.append("{line=").append(myLine); |
388 | 0 | sb.append(", loc='").append(loc).append('\''); |
389 | 0 | sb.append('}'); |
390 | 0 | return sb.toString(); |
391 | |
} |
392 | |
}; |
393 | 4507 | lines.add(r); |
394 | 0 | } catch (final IllegalArgumentException e) { |
395 | 0 | throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " + |
396 | |
location, e); |
397 | 4507 | } |
398 | |
} |
399 | |
} |
400 | |
} |
401 | |
} |
402 | 6926 | } |
403 | |
|
404 | 149 | return lines; |
405 | |
} |
406 | |
|
407 | |
|
408 | |
|
409 | |
|
410 | |
|
411 | |
|
412 | |
|
413 | |
|
414 | |
private static RPattern pattern(final String regex) { |
415 | 9016 | final boolean startsWith = regex.startsWith("^"); |
416 | 9016 | final boolean endsWith = regex.endsWith("$"); |
417 | 9016 | final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length()); |
418 | 9016 | final boolean boxes = content.contains("["); |
419 | |
|
420 | 9016 | if (!boxes) { |
421 | 8046 | if (startsWith && endsWith) { |
422 | |
|
423 | 633 | if (content.length() == 0) { |
424 | |
|
425 | 610 | return new RPattern() { |
426 | |
@Override |
427 | |
public boolean isMatch(final CharSequence input) { |
428 | 34729 | return input.length() == 0; |
429 | |
} |
430 | |
}; |
431 | |
} else { |
432 | 23 | return new RPattern() { |
433 | |
@Override |
434 | |
public boolean isMatch(final CharSequence input) { |
435 | 6720 | return input.equals(content); |
436 | |
} |
437 | |
}; |
438 | |
} |
439 | 7413 | } else if ((startsWith || endsWith) && content.length() == 0) { |
440 | |
|
441 | 7191 | return ALL_STRINGS_RMATCHER; |
442 | 222 | } else if (startsWith) { |
443 | |
|
444 | 188 | return new RPattern() { |
445 | |
@Override |
446 | |
public boolean isMatch(final CharSequence input) { |
447 | 53370 | return startsWith(input, content); |
448 | |
} |
449 | |
}; |
450 | 34 | } else if (endsWith) { |
451 | |
|
452 | 34 | return new RPattern() { |
453 | |
@Override |
454 | |
public boolean isMatch(final CharSequence input) { |
455 | 894 | return endsWith(input, content); |
456 | |
} |
457 | |
}; |
458 | |
} |
459 | |
} else { |
460 | 970 | final boolean startsWithBox = content.startsWith("["); |
461 | 970 | final boolean endsWithBox = content.endsWith("]"); |
462 | |
|
463 | 970 | if (startsWithBox && endsWithBox) { |
464 | 946 | String boxContent = content.substring(1, content.length() - 1); |
465 | 946 | if (!boxContent.contains("[")) { |
466 | |
|
467 | 933 | final boolean negate = boxContent.startsWith("^"); |
468 | 933 | if (negate) { |
469 | 28 | boxContent = boxContent.substring(1); |
470 | |
} |
471 | 933 | final String bContent = boxContent; |
472 | 933 | final boolean shouldMatch = !negate; |
473 | |
|
474 | 933 | if (startsWith && endsWith) { |
475 | |
|
476 | 55 | return new RPattern() { |
477 | |
@Override |
478 | |
public boolean isMatch(final CharSequence input) { |
479 | 15896 | return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch; |
480 | |
} |
481 | |
}; |
482 | 878 | } else if (startsWith) { |
483 | |
|
484 | 650 | return new RPattern() { |
485 | |
@Override |
486 | |
public boolean isMatch(final CharSequence input) { |
487 | 43307 | return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch; |
488 | |
} |
489 | |
}; |
490 | 228 | } else if (endsWith) { |
491 | |
|
492 | 228 | return new RPattern() { |
493 | |
@Override |
494 | |
public boolean isMatch(final CharSequence input) { |
495 | 13986 | return input.length() > 0 && |
496 | |
contains(bContent, input.charAt(input.length() - 1)) == shouldMatch; |
497 | |
} |
498 | |
}; |
499 | |
} |
500 | |
} |
501 | |
} |
502 | |
} |
503 | |
|
504 | 37 | return new RPattern() { |
505 | 37 | Pattern pattern = Pattern.compile(regex); |
506 | |
|
507 | |
@Override |
508 | |
public boolean isMatch(final CharSequence input) { |
509 | 16196 | final Matcher matcher = pattern.matcher(input); |
510 | 16196 | return matcher.find(); |
511 | |
} |
512 | |
}; |
513 | |
} |
514 | |
|
515 | |
private static boolean startsWith(final CharSequence input, final CharSequence prefix) { |
516 | 53370 | if (prefix.length() > input.length()) { |
517 | 4830 | return false; |
518 | |
} |
519 | 50012 | for (int i = 0; i < prefix.length(); i++) { |
520 | 49856 | if (input.charAt(i) != prefix.charAt(i)) { |
521 | 48384 | return false; |
522 | |
} |
523 | |
} |
524 | 156 | return true; |
525 | |
} |
526 | |
|
527 | |
private static String stripQuotes(String str) { |
528 | 18028 | if (str.startsWith(DOUBLE_QUOTE)) { |
529 | 18028 | str = str.substring(1); |
530 | |
} |
531 | |
|
532 | 18028 | if (str.endsWith(DOUBLE_QUOTE)) { |
533 | 18019 | str = str.substring(0, str.length() - 1); |
534 | |
} |
535 | |
|
536 | 18028 | return str; |
537 | |
} |
538 | |
|
/** Matcher applied to the text before the pattern; built from the left context with {@code "$"} appended. */
private final RPattern lContext;

/** The literal text this rule looks for in the input. */
private final String pattern;

/** The phoneme expression attached to this rule. */
private final PhonemeExpr phoneme;

/** Matcher applied to the text after the pattern; built from the right context with {@code "^"} prepended. */
private final RPattern rContext;

/**
 * Creates a rule.
 *
 * @param pattern the literal text to match
 * @param lContext the expression the text before the pattern must end with
 * @param rContext the expression the text after the pattern must start with
 * @param phoneme the phoneme expression attached to this rule
 */
public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
    // Anchor the contexts: the left one at its end, the right one at its start.
    final String leftExpression = lContext + "$";
    final String rightExpression = "^" + rContext;

    this.pattern = pattern;
    this.lContext = pattern(leftExpression);
    this.rContext = pattern(rightExpression);
    this.phoneme = phoneme;
}
565 | |
|
566 | |
|
567 | |
|
568 | |
|
569 | |
|
570 | |
|
571 | |
public RPattern getLContext() { |
572 | 0 | return this.lContext; |
573 | |
} |
574 | |
|
575 | |
|
576 | |
|
577 | |
|
578 | |
|
579 | |
|
580 | |
public String getPattern() { |
581 | 32427496 | return this.pattern; |
582 | |
} |
583 | |
|
584 | |
|
585 | |
|
586 | |
|
587 | |
|
588 | |
|
589 | |
public PhonemeExpr getPhoneme() { |
590 | 31736 | return this.phoneme; |
591 | |
} |
592 | |
|
593 | |
|
594 | |
|
595 | |
|
596 | |
|
597 | |
|
598 | |
public RPattern getRContext() { |
599 | 0 | return this.rContext; |
600 | |
} |
601 | |
|
602 | |
|
603 | |
|
604 | |
|
605 | |
|
606 | |
|
607 | |
|
608 | |
|
609 | |
|
610 | |
|
611 | |
|
612 | |
|
613 | |
public boolean patternAndContextMatches(final CharSequence input, final int i) { |
614 | 32427497 | if (i < 0) { |
615 | 1 | throw new IndexOutOfBoundsException("Can not match pattern at negative indexes"); |
616 | |
} |
617 | |
|
618 | 32427496 | final int patternLength = this.pattern.length(); |
619 | 32427496 | final int ipl = i + patternLength; |
620 | |
|
621 | 32427496 | if (ipl > input.length()) { |
622 | |
|
623 | 13169700 | return false; |
624 | |
} |
625 | |
|
626 | |
|
627 | |
|
628 | 19257796 | if (!input.subSequence(i, ipl).equals(this.pattern)) { |
629 | 19056307 | return false; |
630 | 201489 | } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) { |
631 | 151870 | return false; |
632 | |
} |
633 | 49619 | return this.lContext.isMatch(input.subSequence(0, i)); |
634 | |
} |
635 | |
} |