Coverage Report

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.commons.codec.language.bm;
 
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.EnumMap;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Scanner;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 /**
  * A phoneme rule.
  * <p>
  * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
  * and a logical flag indicating if all languages must be in play. A rule matches if:
  * <ul>
  * <li>the pattern matches at the current position</li>
  * <li>the string up until the beginning of the pattern matches the left context</li>
  * <li>the string from the end of the pattern matches the right context</li>
  * <li>logical is ALL and all languages are in scope; or</li>
  * <li>logical is any other value and at least one language is in scope</li>
  * </ul>
  * <p>
  * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
  * to explicitly construct their own.
  * <p>
  * Rules are immutable and thread-safe.
  * <p>
  * <b>Rules resources</b>
  * <p>
  * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
  * named following the pattern:
  * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
  * <p>
  * The format of these resources is the following:
  * <ul>
  * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
  * will be interpreted as:
  * <ol>
  * <li>pattern</li>
  * <li>left context</li>
  * <li>right context</li>
  * <li>phoneme</li>
  * </ol>
  * </li>
  * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
  * as a comment.</li>
  * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
  * all content until a line ending in '*' and '/' is found.</li>
  * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
  * </ul>
  *
  * @since 1.6
  * @version $Id$
  */
 public class Rule {
 
     public static final class Phoneme implements PhonemeExpr {
         public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
             @Override
             public int compare(final Phoneme o1, final Phoneme o2) {
                 for (int i = 0; i < o1.phonemeText.length(); i++) {
                     if (i >= o2.phonemeText.length()) {
                         return +1;
                     }
                     final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
                     if (c != 0) {
                         return c;
                     }
                 }
 
                 if (o1.phonemeText.length() < o2.phonemeText.length()) {
                     return -1;
                 }
 
                 return 0;
             }
         };
 
         private final CharSequence phonemeText;
         private final Languages.LanguageSet languages;
 
         public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
             this.phonemeText = phonemeText;
             this.languages = languages;
         }
 
         public Phoneme append(final CharSequence str) {
             return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages);
         }
 
         public Languages.LanguageSet getLanguages() {
             return this.languages;
         }
 
         @Override
         public Iterable<Phoneme> getPhonemes() {
             return Collections.singleton(this);
         }
 
         public CharSequence getPhonemeText() {
             return this.phonemeText;
         }
 
         public Phoneme join(final Phoneme right) {
             return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
                                this.languages.restrictTo(right.languages));
         }
     }
 
     /**
      * Something that yields phonemes. Within this class it is implemented by {@link Phoneme} (a single phoneme)
      * and {@link PhonemeList} (several alternatives).
      */
     public interface PhonemeExpr {
         /**
          * Gets the phonemes this expression stands for.
          *
          * @return the phonemes
          */
         Iterable<Phoneme> getPhonemes();
     }
 
     /**
      * A PhonemeExpr backed by a list of phonemes.
      */
     public static final class PhonemeList implements PhonemeExpr {
         // held by reference: the list passed to the constructor is neither copied nor wrapped
         private final List<Phoneme> phonemes;

         /**
          * Wraps the given list.
          *
          * @param phonemes
          *            the phonemes; the list is stored as given, without copying
          */
         public PhonemeList(final List<Phoneme> phonemes) {
             this.phonemes = phonemes;
         }

         /**
          * Gets the wrapped list.
          *
          * @return the same list instance that was passed to the constructor
          */
         @Override
         public List<Phoneme> getPhonemes() {
             return this.phonemes;
         }
     }
 
     /**
      * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
      */
     public static interface RPattern {
         /**
          * Tests the given input against this pattern.
          *
          * @param input
          *            the text to test
          * @return true if the input is accepted by this pattern
          */
         boolean isMatch(CharSequence input);
     }
 
     /**
      * An RPattern that accepts every input. It is returned by <code>pattern(String)</code> when a context
      * consists of nothing but an anchor.
      */
     public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
         @Override
         public boolean isMatch(final CharSequence input) {
             return true;
         }
     };
 
     // NOTE(review): not referenced anywhere in this class; presumably the "ALL" logical flag mentioned in the
     // class documentation - confirm against callers.
     public static final String ALL = "ALL";

     // quote character removed from both ends of each rule column by stripQuotes
     private static final String DOUBLE_QUOTE = "\"";

     // prefix of a line that pulls in the rules of another resource
     private static final String HASH_INCLUDE = "#include";
 
     private static final Map<NameType, Map<RuleType, Map<String, List<Rule>>>> RULES =
             new EnumMap<NameType, Map<RuleType, Map<String, List<Rule>>>>(NameType.class);
 
     static {
         for (final NameType s : NameType.values()) {
             final Map<RuleType, Map<String, List<Rule>>> rts =
                     new EnumMap<RuleType, Map<String, List<Rule>>>(RuleType.class);
 
             for (final RuleType rt : RuleType.values()) {
                 final Map<String, List<Rule>> rs = new HashMap<String, List<Rule>>();
 
                 final Languages ls = Languages.getInstance(s);
                 for (final String l : ls.getLanguages()) {
                     try {
                         rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l)));
                     } catch (final IllegalStateException e) {
                         throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
                     }
                 }
                 if (!rt.equals(RuleType.RULES)) {
                     rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common")));
                 }
 
                 rts.put(rt, Collections.unmodifiableMap(rs));
             }
 
             RULES.put(s, Collections.unmodifiableMap(rts));
         }
     }
 
     private static boolean contains(final CharSequence chars, final char input) {
         for (int i = 0; i < chars.length(); i++) {
             if (chars.charAt(i) == input) {
                 return true;
             }
         }
         return false;
     }
 
     private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
         return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
                              nameType.getName(), rt.getName(), lang);
     }
 
     private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
         final String resName = createResourceName(nameType, rt, lang);
         final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
 
         if (rulesIS == null) {
             throw new IllegalArgumentException("Unable to load resource: " + resName);
         }
 
         return new Scanner(rulesIS, ResourceConstants.ENCODING);
     }
 
     private static Scanner createScanner(final String lang) {
         final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
         final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
 
         if (rulesIS == null) {
             throw new IllegalArgumentException("Unable to load resource: " + resName);
         }
 
         return new Scanner(rulesIS, ResourceConstants.ENCODING);
     }
 
     private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
         if (suffix.length() > input.length()) {
             return false;
         }
         for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
             if (input.charAt(i) != suffix.charAt(j)) {
                 return false;
             }
         }
         return true;
     }
 
     /**
      * Gets rules for a combination of name type, rule type and languages.
      *
      * @param nameType
      *            the NameType to consider
      * @param rt
      *            the RuleType to consider
      * @param langs
      *            the set of languages to consider
      * @return a list of Rules that apply
      */
     public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
                                          final Languages.LanguageSet langs) {
         return langs.isSingleton() ? getInstance(nameType, rt, langs.getAny()) :
                                      getInstance(nameType, rt, Languages.ANY);
     }
 
     /**
      * Gets rules for a combination of name type, rule type and a single language.
      *
      * @param nameType
      *            the NameType to consider
      * @param rt
      *            the RuleType to consider
      * @param lang
      *            the language to consider
      * @return a list rules for a combination of name type, rule type and a single language.
      */
     public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
         final List<Rule> rules = RULES.get(nameType).get(rt).get(lang);
 
         if (rules == null) {
             throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
                                                nameType.getName(), rt.getName(), lang));
         }
 
         return rules;
     }
 
     private static Phoneme parsePhoneme(final String ph) {
         final int open = ph.indexOf("[");
         if (open >= 0) {
             if (!ph.endsWith("]")) {
                 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
             }
             final String before = ph.substring(0, open);
             final String in = ph.substring(open + 1, ph.length() - 1);
             final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
 
             return new Phoneme(before, Languages.LanguageSet.from(langs));
         } else {
             return new Phoneme(ph, Languages.ANY_LANGUAGE);
         }
     }
 
     private static PhonemeExpr parsePhonemeExpr(final String ph) {
         if (ph.startsWith("(")) { // we have a bracketed list of options
             if (!ph.endsWith(")")) {
                 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
             }
 
             final List<Phoneme> phs = new ArrayList<Phoneme>();
             final String body = ph.substring(1, ph.length() - 1);
             for (final String part : body.split("[|]")) {
                 phs.add(parsePhoneme(part));
             }
             if (body.startsWith("|") || body.endsWith("|")) {
                 phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
             }
 
             return new PhonemeList(phs);
         } else {
             return parsePhoneme(ph);
         }
     }
 
     private static List<Rule> parseRules(final Scanner scanner, final String location) {
         final List<Rule> lines = new ArrayList<Rule>();
         int currentLine = 0;
 
         boolean inMultilineComment = false;
         while (scanner.hasNextLine()) {
             currentLine++;
             final String rawLine = scanner.nextLine();
             String line = rawLine;
 
             if (inMultilineComment) {
                 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
                     inMultilineComment = false;
                 }
             } else {
                 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
                     inMultilineComment = true;
                 } else {
                     // discard comments
                     final int cmtI = line.indexOf(ResourceConstants.CMT);
                     if (cmtI >= 0) {
                         line = line.substring(0, cmtI);
                     }
 
                     // trim leading-trailing whitespace
                     line = line.trim();
 
                     if (line.length() == 0) {
                         continue; // empty lines can be safely skipped
                     }
 
                     if (line.startsWith(HASH_INCLUDE)) {
                         // include statement
                         final String incl = line.substring(HASH_INCLUDE.length()).trim();
                         if (incl.contains(" ")) {
                             throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
                                                                location);
                         } else {
                             lines.addAll(parseRules(createScanner(incl), location + "->" + incl));
                         }
                     } else {
                         // rule
                         final String[] parts = line.split("\\s+");
                         if (parts.length != 4) {
                             throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
                                                                " parts: " + rawLine + " in " + location);
                         } else {
                             try {
                                 final String pat = stripQuotes(parts[0]);
                                 final String lCon = stripQuotes(parts[1]);
                                 final String rCon = stripQuotes(parts[2]);
                                 final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
                                 final int cLine = currentLine;
                                 final Rule r = new Rule(pat, lCon, rCon, ph) {
                                     private final int myLine = cLine;
                                     private final String loc = location;
 
                                     @Override
                                     public String toString() {
                                         final StringBuilder sb = new StringBuilder();
                                         sb.append("Rule");
                                         sb.append("{line=").append(myLine);
                                         sb.append(", loc='").append(loc).append('\'');
                                         sb.append('}');
                                         return sb.toString();
                                     }
                                 };
                                 lines.add(r);
                             } catch (final IllegalArgumentException e) {
                                 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
                                                                 location, e);
                             }
                         }
                     }
                 }
             }
         }
 
         return lines;
     }
 
     /**
      * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
      *
      * @param regex
      *            the regular expression to compile
      * @return an RPattern that will match this regex
      */
     private static RPattern pattern(final String regex) {
         final boolean startsWith = regex.startsWith("^");
         final boolean endsWith = regex.endsWith("$");
         final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
         final boolean boxes = content.contains("[");
 
         if (!boxes) {
             if (startsWith && endsWith) {
                 // exact match
                 if (content.length() == 0) {
                     // empty
                     return new RPattern() {
                         @Override
                         public boolean isMatch(final CharSequence input) {
                             return input.length() == 0;
                         }
                     };
                 } else {
                     return new RPattern() {
                         @Override
                         public boolean isMatch(final CharSequence input) {
                             return input.equals(content);
                         }
                     };
                 }
             } else if ((startsWith || endsWith) && content.length() == 0) {
                 // matches every string
                 return ALL_STRINGS_RMATCHER;
             } else if (startsWith) {
                 // matches from start
                 return new RPattern() {
                     @Override
                     public boolean isMatch(final CharSequence input) {
                         return startsWith(input, content);
                     }
                 };
             } else if (endsWith) {
                 // matches from start
                 return new RPattern() {
                     @Override
                     public boolean isMatch(final CharSequence input) {
                         return endsWith(input, content);
                     }
                 };
             }
         } else {
             final boolean startsWithBox = content.startsWith("[");
             final boolean endsWithBox = content.endsWith("]");
 
             if (startsWithBox && endsWithBox) {
                 String boxContent = content.substring(1, content.length() - 1);
                 if (!boxContent.contains("[")) {
                     // box containing alternatives
                     final boolean negate = boxContent.startsWith("^");
                     if (negate) {
                         boxContent = boxContent.substring(1);
                     }
                     final String bContent = boxContent;
                     final boolean shouldMatch = !negate;
 
                     if (startsWith && endsWith) {
                         // exact match
                         return new RPattern() {
                             @Override
                             public boolean isMatch(final CharSequence input) {
                                 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
                             }
                         };
                     } else if (startsWith) {
                         // first char
                         return new RPattern() {
                             @Override
                             public boolean isMatch(final CharSequence input) {
                                 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
                             }
                         };
                     } else if (endsWith) {
                         // last char
                         return new RPattern() {
                             @Override
                             public boolean isMatch(final CharSequence input) {
                                 return input.length() > 0 &&
                                        contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
                             }
                         };
                     }
                 }
             }
         }
 
         return new RPattern() {
             Pattern pattern = Pattern.compile(regex);
 
             @Override
             public boolean isMatch(final CharSequence input) {
                 final Matcher matcher = pattern.matcher(input);
                 return matcher.find();
             }
         };
     }
 
     private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
         if (prefix.length() > input.length()) {
             return false;
         }
         for (int i = 0; i < prefix.length(); i++) {
             if (input.charAt(i) != prefix.charAt(i)) {
                 return false;
             }
         }
         return true;
     }
 
     private static String stripQuotes(String str) {
         if (str.startsWith(DOUBLE_QUOTE)) {
             str = str.substring(1);
         }
 
         if (str.endsWith(DOUBLE_QUOTE)) {
             str = str.substring(0, str.length() - 1);
         }
 
         return str;
     }
 
     // matcher for the text to the left of the pattern; built from the left context with a trailing '$'
     private final RPattern lContext;

     // literal text that must occur at the position being tested
     private final String pattern;

     // phoneme expression produced when this rule applies
     private final PhonemeExpr phoneme;

     // matcher for the text to the right of the pattern; built from the right context with a leading '^'
     private final RPattern rContext;
 
     /**
      * Creates a new rule.
      * <p>
      * The left context is compiled with a '$' appended, so it is matched against the end of the text that
      * precedes the pattern; the right context is compiled with a '^' prepended, so it is matched against the
      * start of the text that follows the pattern.
      *
      * @param pattern
      *            the pattern
      * @param lContext
      *            the left context
      * @param rContext
      *            the right context
      * @param phoneme
      *            the resulting phoneme
      */
     public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
         this.pattern = pattern;
         // anchor the left context at the end of the preceding text
         this.lContext = pattern(lContext + "$");
         // anchor the right context at the start of the following text
         this.rContext = pattern("^" + rContext);
         this.phoneme = phoneme;
     }
 
     /**
      * Gets the left context. This is a regular expression that must match to the left of the pattern.
      * The returned matcher was built from the left-context expression with a '$' appended.
      *
      * @return the left context Pattern
      */
     public RPattern getLContext() {
         return this.lContext;
     }
 
     /**
      * Gets the pattern. This is a string-literal that must exactly match.
      *
      * @return the pattern, exactly as passed to the constructor
      */
     public String getPattern() {
         return this.pattern;
     }
 
     /**
      * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
      *
      * @return the phoneme expression, exactly as passed to the constructor
      */
     public PhonemeExpr getPhoneme() {
         return this.phoneme;
     }
 
     /**
      * Gets the right context. This is a regular expression that must match to the right of the pattern.
      * The returned matcher was built from the right-context expression with a '^' prepended.
      *
      * @return the right context Pattern
      */
     public RPattern getRContext() {
         return this.rContext;
     }
 
     /**
      * Decides if the pattern and context match the input starting at a position. It is a match if the
      * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
      * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
      *
      * @param input
      *            the input String
      * @param i
      *            the int position within the input
      * @return true if the pattern and left/right context match, false otherwise
      */
     public boolean patternAndContextMatches(final CharSequence input, final int i) {
         if (i < 0) {
             throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
         }
 
         final int patternLength = this.pattern.length();
         final int ipl = i + patternLength;
 
         if (ipl > input.length()) {
             // not enough room for the pattern to match
             return false;
         }
 
         // evaluate the pattern, left context and right context
         // fail early if any of the evaluations is not successful
         if (!input.subSequence(i, ipl).equals(this.pattern)) {
             return false;
         } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
             return false;
         }
         return this.lContext.isMatch(input.subSequence(0, i));
     }
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.commons.codec.language.bm;
19
20		import java.io.InputStream;
21		import java.util.ArrayList;
22		import java.util.Arrays;
23		import java.util.Collections;
24		import java.util.Comparator;
25		import java.util.EnumMap;
26		import java.util.HashMap;
27		import java.util.HashSet;
28		import java.util.List;
29		import java.util.Map;
30		import java.util.Scanner;
31		import java.util.Set;
32		import java.util.regex.Matcher;
33		import java.util.regex.Pattern;
34
35		/**
36		* A phoneme rule.
37		* <p>
38		* Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
39		* and a logical flag indicating if all languages must be in play. A rule matches if:
40		* <ul>
41		* <li>the pattern matches at the current position</li>
42		* <li>the string up until the beginning of the pattern matches the left context</li>
43		* <li>the string from the end of the pattern matches the right context</li>
44		* <li>logical is ALL and all languages are in scope; or</li>
45		* <li>logical is any other value and at least one language is in scope</li>
46		* </ul>
47		* <p>
48		* Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
49		* to explicitly construct their own.
50		* <p>
51		* Rules are immutable and thread-safe.
52		* <p>
53		* <b>Rules resources</b>
54		* <p>
55		* Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
56		* named following the pattern:
57		* <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
58		* <p>
59		* The format of these resources is the following:
60		* <ul>
61		* <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
62		* will be interpreted as:
63		* <ol>
64		* <li>pattern</li>
65		* <li>left context</li>
66		* <li>right context</li>
67		* <li>phoneme</li>
68		* </ol>
69		* </li>
70		* <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
71		* as a comment.</li>
72		* <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
73		* all content until a line ending in '*' and '/' is found.</li>
74		* <li><b>Blank lines:</b> All blank lines will be skipped.</li>
75		* </ul>
76		*
77		* @since 1.6
78		* @version $Id$
79		*/
80	106145	public class Rule {
81
82	4876451	public static final class Phoneme implements PhonemeExpr {
83	92232	public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
84		@Override
85		public int compare(final Phoneme o1, final Phoneme o2) {
86	1220580	for (int i = 0; i < o1.phonemeText.length(); i++) {
87	1216151	if (i >= o2.phonemeText.length()) {
88	720	return +1;
89		}
90	1215431	final int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
91	1215431	if (c != 0) {
92	87082	return c;
93		}
94		}
95
96	4429	if (o1.phonemeText.length() < o2.phonemeText.length()) {
97	1020	return -1;
98		}
99
100	3409	return 0;
101		}
102		};
103
104		private final CharSequence phonemeText;
105		private final Languages.LanguageSet languages;
106
107	542556	public Phoneme(final CharSequence phonemeText, final Languages.LanguageSet languages) {
108	542556	this.phonemeText = phonemeText;
109	542556	this.languages = languages;
110	542556	}
111
112		public Phoneme append(final CharSequence str) {
113	152227	return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages);
114		}
115
116		public Languages.LanguageSet getLanguages() {
117	316698	return this.languages;
118		}
119
120		@Override
121		public Iterable<Phoneme> getPhonemes() {
122	37869	return Collections.singleton(this);
123		}
124
125		public CharSequence getPhonemeText() {
126	220364	return this.phonemeText;
127		}
128
129		public Phoneme join(final Phoneme right) {
130	175837	return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
131		this.languages.restrictTo(right.languages));
132		}
133		}
134
135		public interface PhonemeExpr {
136		Iterable<Phoneme> getPhonemes();
137		}
138
139	63608	public static final class PhonemeList implements PhonemeExpr {
140		private final List<Phoneme> phonemes;
141
142	1438	public PhonemeList(final List<Phoneme> phonemes) {
143	1438	this.phonemes = phonemes;
144	1438	}
145
146		@Override
147		public List<Phoneme> getPhonemes() {
148	63608	return this.phonemes;
149		}
150		}
151
152		/**
153		* A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
154		*/
155		public static interface RPattern {
156		boolean isMatch(CharSequence input);
157		}
158
159	1	public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
160		@Override
161		public boolean isMatch(final CharSequence input) {
162	66010	return true;
163		}
164		};
165
166		public static final String ALL = "ALL";
167
168		private static final String DOUBLE_QUOTE = "\"";
169
170		private static final String HASH_INCLUDE = "#include";
171
172	1	private static final Map<NameType, Map<RuleType, Map<String, List<Rule>>>> RULES =
173		new EnumMap<NameType, Map<RuleType, Map<String, List<Rule>>>>(NameType.class);
174
175		static {
176	4	for (final NameType s : NameType.values()) {
177	3	final Map<RuleType, Map<String, List<Rule>>> rts =
178		new EnumMap<RuleType, Map<String, List<Rule>>>(RuleType.class);
179
180	12	for (final RuleType rt : RuleType.values()) {
181	9	final Map<String, List<Rule>> rs = new HashMap<String, List<Rule>>();
182
183	9	final Languages ls = Languages.getInstance(s);
184	9	for (final String l : ls.getLanguages()) {
185		try {
186	108	rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l)));
187	0	} catch (final IllegalStateException e) {
188	0	throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
189	108	}
190		}
191	9	if (!rt.equals(RuleType.RULES)) {
192	6	rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common")));
193		}
194
195	9	rts.put(rt, Collections.unmodifiableMap(rs));
196		}
197
198	3	RULES.put(s, Collections.unmodifiableMap(rts));
199		}
200	1	}
201
202		private static boolean contains(final CharSequence chars, final char input) {
203	312230	for (int i = 0; i < chars.length(); i++) {
204	270802	if (chars.charAt(i) == input) {
205	10453	return true;
206		}
207		}
208	41428	return false;
209		}
210
211		private static String createResourceName(final NameType nameType, final RuleType rt, final String lang) {
212	228	return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
213		nameType.getName(), rt.getName(), lang);
214		}
215
216		private static Scanner createScanner(final NameType nameType, final RuleType rt, final String lang) {
217	114	final String resName = createResourceName(nameType, rt, lang);
218	114	final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
219
220	114	if (rulesIS == null) {
221	0	throw new IllegalArgumentException("Unable to load resource: " + resName);
222		}
223
224	114	return new Scanner(rulesIS, ResourceConstants.ENCODING);
225		}
226
227		private static Scanner createScanner(final String lang) {
228	35	final String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
229	35	final InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
230
231	35	if (rulesIS == null) {
232	0	throw new IllegalArgumentException("Unable to load resource: " + resName);
233		}
234
235	35	return new Scanner(rulesIS, ResourceConstants.ENCODING);
236		}
237
238		private static boolean endsWith(final CharSequence input, final CharSequence suffix) {
239	894	if (suffix.length() > input.length()) {
240	124	return false;
241		}
242	783	for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
243	770	if (input.charAt(i) != suffix.charAt(j)) {
244	757	return false;
245		}
246		}
247	13	return true;
248		}
249
250		/**
251		* Gets rules for a combination of name type, rule type and languages.
252		*
253		* @param nameType
254		* the NameType to consider
255		* @param rt
256		* the RuleType to consider
257		* @param langs
258		* the set of languages to consider
259		* @return a list of Rules that apply
260		*/
261		public static List<Rule> getInstance(final NameType nameType, final RuleType rt,
262		final Languages.LanguageSet langs) {
263	134336	return langs.isSingleton() ? getInstance(nameType, rt, langs.getAny()) :
264		getInstance(nameType, rt, Languages.ANY);
265		}
266
267		/**
268		* Gets rules for a combination of name type, rule type and a single language.
269		*
270		* @param nameType
271		* the NameType to consider
272		* @param rt
273		* the RuleType to consider
274		* @param lang
275		* the language to consider
276		* @return a list rules for a combination of name type, rule type and a single language.
277		*/
278		public static List<Rule> getInstance(final NameType nameType, final RuleType rt, final String lang) {
279	201505	final List<Rule> rules = RULES.get(nameType).get(rt).get(lang);
280
281	201505	if (rules == null) {
282	1	throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
283		nameType.getName(), rt.getName(), lang));
284		}
285
286	201504	return rules;
287		}
288
289		private static Phoneme parsePhoneme(final String ph) {
290	6388	final int open = ph.indexOf("[");
291	6388	if (open >= 0) {
292	597	if (!ph.endsWith("]")) {
293	0	throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
294		}
295	597	final String before = ph.substring(0, open);
296	597	final String in = ph.substring(open + 1, ph.length() - 1);
297	597	final Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
298
299	597	return new Phoneme(before, Languages.LanguageSet.from(langs));
300		} else {
301	5791	return new Phoneme(ph, Languages.ANY_LANGUAGE);
302		}
303		}
304
305		private static PhonemeExpr parsePhonemeExpr(final String ph) {
306	4507	if (ph.startsWith("(")) { // we have a bracketed list of options
307	1438	if (!ph.endsWith(")")) {
308	0	throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
309		}
310
311	1438	final List<Phoneme> phs = new ArrayList<Phoneme>();
312	1438	final String body = ph.substring(1, ph.length() - 1);
313	4757	for (final String part : body.split("[\|]")) {
314	3319	phs.add(parsePhoneme(part));
315		}
316	1438	if (body.startsWith("\|") \|\| body.endsWith("\|")) {
317	48	phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
318		}
319
320	1438	return new PhonemeList(phs);
321		} else {
322	3069	return parsePhoneme(ph);
323		}
324		}
325
326		private static List<Rule> parseRules(final Scanner scanner, final String location) {
327	149	final List<Rule> lines = new ArrayList<Rule>();
328	149	int currentLine = 0;
329
330	149	boolean inMultilineComment = false;
331	8245	while (scanner.hasNextLine()) {
332	8096	currentLine++;
333	8096	final String rawLine = scanner.nextLine();
334	8096	String line = rawLine;
335
336	8096	if (inMultilineComment) {
337	2235	if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
338	149	inMultilineComment = false;
339		}
340		} else {
341	5861	if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
342	149	inMultilineComment = true;
343		} else {
344		// discard comments
345	5712	final int cmtI = line.indexOf(ResourceConstants.CMT);
346	5712	if (cmtI >= 0) {
347	857	line = line.substring(0, cmtI);
348		}
349
350		// trim leading-trailing whitespace
351	5712	line = line.trim();
352
353	5712	if (line.length() == 0) {
354	1170	continue; // empty lines can be safely skipped
355		}
356
357	4542	if (line.startsWith(HASH_INCLUDE)) {
358		// include statement
359	35	final String incl = line.substring(HASH_INCLUDE.length()).trim();
360	35	if (incl.contains(" ")) {
361	0	throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
362		location);
363		} else {
364	35	lines.addAll(parseRules(createScanner(incl), location + "->" + incl));
365		}
366	35	} else {
367		// rule
368	4507	final String[] parts = line.split("\\s+");
369	4507	if (parts.length != 4) {
370	0	throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
371		" parts: " + rawLine + " in " + location);
372		} else {
373		try {
374	4507	final String pat = stripQuotes(parts[0]);
375	4507	final String lCon = stripQuotes(parts[1]);
376	4507	final String rCon = stripQuotes(parts[2]);
377	4507	final PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
378	4507	final int cLine = currentLine;
379	4507	final Rule r = new Rule(pat, lCon, rCon, ph) {
380	4507	private final int myLine = cLine;
381	4507	private final String loc = location;
382
383		@Override
384		public String toString() {
385	0	final StringBuilder sb = new StringBuilder();
386	0	sb.append("Rule");
387	0	sb.append("{line=").append(myLine);
388	0	sb.append(", loc='").append(loc).append('\'');
389	0	sb.append('}');
390	0	return sb.toString();
391		}
392		};
393	4507	lines.add(r);
394	0	} catch (final IllegalArgumentException e) {
395	0	throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
396		location, e);
397	4507	}
398		}
399		}
400		}
401		}
402	6926	}
403
404	149	return lines;
405		}
406
407		/**
408		* Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
409		*
410		* @param regex
411		* the regular expression to compile
412		* @return an RPattern that will match this regex
413		*/
414		private static RPattern pattern(final String regex) {
415	9016	final boolean startsWith = regex.startsWith("^");
416	9016	final boolean endsWith = regex.endsWith("$");
417	9016	final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
418	9016	final boolean boxes = content.contains("[");
419
420	9016	if (!boxes) {
421	8046	if (startsWith && endsWith) {
422		// exact match
423	633	if (content.length() == 0) {
424		// empty
425	610	return new RPattern() {
426		@Override
427		public boolean isMatch(final CharSequence input) {
428	34729	return input.length() == 0;
429		}
430		};
431		} else {
432	23	return new RPattern() {
433		@Override
434		public boolean isMatch(final CharSequence input) {
435	6720	return input.equals(content);
436		}
437		};
438		}
439	7413	} else if ((startsWith \|\| endsWith) && content.length() == 0) {
440		// matches every string
441	7191	return ALL_STRINGS_RMATCHER;
442	222	} else if (startsWith) {
443		// matches from start
444	188	return new RPattern() {
445		@Override
446		public boolean isMatch(final CharSequence input) {
447	53370	return startsWith(input, content);
448		}
449		};
450	34	} else if (endsWith) {
451		// matches from start
452	34	return new RPattern() {
453		@Override
454		public boolean isMatch(final CharSequence input) {
455	894	return endsWith(input, content);
456		}
457		};
458		}
459		} else {
460	970	final boolean startsWithBox = content.startsWith("[");
461	970	final boolean endsWithBox = content.endsWith("]");
462
463	970	if (startsWithBox && endsWithBox) {
464	946	String boxContent = content.substring(1, content.length() - 1);
465	946	if (!boxContent.contains("[")) {
466		// box containing alternatives
467	933	final boolean negate = boxContent.startsWith("^");
468	933	if (negate) {
469	28	boxContent = boxContent.substring(1);
470		}
471	933	final String bContent = boxContent;
472	933	final boolean shouldMatch = !negate;
473
474	933	if (startsWith && endsWith) {
475		// exact match
476	55	return new RPattern() {
477		@Override
478		public boolean isMatch(final CharSequence input) {
479	15896	return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
480		}
481		};
482	878	} else if (startsWith) {
483		// first char
484	650	return new RPattern() {
485		@Override
486		public boolean isMatch(final CharSequence input) {
487	43307	return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
488		}
489		};
490	228	} else if (endsWith) {
491		// last char
492	228	return new RPattern() {
493		@Override
494		public boolean isMatch(final CharSequence input) {
495	13986	return input.length() > 0 &&
496		contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
497		}
498		};
499		}
500		}
501		}
502		}
503
504	37	return new RPattern() {
505	37	Pattern pattern = Pattern.compile(regex);
506
507		@Override
508		public boolean isMatch(final CharSequence input) {
509	16196	final Matcher matcher = pattern.matcher(input);
510	16196	return matcher.find();
511		}
512		};
513		}
514
515		private static boolean startsWith(final CharSequence input, final CharSequence prefix) {
516	53370	if (prefix.length() > input.length()) {
517	4830	return false;
518		}
519	50012	for (int i = 0; i < prefix.length(); i++) {
520	49856	if (input.charAt(i) != prefix.charAt(i)) {
521	48384	return false;
522		}
523		}
524	156	return true;
525		}
526
527		private static String stripQuotes(String str) {
528	18028	if (str.startsWith(DOUBLE_QUOTE)) {
529	18028	str = str.substring(1);
530		}
531
532	18028	if (str.endsWith(DOUBLE_QUOTE)) {
533	18019	str = str.substring(0, str.length() - 1);
534		}
535
536	18028	return str;
537		}
538
539		private final RPattern lContext;
540
541		private final String pattern;
542
543		private final PhonemeExpr phoneme;
544
545		private final RPattern rContext;
546
547		/**
548		* Creates a new rule.
549		*
550		* @param pattern
551		* the pattern
552		* @param lContext
553		* the left context
554		* @param rContext
555		* the right context
556		* @param phoneme
557		* the resulting phoneme
558		*/
559	4508	public Rule(final String pattern, final String lContext, final String rContext, final PhonemeExpr phoneme) {
560	4508	this.pattern = pattern;
561	4508	this.lContext = pattern(lContext + "$");
562	4508	this.rContext = pattern("^" + rContext);
563	4508	this.phoneme = phoneme;
564	4508	}
565
566		/**
567		* Gets the left context. This is a regular expression that must match to the left of the pattern.
568		*
569		* @return the left context Pattern
570		*/
571		public RPattern getLContext() {
572	0	return this.lContext;
573		}
574
575		/**
576		* Gets the pattern. This is a string-literal that must exactly match.
577		*
578		* @return the pattern
579		*/
580		public String getPattern() {
581	32427496	return this.pattern;
582		}
583
584		/**
585		* Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
586		*
587		* @return the phoneme
588		*/
589		public PhonemeExpr getPhoneme() {
590	31736	return this.phoneme;
591		}
592
593		/**
594		* Gets the right context. This is a regular expression that must match to the right of the pattern.
595		*
596		* @return the right context Pattern
597		*/
598		public RPattern getRContext() {
599	0	return this.rContext;
600		}
601
602		/**
603		* Decides if the pattern and context match the input starting at a position. It is a match if the
604		* <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
605		* <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
606		*
607		* @param input
608		* the input String
609		* @param i
610		* the int position within the input
611		* @return true if the pattern and left/right context match, false otherwise
612		*/
613		public boolean patternAndContextMatches(final CharSequence input, final int i) {
614	32427497	if (i < 0) {
615	1	throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
616		}
617
618	32427496	final int patternLength = this.pattern.length();
619	32427496	final int ipl = i + patternLength;
620
621	32427496	if (ipl > input.length()) {
622		// not enough room for the pattern to match
623	13169700	return false;
624		}
625
626		// evaluate the pattern, left context and right context
627		// fail early if any of the evaluations is not successful
628	19257796	if (!input.subSequence(i, ipl).equals(this.pattern)) {
629	19056307	return false;
630	201489	} else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
631	151870	return false;
632		}
633	49619	return this.lContext.isMatch(input.subSequence(0, i));
634		}
635		}