Coverage Report

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.commons.codec.language.bm;
 
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.EnumMap;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Scanner;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 /**
  * A phoneme rule.
  * <p>
  * Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
  * and a logical flag indicating if all languages must be in play. A rule matches if:
  * <ul>
  * <li>the pattern matches at the current position</li>
  * <li>the string up until the beginning of the pattern matches the left context</li>
  * <li>the string from the end of the pattern matches the right context</li>
  * <li>logical is ALL and all languages are in scope; or</li>
  * <li>logical is any other value and at least one language is in scope</li>
  * </ul>
  * <p>
  * Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
  * to explicitly construct their own.
  * <p>
  * Rules are immutable and thread-safe.
  * <p>
  * <b>Rules resources</b>
  * <p>
  * Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
  * named following the pattern:
  * <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
  * <p>
  * The format of these resources is the following:
  * <ul>
  * <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
  * will be interpreted as:
  * <ol>
  * <li>pattern</li>
  * <li>left context</li>
  * <li>right context</li>
  * <li>phoneme</li>
  * </ol>
  * </li>
  * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
  * as a comment.</li>
  * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
  * all content until a line ending in '*' and '/' is found.</li>
  * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
  * </ul>
  *
  * @since 1.6
  * @version $Id$
  */
 public class Rule {
 
     public static final class Phoneme implements PhonemeExpr {
         public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
             @Override
             public int compare(Phoneme o1, Phoneme o2) {
                 for (int i = 0; i < o1.phonemeText.length(); i++) {
                     if (i >= o2.phonemeText.length()) {
                         return +1;
                     }
                     int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
                     if (c != 0) {
                         return c;
                     }
                 }
 
                 if (o1.phonemeText.length() < o2.phonemeText.length()) {
                     return -1;
                 }
 
                 return 0;
             }
         };
 
         private final CharSequence phonemeText;
         private final Languages.LanguageSet languages;
 
         public Phoneme(CharSequence phonemeText, Languages.LanguageSet languages) {
             this.phonemeText = phonemeText;
             this.languages = languages;
         }
 
         public Phoneme append(CharSequence str) {
             return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages);
         }
 
         public Languages.LanguageSet getLanguages() {
             return this.languages;
         }
 
         @Override
         public Iterable<Phoneme> getPhonemes() {
             return Collections.singleton(this);
         }
 
         public CharSequence getPhonemeText() {
             return this.phonemeText;
         }
 
         public Phoneme join(Phoneme right) {
             return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
                                this.languages.restrictTo(right.languages));
         }
     }
 
     /**
      * Something that can be expanded into phonemes. Within this class it is implemented by
      * {@link Phoneme} (a single phoneme) and {@link PhonemeList} (a list of alternatives).
      */
     public interface PhonemeExpr {
         /**
          * Gets the phonemes this expression stands for.
          *
          * @return the phonemes of this expression
          */
         Iterable<Phoneme> getPhonemes();
     }
 
     public static final class PhonemeList implements PhonemeExpr {
         private final List<Phoneme> phonemes;
 
         public PhonemeList(List<Phoneme> phonemes) {
             this.phonemes = phonemes;
         }
 
         @Override
         public List<Phoneme> getPhonemes() {
             return this.phonemes;
         }
     }
 
     /**
      * A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
      */
     public static interface RPattern {
         /**
          * Tests the given input against this pattern.
          *
          * @param input
          *            the character sequence to test
          * @return true if this pattern accepts the input, false otherwise
          */
         boolean isMatch(CharSequence input);
     }
 
     /** An RPattern whose <code>isMatch</code> returns true for every input. */
     public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
         @Override
         public boolean isMatch(CharSequence input) {
             return true;
         }
     };

     // NOTE(review): not referenced anywhere else in this class; presumably the "ALL" logical flag
     // mentioned in the class documentation - confirm against callers.
     public static final String ALL = "ALL";

     /** The quote character removed from both ends of each column of a rule line. */
     private static final String DOUBLE_QUOTE = "\"";

     /** Prefix that marks a line of a rules resource as an include statement. */
     private static final String HASH_INCLUDE = "#include";

     /**
      * All parsed rules, keyed by name type, then rule type, then language name. Filled once by the
      * static initializer below; the two inner map levels are wrapped as unmodifiable maps.
      */
     private static final Map<NameType, Map<RuleType, Map<String, List<Rule>>>> RULES =
             new EnumMap<NameType, Map<RuleType, Map<String, List<Rule>>>>(NameType.class);
 
     static {
         for (NameType s : NameType.values()) {
             Map<RuleType, Map<String, List<Rule>>> rts = new EnumMap<RuleType, Map<String, List<Rule>>>(RuleType.class);
 
             for (RuleType rt : RuleType.values()) {
                 Map<String, List<Rule>> rs = new HashMap<String, List<Rule>>();
 
                 Languages ls = Languages.getInstance(s);
                 for (String l : ls.getLanguages()) {
                     try {
                         rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l)));
                     } catch (IllegalStateException e) {
                         throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
                     }
                 }
                 if (!rt.equals(RuleType.RULES)) {
                     rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common")));
                 }
 
                 rts.put(rt, Collections.unmodifiableMap(rs));
             }
 
             RULES.put(s, Collections.unmodifiableMap(rts));
         }
     }
 
     private static boolean contains(CharSequence chars, char input) {
         for (int i = 0; i < chars.length(); i++) {
             if (chars.charAt(i) == input) {
                 return true;
             }
         }
         return false;
     }
 
     private static String createResourceName(NameType nameType, RuleType rt, String lang) {
         return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
                              nameType.getName(), rt.getName(), lang);
     }
 
     private static Scanner createScanner(NameType nameType, RuleType rt, String lang) {
         String resName = createResourceName(nameType, rt, lang);
         InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
 
         if (rulesIS == null) {
             throw new IllegalArgumentException("Unable to load resource: " + resName);
         }
 
         return new Scanner(rulesIS, ResourceConstants.ENCODING);
     }
 
     private static Scanner createScanner(String lang) {
         String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
         InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
 
         if (rulesIS == null) {
             throw new IllegalArgumentException("Unable to load resource: " + resName);
         }
 
         return new Scanner(rulesIS, ResourceConstants.ENCODING);
     }
 
     private static boolean endsWith(CharSequence input, CharSequence suffix) {
         if (suffix.length() > input.length()) {
             return false;
         }
         for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
             if (input.charAt(i) != suffix.charAt(j)) {
                 return false;
             }
         }
         return true;
     }
 
     /**
      * Gets rules for a combination of name type, rule type and languages.
      *
      * @param nameType
      *            the NameType to consider
      * @param rt
      *            the RuleType to consider
      * @param langs
      *            the set of languages to consider
      * @return a list of Rules that apply
      */
     public static List<Rule> getInstance(NameType nameType, RuleType rt, Languages.LanguageSet langs) {
         return langs.isSingleton() ? getInstance(nameType, rt, langs.getAny()) :
                                      getInstance(nameType, rt, Languages.ANY);
     }
 
     /**
      * Gets rules for a combination of name type, rule type and a single language.
      *
      * @param nameType
      *            the NameType to consider
      * @param rt
      *            the RuleType to consider
      * @param lang
      *            the language to consider
      * @return a list rules for a combination of name type, rule type and a single language.
      */
     public static List<Rule> getInstance(NameType nameType, RuleType rt, String lang) {
         List<Rule> rules = RULES.get(nameType).get(rt).get(lang);
 
         if (rules == null) {
             throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
                                                nameType.getName(), rt.getName(), lang));
         }
 
         return rules;
     }
 
     private static Phoneme parsePhoneme(String ph) {
         int open = ph.indexOf("[");
         if (open >= 0) {
             if (!ph.endsWith("]")) {
                 throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
             }
             String before = ph.substring(0, open);
             String in = ph.substring(open + 1, ph.length() - 1);
             Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
 
             return new Phoneme(before, Languages.LanguageSet.from(langs));
         } else {
             return new Phoneme(ph, Languages.ANY_LANGUAGE);
         }
     }
 
     private static PhonemeExpr parsePhonemeExpr(String ph) {
         if (ph.startsWith("(")) { // we have a bracketed list of options
             if (!ph.endsWith(")")) {
                 throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
             }
 
             List<Phoneme> phs = new ArrayList<Phoneme>();
             String body = ph.substring(1, ph.length() - 1);
             for (String part : body.split("[|]")) {
                 phs.add(parsePhoneme(part));
             }
             if (body.startsWith("|") || body.endsWith("|")) {
                 phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
             }
 
             return new PhonemeList(phs);
         } else {
             return parsePhoneme(ph);
         }
     }
 
     private static List<Rule> parseRules(final Scanner scanner, final String location) {
         List<Rule> lines = new ArrayList<Rule>();
         int currentLine = 0;
 
         boolean inMultilineComment = false;
         while (scanner.hasNextLine()) {
             currentLine++;
             String rawLine = scanner.nextLine();
             String line = rawLine;
 
             if (inMultilineComment) {
                 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
                     inMultilineComment = false;
                 }
             } else {
                 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
                     inMultilineComment = true;
                 } else {
                     // discard comments
                     int cmtI = line.indexOf(ResourceConstants.CMT);
                     if (cmtI >= 0) {
                         line = line.substring(0, cmtI);
                     }
 
                     // trim leading-trailing whitespace
                     line = line.trim();
 
                     if (line.length() == 0) {
                         continue; // empty lines can be safely skipped
                     }
 
                     if (line.startsWith(HASH_INCLUDE)) {
                         // include statement
                         String incl = line.substring(HASH_INCLUDE.length()).trim();
                         if (incl.contains(" ")) {
                             throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
                                                                location);
                         } else {
                             lines.addAll(parseRules(createScanner(incl), location + "->" + incl));
                         }
                     } else {
                         // rule
                         String[] parts = line.split("\\s+");
                         if (parts.length != 4) {
                             throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
                                                                " parts: " + rawLine + " in " + location);
                         } else {
                             try {
                                 String pat = stripQuotes(parts[0]);
                                 String lCon = stripQuotes(parts[1]);
                                 String rCon = stripQuotes(parts[2]);
                                 PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
                                 final int cLine = currentLine;
                                 Rule r = new Rule(pat, lCon, rCon, ph) {
                                     private final int myLine = cLine;
                                     private final String loc = location;
 
                                     @Override
                                     public String toString() {
                                         final StringBuilder sb = new StringBuilder();
                                         sb.append("Rule");
                                         sb.append("{line=").append(myLine);
                                         sb.append(", loc='").append(loc).append('\'');
                                         sb.append('}');
                                         return sb.toString();
                                     }
                                 };
                                 lines.add(r);
                             } catch (IllegalArgumentException e) {
                                 throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
                                                                 location, e);
                             }
                         }
                     }
                 }
             }
         }
 
         return lines;
     }
 
     /**
      * Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
      *
      * @param regex
      *            the regular expression to compile
      * @return an RPattern that will match this regex
      */
     private static RPattern pattern(final String regex) {
         boolean startsWith = regex.startsWith("^");
         boolean endsWith = regex.endsWith("$");
         final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
         boolean boxes = content.contains("[");
 
         if (!boxes) {
             if (startsWith && endsWith) {
                 // exact match
                 if (content.length() == 0) {
                     // empty
                     return new RPattern() {
                         @Override
                         public boolean isMatch(CharSequence input) {
                             return input.length() == 0;
                         }
                     };
                 } else {
                     return new RPattern() {
                         @Override
                         public boolean isMatch(CharSequence input) {
                             return input.equals(content);
                         }
                     };
                 }
             } else if ((startsWith || endsWith) && content.length() == 0) {
                 // matches every string
                 return ALL_STRINGS_RMATCHER;
             } else if (startsWith) {
                 // matches from start
                 return new RPattern() {
                     @Override
                     public boolean isMatch(CharSequence input) {
                         return startsWith(input, content);
                     }
                 };
             } else if (endsWith) {
                 // matches from start
                 return new RPattern() {
                     @Override
                     public boolean isMatch(CharSequence input) {
                         return endsWith(input, content);
                     }
                 };
             }
         } else {
             boolean startsWithBox = content.startsWith("[");
             boolean endsWithBox = content.endsWith("]");
 
             if (startsWithBox && endsWithBox) {
                 String boxContent = content.substring(1, content.length() - 1);
                 if (!boxContent.contains("[")) {
                     // box containing alternatives
                     boolean negate = boxContent.startsWith("^");
                     if (negate) {
                         boxContent = boxContent.substring(1);
                     }
                     final String bContent = boxContent;
                     final boolean shouldMatch = !negate;
 
                     if (startsWith && endsWith) {
                         // exact match
                         return new RPattern() {
                             @Override
                             public boolean isMatch(CharSequence input) {
                                 return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
                             }
                         };
                     } else if (startsWith) {
                         // first char
                         return new RPattern() {
                             @Override
                             public boolean isMatch(CharSequence input) {
                                 return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
                             }
                         };
                     } else if (endsWith) {
                         // last char
                         return new RPattern() {
                             @Override
                             public boolean isMatch(CharSequence input) {
                                 return input.length() > 0 &&
                                        contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
                             }
                         };
                     }
                 }
             }
         }
 
         return new RPattern() {
             Pattern pattern = Pattern.compile(regex);
 
             @Override
             public boolean isMatch(CharSequence input) {
                 Matcher matcher = pattern.matcher(input);
                 return matcher.find();
             }
         };
     }
 
     private static boolean startsWith(CharSequence input, CharSequence prefix) {
         if (prefix.length() > input.length()) {
             return false;
         }
         for (int i = 0; i < prefix.length(); i++) {
             if (input.charAt(i) != prefix.charAt(i)) {
                 return false;
             }
         }
         return true;
     }
 
     private static String stripQuotes(String str) {
         if (str.startsWith(DOUBLE_QUOTE)) {
             str = str.substring(1);
         }
 
         if (str.endsWith(DOUBLE_QUOTE)) {
             str = str.substring(0, str.length() - 1);
         }
 
         return str;
     }
 
     /** Matcher for the left context; built by the constructor from the left-context expression followed by '$'. */
     private final RPattern lContext;

     /** The literal text compared against the input at the match position. */
     private final String pattern;

     /** The phoneme expression returned by getPhoneme(). */
     private final PhonemeExpr phoneme;

     /** Matcher for the right context; built by the constructor from '^' followed by the right-context expression. */
     private final RPattern rContext;
 
     /**
      * Creates a new rule.
      *
      * @param pattern
      *            the pattern
      * @param lContext
      *            the left context
      * @param rContext
      *            the right context
      * @param phoneme
      *            the resulting phoneme
      */
     public Rule(String pattern, String lContext, String rContext, PhonemeExpr phoneme) {
         this.pattern = pattern;
         this.lContext = pattern(lContext + "$");
         this.rContext = pattern("^" + rContext);
         this.phoneme = phoneme;
     }
 
     /**
      * Gets the left context. This is a regular expression that must match to the left of the pattern.
      *
      * @return the left context Pattern
      */
     public RPattern getLContext() {
         return this.lContext;
     }
 
     /**
      * Gets the pattern. This is a string-literal that must exactly match.
      *
      * @return the pattern
      */
     public String getPattern() {
         return this.pattern;
     }
 
     /**
      * Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
      *
      * @return the phoneme
      */
     public PhonemeExpr getPhoneme() {
         return this.phoneme;
     }
 
     /**
      * Gets the right context. This is a regular expression that must match to the right of the pattern.
      *
      * @return the right context Pattern
      */
     public RPattern getRContext() {
         return this.rContext;
     }
 
     /**
      * Decides if the pattern and context match the input starting at a position. It is a match if the
      * <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
      * <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
      *
      * @param input
      *            the input String
      * @param i
      *            the int position within the input
      * @return true if the pattern and left/right context match, false otherwise
      */
     public boolean patternAndContextMatches(CharSequence input, int i) {
         if (i < 0) {
             throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
         }
 
         int patternLength = this.pattern.length();
         int ipl = i + patternLength;
 
         if (ipl > input.length()) {
             // not enough room for the pattern to match
             return false;
         }
 
         // evaluate the pattern, left context and right context
         // fail early if any of the evaluations is not successful
         if (!input.subSequence(i, ipl).equals(this.pattern)) {
             return false;
         } else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
             return false;
         }
         return this.lContext.isMatch(input.subSequence(0, i));
     }
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.commons.codec.language.bm;
19
20		import java.io.InputStream;
21		import java.util.ArrayList;
22		import java.util.Arrays;
23		import java.util.Collections;
24		import java.util.Comparator;
25		import java.util.EnumMap;
26		import java.util.HashMap;
27		import java.util.HashSet;
28		import java.util.List;
29		import java.util.Map;
30		import java.util.Scanner;
31		import java.util.Set;
32		import java.util.regex.Matcher;
33		import java.util.regex.Pattern;
34
35		/**
36		* A phoneme rule.
37		* <p>
38		* Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply
39		* and a logical flag indicating if all languages must be in play. A rule matches if:
40		* <ul>
41		* <li>the pattern matches at the current position</li>
42		* <li>the string up until the beginning of the pattern matches the left context</li>
43		* <li>the string from the end of the pattern matches the right context</li>
44		* <li>logical is ALL and all languages are in scope; or</li>
45		* <li>logical is any other value and at least one language is in scope</li>
46		* </ul>
47		* <p>
48		* Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user
49		* to explicitly construct their own.
50		* <p>
51		* Rules are immutable and thread-safe.
52		* <p>
53		* <b>Rules resources</b>
54		* <p>
55		* Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically
56		* named following the pattern:
57		* <blockquote>org/apache/commons/codec/language/bm/${NameType#getName}_${RuleType#getName}_${language}.txt</blockquote>
58		* <p>
59		* The format of these resources is the following:
60		* <ul>
61		* <li><b>Rules:</b> whitespace separated, double-quoted strings. There should be 4 columns to each row, and these
62		* will be interpreted as:
63		* <ol>
64		* <li>pattern</li>
65		* <li>left context</li>
66		* <li>right context</li>
67		* <li>phoneme</li>
68		* </ol>
69		* </li>
70		* <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be discarded
71		* as a comment.</li>
72		* <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. This will skip
73		* all content until a line ending in '*' and '/' is found.</li>
74		* <li><b>Blank lines:</b> All blank lines will be skipped.</li>
75		* </ul>
76		*
77		* @since 1.6
78		* @version $Id$
79		*/
80	106145	public class Rule {
81
82	4876451	public static final class Phoneme implements PhonemeExpr {
83	92232	public static final Comparator<Phoneme> COMPARATOR = new Comparator<Phoneme>() {
84		@Override
85		public int compare(Phoneme o1, Phoneme o2) {
86	1220580	for (int i = 0; i < o1.phonemeText.length(); i++) {
87	1216151	if (i >= o2.phonemeText.length()) {
88	720	return +1;
89		}
90	1215431	int c = o1.phonemeText.charAt(i) - o2.phonemeText.charAt(i);
91	1215431	if (c != 0) {
92	87082	return c;
93		}
94		}
95
96	4429	if (o1.phonemeText.length() < o2.phonemeText.length()) {
97	1020	return -1;
98		}
99
100	3409	return 0;
101		}
102		};
103
104		private final CharSequence phonemeText;
105		private final Languages.LanguageSet languages;
106
107	542556	public Phoneme(CharSequence phonemeText, Languages.LanguageSet languages) {
108	542556	this.phonemeText = phonemeText;
109	542556	this.languages = languages;
110	542556	}
111
112		public Phoneme append(CharSequence str) {
113	152227	return new Phoneme(this.phonemeText.toString() + str.toString(), this.languages);
114		}
115
116		public Languages.LanguageSet getLanguages() {
117	316698	return this.languages;
118		}
119
120		@Override
121		public Iterable<Phoneme> getPhonemes() {
122	37869	return Collections.singleton(this);
123		}
124
125		public CharSequence getPhonemeText() {
126	220364	return this.phonemeText;
127		}
128
129		public Phoneme join(Phoneme right) {
130	175837	return new Phoneme(this.phonemeText.toString() + right.phonemeText.toString(),
131		this.languages.restrictTo(right.languages));
132		}
133		}
134
135		public interface PhonemeExpr {
136		Iterable<Phoneme> getPhonemes();
137		}
138
139	63608	public static final class PhonemeList implements PhonemeExpr {
140		private final List<Phoneme> phonemes;
141
142	1438	public PhonemeList(List<Phoneme> phonemes) {
143	1438	this.phonemes = phonemes;
144	1438	}
145
146		@Override
147		public List<Phoneme> getPhonemes() {
148	63608	return this.phonemes;
149		}
150		}
151
152		/**
153		* A minimal wrapper around the functionality of Pattern that we use, to allow for alternate implementations.
154		*/
155		public static interface RPattern {
156		boolean isMatch(CharSequence input);
157		}
158
159	1	public static final RPattern ALL_STRINGS_RMATCHER = new RPattern() {
160		@Override
161		public boolean isMatch(CharSequence input) {
162	66010	return true;
163		}
164		};
165
166		public static final String ALL = "ALL";
167
168		private static final String DOUBLE_QUOTE = "\"";
169
170		private static final String HASH_INCLUDE = "#include";
171
172	1	private static final Map<NameType, Map<RuleType, Map<String, List<Rule>>>> RULES =
173		new EnumMap<NameType, Map<RuleType, Map<String, List<Rule>>>>(NameType.class);
174
175		static {
176	4	for (NameType s : NameType.values()) {
177	3	Map<RuleType, Map<String, List<Rule>>> rts = new EnumMap<RuleType, Map<String, List<Rule>>>(RuleType.class);
178
179	12	for (RuleType rt : RuleType.values()) {
180	9	Map<String, List<Rule>> rs = new HashMap<String, List<Rule>>();
181
182	9	Languages ls = Languages.getInstance(s);
183	9	for (String l : ls.getLanguages()) {
184		try {
185	108	rs.put(l, parseRules(createScanner(s, rt, l), createResourceName(s, rt, l)));
186	0	} catch (IllegalStateException e) {
187	0	throw new IllegalStateException("Problem processing " + createResourceName(s, rt, l), e);
188	108	}
189		}
190	9	if (!rt.equals(RuleType.RULES)) {
191	6	rs.put("common", parseRules(createScanner(s, rt, "common"), createResourceName(s, rt, "common")));
192		}
193
194	9	rts.put(rt, Collections.unmodifiableMap(rs));
195		}
196
197	3	RULES.put(s, Collections.unmodifiableMap(rts));
198		}
199	1	}
200
201		private static boolean contains(CharSequence chars, char input) {
202	312230	for (int i = 0; i < chars.length(); i++) {
203	270802	if (chars.charAt(i) == input) {
204	10453	return true;
205		}
206		}
207	41428	return false;
208		}
209
210		private static String createResourceName(NameType nameType, RuleType rt, String lang) {
211	228	return String.format("org/apache/commons/codec/language/bm/%s_%s_%s.txt",
212		nameType.getName(), rt.getName(), lang);
213		}
214
215		private static Scanner createScanner(NameType nameType, RuleType rt, String lang) {
216	114	String resName = createResourceName(nameType, rt, lang);
217	114	InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
218
219	114	if (rulesIS == null) {
220	0	throw new IllegalArgumentException("Unable to load resource: " + resName);
221		}
222
223	114	return new Scanner(rulesIS, ResourceConstants.ENCODING);
224		}
225
226		private static Scanner createScanner(String lang) {
227	35	String resName = String.format("org/apache/commons/codec/language/bm/%s.txt", lang);
228	35	InputStream rulesIS = Languages.class.getClassLoader().getResourceAsStream(resName);
229
230	35	if (rulesIS == null) {
231	0	throw new IllegalArgumentException("Unable to load resource: " + resName);
232		}
233
234	35	return new Scanner(rulesIS, ResourceConstants.ENCODING);
235		}
236
237		private static boolean endsWith(CharSequence input, CharSequence suffix) {
238	894	if (suffix.length() > input.length()) {
239	124	return false;
240		}
241	783	for (int i = input.length() - 1, j = suffix.length() - 1; j >= 0; i--, j--) {
242	770	if (input.charAt(i) != suffix.charAt(j)) {
243	757	return false;
244		}
245		}
246	13	return true;
247		}
248
249		/**
250		* Gets rules for a combination of name type, rule type and languages.
251		*
252		* @param nameType
253		* the NameType to consider
254		* @param rt
255		* the RuleType to consider
256		* @param langs
257		* the set of languages to consider
258		* @return a list of Rules that apply
259		*/
260		public static List<Rule> getInstance(NameType nameType, RuleType rt, Languages.LanguageSet langs) {
261	134336	return langs.isSingleton() ? getInstance(nameType, rt, langs.getAny()) :
262		getInstance(nameType, rt, Languages.ANY);
263		}
264
265		/**
266		* Gets rules for a combination of name type, rule type and a single language.
267		*
268		* @param nameType
269		* the NameType to consider
270		* @param rt
271		* the RuleType to consider
272		* @param lang
273		* the language to consider
274		* @return a list rules for a combination of name type, rule type and a single language.
275		*/
276		public static List<Rule> getInstance(NameType nameType, RuleType rt, String lang) {
277	201505	List<Rule> rules = RULES.get(nameType).get(rt).get(lang);
278
279	201505	if (rules == null) {
280	1	throw new IllegalArgumentException(String.format("No rules found for %s, %s, %s.",
281		nameType.getName(), rt.getName(), lang));
282		}
283
284	201504	return rules;
285		}
286
287		private static Phoneme parsePhoneme(String ph) {
288	6388	int open = ph.indexOf("[");
289	6388	if (open >= 0) {
290	597	if (!ph.endsWith("]")) {
291	0	throw new IllegalArgumentException("Phoneme expression contains a '[' but does not end in ']'");
292		}
293	597	String before = ph.substring(0, open);
294	597	String in = ph.substring(open + 1, ph.length() - 1);
295	597	Set<String> langs = new HashSet<String>(Arrays.asList(in.split("[+]")));
296
297	597	return new Phoneme(before, Languages.LanguageSet.from(langs));
298		} else {
299	5791	return new Phoneme(ph, Languages.ANY_LANGUAGE);
300		}
301		}
302
303		private static PhonemeExpr parsePhonemeExpr(String ph) {
304	4507	if (ph.startsWith("(")) { // we have a bracketed list of options
305	1438	if (!ph.endsWith(")")) {
306	0	throw new IllegalArgumentException("Phoneme starts with '(' so must end with ')'");
307		}
308
309	1438	List<Phoneme> phs = new ArrayList<Phoneme>();
310	1438	String body = ph.substring(1, ph.length() - 1);
311	4757	for (String part : body.split("[\|]")) {
312	3319	phs.add(parsePhoneme(part));
313		}
314	1438	if (body.startsWith("\|") \|\| body.endsWith("\|")) {
315	48	phs.add(new Phoneme("", Languages.ANY_LANGUAGE));
316		}
317
318	1438	return new PhonemeList(phs);
319		} else {
320	3069	return parsePhoneme(ph);
321		}
322		}
323
324		private static List<Rule> parseRules(final Scanner scanner, final String location) {
325	149	List<Rule> lines = new ArrayList<Rule>();
326	149	int currentLine = 0;
327
328	149	boolean inMultilineComment = false;
329	8245	while (scanner.hasNextLine()) {
330	8096	currentLine++;
331	8096	String rawLine = scanner.nextLine();
332	8096	String line = rawLine;
333
334	8096	if (inMultilineComment) {
335	2235	if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
336	149	inMultilineComment = false;
337		}
338		} else {
339	5861	if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
340	149	inMultilineComment = true;
341		} else {
342		// discard comments
343	5712	int cmtI = line.indexOf(ResourceConstants.CMT);
344	5712	if (cmtI >= 0) {
345	857	line = line.substring(0, cmtI);
346		}
347
348		// trim leading-trailing whitespace
349	5712	line = line.trim();
350
351	5712	if (line.length() == 0) {
352	1170	continue; // empty lines can be safely skipped
353		}
354
355	4542	if (line.startsWith(HASH_INCLUDE)) {
356		// include statement
357	35	String incl = line.substring(HASH_INCLUDE.length()).trim();
358	35	if (incl.contains(" ")) {
359	0	throw new IllegalArgumentException("Malformed import statement '" + rawLine + "' in " +
360		location);
361		} else {
362	35	lines.addAll(parseRules(createScanner(incl), location + "->" + incl));
363		}
364	35	} else {
365		// rule
366	4507	String[] parts = line.split("\\s+");
367	4507	if (parts.length != 4) {
368	0	throw new IllegalArgumentException("Malformed rule statement split into " + parts.length +
369		" parts: " + rawLine + " in " + location);
370		} else {
371		try {
372	4507	String pat = stripQuotes(parts[0]);
373	4507	String lCon = stripQuotes(parts[1]);
374	4507	String rCon = stripQuotes(parts[2]);
375	4507	PhonemeExpr ph = parsePhonemeExpr(stripQuotes(parts[3]));
376	4507	final int cLine = currentLine;
377	4507	Rule r = new Rule(pat, lCon, rCon, ph) {
378	4507	private final int myLine = cLine;
379	4507	private final String loc = location;
380
381		@Override
382		public String toString() {
383	0	final StringBuilder sb = new StringBuilder();
384	0	sb.append("Rule");
385	0	sb.append("{line=").append(myLine);
386	0	sb.append(", loc='").append(loc).append('\'');
387	0	sb.append('}');
388	0	return sb.toString();
389		}
390		};
391	4507	lines.add(r);
392	0	} catch (IllegalArgumentException e) {
393	0	throw new IllegalStateException("Problem parsing line '" + currentLine + "' in " +
394		location, e);
395	4507	}
396		}
397		}
398		}
399		}
400	6926	}
401
402	149	return lines;
403		}
404
405		/**
406		* Attempts to compile the regex into direct string ops, falling back to Pattern and Matcher in the worst case.
407		*
408		* @param regex
409		* the regular expression to compile
410		* @return an RPattern that will match this regex
411		*/
412		private static RPattern pattern(final String regex) {
413	9016	boolean startsWith = regex.startsWith("^");
414	9016	boolean endsWith = regex.endsWith("$");
415	9016	final String content = regex.substring(startsWith ? 1 : 0, endsWith ? regex.length() - 1 : regex.length());
416	9016	boolean boxes = content.contains("[");
417
418	9016	if (!boxes) {
419	8046	if (startsWith && endsWith) {
420		// exact match
421	633	if (content.length() == 0) {
422		// empty
423	610	return new RPattern() {
424		@Override
425		public boolean isMatch(CharSequence input) {
426	34729	return input.length() == 0;
427		}
428		};
429		} else {
430	23	return new RPattern() {
431		@Override
432		public boolean isMatch(CharSequence input) {
433	6720	return input.equals(content);
434		}
435		};
436		}
437	7413	} else if ((startsWith \|\| endsWith) && content.length() == 0) {
438		// matches every string
439	7191	return ALL_STRINGS_RMATCHER;
440	222	} else if (startsWith) {
441		// matches from start
442	188	return new RPattern() {
443		@Override
444		public boolean isMatch(CharSequence input) {
445	53370	return startsWith(input, content);
446		}
447		};
448	34	} else if (endsWith) {
449		// matches from start
450	34	return new RPattern() {
451		@Override
452		public boolean isMatch(CharSequence input) {
453	894	return endsWith(input, content);
454		}
455		};
456		}
457		} else {
458	970	boolean startsWithBox = content.startsWith("[");
459	970	boolean endsWithBox = content.endsWith("]");
460
461	970	if (startsWithBox && endsWithBox) {
462	946	String boxContent = content.substring(1, content.length() - 1);
463	946	if (!boxContent.contains("[")) {
464		// box containing alternatives
465	933	boolean negate = boxContent.startsWith("^");
466	933	if (negate) {
467	28	boxContent = boxContent.substring(1);
468		}
469	933	final String bContent = boxContent;
470	933	final boolean shouldMatch = !negate;
471
472	933	if (startsWith && endsWith) {
473		// exact match
474	55	return new RPattern() {
475		@Override
476		public boolean isMatch(CharSequence input) {
477	15896	return input.length() == 1 && contains(bContent, input.charAt(0)) == shouldMatch;
478		}
479		};
480	878	} else if (startsWith) {
481		// first char
482	650	return new RPattern() {
483		@Override
484		public boolean isMatch(CharSequence input) {
485	43307	return input.length() > 0 && contains(bContent, input.charAt(0)) == shouldMatch;
486		}
487		};
488	228	} else if (endsWith) {
489		// last char
490	228	return new RPattern() {
491		@Override
492		public boolean isMatch(CharSequence input) {
493	13986	return input.length() > 0 &&
494		contains(bContent, input.charAt(input.length() - 1)) == shouldMatch;
495		}
496		};
497		}
498		}
499		}
500		}
501
502	37	return new RPattern() {
503	37	Pattern pattern = Pattern.compile(regex);
504
505		@Override
506		public boolean isMatch(CharSequence input) {
507	16196	Matcher matcher = pattern.matcher(input);
508	16196	return matcher.find();
509		}
510		};
511		}
512
513		private static boolean startsWith(CharSequence input, CharSequence prefix) {
514	53370	if (prefix.length() > input.length()) {
515	4830	return false;
516		}
517	50012	for (int i = 0; i < prefix.length(); i++) {
518	49856	if (input.charAt(i) != prefix.charAt(i)) {
519	48384	return false;
520		}
521		}
522	156	return true;
523		}
524
525		private static String stripQuotes(String str) {
526	18028	if (str.startsWith(DOUBLE_QUOTE)) {
527	18028	str = str.substring(1);
528		}
529
530	18028	if (str.endsWith(DOUBLE_QUOTE)) {
531	18019	str = str.substring(0, str.length() - 1);
532		}
533
534	18028	return str;
535		}
536
/** Matcher for the text to the left of the pattern; the constructor builds it from the left context plus a trailing {@code $}. */
537		private final RPattern lContext;
538
/** Literal text that must appear in the input at the position being tested. */
539		private final String pattern;
540
/** The phoneme expression handed to the constructor and returned by {@code getPhoneme()}. */
541		private final PhonemeExpr phoneme;
542
/** Matcher for the text to the right of the pattern; the constructor builds it from {@code ^} plus the right context. */
543		private final RPattern rContext;
544
545		/**
546		* Creates a new rule.
547		*
548		* @param pattern
549		* the pattern
550		* @param lContext
551		* the left context
552		* @param rContext
553		* the right context
554		* @param phoneme
555		* the resulting phoneme
556		*/
557	4508	public Rule(String pattern, String lContext, String rContext, PhonemeExpr phoneme) {
558	4508	this.pattern = pattern;
559	4508	this.lContext = pattern(lContext + "$");
560	4508	this.rContext = pattern("^" + rContext);
561	4508	this.phoneme = phoneme;
562	4508	}
563
564		/**
565		* Gets the left context. This is a regular expression that must match to the left of the pattern.
566		*
567		* @return the left context Pattern
568		*/
569		public RPattern getLContext() {
570	0	return this.lContext;
571		}
572
573		/**
574		* Gets the pattern. This is a string-literal that must exactly match.
575		*
576		* @return the pattern
577		*/
578		public String getPattern() {
579	32427496	return this.pattern;
580		}
581
582		/**
583		* Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
584		*
585		* @return the phoneme
586		*/
587		public PhonemeExpr getPhoneme() {
588	31736	return this.phoneme;
589		}
590
591		/**
592		* Gets the right context. This is a regular expression that must match to the right of the pattern.
593		*
594		* @return the right context Pattern
595		*/
596		public RPattern getRContext() {
597	0	return this.rContext;
598		}
599
600		/**
601		* Decides if the pattern and context match the input starting at a position. It is a match if the
602		* <code>lContext</code> matches <code>input</code> up to <code>i</code>, <code>pattern</code> matches at i and
603		* <code>rContext</code> matches from the end of the match of <code>pattern</code> to the end of <code>input</code>.
604		*
605		* @param input
606		* the input String
607		* @param i
608		* the int position within the input
609		* @return true if the pattern and left/right context match, false otherwise
610		*/
611		public boolean patternAndContextMatches(CharSequence input, int i) {
612	32427497	if (i < 0) {
613	1	throw new IndexOutOfBoundsException("Can not match pattern at negative indexes");
614		}
615
616	32427496	int patternLength = this.pattern.length();
617	32427496	int ipl = i + patternLength;
618
619	32427496	if (ipl > input.length()) {
620		// not enough room for the pattern to match
621	13169700	return false;
622		}
623
624		// evaluate the pattern, left context and right context
625		// fail early if any of the evaluations is not successful
626	19257796	if (!input.subSequence(i, ipl).equals(this.pattern)) {
627	19056307	return false;
628	201489	} else if (!this.rContext.isMatch(input.subSequence(ipl, input.length()))) {
629	151870	return false;
630		}
631	49619	return this.lContext.isMatch(input.subSequence(0, i));
632		}
633		}