Coverage Report

Coverage Report - org.apache.commons.codec.language.bm.Lang

Classes in this File

Line Coverage

Branch Coverage

Complexity

Lang

93%

46/49

88%

23/26

3.143

Lang$1

N/A

3.143

Lang$LangRule

100%

7/7

N/A

3.143

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.commons.codec.language.bm;
 
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.EnumMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Scanner;
 import java.util.Set;
 import java.util.regex.Pattern;
 
 /**
  * Language guessing utility.
  * <p>
  * This class encapsulates rules used to guess the possible languages that a word originates from. This is
  * done by reference to a whole series of rules distributed in resource files.
  * <p>
  * Instances of this class are typically managed through the static factory method instance().
  * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
  * <p>
  * This class is intended to be immutable and thread-safe.
  * <p>
  * <b>Lang resources</b>
  * <p>
  * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
  * They are systematically named following the pattern:
  * <blockquote>org/apache/commons/codec/language/bm/lang.txt</blockquote>
  * The format of these resources is the following:
  * <ul>
  * <li><b>Rules:</b> whitespace separated strings.
  * There should be 3 columns to each row, and these will be interpreted as:
  * <ol>
  * <li>pattern: a regular expression.</li>
  * <li>languages: a '+'-separated list of languages.</li>
  * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
  * </ol>
  * </li>
  * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be
  * discarded as a comment.</li>
  * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode.
  * This will skip all content until a line ending in '*' and '/' is found.</li>
  * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
  * </ul>
  * <p>
  * Port of lang.php
  *
  * @since 1.6
  * @version $Id$
  */
 public class Lang {
     // Implementation note: This class is divided into two sections. The first part is a static factory interface that
     // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
     // encapsulate a particular language-guessing rule table and the language guessing itself.
     //
     // It may make sense in the future to expose the private constructor to allow power users to build custom language-
     // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
     // should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances.
 
     private static final class LangRule {
         private final boolean acceptOnMatch;
         private final Set<String> languages;
         private final Pattern pattern;
 
         private LangRule(Pattern pattern, Set<String> languages, boolean acceptOnMatch) {
             this.pattern = pattern;
             this.languages = languages;
             this.acceptOnMatch = acceptOnMatch;
         }
 
         public boolean matches(String txt) {
             return this.pattern.matcher(txt).find();
         }
     }
 
     private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class);
 
     private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/lang.txt";
 
     static {
         for (NameType s : NameType.values()) {
             Langs.put(s, loadFromResource(LANGUAGE_RULES_RN, Languages.getInstance(s)));
         }
     }
 
     /**
      * Gets a Lang instance for one of the supported NameTypes.
      *
      * @param nameType
      *            the NameType to look up
      * @return a Lang encapsulating the language guessing rules for that name type
      */
     public static Lang instance(NameType nameType) {
         return Langs.get(nameType);
     }
 
     /**
      * Loads language rules from a resource.
      * <p>
      * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
      * You will only need to call this yourself if you are developing custom language mapping rules.
      *
      * @param languageRulesResourceName
      *            the fully-qualified resource name to load
      * @param languages
      *            the languages that these rules will support
      * @return a Lang encapsulating the loaded language-guessing rules.
      */
     public static Lang loadFromResource(String languageRulesResourceName, Languages languages) {
         List<LangRule> rules = new ArrayList<LangRule>();
         InputStream lRulesIS = Lang.class.getClassLoader().getResourceAsStream(languageRulesResourceName);
 
         if (lRulesIS == null) {
             throw new IllegalStateException("Unable to resolve required resource:" + LANGUAGE_RULES_RN);
         }
 
         Scanner scanner = new Scanner(lRulesIS, ResourceConstants.ENCODING);
         boolean inExtendedComment = false;
         while (scanner.hasNextLine()) {
             String rawLine = scanner.nextLine();
             String line = rawLine;
 
             if (inExtendedComment) {
                 // check for closing comment marker, otherwise discard doc comment line
                 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
                     inExtendedComment = false;
                 }
             } else {
                 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
                     inExtendedComment = true;
                 } else {
                     // discard comments
                     int cmtI = line.indexOf(ResourceConstants.CMT);
                     if (cmtI >= 0) {
                         line = line.substring(0, cmtI);
                     }
 
                     // trim leading-trailing whitespace
                     line = line.trim();
 
                     if (line.length() == 0) {
                         continue; // empty lines can be safely skipped
                     }
 
                     // split it up
                     String[] parts = line.split("\\s+");
 
                     if (parts.length != 3) {
                         throw new IllegalArgumentException("Malformed line '" + rawLine + "' in language resource '" +
                                                            languageRulesResourceName + "'");
                     }
 
                     Pattern pattern = Pattern.compile(parts[0]);
                     String[] langs = parts[1].split("\\+");
                     boolean accept = parts[2].equals("true");
 
                     rules.add(new LangRule(pattern, new HashSet<String>(Arrays.asList(langs)), accept));
                 }
             }
         }
 
         return new Lang(rules, languages);
     }
 
     private final Languages languages;
     private final List<LangRule> rules;
 
     private Lang(List<LangRule> rules, Languages languages) {
         this.rules = Collections.unmodifiableList(rules);
         this.languages = languages;
     }
 
     /**
      * Guesses the language of a word.
      *
      * @param text
      *            the word
      * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
      */
     public String guessLanguage(String text) {
         Languages.LanguageSet ls = guessLanguages(text);
         return ls.isSingleton() ? ls.getAny() : Languages.ANY;
     }
 
     /**
      * Guesses the languages of a word.
      *
      * @param input
      *            the word
      * @return a Set of Strings of language names that are potential matches for the input word
      */
     public Languages.LanguageSet guessLanguages(String input) {
         String text = input.toLowerCase(Locale.ENGLISH);
 
         Set<String> langs = new HashSet<String>(this.languages.getLanguages());
         for (LangRule rule : this.rules) {
             if (rule.matches(text)) {
                 if (rule.acceptOnMatch) {
                     langs.retainAll(rule.languages);
                 } else {
                     langs.removeAll(rule.languages);
                 }
             }
         }
 
         Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
         return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
     }
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.commons.codec.language.bm;
19
20		import java.io.InputStream;
21		import java.util.ArrayList;
22		import java.util.Arrays;
23		import java.util.Collections;
24		import java.util.EnumMap;
25		import java.util.HashSet;
26		import java.util.List;
27		import java.util.Locale;
28		import java.util.Map;
29		import java.util.Scanner;
30		import java.util.Set;
31		import java.util.regex.Pattern;
32
33		/**
34		* Language guessing utility.
35		* <p>
36		* This class encapsulates rules used to guess the possible languages that a word originates from. This is
37		* done by reference to a whole series of rules distributed in resource files.
38		* <p>
39		* Instances of this class are typically managed through the static factory method instance().
40		* Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
41		* <p>
42		* This class is intended to be immutable and thread-safe.
43		* <p>
44		* <b>Lang resources</b>
45		* <p>
46		* Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
47		* They are systematically named following the pattern:
48		* <blockquote>org/apache/commons/codec/language/bm/lang.txt</blockquote>
49		* The format of these resources is the following:
50		* <ul>
51		* <li><b>Rules:</b> whitespace separated strings.
52		* There should be 3 columns to each row, and these will be interpreted as:
53		* <ol>
54		* <li>pattern: a regular expression.</li>
55		* <li>languages: a '+'-separated list of languages.</li>
56		* <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
57		* </ol>
58		* </li>
59		* <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be
60		* discarded as a comment.</li>
61		* <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode.
62		* This will skip all content until a line ending in '*' and '/' is found.</li>
63		* <li><b>Blank lines:</b> All blank lines will be skipped.</li>
64		* </ul>
65		* <p>
66		* Port of lang.php
67		*
68		* @since 1.6
69		* @version $Id$
70		*/
71		public class Lang {
72		// Implementation note: This class is divided into two sections. The first part is a static factory interface that
73		// exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
74		// encapsulate a particular language-guessing rule table and the language guessing itself.
75		//
76		// It may make sense in the future to expose the private constructor to allow power users to build custom language-
77		// guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
78		// should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances.
79
80	7766	private static final class LangRule {
81		private final boolean acceptOnMatch;
82		private final Set<String> languages;
83		private final Pattern pattern;
84
85	756	private LangRule(Pattern pattern, Set<String> languages, boolean acceptOnMatch) {
86	756	this.pattern = pattern;
87	756	this.languages = languages;
88	756	this.acceptOnMatch = acceptOnMatch;
89	756	}
90
91		public boolean matches(String txt) {
92	16924824	return this.pattern.matcher(txt).find();
93		}
94		}
95
96	1	private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class);
97
98		private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/lang.txt";
99
100		static {
101	4	for (NameType s : NameType.values()) {
102	3	Langs.put(s, loadFromResource(LANGUAGE_RULES_RN, Languages.getInstance(s)));
103		}
104	1	}
105
106		/**
107		* Gets a Lang instance for one of the supported NameTypes.
108		*
109		* @param nameType
110		* the NameType to look up
111		* @return a Lang encapsulating the language guessing rules for that name type
112		*/
113		public static Lang instance(NameType nameType) {
114	145	return Langs.get(nameType);
115		}
116
117		/**
118		* Loads language rules from a resource.
119		* <p>
120		* In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
121		* You will only need to call this yourself if you are developing custom language mapping rules.
122		*
123		* @param languageRulesResourceName
124		* the fully-qualified resource name to load
125		* @param languages
126		* the languages that these rules will support
127		* @return a Lang encapsulating the loaded language-guessing rules.
128		*/
129		public static Lang loadFromResource(String languageRulesResourceName, Languages languages) {
130	4	List<LangRule> rules = new ArrayList<LangRule>();
131	4	InputStream lRulesIS = Lang.class.getClassLoader().getResourceAsStream(languageRulesResourceName);
132
133	4	if (lRulesIS == null) {
134	1	throw new IllegalStateException("Unable to resolve required resource:" + LANGUAGE_RULES_RN);
135		}
136
137	3	Scanner scanner = new Scanner(lRulesIS, ResourceConstants.ENCODING);
138	3	boolean inExtendedComment = false;
139	882	while (scanner.hasNextLine()) {
140	879	String rawLine = scanner.nextLine();
141	879	String line = rawLine;
142
143	879	if (inExtendedComment) {
144		// check for closing comment marker, otherwise discard doc comment line
145	45	if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
146	3	inExtendedComment = false;
147		}
148		} else {
149	834	if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
150	3	inExtendedComment = true;
151		} else {
152		// discard comments
153	831	int cmtI = line.indexOf(ResourceConstants.CMT);
154	831	if (cmtI >= 0) {
155	144	line = line.substring(0, cmtI);
156		}
157
158		// trim leading-trailing whitespace
159	831	line = line.trim();
160
161	831	if (line.length() == 0) {
162	75	continue; // empty lines can be safely skipped
163		}
164
165		// split it up
166	756	String[] parts = line.split("\\s+");
167
168	756	if (parts.length != 3) {
169	0	throw new IllegalArgumentException("Malformed line '" + rawLine + "' in language resource '" +
170		languageRulesResourceName + "'");
171		}
172
173	756	Pattern pattern = Pattern.compile(parts[0]);
174	756	String[] langs = parts[1].split("\\+");
175	756	boolean accept = parts[2].equals("true");
176
177	756	rules.add(new LangRule(pattern, new HashSet<String>(Arrays.asList(langs)), accept));
178		}
179		}
180	804	}
181
182	3	return new Lang(rules, languages);
183		}
184
185		private final Languages languages;
186		private final List<LangRule> rules;
187
188	3	private Lang(List<LangRule> rules, Languages languages) {
189	3	this.rules = Collections.unmodifiableList(rules);
190	3	this.languages = languages;
191	3	}
192
193		/**
194		* Guesses the language of a word.
195		*
196		* @param text
197		* the word
198		* @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
199		*/
200		public String guessLanguage(String text) {
201	0	Languages.LanguageSet ls = guessLanguages(text);
202	0	return ls.isSingleton() ? ls.getAny() : Languages.ANY;
203		}
204
205		/**
206		* Guesses the languages of a word.
207		*
208		* @param input
209		* the word
210		* @return a Set of Strings of language names that are potential matches for the input word
211		*/
212		public Languages.LanguageSet guessLanguages(String input) {
213	67162	String text = input.toLowerCase(Locale.ENGLISH);
214
215	67162	Set<String> langs = new HashSet<String>(this.languages.getLanguages());
216	67162	for (LangRule rule : this.rules) {
217	16924824	if (rule.matches(text)) {
218	3505	if (rule.acceptOnMatch) {
219	407	langs.retainAll(rule.languages);
220		} else {
221	3098	langs.removeAll(rule.languages);
222		}
223		}
224		}
225
226	67162	Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
227	67162	return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
228		}
229		}