Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
Lang |
|
| 3.142857142857143;3.143 | ||||
Lang$1 |
|
| 3.142857142857143;3.143 | ||||
Lang$LangRule |
|
| 3.142857142857143;3.143 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.commons.codec.language.bm; | |
19 | ||
20 | import java.io.InputStream; | |
21 | import java.util.ArrayList; | |
22 | import java.util.Arrays; | |
23 | import java.util.Collections; | |
24 | import java.util.EnumMap; | |
25 | import java.util.HashSet; | |
26 | import java.util.List; | |
27 | import java.util.Locale; | |
28 | import java.util.Map; | |
29 | import java.util.Scanner; | |
30 | import java.util.Set; | |
31 | import java.util.regex.Pattern; | |
32 | ||
33 | /** | |
34 | * Language guessing utility. | |
35 | * <p> | |
36 | * This class encapsulates rules used to guess the possible languages that a word originates from. This is | |
37 | * done by reference to a whole series of rules distributed in resource files. | |
38 | * <p> | |
39 | * Instances of this class are typically managed through the static factory method instance(). | |
40 | * Unless you are developing your own language guessing rules, you will not need to interact with this class directly. | |
41 | * <p> | |
42 | * This class is intended to be immutable and thread-safe. | |
43 | * <p> | |
44 | * <b>Lang resources</b> | |
45 | * <p> | |
46 | * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files. | |
47 | * They are systematically named following the pattern: | |
48 | * <blockquote>org/apache/commons/codec/language/bm/lang.txt</blockquote> | |
49 | * The format of these resources is the following: | |
50 | * <ul> | |
51 | * <li><b>Rules:</b> whitespace separated strings. | |
52 | * There should be 3 columns to each row, and these will be interpreted as: | |
53 | * <ol> | |
54 | * <li>pattern: a regular expression.</li> | |
55 | * <li>languages: a '+'-separated list of languages.</li> | |
56 | * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li> | |
57 | * </ol> | |
58 | * </li> | |
59 | * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be | |
60 | * discarded as a comment.</li> | |
61 | * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode. | |
62 | * This will skip all content until a line ending in '*' and '/' is found.</li> | |
63 | * <li><b>Blank lines:</b> All blank lines will be skipped.</li> | |
64 | * </ul> | |
65 | * <p> | |
66 | * Port of lang.php | |
67 | * | |
68 | * @since 1.6 | |
69 | * @version $Id$ | |
70 | */ | |
71 | public class Lang { | |
72 | // Implementation note: This class is divided into two sections. The first part is a static factory interface that | |
73 | // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that | |
74 | // encapsulate a particular language-guessing rule table and the language guessing itself. | |
75 | // | |
76 | // It may make sense in the future to expose the private constructor to allow power users to build custom language- | |
77 | // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users | |
78 | // should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances. | |
79 | ||
80 | 7766 | private static final class LangRule { |
81 | private final boolean acceptOnMatch; | |
82 | private final Set<String> languages; | |
83 | private final Pattern pattern; | |
84 | ||
85 | 756 | private LangRule(Pattern pattern, Set<String> languages, boolean acceptOnMatch) { |
86 | 756 | this.pattern = pattern; |
87 | 756 | this.languages = languages; |
88 | 756 | this.acceptOnMatch = acceptOnMatch; |
89 | 756 | } |
90 | ||
91 | public boolean matches(String txt) { | |
92 | 16924824 | return this.pattern.matcher(txt).find(); |
93 | } | |
94 | } | |
95 | ||
96 | 1 | private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class); |
97 | ||
98 | private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/lang.txt"; | |
99 | ||
100 | static { | |
101 | 4 | for (NameType s : NameType.values()) { |
102 | 3 | Langs.put(s, loadFromResource(LANGUAGE_RULES_RN, Languages.getInstance(s))); |
103 | } | |
104 | 1 | } |
105 | ||
106 | /** | |
107 | * Gets a Lang instance for one of the supported NameTypes. | |
108 | * | |
109 | * @param nameType | |
110 | * the NameType to look up | |
111 | * @return a Lang encapsulating the language guessing rules for that name type | |
112 | */ | |
113 | public static Lang instance(NameType nameType) { | |
114 | 145 | return Langs.get(nameType); |
115 | } | |
116 | ||
117 | /** | |
118 | * Loads language rules from a resource. | |
119 | * <p> | |
120 | * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method. | |
121 | * You will only need to call this yourself if you are developing custom language mapping rules. | |
122 | * | |
123 | * @param languageRulesResourceName | |
124 | * the fully-qualified resource name to load | |
125 | * @param languages | |
126 | * the languages that these rules will support | |
127 | * @return a Lang encapsulating the loaded language-guessing rules. | |
128 | */ | |
129 | public static Lang loadFromResource(String languageRulesResourceName, Languages languages) { | |
130 | 4 | List<LangRule> rules = new ArrayList<LangRule>(); |
131 | 4 | InputStream lRulesIS = Lang.class.getClassLoader().getResourceAsStream(languageRulesResourceName); |
132 | ||
133 | 4 | if (lRulesIS == null) { |
134 | 1 | throw new IllegalStateException("Unable to resolve required resource:" + LANGUAGE_RULES_RN); |
135 | } | |
136 | ||
137 | 3 | Scanner scanner = new Scanner(lRulesIS, ResourceConstants.ENCODING); |
138 | 3 | boolean inExtendedComment = false; |
139 | 882 | while (scanner.hasNextLine()) { |
140 | 879 | String rawLine = scanner.nextLine(); |
141 | 879 | String line = rawLine; |
142 | ||
143 | 879 | if (inExtendedComment) { |
144 | // check for closing comment marker, otherwise discard doc comment line | |
145 | 45 | if (line.endsWith(ResourceConstants.EXT_CMT_END)) { |
146 | 3 | inExtendedComment = false; |
147 | } | |
148 | } else { | |
149 | 834 | if (line.startsWith(ResourceConstants.EXT_CMT_START)) { |
150 | 3 | inExtendedComment = true; |
151 | } else { | |
152 | // discard comments | |
153 | 831 | int cmtI = line.indexOf(ResourceConstants.CMT); |
154 | 831 | if (cmtI >= 0) { |
155 | 144 | line = line.substring(0, cmtI); |
156 | } | |
157 | ||
158 | // trim leading-trailing whitespace | |
159 | 831 | line = line.trim(); |
160 | ||
161 | 831 | if (line.length() == 0) { |
162 | 75 | continue; // empty lines can be safely skipped |
163 | } | |
164 | ||
165 | // split it up | |
166 | 756 | String[] parts = line.split("\\s+"); |
167 | ||
168 | 756 | if (parts.length != 3) { |
169 | 0 | throw new IllegalArgumentException("Malformed line '" + rawLine + "' in language resource '" + |
170 | languageRulesResourceName + "'"); | |
171 | } | |
172 | ||
173 | 756 | Pattern pattern = Pattern.compile(parts[0]); |
174 | 756 | String[] langs = parts[1].split("\\+"); |
175 | 756 | boolean accept = parts[2].equals("true"); |
176 | ||
177 | 756 | rules.add(new LangRule(pattern, new HashSet<String>(Arrays.asList(langs)), accept)); |
178 | } | |
179 | } | |
180 | 804 | } |
181 | ||
182 | 3 | return new Lang(rules, languages); |
183 | } | |
184 | ||
185 | private final Languages languages; | |
186 | private final List<LangRule> rules; | |
187 | ||
188 | 3 | private Lang(List<LangRule> rules, Languages languages) { |
189 | 3 | this.rules = Collections.unmodifiableList(rules); |
190 | 3 | this.languages = languages; |
191 | 3 | } |
192 | ||
193 | /** | |
194 | * Guesses the language of a word. | |
195 | * | |
196 | * @param text | |
197 | * the word | |
198 | * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match | |
199 | */ | |
200 | public String guessLanguage(String text) { | |
201 | 0 | Languages.LanguageSet ls = guessLanguages(text); |
202 | 0 | return ls.isSingleton() ? ls.getAny() : Languages.ANY; |
203 | } | |
204 | ||
205 | /** | |
206 | * Guesses the languages of a word. | |
207 | * | |
208 | * @param input | |
209 | * the word | |
210 | * @return a Set of Strings of language names that are potential matches for the input word | |
211 | */ | |
212 | public Languages.LanguageSet guessLanguages(String input) { | |
213 | 67162 | String text = input.toLowerCase(Locale.ENGLISH); |
214 | ||
215 | 67162 | Set<String> langs = new HashSet<String>(this.languages.getLanguages()); |
216 | 67162 | for (LangRule rule : this.rules) { |
217 | 16924824 | if (rule.matches(text)) { |
218 | 3505 | if (rule.acceptOnMatch) { |
219 | 407 | langs.retainAll(rule.languages); |
220 | } else { | |
221 | 3098 | langs.removeAll(rule.languages); |
222 | } | |
223 | } | |
224 | } | |
225 | ||
226 | 67162 | Languages.LanguageSet ls = Languages.LanguageSet.from(langs); |
227 | 67162 | return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls; |
228 | } | |
229 | } |