Coverage Report - org.apache.commons.codec.language.bm.Lang
 
Classes in this File Line Coverage Branch Coverage Complexity
Lang
93%
46/49
88%
23/26
3.143
Lang$1
N/A
N/A
3.143
Lang$LangRule
100%
7/7
N/A
3.143
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *      http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.commons.codec.language.bm;
 19  
 
 20  
 import java.io.InputStream;
 21  
 import java.util.ArrayList;
 22  
 import java.util.Arrays;
 23  
 import java.util.Collections;
 24  
 import java.util.EnumMap;
 25  
 import java.util.HashSet;
 26  
 import java.util.List;
 27  
 import java.util.Locale;
 28  
 import java.util.Map;
 29  
 import java.util.Scanner;
 30  
 import java.util.Set;
 31  
 import java.util.regex.Pattern;
 32  
 
 33  
 /**
 34  
  * Language guessing utility.
 35  
  * <p>
 36  
  * This class encapsulates rules used to guess the possible languages that a word originates from. This is
 37  
  * done by reference to a whole series of rules distributed in resource files.
 38  
  * <p>
 39  
  * Instances of this class are typically managed through the static factory method instance().
 40  
  * Unless you are developing your own language guessing rules, you will not need to interact with this class directly.
 41  
  * <p>
 42  
  * This class is intended to be immutable and thread-safe.
 43  
  * <p>
 44  
  * <b>Lang resources</b>
 45  
  * <p>
 46  
  * Language guessing rules are typically loaded from resource files. These are UTF-8 encoded text files.
 47  
  * They are systematically named following the pattern:
 48  
  * <blockquote>org/apache/commons/codec/language/bm/lang.txt</blockquote>
 49  
  * The format of these resources is the following:
 50  
  * <ul>
 51  
  * <li><b>Rules:</b> whitespace separated strings.
 52  
  * There should be 3 columns to each row, and these will be interpreted as:
 53  
  * <ol>
 54  
  * <li>pattern: a regular expression.</li>
 55  
  * <li>languages: a '+'-separated list of languages.</li>
 56  
  * <li>acceptOnMatch: 'true' or 'false' indicating if a match rules in or rules out the language.</li>
 57  
  * </ol>
 58  
  * </li>
 59  
  * <li><b>End-of-line comments:</b> Any occurrence of '//' will cause all text following on that line to be
 60  
  * discarded as a comment.</li>
 61  
  * <li><b>Multi-line comments:</b> Any line starting with '/*' will start multi-line commenting mode.
 62  
  * This will skip all content until a line ending in '*' and '/' is found.</li>
 63  
  * <li><b>Blank lines:</b> All blank lines will be skipped.</li>
 64  
  * </ul>
 65  
  * <p>
 66  
  * Port of lang.php
 67  
  *
 68  
  * @since 1.6
 69  
  * @version $Id$
 70  
  */
 71  
 public class Lang {
 72  
     // Implementation note: This class is divided into two sections. The first part is a static factory interface that
 73  
     // exposes the LANGUAGE_RULES_RN resource as a Lang instance. The second part is the Lang instance methods that
 74  
     // encapsulate a particular language-guessing rule table and the language guessing itself.
 75  
     //
 76  
     // It may make sense in the future to expose the private constructor to allow power users to build custom language-
 77  
     // guessing rules, perhaps by marking it protected and allowing sub-classing. However, the vast majority of users
 78  
     // should be strongly encouraged to use the static factory <code>instance</code> method to get their Lang instances.
 79  
 
 80  7766
     private static final class LangRule {
 81  
         private final boolean acceptOnMatch;
 82  
         private final Set<String> languages;
 83  
         private final Pattern pattern;
 84  
 
 85  756
         private LangRule(final Pattern pattern, final Set<String> languages, final boolean acceptOnMatch) {
 86  756
             this.pattern = pattern;
 87  756
             this.languages = languages;
 88  756
             this.acceptOnMatch = acceptOnMatch;
 89  756
         }
 90  
 
 91  
         public boolean matches(final String txt) {
 92  16924824
             return this.pattern.matcher(txt).find();
 93  
         }
 94  
     }
 95  
 
 96  1
     private static final Map<NameType, Lang> Langs = new EnumMap<NameType, Lang>(NameType.class);
 97  
 
 98  
     private static final String LANGUAGE_RULES_RN = "org/apache/commons/codec/language/bm/lang.txt";
 99  
 
 100  
     static {
 101  4
         for (final NameType s : NameType.values()) {
 102  3
             Langs.put(s, loadFromResource(LANGUAGE_RULES_RN, Languages.getInstance(s)));
 103  
         }
 104  1
     }
 105  
 
 106  
     /**
 107  
      * Gets a Lang instance for one of the supported NameTypes.
 108  
      *
 109  
      * @param nameType
 110  
      *            the NameType to look up
 111  
      * @return a Lang encapsulating the language guessing rules for that name type
 112  
      */
 113  
     public static Lang instance(final NameType nameType) {
 114  145
         return Langs.get(nameType);
 115  
     }
 116  
 
 117  
     /**
 118  
      * Loads language rules from a resource.
 119  
      * <p>
 120  
      * In normal use, you will obtain instances of Lang through the {@link #instance(NameType)} method.
 121  
      * You will only need to call this yourself if you are developing custom language mapping rules.
 122  
      *
 123  
      * @param languageRulesResourceName
 124  
      *            the fully-qualified resource name to load
 125  
      * @param languages
 126  
      *            the languages that these rules will support
 127  
      * @return a Lang encapsulating the loaded language-guessing rules.
 128  
      */
 129  
     public static Lang loadFromResource(final String languageRulesResourceName, final Languages languages) {
 130  4
         final List<LangRule> rules = new ArrayList<LangRule>();
 131  4
         final InputStream lRulesIS = Lang.class.getClassLoader().getResourceAsStream(languageRulesResourceName);
 132  
 
 133  4
         if (lRulesIS == null) {
 134  1
             throw new IllegalStateException("Unable to resolve required resource:" + LANGUAGE_RULES_RN);
 135  
         }
 136  
 
 137  3
         final Scanner scanner = new Scanner(lRulesIS, ResourceConstants.ENCODING);
 138  3
         boolean inExtendedComment = false;
 139  882
         while (scanner.hasNextLine()) {
 140  879
             final String rawLine = scanner.nextLine();
 141  879
             String line = rawLine;
 142  
 
 143  879
             if (inExtendedComment) {
 144  
                 // check for closing comment marker, otherwise discard doc comment line
 145  45
                 if (line.endsWith(ResourceConstants.EXT_CMT_END)) {
 146  3
                     inExtendedComment = false;
 147  
                 }
 148  
             } else {
 149  834
                 if (line.startsWith(ResourceConstants.EXT_CMT_START)) {
 150  3
                     inExtendedComment = true;
 151  
                 } else {
 152  
                     // discard comments
 153  831
                     final int cmtI = line.indexOf(ResourceConstants.CMT);
 154  831
                     if (cmtI >= 0) {
 155  144
                         line = line.substring(0, cmtI);
 156  
                     }
 157  
 
 158  
                     // trim leading-trailing whitespace
 159  831
                     line = line.trim();
 160  
 
 161  831
                     if (line.length() == 0) {
 162  75
                         continue; // empty lines can be safely skipped
 163  
                     }
 164  
 
 165  
                     // split it up
 166  756
                     final String[] parts = line.split("\\s+");
 167  
 
 168  756
                     if (parts.length != 3) {
 169  0
                         throw new IllegalArgumentException("Malformed line '" + rawLine + "' in language resource '" +
 170  
                                                            languageRulesResourceName + "'");
 171  
                     }
 172  
 
 173  756
                     final Pattern pattern = Pattern.compile(parts[0]);
 174  756
                     final String[] langs = parts[1].split("\\+");
 175  756
                     final boolean accept = parts[2].equals("true");
 176  
 
 177  756
                     rules.add(new LangRule(pattern, new HashSet<String>(Arrays.asList(langs)), accept));
 178  
                 }
 179  
             }
 180  804
         }
 181  
 
 182  3
         return new Lang(rules, languages);
 183  
     }
 184  
 
 185  
     private final Languages languages;
 186  
     private final List<LangRule> rules;
 187  
 
 188  3
     private Lang(final List<LangRule> rules, final Languages languages) {
 189  3
         this.rules = Collections.unmodifiableList(rules);
 190  3
         this.languages = languages;
 191  3
     }
 192  
 
 193  
     /**
 194  
      * Guesses the language of a word.
 195  
      *
 196  
      * @param text
 197  
      *            the word
 198  
      * @return the language that the word originates from or {@link Languages#ANY} if there was no unique match
 199  
      */
 200  
     public String guessLanguage(final String text) {
 201  0
         final Languages.LanguageSet ls = guessLanguages(text);
 202  0
         return ls.isSingleton() ? ls.getAny() : Languages.ANY;
 203  
     }
 204  
 
 205  
     /**
 206  
      * Guesses the languages of a word.
 207  
      *
 208  
      * @param input
 209  
      *            the word
 210  
      * @return a Set of Strings of language names that are potential matches for the input word
 211  
      */
 212  
     public Languages.LanguageSet guessLanguages(final String input) {
 213  67162
         final String text = input.toLowerCase(Locale.ENGLISH);
 214  
 
 215  67162
         final Set<String> langs = new HashSet<String>(this.languages.getLanguages());
 216  67162
         for (final LangRule rule : this.rules) {
 217  16924824
             if (rule.matches(text)) {
 218  3505
                 if (rule.acceptOnMatch) {
 219  407
                     langs.retainAll(rule.languages);
 220  
                 } else {
 221  3098
                     langs.removeAll(rule.languages);
 222  
                 }
 223  
             }
 224  
         }
 225  
 
 226  67162
         final Languages.LanguageSet ls = Languages.LanguageSet.from(langs);
 227  67162
         return ls.equals(Languages.NO_LANGUAGES) ? Languages.ANY_LANGUAGE : ls;
 228  
     }
 229  
 }