Coverage Report - org.apache.commons.codec.language.MatchRatingApproachEncoder
 
Classes in this File Line Coverage Branch Coverage Complexity
MatchRatingApproachEncoder
100%
113/113
97%
78/80
5.455
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *      http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 package org.apache.commons.codec.language;
 18  
 
 19  
 import java.util.Locale;
 20  
 
 21  
 import org.apache.commons.codec.EncoderException;
 22  
 import org.apache.commons.codec.StringEncoder;
 23  
 
 24  
 /**
 25  
  * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE> in 1977.
 26  
  *
 27  
  * This class is immutable and thread-safe.
 28  
  *
 29  
  * @see <a href="http://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a>
 30  
  * @since 1.8
 31  
  */
 32  96
 public class MatchRatingApproachEncoder implements StringEncoder {
 33  
 
 34  
     private static final String SPACE = " ";
 35  
 
 36  
     private static final String EMPTY = "";
 37  
 
 38  
     /**
 39  
      * Constants used mainly for the min rating value.
 40  
      */
 41  
     private static final int ONE = 1, TWO = 2, THREE = 3, FOUR = 4, FIVE = 5, SIX = 6, SEVEN = 7, EIGHT = 8,
 42  
                              ELEVEN = 11, TWELVE = 12;
 43  
 
 44  
     /**
 45  
      * The plain letter equivalent of the accented letters.
 46  
      */
 47  
     private static final String PLAIN_ASCII = "AaEeIiOoUu" + // grave
 48  
             "AaEeIiOoUuYy" + // acute
 49  
             "AaEeIiOoUuYy" + // circumflex
 50  
             "AaOoNn" + // tilde
 51  
             "AaEeIiOoUuYy" + // umlaut
 52  
             "Aa" + // ring
 53  
             "Cc" + // cedilla
 54  
             "OoUu"; // double acute
 55  
 
 56  
     /**
 57  
      * Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc...
 58  
      */
 59  
     private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" +
 60  
             "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" +
 61  
             "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" +
 62  
             "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" +
 63  
             "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" +
 64  
             "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171";
 65  
 
 66  1
     private static final String[] DOUBLE_CONSONANT =
 67  
             new String[] { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS",
 68  
                            "TT", "VV", "WW", "XX", "YY", "ZZ" };
 69  
 
 70  
     /**
 71  
      * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any
 72  
      * spaces.
 73  
      *
 74  
      * <h2>API Usage</h2>
 75  
      * <p>
 76  
      * Consider this method private, it is package protected for unit testing only.
 77  
      * </p>
 78  
      *
 79  
      * @param name
 80  
      *            The name to be cleaned
 81  
      * @return The cleaned name
 82  
      */
 83  
     String cleanName(final String name) {
 84  78
         String upperName = name.toUpperCase(Locale.ENGLISH);
 85  
 
 86  78
         final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
 87  468
         for (final String str : charsToTrim) {
 88  390
             upperName = upperName.replaceAll(str, EMPTY);
 89  
         }
 90  
 
 91  78
         upperName = removeAccents(upperName);
 92  78
         upperName = upperName.replaceAll("\\s+", EMPTY);
 93  
 
 94  78
         return upperName;
 95  
     }
 96  
 
 97  
     /**
 98  
      * Encodes an Object using the Match Rating Approach algo. Method is here to satisfy the requirements of the
 99  
      * Encoder interface Throws an EncoderException if input object is not of type java.lang.String.
 100  
      *
 101  
      * @param pObject
 102  
      *            Object to encode
 103  
      * @return An object (or type java.lang.String) containing the Match Rating Approach code which corresponds to the
 104  
      *         String supplied.
 105  
      * @throws EncoderException
 106  
      *             if the parameter supplied is not of type java.lang.String
 107  
      */
 108  
     @Override
 109  
     public final Object encode(final Object pObject) throws EncoderException {
 110  4
         if (!(pObject instanceof String)) {
 111  1
             throw new EncoderException(
 112  
                     "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String");
 113  
         }
 114  3
         return encode((String) pObject);
 115  
     }
 116  
 
 117  
     /**
 118  
      * Encodes a String using the Match Rating Approach (MRA) algorithm.
 119  
      *
 120  
      * @param name
 121  
      *            String object to encode
 122  
      * @return The MRA code corresponding to the String supplied
 123  
      */
 124  
     @Override
 125  
     public final String encode(String name) {
 126  
         // Bulletproof for trivial input - NINO
 127  17
         if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) {
 128  14
             return EMPTY;
 129  
         }
 130  
 
 131  
         // Preprocessing
 132  3
         name = cleanName(name);
 133  
 
 134  
         // BEGIN: Actual encoding part of the algorithm...
 135  
         // 1. Delete all vowels unless the vowel begins the word
 136  3
         name = removeVowels(name);
 137  
 
 138  
         // 2. Remove second consonant from any double consonant
 139  3
         name = removeDoubleConsonants(name);
 140  
 
 141  
         // 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters
 142  3
         name = getFirst3Last3(name);
 143  
 
 144  3
         return name;
 145  
     }
 146  
 
 147  
     /**
 148  
      * Gets the first & last 3 letters of a name (if > 6 characters) Else just returns the name.
 149  
      *
 150  
      * <h2>API Usage</h2>
 151  
      * <p>
 152  
      * Consider this method private, it is package protected for unit testing only.
 153  
      * </p>
 154  
      *
 155  
      * @param name
 156  
      *            The string to get the substrings from
 157  
      * @return Annexed first & last 3 letters of input word.
 158  
      */
 159  
     String getFirst3Last3(final String name) {
 160  79
         final int nameLength = name.length();
 161  
 
 162  79
         if (nameLength > SIX) {
 163  11
             final String firstThree = name.substring(0, THREE);
 164  11
             final String lastThree = name.substring(nameLength - THREE, nameLength);
 165  11
             return firstThree + lastThree;
 166  
         } else {
 167  68
             return name;
 168  
         }
 169  
     }
 170  
 
 171  
     /**
 172  
      * Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the
 173  
      * min rating. Values strictly from documentation.
 174  
      *
 175  
      * <h2>API Usage</h2>
 176  
      * <p>
 177  
      * Consider this method private, it is package protected for unit testing only.
 178  
      * </p>
 179  
      *
 180  
      * @param sumLength
 181  
      *            The length of 2 strings sent down
 182  
      * @return The min rating value
 183  
      */
 184  
     int getMinRating(final int sumLength) {
 185  47
         int minRating = 0;
 186  
 
 187  47
         if (sumLength <= FOUR) {
 188  4
             minRating = FIVE;
 189  43
         } else if (sumLength >= FIVE && sumLength <= SEVEN) {
 190  18
             minRating = FOUR;
 191  25
         } else if (sumLength >= EIGHT && sumLength <= ELEVEN) {
 192  17
             minRating = THREE;
 193  8
         } else if (sumLength == TWELVE) {
 194  7
             minRating = TWO;
 195  
         } else {
 196  1
             minRating = ONE; // docs said little here.
 197  
         }
 198  
 
 199  47
         return minRating;
 200  
     }
 201  
 
 202  
     /**
 203  
      * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the
 204  
      * strings are cleaned in the same way as {@link #encode(String)}.
 205  
      *
 206  
      * @param name1
 207  
      *            First of the 2 strings (names) to compare
 208  
      * @param name2
 209  
      *            Second of the 2 names to compare
 210  
      * @return <code>true</code> if the encodings are identical <code>false</code> otherwise.
 211  
      */
 212  
     public boolean isEncodeEquals(String name1, String name2) {
 213  
         // Bulletproof for trivial input - NINO
 214  51
         if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) {
 215  5
             return false;
 216  46
         } else if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) {
 217  5
             return false;
 218  41
         } else if (name1.length() == 1 || name2.length() == 1) {
 219  3
             return false;
 220  38
         } else if (name1.equalsIgnoreCase(name2)) {
 221  1
             return true;
 222  
         }
 223  
 
 224  
         // Preprocessing
 225  37
         name1 = cleanName(name1);
 226  37
         name2 = cleanName(name2);
 227  
 
 228  
         // Actual MRA Algorithm
 229  
 
 230  
         // 1. Remove vowels
 231  37
         name1 = removeVowels(name1);
 232  37
         name2 = removeVowels(name2);
 233  
 
 234  
         // 2. Remove double consonants
 235  37
         name1 = removeDoubleConsonants(name1);
 236  37
         name2 = removeDoubleConsonants(name2);
 237  
 
 238  
         // 3. Reduce down to 3 letters
 239  37
         name1 = getFirst3Last3(name1);
 240  37
         name2 = getFirst3Last3(name2);
 241  
 
 242  
         // 4. Check for length difference - if 3 or greater then no similarity
 243  
         // comparison is done
 244  37
         if (Math.abs(name1.length() - name2.length()) >= THREE) {
 245  1
             return false;
 246  
         }
 247  
 
 248  
         // 5. Obtain the minimum rating value by calculating the length sum of the
 249  
         // encoded Strings and sending it down.
 250  36
         final int sumLength = Math.abs(name1.length() + name2.length());
 251  36
         int minRating = 0;
 252  36
         minRating = getMinRating(sumLength);
 253  
 
 254  
         // 6. Process the encoded Strings from left to right and remove any
 255  
         // identical characters found from both Strings respectively.
 256  36
         final int count = leftToRightThenRightToLeftProcessing(name1, name2);
 257  
 
 258  
         // 7. Each PNI item that has a similarity rating equal to or greater than
 259  
         // the min is considered to be a good candidate match
 260  36
         return count >= minRating;
 261  
 
 262  
     }
 263  
 
 264  
     /**
 265  
      * Determines if a letter is a vowel.
 266  
      *
 267  
      * <h2>API Usage</h2>
 268  
      * <p>
 269  
      * Consider this method private, it is package protected for unit testing only.
 270  
      * </p>
 271  
      *
 272  
      * @param letter
 273  
      *            The letter under investiagtion
 274  
      * @return True if a vowel, else false
 275  
      */
 276  
     boolean isVowel(final String letter) {
 277  83
         return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") ||
 278  
                letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U");
 279  
     }
 280  
 
 281  
     /**
 282  
      * Processes the names from left to right (first) then right to left removing identical letters in same positions.
 283  
      * Then subtracts the longer string that remains from 6 and returns this.
 284  
      *
 285  
      * <h2>API Usage</h2>
 286  
      * <p>
 287  
      * Consider this method private, it is package protected for unit testing only.
 288  
      * </p>
 289  
      *
 290  
      * @param name1
 291  
      *            name2
 292  
      * @return
 293  
      */
 294  
     int leftToRightThenRightToLeftProcessing(final String name1, final String name2) {
 295  38
         final char[] name1Char = name1.toCharArray();
 296  38
         final char[] name2Char = name2.toCharArray();
 297  
 
 298  38
         final int name1Size = name1.length() - 1;
 299  38
         final int name2Size = name2.length() - 1;
 300  
 
 301  38
         String name1LtRStart = EMPTY;
 302  38
         String name1LtREnd = EMPTY;
 303  
 
 304  38
         String name2RtLStart = EMPTY;
 305  38
         String name2RtLEnd = EMPTY;
 306  
 
 307  195
         for (int i = 0; i < name1Char.length; i++) {
 308  165
             if (i > name2Size) {
 309  8
                 break;
 310  
             }
 311  
 
 312  157
             name1LtRStart = name1.substring(i, i + 1);
 313  157
             name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1);
 314  
 
 315  157
             name2RtLStart = name2.substring(i, i + 1);
 316  157
             name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1);
 317  
 
 318  
             // Left to right...
 319  157
             if (name1LtRStart.equals(name2RtLStart)) {
 320  88
                 name1Char[i] = ' ';
 321  88
                 name2Char[i] = ' ';
 322  
             }
 323  
 
 324  
             // Right to left...
 325  157
             if (name1LtREnd.equals(name2RtLEnd)) {
 326  59
                 name1Char[name1Size - i] = ' ';
 327  59
                 name2Char[name2Size - i] = ' ';
 328  
             }
 329  
         }
 330  
 
 331  
         // Char arrays -> string & remove extraneous space
 332  38
         final String strA = new String(name1Char).replaceAll("\\s+", EMPTY);
 333  38
         final String strB = new String(name2Char).replaceAll("\\s+", EMPTY);
 334  
 
 335  
         // Final bit - subtract longest string from 6 and return this int value
 336  38
         if (strA.length() > strB.length()) {
 337  8
             return Math.abs(SIX - strA.length());
 338  
         } else {
 339  30
             return Math.abs(SIX - strB.length());
 340  
         }
 341  
     }
 342  
 
 343  
     /**
 344  
      * Removes accented letters and replaces with non-accented ascii equivalent Case is preserved.
 345  
      * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
 346  
      *
 347  
      * @param accentedWord
 348  
      *            The word that may have accents in it.
 349  
      * @return De-accented word
 350  
      */
 351  
     String removeAccents(final String accentedWord) {
 352  87
         if (accentedWord == null) {
 353  1
             return null;
 354  
         }
 355  
 
 356  86
         final StringBuilder sb = new StringBuilder();
 357  86
         final int n = accentedWord.length();
 358  
 
 359  794
         for (int i = 0; i < n; i++) {
 360  708
             final char c = accentedWord.charAt(i);
 361  708
             final int pos = UNICODE.indexOf(c);
 362  708
             if (pos > -1) {
 363  60
                 sb.append(PLAIN_ASCII.charAt(pos));
 364  
             } else {
 365  648
                 sb.append(c);
 366  
             }
 367  
         }
 368  
 
 369  86
         return sb.toString();
 370  
     }
 371  
 
 372  
     /**
 373  
      * Replaces any double consonant pair with the single letter equivalent.
 374  
      *
 375  
      * <h2>API Usage</h2>
 376  
      * <p>
 377  
      * Consider this method private, it is package protected for unit testing only.
 378  
      * </p>
 379  
      *
 380  
      * @param name
 381  
      *            String to have double consonants removed
 382  
      * @return Single consonant word
 383  
      */
 384  
     String removeDoubleConsonants(final String name) {
 385  80
         String replacedName = name.toUpperCase();
 386  1760
         for (final String dc : DOUBLE_CONSONANT) {
 387  1680
             if (replacedName.contains(dc)) {
 388  12
                 final String singleLetter = dc.substring(0, 1);
 389  12
                 replacedName = replacedName.replace(dc, singleLetter);
 390  
             }
 391  
         }
 392  80
         return replacedName;
 393  
     }
 394  
 
 395  
     /**
 396  
      * Deletes all vowels unless the vowel begins the word.
 397  
      *
 398  
      * <h2>API Usage</h2>
 399  
      * <p>
 400  
      * Consider this method private, it is package protected for unit testing only.
 401  
      * </p>
 402  
      *
 403  
      * @param name
 404  
      *            The name to have vowels removed
 405  
      * @return De-voweled word
 406  
      */
 407  
     String removeVowels(String name) {
 408  
         // Extract first letter
 409  80
         final String firstLetter = name.substring(0, 1);
 410  
 
 411  80
         name = name.replaceAll("A", EMPTY);
 412  80
         name = name.replaceAll("E", EMPTY);
 413  80
         name = name.replaceAll("I", EMPTY);
 414  80
         name = name.replaceAll("O", EMPTY);
 415  80
         name = name.replaceAll("U", EMPTY);
 416  
 
 417  80
         name = name.replaceAll("\\s{2,}\\b", SPACE);
 418  
 
 419  
         // return isVowel(firstLetter) ? (firstLetter + name) : name;
 420  80
         if (isVowel(firstLetter)) {
 421  16
             return firstLetter + name;
 422  
         } else {
 423  64
             return name;
 424  
         }
 425  
     }
 426  
 }