Coverage Report - org.apache.commons.codec.language.Metaphone
 
Classes in this File Line Coverage Branch Coverage Complexity
Metaphone
98%
144/146
84%
146/172
9.333
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *      http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.commons.codec.language;
 19  
 
 20  
 import org.apache.commons.codec.EncoderException;
 21  
 import org.apache.commons.codec.StringEncoder;
 22  
 
 23  
 /**
 24  
  * Encodes a string into a Metaphone value.
 25  
  * <p>
 26  
  * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>.
 27  
  * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere.
 28  
  * <p>
 29  
  * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990,
 30  
  * p 39.</CITE>
 31  
  * <p>
 32  
  * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl
 33  
  * <a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a>.
 34  
  * They have had undocumented changes from the originally published algorithm.
 35  
  * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
 36  
  * <p>
 37  
  * This class is conditionally thread-safe.
 38  
  * The instance field {@link #maxCodeLen} is mutable {@link #setMaxCodeLen(int)}
 39  
  * but is not volatile, and accesses are not synchronised.
 40  
  * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronisation
 41  
  * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)}
 42  
  * after initial setup.
 43  
  *
 44  
  * @version $Id$
 45  
  */
 46  
 public class Metaphone implements StringEncoder {
 47  
 
 48  
     /**
 49  
      * Five values in the English language
 50  
      */
 51  
     private static final String VOWELS = "AEIOU";
 52  
 
 53  
     /**
 54  
      * Variable used in Metaphone algorithm
 55  
      */
 56  
     private static final String FRONTV = "EIY";
 57  
 
 58  
     /**
 59  
      * Variable used in Metaphone algorithm
 60  
      */
 61  
     private static final String VARSON = "CSPTG";
 62  
 
 63  
     /**
 64  
      * The max code length for metaphone is 4
 65  
      */
 66  34
     private int maxCodeLen = 4;
 67  
 
 68  
     /**
 69  
      * Creates an instance of the Metaphone encoder
 70  
      */
 71  
     public Metaphone() {
 72  34
         super();
 73  34
     }
 74  
 
 75  
     /**
 76  
      * Find the metaphone value of a String. This is similar to the
 77  
      * soundex algorithm, but better at finding similar sounding words.
 78  
      * All input is converted to upper case.
 79  
      * Limitations: Input format is expected to be a single ASCII word
 80  
      * with only characters in the A - Z range, no punctuation or numbers.
 81  
      *
 82  
      * @param txt String to find the metaphone code for
 83  
      * @return A metaphone code corresponding to the String supplied
 84  
      */
 85  
     public String metaphone(String txt) {
 86  13774
         boolean hard = false;
 87  13774
         if (txt == null || txt.length() == 0) {
 88  2
             return "";
 89  
         }
 90  
         // single character is itself
 91  13772
         if (txt.length() == 1) {
 92  8
             return txt.toUpperCase(java.util.Locale.ENGLISH);
 93  
         }
 94  
 
 95  13764
         char[] inwd = txt.toUpperCase(java.util.Locale.ENGLISH).toCharArray();
 96  
 
 97  13764
         StringBuilder local = new StringBuilder(40); // manipulate
 98  13764
         StringBuilder code = new StringBuilder(10); //   output
 99  
         // handle initial 2 characters exceptions
 100  13764
         switch(inwd[0]) {
 101  
         case 'K':
 102  
         case 'G':
 103  
         case 'P': /* looking for KN, etc*/
 104  3901
             if (inwd[1] == 'N') {
 105  17
                 local.append(inwd, 1, inwd.length - 1);
 106  
             } else {
 107  3884
                 local.append(inwd);
 108  
             }
 109  3884
             break;
 110  
         case 'A': /* looking for AE */
 111  64
             if (inwd[1] == 'E') {
 112  1
                 local.append(inwd, 1, inwd.length - 1);
 113  
             } else {
 114  63
                 local.append(inwd);
 115  
             }
 116  63
             break;
 117  
         case 'W': /* looking for WR or WH */
 118  318
             if (inwd[1] == 'R') {   // WR -> R
 119  3
                 local.append(inwd, 1, inwd.length - 1);
 120  3
                 break;
 121  
             }
 122  315
             if (inwd[1] == 'H') {
 123  38
                 local.append(inwd, 1, inwd.length - 1);
 124  38
                 local.setCharAt(0, 'W'); // WH -> W
 125  
             } else {
 126  277
                 local.append(inwd);
 127  
             }
 128  277
             break;
 129  
         case 'X': /* initial X becomes S */
 130  28
             inwd[0] = 'S';
 131  28
             local.append(inwd);
 132  28
             break;
 133  
         default:
 134  9453
             local.append(inwd);
 135  
         } // now local has working string with initials fixed
 136  
 
 137  13764
         int wdsz = local.length();
 138  13764
         int n = 0;
 139  
 
 140  
         while (code.length() < this.getMaxCodeLen() &&
 141  81745
                n < wdsz ) { // max code size of 4 works well
 142  67981
             char symb = local.charAt(n);
 143  
             // remove duplicate letters except C
 144  67981
             if (symb != 'C' && isPreviousChar( local, n, symb ) ) {
 145  5607
                 n++;
 146  
             } else { // not dup
 147  62374
                 switch(symb) {
 148  
                 case 'A':
 149  
                 case 'E':
 150  
                 case 'I':
 151  
                 case 'O':
 152  
                 case 'U':
 153  30521
                     if (n == 0) {
 154  70
                         code.append(symb);
 155  
                     }
 156  
                     break; // only use vowel if leading char
 157  
                 case 'B':
 158  65
                     if ( isPreviousChar(local, n, 'M') &&
 159  
                          isLastChar(wdsz, n) ) { // B is silent if word ends in MB
 160  3
                         break;
 161  
                     }
 162  62
                     code.append(symb);
 163  62
                     break;
 164  
                 case 'C': // lots of C special cases
 165  
                     /* discard if SCI, SCE or SCY */
 166  1982
                     if ( isPreviousChar(local, n, 'S') &&
 167  
                          !isLastChar(wdsz, n) &&
 168  
                          FRONTV.indexOf(local.charAt(n + 1)) >= 0 ) {
 169  3
                         break;
 170  
                     }
 171  1979
                     if (regionMatch(local, n, "CIA")) { // "CIA" -> X
 172  1
                         code.append('X');
 173  1
                         break;
 174  
                     }
 175  1978
                     if (!isLastChar(wdsz, n) &&
 176  
                         FRONTV.indexOf(local.charAt(n + 1)) >= 0) {
 177  82
                         code.append('S');
 178  82
                         break; // CI,CE,CY -> S
 179  
                     }
 180  1896
                     if (isPreviousChar(local, n, 'S') &&
 181  
                         isNextChar(local, n, 'H') ) { // SCH->sk
 182  2
                         code.append('K');
 183  2
                         break;
 184  
                     }
 185  1894
                     if (isNextChar(local, n, 'H')) { // detect CH
 186  4
                         if (n == 0 &&
 187  
                             wdsz >= 3 &&
 188  
                             isVowel(local,2) ) { // CH consonant -> K consonant
 189  1
                             code.append('K');
 190  
                         } else {
 191  3
                             code.append('X'); // CHvowel -> X
 192  
                         }
 193  
                     } else {
 194  1890
                         code.append('K');
 195  
                     }
 196  1890
                     break;
 197  
                 case 'D':
 198  445
                     if (!isLastChar(wdsz, n + 1) &&
 199  
                         isNextChar(local, n, 'G') &&
 200  
                         FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J
 201  3
                         code.append('J'); n += 2;
 202  
                     } else {
 203  442
                         code.append('T');
 204  
                     }
 205  442
                     break;
 206  
                 case 'G': // GH silent at end or before consonant
 207  1705
                     if (isLastChar(wdsz, n + 1) &&
 208  
                         isNextChar(local, n, 'H')) {
 209  1
                         break;
 210  
                     }
 211  1704
                     if (!isLastChar(wdsz, n + 1) &&
 212  
                         isNextChar(local,n,'H') &&
 213  
                         !isVowel(local,n+2)) {
 214  19
                         break;
 215  
                     }
 216  1685
                     if (n > 0 &&
 217  
                         ( regionMatch(local, n, "GN") ||
 218  
                           regionMatch(local, n, "GNED") ) ) {
 219  0
                         break; // silent G
 220  
                     }
 221  1684
                     if (isPreviousChar(local, n, 'G')) {
 222  
                         // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
 223  0
                         hard = true;
 224  
                     } else {
 225  1684
                         hard = false;
 226  
                     }
 227  1684
                     if (!isLastChar(wdsz, n) &&
 228  
                         FRONTV.indexOf(local.charAt(n + 1)) >= 0 &&
 229  
                         !hard) {
 230  1547
                         code.append('J');
 231  
                     } else {
 232  137
                         code.append('K');
 233  
                     }
 234  137
                     break;
 235  
                 case 'H':
 236  654
                     if (isLastChar(wdsz, n)) {
 237  205
                         break; // terminal H
 238  
                     }
 239  449
                     if (n > 0 &&
 240  
                         VARSON.indexOf(local.charAt(n - 1)) >= 0) {
 241  27
                         break;
 242  
                     }
 243  422
                     if (isVowel(local,n+1)) {
 244  1
                         code.append('H'); // Hvowel
 245  
                     }
 246  
                     break;
 247  
                 case 'F':
 248  
                 case 'J':
 249  
                 case 'L':
 250  
                 case 'M':
 251  
                 case 'N':
 252  
                 case 'R':
 253  20128
                     code.append(symb);
 254  20128
                     break;
 255  
                 case 'K':
 256  1963
                     if (n > 0) { // not initial
 257  5
                         if (!isPreviousChar(local, n, 'C')) {
 258  2
                             code.append(symb);
 259  
                         }
 260  
                     } else {
 261  1958
                         code.append(symb); // initial K
 262  
                     }
 263  1958
                     break;
 264  
                 case 'P':
 265  245
                     if (isNextChar(local,n,'H')) {
 266  
                         // PH -> F
 267  1
                         code.append('F');
 268  
                     } else {
 269  244
                         code.append(symb);
 270  
                     }
 271  244
                     break;
 272  
                 case 'Q':
 273  3
                     code.append('K');
 274  3
                     break;
 275  
                 case 'S':
 276  673
                     if (regionMatch(local,n,"SH") ||
 277  
                         regionMatch(local,n,"SIO") ||
 278  
                         regionMatch(local,n,"SIA")) {
 279  4
                         code.append('X');
 280  
                     } else {
 281  669
                         code.append('S');
 282  
                     }
 283  669
                     break;
 284  
                 case 'T':
 285  640
                     if (regionMatch(local,n,"TIA") ||
 286  
                         regionMatch(local,n,"TIO")) {
 287  2
                         code.append('X');
 288  2
                         break;
 289  
                     }
 290  638
                     if (regionMatch(local,n,"TCH")) {
 291  
                         // Silent if in "TCH"
 292  2
                         break;
 293  
                     }
 294  
                     // substitute numeral 0 for TH (resembles theta after all)
 295  636
                     if (regionMatch(local,n,"TH")) {
 296  2
                         code.append('0');
 297  
                     } else {
 298  634
                         code.append('T');
 299  
                     }
 300  634
                     break;
 301  
                 case 'V':
 302  1
                     code.append('F'); break;
 303  
                 case 'W':
 304  
                 case 'Y': // silent if not followed by vowel
 305  2847
                     if (!isLastChar(wdsz,n) &&
 306  
                         isVowel(local,n+1)) {
 307  314
                         code.append(symb);
 308  
                     }
 309  
                     break;
 310  
                 case 'X':
 311  6
                     code.append('K');
 312  6
                     code.append('S');
 313  6
                     break;
 314  
                 case 'Z':
 315  139
                     code.append('S');
 316  139
                     break;
 317  
                 default:
 318  
                     // do nothing
 319  
                     break;
 320  
                 } // end switch
 321  62374
                 n++;
 322  
             } // end else from symb != 'C'
 323  67981
             if (code.length() > this.getMaxCodeLen()) {
 324  2
                 code.setLength(this.getMaxCodeLen());
 325  
             }
 326  67981
         }
 327  13764
         return code.toString();
 328  
     }
 329  
 
 330  
     private boolean isVowel(StringBuilder string, int index) {
 331  1132
         return VOWELS.indexOf(string.charAt(index)) >= 0;
 332  
     }
 333  
 
 334  
     private boolean isPreviousChar(StringBuilder string, int index, char c) {
 335  71631
         boolean matches = false;
 336  71631
         if( index > 0 &&
 337  
             index < string.length() ) {
 338  54295
             matches = string.charAt(index - 1) == c;
 339  
         }
 340  71631
         return matches;
 341  
     }
 342  
 
 343  
     private boolean isNextChar(StringBuilder string, int index, char c) {
 344  4062
         boolean matches = false;
 345  4062
         if( index >= 0 &&
 346  
             index < string.length() - 1 ) {
 347  4035
             matches = string.charAt(index + 1) == c;
 348  
         }
 349  4062
         return matches;
 350  
     }
 351  
 
 352  
     private boolean regionMatch(StringBuilder string, int index, String test) {
 353  6549
         boolean matches = false;
 354  6549
         if( index >= 0 &&
 355  
             index + test.length() - 1 < string.length() ) {
 356  5282
             String substring = string.substring( index, index + test.length());
 357  5282
             matches = substring.equals( test );
 358  
         }
 359  6549
         return matches;
 360  
     }
 361  
 
 362  
     private boolean isLastChar(int wdsz, int n) {
 363  11025
         return n + 1 == wdsz;
 364  
     }
 365  
 
 366  
 
 367  
     /**
 368  
      * Encodes an Object using the metaphone algorithm.  This method
 369  
      * is provided in order to satisfy the requirements of the
 370  
      * Encoder interface, and will throw an EncoderException if the
 371  
      * supplied object is not of type java.lang.String.
 372  
      *
 373  
      * @param obj Object to encode
 374  
      * @return An object (or type java.lang.String) containing the
 375  
      *         metaphone code which corresponds to the String supplied.
 376  
      * @throws EncoderException if the parameter supplied is not
 377  
      *                          of type java.lang.String
 378  
      */
 379  
     @Override
 380  
     public Object encode(Object obj) throws EncoderException {
 381  4
         if (!(obj instanceof String)) {
 382  1
             throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
 383  
         }
 384  3
         return metaphone((String) obj);
 385  
     }
 386  
 
 387  
     /**
 388  
      * Encodes a String using the Metaphone algorithm.
 389  
      *
 390  
      * @param str String object to encode
 391  
      * @return The metaphone code corresponding to the String supplied
 392  
      */
 393  
     @Override
 394  
     public String encode(String str) {
 395  7
         return metaphone(str);
 396  
     }
 397  
 
 398  
     /**
 399  
      * Tests is the metaphones of two strings are identical.
 400  
      *
 401  
      * @param str1 First of two strings to compare
 402  
      * @param str2 Second of two strings to compare
 403  
      * @return {@code true} if the metaphones of these strings are identical,
 404  
      *        {@code false} otherwise.
 405  
      */
 406  
     public boolean isMetaphoneEqual(String str1, String str2) {
 407  6862
         return metaphone(str1).equals(metaphone(str2));
 408  
     }
 409  
 
 410  
     /**
 411  
      * Returns the maxCodeLen.
 412  
      * @return int
 413  
      */
 414  149728
     public int getMaxCodeLen() { return this.maxCodeLen; }
 415  
 
 416  
     /**
 417  
      * Sets the maxCodeLen.
 418  
      * @param maxCodeLen The maxCodeLen to set
 419  
      */
 420  1
     public void setMaxCodeLen(int maxCodeLen) { this.maxCodeLen = maxCodeLen; }
 421  
 
 422  
 }