Coverage Report - org.apache.commons.codec.language.Nysiis
 
Classes in this File Line Coverage Branch Coverage Complexity
Nysiis
100%
86/86
100%
74/74
7.25
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *      http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.commons.codec.language;
 19  
 
 20  
 import java.util.regex.Pattern;
 21  
 
 22  
 import org.apache.commons.codec.EncoderException;
 23  
 import org.apache.commons.codec.StringEncoder;
 24  
 
 25  
 /**
 26  
  * Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
 27  
  * general purpose scheme to find word with similar phonemes.
 28  
  * <p>
 29  
  * NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
 30  
  * <p>
 31  
  * Algorithm description:
 32  
  * <pre>
 33  
  * 1. Transcode first characters of name
 34  
  *   1a. MAC ->   MCC
 35  
  *   1b. KN  ->   NN
 36  
  *   1c. K   ->   C
 37  
  *   1d. PH  ->   FF
 38  
  *   1e. PF  ->   FF
 39  
  *   1f. SCH ->   SSS
 40  
  * 2. Transcode last characters of name
 41  
  *   2a. EE, IE          ->   Y
 42  
  *   2b. DT,RT,RD,NT,ND  ->   D
 43  
  * 3. First character of key = first character of name
 44  
  * 4. Transcode remaining characters by following these rules, incrementing by one character each time
 45  
  *   4a. EV  ->   AF  else A,E,I,O,U -> A
 46  
  *   4b. Q   ->   G
 47  
  *   4c. Z   ->   S
 48  
  *   4d. M   ->   N
 49  
  *   4e. KN  ->   N   else K -> C
 50  
  *   4f. SCH ->   SSS
 51  
  *   4g. PH  ->   FF
 52  
  *   4h. H   ->   If previous or next is nonvowel, previous
 53  
  *   4i. W   ->   If previous is vowel, previous
 54  
  *   4j. Add current to key if current != last key character
 55  
  * 5. If last character is S, remove it
 56  
  * 6. If last characters are AY, replace with Y
 57  
  * 7. If last character is A, remove it
 58  
  * 8. Collapse all strings of repeated characters
 59  
  * 9. Add original first character of name as first character of key
 60  
  * </pre>
 61  
  * <p>
 62  
  * This class is immutable and thread-safe.
 63  
  *
 64  
  * @see <a href="http://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
 65  
  * @see <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
 66  
  * @see Soundex
 67  
  * @since 1.7
 68  
  * @version $Id$
 69  
  */
 70  
 public class Nysiis implements StringEncoder {
 71  
 
 72  1
     private static final char[] CHARS_A   = new char[] { 'A' };
 73  1
     private static final char[] CHARS_AF  = new char[] { 'A', 'F' };
 74  1
     private static final char[] CHARS_C   = new char[] { 'C' };
 75  1
     private static final char[] CHARS_FF  = new char[] { 'F', 'F' };
 76  1
     private static final char[] CHARS_G   = new char[] { 'G' };
 77  1
     private static final char[] CHARS_N   = new char[] { 'N' };
 78  1
     private static final char[] CHARS_NN  = new char[] { 'N', 'N' };
 79  1
     private static final char[] CHARS_S   = new char[] { 'S' };
 80  1
     private static final char[] CHARS_SSS = new char[] { 'S', 'S', 'S' };
 81  
 
 82  1
     private static final Pattern PAT_MAC    = Pattern.compile("^MAC");
 83  1
     private static final Pattern PAT_KN     = Pattern.compile("^KN");
 84  1
     private static final Pattern PAT_K      = Pattern.compile("^K");
 85  1
     private static final Pattern PAT_PH_PF  = Pattern.compile("^(PH|PF)");
 86  1
     private static final Pattern PAT_SCH    = Pattern.compile("^SCH");
 87  1
     private static final Pattern PAT_EE_IE  = Pattern.compile("(EE|IE)$");
 88  1
     private static final Pattern PAT_DT_ETC = Pattern.compile("(DT|RT|RD|NT|ND)$");
 89  
 
 90  
     private static final char SPACE = ' ';
 91  
     private static final int TRUE_LENGTH = 6;
 92  
 
 93  
     /**
 94  
      * Tests if the given character is a vowel.
 95  
      *
 96  
      * @param c
 97  
      *            the character to test
 98  
      * @return {@code true} if the character is a vowel, {@code false} otherwise
 99  
      */
 100  
     private static boolean isVowel(final char c) {
 101  340
         return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
 102  
     }
 103  
 
 104  
     /**
 105  
      * Transcodes the remaining parts of the String. The method operates on a sliding window, looking at 4 characters at
 106  
      * a time: [i-1, i, i+1, i+2].
 107  
      *
 108  
      * @param prev
 109  
      *            the previous character
 110  
      * @param curr
 111  
      *            the current character
 112  
      * @param next
 113  
      *            the next character
 114  
      * @param aNext
 115  
      *            the after next character
 116  
      * @return a transcoded array of characters, starting from the current position
 117  
      */
 118  
     private static char[] transcodeRemaining(final char prev, final char curr, final char next, final char aNext) {
 119  
         // 1. EV -> AF
 120  321
         if (curr == 'E' && next == 'V') {
 121  2
             return CHARS_AF;
 122  
         }
 123  
 
 124  
         // A, E, I, O, U -> A
 125  319
         if (isVowel(curr)) {
 126  111
             return CHARS_A;
 127  
         }
 128  
 
 129  
         // 2. Q -> G, Z -> S, M -> N
 130  208
         if (curr == 'Q') {
 131  2
             return CHARS_G;
 132  206
         } else if (curr == 'Z') {
 133  5
             return CHARS_S;
 134  201
         } else if (curr == 'M') {
 135  11
             return CHARS_N;
 136  
         }
 137  
 
 138  
         // 3. KN -> NN else K -> C
 139  190
         if (curr == 'K') {
 140  5
             if (next == 'N') {
 141  1
                 return CHARS_NN;
 142  
             } else {
 143  4
                 return CHARS_C;
 144  
             }
 145  
         }
 146  
 
 147  
         // 4. SCH -> SSS
 148  185
         if (curr == 'S' && next == 'C' && aNext == 'H') {
 149  2
             return CHARS_SSS;
 150  
         }
 151  
 
 152  
         // PH -> FF
 153  183
         if (curr == 'P' && next == 'H') {
 154  1
             return CHARS_FF;
 155  
         }
 156  
 
 157  
         // 5. H -> If previous or next is a non vowel, previous.
 158  182
         if (curr == 'H' && (!isVowel(prev) || !isVowel(next))) {
 159  11
             return new char[] { prev };
 160  
         }
 161  
 
 162  
         // 6. W -> If previous is vowel, previous.
 163  171
         if (curr == 'W' && isVowel(prev)) {
 164  4
             return new char[] { prev };
 165  
         }
 166  
 
 167  167
         return new char[] { curr };
 168  
     }
 169  
 
 170  
     /** Indicates the strict mode. */
 171  
     private final boolean strict;
 172  
 
 173  
     /**
 174  
      * Creates an instance of the {@link Nysiis} encoder with strict mode (original form),
 175  
      * i.e. encoded strings have a maximum length of 6.
 176  
      */
 177  
     public Nysiis() {
 178  23
         this(true);
 179  23
     }
 180  
 
 181  
     /**
 182  
      * Create an instance of the {@link Nysiis} encoder with the specified strict mode:
 183  
      *
 184  
      * <ul>
 185  
      *  <li>{@code true}: encoded strings have a maximum length of 6</li>
 186  
      *  <li>{@code false}: encoded strings may have arbitrary length</li>
 187  
      * </ul>
 188  
      *
 189  
      * @param strict
 190  
      *            the strict mode
 191  
      */
 192  47
     public Nysiis(final boolean strict) {
 193  47
         this.strict = strict;
 194  47
     }
 195  
 
 196  
     /**
 197  
      * Encodes an Object using the NYSIIS algorithm. This method is provided in order to satisfy the requirements of the
 198  
      * Encoder interface, and will throw an {@link EncoderException} if the supplied object is not of type
 199  
      * {@link String}.
 200  
      *
 201  
      * @param obj
 202  
      *            Object to encode
 203  
      * @return An object (or a {@link String}) containing the NYSIIS code which corresponds to the given String.
 204  
      * @throws EncoderException
 205  
      *            if the parameter supplied is not of a {@link String}
 206  
      * @throws IllegalArgumentException
 207  
      *            if a character is not mapped
 208  
      */
 209  
     @Override
 210  
     public Object encode(Object obj) throws EncoderException {
 211  4
         if (!(obj instanceof String)) {
 212  1
             throw new EncoderException("Parameter supplied to Nysiis encode is not of type java.lang.String");
 213  
         }
 214  3
         return this.nysiis((String) obj);
 215  
     }
 216  
 
 217  
     /**
 218  
      * Encodes a String using the NYSIIS algorithm.
 219  
      *
 220  
      * @param str
 221  
      *            A String object to encode
 222  
      * @return A Nysiis code corresponding to the String supplied
 223  
      * @throws IllegalArgumentException
 224  
      *            if a character is not mapped
 225  
      */
 226  
     @Override
 227  
     public String encode(String str) {
 228  93
         return this.nysiis(str);
 229  
     }
 230  
 
 231  
     /**
 232  
      * Indicates the strict mode for this {@link Nysiis} encoder.
 233  
      *
 234  
      * @return {@code true} if the encoder is configured for strict mode, {@code false} otherwise
 235  
      */
 236  
     public boolean isStrict() {
 237  92
         return this.strict;
 238  
     }
 239  
 
 240  
     /**
 241  
      * Retrieves the NYSIIS code for a given String object.
 242  
      *
 243  
      * @param str
 244  
      *            String to encode using the NYSIIS algorithm
 245  
      * @return A NYSIIS code for the String supplied
 246  
      */
 247  
     public String nysiis(String str) {
 248  96
         if (str == null) {
 249  1
             return null;
 250  
         }
 251  
 
 252  
         // Use the same clean rules as Soundex
 253  95
         str = SoundexUtils.clean(str);
 254  
 
 255  95
         if (str.length() == 0) {
 256  3
             return str;
 257  
         }
 258  
 
 259  
         // Translate first characters of name:
 260  
         // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
 261  92
         str = PAT_MAC.matcher(str).replaceFirst("MCC");
 262  92
         str = PAT_KN.matcher(str).replaceFirst("NN");
 263  92
         str = PAT_K.matcher(str).replaceFirst("C");
 264  92
         str = PAT_PH_PF.matcher(str).replaceFirst("FF");
 265  92
         str = PAT_SCH.matcher(str).replaceFirst("SSS");
 266  
 
 267  
         // Translate last characters of name:
 268  
         // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
 269  92
         str = PAT_EE_IE.matcher(str).replaceFirst("Y");
 270  92
         str = PAT_DT_ETC.matcher(str).replaceFirst("D");
 271  
 
 272  
         // First character of key = first character of name.
 273  92
         StringBuilder key = new StringBuilder(str.length());
 274  92
         key.append(str.charAt(0));
 275  
 
 276  
         // Transcode remaining characters, incrementing by one character each time
 277  92
         final char[] chars = str.toCharArray();
 278  92
         final int len = chars.length;
 279  
 
 280  413
         for (int i = 1; i < len; i++) {
 281  321
             final char next = i < len - 1 ? chars[i + 1] : SPACE;
 282  321
             final char aNext = i < len - 2 ? chars[i + 2] : SPACE;
 283  321
             final char[] transcoded = transcodeRemaining(chars[i - 1], chars[i], next, aNext);
 284  321
             System.arraycopy(transcoded, 0, chars, i, transcoded.length);
 285  
 
 286  
             // only append the current char to the key if it is different from the last one
 287  321
             if (chars[i] != chars[i - 1]) {
 288  254
                 key.append(chars[i]);
 289  
             }
 290  
         }
 291  
 
 292  92
         if (key.length() > 1) {
 293  86
             char lastChar = key.charAt(key.length() - 1);
 294  
 
 295  
             // If last character is S, remove it.
 296  86
             if (lastChar == 'S') {
 297  10
                 key.deleteCharAt(key.length() - 1);
 298  10
                 lastChar = key.charAt(key.length() - 1);
 299  
             }
 300  
 
 301  86
             if (key.length() > 2) {
 302  66
                 final char last2Char = key.charAt(key.length() - 2);
 303  
                 // If last characters are AY, replace with Y.
 304  66
                 if (last2Char == 'A' && lastChar == 'Y') {
 305  4
                     key.deleteCharAt(key.length() - 2);
 306  
                 }
 307  
             }
 308  
 
 309  
             // If last character is A, remove it.
 310  86
             if (lastChar == 'A') {
 311  12
                 key.deleteCharAt(key.length() - 1);
 312  
             }
 313  
         }
 314  
 
 315  92
         final String string = key.toString();
 316  92
         return this.isStrict() ? string.substring(0, Math.min(TRUE_LENGTH, string.length())) : string;
 317  
     }
 318  
 
 319  
 }