Coverage Report - org.apache.commons.codec.language.ColognePhonetic
 
Classes in this File Line Coverage Branch Coverage Complexity
ColognePhonetic
97%
83/85
93%
80/86
3.3
ColognePhonetic$CologneBuffer
100%
11/11
N/A
3.3
ColognePhonetic$CologneInputBuffer
78%
11/14
N/A
3.3
ColognePhonetic$CologneOutputBuffer
100%
9/9
N/A
3.3
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *      http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.commons.codec.language;
 19  
 
 20  
 import java.util.Locale;
 21  
 
 22  
 import org.apache.commons.codec.EncoderException;
 23  
 import org.apache.commons.codec.StringEncoder;
 24  
 
 25  
 /**
 26  
  * Encodes a string into a Cologne Phonetic value.
 27  
  * <p>
 28  
  * Implements the <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">K&ouml;lner Phonetik</a>
 29  
  * (Cologne Phonetic) algorithm issued by Hans Joachim Postel in 1969.
 30  
  * <p>
 31  
  * The <i>K&ouml;lner Phonetik</i> is a phonetic algorithm which is optimized for the German language.
 32  
  * It is related to the well-known soundex algorithm.
 33  
  * <p>
 34  
  *
 35  
  * <h2>Algorithm</h2>
 36  
  *
 37  
  * <ul>
 38  
  *
 39  
  * <li>
 40  
  * <h3>Step 1:</h3>
 41  
  * After preprocessing (conversion to upper case, transcription of <a
 42  
  * href="http://en.wikipedia.org/wiki/Germanic_umlaut">germanic umlauts</a>, removal of non alphabetical characters) the
 43  
  * letters of the supplied text are replaced by their phonetic code according to the following table.
 44  
  * <table border="1">
 45  
  * <tbody>
 46  
  * <tr>
 47  
  * <th>Letter</th>
 48  
  * <th>Context</th>
 49  
  * <th align="center">Code</th>
 50  
  * </tr>
 51  
  * <tr>
 52  
  * <td>A, E, I, J, O, U, Y</td>
 53  
  * <td></td>
 54  
  * <td align="center">0</td>
 55  
  * </tr>
 56  
  * <tr>
 57  
  *
 58  
  * <td>H</td>
 59  
  * <td></td>
 60  
  * <td align="center">-</td>
 61  
  * </tr>
 62  
  * <tr>
 63  
  * <td>B</td>
 64  
  * <td></td>
 65  
  * <td rowspan="2" align="center">1</td>
 66  
  * </tr>
 67  
  * <tr>
 68  
  * <td>P</td>
 69  
  * <td>not before H</td>
 70  
  *
 71  
  * </tr>
 72  
  * <tr>
 73  
  * <td>D, T</td>
 74  
  * <td>not before C, S, Z</td>
 75  
  * <td align="center">2</td>
 76  
  * </tr>
 77  
  * <tr>
 78  
  * <td>F, V, W</td>
 79  
  * <td></td>
 80  
  * <td rowspan="2" align="center">3</td>
 81  
  * </tr>
 82  
  * <tr>
 83  
  *
 84  
  * <td>P</td>
 85  
  * <td>before H</td>
 86  
  * </tr>
 87  
  * <tr>
 88  
  * <td>G, K, Q</td>
 89  
  * <td></td>
 90  
  * <td rowspan="3" align="center">4</td>
 91  
  * </tr>
 92  
  * <tr>
 93  
  * <td rowspan="2">C</td>
 94  
  * <td>at onset before A, H, K, L, O, Q, R, U, X</td>
 95  
  *
 96  
  * </tr>
 97  
  * <tr>
 98  
  * <td>before A, H, K, O, Q, U, X except after S, Z</td>
 99  
  * </tr>
 100  
  * <tr>
 101  
  * <td>X</td>
 102  
  * <td>not after C, K, Q</td>
 103  
  * <td align="center">48</td>
 104  
  * </tr>
 105  
  * <tr>
 106  
  * <td>L</td>
 107  
  * <td></td>
 108  
  *
 109  
  * <td align="center">5</td>
 110  
  * </tr>
 111  
  * <tr>
 112  
  * <td>M, N</td>
 113  
  * <td></td>
 114  
  * <td align="center">6</td>
 115  
  * </tr>
 116  
  * <tr>
 117  
  * <td>R</td>
 118  
  * <td></td>
 119  
  * <td align="center">7</td>
 120  
  * </tr>
 121  
  *
 122  
  * <tr>
 123  
  * <td>S, Z</td>
 124  
  * <td></td>
 125  
  * <td rowspan="6" align="center">8</td>
 126  
  * </tr>
 127  
  * <tr>
 128  
  * <td rowspan="3">C</td>
 129  
  * <td>after S, Z</td>
 130  
  * </tr>
 131  
  * <tr>
 132  
  * <td>at onset except before A, H, K, L, O, Q, R, U, X</td>
 133  
  * </tr>
 134  
  *
 135  
  * <tr>
 136  
  * <td>not before A, H, K, O, Q, U, X</td>
 137  
  * </tr>
 138  
  * <tr>
 139  
  * <td>D, T</td>
 140  
  * <td>before C, S, Z</td>
 141  
  * </tr>
 142  
  * <tr>
 143  
  * <td>X</td>
 144  
  * <td>after C, K, Q</td>
 145  
  * </tr>
 146  
  * </tbody>
 147  
  * </table>
 148  
  * <p>
 149  
  * <small><i>(Source: <a href= "http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik#Buchstabencodes" >Wikipedia (de):
 150  
  * K&ouml;lner Phonetik -- Buchstabencodes</a>)</i></small>
 151  
  * </p>
 152  
  *
 153  
  * <h4>Example:</h4>
 154  
  *
 155  
  * {@code "M}&uuml;{@code ller-L}&uuml;{@code denscheidt" => "MULLERLUDENSCHEIDT" => "6005507500206880022"}
 156  
  *
 157  
  * </li>
 158  
  *
 159  
  * <li>
 160  
  * <h3>Step 2:</h3>
 161  
  * Collapse of all multiple consecutive code digits.
 162  
  * <h4>Example:</h4>
 163  
  * {@code "6005507500206880022" => "6050750206802"}</li>
 164  
  *
 165  
  * <li>
 166  
  * <h3>Step 3:</h3>
 167  
  * Removal of all codes "0" except at the beginning. This means that two or more identical consecutive digits can occur
 168  
  * if they occur after removing the "0" digits.
 169  
  *
 170  
  * <h4>Example:</h4>
 171  
  * {@code "6050750206802" => "65752682"}</li>
 172  
  *
 173  
  * </ul>
 174  
  *
 175  
  * This class is thread-safe.
 176  
  *
 177  
  * @see <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Wikipedia (de): K&ouml;lner Phonetik (in German)</a>
 178  
  * @since 1.5
 179  
  */
 180  13
 public class ColognePhonetic implements StringEncoder {
 181  
 
 182  
     // Predefined char arrays for better performance and less GC load
 183  1
     private static final char[] AEIJOUY = new char[] { 'A', 'E', 'I', 'J', 'O', 'U', 'Y' };
 184  1
     private static final char[] SCZ = new char[] { 'S', 'C', 'Z' };
 185  1
     private static final char[] WFPV = new char[] { 'W', 'F', 'P', 'V' };
 186  1
     private static final char[] GKQ = new char[] { 'G', 'K', 'Q' };
 187  1
     private static final char[] CKQ = new char[] { 'C', 'K', 'Q' };
 188  1
     private static final char[] AHKLOQRUX = new char[] { 'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X' };
 189  1
     private static final char[] SZ = new char[] { 'S', 'Z' };
 190  1
     private static final char[] AHOUKQX = new char[] { 'A', 'H', 'O', 'U', 'K', 'Q', 'X' };
 191  1
     private static final char[] TDX = new char[] { 'T', 'D', 'X' };
 192  
 
 193  
     /**
 194  
      * This class is not thread-safe; the field {@link #length} is mutable.
 195  
      * However, it is not shared between threads, as it is constructed on demand
 196  
      * by the method {@link ColognePhonetic#colognePhonetic(String)}
 197  
      */
 198  
     private abstract class CologneBuffer {
 199  
 
 200  
         protected final char[] data;
 201  
 
 202  200
         protected int length = 0;
 203  
 
 204  100
         public CologneBuffer(final char[] data) {
 205  100
             this.data = data;
 206  100
             this.length = data.length;
 207  100
         }
 208  
 
 209  100
         public CologneBuffer(final int buffSize) {
 210  100
             this.data = new char[buffSize];
 211  100
             this.length = 0;
 212  100
         }
 213  
 
 214  
         protected abstract char[] copyData(int start, final int length);
 215  
 
 216  
         public int length() {
 217  566
             return length;
 218  
         }
 219  
 
 220  
         @Override
 221  
         public String toString() {
 222  100
             return new String(copyData(0, length));
 223  
         }
 224  
     }
 225  
 
 226  
     private class CologneOutputBuffer extends CologneBuffer {
 227  
 
 228  100
         public CologneOutputBuffer(final int buffSize) {
 229  100
             super(buffSize);
 230  100
         }
 231  
 
 232  
         public void addRight(final char chr) {
 233  241
             data[length] = chr;
 234  241
             length++;
 235  241
         }
 236  
 
 237  
         @Override
 238  
         protected char[] copyData(final int start, final int length) {
 239  100
             final char[] newData = new char[length];
 240  100
             System.arraycopy(data, start, newData, 0, length);
 241  100
             return newData;
 242  
         }
 243  
     }
 244  
 
 245  13
     private class CologneInputBuffer extends CologneBuffer {
 246  
 
 247  100
         public CologneInputBuffer(final char[] data) {
 248  100
             super(data);
 249  100
         }
 250  
 
 251  
         public void addLeft(final char ch) {
 252  3
             length++;
 253  3
             data[getNextPos()] = ch;
 254  3
         }
 255  
 
 256  
         @Override
 257  
         protected char[] copyData(final int start, final int length) {
 258  0
             final char[] newData = new char[length];
 259  0
             System.arraycopy(data, data.length - this.length + start, newData, 0, length);
 260  0
             return newData;
 261  
         }
 262  
 
 263  
         public char getNextChar() {
 264  831
             return data[getNextPos()];
 265  
         }
 266  
 
 267  
         protected int getNextPos() {
 268  834
             return data.length - length;
 269  
         }
 270  
 
 271  
         public char removeNext() {
 272  466
             final char ch = getNextChar();
 273  466
             length--;
 274  466
             return ch;
 275  
         }
 276  
     }
 277  
 
 278  
     /**
 279  
      * Maps some Germanic characters to plain for internal processing. The following characters are mapped:
 280  
      * <ul>
 281  
      * <li>capital a, umlaut mark</li>
 282  
      * <li>capital u, umlaut mark</li>
 283  
      * <li>capital o, umlaut mark</li>
 284  
      * <li>small sharp s, German</li>
 285  
      * </ul>
 286  
      */
 287  1
     private static final char[][] PREPROCESS_MAP = new char[][]{
 288  
         {'\u00C4', 'A'}, // capital a, umlaut mark
 289  
         {'\u00DC', 'U'}, // capital u, umlaut mark
 290  
         {'\u00D6', 'O'}, // capital o, umlaut mark
 291  
         {'\u00DF', 'S'} // small sharp s, German
 292  
     };
 293  
 
 294  
     /*
 295  
      * Returns whether the array contains the key, or not.
 296  
      */
 297  
     private static boolean arrayContains(final char[] arr, final char key) {
 298  5029
         for (final char element : arr) {
 299  4255
             if (element == key) {
 300  249
                 return true;
 301  
             }
 302  
         }
 303  774
         return false;
 304  
     }
 305  
 
 306  
     /**
 307  
      * <p>
 308  
      * Implements the <i>K&ouml;lner Phonetik</i> algorithm.
 309  
      * </p>
 310  
      * <p>
 311  
      * In contrast to the initial description of the algorithm, this implementation does the encoding in one pass.
 312  
      * </p>
 313  
      *
 314  
      * @param text
 315  
      * @return the corresponding encoding according to the <i>K&ouml;lner Phonetik</i> algorithm
 316  
      */
 317  
     public String colognePhonetic(String text) {
 318  101
         if (text == null) {
 319  1
             return null;
 320  
         }
 321  
 
 322  100
         text = preprocess(text);
 323  
 
 324  100
         final CologneOutputBuffer output = new CologneOutputBuffer(text.length() * 2);
 325  100
         final CologneInputBuffer input = new CologneInputBuffer(text.toCharArray());
 326  
 
 327  
         char nextChar;
 328  
 
 329  100
         char lastChar = '-';
 330  100
         char lastCode = '/';
 331  
         char code;
 332  
         char chr;
 333  
 
 334  100
         int rightLength = input.length();
 335  
 
 336  566
         while (rightLength > 0) {
 337  466
             chr = input.removeNext();
 338  
 
 339  466
             if ((rightLength = input.length()) > 0) {
 340  365
                 nextChar = input.getNextChar();
 341  
             } else {
 342  101
                 nextChar = '-';
 343  
             }
 344  
 
 345  466
             if (arrayContains(AEIJOUY, chr)) {
 346  187
                 code = '0';
 347  279
             } else if (chr == 'H' || chr < 'A' || chr > 'Z') {
 348  42
                 if (lastCode == '/') {
 349  15
                     continue;
 350  
                 }
 351  27
                 code = '-';
 352  237
             } else if (chr == 'B' || (chr == 'P' && nextChar != 'H')) {
 353  19
                 code = '1';
 354  218
             } else if ((chr == 'D' || chr == 'T') && !arrayContains(SCZ, nextChar)) {
 355  19
                 code = '2';
 356  199
             } else if (arrayContains(WFPV, chr)) {
 357  14
                 code = '3';
 358  185
             } else if (arrayContains(GKQ, chr)) {
 359  16
                 code = '4';
 360  169
             } else if (chr == 'X' && !arrayContains(CKQ, lastChar)) {
 361  3
                 code = '4';
 362  3
                 input.addLeft('S');
 363  3
                 rightLength++;
 364  166
             } else if (chr == 'S' || chr == 'Z') {
 365  30
                 code = '8';
 366  136
             } else if (chr == 'C') {
 367  20
                 if (lastCode == '/') {
 368  3
                     if (arrayContains(AHKLOQRUX, nextChar)) {
 369  3
                         code = '4';
 370  
                     } else {
 371  0
                         code = '8';
 372  
                     }
 373  
                 } else {
 374  17
                     if (arrayContains(SZ, lastChar) || !arrayContains(AHOUKQX, nextChar)) {
 375  11
                         code = '8';
 376  
                     } else {
 377  6
                         code = '4';
 378  
                     }
 379  
                 }
 380  116
             } else if (arrayContains(TDX, chr)) {
 381  7
                 code = '8';
 382  109
             } else if (chr == 'R') {
 383  34
                 code = '7';
 384  75
             } else if (chr == 'L') {
 385  26
                 code = '5';
 386  49
             } else if (chr == 'M' || chr == 'N') {
 387  49
                 code = '6';
 388  
             } else {
 389  0
                 code = chr;
 390  
             }
 391  
 
 392  451
             if (code != '-' && (lastCode != code && (code != '0' || lastCode == '/') || code < '0' || code > '8')) {
 393  241
                 output.addRight(code);
 394  
             }
 395  
 
 396  451
             lastChar = chr;
 397  451
             lastCode = code;
 398  
         }
 399  100
         return output.toString();
 400  
     }
 401  
 
 402  
     @Override
 403  
     public Object encode(final Object object) throws EncoderException {
 404  4
         if (!(object instanceof String)) {
 405  1
             throw new EncoderException("This method's parameter was expected to be of the type " +
 406  
                 String.class.getName() +
 407  
                 ". But actually it was of the type " +
 408  
                 object.getClass().getName() +
 409  
                 ".");
 410  
         }
 411  3
         return encode((String) object);
 412  
     }
 413  
 
 414  
     @Override
 415  
     public String encode(final String text) {
 416  85
         return colognePhonetic(text);
 417  
     }
 418  
 
 419  
     public boolean isEncodeEqual(final String text1, final String text2) {
 420  8
         return colognePhonetic(text1).equals(colognePhonetic(text2));
 421  
     }
 422  
 
 423  
     /**
 424  
      * Converts the string to upper case and replaces germanic characters as defined in {@link #PREPROCESS_MAP}.
 425  
      */
 426  
     private String preprocess(String text) {
 427  100
         text = text.toUpperCase(Locale.GERMAN);
 428  
 
 429  100
         final char[] chrs = text.toCharArray();
 430  
 
 431  563
         for (int index = 0; index < chrs.length; index++) {
 432  463
             if (chrs[index] > 'Z') {
 433  19
                 for (final char[] element : PREPROCESS_MAP) {
 434  19
                     if (chrs[index] == element[0]) {
 435  10
                         chrs[index] = element[1];
 436  10
                         break;
 437  
                     }
 438  
                 }
 439  
             }
 440  
         }
 441  100
         return new String(chrs);
 442  
     }
 443  
 }