Coverage Report - org.apache.commons.codec.language.ColognePhonetic
 
Classes in this File Line Coverage Branch Coverage Complexity
ColognePhonetic
97%
74/76
93%
80/86
3.3
ColognePhonetic$CologneBuffer
100%
11/11
N/A
3.3
ColognePhonetic$CologneInputBuffer
78%
11/14
N/A
3.3
ColognePhonetic$CologneOutputBuffer
100%
9/9
N/A
3.3
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *      http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.commons.codec.language;
 19  
 
 20  
 import java.util.Locale;
 21  
 
 22  
 import org.apache.commons.codec.EncoderException;
 23  
 import org.apache.commons.codec.StringEncoder;
 24  
 
 25  
 /**
 26  
  * Encodes a string into a Cologne Phonetic value.
 27  
  * <p>
 28  
  * Implements the <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">K&ouml;lner Phonetik</a>
 29  
  * (Cologne Phonetic) algorithm issued by Hans Joachim Postel in 1969.
 30  
  * <p>
 31  
  * The <i>K&ouml;lner Phonetik</i> is a phonetic algorithm which is optimized for the German language.
 32  
  * It is related to the well-known soundex algorithm.
 33  
  * <p>
 34  
  *
 35  
  * <h2>Algorithm</h2>
 36  
  *
 37  
  * <ul>
 38  
  *
 39  
  * <li>
 40  
  * <h3>Step 1:</h3>
 41  
  * After preprocessing (conversion to upper case, transcription of <a
 42  
  * href="http://en.wikipedia.org/wiki/Germanic_umlaut">germanic umlauts</a>, removal of non alphabetical characters) the
 43  
  * letters of the supplied text are replaced by their phonetic code according to the following table.
 44  
  * <table border="1">
 45  
  * <tbody>
 46  
  * <tr>
 47  
  * <th>Letter</th>
 48  
  * <th>Context</th>
 49  
  * <th align="center">Code</th>
 50  
  * </tr>
 51  
  * <tr>
 52  
  * <td>A, E, I, J, O, U, Y</td>
 53  
  * <td></td>
 54  
  * <td align="center">0</td>
 55  
  * </tr>
 56  
  * <tr>
 57  
  *
 58  
  * <td>H</td>
 59  
  * <td></td>
 60  
  * <td align="center">-</td>
 61  
  * </tr>
 62  
  * <tr>
 63  
  * <td>B</td>
 64  
  * <td></td>
 65  
  * <td rowspan="2" align="center">1</td>
 66  
  * </tr>
 67  
  * <tr>
 68  
  * <td>P</td>
 69  
  * <td>not before H</td>
 70  
  *
 71  
  * </tr>
 72  
  * <tr>
 73  
  * <td>D, T</td>
 74  
  * <td>not before C, S, Z</td>
 75  
  * <td align="center">2</td>
 76  
  * </tr>
 77  
  * <tr>
 78  
  * <td>F, V, W</td>
 79  
  * <td></td>
 80  
  * <td rowspan="2" align="center">3</td>
 81  
  * </tr>
 82  
  * <tr>
 83  
  *
 84  
  * <td>P</td>
 85  
  * <td>before H</td>
 86  
  * </tr>
 87  
  * <tr>
 88  
  * <td>G, K, Q</td>
 89  
  * <td></td>
 90  
  * <td rowspan="3" align="center">4</td>
 91  
  * </tr>
 92  
  * <tr>
 93  
  * <td rowspan="2">C</td>
 94  
  * <td>at onset before A, H, K, L, O, Q, R, U, X</td>
 95  
  *
 96  
  * </tr>
 97  
  * <tr>
 98  
  * <td>before A, H, K, O, Q, U, X except after S, Z</td>
 99  
  * </tr>
 100  
  * <tr>
 101  
  * <td>X</td>
 102  
  * <td>not after C, K, Q</td>
 103  
  * <td align="center">48</td>
 104  
  * </tr>
 105  
  * <tr>
 106  
  * <td>L</td>
 107  
  * <td></td>
 108  
  *
 109  
  * <td align="center">5</td>
 110  
  * </tr>
 111  
  * <tr>
 112  
  * <td>M, N</td>
 113  
  * <td></td>
 114  
  * <td align="center">6</td>
 115  
  * </tr>
 116  
  * <tr>
 117  
  * <td>R</td>
 118  
  * <td></td>
 119  
  * <td align="center">7</td>
 120  
  * </tr>
 121  
  *
 122  
  * <tr>
 123  
  * <td>S, Z</td>
 124  
  * <td></td>
 125  
  * <td rowspan="6" align="center">8</td>
 126  
  * </tr>
 127  
  * <tr>
 128  
  * <td rowspan="3">C</td>
 129  
  * <td>after S, Z</td>
 130  
  * </tr>
 131  
  * <tr>
 132  
  * <td>at onset except before A, H, K, L, O, Q, R, U, X</td>
 133  
  * </tr>
 134  
  *
 135  
  * <tr>
 136  
  * <td>not before A, H, K, O, Q, U, X</td>
 137  
  * </tr>
 138  
  * <tr>
 139  
  * <td>D, T</td>
 140  
  * <td>before C, S, Z</td>
 141  
  * </tr>
 142  
  * <tr>
 143  
  * <td>X</td>
 144  
  * <td>after C, K, Q</td>
 145  
  * </tr>
 146  
  * </tbody>
 147  
  * </table>
 148  
  * <p>
 149  
  * <small><i>(Source: <a href= "http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik#Buchstabencodes" >Wikipedia (de):
 150  
  * K&ouml;lner Phonetik -- Buchstabencodes</a>)</i></small>
 151  
  * </p>
 152  
  *
 153  
  * <h4>Example:</h4>
 154  
  *
 155  
  * {@code "M}&uuml;{@code ller-L}&uuml;{@code denscheidt" => "MULLERLUDENSCHEIDT" => "6005507500206880022"}
 156  
  *
 157  
  * </li>
 158  
  *
 159  
  * <li>
 160  
  * <h3>Step 2:</h3>
 161  
  * Collapse of all multiple consecutive code digits.
 162  
  * <h4>Example:</h4>
 163  
  * {@code "6005507500206880022" => "6050750206802"}</li>
 164  
  *
 165  
  * <li>
 166  
  * <h3>Step 3:</h3>
 167  
  * Removal of all codes "0" except at the beginning. This means that two or more identical consecutive digits can occur
 168  
  * if they occur after removing the "0" digits.
 169  
  *
 170  
  * <h4>Example:</h4>
 171  
  * {@code "6050750206802" => "65752682"}</li>
 172  
  *
 173  
  * </ul>
 174  
  *
 175  
  * This class is thread-safe.
 176  
  *
 177  
  * @see <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Wikipedia (de): K&ouml;lner Phonetik (in German)</a>
 178  
  * @since 1.5
 179  
  */
 180  13
 public class ColognePhonetic implements StringEncoder {
 181  
 
 182  
     /**
 183  
      * This class is not thread-safe; the field {@link #length} is mutable.
 184  
      * However, it is not shared between threads, as it is constructed on demand
 185  
      * by the method {@link ColognePhonetic#colognePhonetic(String)}
 186  
      */
 187  
     private abstract class CologneBuffer {
 188  
 
 189  
         protected final char[] data;
 190  
 
 191  200
         protected int length = 0;
 192  
 
 193  100
         public CologneBuffer(char[] data) {
 194  100
             this.data = data;
 195  100
             this.length = data.length;
 196  100
         }
 197  
 
 198  100
         public CologneBuffer(int buffSize) {
 199  100
             this.data = new char[buffSize];
 200  100
             this.length = 0;
 201  100
         }
 202  
 
 203  
         protected abstract char[] copyData(int start, final int length);
 204  
 
 205  
         public int length() {
 206  566
             return length;
 207  
         }
 208  
 
 209  
         @Override
 210  
         public String toString() {
 211  100
             return new String(copyData(0, length));
 212  
         }
 213  
     }
 214  
 
 215  
     private class CologneOutputBuffer extends CologneBuffer {
 216  
 
 217  100
         public CologneOutputBuffer(int buffSize) {
 218  100
             super(buffSize);
 219  100
         }
 220  
 
 221  
         public void addRight(char chr) {
 222  241
             data[length] = chr;
 223  241
             length++;
 224  241
         }
 225  
 
 226  
         @Override
 227  
         protected char[] copyData(int start, final int length) {
 228  100
             char[] newData = new char[length];
 229  100
             System.arraycopy(data, start, newData, 0, length);
 230  100
             return newData;
 231  
         }
 232  
     }
 233  
 
 234  13
     private class CologneInputBuffer extends CologneBuffer {
 235  
 
 236  100
         public CologneInputBuffer(char[] data) {
 237  100
             super(data);
 238  100
         }
 239  
 
 240  
         public void addLeft(char ch) {
 241  3
             length++;
 242  3
             data[getNextPos()] = ch;
 243  3
         }
 244  
 
 245  
         @Override
 246  
         protected char[] copyData(int start, final int length) {
 247  0
             char[] newData = new char[length];
 248  0
             System.arraycopy(data, data.length - this.length + start, newData, 0, length);
 249  0
             return newData;
 250  
         }
 251  
 
 252  
         public char getNextChar() {
 253  831
             return data[getNextPos()];
 254  
         }
 255  
 
 256  
         protected int getNextPos() {
 257  834
             return data.length - length;
 258  
         }
 259  
 
 260  
         public char removeNext() {
 261  466
             char ch = getNextChar();
 262  466
             length--;
 263  466
             return ch;
 264  
         }
 265  
     }
 266  
 
 267  
     /**
 268  
      * Maps some Germanic characters to plain for internal processing. The following characters are mapped:
 269  
      * <ul>
 270  
      * <li>capital a, umlaut mark</li>
 271  
      * <li>capital u, umlaut mark</li>
 272  
      * <li>capital o, umlaut mark</li>
 273  
      * <li>small sharp s, German</li>
 274  
      * </ul>
 275  
      */
 276  1
     private static final char[][] PREPROCESS_MAP = new char[][]{
 277  
         {'\u00C4', 'A'}, // capital a, umlaut mark
 278  
         {'\u00DC', 'U'}, // capital u, umlaut mark
 279  
         {'\u00D6', 'O'}, // capital o, umlaut mark
 280  
         {'\u00DF', 'S'} // small sharp s, German
 281  
     };
 282  
 
 283  
     /*
 284  
      * Returns whether the array contains the key, or not.
 285  
      */
 286  
     private static boolean arrayContains(char[] arr, char key) {
 287  5029
         for (char element : arr) {
 288  4255
             if (element == key) {
 289  249
                 return true;
 290  
             }
 291  
         }
 292  774
         return false;
 293  
     }
 294  
 
 295  
     /**
 296  
      * <p>
 297  
      * Implements the <i>K&ouml;lner Phonetik</i> algorithm.
 298  
      * </p>
 299  
      * <p>
 300  
      * In contrast to the initial description of the algorithm, this implementation does the encoding in one pass.
 301  
      * </p>
 302  
      *
 303  
      * @param text
 304  
      * @return the corresponding encoding according to the <i>K&ouml;lner Phonetik</i> algorithm
 305  
      */
 306  
     public String colognePhonetic(String text) {
 307  101
         if (text == null) {
 308  1
             return null;
 309  
         }
 310  
 
 311  100
         text = preprocess(text);
 312  
 
 313  100
         CologneOutputBuffer output = new CologneOutputBuffer(text.length() * 2);
 314  100
         CologneInputBuffer input = new CologneInputBuffer(text.toCharArray());
 315  
 
 316  
         char nextChar;
 317  
 
 318  100
         char lastChar = '-';
 319  100
         char lastCode = '/';
 320  
         char code;
 321  
         char chr;
 322  
 
 323  100
         int rightLength = input.length();
 324  
 
 325  566
         while (rightLength > 0) {
 326  466
             chr = input.removeNext();
 327  
 
 328  466
             if ((rightLength = input.length()) > 0) {
 329  365
                 nextChar = input.getNextChar();
 330  
             } else {
 331  101
                 nextChar = '-';
 332  
             }
 333  
 
 334  466
             if (arrayContains(new char[]{'A', 'E', 'I', 'J', 'O', 'U', 'Y'}, chr)) {
 335  187
                 code = '0';
 336  279
             } else if (chr == 'H' || chr < 'A' || chr > 'Z') {
 337  42
                 if (lastCode == '/') {
 338  15
                     continue;
 339  
                 }
 340  27
                 code = '-';
 341  237
             } else if (chr == 'B' || (chr == 'P' && nextChar != 'H')) {
 342  19
                 code = '1';
 343  218
             } else if ((chr == 'D' || chr == 'T') && !arrayContains(new char[]{'S', 'C', 'Z'}, nextChar)) {
 344  19
                 code = '2';
 345  199
             } else if (arrayContains(new char[]{'W', 'F', 'P', 'V'}, chr)) {
 346  14
                 code = '3';
 347  185
             } else if (arrayContains(new char[]{'G', 'K', 'Q'}, chr)) {
 348  16
                 code = '4';
 349  169
             } else if (chr == 'X' && !arrayContains(new char[]{'C', 'K', 'Q'}, lastChar)) {
 350  3
                 code = '4';
 351  3
                 input.addLeft('S');
 352  3
                 rightLength++;
 353  166
             } else if (chr == 'S' || chr == 'Z') {
 354  30
                 code = '8';
 355  136
             } else if (chr == 'C') {
 356  20
                 if (lastCode == '/') {
 357  3
                     if (arrayContains(new char[]{'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}, nextChar)) {
 358  3
                         code = '4';
 359  
                     } else {
 360  0
                         code = '8';
 361  
                     }
 362  
                 } else {
 363  17
                     if (arrayContains(new char[]{'S', 'Z'}, lastChar) ||
 364  
                         !arrayContains(new char[]{'A', 'H', 'O', 'U', 'K', 'Q', 'X'}, nextChar)) {
 365  11
                         code = '8';
 366  
                     } else {
 367  6
                         code = '4';
 368  
                     }
 369  
                 }
 370  116
             } else if (arrayContains(new char[]{'T', 'D', 'X'}, chr)) {
 371  7
                 code = '8';
 372  109
             } else if (chr == 'R') {
 373  34
                 code = '7';
 374  75
             } else if (chr == 'L') {
 375  26
                 code = '5';
 376  49
             } else if (chr == 'M' || chr == 'N') {
 377  49
                 code = '6';
 378  
             } else {
 379  0
                 code = chr;
 380  
             }
 381  
 
 382  451
             if (code != '-' && (lastCode != code && (code != '0' || lastCode == '/') || code < '0' || code > '8')) {
 383  241
                 output.addRight(code);
 384  
             }
 385  
 
 386  451
             lastChar = chr;
 387  451
             lastCode = code;
 388  
         }
 389  100
         return output.toString();
 390  
     }
 391  
 
 392  
     @Override
 393  
     public Object encode(Object object) throws EncoderException {
 394  4
         if (!(object instanceof String)) {
 395  1
             throw new EncoderException("This method's parameter was expected to be of the type " +
 396  
                 String.class.getName() +
 397  
                 ". But actually it was of the type " +
 398  
                 object.getClass().getName() +
 399  
                 ".");
 400  
         }
 401  3
         return encode((String) object);
 402  
     }
 403  
 
 404  
     @Override
 405  
     public String encode(String text) {
 406  85
         return colognePhonetic(text);
 407  
     }
 408  
 
 409  
     public boolean isEncodeEqual(String text1, String text2) {
 410  8
         return colognePhonetic(text1).equals(colognePhonetic(text2));
 411  
     }
 412  
 
 413  
     /**
 414  
      * Converts the string to upper case and replaces germanic characters as defined in {@link #PREPROCESS_MAP}.
 415  
      */
 416  
     private String preprocess(String text) {
 417  100
         text = text.toUpperCase(Locale.GERMAN);
 418  
 
 419  100
         char[] chrs = text.toCharArray();
 420  
 
 421  563
         for (int index = 0; index < chrs.length; index++) {
 422  463
             if (chrs[index] > 'Z') {
 423  19
                 for (char[] element : PREPROCESS_MAP) {
 424  19
                     if (chrs[index] == element[0]) {
 425  10
                         chrs[index] = element[1];
 426  10
                         break;
 427  
                     }
 428  
                 }
 429  
             }
 430  
         }
 431  100
         return new String(chrs);
 432  
     }
 433  
 }