Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
ColognePhonetic |
|
| 3.3;3.3 | ||||
ColognePhonetic$CologneBuffer |
|
| 3.3;3.3 | ||||
ColognePhonetic$CologneInputBuffer |
|
| 3.3;3.3 | ||||
ColognePhonetic$CologneOutputBuffer |
|
| 3.3;3.3 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.commons.codec.language; | |
19 | ||
20 | import java.util.Locale; | |
21 | ||
22 | import org.apache.commons.codec.EncoderException; | |
23 | import org.apache.commons.codec.StringEncoder; | |
24 | ||
25 | /** | |
26 | * Encodes a string into a Cologne Phonetic value. | |
27 | * <p> | |
28 | * Implements the <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Kölner Phonetik</a> | |
29 | * (Cologne Phonetic) algorithm issued by Hans Joachim Postel in 1969. | |
30 | * <p> | |
31 | * The <i>Kölner Phonetik</i> is a phonetic algorithm which is optimized for the German language. | |
32 | * It is related to the well-known soundex algorithm. | |
33 | * <p> | |
34 | * | |
35 | * <h2>Algorithm</h2> | |
36 | * | |
37 | * <ul> | |
38 | * | |
39 | * <li> | |
40 | * <h3>Step 1:</h3> | |
41 | * After preprocessing (conversion to upper case, transcription of <a | |
42 | * href="http://en.wikipedia.org/wiki/Germanic_umlaut">germanic umlauts</a>, removal of non alphabetical characters) the | |
43 | * letters of the supplied text are replaced by their phonetic code according to the following table. | |
44 | * <table border="1"> | |
45 | * <tbody> | |
46 | * <tr> | |
47 | * <th>Letter</th> | |
48 | * <th>Context</th> | |
49 | * <th align="center">Code</th> | |
50 | * </tr> | |
51 | * <tr> | |
52 | * <td>A, E, I, J, O, U, Y</td> | |
53 | * <td></td> | |
54 | * <td align="center">0</td> | |
55 | * </tr> | |
56 | * <tr> | |
57 | * | |
58 | * <td>H</td> | |
59 | * <td></td> | |
60 | * <td align="center">-</td> | |
61 | * </tr> | |
62 | * <tr> | |
63 | * <td>B</td> | |
64 | * <td></td> | |
65 | * <td rowspan="2" align="center">1</td> | |
66 | * </tr> | |
67 | * <tr> | |
68 | * <td>P</td> | |
69 | * <td>not before H</td> | |
70 | * | |
71 | * </tr> | |
72 | * <tr> | |
73 | * <td>D, T</td> | |
74 | * <td>not before C, S, Z</td> | |
75 | * <td align="center">2</td> | |
76 | * </tr> | |
77 | * <tr> | |
78 | * <td>F, V, W</td> | |
79 | * <td></td> | |
80 | * <td rowspan="2" align="center">3</td> | |
81 | * </tr> | |
82 | * <tr> | |
83 | * | |
84 | * <td>P</td> | |
85 | * <td>before H</td> | |
86 | * </tr> | |
87 | * <tr> | |
88 | * <td>G, K, Q</td> | |
89 | * <td></td> | |
90 | * <td rowspan="3" align="center">4</td> | |
91 | * </tr> | |
92 | * <tr> | |
93 | * <td rowspan="2">C</td> | |
94 | * <td>at onset before A, H, K, L, O, Q, R, U, X</td> | |
95 | * | |
96 | * </tr> | |
97 | * <tr> | |
98 | * <td>before A, H, K, O, Q, U, X except after S, Z</td> | |
99 | * </tr> | |
100 | * <tr> | |
101 | * <td>X</td> | |
102 | * <td>not after C, K, Q</td> | |
103 | * <td align="center">48</td> | |
104 | * </tr> | |
105 | * <tr> | |
106 | * <td>L</td> | |
107 | * <td></td> | |
108 | * | |
109 | * <td align="center">5</td> | |
110 | * </tr> | |
111 | * <tr> | |
112 | * <td>M, N</td> | |
113 | * <td></td> | |
114 | * <td align="center">6</td> | |
115 | * </tr> | |
116 | * <tr> | |
117 | * <td>R</td> | |
118 | * <td></td> | |
119 | * <td align="center">7</td> | |
120 | * </tr> | |
121 | * | |
122 | * <tr> | |
123 | * <td>S, Z</td> | |
124 | * <td></td> | |
125 | * <td rowspan="6" align="center">8</td> | |
126 | * </tr> | |
127 | * <tr> | |
128 | * <td rowspan="3">C</td> | |
129 | * <td>after S, Z</td> | |
130 | * </tr> | |
131 | * <tr> | |
132 | * <td>at onset except before A, H, K, L, O, Q, R, U, X</td> | |
133 | * </tr> | |
134 | * | |
135 | * <tr> | |
136 | * <td>not before A, H, K, O, Q, U, X</td> | |
137 | * </tr> | |
138 | * <tr> | |
139 | * <td>D, T</td> | |
140 | * <td>before C, S, Z</td> | |
141 | * </tr> | |
142 | * <tr> | |
143 | * <td>X</td> | |
144 | * <td>after C, K, Q</td> | |
145 | * </tr> | |
146 | * </tbody> | |
147 | * </table> | |
148 | * <p> | |
149 | * <small><i>(Source: <a href= "http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik#Buchstabencodes" >Wikipedia (de): | |
150 | * Kölner Phonetik -- Buchstabencodes</a>)</i></small> | |
151 | * </p> | |
152 | * | |
153 | * <h4>Example:</h4> | |
154 | * | |
155 | * {@code "M}ü{@code ller-L}ü{@code denscheidt" => "MULLERLUDENSCHEIDT" => "6005507500206880022"} | |
156 | * | |
157 | * </li> | |
158 | * | |
159 | * <li> | |
160 | * <h3>Step 2:</h3> | |
161 | * Collapse of all multiple consecutive code digits. | |
162 | * <h4>Example:</h4> | |
163 | * {@code "6005507500206880022" => "6050750206802"}</li> | |
164 | * | |
165 | * <li> | |
166 | * <h3>Step 3:</h3> | |
167 | * Removal of all codes "0" except at the beginning. This means that two or more identical consecutive digits can occur | |
168 | * if they occur after removing the "0" digits. | |
169 | * | |
170 | * <h4>Example:</h4> | |
171 | * {@code "6050750206802" => "65752682"}</li> | |
172 | * | |
173 | * </ul> | |
174 | * | |
175 | * This class is thread-safe. | |
176 | * | |
177 | * @see <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Wikipedia (de): Kölner Phonetik (in German)</a> | |
178 | * @since 1.5 | |
179 | */ | |
180 | 13 | public class ColognePhonetic implements StringEncoder { |
181 | ||
182 | // Predefined char arrays for better performance and less GC load | |
183 | 1 | private static final char[] AEIJOUY = new char[] { 'A', 'E', 'I', 'J', 'O', 'U', 'Y' }; |
184 | 1 | private static final char[] SCZ = new char[] { 'S', 'C', 'Z' }; |
185 | 1 | private static final char[] WFPV = new char[] { 'W', 'F', 'P', 'V' }; |
186 | 1 | private static final char[] GKQ = new char[] { 'G', 'K', 'Q' }; |
187 | 1 | private static final char[] CKQ = new char[] { 'C', 'K', 'Q' }; |
188 | 1 | private static final char[] AHKLOQRUX = new char[] { 'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X' }; |
189 | 1 | private static final char[] SZ = new char[] { 'S', 'Z' }; |
190 | 1 | private static final char[] AHOUKQX = new char[] { 'A', 'H', 'O', 'U', 'K', 'Q', 'X' }; |
191 | 1 | private static final char[] TDX = new char[] { 'T', 'D', 'X' }; |
192 | ||
193 | /** | |
194 | * This class is not thread-safe; the field {@link #length} is mutable. | |
195 | * However, it is not shared between threads, as it is constructed on demand | |
196 | * by the method {@link ColognePhonetic#colognePhonetic(String)} | |
197 | */ | |
198 | private abstract class CologneBuffer { | |
199 | ||
200 | protected final char[] data; | |
201 | ||
202 | 200 | protected int length = 0; |
203 | ||
204 | 100 | public CologneBuffer(final char[] data) { |
205 | 100 | this.data = data; |
206 | 100 | this.length = data.length; |
207 | 100 | } |
208 | ||
209 | 100 | public CologneBuffer(final int buffSize) { |
210 | 100 | this.data = new char[buffSize]; |
211 | 100 | this.length = 0; |
212 | 100 | } |
213 | ||
214 | protected abstract char[] copyData(int start, final int length); | |
215 | ||
216 | public int length() { | |
217 | 566 | return length; |
218 | } | |
219 | ||
220 | @Override | |
221 | public String toString() { | |
222 | 100 | return new String(copyData(0, length)); |
223 | } | |
224 | } | |
225 | ||
226 | private class CologneOutputBuffer extends CologneBuffer { | |
227 | ||
228 | 100 | public CologneOutputBuffer(final int buffSize) { |
229 | 100 | super(buffSize); |
230 | 100 | } |
231 | ||
232 | public void addRight(final char chr) { | |
233 | 241 | data[length] = chr; |
234 | 241 | length++; |
235 | 241 | } |
236 | ||
237 | @Override | |
238 | protected char[] copyData(final int start, final int length) { | |
239 | 100 | final char[] newData = new char[length]; |
240 | 100 | System.arraycopy(data, start, newData, 0, length); |
241 | 100 | return newData; |
242 | } | |
243 | } | |
244 | ||
245 | 13 | private class CologneInputBuffer extends CologneBuffer { |
246 | ||
247 | 100 | public CologneInputBuffer(final char[] data) { |
248 | 100 | super(data); |
249 | 100 | } |
250 | ||
251 | public void addLeft(final char ch) { | |
252 | 3 | length++; |
253 | 3 | data[getNextPos()] = ch; |
254 | 3 | } |
255 | ||
256 | @Override | |
257 | protected char[] copyData(final int start, final int length) { | |
258 | 0 | final char[] newData = new char[length]; |
259 | 0 | System.arraycopy(data, data.length - this.length + start, newData, 0, length); |
260 | 0 | return newData; |
261 | } | |
262 | ||
263 | public char getNextChar() { | |
264 | 831 | return data[getNextPos()]; |
265 | } | |
266 | ||
267 | protected int getNextPos() { | |
268 | 834 | return data.length - length; |
269 | } | |
270 | ||
271 | public char removeNext() { | |
272 | 466 | final char ch = getNextChar(); |
273 | 466 | length--; |
274 | 466 | return ch; |
275 | } | |
276 | } | |
277 | ||
278 | /** | |
279 | * Maps some Germanic characters to plain for internal processing. The following characters are mapped: | |
280 | * <ul> | |
281 | * <li>capital a, umlaut mark</li> | |
282 | * <li>capital u, umlaut mark</li> | |
283 | * <li>capital o, umlaut mark</li> | |
284 | * <li>small sharp s, German</li> | |
285 | * </ul> | |
286 | */ | |
287 | 1 | private static final char[][] PREPROCESS_MAP = new char[][]{ |
288 | {'\u00C4', 'A'}, // capital a, umlaut mark | |
289 | {'\u00DC', 'U'}, // capital u, umlaut mark | |
290 | {'\u00D6', 'O'}, // capital o, umlaut mark | |
291 | {'\u00DF', 'S'} // small sharp s, German | |
292 | }; | |
293 | ||
294 | /* | |
295 | * Returns whether the array contains the key, or not. | |
296 | */ | |
297 | private static boolean arrayContains(final char[] arr, final char key) { | |
298 | 5029 | for (final char element : arr) { |
299 | 4255 | if (element == key) { |
300 | 249 | return true; |
301 | } | |
302 | } | |
303 | 774 | return false; |
304 | } | |
305 | ||
306 | /** | |
307 | * <p> | |
308 | * Implements the <i>Kölner Phonetik</i> algorithm. | |
309 | * </p> | |
310 | * <p> | |
311 | * In contrast to the initial description of the algorithm, this implementation does the encoding in one pass. | |
312 | * </p> | |
313 | * | |
314 | * @param text | |
315 | * @return the corresponding encoding according to the <i>Kölner Phonetik</i> algorithm | |
316 | */ | |
317 | public String colognePhonetic(String text) { | |
318 | 101 | if (text == null) { |
319 | 1 | return null; |
320 | } | |
321 | ||
322 | 100 | text = preprocess(text); |
323 | ||
324 | 100 | final CologneOutputBuffer output = new CologneOutputBuffer(text.length() * 2); |
325 | 100 | final CologneInputBuffer input = new CologneInputBuffer(text.toCharArray()); |
326 | ||
327 | char nextChar; | |
328 | ||
329 | 100 | char lastChar = '-'; |
330 | 100 | char lastCode = '/'; |
331 | char code; | |
332 | char chr; | |
333 | ||
334 | 100 | int rightLength = input.length(); |
335 | ||
336 | 566 | while (rightLength > 0) { |
337 | 466 | chr = input.removeNext(); |
338 | ||
339 | 466 | if ((rightLength = input.length()) > 0) { |
340 | 365 | nextChar = input.getNextChar(); |
341 | } else { | |
342 | 101 | nextChar = '-'; |
343 | } | |
344 | ||
345 | 466 | if (arrayContains(AEIJOUY, chr)) { |
346 | 187 | code = '0'; |
347 | 279 | } else if (chr == 'H' || chr < 'A' || chr > 'Z') { |
348 | 42 | if (lastCode == '/') { |
349 | 15 | continue; |
350 | } | |
351 | 27 | code = '-'; |
352 | 237 | } else if (chr == 'B' || (chr == 'P' && nextChar != 'H')) { |
353 | 19 | code = '1'; |
354 | 218 | } else if ((chr == 'D' || chr == 'T') && !arrayContains(SCZ, nextChar)) { |
355 | 19 | code = '2'; |
356 | 199 | } else if (arrayContains(WFPV, chr)) { |
357 | 14 | code = '3'; |
358 | 185 | } else if (arrayContains(GKQ, chr)) { |
359 | 16 | code = '4'; |
360 | 169 | } else if (chr == 'X' && !arrayContains(CKQ, lastChar)) { |
361 | 3 | code = '4'; |
362 | 3 | input.addLeft('S'); |
363 | 3 | rightLength++; |
364 | 166 | } else if (chr == 'S' || chr == 'Z') { |
365 | 30 | code = '8'; |
366 | 136 | } else if (chr == 'C') { |
367 | 20 | if (lastCode == '/') { |
368 | 3 | if (arrayContains(AHKLOQRUX, nextChar)) { |
369 | 3 | code = '4'; |
370 | } else { | |
371 | 0 | code = '8'; |
372 | } | |
373 | } else { | |
374 | 17 | if (arrayContains(SZ, lastChar) || !arrayContains(AHOUKQX, nextChar)) { |
375 | 11 | code = '8'; |
376 | } else { | |
377 | 6 | code = '4'; |
378 | } | |
379 | } | |
380 | 116 | } else if (arrayContains(TDX, chr)) { |
381 | 7 | code = '8'; |
382 | 109 | } else if (chr == 'R') { |
383 | 34 | code = '7'; |
384 | 75 | } else if (chr == 'L') { |
385 | 26 | code = '5'; |
386 | 49 | } else if (chr == 'M' || chr == 'N') { |
387 | 49 | code = '6'; |
388 | } else { | |
389 | 0 | code = chr; |
390 | } | |
391 | ||
392 | 451 | if (code != '-' && (lastCode != code && (code != '0' || lastCode == '/') || code < '0' || code > '8')) { |
393 | 241 | output.addRight(code); |
394 | } | |
395 | ||
396 | 451 | lastChar = chr; |
397 | 451 | lastCode = code; |
398 | } | |
399 | 100 | return output.toString(); |
400 | } | |
401 | ||
402 | @Override | |
403 | public Object encode(final Object object) throws EncoderException { | |
404 | 4 | if (!(object instanceof String)) { |
405 | 1 | throw new EncoderException("This method's parameter was expected to be of the type " + |
406 | String.class.getName() + | |
407 | ". But actually it was of the type " + | |
408 | object.getClass().getName() + | |
409 | "."); | |
410 | } | |
411 | 3 | return encode((String) object); |
412 | } | |
413 | ||
414 | @Override | |
415 | public String encode(final String text) { | |
416 | 85 | return colognePhonetic(text); |
417 | } | |
418 | ||
419 | public boolean isEncodeEqual(final String text1, final String text2) { | |
420 | 8 | return colognePhonetic(text1).equals(colognePhonetic(text2)); |
421 | } | |
422 | ||
423 | /** | |
424 | * Converts the string to upper case and replaces germanic characters as defined in {@link #PREPROCESS_MAP}. | |
425 | */ | |
426 | private String preprocess(String text) { | |
427 | 100 | text = text.toUpperCase(Locale.GERMAN); |
428 | ||
429 | 100 | final char[] chrs = text.toCharArray(); |
430 | ||
431 | 563 | for (int index = 0; index < chrs.length; index++) { |
432 | 463 | if (chrs[index] > 'Z') { |
433 | 19 | for (final char[] element : PREPROCESS_MAP) { |
434 | 19 | if (chrs[index] == element[0]) { |
435 | 10 | chrs[index] = element[1]; |
436 | 10 | break; |
437 | } | |
438 | } | |
439 | } | |
440 | } | |
441 | 100 | return new String(chrs); |
442 | } | |
443 | } |