Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
ColognePhonetic |
|
| 3.3;3.3 | ||||
ColognePhonetic$CologneBuffer |
|
| 3.3;3.3 | ||||
ColognePhonetic$CologneInputBuffer |
|
| 3.3;3.3 | ||||
ColognePhonetic$CologneOutputBuffer |
|
| 3.3;3.3 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.commons.codec.language; | |
19 | ||
20 | import java.util.Locale; | |
21 | ||
22 | import org.apache.commons.codec.EncoderException; | |
23 | import org.apache.commons.codec.StringEncoder; | |
24 | ||
25 | /** | |
26 | * Encodes a string into a Cologne Phonetic value. | |
27 | * <p> | |
28 | * Implements the <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Kölner Phonetik</a> | |
29 | * (Cologne Phonetic) algorithm issued by Hans Joachim Postel in 1969. | |
30 | * <p> | |
31 | * The <i>Kölner Phonetik</i> is a phonetic algorithm which is optimized for the German language. | |
32 | * It is related to the well-known soundex algorithm. | |
33 | * <p> | |
34 | * | |
35 | * <h2>Algorithm</h2> | |
36 | * | |
37 | * <ul> | |
38 | * | |
39 | * <li> | |
40 | * <h3>Step 1:</h3> | |
41 | * After preprocessing (conversion to upper case, transcription of <a | |
42 | * href="http://en.wikipedia.org/wiki/Germanic_umlaut">germanic umlauts</a>, removal of non alphabetical characters) the | |
43 | * letters of the supplied text are replaced by their phonetic code according to the following table. | |
44 | * <table border="1"> | |
45 | * <tbody> | |
46 | * <tr> | |
47 | * <th>Letter</th> | |
48 | * <th>Context</th> | |
49 | * <th align="center">Code</th> | |
50 | * </tr> | |
51 | * <tr> | |
52 | * <td>A, E, I, J, O, U, Y</td> | |
53 | * <td></td> | |
54 | * <td align="center">0</td> | |
55 | * </tr> | |
56 | * <tr> | |
57 | * | |
58 | * <td>H</td> | |
59 | * <td></td> | |
60 | * <td align="center">-</td> | |
61 | * </tr> | |
62 | * <tr> | |
63 | * <td>B</td> | |
64 | * <td></td> | |
65 | * <td rowspan="2" align="center">1</td> | |
66 | * </tr> | |
67 | * <tr> | |
68 | * <td>P</td> | |
69 | * <td>not before H</td> | |
70 | * | |
71 | * </tr> | |
72 | * <tr> | |
73 | * <td>D, T</td> | |
74 | * <td>not before C, S, Z</td> | |
75 | * <td align="center">2</td> | |
76 | * </tr> | |
77 | * <tr> | |
78 | * <td>F, V, W</td> | |
79 | * <td></td> | |
80 | * <td rowspan="2" align="center">3</td> | |
81 | * </tr> | |
82 | * <tr> | |
83 | * | |
84 | * <td>P</td> | |
85 | * <td>before H</td> | |
86 | * </tr> | |
87 | * <tr> | |
88 | * <td>G, K, Q</td> | |
89 | * <td></td> | |
90 | * <td rowspan="3" align="center">4</td> | |
91 | * </tr> | |
92 | * <tr> | |
93 | * <td rowspan="2">C</td> | |
94 | * <td>at onset before A, H, K, L, O, Q, R, U, X</td> | |
95 | * | |
96 | * </tr> | |
97 | * <tr> | |
98 | * <td>before A, H, K, O, Q, U, X except after S, Z</td> | |
99 | * </tr> | |
100 | * <tr> | |
101 | * <td>X</td> | |
102 | * <td>not after C, K, Q</td> | |
103 | * <td align="center">48</td> | |
104 | * </tr> | |
105 | * <tr> | |
106 | * <td>L</td> | |
107 | * <td></td> | |
108 | * | |
109 | * <td align="center">5</td> | |
110 | * </tr> | |
111 | * <tr> | |
112 | * <td>M, N</td> | |
113 | * <td></td> | |
114 | * <td align="center">6</td> | |
115 | * </tr> | |
116 | * <tr> | |
117 | * <td>R</td> | |
118 | * <td></td> | |
119 | * <td align="center">7</td> | |
120 | * </tr> | |
121 | * | |
122 | * <tr> | |
123 | * <td>S, Z</td> | |
124 | * <td></td> | |
125 | * <td rowspan="6" align="center">8</td> | |
126 | * </tr> | |
127 | * <tr> | |
128 | * <td rowspan="3">C</td> | |
129 | * <td>after S, Z</td> | |
130 | * </tr> | |
131 | * <tr> | |
132 | * <td>at onset except before A, H, K, L, O, Q, R, U, X</td> | |
133 | * </tr> | |
134 | * | |
135 | * <tr> | |
136 | * <td>not before A, H, K, O, Q, U, X</td> | |
137 | * </tr> | |
138 | * <tr> | |
139 | * <td>D, T</td> | |
140 | * <td>before C, S, Z</td> | |
141 | * </tr> | |
142 | * <tr> | |
143 | * <td>X</td> | |
144 | * <td>after C, K, Q</td> | |
145 | * </tr> | |
146 | * </tbody> | |
147 | * </table> | |
148 | * <p> | |
149 | * <small><i>(Source: <a href= "http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik#Buchstabencodes" >Wikipedia (de): | |
150 | * Kölner Phonetik -- Buchstabencodes</a>)</i></small> | |
151 | * </p> | |
152 | * | |
153 | * <h4>Example:</h4> | |
154 | * | |
155 | * {@code "M}ü{@code ller-L}ü{@code denscheidt" => "MULLERLUDENSCHEIDT" => "6005507500206880022"} | |
156 | * | |
157 | * </li> | |
158 | * | |
159 | * <li> | |
160 | * <h3>Step 2:</h3> | |
161 | * Collapse of all multiple consecutive code digits. | |
162 | * <h4>Example:</h4> | |
163 | * {@code "6005507500206880022" => "6050750206802"}</li> | |
164 | * | |
165 | * <li> | |
166 | * <h3>Step 3:</h3> | |
167 | * Removal of all codes "0" except at the beginning. This means that two or more identical consecutive digits can occur | |
168 | * if they occur after removing the "0" digits. | |
169 | * | |
170 | * <h4>Example:</h4> | |
171 | * {@code "6050750206802" => "65752682"}</li> | |
172 | * | |
173 | * </ul> | |
174 | * | |
175 | * This class is thread-safe. | |
176 | * | |
177 | * @see <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Wikipedia (de): Kölner Phonetik (in German)</a> | |
178 | * @since 1.5 | |
179 | */ | |
180 | 13 | public class ColognePhonetic implements StringEncoder { |
181 | ||
182 | /** | |
183 | * This class is not thread-safe; the field {@link #length} is mutable. | |
184 | * However, it is not shared between threads, as it is constructed on demand | |
185 | * by the method {@link ColognePhonetic#colognePhonetic(String)} | |
186 | */ | |
187 | private abstract class CologneBuffer { | |
188 | ||
189 | protected final char[] data; | |
190 | ||
191 | 200 | protected int length = 0; |
192 | ||
193 | 100 | public CologneBuffer(char[] data) { |
194 | 100 | this.data = data; |
195 | 100 | this.length = data.length; |
196 | 100 | } |
197 | ||
198 | 100 | public CologneBuffer(int buffSize) { |
199 | 100 | this.data = new char[buffSize]; |
200 | 100 | this.length = 0; |
201 | 100 | } |
202 | ||
203 | protected abstract char[] copyData(int start, final int length); | |
204 | ||
205 | public int length() { | |
206 | 566 | return length; |
207 | } | |
208 | ||
209 | @Override | |
210 | public String toString() { | |
211 | 100 | return new String(copyData(0, length)); |
212 | } | |
213 | } | |
214 | ||
215 | private class CologneOutputBuffer extends CologneBuffer { | |
216 | ||
217 | 100 | public CologneOutputBuffer(int buffSize) { |
218 | 100 | super(buffSize); |
219 | 100 | } |
220 | ||
221 | public void addRight(char chr) { | |
222 | 241 | data[length] = chr; |
223 | 241 | length++; |
224 | 241 | } |
225 | ||
226 | @Override | |
227 | protected char[] copyData(int start, final int length) { | |
228 | 100 | char[] newData = new char[length]; |
229 | 100 | System.arraycopy(data, start, newData, 0, length); |
230 | 100 | return newData; |
231 | } | |
232 | } | |
233 | ||
234 | 13 | private class CologneInputBuffer extends CologneBuffer { |
235 | ||
236 | 100 | public CologneInputBuffer(char[] data) { |
237 | 100 | super(data); |
238 | 100 | } |
239 | ||
240 | public void addLeft(char ch) { | |
241 | 3 | length++; |
242 | 3 | data[getNextPos()] = ch; |
243 | 3 | } |
244 | ||
245 | @Override | |
246 | protected char[] copyData(int start, final int length) { | |
247 | 0 | char[] newData = new char[length]; |
248 | 0 | System.arraycopy(data, data.length - this.length + start, newData, 0, length); |
249 | 0 | return newData; |
250 | } | |
251 | ||
252 | public char getNextChar() { | |
253 | 831 | return data[getNextPos()]; |
254 | } | |
255 | ||
256 | protected int getNextPos() { | |
257 | 834 | return data.length - length; |
258 | } | |
259 | ||
260 | public char removeNext() { | |
261 | 466 | char ch = getNextChar(); |
262 | 466 | length--; |
263 | 466 | return ch; |
264 | } | |
265 | } | |
266 | ||
267 | /** | |
268 | * Maps some Germanic characters to plain for internal processing. The following characters are mapped: | |
269 | * <ul> | |
270 | * <li>capital a, umlaut mark</li> | |
271 | * <li>capital u, umlaut mark</li> | |
272 | * <li>capital o, umlaut mark</li> | |
273 | * <li>small sharp s, German</li> | |
274 | * </ul> | |
275 | */ | |
276 | 1 | private static final char[][] PREPROCESS_MAP = new char[][]{ |
277 | {'\u00C4', 'A'}, // capital a, umlaut mark | |
278 | {'\u00DC', 'U'}, // capital u, umlaut mark | |
279 | {'\u00D6', 'O'}, // capital o, umlaut mark | |
280 | {'\u00DF', 'S'} // small sharp s, German | |
281 | }; | |
282 | ||
283 | /* | |
284 | * Returns whether the array contains the key, or not. | |
285 | */ | |
286 | private static boolean arrayContains(char[] arr, char key) { | |
287 | 5029 | for (char element : arr) { |
288 | 4255 | if (element == key) { |
289 | 249 | return true; |
290 | } | |
291 | } | |
292 | 774 | return false; |
293 | } | |
294 | ||
295 | /** | |
296 | * <p> | |
297 | * Implements the <i>Kölner Phonetik</i> algorithm. | |
298 | * </p> | |
299 | * <p> | |
300 | * In contrast to the initial description of the algorithm, this implementation does the encoding in one pass. | |
301 | * </p> | |
302 | * | |
303 | * @param text | |
304 | * @return the corresponding encoding according to the <i>Kölner Phonetik</i> algorithm | |
305 | */ | |
306 | public String colognePhonetic(String text) { | |
307 | 101 | if (text == null) { |
308 | 1 | return null; |
309 | } | |
310 | ||
311 | 100 | text = preprocess(text); |
312 | ||
313 | 100 | CologneOutputBuffer output = new CologneOutputBuffer(text.length() * 2); |
314 | 100 | CologneInputBuffer input = new CologneInputBuffer(text.toCharArray()); |
315 | ||
316 | char nextChar; | |
317 | ||
318 | 100 | char lastChar = '-'; |
319 | 100 | char lastCode = '/'; |
320 | char code; | |
321 | char chr; | |
322 | ||
323 | 100 | int rightLength = input.length(); |
324 | ||
325 | 566 | while (rightLength > 0) { |
326 | 466 | chr = input.removeNext(); |
327 | ||
328 | 466 | if ((rightLength = input.length()) > 0) { |
329 | 365 | nextChar = input.getNextChar(); |
330 | } else { | |
331 | 101 | nextChar = '-'; |
332 | } | |
333 | ||
334 | 466 | if (arrayContains(new char[]{'A', 'E', 'I', 'J', 'O', 'U', 'Y'}, chr)) { |
335 | 187 | code = '0'; |
336 | 279 | } else if (chr == 'H' || chr < 'A' || chr > 'Z') { |
337 | 42 | if (lastCode == '/') { |
338 | 15 | continue; |
339 | } | |
340 | 27 | code = '-'; |
341 | 237 | } else if (chr == 'B' || (chr == 'P' && nextChar != 'H')) { |
342 | 19 | code = '1'; |
343 | 218 | } else if ((chr == 'D' || chr == 'T') && !arrayContains(new char[]{'S', 'C', 'Z'}, nextChar)) { |
344 | 19 | code = '2'; |
345 | 199 | } else if (arrayContains(new char[]{'W', 'F', 'P', 'V'}, chr)) { |
346 | 14 | code = '3'; |
347 | 185 | } else if (arrayContains(new char[]{'G', 'K', 'Q'}, chr)) { |
348 | 16 | code = '4'; |
349 | 169 | } else if (chr == 'X' && !arrayContains(new char[]{'C', 'K', 'Q'}, lastChar)) { |
350 | 3 | code = '4'; |
351 | 3 | input.addLeft('S'); |
352 | 3 | rightLength++; |
353 | 166 | } else if (chr == 'S' || chr == 'Z') { |
354 | 30 | code = '8'; |
355 | 136 | } else if (chr == 'C') { |
356 | 20 | if (lastCode == '/') { |
357 | 3 | if (arrayContains(new char[]{'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}, nextChar)) { |
358 | 3 | code = '4'; |
359 | } else { | |
360 | 0 | code = '8'; |
361 | } | |
362 | } else { | |
363 | 17 | if (arrayContains(new char[]{'S', 'Z'}, lastChar) || |
364 | !arrayContains(new char[]{'A', 'H', 'O', 'U', 'K', 'Q', 'X'}, nextChar)) { | |
365 | 11 | code = '8'; |
366 | } else { | |
367 | 6 | code = '4'; |
368 | } | |
369 | } | |
370 | 116 | } else if (arrayContains(new char[]{'T', 'D', 'X'}, chr)) { |
371 | 7 | code = '8'; |
372 | 109 | } else if (chr == 'R') { |
373 | 34 | code = '7'; |
374 | 75 | } else if (chr == 'L') { |
375 | 26 | code = '5'; |
376 | 49 | } else if (chr == 'M' || chr == 'N') { |
377 | 49 | code = '6'; |
378 | } else { | |
379 | 0 | code = chr; |
380 | } | |
381 | ||
382 | 451 | if (code != '-' && (lastCode != code && (code != '0' || lastCode == '/') || code < '0' || code > '8')) { |
383 | 241 | output.addRight(code); |
384 | } | |
385 | ||
386 | 451 | lastChar = chr; |
387 | 451 | lastCode = code; |
388 | } | |
389 | 100 | return output.toString(); |
390 | } | |
391 | ||
392 | @Override | |
393 | public Object encode(Object object) throws EncoderException { | |
394 | 4 | if (!(object instanceof String)) { |
395 | 1 | throw new EncoderException("This method's parameter was expected to be of the type " + |
396 | String.class.getName() + | |
397 | ". But actually it was of the type " + | |
398 | object.getClass().getName() + | |
399 | "."); | |
400 | } | |
401 | 3 | return encode((String) object); |
402 | } | |
403 | ||
404 | @Override | |
405 | public String encode(String text) { | |
406 | 85 | return colognePhonetic(text); |
407 | } | |
408 | ||
409 | public boolean isEncodeEqual(String text1, String text2) { | |
410 | 8 | return colognePhonetic(text1).equals(colognePhonetic(text2)); |
411 | } | |
412 | ||
413 | /** | |
414 | * Converts the string to upper case and replaces germanic characters as defined in {@link #PREPROCESS_MAP}. | |
415 | */ | |
416 | private String preprocess(String text) { | |
417 | 100 | text = text.toUpperCase(Locale.GERMAN); |
418 | ||
419 | 100 | char[] chrs = text.toCharArray(); |
420 | ||
421 | 563 | for (int index = 0; index < chrs.length; index++) { |
422 | 463 | if (chrs[index] > 'Z') { |
423 | 19 | for (char[] element : PREPROCESS_MAP) { |
424 | 19 | if (chrs[index] == element[0]) { |
425 | 10 | chrs[index] = element[1]; |
426 | 10 | break; |
427 | } | |
428 | } | |
429 | } | |
430 | } | |
431 | 100 | return new String(chrs); |
432 | } | |
433 | } |