Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
MatchRatingApproachEncoder |
|
| 5.454545454545454;5.455 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | package org.apache.commons.codec.language; | |
18 | ||
19 | import java.util.Locale; | |
20 | ||
21 | import org.apache.commons.codec.EncoderException; | |
22 | import org.apache.commons.codec.StringEncoder; | |
23 | ||
24 | /** | |
25 | * Match Rating Approach Phonetic Algorithm Developed by <CITE>Western Airlines</CITE> in 1977. | |
26 | * | |
27 | * This class is immutable and thread-safe. | |
28 | * | |
29 | * @see <a href="http://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a> | |
30 | * @since 1.8 | |
31 | */ | |
32 | 96 | public class MatchRatingApproachEncoder implements StringEncoder { |
33 | ||
34 | private static final String SPACE = " "; | |
35 | ||
36 | private static final String EMPTY = ""; | |
37 | ||
38 | /** | |
39 | * Constants used mainly for the min rating value. | |
40 | */ | |
41 | private static final int ONE = 1, TWO = 2, THREE = 3, FOUR = 4, FIVE = 5, SIX = 6, SEVEN = 7, EIGHT = 8, | |
42 | ELEVEN = 11, TWELVE = 12; | |
43 | ||
44 | /** | |
45 | * The plain letter equivalent of the accented letters. | |
46 | */ | |
47 | private static final String PLAIN_ASCII = "AaEeIiOoUu" + // grave | |
48 | "AaEeIiOoUuYy" + // acute | |
49 | "AaEeIiOoUuYy" + // circumflex | |
50 | "AaOoNn" + // tilde | |
51 | "AaEeIiOoUuYy" + // umlaut | |
52 | "Aa" + // ring | |
53 | "Cc" + // cedilla | |
54 | "OoUu"; // double acute | |
55 | ||
56 | /** | |
57 | * Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc... | |
58 | */ | |
59 | private static final String UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" + | |
60 | "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" + | |
61 | "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" + | |
62 | "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" + | |
63 | "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" + | |
64 | "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171"; | |
65 | ||
66 | 1 | private static final String[] DOUBLE_CONSONANT = |
67 | new String[] { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS", | |
68 | "TT", "VV", "WW", "XX", "YY", "ZZ" }; | |
69 | ||
70 | /** | |
71 | * Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any | |
72 | * spaces. | |
73 | * | |
74 | * <h2>API Usage</h2> | |
75 | * <p> | |
76 | * Consider this method private, it is package protected for unit testing only. | |
77 | * </p> | |
78 | * | |
79 | * @param name | |
80 | * The name to be cleaned | |
81 | * @return The cleaned name | |
82 | */ | |
83 | String cleanName(final String name) { | |
84 | 78 | String upperName = name.toUpperCase(Locale.ENGLISH); |
85 | ||
86 | 78 | final String[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" }; |
87 | 468 | for (final String str : charsToTrim) { |
88 | 390 | upperName = upperName.replaceAll(str, EMPTY); |
89 | } | |
90 | ||
91 | 78 | upperName = removeAccents(upperName); |
92 | 78 | upperName = upperName.replaceAll("\\s+", EMPTY); |
93 | ||
94 | 78 | return upperName; |
95 | } | |
96 | ||
97 | /** | |
98 | * Encodes an Object using the Match Rating Approach algo. Method is here to satisfy the requirements of the | |
99 | * Encoder interface Throws an EncoderException if input object is not of type java.lang.String. | |
100 | * | |
101 | * @param pObject | |
102 | * Object to encode | |
103 | * @return An object (or type java.lang.String) containing the Match Rating Approach code which corresponds to the | |
104 | * String supplied. | |
105 | * @throws EncoderException | |
106 | * if the parameter supplied is not of type java.lang.String | |
107 | */ | |
108 | @Override | |
109 | public final Object encode(final Object pObject) throws EncoderException { | |
110 | 4 | if (!(pObject instanceof String)) { |
111 | 1 | throw new EncoderException( |
112 | "Parameter supplied to Match Rating Approach encoder is not of type java.lang.String"); | |
113 | } | |
114 | 3 | return encode((String) pObject); |
115 | } | |
116 | ||
117 | /** | |
118 | * Encodes a String using the Match Rating Approach (MRA) algorithm. | |
119 | * | |
120 | * @param name | |
121 | * String object to encode | |
122 | * @return The MRA code corresponding to the String supplied | |
123 | */ | |
124 | @Override | |
125 | public final String encode(String name) { | |
126 | // Bulletproof for trivial input - NINO | |
127 | 17 | if (name == null || EMPTY.equalsIgnoreCase(name) || SPACE.equalsIgnoreCase(name) || name.length() == 1) { |
128 | 14 | return EMPTY; |
129 | } | |
130 | ||
131 | // Preprocessing | |
132 | 3 | name = cleanName(name); |
133 | ||
134 | // BEGIN: Actual encoding part of the algorithm... | |
135 | // 1. Delete all vowels unless the vowel begins the word | |
136 | 3 | name = removeVowels(name); |
137 | ||
138 | // 2. Remove second consonant from any double consonant | |
139 | 3 | name = removeDoubleConsonants(name); |
140 | ||
141 | // 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters | |
142 | 3 | name = getFirst3Last3(name); |
143 | ||
144 | 3 | return name; |
145 | } | |
146 | ||
147 | /** | |
148 | * Gets the first & last 3 letters of a name (if > 6 characters) Else just returns the name. | |
149 | * | |
150 | * <h2>API Usage</h2> | |
151 | * <p> | |
152 | * Consider this method private, it is package protected for unit testing only. | |
153 | * </p> | |
154 | * | |
155 | * @param name | |
156 | * The string to get the substrings from | |
157 | * @return Annexed first & last 3 letters of input word. | |
158 | */ | |
159 | String getFirst3Last3(final String name) { | |
160 | 79 | final int nameLength = name.length(); |
161 | ||
162 | 79 | if (nameLength > SIX) { |
163 | 11 | final String firstThree = name.substring(0, THREE); |
164 | 11 | final String lastThree = name.substring(nameLength - THREE, nameLength); |
165 | 11 | return firstThree + lastThree; |
166 | } else { | |
167 | 68 | return name; |
168 | } | |
169 | } | |
170 | ||
171 | /** | |
172 | * Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the | |
173 | * min rating. Values strictly from documentation. | |
174 | * | |
175 | * <h2>API Usage</h2> | |
176 | * <p> | |
177 | * Consider this method private, it is package protected for unit testing only. | |
178 | * </p> | |
179 | * | |
180 | * @param sumLength | |
181 | * The length of 2 strings sent down | |
182 | * @return The min rating value | |
183 | */ | |
184 | int getMinRating(final int sumLength) { | |
185 | 47 | int minRating = 0; |
186 | ||
187 | 47 | if (sumLength <= FOUR) { |
188 | 4 | minRating = FIVE; |
189 | 43 | } else if (sumLength >= FIVE && sumLength <= SEVEN) { |
190 | 18 | minRating = FOUR; |
191 | 25 | } else if (sumLength >= EIGHT && sumLength <= ELEVEN) { |
192 | 17 | minRating = THREE; |
193 | 8 | } else if (sumLength == TWELVE) { |
194 | 7 | minRating = TWO; |
195 | } else { | |
196 | 1 | minRating = ONE; // docs said little here. |
197 | } | |
198 | ||
199 | 47 | return minRating; |
200 | } | |
201 | ||
202 | /** | |
203 | * Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the | |
204 | * strings are cleaned in the same way as {@link #encode(String)}. | |
205 | * | |
206 | * @param name1 | |
207 | * First of the 2 strings (names) to compare | |
208 | * @param name2 | |
209 | * Second of the 2 names to compare | |
210 | * @return <code>true</code> if the encodings are identical <code>false</code> otherwise. | |
211 | */ | |
212 | public boolean isEncodeEquals(String name1, String name2) { | |
213 | // Bulletproof for trivial input - NINO | |
214 | 51 | if (name1 == null || EMPTY.equalsIgnoreCase(name1) || SPACE.equalsIgnoreCase(name1)) { |
215 | 5 | return false; |
216 | 46 | } else if (name2 == null || EMPTY.equalsIgnoreCase(name2) || SPACE.equalsIgnoreCase(name2)) { |
217 | 5 | return false; |
218 | 41 | } else if (name1.length() == 1 || name2.length() == 1) { |
219 | 3 | return false; |
220 | 38 | } else if (name1.equalsIgnoreCase(name2)) { |
221 | 1 | return true; |
222 | } | |
223 | ||
224 | // Preprocessing | |
225 | 37 | name1 = cleanName(name1); |
226 | 37 | name2 = cleanName(name2); |
227 | ||
228 | // Actual MRA Algorithm | |
229 | ||
230 | // 1. Remove vowels | |
231 | 37 | name1 = removeVowels(name1); |
232 | 37 | name2 = removeVowels(name2); |
233 | ||
234 | // 2. Remove double consonants | |
235 | 37 | name1 = removeDoubleConsonants(name1); |
236 | 37 | name2 = removeDoubleConsonants(name2); |
237 | ||
238 | // 3. Reduce down to 3 letters | |
239 | 37 | name1 = getFirst3Last3(name1); |
240 | 37 | name2 = getFirst3Last3(name2); |
241 | ||
242 | // 4. Check for length difference - if 3 or greater then no similarity | |
243 | // comparison is done | |
244 | 37 | if (Math.abs(name1.length() - name2.length()) >= THREE) { |
245 | 1 | return false; |
246 | } | |
247 | ||
248 | // 5. Obtain the minimum rating value by calculating the length sum of the | |
249 | // encoded Strings and sending it down. | |
250 | 36 | final int sumLength = Math.abs(name1.length() + name2.length()); |
251 | 36 | int minRating = 0; |
252 | 36 | minRating = getMinRating(sumLength); |
253 | ||
254 | // 6. Process the encoded Strings from left to right and remove any | |
255 | // identical characters found from both Strings respectively. | |
256 | 36 | final int count = leftToRightThenRightToLeftProcessing(name1, name2); |
257 | ||
258 | // 7. Each PNI item that has a similarity rating equal to or greater than | |
259 | // the min is considered to be a good candidate match | |
260 | 36 | return count >= minRating; |
261 | ||
262 | } | |
263 | ||
264 | /** | |
265 | * Determines if a letter is a vowel. | |
266 | * | |
267 | * <h2>API Usage</h2> | |
268 | * <p> | |
269 | * Consider this method private, it is package protected for unit testing only. | |
270 | * </p> | |
271 | * | |
272 | * @param letter | |
273 | * The letter under investiagtion | |
274 | * @return True if a vowel, else false | |
275 | */ | |
276 | boolean isVowel(final String letter) { | |
277 | 83 | return letter.equalsIgnoreCase("E") || letter.equalsIgnoreCase("A") || letter.equalsIgnoreCase("O") || |
278 | letter.equalsIgnoreCase("I") || letter.equalsIgnoreCase("U"); | |
279 | } | |
280 | ||
281 | /** | |
282 | * Processes the names from left to right (first) then right to left removing identical letters in same positions. | |
283 | * Then subtracts the longer string that remains from 6 and returns this. | |
284 | * | |
285 | * <h2>API Usage</h2> | |
286 | * <p> | |
287 | * Consider this method private, it is package protected for unit testing only. | |
288 | * </p> | |
289 | * | |
290 | * @param name1 | |
291 | * name2 | |
292 | * @return | |
293 | */ | |
294 | int leftToRightThenRightToLeftProcessing(final String name1, final String name2) { | |
295 | 38 | final char[] name1Char = name1.toCharArray(); |
296 | 38 | final char[] name2Char = name2.toCharArray(); |
297 | ||
298 | 38 | final int name1Size = name1.length() - 1; |
299 | 38 | final int name2Size = name2.length() - 1; |
300 | ||
301 | 38 | String name1LtRStart = EMPTY; |
302 | 38 | String name1LtREnd = EMPTY; |
303 | ||
304 | 38 | String name2RtLStart = EMPTY; |
305 | 38 | String name2RtLEnd = EMPTY; |
306 | ||
307 | 195 | for (int i = 0; i < name1Char.length; i++) { |
308 | 165 | if (i > name2Size) { |
309 | 8 | break; |
310 | } | |
311 | ||
312 | 157 | name1LtRStart = name1.substring(i, i + 1); |
313 | 157 | name1LtREnd = name1.substring(name1Size - i, name1Size - i + 1); |
314 | ||
315 | 157 | name2RtLStart = name2.substring(i, i + 1); |
316 | 157 | name2RtLEnd = name2.substring(name2Size - i, name2Size - i + 1); |
317 | ||
318 | // Left to right... | |
319 | 157 | if (name1LtRStart.equals(name2RtLStart)) { |
320 | 88 | name1Char[i] = ' '; |
321 | 88 | name2Char[i] = ' '; |
322 | } | |
323 | ||
324 | // Right to left... | |
325 | 157 | if (name1LtREnd.equals(name2RtLEnd)) { |
326 | 59 | name1Char[name1Size - i] = ' '; |
327 | 59 | name2Char[name2Size - i] = ' '; |
328 | } | |
329 | } | |
330 | ||
331 | // Char arrays -> string & remove extraneous space | |
332 | 38 | final String strA = new String(name1Char).replaceAll("\\s+", EMPTY); |
333 | 38 | final String strB = new String(name2Char).replaceAll("\\s+", EMPTY); |
334 | ||
335 | // Final bit - subtract longest string from 6 and return this int value | |
336 | 38 | if (strA.length() > strB.length()) { |
337 | 8 | return Math.abs(SIX - strA.length()); |
338 | } else { | |
339 | 30 | return Math.abs(SIX - strB.length()); |
340 | } | |
341 | } | |
342 | ||
343 | /** | |
344 | * Removes accented letters and replaces with non-accented ascii equivalent Case is preserved. | |
345 | * http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29 | |
346 | * | |
347 | * @param accentedWord | |
348 | * The word that may have accents in it. | |
349 | * @return De-accented word | |
350 | */ | |
351 | String removeAccents(final String accentedWord) { | |
352 | 87 | if (accentedWord == null) { |
353 | 1 | return null; |
354 | } | |
355 | ||
356 | 86 | final StringBuilder sb = new StringBuilder(); |
357 | 86 | final int n = accentedWord.length(); |
358 | ||
359 | 794 | for (int i = 0; i < n; i++) { |
360 | 708 | final char c = accentedWord.charAt(i); |
361 | 708 | final int pos = UNICODE.indexOf(c); |
362 | 708 | if (pos > -1) { |
363 | 60 | sb.append(PLAIN_ASCII.charAt(pos)); |
364 | } else { | |
365 | 648 | sb.append(c); |
366 | } | |
367 | } | |
368 | ||
369 | 86 | return sb.toString(); |
370 | } | |
371 | ||
372 | /** | |
373 | * Replaces any double consonant pair with the single letter equivalent. | |
374 | * | |
375 | * <h2>API Usage</h2> | |
376 | * <p> | |
377 | * Consider this method private, it is package protected for unit testing only. | |
378 | * </p> | |
379 | * | |
380 | * @param name | |
381 | * String to have double consonants removed | |
382 | * @return Single consonant word | |
383 | */ | |
384 | String removeDoubleConsonants(final String name) { | |
385 | 80 | String replacedName = name.toUpperCase(); |
386 | 1760 | for (final String dc : DOUBLE_CONSONANT) { |
387 | 1680 | if (replacedName.contains(dc)) { |
388 | 12 | final String singleLetter = dc.substring(0, 1); |
389 | 12 | replacedName = replacedName.replace(dc, singleLetter); |
390 | } | |
391 | } | |
392 | 80 | return replacedName; |
393 | } | |
394 | ||
395 | /** | |
396 | * Deletes all vowels unless the vowel begins the word. | |
397 | * | |
398 | * <h2>API Usage</h2> | |
399 | * <p> | |
400 | * Consider this method private, it is package protected for unit testing only. | |
401 | * </p> | |
402 | * | |
403 | * @param name | |
404 | * The name to have vowels removed | |
405 | * @return De-voweled word | |
406 | */ | |
407 | String removeVowels(String name) { | |
408 | // Extract first letter | |
409 | 80 | final String firstLetter = name.substring(0, 1); |
410 | ||
411 | 80 | name = name.replaceAll("A", EMPTY); |
412 | 80 | name = name.replaceAll("E", EMPTY); |
413 | 80 | name = name.replaceAll("I", EMPTY); |
414 | 80 | name = name.replaceAll("O", EMPTY); |
415 | 80 | name = name.replaceAll("U", EMPTY); |
416 | ||
417 | 80 | name = name.replaceAll("\\s{2,}\\b", SPACE); |
418 | ||
419 | // return isVowel(firstLetter) ? (firstLetter + name) : name; | |
420 | 80 | if (isVowel(firstLetter)) { |
421 | 16 | return firstLetter + name; |
422 | } else { | |
423 | 64 | return name; |
424 | } | |
425 | } | |
426 | } |