/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  
18  package org.apache.commons.codec.language;
19  
20  import org.apache.commons.codec.EncoderException;
21  import org.apache.commons.codec.StringEncoderAbstractTest;
22  import org.junit.Assert;
23  import org.junit.Test;
24  
25  /**
26   * Tests {@link Nysiis}
27   *
28   * @since 1.7
29   * @version $Id$
30   */
31  public class NysiisTest extends StringEncoderAbstractTest<Nysiis> {
32  
33      private final Nysiis fullNysiis = new Nysiis(false);
34  
35      /**
36       * Takes an array of String pairs where each pair's first element is the input and the second element the expected
37       * encoding.
38       *
39       * @param testValues
40       *            an array of String pairs where each pair's first element is the input and the second element the
41       *            expected encoding.
42       * @throws EncoderException
43       */
44      private void assertEncodings(final String[]... testValues) throws EncoderException {
45          for (final String[] arr : testValues) {
46              Assert.assertEquals("Problem with " + arr[0], arr[1], this.fullNysiis.encode(arr[0]));
47          }
48      }
49  
50      @Override
51      protected Nysiis createStringEncoder() {
52          return new Nysiis();
53      }
54  
55      private void encodeAll(final String[] strings, final String expectedEncoding) {
56          for (final String string : strings) {
57              Assert.assertEquals("Problem with " + string, expectedEncoding, getStringEncoder().encode(string));
58          }
59      }
60  
61      @Test
62      public void testBran() {
63          encodeAll(new String[] { "Brian", "Brown", "Brun" }, "BRAN");
64      }
65  
66      @Test
67      public void testCap() {
68          this.encodeAll(new String[] { "Capp", "Cope", "Copp", "Kipp" }, "CAP");
69      }
70  
71      @Test
72      public void testDad() {
73          // Data Quality and Record Linkage Techniques P.121 claims this is DAN,
74          // but it should be DAD, verified also with dropby.com
75          this.encodeAll(new String[] { "Dent" }, "DAD");
76      }
77  
78      @Test
79      public void testDan() {
80          this.encodeAll(new String[] { "Dane", "Dean", "Dionne" }, "DAN");
81      }
82  
83      /**
84       * Tests data gathered from around the internet.
85       *
86       * @see <a href="http://www.dropby.com/NYSIISTextStrings.html">http://www.dropby.com/NYSIISTextStrings.html</a>
87       * @throws EncoderException
88       */
89      @Test
90      public void testDropBy() throws EncoderException {
91          // Explanation of differences between this implementation and the one at dropby.com is
92          // prepended to the test string. The referenced rules refer to the outlined steps the
93          // class description for Nysiis.
94  
95          this.assertEncodings(
96                  // 1. Transcode first characters of name
97                  new String[] { "MACINTOSH", "MCANT" },
98                  // violates 4j: the second N should not be added, as the first
99                  //              key char is already a N
100                 new String[] { "KNUTH", "NAT" },           // Original: NNAT; modified: NATH
101                 // O and E are transcoded to A because of rule 4a
102                 // H also to A because of rule 4h
103                 // the N gets mysteriously lost, maybe because of a wrongly implemented rule 4h
104                 // that skips the next char in such a case?
105                 // the remaining A is removed because of rule 7
106                 new String[] { "KOEHN", "CAN" },           // Original: C
107                 // violates 4j: see also KNUTH
108                 new String[] { "PHILLIPSON", "FALAPSAN" }, // Original: FFALAP[SAN]
109                 // violates 4j: see also KNUTH
110                 new String[] { "PFEISTER", "FASTAR" },     // Original: FFASTA[R]
111                 // violates 4j: see also KNUTH
112                 new String[] { "SCHOENHOEFT", "SANAFT" },  // Original: SSANAF[T]
113                 // 2. Transcode last characters of name:
114                 new String[] { "MCKEE", "MCY" },
115                 new String[] { "MACKIE", "MCY" },
116                 new String[] { "HEITSCHMIDT", "HATSNAD" },
117                 new String[] { "BART", "BAD" },
118                 new String[] { "HURD", "HAD" },
119                 new String[] { "HUNT", "HAD" },
120                 new String[] { "WESTERLUND", "WASTARLAD" },
121                 // 4. Transcode remaining characters by following these rules,
122                 //    incrementing by one character each time:
123                 new String[] { "CASSTEVENS", "CASTAFAN" },
124                 new String[] { "VASQUEZ", "VASG" },
125                 new String[] { "FRAZIER", "FRASAR" },
126                 new String[] { "BOWMAN", "BANAN" },
127                 new String[] { "MCKNIGHT", "MCNAGT" },
128                 new String[] { "RICKERT", "RACAD" },
129                 // violates 5: the last S is not removed
130                 // when comparing to DEUTS, which is phonetically similar
131                 // the result it also DAT, which is correct for DEUTSCH too imo
132                 new String[] { "DEUTSCH", "DAT" },         // Original: DATS
133                 new String[] { "WESTPHAL", "WASTFAL" },
134                 // violates 4h: the H should be transcoded to S and thus ignored as
135                 // the first key character is also S
136                 new String[] { "SHRIVER", "SRAVAR" },      // Original: SHRAVA[R]
137                 // same as KOEHN, the L gets mysteriously lost
138                 new String[] { "KUHL", "CAL" },            // Original: C
139                 new String[] { "RAWSON", "RASAN" },
140                 // If last character is S, remove it
141                 new String[] { "JILES", "JAL" },
142                 // violates 6: if the last two characters are AY, remove A
143                 new String[] { "CARRAWAY", "CARY" },       // Original: CARAY
144                 new String[] { "YAMADA", "YANAD" });
145     }
146 
147     @Test
148     public void testFal() {
149         this.encodeAll(new String[] { "Phil" }, "FAL");
150     }
151 
152     /**
153      * Tests data gathered from around the internets.
154      *
155      * @throws EncoderException
156      */
157     @Test
158     public void testOthers() throws EncoderException {
159         this.assertEncodings(
160                 new String[] { "O'Daniel", "ODANAL" },
161                 new String[] { "O'Donnel", "ODANAL" },
162                 new String[] { "Cory", "CARY" },
163                 new String[] { "Corey", "CARY" },
164                 new String[] { "Kory", "CARY" },
165                 //
166                 new String[] { "FUZZY", "FASY" });
167     }
168 
169     /**
170      * Tests rule 1: Translate first characters of name: MAC → MCC, KN → N, K → C, PH, PF → FF, SCH → SSS
171      *
172      * @throws EncoderException
173      */
174     @Test
175     public void testRule1() throws EncoderException {
176         this.assertEncodings(
177                 new String[] { "MACX", "MCX" },
178                 new String[] { "KNX", "NX" },
179                 new String[] { "KX", "CX" },
180                 new String[] { "PHX", "FX" },
181                 new String[] { "PFX", "FX" },
182                 new String[] { "SCHX", "SX" });
183     }
184 
185     /**
186      * Tests rule 2: Translate last characters of name: EE → Y, IE → Y, DT, RT, RD, NT, ND → D
187      *
188      * @throws EncoderException
189      */
190     @Test
191     public void testRule2() throws EncoderException {
192         this.assertEncodings(
193                 new String[] { "XEE", "XY" },
194                 new String[] { "XIE", "XY" },
195                 new String[] { "XDT", "XD" },
196                 new String[] { "XRT", "XD" },
197                 new String[] { "XRD", "XD" },
198                 new String[] { "XNT", "XD" },
199                 new String[] { "XND", "XD" });
200     }
201 
202     /**
203      * Tests rule 4.1: EV → AF else A, E, I, O, U → A
204      *
205      * @throws EncoderException
206      */
207     @Test
208     public void testRule4Dot1() throws EncoderException {
209         this.assertEncodings(
210                 new String[] { "XEV", "XAF" },
211                 new String[] { "XAX", "XAX" },
212                 new String[] { "XEX", "XAX" },
213                 new String[] { "XIX", "XAX" },
214                 new String[] { "XOX", "XAX" },
215                 new String[] { "XUX", "XAX" });
216     }
217 
218     /**
219      * Tests rule 4.2: Q → G, Z → S, M → N
220      *
221      * @throws EncoderException
222      */
223     @Test
224     public void testRule4Dot2() throws EncoderException {
225         this.assertEncodings(
226                 new String[] { "XQ", "XG" },
227                 new String[] { "XZ", "X" },
228                 new String[] { "XM", "XN" });
229     }
230 
231     /**
232      * Tests rule 5: If last character is S, remove it.
233      *
234      * @throws EncoderException
235      */
236     @Test
237     public void testRule5() throws EncoderException {
238         this.assertEncodings(
239                 new String[] { "XS", "X" },
240                 new String[] { "XSS", "X" });
241     }
242 
243     /**
244      * Tests rule 6: If last characters are AY, replace with Y.
245      *
246      * @throws EncoderException
247      */
248     @Test
249     public void testRule6() throws EncoderException {
250         this.assertEncodings(
251                 new String[] { "XAY", "XY" },
252                 new String[] { "XAYS", "XY" }); // Rules 5, 6
253     }
254 
255     /**
256      * Tests rule 7: If last character is A, remove it.
257      *
258      * @throws EncoderException
259      */
260     @Test
261     public void testRule7() throws EncoderException {
262         this.assertEncodings(
263                 new String[] { "XA", "X" },
264                 new String[] { "XAS", "X" }); // Rules 5, 7
265     }
266     @Test
267     public void testSnad() {
268         // Data Quality and Record Linkage Techniques P.121 claims this is SNAT,
269         // but it should be SNAD
270         this.encodeAll(new String[] { "Schmidt" }, "SNAD");
271     }
272 
273     @Test
274     public void testSnat() {
275         this.encodeAll(new String[] { "Smith", "Schmit" }, "SNAT");
276     }
277 
278     @Test
279     public void testSpecialBranches() {
280         this.encodeAll(new String[] { "Kobwick" }, "CABWAC");
281         this.encodeAll(new String[] { "Kocher" }, "CACAR");
282         this.encodeAll(new String[] { "Fesca" }, "FASC");
283         this.encodeAll(new String[] { "Shom" }, "SAN");
284         this.encodeAll(new String[] { "Ohlo" }, "OL");
285         this.encodeAll(new String[] { "Uhu" }, "UH");
286         this.encodeAll(new String[] { "Um" }, "UN");
287     }
288 
289     @Test
290     public void testTranan() {
291         this.encodeAll(new String[] { "Trueman", "Truman" }, "TRANAN");
292     }
293 
294     @Test
295     public void testTrueVariant() {
296         final Nysiis encoder = new Nysiis(true);
297 
298         final String encoded = encoder.encode("WESTERLUND");
299         Assert.assertTrue(encoded.length() <= 6);
300         Assert.assertEquals("WASTAR", encoded);
301     }
302 
303 }