View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.commons.codec.language.bm;
19  
20  import static org.junit.Assert.*;
21  
22  import java.util.Arrays;
23  import java.util.HashSet;
24  import java.util.Map;
25  import java.util.TreeMap;
26  
27  import org.junit.Test;
28  
29  /**
30   * Tests PhoneticEngine and Languages.LanguageSet in ways very similar to code found in solr-3.6.0.
31   *
32   * @since 1.7
33   */
34  public class PhoneticEngineRegressionTest {
35  
36      @Test
37      public void testSolrGENERIC() {
38          Map<String, String> args;
39  
40          // concat is true, ruleType is EXACT
41          args = new TreeMap<String, String>();
42          args.put("nameType", "GENERIC");
43          assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
44          args.put("ruleType", "EXACT");
45          assertEquals(encode(args, true, "Angelo"), "anZelo|andZelo|angelo|anhelo|anjelo|anxelo");
46          assertEquals(encode(args, true, "D'Angelo"), "(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)");
47          args.put("languageSet", "italian,greek,spanish");
48          assertEquals(encode(args, true, "Angelo"), "andZelo|angelo|anxelo");
49          assertEquals(encode(args, true, "1234"), "");
50  
51          // concat is false, ruleType is EXACT
52          args = new TreeMap<String, String>();
53          assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
54          args.put("ruleType", "EXACT");
55          assertEquals(encode(args, false, "Angelo"), "anZelo|andZelo|angelo|anhelo|anjelo|anxelo");
56          assertEquals(encode(args, false, "D'Angelo"), "(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)");
57          args.put("languageSet", "italian,greek,spanish");
58          assertEquals(encode(args, false, "Angelo"), "andZelo|angelo|anxelo");
59          assertEquals(encode(args, false, "1234"), "");
60  
61          // concat is true, ruleType is APPROX
62          args = new TreeMap<String, String>();
63          assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
64          args.put("ruleType", "APPROX");
65          assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
66          assertEquals(encode(args, true, "D'Angelo"), "(YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo)-(dYngYlo|dYngilo|dagilo|dangYlo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongYlo|dongilo|doniilo|donilo|donxilo|donzilo)");
67          args.put("languageSet", "italian,greek,spanish");
68          assertEquals(encode(args, true, "Angelo"), "angilo|anxilo|anzilo|ongilo|onxilo|onzilo");
69          assertEquals(encode(args, true, "1234"), "");
70  
71          // concat is false, ruleType is APPROX
72          args = new TreeMap<String, String>();
73          assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
74          args.put("ruleType", "APPROX");
75          assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo");
76          assertEquals(encode(args, false, "D'Angelo"), "(YngYlo|Yngilo|agilo|angYlo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongYlo|ongilo|oniilo|onilo|onxilo|onzilo)-(dYngYlo|dYngilo|dagilo|dangYlo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongYlo|dongilo|doniilo|donilo|donxilo|donzilo)");
77          args.put("languageSet", "italian,greek,spanish");
78          assertEquals(encode(args, false, "Angelo"), "angilo|anxilo|anzilo|ongilo|onxilo|onzilo");
79          assertEquals(encode(args, false, "1234"), "");
80      }
81  
82      @Test
83      public void testSolrASHKENAZI() {
84          Map<String, String> args;
85  
86          // concat is true, ruleType is EXACT
87          args = new TreeMap<String, String>();
88          args.put("nameType", "ASHKENAZI");
89          assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
90          args.put("ruleType", "EXACT");
91          assertEquals(encode(args, true, "Angelo"), "andZelo|angelo|anhelo|anxelo");
92          assertEquals(encode(args, true, "D'Angelo"), "dandZelo|dangelo|danhelo|danxelo");
93          args.put("languageSet", "italian,greek,spanish");
94          assertEquals(encode(args, true, "Angelo"), "angelo|anxelo");
95          assertEquals(encode(args, true, "1234"), "");
96  
97          // concat is false, ruleType is EXACT
98          args = new TreeMap<String, String>();
99          args.put("nameType", "ASHKENAZI");
100         assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
101         args.put("ruleType", "EXACT");
102         assertEquals(encode(args, false, "Angelo"), "andZelo|angelo|anhelo|anxelo");
103         assertEquals(encode(args, false, "D'Angelo"), "dandZelo|dangelo|danhelo|danxelo");
104         args.put("languageSet", "italian,greek,spanish");
105         assertEquals(encode(args, false, "Angelo"), "angelo|anxelo");
106         assertEquals(encode(args, false, "1234"), "");
107 
108         // concat is true, ruleType is APPROX
109         args = new TreeMap<String, String>();
110         args.put("nameType", "ASHKENAZI");
111         assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
112         args.put("ruleType", "APPROX");
113         assertEquals(encode(args, true, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
114         assertEquals(encode(args, true, "D'Angelo"), "dYngYlo|dYngilo|dangYlo|dangilo|danilo|danxilo|danzilo|dongYlo|dongilo|donilo|donxilo|donzilo");
115         args.put("languageSet", "italian,greek,spanish");
116         assertEquals(encode(args, true, "Angelo"), "angilo|anxilo|ongilo|onxilo");
117         assertEquals(encode(args, true, "1234"), "");
118 
119         // concat is false, ruleType is APPROX
120         args = new TreeMap<String, String>();
121         args.put("nameType", "ASHKENAZI");
122         assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
123         args.put("ruleType", "APPROX");
124         assertEquals(encode(args, false, "Angelo"), "YngYlo|Yngilo|angYlo|angilo|anilo|anxilo|anzilo|ongYlo|ongilo|onilo|onxilo|onzilo");
125         assertEquals(encode(args, false, "D'Angelo"), "dYngYlo|dYngilo|dangYlo|dangilo|danilo|danxilo|danzilo|dongYlo|dongilo|donilo|donxilo|donzilo");
126         args.put("languageSet", "italian,greek,spanish");
127         assertEquals(encode(args, false, "Angelo"), "angilo|anxilo|ongilo|onxilo");
128         assertEquals(encode(args, false, "1234"), "");
129     }
130 
131     @Test
132     public void testSolrSEPHARDIC() {
133         Map<String, String> args;
134 
135         // concat is true, ruleType is EXACT
136         args = new TreeMap<String, String>();
137         args.put("nameType", "SEPHARDIC");
138         assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
139         args.put("ruleType", "EXACT");
140         assertEquals(encode(args, true, "Angelo"), "anZelo|andZelo|anxelo");
141         assertEquals(encode(args, true, "D'Angelo"), "anZelo|andZelo|anxelo");
142         args.put("languageSet", "italian,greek,spanish");
143         assertEquals(encode(args, true, "Angelo"), "andZelo|anxelo");
144         assertEquals(encode(args, true, "1234"), "");
145 
146         // concat is false, ruleType is EXACT
147         args = new TreeMap<String, String>();
148         args.put("nameType", "SEPHARDIC");
149         assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
150         args.put("ruleType", "EXACT");
151         assertEquals(encode(args, false, "Angelo"), "anZelo|andZelo|anxelo");
152         assertEquals(encode(args, false, "D'Angelo"), "danZelo|dandZelo|danxelo");
153         args.put("languageSet", "italian,greek,spanish");
154         assertEquals(encode(args, false, "Angelo"), "andZelo|anxelo");
155         assertEquals(encode(args, false, "1234"), "");
156 
157         // concat is true, ruleType is APPROX
158         args = new TreeMap<String, String>();
159         args.put("nameType", "SEPHARDIC");
160         assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
161         args.put("ruleType", "APPROX");
162         assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
163         assertEquals(encode(args, true, "D'Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
164         args.put("languageSet", "italian,greek,spanish");
165         assertEquals(encode(args, true, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
166         assertEquals(encode(args, true, "1234"), "");
167 
168         // concat is false, ruleType is APPROX
169         args = new TreeMap<String, String>();
170         args.put("nameType", "SEPHARDIC");
171         assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
172         args.put("ruleType", "APPROX");
173         assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
174         assertEquals(encode(args, false, "D'Angelo"), "danhila|danhilu|danzila|danzilu|nhila|nhilu|nzila|nzilu");
175         args.put("languageSet", "italian,greek,spanish");
176         assertEquals(encode(args, false, "Angelo"), "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu");
177         assertEquals(encode(args, false, "1234"), "");
178     }
179 
180     @Test
181     public void testCompatibilityWithOriginalVersion() {
182         // see CODEC-187
183         // comparison: http://stevemorse.org/census/soundex.html
184 
185         Map<String, String> args = new TreeMap<String, String>();
186         args.put("nameType", "GENERIC");
187         args.put("ruleType", "APPROX");
188 
189         assertEquals(encode(args, true, "abram"), "Ybram|Ybrom|abram|abran|abrom|abron|avram|avrom|obram|obran|obrom|obron|ovram|ovrom");
190         assertEquals(encode(args, true, "Bendzin"), "bndzn|bntsn|bnzn|vndzn|vntsn");
191 
192         args.put("nameType", "ASHKENAZI");
193         args.put("ruleType", "APPROX");
194 
195         assertEquals(encode(args, true, "abram"), "Ybram|Ybrom|abram|abrom|avram|avrom|imbram|imbrom|obram|obrom|ombram|ombrom|ovram|ovrom");
196         assertEquals(encode(args, true, "Halpern"), "YlpYrn|Ylpirn|alpYrn|alpirn|olpYrn|olpirn|xalpirn|xolpirn");
197 
198     }
199 
200     /**
201      * This code is similar in style to code found in Solr:
202      * solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java
203      *
204      * Making a JUnit test out of it to protect Solr from possible future
205      * regressions in Commons-Codec.
206      */
207     private static String encode(final Map<String, String> args, final boolean concat, final String input) {
208         Languages.LanguageSet languageSet;
209         PhoneticEngine engine;
210 
211         // PhoneticEngine = NameType + RuleType + concat
212         // we use common-codec's defaults: GENERIC + APPROX + true
213         final String nameTypeArg = args.get("nameType");
214         final NameType nameType = (nameTypeArg == null) ? NameType.GENERIC : NameType.valueOf(nameTypeArg);
215 
216         final String ruleTypeArg = args.get("ruleType");
217         final RuleType ruleType = (ruleTypeArg == null) ? RuleType.APPROX : RuleType.valueOf(ruleTypeArg);
218 
219         engine = new PhoneticEngine(nameType, ruleType, concat);
220 
221         // LanguageSet: defaults to automagic, otherwise a comma-separated list.
222         final String languageSetArg = args.get("languageSet");
223         if (languageSetArg == null || languageSetArg.equals("auto")) {
224             languageSet = null;
225         } else {
226             languageSet = Languages.LanguageSet.from(new HashSet<String>(Arrays.asList(languageSetArg.split(","))));
227         }
228 
229         /*
230             org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (lines 96-98) does this:
231 
232             encoded = (languages == null)
233                 ? engine.encode(termAtt.toString())
234                 : engine.encode(termAtt.toString(), languages);
235 
236             Hence our approach, below:
237         */
238         if (languageSet == null) {
239             return engine.encode(input);
240         } else {
241             return engine.encode(input, languageSet);
242         }
243     }
244 }