View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.text.similarity;
18  
19  import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException;
20  import static org.junit.jupiter.api.Assertions.assertEquals;
21  
22  import java.util.ArrayList;
23  import java.util.Collection;
24  import java.util.Collections;
25  import java.util.HashMap;
26  import java.util.HashSet;
27  import java.util.List;
28  import java.util.Set;
29  import java.util.function.Function;
30  import java.util.regex.Pattern;
31  
32  import org.junit.jupiter.api.Test;
33  
34  /**
35   * Tests {@link IntersectionSimilarity}.
36   */
37  public class IntersectionSimilarityTest {
38      private static <T> void assertIntersection(final IntersectionSimilarity<T> similarity, final CharSequence cs1, final CharSequence cs2, final int sizeA,
39          final int sizeB, final int intersection) {
40          final IntersectionResult result = similarity.apply(cs1, cs2);
41          assertEquals(sizeA, result.getSizeA(), "Size A error");
42          assertEquals(sizeB, result.getSizeB(), "Size B error");
43          assertEquals(intersection, result.getIntersection(), "Intersection error");
44      }
45  
46      /**
47       * Convert the {@link CharSequence} to a {@link List} of bigrams (pairs of characters). These are represented using 2
48       * 16-bit chars packed into a 32-bit int.
49       *
50       * @param sequence the sequence
51       * @return the list
52       */
53      private static List<Integer> toBigramList(final CharSequence sequence) {
54          final int length = sequence.length();
55          final List<Integer> list = new ArrayList<>(length);
56          if (length > 1) {
57              char ch2 = sequence.charAt(0);
58              for (int i = 1; i < length; i++) {
59                  final char ch1 = ch2;
60                  ch2 = sequence.charAt(i);
61                  list.add(Integer.valueOf(ch1 << 16 | ch2));
62              }
63          }
64          return list;
65      }
66  
67      /**
68       * Convert the {@link CharSequence} to a {@link Set} of bigrams (pairs of characters). These are represented using 2
69       * 16-bit chars packed into a 32-bit int.
70       *
71       * @param sequence the sequence
72       * @return the set
73       */
74      private static Set<Integer> toBigramSet(final CharSequence sequence) {
75          final int length = sequence.length();
76          final Set<Integer> set = new HashSet<>(length);
77          if (length > 1) {
78              char ch2 = sequence.charAt(0);
79              for (int i = 1; i < length; i++) {
80                  final char ch1 = ch2;
81                  ch2 = sequence.charAt(i);
82                  set.add(Integer.valueOf(ch1 << 16 | ch2));
83              }
84          }
85          return set;
86      }
87  
88      /**
89       * Convert the {@link CharSequence} to a {@link List} of {@link Character}s.
90       *
91       * @param sequence the sequence
92       * @return the list
93       */
94      private static List<Character> toCharacterList(final CharSequence sequence) {
95          final int length = sequence.length();
96          final List<Character> list = new ArrayList<>(length);
97          for (int i = 0; i < length; i++) {
98              list.add(sequence.charAt(i));
99          }
100         return list;
101     }
102 
103     /**
104      * Convert the {@link CharSequence} to a {@link Set} of {@link Character}s.
105      *
106      * @param sequence the sequence
107      * @return the set
108      */
109     private static Set<Character> toCharacterSet(final CharSequence sequence) {
110         final int length = sequence.length();
111         final Set<Character> set = new HashSet<>(length);
112         for (int i = 0; i < length; i++) {
113             set.add(sequence.charAt(i));
114         }
115         return set;
116     }
117 
118     private static int toF1ScorePercent(final IntersectionResult result) {
119         final double value = 2.0 * result.getIntersection() / (result.getSizeA() + result.getSizeB());
120         // Convert to percentage
121         return (int) Math.round(value * 100);
122     }
123 
124     @Test
125     public void testApplyNullNull() {
126         assertThatIllegalArgumentException()
127             .isThrownBy(() -> new IntersectionSimilarity<>(cs -> new HashSet<>(Collections.singletonList(cs))).apply(null, null));
128     }
129 
130     @Test
131     public void testApplyNullString() {
132         assertThatIllegalArgumentException()
133             .isThrownBy(() -> new IntersectionSimilarity<>(cs -> new HashSet<>(Collections.singletonList(cs))).apply(null, "right"));
134     }
135 
136     @Test
137     public void testApplyStringNull() {
138         assertThatIllegalArgumentException()
139             .isThrownBy(() -> new IntersectionSimilarity<>(cs -> new HashSet<>(Collections.singletonList(cs))).apply("left", null));
140     }
141 
142     @Test
143     public void testConstructorWithNullConverterThrows() {
144         assertThatIllegalArgumentException().isThrownBy(() -> new IntersectionSimilarity<>(null));
145     }
146 
147     @Test
148     public void testF1ScoreUsingListWordBigrams() {
149         // Example of a word letter pairs algorithm by Simon White:
150         // http://www.catalysoft.com/articles/StrikeAMatch.html
151         // This splits into words using whitespace and then computes uppercase
152         // bigrams for each word.
153 
154         // Split on whitespace
155         final Pattern pattern = Pattern.compile("\\s+");
156 
157         // Compute using pairs of characters (bigrams) for each word.
158         // This can be done using a 32-bit int to store two 16-bit characters
159         final Function<CharSequence, Collection<Integer>> converter = cs -> {
160             final List<Integer> set = new ArrayList<>();
161             for (final String word : pattern.split(cs)) {
162                 if (word.length() > 1) {
163                     // The strings are converted to upper case
164                     char ch2 = Character.toUpperCase(word.charAt(0));
165                     for (int i = 1; i < word.length(); i++) {
166                         final char ch1 = ch2;
167                         ch2 = Character.toUpperCase(word.charAt(i));
168                         set.add(Integer.valueOf(ch1 << 16 | ch2));
169                     }
170                 }
171             }
172             return set;
173         };
174         final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(converter);
175 
176         String bookTitle;
177         final String search1 = "Web Database Applications";
178         final String search2 = "PHP Web Applications";
179         final String search3 = "Web Aplications";
180         bookTitle = "Web Database Applications with PHP & MySQL";
181         assertEquals(82, toF1ScorePercent(similarity.apply(bookTitle, search1)));
182         assertEquals(68, toF1ScorePercent(similarity.apply(bookTitle, search2)));
183         assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search3)));
184         bookTitle = "Creating Database Web Applications with PHP and ASP";
185         assertEquals(71, toF1ScorePercent(similarity.apply(bookTitle, search1)));
186         assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search2)));
187         assertEquals(50, toF1ScorePercent(similarity.apply(bookTitle, search3)));
188         bookTitle = "Building Database Applications on the Web Using PHP3";
189         assertEquals(70, toF1ScorePercent(similarity.apply(bookTitle, search1)));
190         assertEquals(58, toF1ScorePercent(similarity.apply(bookTitle, search2)));
191         assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search3)));
192         bookTitle = "Building Web Database Applications with Visual Studio 6";
193         assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search1)));
194         assertEquals(47, toF1ScorePercent(similarity.apply(bookTitle, search2)));
195         assertEquals(46, toF1ScorePercent(similarity.apply(bookTitle, search3)));
196         bookTitle = "Web Application Development With PHP";
197         assertEquals(51, toF1ScorePercent(similarity.apply(bookTitle, search1)));
198         assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search2)));
199         assertEquals(56, toF1ScorePercent(similarity.apply(bookTitle, search3)));
200         bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection";
201         assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search1)));
202         assertEquals(34, toF1ScorePercent(similarity.apply(bookTitle, search2)));
203         assertEquals(32, toF1ScorePercent(similarity.apply(bookTitle, search3)));
204         bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing";
205         assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search1)));
206         assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search2)));
207         assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search3)));
208         bookTitle = "How to Find a Scholarship Online";
209         assertEquals(10, toF1ScorePercent(similarity.apply(bookTitle, search1)));
210         assertEquals(11, toF1ScorePercent(similarity.apply(bookTitle, search2)));
211         assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search3)));
212     }
213 
214     @Test
215     public void testIntersectionUsingListBigrams() {
216         // Compute using pairs of characters (bigrams).
217         // This can be done using a 32-bit int to store two 16-bit characters.
218         // This test uses a list and so duplicates should be matched.
219         final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramList);
220 
221         // Expected:
222         // size A or B = sequence length - 1
223         // intersection = count of matching bigrams (include duplicates)
224         assertIntersection(similarity, "", "", 0, 0, 0);
225         assertIntersection(similarity, "a", "", 0, 0, 0);
226         assertIntersection(similarity, "a", "a", 0, 0, 0);
227         assertIntersection(similarity, "a", "b", 0, 0, 0);
228         assertIntersection(similarity, "aa", "ab", 1, 1, 0);
229         assertIntersection(similarity, "ab", "ab", 1, 1, 1);
230         assertIntersection(similarity, "aaba", "abaa", 3, 3, 3);
231         assertIntersection(similarity, "aaaa", "aa", 3, 1, 1);
232         assertIntersection(similarity, "aa", "aaaa", 1, 3, 1);
233         assertIntersection(similarity, "aaaa", "aaa", 3, 2, 2);
234         assertIntersection(similarity, "aabab", "ababa", 4, 4, 3);
235         assertIntersection(similarity, "the same", "the same", 7, 7, 7);
236         assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8);
237     }
238 
239     @Test
240     public void testIntersectionUsingListCharacter() {
241         // Compute using single characters.
242         // This test uses a list and so duplicates should be matched.
243         final IntersectionSimilarity<Character> similarity = new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterList);
244 
245         // Expected:
246         // size A or B = sequence length
247         // intersection = count of matching characters (include duplicates)
248         assertIntersection(similarity, "", "", 0, 0, 0);
249         assertIntersection(similarity, "a", "", 1, 0, 0);
250         assertIntersection(similarity, "a", "a", 1, 1, 1);
251         assertIntersection(similarity, "a", "b", 1, 1, 0);
252         assertIntersection(similarity, "aa", "ab", 2, 2, 1);
253         assertIntersection(similarity, "ab", "ab", 2, 2, 2);
254         assertIntersection(similarity, "aaba", "abaa", 4, 4, 4);
255         assertIntersection(similarity, "aaaa", "aa", 4, 2, 2);
256         assertIntersection(similarity, "aa", "aaaa", 2, 4, 2);
257         assertIntersection(similarity, "aaaa", "aaa", 4, 3, 3);
258         assertIntersection(similarity, "aabab", "ababa", 5, 5, 5);
259         assertIntersection(similarity, "the same", "the same", 8, 8, 8);
260         assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11);
261     }
262 
263     @Test
264     public void testIntersectionUsingSetBigrams() {
265         // Compute using pairs of characters (bigrams).
266         // This can be done using a 32-bit int to store two 16-bit characters.
267         // This test uses a set and so should not allow duplicates.
268         final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramSet);
269 
270         // Expected:
271         // size A or B = count of unique bigrams (exclude duplicates)
272         // intersection = count of unique matching bigrams (exclude duplicates)
273         assertIntersection(similarity, "", "", 0, 0, 0);
274         assertIntersection(similarity, "a", "", 0, 0, 0);
275         assertIntersection(similarity, "a", "a", 0, 0, 0);
276         assertIntersection(similarity, "a", "b", 0, 0, 0);
277         assertIntersection(similarity, "aa", "ab", 1, 1, 0);
278         assertIntersection(similarity, "ab", "ab", 1, 1, 1);
279         assertIntersection(similarity, "aaba", "abaa", 3, 3, 3);
280         assertIntersection(similarity, "aaaa", "aa", 1, 1, 1);
281         assertIntersection(similarity, "aa", "aaaa", 1, 1, 1);
282         assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1);
283         assertIntersection(similarity, "aabab", "ababa", 3, 2, 2);
284         assertIntersection(similarity, "the same", "the same", 7, 7, 7);
285         assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8);
286     }
287 
288     @Test
289     public void testIntersectionUsingSetCharacter() {
290         // Compute using single characters.
291         // This test uses a set and so should not allow duplicates.
292         final IntersectionSimilarity<Character> similarity = new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterSet);
293 
294         // Expected:
295         // size A or B = count of unique characters (exclude duplicates)
296         // intersection = count of unique matching characters (exclude duplicates)
297         assertIntersection(similarity, "", "", 0, 0, 0);
298         assertIntersection(similarity, "a", "", 1, 0, 0);
299         assertIntersection(similarity, "a", "a", 1, 1, 1);
300         assertIntersection(similarity, "a", "b", 1, 1, 0);
301         assertIntersection(similarity, "aa", "ab", 1, 2, 1);
302         assertIntersection(similarity, "ab", "ab", 2, 2, 2);
303         assertIntersection(similarity, "aaba", "abaa", 2, 2, 2);
304         assertIntersection(similarity, "aaaa", "aa", 1, 1, 1);
305         assertIntersection(similarity, "aa", "aaaa", 1, 1, 1);
306         assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1);
307         assertIntersection(similarity, "aabab", "ababa", 2, 2, 2);
308         assertIntersection(similarity, "the same", "the same", 7, 7, 7);
309         assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11);
310     }
311 
312     @Test
313     public void testIntersectionUsingSetCharacterListCharacter() {
314         // Compute using a custom converter that returns a Set and a List.
315         // This is an edge-case test.
316         final HashMap<CharSequence, Collection<Character>> converter = new HashMap<>();
317         final String sequence1 = "aabbccdd";
318         final String sequence2 = "aaaaaabbbfffff";
319         converter.put(sequence1, toCharacterSet(sequence1));
320         converter.put(sequence2, toCharacterList(sequence2));
321         final IntersectionSimilarity<Character> similarity = new IntersectionSimilarity<>(converter::get);
322 
323         // Expected:
324         // size A = count of unique characters (exclude duplicates)
325         // size B = sequence length
326         // intersection = count of matching characters (exclude duplicates)
327         assertIntersection(similarity, sequence1, sequence2, 4, sequence2.length(), 2);
328         assertIntersection(similarity, sequence2, sequence1, sequence2.length(), 4, 2);
329     }
330 }