1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.text.similarity;
18
19 import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException;
20 import static org.junit.jupiter.api.Assertions.assertEquals;
21
22 import java.util.ArrayList;
23 import java.util.Collection;
24 import java.util.Collections;
25 import java.util.HashMap;
26 import java.util.HashSet;
27 import java.util.List;
28 import java.util.Set;
29 import java.util.function.Function;
30 import java.util.regex.Pattern;
31
32 import org.junit.jupiter.api.Test;
33
34
35
36
37 public class IntersectionSimilarityTest {
38 private static <T> void assertIntersection(final IntersectionSimilarity<T> similarity, final CharSequence cs1, final CharSequence cs2, final int sizeA,
39 final int sizeB, final int intersection) {
40 final IntersectionResult result = similarity.apply(cs1, cs2);
41 assertEquals(sizeA, result.getSizeA(), "Size A error");
42 assertEquals(sizeB, result.getSizeB(), "Size B error");
43 assertEquals(intersection, result.getIntersection(), "Intersection error");
44 }
45
46
47
48
49
50
51
52
53 private static List<Integer> toBigramList(final CharSequence sequence) {
54 final int length = sequence.length();
55 final List<Integer> list = new ArrayList<>(length);
56 if (length > 1) {
57 char ch2 = sequence.charAt(0);
58 for (int i = 1; i < length; i++) {
59 final char ch1 = ch2;
60 ch2 = sequence.charAt(i);
61 list.add(Integer.valueOf(ch1 << 16 | ch2));
62 }
63 }
64 return list;
65 }
66
67
68
69
70
71
72
73
74 private static Set<Integer> toBigramSet(final CharSequence sequence) {
75 final int length = sequence.length();
76 final Set<Integer> set = new HashSet<>(length);
77 if (length > 1) {
78 char ch2 = sequence.charAt(0);
79 for (int i = 1; i < length; i++) {
80 final char ch1 = ch2;
81 ch2 = sequence.charAt(i);
82 set.add(Integer.valueOf(ch1 << 16 | ch2));
83 }
84 }
85 return set;
86 }
87
88
89
90
91
92
93
94 private static List<Character> toCharacterList(final CharSequence sequence) {
95 final int length = sequence.length();
96 final List<Character> list = new ArrayList<>(length);
97 for (int i = 0; i < length; i++) {
98 list.add(sequence.charAt(i));
99 }
100 return list;
101 }
102
103
104
105
106
107
108
109 private static Set<Character> toCharacterSet(final CharSequence sequence) {
110 final int length = sequence.length();
111 final Set<Character> set = new HashSet<>(length);
112 for (int i = 0; i < length; i++) {
113 set.add(sequence.charAt(i));
114 }
115 return set;
116 }
117
118 private static int toF1ScorePercent(final IntersectionResult result) {
119 final double value = 2.0 * result.getIntersection() / (result.getSizeA() + result.getSizeB());
120
121 return (int) Math.round(value * 100);
122 }
123
124 @Test
125 public void testApplyNullNull() {
126 assertThatIllegalArgumentException()
127 .isThrownBy(() -> new IntersectionSimilarity<>(cs -> new HashSet<>(Collections.singletonList(cs))).apply(null, null));
128 }
129
130 @Test
131 public void testApplyNullString() {
132 assertThatIllegalArgumentException()
133 .isThrownBy(() -> new IntersectionSimilarity<>(cs -> new HashSet<>(Collections.singletonList(cs))).apply(null, "right"));
134 }
135
136 @Test
137 public void testApplyStringNull() {
138 assertThatIllegalArgumentException()
139 .isThrownBy(() -> new IntersectionSimilarity<>(cs -> new HashSet<>(Collections.singletonList(cs))).apply("left", null));
140 }
141
142 @Test
143 public void testConstructorWithNullConverterThrows() {
144 assertThatIllegalArgumentException().isThrownBy(() -> new IntersectionSimilarity<>(null));
145 }
146
147 @Test
148 public void testF1ScoreUsingListWordBigrams() {
149
150
151
152
153
154
155 final Pattern pattern = Pattern.compile("\\s+");
156
157
158
159 final Function<CharSequence, Collection<Integer>> converter = cs -> {
160 final List<Integer> set = new ArrayList<>();
161 for (final String word : pattern.split(cs)) {
162 if (word.length() > 1) {
163
164 char ch2 = Character.toUpperCase(word.charAt(0));
165 for (int i = 1; i < word.length(); i++) {
166 final char ch1 = ch2;
167 ch2 = Character.toUpperCase(word.charAt(i));
168 set.add(Integer.valueOf(ch1 << 16 | ch2));
169 }
170 }
171 }
172 return set;
173 };
174 final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(converter);
175
176 String bookTitle;
177 final String search1 = "Web Database Applications";
178 final String search2 = "PHP Web Applications";
179 final String search3 = "Web Aplications";
180 bookTitle = "Web Database Applications with PHP & MySQL";
181 assertEquals(82, toF1ScorePercent(similarity.apply(bookTitle, search1)));
182 assertEquals(68, toF1ScorePercent(similarity.apply(bookTitle, search2)));
183 assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search3)));
184 bookTitle = "Creating Database Web Applications with PHP and ASP";
185 assertEquals(71, toF1ScorePercent(similarity.apply(bookTitle, search1)));
186 assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search2)));
187 assertEquals(50, toF1ScorePercent(similarity.apply(bookTitle, search3)));
188 bookTitle = "Building Database Applications on the Web Using PHP3";
189 assertEquals(70, toF1ScorePercent(similarity.apply(bookTitle, search1)));
190 assertEquals(58, toF1ScorePercent(similarity.apply(bookTitle, search2)));
191 assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search3)));
192 bookTitle = "Building Web Database Applications with Visual Studio 6";
193 assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search1)));
194 assertEquals(47, toF1ScorePercent(similarity.apply(bookTitle, search2)));
195 assertEquals(46, toF1ScorePercent(similarity.apply(bookTitle, search3)));
196 bookTitle = "Web Application Development With PHP";
197 assertEquals(51, toF1ScorePercent(similarity.apply(bookTitle, search1)));
198 assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search2)));
199 assertEquals(56, toF1ScorePercent(similarity.apply(bookTitle, search3)));
200 bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection";
201 assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search1)));
202 assertEquals(34, toF1ScorePercent(similarity.apply(bookTitle, search2)));
203 assertEquals(32, toF1ScorePercent(similarity.apply(bookTitle, search3)));
204 bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing";
205 assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search1)));
206 assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search2)));
207 assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search3)));
208 bookTitle = "How to Find a Scholarship Online";
209 assertEquals(10, toF1ScorePercent(similarity.apply(bookTitle, search1)));
210 assertEquals(11, toF1ScorePercent(similarity.apply(bookTitle, search2)));
211 assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search3)));
212 }
213
214 @Test
215 public void testIntersectionUsingListBigrams() {
216
217
218
219 final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramList);
220
221
222
223
224 assertIntersection(similarity, "", "", 0, 0, 0);
225 assertIntersection(similarity, "a", "", 0, 0, 0);
226 assertIntersection(similarity, "a", "a", 0, 0, 0);
227 assertIntersection(similarity, "a", "b", 0, 0, 0);
228 assertIntersection(similarity, "aa", "ab", 1, 1, 0);
229 assertIntersection(similarity, "ab", "ab", 1, 1, 1);
230 assertIntersection(similarity, "aaba", "abaa", 3, 3, 3);
231 assertIntersection(similarity, "aaaa", "aa", 3, 1, 1);
232 assertIntersection(similarity, "aa", "aaaa", 1, 3, 1);
233 assertIntersection(similarity, "aaaa", "aaa", 3, 2, 2);
234 assertIntersection(similarity, "aabab", "ababa", 4, 4, 3);
235 assertIntersection(similarity, "the same", "the same", 7, 7, 7);
236 assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8);
237 }
238
239 @Test
240 public void testIntersectionUsingListCharacter() {
241
242
243 final IntersectionSimilarity<Character> similarity = new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterList);
244
245
246
247
248 assertIntersection(similarity, "", "", 0, 0, 0);
249 assertIntersection(similarity, "a", "", 1, 0, 0);
250 assertIntersection(similarity, "a", "a", 1, 1, 1);
251 assertIntersection(similarity, "a", "b", 1, 1, 0);
252 assertIntersection(similarity, "aa", "ab", 2, 2, 1);
253 assertIntersection(similarity, "ab", "ab", 2, 2, 2);
254 assertIntersection(similarity, "aaba", "abaa", 4, 4, 4);
255 assertIntersection(similarity, "aaaa", "aa", 4, 2, 2);
256 assertIntersection(similarity, "aa", "aaaa", 2, 4, 2);
257 assertIntersection(similarity, "aaaa", "aaa", 4, 3, 3);
258 assertIntersection(similarity, "aabab", "ababa", 5, 5, 5);
259 assertIntersection(similarity, "the same", "the same", 8, 8, 8);
260 assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11);
261 }
262
263 @Test
264 public void testIntersectionUsingSetBigrams() {
265
266
267
268 final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramSet);
269
270
271
272
273 assertIntersection(similarity, "", "", 0, 0, 0);
274 assertIntersection(similarity, "a", "", 0, 0, 0);
275 assertIntersection(similarity, "a", "a", 0, 0, 0);
276 assertIntersection(similarity, "a", "b", 0, 0, 0);
277 assertIntersection(similarity, "aa", "ab", 1, 1, 0);
278 assertIntersection(similarity, "ab", "ab", 1, 1, 1);
279 assertIntersection(similarity, "aaba", "abaa", 3, 3, 3);
280 assertIntersection(similarity, "aaaa", "aa", 1, 1, 1);
281 assertIntersection(similarity, "aa", "aaaa", 1, 1, 1);
282 assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1);
283 assertIntersection(similarity, "aabab", "ababa", 3, 2, 2);
284 assertIntersection(similarity, "the same", "the same", 7, 7, 7);
285 assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8);
286 }
287
288 @Test
289 public void testIntersectionUsingSetCharacter() {
290
291
292 final IntersectionSimilarity<Character> similarity = new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterSet);
293
294
295
296
297 assertIntersection(similarity, "", "", 0, 0, 0);
298 assertIntersection(similarity, "a", "", 1, 0, 0);
299 assertIntersection(similarity, "a", "a", 1, 1, 1);
300 assertIntersection(similarity, "a", "b", 1, 1, 0);
301 assertIntersection(similarity, "aa", "ab", 1, 2, 1);
302 assertIntersection(similarity, "ab", "ab", 2, 2, 2);
303 assertIntersection(similarity, "aaba", "abaa", 2, 2, 2);
304 assertIntersection(similarity, "aaaa", "aa", 1, 1, 1);
305 assertIntersection(similarity, "aa", "aaaa", 1, 1, 1);
306 assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1);
307 assertIntersection(similarity, "aabab", "ababa", 2, 2, 2);
308 assertIntersection(similarity, "the same", "the same", 7, 7, 7);
309 assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11);
310 }
311
312 @Test
313 public void testIntersectionUsingSetCharacterListCharacter() {
314
315
316 final HashMap<CharSequence, Collection<Character>> converter = new HashMap<>();
317 final String sequence1 = "aabbccdd";
318 final String sequence2 = "aaaaaabbbfffff";
319 converter.put(sequence1, toCharacterSet(sequence1));
320 converter.put(sequence2, toCharacterList(sequence2));
321 final IntersectionSimilarity<Character> similarity = new IntersectionSimilarity<>(converter::get);
322
323
324
325
326
327 assertIntersection(similarity, sequence1, sequence2, 4, sequence2.length(), 2);
328 assertIntersection(similarity, sequence2, sequence1, sequence2.length(), 4, 2);
329 }
330 }