You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ah...@apache.org on 2019/03/13 14:47:12 UTC
[commons-text] 01/03: TEXT-155: Add a generic
IntersectionSimilarity measure
This is an automated email from the ASF dual-hosted git repository.
aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-text.git
commit 639e4abb7fa57319311277753eb9f6aabfa2cf30
Author: Alex Herbert <ah...@apache.org>
AuthorDate: Sun Mar 10 13:22:14 2019 +0000
TEXT-155: Add a generic IntersectionSimilarity measure
---
.../text/similarity/IntersectionResult.java | 118 +++++++
.../text/similarity/IntersectionSimilarity.java | 237 ++++++++++++++
.../text/similarity/IntersectionResultTest.java | 153 ++++++++++
.../similarity/IntersectionSimilarityTest.java | 340 +++++++++++++++++++++
4 files changed, 848 insertions(+)
diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java
new file mode 100644
index 0000000..1034c69
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.Objects;
+
+/**
+ * Represents the intersection result between two sets.
+ *
+ * <p>Stores the size of set A, set B and the intersection of A and B
+ * (<code>|A ∩ B|</code>).</p>
+ *
+ * <p>This class is immutable.</p>
+ *
+ * @since 1.7
+ * @see <a href="https://en.wikipedia.org/wiki/Intersection_(set_theory)">Intersection</a>
+ */
+public class IntersectionResult {
+ /**
+ * The size of set A.
+ */
+ private final int sizeA;
+ /**
+ * The size of set B.
+ */
+ private final int sizeB;
+ /**
+ * The size of the intersection between set A and B.
+ */
+ private final int intersection;
+
+ /**
+ * Create the results for an intersection between two sets.
+ *
+ * @param sizeA the size of set A ({@code |A|})
+ * @param sizeB the size of set B ({@code |B|})
+ * @param intersection the size of the intersection of A and B (<code>|A ∩ B|</code>)
+ * @throws IllegalArgumentException if the sizes are negative or the intersection is greater
+ * than the minimum of the two set sizes
+ */
+ public IntersectionResult(final int sizeA, final int sizeB, final int intersection) {
+ if (sizeA < 0) {
+ throw new IllegalArgumentException("Set size |A| is not positive: " + sizeA);
+ }
+ if (sizeB < 0) {
+ throw new IllegalArgumentException("Set size |B| is not positive: " + sizeB);
+ }
+ if (intersection < 0 || intersection > Math.min(sizeA, sizeB)) {
+ throw new IllegalArgumentException("Invalid intersection of |A| and |B|: " + intersection);
+ }
+ this.sizeA = sizeA;
+ this.sizeB = sizeB;
+ this.intersection = intersection;
+ }
+
+ /**
+ * Get the size of set A.
+ *
+ * @return |A|
+ */
+ public int getSizeA() {
+ return sizeA;
+ }
+
+ /**
+ * Get the size of set B.
+ *
+ * @return |B|
+ */
+ public int getSizeB() {
+ return sizeB;
+ }
+
+ /**
+ * Get the size of the intersection between set A and B.
+ *
+ * @return <code>|A ∩ B|</code>
+ */
+ public int getIntersection() {
+ return intersection;
+ }
+
+ @Override
+ public boolean equals(final Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+ final IntersectionResult result = (IntersectionResult) o;
+ return sizeA == result.sizeA && sizeB == result.sizeB && intersection == result.intersection;
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(sizeA, sizeB, intersection);
+ }
+
+ @Override
+ public String toString() {
+ return "Size A: " + sizeA + ", Size B: " + sizeB + ", Intersection: " + intersection;
+ }
+}
diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java b/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java
new file mode 100644
index 0000000..bbdd7b8
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java
@@ -0,0 +1,237 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.function.Function;
+
+/**
+ * Measures the intersection of two sets created from a pair of character sequences.
+ *
+ * <p>It is assumed that the type {@code T} correctly conforms to the requirements for storage
+ * within a {@link Set} or {@link HashMap}. Ideally the type is immutable and implements
+ * {@link Object#equals(Object)} and {@link Object#hashCode()}.</p>
+ *
+ * @param <T> the type of the elements extracted from the character sequence
+ * @since 1.7
+ * @see Set
+ * @see HashMap
+ */
+public class IntersectionSimilarity<T> implements SimilarityScore<IntersectionResult> {
+ /** The converter used to create the elements from the characters. */
+ private final Function<CharSequence, Collection<T>> converter;
+
+ // The following is adapted from commons-collections for a Bag.
+ // A Bag is a collection that can store the count of the number
+ // of copies of each element.
+
+ /**
+ * Mutable counter class for storing the count of elements.
+ */
+ private static class BagCount {
+ /** The count. This is initialised to 1 upon construction. */
+ int count = 1;
+ }
+
+ /**
+ * A minimal implementation of a Bag that can store elements and a count.
+ *
+ * <p>For the intended purpose the Bag does not have to be a {@link Collection}. It does not
+ * even have to know its own size.
+ */
+ private class TinyBag {
+ /** The backing map. */
+ private final Map<T, BagCount> map;
+
+ /**
+ * Create a new tiny bag.
+ *
+ * @param initialCapacity the initial capacity
+ */
+ TinyBag(int initialCapacity) {
+ map = new HashMap<>(initialCapacity);
+ }
+
+ /**
+ * Adds a new element to the bag, incrementing its count in the underlying map.
+ *
+ * @param object the object to add
+ */
+ void add(T object) {
+ final BagCount mut = map.get(object);
+ if (mut == null) {
+ map.put(object, new BagCount());
+ } else {
+ mut.count++;
+ }
+ }
+
+ /**
+ * Returns the number of occurrence of the given element in this bag by
+ * looking up its count in the underlying map.
+ *
+ * @param object the object to search for
+ * @return the number of occurrences of the object, zero if not found
+ */
+ int getCount(final Object object) {
+ final BagCount count = map.get(object);
+ if (count != null) {
+ return count.count;
+ }
+ return 0;
+ }
+
+ /**
+ * Returns a Set view of the mappings contained in this bag.
+ *
+ * @return the Set view
+ */
+ Set<Entry<T, BagCount>> entrySet() {
+ return map.entrySet();
+ }
+
+ /**
+ * Get the number of unique elements in the bag.
+ *
+ * @return the unique element size
+ */
+ int uniqueElementSize() {
+ return map.size();
+ }
+ }
+
+ /**
+ * Create a new intersection similarity using the provided converter.
+ *
+ * <p>If the converter returns a {@link Set} then the intersection result will
+ * not include duplicates. Any other {@link Collection} is used to produce a result
+ * that will include duplicates in the intersect and union.
+ *
+ * @param converter the converter used to create the elements from the characters
+ * @throws IllegalArgumentException if the converter is null
+ */
+ public IntersectionSimilarity(Function<CharSequence, Collection<T>> converter) {
+ if (converter == null) {
+ throw new IllegalArgumentException("Converter must not be null");
+ }
+ this.converter = converter;
+ }
+
+ /**
+ * Calculates the intersection of two character sequences passed as input.
+ *
+ * @param left first character sequence
+ * @param right second character sequence
+ * @return the intersection result
+ * @throws IllegalArgumentException if either input sequence is {@code null}
+ */
+ @Override
+ public IntersectionResult apply(final CharSequence left, final CharSequence right) {
+ if (left == null || right == null) {
+ throw new IllegalArgumentException("Input cannot be null");
+ }
+
+ // Create the elements from the sequences
+ final Collection<T> objectsA = converter.apply(left);
+ final Collection<T> objectsB = converter.apply(right);
+ final int sizeA = objectsA.size();
+ final int sizeB = objectsB.size();
+
+ // Short-cut if either collection is empty
+ if (Math.min(sizeA, sizeB) == 0) {
+ // No intersection
+ return new IntersectionResult(sizeA, sizeB, 0);
+ }
+
+ // Intersection = count the number of shared elements
+ int intersection;
+ if (objectsA instanceof Set && objectsB instanceof Set) {
+ // If a Set then the elements will only have a count of 1.
+ // Iterate over the smaller set.
+ intersection = (sizeA < sizeB)
+ ? getIntersection((Set<T>) objectsA, (Set<T>) objectsB)
+ : getIntersection((Set<T>) objectsB, (Set<T>) objectsA);
+ } else {
+ // Create a bag for each collection
+ final TinyBag bagA = toBag(objectsA);
+ final TinyBag bagB = toBag(objectsB);
+ // Iterate over the smaller number of unique elements
+ intersection = (bagA.uniqueElementSize() < bagB.uniqueElementSize())
+ ? getIntersection(bagA, bagB)
+ : getIntersection(bagB, bagA);
+ }
+
+ return new IntersectionResult(sizeA, sizeB, intersection);
+ }
+
+ /**
+ * Convert the collection to a bag. The bag will contain the count of each element
+ * in the collection.
+ *
+ * @param objects the objects
+ * @return the bag
+ */
+ private TinyBag toBag(Collection<T> objects) {
+ final TinyBag bag = new TinyBag(objects.size());
+ for (T t : objects) {
+ bag.add(t);
+ }
+ return bag;
+ }
+
+ /**
+ * Compute the intersection between two sets. This is the count of all the elements
+ * that are within both sets.
+ *
+ * @param <T> the type of the elements in the set
+ * @param setA the set A
+ * @param setB the set B
+ * @return the intersection
+ */
+ private static <T> int getIntersection(Set<T> setA, Set<T> setB) {
+ int intersection = 0;
+ for (T element : setA) {
+ if (setB.contains(element)) {
+ intersection++;
+ }
+ }
+ return intersection;
+ }
+
+ /**
+ * Compute the intersection between two bags. This is the sum of the minimum
+ * count of each element that is within both sets.
+ *
+ * @param bagA the bag A
+ * @param bagB the bag B
+ * @return the intersection
+ */
+ private int getIntersection(TinyBag bagA, TinyBag bagB) {
+ int intersection = 0;
+ for (Entry<T, BagCount> entry : bagA.entrySet()) {
+ final T element = entry.getKey();
+ final int count = entry.getValue().count;
+ // The intersection of this entry in both bags is the minimum count
+ intersection += Math.min(count, bagB.getCount(element));
+ }
+ return intersection;
+ }
+}
diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java
new file mode 100644
index 0000000..c2477e0
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import java.util.HashMap;
+import java.util.concurrent.ThreadLocalRandom;
+
+/**
+ * Unit tests for {@link IntersectionResult}.
+ */
+public class IntersectionResultTest {
+ @Test
+ public void testNewIntersectionResult_WithZeros() {
+ final int sizeA = 0;
+ final int sizeB = 0;
+ final int intersection = 0;
+ new IntersectionResult(sizeA, sizeB, intersection);
+ }
+
+ @Test
+ public void testNewIntersectionResult_WithNegativeSizeA() {
+ final int sizeA = -1;
+ final int sizeB = 0;
+ final int intersection = 0;
+ Assertions.assertThrows(IllegalArgumentException.class, () -> {
+ new IntersectionResult(sizeA, sizeB, intersection);
+ });
+ }
+
+ @Test
+ public void testNewIntersectionResult_WithNegativeSizeB() {
+ final int sizeA = 0;
+ final int sizeB = -1;
+ final int intersection = 0;
+ Assertions.assertThrows(IllegalArgumentException.class, () -> {
+ new IntersectionResult(sizeA, sizeB, intersection);
+ });
+ }
+
+ @Test
+ public void testNewIntersectionResult_WithNegativeIntersection() {
+ final int sizeA = 0;
+ final int sizeB = 0;
+ final int intersection = -1;
+ Assertions.assertThrows(IllegalArgumentException.class, () -> {
+ new IntersectionResult(sizeA, sizeB, intersection);
+ });
+ }
+
+ @Test
+ public void testNewIntersectionResult_WithIntersectionAboveSizeAorB() {
+ final int sizeA = 1;
+ final int sizeB = 2;
+ final int intersection = Math.max(sizeA, sizeB) + 1;
+ Assertions.assertThrows(IllegalArgumentException.class, () -> {
+ new IntersectionResult(sizeA, sizeB, intersection);
+ });
+ Assertions.assertThrows(IllegalArgumentException.class, () -> {
+ new IntersectionResult(sizeB, sizeA, intersection);
+ });
+ }
+
+ @Test
+ public void testProperties() {
+ final ThreadLocalRandom rand = ThreadLocalRandom.current();
+ final int max = 1024;
+ for (int i = 0; i < 5; i++) {
+ // Ensure the min is above 0
+ final int sizeA = rand.nextInt(max) + 1;
+ final int sizeB = rand.nextInt(max) + 1;
+ final int intersection = rand.nextInt(Math.min(sizeA, sizeB));
+ final IntersectionResult result = new IntersectionResult(sizeA, sizeB, intersection);
+ Assertions.assertEquals(sizeA, result.getSizeA());
+ Assertions.assertEquals(sizeB, result.getSizeB());
+ Assertions.assertEquals(intersection, result.getIntersection());
+ }
+ }
+
+ @Test
+ public void testEquals() {
+ final IntersectionResult[] results = new IntersectionResult[] {
+ new IntersectionResult(0, 0, 0),
+ new IntersectionResult(10, 0, 0),
+ new IntersectionResult(10, 10, 0),
+ new IntersectionResult(10, 10, 10),
+ };
+
+ // Test a different instance with same values
+ Assertions.assertTrue(results[0].equals(new IntersectionResult(0, 0, 0)));
+
+ final Object something = new Object();
+ for (int i = 0; i < results.length; i++) {
+ Assertions.assertFalse(results[i].equals(something));
+ Assertions.assertFalse(results[i].equals(null));
+ for (int j = 0; j < results.length; j++) {
+ Assertions.assertTrue(results[i].equals(results[j]) == (i == j));
+ }
+ }
+ }
+
+ @Test
+ public void testHashCode() {
+ final IntersectionResult[] results = new IntersectionResult[] {
+ new IntersectionResult(10, 0, 0),
+ new IntersectionResult(10, 10, 0),
+ new IntersectionResult(10, 10, 10),
+ };
+ final HashMap<IntersectionResult, Integer> map = new HashMap<>();
+ final int offset = 123;
+ for (int i = 0; i < results.length; i++) {
+ map.put(results[i], i + offset);
+ }
+ for (int i = 0; i < results.length; i++) {
+ Assertions.assertEquals(i + offset, map.get(results[i]));
+ }
+ }
+
+ @Test
+ public void testToString() {
+ final ThreadLocalRandom rand = ThreadLocalRandom.current();
+ final int max = 9;
+ for (int i = 0; i < 5; i++) {
+ // Ensure the min is above 0
+ final int sizeA = rand.nextInt(max) + 1;
+ final int sizeB = rand.nextInt(max) + 1;
+ final int intersection = rand.nextInt(Math.min(sizeA, sizeB));
+ final IntersectionResult result = new IntersectionResult(sizeA, sizeB, intersection);
+ final String string = result.toString();
+ // Not perfect as this will match substrings too. The chance of error
+ // is limited by restricting the numbers to a max of 10.
+ Assertions.assertTrue(string.contains(String.valueOf(sizeA)));
+ Assertions.assertTrue(string.contains(String.valueOf(sizeB)));
+ Assertions.assertTrue(string.contains(String.valueOf(intersection)));
+ }
+ }
+}
diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java
new file mode 100644
index 0000000..2af8c8e
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java
@@ -0,0 +1,340 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.regex.Pattern;
+
+/**
+ * Unit tests for {@link IntersectionSimilarity}.
+ */
+public class IntersectionSimilarityTest {
+ @Test
+ public void testIntersectionUsingSetCharacter() {
+ // Compute using single characters.
+ // This test uses a set and so should not allow duplicates.
+ final IntersectionSimilarity<Character> similarity =
+ new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterSet);
+
+ // Expected:
+ // size A or B = count of unique characters (exclude duplicates)
+ // intersection = count of unique matching characters (exclude duplicates)
+ assertIntersection(similarity, "", "", 0, 0, 0);
+ assertIntersection(similarity, "a", "", 1, 0, 0);
+ assertIntersection(similarity, "a", "a", 1, 1, 1);
+ assertIntersection(similarity, "a", "b", 1, 1, 0);
+ assertIntersection(similarity, "aa", "ab", 1, 2, 1);
+ assertIntersection(similarity, "ab", "ab", 2, 2, 2);
+ assertIntersection(similarity, "aaba", "abaa", 2, 2, 2);
+ assertIntersection(similarity, "aaaa", "aa", 1, 1, 1);
+ assertIntersection(similarity, "aa", "aaaa", 1, 1, 1);
+ assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1);
+ assertIntersection(similarity, "aabab", "ababa", 2, 2, 2);
+ assertIntersection(similarity, "the same", "the same", 7, 7, 7);
+ assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11);
+ }
+
+ @Test
+ public void testIntersectionUsingListCharacter() {
+ // Compute using single characters.
+ // This test uses a list and so duplicates should be matched.
+ final IntersectionSimilarity<Character> similarity =
+ new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterList);
+
+ // Expected:
+ // size A or B = sequence length
+ // intersection = count of matching characters (include duplicates)
+ assertIntersection(similarity, "", "", 0, 0, 0);
+ assertIntersection(similarity, "a", "", 1, 0, 0);
+ assertIntersection(similarity, "a", "a", 1, 1, 1);
+ assertIntersection(similarity, "a", "b", 1, 1, 0);
+ assertIntersection(similarity, "aa", "ab", 2, 2, 1);
+ assertIntersection(similarity, "ab", "ab", 2, 2, 2);
+ assertIntersection(similarity, "aaba", "abaa", 4, 4, 4);
+ assertIntersection(similarity, "aaaa", "aa", 4, 2, 2);
+ assertIntersection(similarity, "aa", "aaaa", 2, 4, 2);
+ assertIntersection(similarity, "aaaa", "aaa", 4, 3, 3);
+ assertIntersection(similarity, "aabab", "ababa", 5, 5, 5);
+ assertIntersection(similarity, "the same", "the same", 8, 8, 8);
+ assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11);
+ }
+
+ @Test
+ public void testIntersectionUsingSetBigrams() {
+ // Compute using pairs of characters (bigrams).
+ // This can be done using a 32-bit int to store two 16-bit characters.
+ // This test uses a set and so should not allow duplicates.
+ final IntersectionSimilarity<Integer> similarity =
+ new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramSet);
+
+ // Expected:
+ // size A or B = count of unique bigrams (exclude duplicates)
+ // intersection = count of unique matching bigrams (exclude duplicates)
+ assertIntersection(similarity, "", "", 0, 0, 0);
+ assertIntersection(similarity, "a", "", 0, 0, 0);
+ assertIntersection(similarity, "a", "a", 0, 0, 0);
+ assertIntersection(similarity, "a", "b", 0, 0, 0);
+ assertIntersection(similarity, "aa", "ab", 1, 1, 0);
+ assertIntersection(similarity, "ab", "ab", 1, 1, 1);
+ assertIntersection(similarity, "aaba", "abaa", 3, 3, 3);
+ assertIntersection(similarity, "aaaa", "aa", 1, 1, 1);
+ assertIntersection(similarity, "aa", "aaaa", 1, 1, 1);
+ assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1);
+ assertIntersection(similarity, "aabab", "ababa", 3, 2, 2);
+ assertIntersection(similarity, "the same", "the same", 7, 7, 7);
+ assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8);
+ }
+
+ @Test
+ public void testIntersectionUsingListBigrams() {
+ // Compute using pairs of characters (bigrams).
+ // This can be done using a 32-bit int to store two 16-bit characters.
+ // This test uses a list and so duplicates should be matched.
+ final IntersectionSimilarity<Integer> similarity =
+ new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramList);
+
+ // Expected:
+ // size A or B = sequence length - 1
+ // intersection = count of matching bigrams (include duplicates)
+ assertIntersection(similarity, "", "", 0, 0, 0);
+ assertIntersection(similarity, "a", "", 0, 0, 0);
+ assertIntersection(similarity, "a", "a", 0, 0, 0);
+ assertIntersection(similarity, "a", "b", 0, 0, 0);
+ assertIntersection(similarity, "aa", "ab", 1, 1, 0);
+ assertIntersection(similarity, "ab", "ab", 1, 1, 1);
+ assertIntersection(similarity, "aaba", "abaa", 3, 3, 3);
+ assertIntersection(similarity, "aaaa", "aa", 3, 1, 1);
+ assertIntersection(similarity, "aa", "aaaa", 1, 3, 1);
+ assertIntersection(similarity, "aaaa", "aaa", 3, 2, 2);
+ assertIntersection(similarity, "aabab", "ababa", 4, 4, 3);
+ assertIntersection(similarity, "the same", "the same", 7, 7, 7);
+ assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8);
+ }
+
+ @Test
+ public void testIntersectionUsingSetCharacterListCharacter() {
+ // Compute using a custom converter that returns a Set and a List.
+ // This is an edge-case test.
+ final HashMap<CharSequence, Collection<Character>> converter = new HashMap<>();
+ final String sequence1 = "aabbccdd";
+ final String sequence2 = "aaaaaabbbfffff";
+ converter.put(sequence1, toCharacterSet(sequence1));
+ converter.put(sequence2, toCharacterList(sequence2));
+ final IntersectionSimilarity<Character> similarity =
+ new IntersectionSimilarity<>(converter::get);
+
+ // Expected:
+ // size A = count of unique characters (exclude duplicates)
+ // size B = sequence length
+ // intersection = count of matching characters (exclude duplicates)
+ assertIntersection(similarity, sequence1, sequence2, 4, sequence2.length(), 2);
+ assertIntersection(similarity, sequence2, sequence1, sequence2.length(), 4, 2);
+ }
+
+ /**
+ * Convert the {@link CharSequence} to a {@link Set} of {@link Character}s.
+ *
+ * @param sequence the sequence
+ * @return the set
+ */
+ private static Set<Character> toCharacterSet(CharSequence sequence) {
+ final int length = sequence.length();
+ final Set<Character> set = new HashSet<>(length);
+ for (int i = 0; i < length; i++) {
+ set.add(sequence.charAt(i));
+ }
+ return set;
+ }
+
+ /**
+ * Convert the {@link CharSequence} to a {@link List} of {@link Character}s.
+ *
+ * @param sequence the sequence
+ * @return the list
+ */
+ private static List<Character> toCharacterList(CharSequence sequence) {
+ final int length = sequence.length();
+ final List<Character> list = new ArrayList<>(length);
+ for (int i = 0; i < length; i++) {
+ list.add(sequence.charAt(i));
+ }
+ return list;
+ }
+
+ /**
+ * Convert the {@link CharSequence} to a {@link Set} of bigrams (pairs of characters).
+ * These are represented using 2 16-bit chars packed into a 32-bit int.
+ *
+ * @param sequence the sequence
+ * @return the set
+ */
+ private static Set<Integer> toBigramSet(CharSequence sequence) {
+ final int length = sequence.length();
+ final Set<Integer> set = new HashSet<>(length);
+ if (length > 1) {
+ char ch2 = sequence.charAt(0);
+ for (int i = 1; i < length; i++) {
+ final char ch1 = ch2;
+ ch2 = sequence.charAt(i);
+ set.add(Integer.valueOf((ch1 << 16) | ch2));
+ }
+ }
+ return set;
+ }
+
+ /**
+ * Convert the {@link CharSequence} to a {@link List} of bigrams (pairs of characters).
+ * These are represented using 2 16-bit chars packed into a 32-bit int.
+ *
+ * @param sequence the sequence
+ * @return the list
+ */
+ private static List<Integer> toBigramList(CharSequence sequence) {
+ final int length = sequence.length();
+ final List<Integer> list = new ArrayList<>(length);
+ if (length > 1) {
+ char ch2 = sequence.charAt(0);
+ for (int i = 1; i < length; i++) {
+ final char ch1 = ch2;
+ ch2 = sequence.charAt(i);
+ list.add(Integer.valueOf((ch1 << 16) | ch2));
+ }
+ }
+ return list;
+ }
+
+ private static <T> void assertIntersection(IntersectionSimilarity<T> similarity, CharSequence cs1, CharSequence cs2,
+ int sizeA, int sizeB, int intersection) {
+ final IntersectionResult result = similarity.apply(cs1, cs2);
+ assertEquals(sizeA, result.getSizeA(), "Size A error");
+ assertEquals(sizeB, result.getSizeB(), "Size B error");
+ assertEquals(intersection, result.getIntersection(), "Intersection error");
+ }
+
+ @Test
+ public void testF1ScoreUsingListWordBigrams() {
+ // Example of a word letter pairs algorithm by Simon White:
+ // http://www.catalysoft.com/articles/StrikeAMatch.html
+ // This splits into words using whitespace and then computes uppercase
+ // bigrams for each word.
+
+ // Split on whitespace
+ final Pattern pattern = Pattern.compile("\\s+");
+
+ // Compute using pairs of characters (bigrams) for each word.
+ // This can be done using a 32-bit int to store two 16-bit characters
+ final Function<CharSequence, Collection<Integer>> converter = cs -> {
+ final List<Integer> set = new ArrayList<>();
+ for (String word : pattern.split(cs)) {
+ if (word.length() > 1) {
+ // The strings are converted to upper case
+ char ch2 = Character.toUpperCase(word.charAt(0));
+ for (int i = 1; i < word.length(); i++) {
+ final char ch1 = ch2;
+ ch2 = Character.toUpperCase(word.charAt(i));
+ set.add(Integer.valueOf((ch1 << 16) | ch2));
+ }
+ }
+ }
+ return set;
+ };
+ final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(converter);
+
+ String bookTitle;
+ final String search1 = "Web Database Applications";
+ final String search2 = "PHP Web Applications";
+ final String search3 = "Web Aplications";
+ bookTitle = "Web Database Applications with PHP & MySQL";
+ assertEquals(82, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+ assertEquals(68, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+ assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+ bookTitle = "Creating Database Web Applications with PHP and ASP";
+ assertEquals(71, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+ assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+ assertEquals(50, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+ bookTitle = "Building Database Applications on the Web Using PHP3";
+ assertEquals(70, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+ assertEquals(58, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+ assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+ bookTitle = "Building Web Database Applications with Visual Studio 6";
+ assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+ assertEquals(47, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+ assertEquals(46, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+ bookTitle = "Web Application Development With PHP";
+ assertEquals(51, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+ assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+ assertEquals(56, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+ bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection";
+ assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+ assertEquals(34, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+ assertEquals(32, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+ bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing";
+ assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+ assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+ assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+ bookTitle = "How to Find a Scholarship Online";
+ assertEquals(10, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+ assertEquals(11, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+ assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+ }
+
+ private static int toF1ScorePercent(IntersectionResult result) {
+ final double value = 2.0 * result.getIntersection() / (result.getSizeA() + result.getSizeB());
+ // Convert to percentage
+ return (int) Math.round(value * 100);
+ }
+
+ @Test
+ public void testConstructorWithNullConverterThrows() {
+ assertThatIllegalArgumentException().isThrownBy(() -> {
+ new IntersectionSimilarity<>(null);
+ });
+ }
+
+ @Test
+ public void testApplyNullNull() {
+ assertThatIllegalArgumentException().isThrownBy(() -> {
+ new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, null);
+ });
+ }
+
+ @Test
+ public void testApplyStringNull() {
+ assertThatIllegalArgumentException().isThrownBy(() -> {
+ new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply("left", null);
+ });
+ }
+
+ @Test
+ public void testApplyNullString() {
+ assertThatIllegalArgumentException().isThrownBy(() -> {
+ new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, "right");
+ });
+ }
+}