You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ah...@apache.org on 2019/03/13 14:47:12 UTC

[commons-text] 01/03: TEXT-155: Add a generic IntersectionSimilarity measure

This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-text.git

commit 639e4abb7fa57319311277753eb9f6aabfa2cf30
Author: Alex Herbert <ah...@apache.org>
AuthorDate: Sun Mar 10 13:22:14 2019 +0000

    TEXT-155: Add a generic IntersectionSimilarity measure
---
 .../text/similarity/IntersectionResult.java        | 118 +++++++
 .../text/similarity/IntersectionSimilarity.java    | 237 ++++++++++++++
 .../text/similarity/IntersectionResultTest.java    | 153 ++++++++++
 .../similarity/IntersectionSimilarityTest.java     | 340 +++++++++++++++++++++
 4 files changed, 848 insertions(+)

diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java
new file mode 100644
index 0000000..1034c69
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.Objects;
+
+/**
+ * Represents the intersection result between two sets.
+ *
+ * <p>Stores the size of set A, set B and the intersection of A and B
+ * (<code>|A &#8745; B|</code>).</p>
+ *
+ * <p>This class is immutable.</p>
+ *
+ * @since 1.7
+ * @see <a href="https://en.wikipedia.org/wiki/Intersection_(set_theory)">Intersection</a>
+ */
+public class IntersectionResult {
+    /**
+     * The size of set A.
+     */
+    private final int sizeA;
+    /**
+     * The size of set B.
+     */
+    private final int sizeB;
+    /**
+     * The size of the intersection between set A and B.
+     */
+    private final int intersection;
+
+    /**
+     * Create the results for an intersection between two sets.
+     *
+     * @param sizeA the size of set A ({@code |A|})
+     * @param sizeB the size of set B ({@code |B|})
+     * @param intersection the size of the intersection of A and B (<code>|A &#8745; B|</code>)
+     * @throws IllegalArgumentException if the sizes are negative or the intersection is greater
+     * than the minimum of the two set sizes
+     */
+    public IntersectionResult(final int sizeA, final int sizeB, final int intersection) {
+        if (sizeA < 0) {
+            throw new IllegalArgumentException("Set size |A| is not positive: " + sizeA);
+        }
+        if (sizeB < 0) {
+            throw new IllegalArgumentException("Set size |B| is not positive: " + sizeB);
+        }
+        if (intersection < 0 || intersection > Math.min(sizeA, sizeB)) {
+            throw new IllegalArgumentException("Invalid intersection of |A| and |B|: " + intersection);
+        }
+        this.sizeA = sizeA;
+        this.sizeB = sizeB;
+        this.intersection = intersection;
+    }
+
+    /**
+     * Get the size of set A.
+     *
+     * @return |A|
+     */
+    public int getSizeA() {
+        return sizeA;
+    }
+
+    /**
+     * Get the size of set B.
+     *
+     * @return |B|
+     */
+    public int getSizeB() {
+        return sizeB;
+    }
+
+    /**
+     * Get the size of the intersection between set A and B.
+     *
+     * @return <code>|A &#8745; B|</code>
+     */
+    public int getIntersection() {
+        return intersection;
+    }
+
+    @Override
+    public boolean equals(final Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        final IntersectionResult result = (IntersectionResult) o;
+        return sizeA == result.sizeA && sizeB == result.sizeB && intersection == result.intersection;
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(sizeA, sizeB, intersection);
+    }
+
+    @Override
+    public String toString() {
+        return "Size A: " + sizeA + ", Size B: " + sizeB + ", Intersection: " + intersection;
+    }
+}
diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java b/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java
new file mode 100644
index 0000000..bbdd7b8
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java
@@ -0,0 +1,237 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.function.Function;
+
+/**
+ * Measures the intersection of two sets created from a pair of character sequences.
+ *
+ * <p>It is assumed that the type {@code T} correctly conforms to the requirements for storage
+ * within a {@link Set} or {@link HashMap}. Ideally the type is immutable and implements
+ * {@link Object#equals(Object)} and {@link Object#hashCode()}.</p>
+ *
+ * @param <T> the type of the elements extracted from the character sequence
+ * @since 1.7
+ * @see Set
+ * @see HashMap
+ */
+public class IntersectionSimilarity<T> implements SimilarityScore<IntersectionResult> {
+    /** The converter used to create the elements from the characters. */
+    private final Function<CharSequence, Collection<T>> converter;
+
+    // The following is adapted from commons-collections for a Bag.
+    // A Bag is a collection that can store the count of the number
+    // of copies of each element.
+
+    /**
+     * Mutable counter class for storing the count of elements.
+     */
+    private static class BagCount {
+        /** The count. This is initialised to 1 upon construction. */
+        int count = 1;
+    }
+
+    /**
+     * A minimal implementation of a Bag that can store elements and a count.
+     *
+     * <p>For the intended purpose the Bag does not have to be a {@link Collection}. It does not
+     * even have to know its own size.
+     */
+    private class TinyBag {
+        /** The backing map. */
+        private final Map<T, BagCount> map;
+
+        /**
+         * Create a new tiny bag.
+         *
+         * @param initialCapacity the initial capacity
+         */
+        TinyBag(int initialCapacity) {
+            map = new HashMap<>(initialCapacity);
+        }
+
+        /**
+         * Adds a new element to the bag, incrementing its count in the underlying map.
+         *
+         * @param object the object to add
+         */
+        void add(T object) {
+            final BagCount mut = map.get(object);
+            if (mut == null) {
+                map.put(object, new BagCount());
+            } else {
+                mut.count++;
+            }
+        }
+
+        /**
+         * Returns the number of occurrence of the given element in this bag by
+         * looking up its count in the underlying map.
+         *
+         * @param object the object to search for
+         * @return the number of occurrences of the object, zero if not found
+         */
+        int getCount(final Object object) {
+            final BagCount count = map.get(object);
+            if (count != null) {
+                return count.count;
+            }
+            return 0;
+        }
+
+        /**
+         * Returns a Set view of the mappings contained in this bag.
+         *
+         * @return the Set view
+         */
+        Set<Entry<T, BagCount>> entrySet() {
+            return map.entrySet();
+        }
+
+        /**
+         * Get the number of unique elements in the bag.
+         *
+         * @return the unique element size
+         */
+        int uniqueElementSize() {
+            return map.size();
+        }
+    }
+
+    /**
+     * Create a new intersection similarity using the provided converter.
+     *
+     * <p>If the converter returns a {@link Set} then the intersection result will
+     * not include duplicates. Any other {@link Collection} is used to produce a result
+     * that will include duplicates in the intersect and union.
+     *
+     * @param converter the converter used to create the elements from the characters
+     * @throws IllegalArgumentException if the converter is null
+     */
+    public IntersectionSimilarity(Function<CharSequence, Collection<T>> converter) {
+        if (converter == null) {
+            throw new IllegalArgumentException("Converter must not be null");
+        }
+        this.converter = converter;
+    }
+
+    /**
+     * Calculates the intersection of two character sequences passed as input.
+     *
+     * @param left first character sequence
+     * @param right second character sequence
+     * @return the intersection result
+     * @throws IllegalArgumentException if either input sequence is {@code null}
+     */
+    @Override
+    public IntersectionResult apply(final CharSequence left, final CharSequence right) {
+        if (left == null || right == null) {
+            throw new IllegalArgumentException("Input cannot be null");
+        }
+
+        // Create the elements from the sequences
+        final Collection<T> objectsA = converter.apply(left);
+        final Collection<T> objectsB = converter.apply(right);
+        final int sizeA = objectsA.size();
+        final int sizeB = objectsB.size();
+
+        // Short-cut if either collection is empty
+        if (Math.min(sizeA, sizeB) == 0) {
+            // No intersection
+            return new IntersectionResult(sizeA, sizeB, 0);
+        }
+
+        // Intersection = count the number of shared elements
+        int intersection;
+        if (objectsA instanceof Set && objectsB instanceof Set) {
+            // If a Set then the elements will only have a count of 1.
+            // Iterate over the smaller set.
+            intersection = (sizeA < sizeB)
+                    ? getIntersection((Set<T>) objectsA, (Set<T>) objectsB)
+                    : getIntersection((Set<T>) objectsB, (Set<T>) objectsA);
+        } else  {
+            // Create a bag for each collection
+            final TinyBag bagA = toBag(objectsA);
+            final TinyBag bagB = toBag(objectsB);
+            // Iterate over the smaller number of unique elements
+            intersection = (bagA.uniqueElementSize() < bagB.uniqueElementSize())
+                    ? getIntersection(bagA, bagB)
+                    : getIntersection(bagB, bagA);
+        }
+
+        return new IntersectionResult(sizeA, sizeB, intersection);
+    }
+
+    /**
+     * Convert the collection to a bag. The bag will contain the count of each element
+     * in the collection.
+     *
+     * @param objects the objects
+     * @return the bag
+     */
+    private TinyBag toBag(Collection<T> objects) {
+        final TinyBag bag = new TinyBag(objects.size());
+        for (T t : objects) {
+            bag.add(t);
+        }
+        return bag;
+    }
+
+    /**
+     * Compute the intersection between two sets. This is the count of all the elements
+     * that are within both sets.
+     *
+     * @param <T> the type of the elements in the set
+     * @param setA the set A
+     * @param setB the set B
+     * @return the intersection
+     */
+    private static <T> int getIntersection(Set<T> setA, Set<T> setB) {
+        int intersection = 0;
+        for (T element : setA) {
+            if (setB.contains(element)) {
+                intersection++;
+            }
+        }
+        return intersection;
+    }
+
+    /**
+     * Compute the intersection between two bags. This is the sum of the minimum
+     * count of each element that is within both sets.
+     *
+     * @param bagA the bag A
+     * @param bagB the bag B
+     * @return the intersection
+     */
+    private int getIntersection(TinyBag bagA, TinyBag bagB) {
+        int intersection = 0;
+        for (Entry<T, BagCount> entry : bagA.entrySet()) {
+            final T element = entry.getKey();
+            final int count = entry.getValue().count;
+            // The intersection of this entry in both bags is the minimum count
+            intersection += Math.min(count, bagB.getCount(element));
+        }
+        return intersection;
+    }
+}
diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java
new file mode 100644
index 0000000..c2477e0
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import java.util.HashMap;
+import java.util.concurrent.ThreadLocalRandom;
+
+/**
+ * Unit tests for {@link IntersectionResult}.
+ */
+public class IntersectionResultTest {
+    @Test
+    public void testNewIntersectionResult_WithZeros() {
+        final int sizeA = 0;
+        final int sizeB = 0;
+        final int intersection = 0;
+        new IntersectionResult(sizeA, sizeB, intersection);
+    }
+
+    @Test
+    public void testNewIntersectionResult_WithNegativeSizeA() {
+        final int sizeA = -1;
+        final int sizeB = 0;
+        final int intersection = 0;
+        Assertions.assertThrows(IllegalArgumentException.class, () -> {
+            new IntersectionResult(sizeA, sizeB, intersection);
+        });
+    }
+
+    @Test
+    public void testNewIntersectionResult_WithNegativeSizeB() {
+        final int sizeA = 0;
+        final int sizeB = -1;
+        final int intersection = 0;
+        Assertions.assertThrows(IllegalArgumentException.class, () -> {
+            new IntersectionResult(sizeA, sizeB, intersection);
+        });
+    }
+
+    @Test
+    public void testNewIntersectionResult_WithNegativeIntersection() {
+        final int sizeA = 0;
+        final int sizeB = 0;
+        final int intersection = -1;
+        Assertions.assertThrows(IllegalArgumentException.class, () -> {
+            new IntersectionResult(sizeA, sizeB, intersection);
+        });
+    }
+
+    @Test
+    public void testNewIntersectionResult_WithIntersectionAboveSizeAorB() {
+        final int sizeA = 1;
+        final int sizeB = 2;
+        final int intersection = Math.max(sizeA, sizeB) + 1;
+        Assertions.assertThrows(IllegalArgumentException.class, () -> {
+            new IntersectionResult(sizeA, sizeB, intersection);
+        });
+        Assertions.assertThrows(IllegalArgumentException.class, () -> {
+            new IntersectionResult(sizeB, sizeA, intersection);
+        });
+    }
+
+    @Test
+    public void testProperties() {
+        final ThreadLocalRandom rand = ThreadLocalRandom.current();
+        final int max = 1024;
+        for (int i = 0; i < 5; i++) {
+            // Ensure the min is above 0
+            final int sizeA = rand.nextInt(max) + 1;
+            final int sizeB = rand.nextInt(max) + 1;
+            final int intersection = rand.nextInt(Math.min(sizeA, sizeB));
+            final IntersectionResult result = new IntersectionResult(sizeA, sizeB, intersection);
+            Assertions.assertEquals(sizeA, result.getSizeA());
+            Assertions.assertEquals(sizeB, result.getSizeB());
+            Assertions.assertEquals(intersection, result.getIntersection());
+        }
+    }
+
+    @Test
+    public void testEquals() {
+        final IntersectionResult[] results = new IntersectionResult[] {
+                new IntersectionResult(0, 0, 0),
+                new IntersectionResult(10, 0, 0),
+                new IntersectionResult(10, 10, 0),
+                new IntersectionResult(10, 10, 10),
+        };
+
+        // Test a different instance with same values
+        Assertions.assertTrue(results[0].equals(new IntersectionResult(0, 0, 0)));
+
+        final Object something = new Object();
+        for (int i = 0; i < results.length; i++) {
+            Assertions.assertFalse(results[i].equals(something));
+            Assertions.assertFalse(results[i].equals(null));
+            for (int j = 0; j < results.length; j++) {
+                Assertions.assertTrue(results[i].equals(results[j]) == (i == j));
+            }
+        }
+    }
+
+    @Test
+    public void testHashCode() {
+        final IntersectionResult[] results = new IntersectionResult[] {
+                new IntersectionResult(10, 0, 0),
+                new IntersectionResult(10, 10, 0),
+                new IntersectionResult(10, 10, 10),
+        };
+        final HashMap<IntersectionResult, Integer> map = new HashMap<>();
+        final int offset = 123;
+        for (int i = 0; i < results.length; i++) {
+            map.put(results[i], i + offset);
+        }
+        for (int i = 0; i < results.length; i++) {
+            Assertions.assertEquals(i + offset, map.get(results[i]));
+        }
+    }
+
+    @Test
+    public void testToString() {
+        final ThreadLocalRandom rand = ThreadLocalRandom.current();
+        final int max = 9;
+        for (int i = 0; i < 5; i++) {
+            // Ensure the min is above 0
+            final int sizeA = rand.nextInt(max) + 1;
+            final int sizeB = rand.nextInt(max) + 1;
+            final int intersection = rand.nextInt(Math.min(sizeA, sizeB));
+            final IntersectionResult result = new IntersectionResult(sizeA, sizeB, intersection);
+            final String string = result.toString();
+            // Not perfect as this will match substrings too. The chance of error
+            // is limited by restricting the numbers to a max of 10.
+            Assertions.assertTrue(string.contains(String.valueOf(sizeA)));
+            Assertions.assertTrue(string.contains(String.valueOf(sizeB)));
+            Assertions.assertTrue(string.contains(String.valueOf(intersection)));
+        }
+    }
+}
diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java
new file mode 100644
index 0000000..2af8c8e
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java
@@ -0,0 +1,340 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.regex.Pattern;
+
+/**
+ * Unit tests for {@link IntersectionSimilarity}.
+ */
+public class IntersectionSimilarityTest {
+    @Test
+    public void testIntersectionUsingSetCharacter() {
+        // Compute using single characters.
+        // This test uses a set and so should not allow duplicates.
+        final IntersectionSimilarity<Character> similarity =
+                new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterSet);
+
+        // Expected:
+        // size A or B = count of unique characters (exclude duplicates)
+        // intersection = count of unique matching characters (exclude duplicates)
+        assertIntersection(similarity, "", "", 0, 0, 0);
+        assertIntersection(similarity, "a", "", 1, 0, 0);
+        assertIntersection(similarity, "a", "a", 1, 1, 1);
+        assertIntersection(similarity, "a", "b", 1, 1, 0);
+        assertIntersection(similarity, "aa", "ab", 1, 2, 1);
+        assertIntersection(similarity, "ab", "ab", 2, 2, 2);
+        assertIntersection(similarity, "aaba", "abaa", 2, 2, 2);
+        assertIntersection(similarity, "aaaa", "aa", 1, 1, 1);
+        assertIntersection(similarity, "aa", "aaaa", 1, 1, 1);
+        assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1);
+        assertIntersection(similarity, "aabab", "ababa", 2, 2, 2);
+        assertIntersection(similarity, "the same", "the same", 7, 7, 7);
+        assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11);
+    }
+
+    @Test
+    public void testIntersectionUsingListCharacter() {
+        // Compute using single characters.
+        // This test uses a list and so duplicates should be matched.
+        final IntersectionSimilarity<Character> similarity =
+                new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterList);
+
+        // Expected:
+        // size A or B = sequence length
+        // intersection = count of matching characters (include duplicates)
+        assertIntersection(similarity, "", "", 0, 0, 0);
+        assertIntersection(similarity, "a", "", 1, 0, 0);
+        assertIntersection(similarity, "a", "a", 1, 1, 1);
+        assertIntersection(similarity, "a", "b", 1, 1, 0);
+        assertIntersection(similarity, "aa", "ab", 2, 2, 1);
+        assertIntersection(similarity, "ab", "ab", 2, 2, 2);
+        assertIntersection(similarity, "aaba", "abaa", 4, 4, 4);
+        assertIntersection(similarity, "aaaa", "aa", 4, 2, 2);
+        assertIntersection(similarity, "aa", "aaaa", 2, 4, 2);
+        assertIntersection(similarity, "aaaa", "aaa", 4, 3, 3);
+        assertIntersection(similarity, "aabab", "ababa", 5, 5, 5);
+        assertIntersection(similarity, "the same", "the same", 8, 8, 8);
+        assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11);
+    }
+
+    @Test
+    public void testIntersectionUsingSetBigrams() {
+        // Compute using pairs of characters (bigrams).
+        // This can be done using a 32-bit int to store two 16-bit characters.
+        // This test uses a set and so should not allow duplicates.
+        final IntersectionSimilarity<Integer> similarity =
+                new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramSet);
+
+        // Expected:
+        // size A or B = count of unique bigrams (exclude duplicates)
+        // intersection = count of unique matching bigrams (exclude duplicates)
+        assertIntersection(similarity, "", "", 0, 0, 0);
+        assertIntersection(similarity, "a", "", 0, 0, 0);
+        assertIntersection(similarity, "a", "a", 0, 0, 0);
+        assertIntersection(similarity, "a", "b", 0, 0, 0);
+        assertIntersection(similarity, "aa", "ab", 1, 1, 0);
+        assertIntersection(similarity, "ab", "ab", 1, 1, 1);
+        assertIntersection(similarity, "aaba", "abaa", 3, 3, 3);
+        assertIntersection(similarity, "aaaa", "aa", 1, 1, 1);
+        assertIntersection(similarity, "aa", "aaaa", 1, 1, 1);
+        assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1);
+        assertIntersection(similarity, "aabab", "ababa", 3, 2, 2);
+        assertIntersection(similarity, "the same", "the same", 7, 7, 7);
+        assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8);
+    }
+
+    @Test
+    public void testIntersectionUsingListBigrams() {
+        // Compute using pairs of characters (bigrams).
+        // This can be done using a 32-bit int to store two 16-bit characters.
+        // This test uses a list and so duplicates should be matched.
+        final IntersectionSimilarity<Integer> similarity =
+                new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramList);
+
+        // Expected:
+        // size A or B = sequence length - 1
+        // intersection = count of matching bigrams (include duplicates)
+        assertIntersection(similarity, "", "", 0, 0, 0);
+        assertIntersection(similarity, "a", "", 0, 0, 0);
+        assertIntersection(similarity, "a", "a", 0, 0, 0);
+        assertIntersection(similarity, "a", "b", 0, 0, 0);
+        assertIntersection(similarity, "aa", "ab", 1, 1, 0);
+        assertIntersection(similarity, "ab", "ab", 1, 1, 1);
+        assertIntersection(similarity, "aaba", "abaa", 3, 3, 3);
+        assertIntersection(similarity, "aaaa", "aa", 3, 1, 1);
+        assertIntersection(similarity, "aa", "aaaa", 1, 3, 1);
+        assertIntersection(similarity, "aaaa", "aaa", 3, 2, 2);
+        assertIntersection(similarity, "aabab", "ababa", 4, 4, 3);
+        assertIntersection(similarity, "the same", "the same", 7, 7, 7);
+        assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8);
+    }
+
+    @Test
+    public void testIntersectionUsingSetCharacterListCharacter() {
+        // Compute using a custom converter that returns a Set and a List.
+        // This is an edge-case test.
+        final HashMap<CharSequence, Collection<Character>> converter = new HashMap<>();
+        final String sequence1 = "aabbccdd";
+        final String sequence2 = "aaaaaabbbfffff";
+        converter.put(sequence1, toCharacterSet(sequence1));
+        converter.put(sequence2, toCharacterList(sequence2));
+        final IntersectionSimilarity<Character> similarity =
+                new IntersectionSimilarity<>(converter::get);
+
+        // Expected:
+        // size A = count of unique characters (exclude duplicates)
+        // size B = sequence length
+        // intersection = count of matching characters (exclude duplicates)
+        assertIntersection(similarity, sequence1, sequence2, 4, sequence2.length(), 2);
+        assertIntersection(similarity, sequence2, sequence1, sequence2.length(), 4, 2);
+    }
+
+    /**
+     * Convert the {@link CharSequence} to a {@link Set} of {@link Character}s.
+     *
+     * @param sequence the sequence
+     * @return the set
+     */
+    private static Set<Character> toCharacterSet(CharSequence sequence) {
+        final int length = sequence.length();
+        final Set<Character> set = new HashSet<>(length);
+        for (int i = 0; i < length; i++) {
+            set.add(sequence.charAt(i));
+        }
+        return set;
+    }
+
+    /**
+     * Convert the {@link CharSequence} to a {@link List} of {@link Character}s.
+     *
+     * @param sequence the sequence
+     * @return the list
+     */
+    private static List<Character> toCharacterList(CharSequence sequence) {
+        final int length = sequence.length();
+        final List<Character> list = new ArrayList<>(length);
+        for (int i = 0; i < length; i++) {
+            list.add(sequence.charAt(i));
+        }
+        return list;
+    }
+
+    /**
+     * Convert the {@link CharSequence} to a {@link Set} of bigrams (pairs of characters).
+     * These are represented using 2 16-bit chars packed into a 32-bit int.
+     *
+     * @param sequence the sequence
+     * @return the set
+     */
+    private static Set<Integer> toBigramSet(CharSequence sequence) {
+        final int length = sequence.length();
+        final Set<Integer> set = new HashSet<>(length);
+        if (length > 1) {
+            char ch2 = sequence.charAt(0);
+            for (int i = 1; i < length; i++) {
+                final char ch1 = ch2;
+                ch2 = sequence.charAt(i);
+                set.add(Integer.valueOf((ch1 << 16) | ch2));
+            }
+        }
+        return set;
+    }
+
+    /**
+     * Convert the {@link CharSequence} to a {@link List} of bigrams (pairs of characters).
+     * These are represented using 2 16-bit chars packed into a 32-bit int.
+     *
+     * @param sequence the sequence
+     * @return the list
+     */
+    private static List<Integer> toBigramList(CharSequence sequence) {
+        final int length = sequence.length();
+        final List<Integer> list = new ArrayList<>(length);
+        if (length > 1) {
+            char ch2 = sequence.charAt(0);
+            for (int i = 1; i < length; i++) {
+                final char ch1 = ch2;
+                ch2 = sequence.charAt(i);
+                list.add(Integer.valueOf((ch1 << 16) | ch2));
+            }
+        }
+        return list;
+    }
+
+    private static <T> void assertIntersection(IntersectionSimilarity<T> similarity, CharSequence cs1, CharSequence cs2,
+            int sizeA, int sizeB, int intersection) {
+        final IntersectionResult result = similarity.apply(cs1, cs2);
+        assertEquals(sizeA, result.getSizeA(), "Size A error");
+        assertEquals(sizeB, result.getSizeB(), "Size B error");
+        assertEquals(intersection, result.getIntersection(), "Intersection error");
+    }
+
+    @Test
+    public void testF1ScoreUsingListWordBigrams() {
+        // Example of a word letter pairs algorithm by Simon White:
+        // http://www.catalysoft.com/articles/StrikeAMatch.html
+        // This splits into words using whitespace and then computes uppercase
+        // bigrams for each word.
+
+        // Split on whitespace
+        final Pattern pattern = Pattern.compile("\\s+");
+
+        // Compute using pairs of characters (bigrams) for each word.
+        // This can be done using a 32-bit int to store two 16-bit characters
+        final Function<CharSequence, Collection<Integer>> converter = cs -> {
+            final List<Integer> set = new ArrayList<>();
+            for (String word : pattern.split(cs)) {
+                if (word.length() > 1) {
+                    // The strings are converted to upper case
+                    char ch2 = Character.toUpperCase(word.charAt(0));
+                    for (int i = 1; i < word.length(); i++) {
+                        final char ch1 = ch2;
+                        ch2 = Character.toUpperCase(word.charAt(i));
+                        set.add(Integer.valueOf((ch1 << 16) | ch2));
+                    }
+                }
+            }
+            return set;
+        };
+        final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(converter);
+
+        String bookTitle;
+        final String search1 = "Web Database Applications";
+        final String search2 = "PHP Web Applications";
+        final String search3 = "Web Aplications";
+        bookTitle = "Web Database Applications with PHP & MySQL";
+        assertEquals(82, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+        assertEquals(68, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+        assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+        bookTitle = "Creating Database Web Applications with PHP and ASP";
+        assertEquals(71, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+        assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+        assertEquals(50, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+        bookTitle = "Building Database Applications on the Web Using PHP3";
+        assertEquals(70, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+        assertEquals(58, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+        assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+        bookTitle = "Building Web Database Applications with Visual Studio 6";
+        assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+        assertEquals(47, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+        assertEquals(46, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+        bookTitle = "Web Application Development With PHP";
+        assertEquals(51, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+        assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+        assertEquals(56, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+        bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection";
+        assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+        assertEquals(34, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+        assertEquals(32, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+        bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing";
+        assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+        assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+        assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+        bookTitle = "How to Find a Scholarship Online";
+        assertEquals(10, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+        assertEquals(11, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+        assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+    }
+
+    private static int toF1ScorePercent(IntersectionResult result) {
+        final double value = 2.0 * result.getIntersection() / (result.getSizeA() + result.getSizeB());
+        // Convert to percentage
+        return (int) Math.round(value * 100);
+    }
+
+    @Test
+    public void testConstructorWithNullConverterThrows() {
+        assertThatIllegalArgumentException().isThrownBy(() -> {
+            new IntersectionSimilarity<>(null);
+        });
+    }
+
+    @Test
+    public void testApplyNullNull() {
+        assertThatIllegalArgumentException().isThrownBy(() -> {
+            new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, null);
+        });
+    }
+
+    @Test
+    public void testApplyStringNull() {
+        assertThatIllegalArgumentException().isThrownBy(() -> {
+            new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply("left", null);
+        });
+    }
+
+    @Test
+    public void testApplyNullString() {
+        assertThatIllegalArgumentException().isThrownBy(() -> {
+            new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, "right");
+        });
+    }
+}