You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ah...@apache.org on 2019/03/13 14:47:11 UTC

[commons-text] branch master updated (ea15c5a -> a6f2b44)

This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/commons-text.git.


    from ea15c5a  TEXT-151: changes.xml entry
     new 639e4ab  TEXT-155: Add a generic IntersectionSimilarity measure
     new 89b5f4a  Merge branch 'feature-TEXT-155'
     new a6f2b44  TEXT-155: Update changes.xml

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/changes/changes.xml                            |   1 +
 .../text/similarity/IntersectionResult.java        | 118 +++++++
 .../text/similarity/IntersectionSimilarity.java    | 237 ++++++++++++++
 .../text/similarity/IntersectionResultTest.java    | 153 ++++++++++
 .../similarity/IntersectionSimilarityTest.java     | 340 +++++++++++++++++++++
 5 files changed, 849 insertions(+)
 create mode 100644 src/main/java/org/apache/commons/text/similarity/IntersectionResult.java
 create mode 100644 src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java
 create mode 100644 src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java
 create mode 100644 src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java


[commons-text] 01/03: TEXT-155: Add a generic IntersectionSimilarity measure

Posted by ah...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-text.git

commit 639e4abb7fa57319311277753eb9f6aabfa2cf30
Author: Alex Herbert <ah...@apache.org>
AuthorDate: Sun Mar 10 13:22:14 2019 +0000

    TEXT-155: Add a generic IntersectionSimilarity measure
---
 .../text/similarity/IntersectionResult.java        | 118 +++++++
 .../text/similarity/IntersectionSimilarity.java    | 237 ++++++++++++++
 .../text/similarity/IntersectionResultTest.java    | 153 ++++++++++
 .../similarity/IntersectionSimilarityTest.java     | 340 +++++++++++++++++++++
 4 files changed, 848 insertions(+)

diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java
new file mode 100644
index 0000000..1034c69
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/IntersectionResult.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.Objects;
+
+/**
+ * Represents the intersection result between two sets.
+ *
+ * <p>Stores the size of set A, set B and the intersection of A and B
+ * (<code>|A &#8745; B|</code>).</p>
+ *
+ * <p>This class is immutable.</p>
+ *
+ * @since 1.7
+ * @see <a href="https://en.wikipedia.org/wiki/Intersection_(set_theory)">Intersection</a>
+ */
+public class IntersectionResult {
+    /**
+     * The size of set A.
+     */
+    private final int sizeA;
+    /**
+     * The size of set B.
+     */
+    private final int sizeB;
+    /**
+     * The size of the intersection between set A and B.
+     */
+    private final int intersection;
+
+    /**
+     * Create the results for an intersection between two sets.
+     *
+     * @param sizeA the size of set A ({@code |A|})
+     * @param sizeB the size of set B ({@code |B|})
+     * @param intersection the size of the intersection of A and B (<code>|A &#8745; B|</code>)
+     * @throws IllegalArgumentException if the sizes are negative or the intersection is greater
+     * than the minimum of the two set sizes
+     */
+    public IntersectionResult(final int sizeA, final int sizeB, final int intersection) {
+        if (sizeA < 0) {
+            throw new IllegalArgumentException("Set size |A| is not positive: " + sizeA);
+        }
+        if (sizeB < 0) {
+            throw new IllegalArgumentException("Set size |B| is not positive: " + sizeB);
+        }
+        if (intersection < 0 || intersection > Math.min(sizeA, sizeB)) {
+            throw new IllegalArgumentException("Invalid intersection of |A| and |B|: " + intersection);
+        }
+        this.sizeA = sizeA;
+        this.sizeB = sizeB;
+        this.intersection = intersection;
+    }
+
+    /**
+     * Get the size of set A.
+     *
+     * @return |A|
+     */
+    public int getSizeA() {
+        return sizeA;
+    }
+
+    /**
+     * Get the size of set B.
+     *
+     * @return |B|
+     */
+    public int getSizeB() {
+        return sizeB;
+    }
+
+    /**
+     * Get the size of the intersection between set A and B.
+     *
+     * @return <code>|A &#8745; B|</code>
+     */
+    public int getIntersection() {
+        return intersection;
+    }
+
+    @Override
+    public boolean equals(final Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        final IntersectionResult result = (IntersectionResult) o;
+        return sizeA == result.sizeA && sizeB == result.sizeB && intersection == result.intersection;
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(sizeA, sizeB, intersection);
+    }
+
+    @Override
+    public String toString() {
+        return "Size A: " + sizeA + ", Size B: " + sizeB + ", Intersection: " + intersection;
+    }
+}
diff --git a/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java b/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java
new file mode 100644
index 0000000..bbdd7b8
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/similarity/IntersectionSimilarity.java
@@ -0,0 +1,237 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.function.Function;
+
+/**
+ * Measures the intersection of two sets created from a pair of character sequences.
+ *
+ * <p>It is assumed that the type {@code T} correctly conforms to the requirements for storage
+ * within a {@link Set} or {@link HashMap}. Ideally the type is immutable and implements
+ * {@link Object#equals(Object)} and {@link Object#hashCode()}.</p>
+ *
+ * @param <T> the type of the elements extracted from the character sequence
+ * @since 1.7
+ * @see Set
+ * @see HashMap
+ */
+public class IntersectionSimilarity<T> implements SimilarityScore<IntersectionResult> {
+    /** The converter used to create the elements from the characters. */
+    private final Function<CharSequence, Collection<T>> converter;
+
+    // The following is adapted from commons-collections for a Bag.
+    // A Bag is a collection that can store the count of the number
+    // of copies of each element.
+
+    /**
+     * Mutable counter class for storing the count of elements.
+     */
+    private static class BagCount {
+        /** The count. This is initialised to 1 upon construction. */
+        int count = 1;
+    }
+
+    /**
+     * A minimal implementation of a Bag that can store elements and a count.
+     *
+     * <p>For the intended purpose the Bag does not have to be a {@link Collection}. It does not
+     * even have to know its own size.
+     */
+    private class TinyBag {
+        /** The backing map. */
+        private final Map<T, BagCount> map;
+
+        /**
+         * Create a new tiny bag.
+         *
+         * @param initialCapacity the initial capacity
+         */
+        TinyBag(int initialCapacity) {
+            map = new HashMap<>(initialCapacity);
+        }
+
+        /**
+         * Adds a new element to the bag, incrementing its count in the underlying map.
+         *
+         * @param object the object to add
+         */
+        void add(T object) {
+            final BagCount mut = map.get(object);
+            if (mut == null) {
+                map.put(object, new BagCount());
+            } else {
+                mut.count++;
+            }
+        }
+
+        /**
+         * Returns the number of occurrence of the given element in this bag by
+         * looking up its count in the underlying map.
+         *
+         * @param object the object to search for
+         * @return the number of occurrences of the object, zero if not found
+         */
+        int getCount(final Object object) {
+            final BagCount count = map.get(object);
+            if (count != null) {
+                return count.count;
+            }
+            return 0;
+        }
+
+        /**
+         * Returns a Set view of the mappings contained in this bag.
+         *
+         * @return the Set view
+         */
+        Set<Entry<T, BagCount>> entrySet() {
+            return map.entrySet();
+        }
+
+        /**
+         * Get the number of unique elements in the bag.
+         *
+         * @return the unique element size
+         */
+        int uniqueElementSize() {
+            return map.size();
+        }
+    }
+
+    /**
+     * Create a new intersection similarity using the provided converter.
+     *
+     * <p>If the converter returns a {@link Set} then the intersection result will
+     * not include duplicates. Any other {@link Collection} is used to produce a result
+     * that will include duplicates in the intersect and union.
+     *
+     * @param converter the converter used to create the elements from the characters
+     * @throws IllegalArgumentException if the converter is null
+     */
+    public IntersectionSimilarity(Function<CharSequence, Collection<T>> converter) {
+        if (converter == null) {
+            throw new IllegalArgumentException("Converter must not be null");
+        }
+        this.converter = converter;
+    }
+
+    /**
+     * Calculates the intersection of two character sequences passed as input.
+     *
+     * @param left first character sequence
+     * @param right second character sequence
+     * @return the intersection result
+     * @throws IllegalArgumentException if either input sequence is {@code null}
+     */
+    @Override
+    public IntersectionResult apply(final CharSequence left, final CharSequence right) {
+        if (left == null || right == null) {
+            throw new IllegalArgumentException("Input cannot be null");
+        }
+
+        // Create the elements from the sequences
+        final Collection<T> objectsA = converter.apply(left);
+        final Collection<T> objectsB = converter.apply(right);
+        final int sizeA = objectsA.size();
+        final int sizeB = objectsB.size();
+
+        // Short-cut if either collection is empty
+        if (Math.min(sizeA, sizeB) == 0) {
+            // No intersection
+            return new IntersectionResult(sizeA, sizeB, 0);
+        }
+
+        // Intersection = count the number of shared elements
+        int intersection;
+        if (objectsA instanceof Set && objectsB instanceof Set) {
+            // If a Set then the elements will only have a count of 1.
+            // Iterate over the smaller set.
+            intersection = (sizeA < sizeB)
+                    ? getIntersection((Set<T>) objectsA, (Set<T>) objectsB)
+                    : getIntersection((Set<T>) objectsB, (Set<T>) objectsA);
+        } else  {
+            // Create a bag for each collection
+            final TinyBag bagA = toBag(objectsA);
+            final TinyBag bagB = toBag(objectsB);
+            // Iterate over the smaller number of unique elements
+            intersection = (bagA.uniqueElementSize() < bagB.uniqueElementSize())
+                    ? getIntersection(bagA, bagB)
+                    : getIntersection(bagB, bagA);
+        }
+
+        return new IntersectionResult(sizeA, sizeB, intersection);
+    }
+
+    /**
+     * Convert the collection to a bag. The bag will contain the count of each element
+     * in the collection.
+     *
+     * @param objects the objects
+     * @return the bag
+     */
+    private TinyBag toBag(Collection<T> objects) {
+        final TinyBag bag = new TinyBag(objects.size());
+        for (T t : objects) {
+            bag.add(t);
+        }
+        return bag;
+    }
+
+    /**
+     * Compute the intersection between two sets. This is the count of all the elements
+     * that are within both sets.
+     *
+     * @param <T> the type of the elements in the set
+     * @param setA the set A
+     * @param setB the set B
+     * @return the intersection
+     */
+    private static <T> int getIntersection(Set<T> setA, Set<T> setB) {
+        int intersection = 0;
+        for (T element : setA) {
+            if (setB.contains(element)) {
+                intersection++;
+            }
+        }
+        return intersection;
+    }
+
+    /**
+     * Compute the intersection between two bags. This is the sum of the minimum
+     * count of each element that is within both sets.
+     *
+     * @param bagA the bag A
+     * @param bagB the bag B
+     * @return the intersection
+     */
+    private int getIntersection(TinyBag bagA, TinyBag bagB) {
+        int intersection = 0;
+        for (Entry<T, BagCount> entry : bagA.entrySet()) {
+            final T element = entry.getKey();
+            final int count = entry.getValue().count;
+            // The intersection of this entry in both bags is the minimum count
+            intersection += Math.min(count, bagB.getCount(element));
+        }
+        return intersection;
+    }
+}
diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java
new file mode 100644
index 0000000..c2477e0
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/IntersectionResultTest.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import java.util.HashMap;
+import java.util.concurrent.ThreadLocalRandom;
+
+/**
+ * Unit tests for {@link IntersectionResult}.
+ */
+public class IntersectionResultTest {
+    @Test
+    public void testNewIntersectionResult_WithZeros() {
+        final int sizeA = 0;
+        final int sizeB = 0;
+        final int intersection = 0;
+        new IntersectionResult(sizeA, sizeB, intersection);
+    }
+
+    @Test
+    public void testNewIntersectionResult_WithNegativeSizeA() {
+        final int sizeA = -1;
+        final int sizeB = 0;
+        final int intersection = 0;
+        Assertions.assertThrows(IllegalArgumentException.class, () -> {
+            new IntersectionResult(sizeA, sizeB, intersection);
+        });
+    }
+
+    @Test
+    public void testNewIntersectionResult_WithNegativeSizeB() {
+        final int sizeA = 0;
+        final int sizeB = -1;
+        final int intersection = 0;
+        Assertions.assertThrows(IllegalArgumentException.class, () -> {
+            new IntersectionResult(sizeA, sizeB, intersection);
+        });
+    }
+
+    @Test
+    public void testNewIntersectionResult_WithNegativeIntersection() {
+        final int sizeA = 0;
+        final int sizeB = 0;
+        final int intersection = -1;
+        Assertions.assertThrows(IllegalArgumentException.class, () -> {
+            new IntersectionResult(sizeA, sizeB, intersection);
+        });
+    }
+
+    @Test
+    public void testNewIntersectionResult_WithIntersectionAboveSizeAorB() {
+        final int sizeA = 1;
+        final int sizeB = 2;
+        final int intersection = Math.max(sizeA, sizeB) + 1;
+        Assertions.assertThrows(IllegalArgumentException.class, () -> {
+            new IntersectionResult(sizeA, sizeB, intersection);
+        });
+        Assertions.assertThrows(IllegalArgumentException.class, () -> {
+            new IntersectionResult(sizeB, sizeA, intersection);
+        });
+    }
+
+    @Test
+    public void testProperties() {
+        final ThreadLocalRandom rand = ThreadLocalRandom.current();
+        final int max = 1024;
+        for (int i = 0; i < 5; i++) {
+            // Ensure the min is above 0
+            final int sizeA = rand.nextInt(max) + 1;
+            final int sizeB = rand.nextInt(max) + 1;
+            final int intersection = rand.nextInt(Math.min(sizeA, sizeB));
+            final IntersectionResult result = new IntersectionResult(sizeA, sizeB, intersection);
+            Assertions.assertEquals(sizeA, result.getSizeA());
+            Assertions.assertEquals(sizeB, result.getSizeB());
+            Assertions.assertEquals(intersection, result.getIntersection());
+        }
+    }
+
+    @Test
+    public void testEquals() {
+        final IntersectionResult[] results = new IntersectionResult[] {
+                new IntersectionResult(0, 0, 0),
+                new IntersectionResult(10, 0, 0),
+                new IntersectionResult(10, 10, 0),
+                new IntersectionResult(10, 10, 10),
+        };
+
+        // Test a different instance with same values
+        Assertions.assertTrue(results[0].equals(new IntersectionResult(0, 0, 0)));
+
+        final Object something = new Object();
+        for (int i = 0; i < results.length; i++) {
+            Assertions.assertFalse(results[i].equals(something));
+            Assertions.assertFalse(results[i].equals(null));
+            for (int j = 0; j < results.length; j++) {
+                Assertions.assertTrue(results[i].equals(results[j]) == (i == j));
+            }
+        }
+    }
+
+    @Test
+    public void testHashCode() {
+        final IntersectionResult[] results = new IntersectionResult[] {
+                new IntersectionResult(10, 0, 0),
+                new IntersectionResult(10, 10, 0),
+                new IntersectionResult(10, 10, 10),
+        };
+        final HashMap<IntersectionResult, Integer> map = new HashMap<>();
+        final int offset = 123;
+        for (int i = 0; i < results.length; i++) {
+            map.put(results[i], i + offset);
+        }
+        for (int i = 0; i < results.length; i++) {
+            Assertions.assertEquals(i + offset, map.get(results[i]));
+        }
+    }
+
+    @Test
+    public void testToString() {
+        final ThreadLocalRandom rand = ThreadLocalRandom.current();
+        final int max = 9;
+        for (int i = 0; i < 5; i++) {
+            // Ensure the min is above 0
+            final int sizeA = rand.nextInt(max) + 1;
+            final int sizeB = rand.nextInt(max) + 1;
+            final int intersection = rand.nextInt(Math.min(sizeA, sizeB));
+            final IntersectionResult result = new IntersectionResult(sizeA, sizeB, intersection);
+            final String string = result.toString();
+            // Not perfect as this will match substrings too. The chance of error
+            // is limited by restricting the numbers to a max of 10.
+            Assertions.assertTrue(string.contains(String.valueOf(sizeA)));
+            Assertions.assertTrue(string.contains(String.valueOf(sizeB)));
+            Assertions.assertTrue(string.contains(String.valueOf(intersection)));
+        }
+    }
+}
diff --git a/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java
new file mode 100644
index 0000000..2af8c8e
--- /dev/null
+++ b/src/test/java/org/apache/commons/text/similarity/IntersectionSimilarityTest.java
@@ -0,0 +1,340 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.text.similarity;
+
+import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.regex.Pattern;
+
+/**
+ * Unit tests for {@link IntersectionSimilarity}.
+ */
+public class IntersectionSimilarityTest {
+    @Test
+    public void testIntersectionUsingSetCharacter() {
+        // Compute using single characters.
+        // This test uses a set and so should not allow duplicates.
+        final IntersectionSimilarity<Character> similarity =
+                new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterSet);
+
+        // Expected:
+        // size A or B = count of unique characters (exclude duplicates)
+        // intersection = count of unique matching characters (exclude duplicates)
+        assertIntersection(similarity, "", "", 0, 0, 0);
+        assertIntersection(similarity, "a", "", 1, 0, 0);
+        assertIntersection(similarity, "a", "a", 1, 1, 1);
+        assertIntersection(similarity, "a", "b", 1, 1, 0);
+        assertIntersection(similarity, "aa", "ab", 1, 2, 1);
+        assertIntersection(similarity, "ab", "ab", 2, 2, 2);
+        assertIntersection(similarity, "aaba", "abaa", 2, 2, 2);
+        assertIntersection(similarity, "aaaa", "aa", 1, 1, 1);
+        assertIntersection(similarity, "aa", "aaaa", 1, 1, 1);
+        assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1);
+        assertIntersection(similarity, "aabab", "ababa", 2, 2, 2);
+        assertIntersection(similarity, "the same", "the same", 7, 7, 7);
+        assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11);
+    }
+
+    @Test
+    public void testIntersectionUsingListCharacter() {
+        // Compute using single characters.
+        // This test uses a list and so duplicates should be matched.
+        final IntersectionSimilarity<Character> similarity =
+                new IntersectionSimilarity<>(IntersectionSimilarityTest::toCharacterList);
+
+        // Expected:
+        // size A or B = sequence length
+        // intersection = count of matching characters (include duplicates)
+        assertIntersection(similarity, "", "", 0, 0, 0);
+        assertIntersection(similarity, "a", "", 1, 0, 0);
+        assertIntersection(similarity, "a", "a", 1, 1, 1);
+        assertIntersection(similarity, "a", "b", 1, 1, 0);
+        assertIntersection(similarity, "aa", "ab", 2, 2, 1);
+        assertIntersection(similarity, "ab", "ab", 2, 2, 2);
+        assertIntersection(similarity, "aaba", "abaa", 4, 4, 4);
+        assertIntersection(similarity, "aaaa", "aa", 4, 2, 2);
+        assertIntersection(similarity, "aa", "aaaa", 2, 4, 2);
+        assertIntersection(similarity, "aaaa", "aaa", 4, 3, 3);
+        assertIntersection(similarity, "aabab", "ababa", 5, 5, 5);
+        assertIntersection(similarity, "the same", "the same", 8, 8, 8);
+        assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 13, 13, 11);
+    }
+
+    @Test
+    public void testIntersectionUsingSetBigrams() {
+        // Compute using pairs of characters (bigrams).
+        // This can be done using a 32-bit int to store two 16-bit characters.
+        // This test uses a set and so should not allow duplicates.
+        final IntersectionSimilarity<Integer> similarity =
+                new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramSet);
+
+        // Expected:
+        // size A or B = count of unique bigrams (exclude duplicates)
+        // intersection = count of unique matching bigrams (exclude duplicates)
+        assertIntersection(similarity, "", "", 0, 0, 0);
+        assertIntersection(similarity, "a", "", 0, 0, 0);
+        assertIntersection(similarity, "a", "a", 0, 0, 0);
+        assertIntersection(similarity, "a", "b", 0, 0, 0);
+        assertIntersection(similarity, "aa", "ab", 1, 1, 0);
+        assertIntersection(similarity, "ab", "ab", 1, 1, 1);
+        assertIntersection(similarity, "aaba", "abaa", 3, 3, 3);
+        assertIntersection(similarity, "aaaa", "aa", 1, 1, 1);
+        assertIntersection(similarity, "aa", "aaaa", 1, 1, 1);
+        assertIntersection(similarity, "aaaa", "aaa", 1, 1, 1);
+        assertIntersection(similarity, "aabab", "ababa", 3, 2, 2);
+        assertIntersection(similarity, "the same", "the same", 7, 7, 7);
+        assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8);
+    }
+
+    @Test
+    public void testIntersectionUsingListBigrams() {
+        // Compute using pairs of characters (bigrams).
+        // This can be done using a 32-bit int to store two 16-bit characters.
+        // This test uses a list and so duplicates should be matched.
+        final IntersectionSimilarity<Integer> similarity =
+                new IntersectionSimilarity<>(IntersectionSimilarityTest::toBigramList);
+
+        // Expected:
+        // size A or B = sequence length - 1
+        // intersection = count of matching bigrams (include duplicates)
+        assertIntersection(similarity, "", "", 0, 0, 0);
+        assertIntersection(similarity, "a", "", 0, 0, 0);
+        assertIntersection(similarity, "a", "a", 0, 0, 0);
+        assertIntersection(similarity, "a", "b", 0, 0, 0);
+        assertIntersection(similarity, "aa", "ab", 1, 1, 0);
+        assertIntersection(similarity, "ab", "ab", 1, 1, 1);
+        assertIntersection(similarity, "aaba", "abaa", 3, 3, 3);
+        assertIntersection(similarity, "aaaa", "aa", 3, 1, 1);
+        assertIntersection(similarity, "aa", "aaaa", 1, 3, 1);
+        assertIntersection(similarity, "aaaa", "aaa", 3, 2, 2);
+        assertIntersection(similarity, "aabab", "ababa", 4, 4, 3);
+        assertIntersection(similarity, "the same", "the same", 7, 7, 7);
+        assertIntersection(similarity, "abcdefghijklm", "ab_defg ijklm", 12, 12, 8);
+    }
+
+    @Test
+    public void testIntersectionUsingSetCharacterListCharacter() {
+        // Compute using a custom converter that returns a Set and a List.
+        // This is an edge-case test.
+        final HashMap<CharSequence, Collection<Character>> converter = new HashMap<>();
+        final String sequence1 = "aabbccdd";
+        final String sequence2 = "aaaaaabbbfffff";
+        converter.put(sequence1, toCharacterSet(sequence1));
+        converter.put(sequence2, toCharacterList(sequence2));
+        final IntersectionSimilarity<Character> similarity =
+                new IntersectionSimilarity<>(converter::get);
+
+        // Expected:
+        // size A = count of unique characters (exclude duplicates)
+        // size B = sequence length
+        // intersection = count of matching characters (exclude duplicates)
+        assertIntersection(similarity, sequence1, sequence2, 4, sequence2.length(), 2);
+        assertIntersection(similarity, sequence2, sequence1, sequence2.length(), 4, 2);
+    }
+
+    /**
+     * Convert the {@link CharSequence} to a {@link Set} of {@link Character}s.
+     *
+     * @param sequence the sequence
+     * @return the set
+     */
+    private static Set<Character> toCharacterSet(CharSequence sequence) {
+        final int length = sequence.length();
+        final Set<Character> set = new HashSet<>(length);
+        for (int i = 0; i < length; i++) {
+            set.add(sequence.charAt(i));
+        }
+        return set;
+    }
+
+    /**
+     * Convert the {@link CharSequence} to a {@link List} of {@link Character}s.
+     *
+     * @param sequence the sequence
+     * @return the list
+     */
+    private static List<Character> toCharacterList(CharSequence sequence) {
+        final int length = sequence.length();
+        final List<Character> list = new ArrayList<>(length);
+        for (int i = 0; i < length; i++) {
+            list.add(sequence.charAt(i));
+        }
+        return list;
+    }
+
+    /**
+     * Convert the {@link CharSequence} to a {@link Set} of bigrams (pairs of characters).
+     * These are represented using 2 16-bit chars packed into a 32-bit int.
+     *
+     * @param sequence the sequence
+     * @return the set
+     */
+    private static Set<Integer> toBigramSet(CharSequence sequence) {
+        final int length = sequence.length();
+        final Set<Integer> set = new HashSet<>(length);
+        if (length > 1) {
+            char ch2 = sequence.charAt(0);
+            for (int i = 1; i < length; i++) {
+                final char ch1 = ch2;
+                ch2 = sequence.charAt(i);
+                set.add(Integer.valueOf((ch1 << 16) | ch2));
+            }
+        }
+        return set;
+    }
+
+    /**
+     * Convert the {@link CharSequence} to a {@link List} of bigrams (pairs of characters).
+     * These are represented using 2 16-bit chars packed into a 32-bit int.
+     *
+     * @param sequence the sequence
+     * @return the list
+     */
+    private static List<Integer> toBigramList(CharSequence sequence) {
+        final int length = sequence.length();
+        final List<Integer> list = new ArrayList<>(length);
+        if (length > 1) {
+            char ch2 = sequence.charAt(0);
+            for (int i = 1; i < length; i++) {
+                final char ch1 = ch2;
+                ch2 = sequence.charAt(i);
+                list.add(Integer.valueOf((ch1 << 16) | ch2));
+            }
+        }
+        return list;
+    }
+
+    private static <T> void assertIntersection(IntersectionSimilarity<T> similarity, CharSequence cs1, CharSequence cs2,
+            int sizeA, int sizeB, int intersection) {
+        final IntersectionResult result = similarity.apply(cs1, cs2);
+        assertEquals(sizeA, result.getSizeA(), "Size A error");
+        assertEquals(sizeB, result.getSizeB(), "Size B error");
+        assertEquals(intersection, result.getIntersection(), "Intersection error");
+    }
+
+    @Test
+    public void testF1ScoreUsingListWordBigrams() {
+        // Example of a word letter pairs algorithm by Simon White:
+        // http://www.catalysoft.com/articles/StrikeAMatch.html
+        // This splits into words using whitespace and then computes uppercase
+        // bigrams for each word.
+
+        // Split on whitespace
+        final Pattern pattern = Pattern.compile("\\s+");
+
+        // Compute using pairs of characters (bigrams) for each word.
+        // This can be done using a 32-bit int to store two 16-bit characters
+        final Function<CharSequence, Collection<Integer>> converter = cs -> {
+            final List<Integer> set = new ArrayList<>();
+            for (String word : pattern.split(cs)) {
+                if (word.length() > 1) {
+                    // The strings are converted to upper case
+                    char ch2 = Character.toUpperCase(word.charAt(0));
+                    for (int i = 1; i < word.length(); i++) {
+                        final char ch1 = ch2;
+                        ch2 = Character.toUpperCase(word.charAt(i));
+                        set.add(Integer.valueOf((ch1 << 16) | ch2));
+                    }
+                }
+            }
+            return set;
+        };
+        final IntersectionSimilarity<Integer> similarity = new IntersectionSimilarity<>(converter);
+
+        String bookTitle;
+        final String search1 = "Web Database Applications";
+        final String search2 = "PHP Web Applications";
+        final String search3 = "Web Aplications";
+        bookTitle = "Web Database Applications with PHP & MySQL";
+        assertEquals(82, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+        assertEquals(68, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+        assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+        bookTitle = "Creating Database Web Applications with PHP and ASP";
+        assertEquals(71, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+        assertEquals(59, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+        assertEquals(50, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+        bookTitle = "Building Database Applications on the Web Using PHP3";
+        assertEquals(70, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+        assertEquals(58, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+        assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+        bookTitle = "Building Web Database Applications with Visual Studio 6";
+        assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+        assertEquals(47, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+        assertEquals(46, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+        bookTitle = "Web Application Development With PHP";
+        assertEquals(51, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+        assertEquals(67, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+        assertEquals(56, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+        bookTitle = "WebRAD: Building Database Applications on the Web with Visual FoxPro and Web Connection";
+        assertEquals(49, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+        assertEquals(34, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+        assertEquals(32, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+        bookTitle = "Structural Assessment: The Role of Large and Full-Scale Testing";
+        assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+        assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+        assertEquals(7, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+        bookTitle = "How to Find a Scholarship Online";
+        assertEquals(10, toF1ScorePercent(similarity.apply(bookTitle, search1)));
+        assertEquals(11, toF1ScorePercent(similarity.apply(bookTitle, search2)));
+        assertEquals(12, toF1ScorePercent(similarity.apply(bookTitle, search3)));
+    }
+
+    private static int toF1ScorePercent(IntersectionResult result) {
+        final double value = 2.0 * result.getIntersection() / (result.getSizeA() + result.getSizeB());
+        // Convert to percentage
+        return (int) Math.round(value * 100);
+    }
+
+    @Test
+    public void testConstructorWithNullConverterThrows() {
+        assertThatIllegalArgumentException().isThrownBy(() -> {
+            new IntersectionSimilarity<>(null);
+        });
+    }
+
+    @Test
+    public void testApplyNullNull() {
+        assertThatIllegalArgumentException().isThrownBy(() -> {
+            new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, null);
+        });
+    }
+
+    @Test
+    public void testApplyStringNull() {
+        assertThatIllegalArgumentException().isThrownBy(() -> {
+            new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply("left", null);
+        });
+    }
+
+    @Test
+    public void testApplyNullString() {
+        assertThatIllegalArgumentException().isThrownBy(() -> {
+            new IntersectionSimilarity<>(cs -> new HashSet<>(Arrays.asList(cs))).apply(null, "right");
+        });
+    }
+}


[commons-text] 03/03: TEXT-155: Update changes.xml

Posted by ah...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-text.git

commit a6f2b44e17a8fa11066ec3519411adf5acc2abaa
Author: aherbert <ah...@apache.org>
AuthorDate: Wed Mar 13 14:47:07 2019 +0000

    TEXT-155: Update changes.xml
---
 src/changes/changes.xml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 7cb10e3..c1cfeda 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -56,6 +56,7 @@ The <action> type attribute can be add,update,fix,remove.
     <action issue="TEXT-156" type="update" dev="aherbert">Fix the RegexTokenizer to use a static Pattern</action>
     <action issue="TEXT-157" type="update" dev="aherbert">Remove rounding from JaccardDistance and JaccardSimilarity</action>
     <action issue="TEXT-151" type="fix" dev="aherbert">Fix the JaroWinklerSimilarity to use StringUtils.equals to test for CharSequence equality</action>
+    <action issue="TEXT-155" type="add" dev="aherbert">Add a generic IntersectionSimilarity measure</action>
   </release>
 
   <release version="1.6" date="2018-10-12" description="Release 1.6">


[commons-text] 02/03: Merge branch 'feature-TEXT-155'

Posted by ah...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-text.git

commit 89b5f4ab4ec3aff50fe6040bba8fbbe3da0976b4
Merge: ea15c5a 639e4ab
Author: aherbert <ah...@apache.org>
AuthorDate: Wed Mar 13 14:43:45 2019 +0000

    Merge branch 'feature-TEXT-155'
    
    Closes #109

 .../text/similarity/IntersectionResult.java        | 118 +++++++
 .../text/similarity/IntersectionSimilarity.java    | 237 ++++++++++++++
 .../text/similarity/IntersectionResultTest.java    | 153 ++++++++++
 .../similarity/IntersectionSimilarityTest.java     | 340 +++++++++++++++++++++
 4 files changed, 848 insertions(+)