You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@commons.apache.org by ah...@apache.org on 2019/03/08 16:12:33 UTC

[commons-text] branch master updated (19df20d -> bf2f234)

This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/commons-text.git.


    from 19df20d  TEXT-156: Update changes.xml
     new d768027  TEXT-157: Remove rounding from JaccardSimilarity and Distance
     new eacfa36  TEXT-157: Use expected=(intersect/union) in Jaccard tests
     new bf2f234  Merge branch 'improvement-TEXT-157'

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 src/changes/changes.xml                            |  1 +
 .../commons/text/similarity/JaccardDistance.java   |  2 +-
 .../commons/text/similarity/JaccardSimilarity.java |  2 +-
 .../text/similarity/JaccardDistanceTest.java       | 31 +++++++++++-----------
 .../text/similarity/JaccardSimilarityTest.java     | 31 +++++++++++-----------
 5 files changed, 35 insertions(+), 32 deletions(-)


[commons-text] 01/03: TEXT-157: Remove rounding from JaccardSimilarity and Distance

Posted by ah...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-text.git

commit d768027dff94cb149788f49515d28b4aa10cdc33
Author: aherbert <a....@sussex.ac.uk>
AuthorDate: Fri Mar 8 11:47:25 2019 +0000

    TEXT-157: Remove rounding from JaccardSimilarity and Distance
---
 src/changes/changes.xml                            |  1 +
 .../commons/text/similarity/JaccardDistance.java   |  2 +-
 .../commons/text/similarity/JaccardSimilarity.java |  2 +-
 .../text/similarity/JaccardDistanceTest.java       | 32 ++++++++++++----------
 .../text/similarity/JaccardSimilarityTest.java     | 32 ++++++++++++----------
 5 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 1cb5391..82484c1 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -54,6 +54,7 @@ The <action> type attribute can be add,update,fix,remove.
     <action issue="TEXT-152" type="add" dev="" due-to="@CAPS50">Fix possible infinite loop in WordUtils.wrap for a regex pattern that would trigger on a match of 0 length</action>
     <action issue="TEXT-153" type="update" dev="" due-to="amirhadadi">Make prefixSet in LookupTranslator a BitSet</action>
     <action issue="TEXT-156" type="update" dev="aherbert">Fix the RegexTokenizer to use a static Pattern</action>
+    <action issue="TEXT-157" type="update" dev="aherbert">Remove rounding from JaccardDistance and JaccardSimilarity</action>
   </release>
 
   <release version="1.6" date="2018-10-12" description="Release 1.6">
diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java b/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java
index 38e4548..fe956b2 100644
--- a/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java
+++ b/src/main/java/org/apache/commons/text/similarity/JaccardDistance.java
@@ -50,6 +50,6 @@ public class JaccardDistance implements EditDistance<Double> {
         if (left == null || right == null) {
             throw new IllegalArgumentException("Input cannot be null");
         }
-        return Math.round((1 - jaccardSimilarity.apply(left, right)) * 100d) / 100d;
+        return 1.0 - jaccardSimilarity.apply(left, right).doubleValue();
     }
 }
diff --git a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
index 2e88dd2..d1478cb 100644
--- a/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
+++ b/src/main/java/org/apache/commons/text/similarity/JaccardSimilarity.java
@@ -48,7 +48,7 @@ public class JaccardSimilarity implements SimilarityScore<Double> {
         if (left == null || right == null) {
             throw new IllegalArgumentException("Input cannot be null");
         }
-        return Math.round(calculateJaccardSimilarity(left, right) * 100d) / 100d;
+        return calculateJaccardSimilarity(left, right);
     }
 
     /**
diff --git a/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java
index 56d4909..595c83b 100644
--- a/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java
@@ -36,21 +36,23 @@ public class JaccardDistanceTest {
 
     @Test
     public void testGettingJaccardDistance() {
-        assertEquals(1.00d, classBeingTested.apply("", ""), 0.00000000000000000001d);
-        assertEquals(1.00d, classBeingTested.apply("left", ""), 0.00000000000000000001d);
-        assertEquals(1.00d, classBeingTested.apply("", "right"), 0.00000000000000000001d);
-        assertEquals(0.25d, classBeingTested.apply("frog", "fog"), 0.00000000000000000001d);
-        assertEquals(1.00d, classBeingTested.apply("fly", "ant"), 0.00000000000000000001d);
-        assertEquals(0.78d, classBeingTested.apply("elephant", "hippo"), 0.00000000000000000001d);
-        assertEquals(0.36d, classBeingTested.apply("ABC Corporation", "ABC Corp"), 0.00000000000000000001d);
-        assertEquals(0.24d, classBeingTested.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."),
-                0.00000000000000000001d);
-        assertEquals(0.11d, classBeingTested.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"),
-                0.00000000000000000001d);
-        assertEquals(0.10d, classBeingTested.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00000000000000000001d);
-        assertEquals(0.87d, classBeingTested.apply("left", "right"), 0.00000000000000000001d);
-        assertEquals(0.87d, classBeingTested.apply("leettteft", "ritttght"), 0.00000000000000000001d);
-        assertEquals(0.0d, classBeingTested.apply("the same string", "the same string"), 0.00000000000000000001d);
+        // Results generated using the python distance library using:
+        // distance.jaccard(seq1, seq2)
+        assertEquals(1.0, classBeingTested.apply("", ""));
+        assertEquals(1.0, classBeingTested.apply("left", ""));
+        assertEquals(1.0, classBeingTested.apply("", "right"));
+        assertEquals(0.25, classBeingTested.apply("frog", "fog"));
+        assertEquals(1.0, classBeingTested.apply("fly", "ant"));
+        assertEquals(0.7777777777777778, classBeingTested.apply("elephant", "hippo"));
+        assertEquals(0.36363636363636365, classBeingTested.apply("ABC Corporation", "ABC Corp"));
+        assertEquals(0.23529411764705888,
+                classBeingTested.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."));
+        assertEquals(0.11111111111111116,
+                classBeingTested.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"));
+        assertEquals(0.09999999999999998, classBeingTested.apply("PENNSYLVANIA", "PENNCISYLVNIA"));
+        assertEquals(0.875, classBeingTested.apply("left", "right"));
+        assertEquals(0.875, classBeingTested.apply("leettteft", "ritttght"));
+        assertEquals(0.0, classBeingTested.apply("the same string", "the same string"));
     }
 
     @Test
diff --git a/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java
index c96d6d3..96e0908 100644
--- a/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java
@@ -36,21 +36,23 @@ public class JaccardSimilarityTest {
 
     @Test
     public void testGettingJaccardSimilarity() {
-        assertEquals(0.00d, classBeingTested.apply("", ""), 0.00000000000000000001d);
-        assertEquals(0.00d, classBeingTested.apply("left", ""), 0.00000000000000000001d);
-        assertEquals(0.00d, classBeingTested.apply("", "right"), 0.00000000000000000001d);
-        assertEquals(0.75d, classBeingTested.apply("frog", "fog"), 0.00000000000000000001d);
-        assertEquals(0.00d, classBeingTested.apply("fly", "ant"), 0.00000000000000000001d);
-        assertEquals(0.22d, classBeingTested.apply("elephant", "hippo"), 0.00000000000000000001d);
-        assertEquals(0.64d, classBeingTested.apply("ABC Corporation", "ABC Corp"), 0.00000000000000000001d);
-        assertEquals(0.76d, classBeingTested.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."),
-                0.00000000000000000001d);
-        assertEquals(0.89d, classBeingTested.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"),
-                0.00000000000000000001d);
-        assertEquals(0.9d, classBeingTested.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00000000000000000001d);
-        assertEquals(0.13d, classBeingTested.apply("left", "right"), 0.00000000000000000001d);
-        assertEquals(0.13d, classBeingTested.apply("leettteft", "ritttght"), 0.00000000000000000001d);
-        assertEquals(1.0d, classBeingTested.apply("the same string", "the same string"), 0.00000000000000000001d);
+        // Results generated using the python distance library using:
+        // 1 - distance.jaccard(seq1, seq2)
+        assertEquals(0.0, classBeingTested.apply("", ""));
+        assertEquals(0.0, classBeingTested.apply("left", ""));
+        assertEquals(0.0, classBeingTested.apply("", "right"));
+        assertEquals(0.75, classBeingTested.apply("frog", "fog"));
+        assertEquals(0.0, classBeingTested.apply("fly", "ant"));
+        assertEquals(0.2222222222222222, classBeingTested.apply("elephant", "hippo"));
+        assertEquals(0.6363636363636364, classBeingTested.apply("ABC Corporation", "ABC Corp"));
+        assertEquals(0.7647058823529411,
+                classBeingTested.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."));
+        assertEquals(0.8888888888888888,
+                classBeingTested.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"));
+        assertEquals(0.9, classBeingTested.apply("PENNSYLVANIA", "PENNCISYLVNIA"));
+        assertEquals(0.125, classBeingTested.apply("left", "right"));
+        assertEquals(0.125, classBeingTested.apply("leettteft", "ritttght"));
+        assertEquals(1.0, classBeingTested.apply("the same string", "the same string"));
     }
 
     @Test


[commons-text] 02/03: TEXT-157: Use expected=(intersect/union) in Jaccard tests

Posted by ah...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-text.git

commit eacfa3672ad6bb3a65c9268f9cd3c8b36be12149
Author: aherbert <a....@sussex.ac.uk>
AuthorDate: Fri Mar 8 12:32:16 2019 +0000

    TEXT-157: Use expected=(intersect/union) in Jaccard tests
---
 .../commons/text/similarity/JaccardDistanceTest.java  | 19 +++++++++----------
 .../text/similarity/JaccardSimilarityTest.java        | 19 +++++++++----------
 2 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java
index 595c83b..979354f 100644
--- a/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/JaccardDistanceTest.java
@@ -36,22 +36,21 @@ public class JaccardDistanceTest {
 
     @Test
     public void testGettingJaccardDistance() {
-        // Results generated using the python distance library using:
-        // distance.jaccard(seq1, seq2)
+        // Expected Jaccard distance = 1.0 - (intersect / union)
         assertEquals(1.0, classBeingTested.apply("", ""));
         assertEquals(1.0, classBeingTested.apply("left", ""));
         assertEquals(1.0, classBeingTested.apply("", "right"));
-        assertEquals(0.25, classBeingTested.apply("frog", "fog"));
+        assertEquals(1.0 - (3.0 / 4), classBeingTested.apply("frog", "fog"));
         assertEquals(1.0, classBeingTested.apply("fly", "ant"));
-        assertEquals(0.7777777777777778, classBeingTested.apply("elephant", "hippo"));
-        assertEquals(0.36363636363636365, classBeingTested.apply("ABC Corporation", "ABC Corp"));
-        assertEquals(0.23529411764705888,
+        assertEquals(1.0 - (2.0 / 9), classBeingTested.apply("elephant", "hippo"));
+        assertEquals(1.0 - (7.0 / 11), classBeingTested.apply("ABC Corporation", "ABC Corp"));
+        assertEquals(1.0 - (13.0 / 17),
                 classBeingTested.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."));
-        assertEquals(0.11111111111111116,
+        assertEquals(1.0 - (16.0 / 18),
                 classBeingTested.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"));
-        assertEquals(0.09999999999999998, classBeingTested.apply("PENNSYLVANIA", "PENNCISYLVNIA"));
-        assertEquals(0.875, classBeingTested.apply("left", "right"));
-        assertEquals(0.875, classBeingTested.apply("leettteft", "ritttght"));
+        assertEquals(1.0 - (9.0 / 10), classBeingTested.apply("PENNSYLVANIA", "PENNCISYLVNIA"));
+        assertEquals(1.0 - (1.0 / 8), classBeingTested.apply("left", "right"));
+        assertEquals(1.0 - (1.0 / 8), classBeingTested.apply("leettteft", "ritttght"));
         assertEquals(0.0, classBeingTested.apply("the same string", "the same string"));
     }
 
diff --git a/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java b/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java
index 96e0908..bb46122 100644
--- a/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/JaccardSimilarityTest.java
@@ -36,22 +36,21 @@ public class JaccardSimilarityTest {
 
     @Test
     public void testGettingJaccardSimilarity() {
-        // Results generated using the python distance library using:
-        // 1 - distance.jaccard(seq1, seq2)
+        // Expected Jaccard similarity = (intersect / union)
         assertEquals(0.0, classBeingTested.apply("", ""));
         assertEquals(0.0, classBeingTested.apply("left", ""));
         assertEquals(0.0, classBeingTested.apply("", "right"));
-        assertEquals(0.75, classBeingTested.apply("frog", "fog"));
+        assertEquals(3.0 / 4, classBeingTested.apply("frog", "fog"));
         assertEquals(0.0, classBeingTested.apply("fly", "ant"));
-        assertEquals(0.2222222222222222, classBeingTested.apply("elephant", "hippo"));
-        assertEquals(0.6363636363636364, classBeingTested.apply("ABC Corporation", "ABC Corp"));
-        assertEquals(0.7647058823529411,
+        assertEquals(2.0 / 9, classBeingTested.apply("elephant", "hippo"));
+        assertEquals(7.0 / 11, classBeingTested.apply("ABC Corporation", "ABC Corp"));
+        assertEquals(13.0 / 17,
                 classBeingTested.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."));
-        assertEquals(0.8888888888888888,
+        assertEquals(16.0 / 18,
                 classBeingTested.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"));
-        assertEquals(0.9, classBeingTested.apply("PENNSYLVANIA", "PENNCISYLVNIA"));
-        assertEquals(0.125, classBeingTested.apply("left", "right"));
-        assertEquals(0.125, classBeingTested.apply("leettteft", "ritttght"));
+        assertEquals(9.0 / 10, classBeingTested.apply("PENNSYLVANIA", "PENNCISYLVNIA"));
+        assertEquals(1.0 / 8, classBeingTested.apply("left", "right"));
+        assertEquals(1.0 / 8, classBeingTested.apply("leettteft", "ritttght"));
         assertEquals(1.0, classBeingTested.apply("the same string", "the same string"));
     }
 


[commons-text] 03/03: Merge branch 'improvement-TEXT-157'

Posted by ah...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-text.git

commit bf2f234982a32355f73624dced6397c2ed98bf42
Merge: 19df20d eacfa36
Author: aherbert <a....@sussex.ac.uk>
AuthorDate: Fri Mar 8 16:10:57 2019 +0000

    Merge branch 'improvement-TEXT-157'
    
    Closes #111

 src/changes/changes.xml                            |  1 +
 .../commons/text/similarity/JaccardDistance.java   |  2 +-
 .../commons/text/similarity/JaccardSimilarity.java |  2 +-
 .../text/similarity/JaccardDistanceTest.java       | 31 +++++++++++-----------
 .../text/similarity/JaccardSimilarityTest.java     | 31 +++++++++++-----------
 5 files changed, 35 insertions(+), 32 deletions(-)