You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ch...@apache.org on 2018/08/08 16:45:46 UTC

[1/6] [text] Testcase for [TEXT-130] JaroWinklerDistance: Wrong results due to precision of transpositions

Repository: commons-text
Updated Branches:
  refs/heads/master 97d606405 -> 8d756e956


Testcase for [TEXT-130] JaroWinklerDistance: Wrong results due to precision of transpositions


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/70150fba
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/70150fba
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/70150fba

Branch: refs/heads/master
Commit: 70150fba9a0e26d944b0f649265a567309ba3af4
Parents: 802258f
Author: Jan Martin Keil <ja...@uni-jena.de>
Authored: Thu Aug 2 22:51:18 2018 +0200
Committer: Jan Martin Keil <ja...@uni-jena.de>
Committed: Thu Aug 2 22:51:18 2018 +0200

----------------------------------------------------------------------
 .../org/apache/commons/text/similarity/JaroWinklerDistanceTest.java | 1 +
 1 file changed, 1 insertion(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/70150fba/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
index d51135e..478557a 100644
--- a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
@@ -45,6 +45,7 @@ public class JaroWinklerDistanceTest {
                 distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.00001d);
         assertEquals(0.882329d, distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00001d);
         assertEquals(0.996598d, distance.apply("/opt/software1", "/opt/software2"), 0.00001d);
+        assertEquals(0.941666d, distance.apply("aaabcd", "aaacdb"), 0.00001d);
     }
 
     @Test


[4/6] [text] Fix [TEXT-131] JaroWinklerDistance: Calculation deviates from definition

Posted by ch...@apache.org.
Fix [TEXT-131] JaroWinklerDistance: Calculation deviates from definition


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/5d148549
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/5d148549
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/5d148549

Branch: refs/heads/master
Commit: 5d148549bc6ea8501016856547e27aed58b116c3
Parents: 4546f45
Author: Jan Martin Keil <ja...@uni-jena.de>
Authored: Thu Aug 2 23:20:21 2018 +0200
Committer: Jan Martin Keil <ja...@uni-jena.de>
Committed: Thu Aug 2 23:22:33 2018 +0200

----------------------------------------------------------------------
 .../commons/text/similarity/JaroWinklerDistance.java      | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/5d148549/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
index 915cd5c..74ea4f7 100644
--- a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
+++ b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
@@ -86,16 +86,16 @@ public class JaroWinklerDistance implements SimilarityScore<Double> {
             return 0D;
         }
         final double j = ((m / left.length() + m / right.length() + (m - (double) mtp[1] / 2) / m)) / 3;
-        final double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / mtp[3]) * mtp[2] * (1D - j);
+        final double jw = j < 0.7D ? j : j + defaultScalingFactor * mtp[2] * (1D - j);
         return jw;
     }
 
     /**
-     * This method returns the Jaro-Winkler string matches, half transpositions, prefix, max array.
+     * This method returns the Jaro-Winkler string matches, half transpositions, prefix array.
      *
      * @param first the first string to be matched
      * @param second the second string to be matched
-     * @return mtp array containing: matches, half transpositions, prefix, and max length
+     * @return mtp array containing: matches, half transpositions, and prefix
      */
     protected static int[] matches(final CharSequence first, final CharSequence second) {
         CharSequence max, min;
@@ -143,14 +143,14 @@ public class JaroWinklerDistance implements SimilarityScore<Double> {
             }
         }
         int prefix = 0;
-        for (int mi = 0; mi < min.length(); mi++) {
+        for (int mi = 0; mi < Math.min(4, min.length()); mi++) {
             if (first.charAt(mi) == second.charAt(mi)) {
                 prefix++;
             } else {
                 break;
             }
         }
-        return new int[] {matches, halfTranspositions, prefix, max.length()};
+        return new int[] {matches, halfTranspositions, prefix};
     }
 
 }


[3/6] [text] Testcases for [TEXT-131] JaroWinklerDistance: Calculation deviates from definition

Posted by ch...@apache.org.
Testcases for [TEXT-131] JaroWinklerDistance: Calculation deviates from definition


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/4546f45c
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/4546f45c
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/4546f45c

Branch: refs/heads/master
Commit: 4546f45c7ed610b94336b7a60592ac77382f6fdb
Parents: 4d064de
Author: Jan Martin Keil <ja...@uni-jena.de>
Authored: Thu Aug 2 23:04:32 2018 +0200
Committer: Jan Martin Keil <ja...@uni-jena.de>
Committed: Thu Aug 2 23:19:09 2018 +0200

----------------------------------------------------------------------
 .../commons/text/similarity/JaroWinklerDistanceTest.java | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/4546f45c/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
index 478557a..d6bfda0 100644
--- a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
@@ -39,13 +39,14 @@ public class JaroWinklerDistanceTest {
         assertEquals(0.92499d, distance.apply("frog", "fog"), 0.00001d);
         assertEquals(0.0d, distance.apply("fly", "ant"), 0.00000000000000000001d);
         assertEquals(0.44166d, distance.apply("elephant", "hippo"), 0.00001d);
-        assertEquals(0.92740d, distance.apply("ABC Corporation", "ABC Corp"), 0.00001d);
-        assertEquals(0.94580d, distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.00001d);
-        assertEquals(0.921458d,
+        assertEquals(0.90666d, distance.apply("ABC Corporation", "ABC Corp"), 0.00001d);
+        assertEquals(0.95251d, distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.00001d);
+        assertEquals(0.942d,
                 distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.00001d);
-        assertEquals(0.882329d, distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00001d);
-        assertEquals(0.996598d, distance.apply("/opt/software1", "/opt/software2"), 0.00001d);
+        assertEquals(0.898018d, distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00001d);
+        assertEquals(0.971428d, distance.apply("/opt/software1", "/opt/software2"), 0.00001d);
         assertEquals(0.941666d, distance.apply("aaabcd", "aaacdb"), 0.00001d);
+        assertEquals(0.911111d, distance.apply("John Horn", "John Hopkins"), 0.00001d);
     }
 
     @Test


[6/6] [text] TEXT-130, TEXT-131: thanks Jan Martin Keil

Posted by ch...@apache.org.
TEXT-130, TEXT-131: thanks Jan Martin Keil


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/8d756e95
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/8d756e95
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/8d756e95

Branch: refs/heads/master
Commit: 8d756e956dcc03f9bafc628f0d212a506f8c2848
Parents: 6a79e11
Author: Rob Tompkins <ch...@gmail.com>
Authored: Wed Aug 8 12:45:37 2018 -0400
Committer: Rob Tompkins <ch...@gmail.com>
Committed: Wed Aug 8 12:45:37 2018 -0400

----------------------------------------------------------------------
 pom.xml                 | 3 +++
 src/changes/changes.xml | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/8d756e95/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index d7a3df1..77ff956 100644
--- a/pom.xml
+++ b/pom.xml
@@ -354,6 +354,9 @@
     <contributor>
       <name>Luciano Medallia</name>
     </contributor>
+    <contributor>
+      <name>Jan Martin Keil</name>
+    </contributor>
   </contributors>
 
   <scm>

http://git-wip-us.apache.org/repos/asf/commons-text/blob/8d756e95/src/changes/changes.xml
----------------------------------------------------------------------
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 4058c0a..a24fb6d 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -45,7 +45,9 @@ The <action> type attribute can be add,update,fix,remove.
   </properties>
   <body>
 
-  <release version="1.4.1" date="2018-MM-DD" description="Release 1.4.1">
+  <release version="1.5" date="2018-MM-DD" description="Release 1.5">
+    <action issue="TEXT-130" type="fix" dev="chtompki" due-to="Jan Martin Keil">Fixes JaroWinklerDistance: Wrong results due to precision of transpositions</action>
+    <action issue="TEXT-131" type="fix" dev="chtompki" due-to="Jan Martin Keil">JaroWinklerDistance: Calculation deviates from definition</action>
   </release>
 
   <release version="1.4" date="2018-06-12" description="Release 1.4">


[2/6] [text] Fix [TEXT-130] JaroWinklerDistance: Wrong results due to precision of transpositions

Posted by ch...@apache.org.
Fix [TEXT-130] JaroWinklerDistance: Wrong results due to precision of transpositions


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/4d064dec
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/4d064dec
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/4d064dec

Branch: refs/heads/master
Commit: 4d064decbf7828918ca59b70d7fca19b7da955ec
Parents: 70150fb
Author: Jan Martin Keil <ja...@uni-jena.de>
Authored: Thu Aug 2 22:55:00 2018 +0200
Committer: Jan Martin Keil <ja...@uni-jena.de>
Committed: Thu Aug 2 22:55:00 2018 +0200

----------------------------------------------------------------------
 .../commons/text/similarity/JaroWinklerDistance.java    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d064dec/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
index 0ffb1ad..915cd5c 100644
--- a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
+++ b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
@@ -85,17 +85,17 @@ public class JaroWinklerDistance implements SimilarityScore<Double> {
         if (m == 0) {
             return 0D;
         }
-        final double j = ((m / left.length() + m / right.length() + (m - mtp[1]) / m)) / 3;
+        final double j = ((m / left.length() + m / right.length() + (m - (double) mtp[1] / 2) / m)) / 3;
         final double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / mtp[3]) * mtp[2] * (1D - j);
         return jw;
     }
 
     /**
-     * This method returns the Jaro-Winkler string matches, transpositions, prefix, max array.
+     * This method returns the Jaro-Winkler string matches, half transpositions, prefix, max array.
      *
      * @param first the first string to be matched
      * @param second the second string to be matched
-     * @return mtp array containing: matches, transpositions, prefix, and max length
+     * @return mtp array containing: matches, half transpositions, prefix, and max length
      */
     protected static int[] matches(final CharSequence first, final CharSequence second) {
         CharSequence max, min;
@@ -136,10 +136,10 @@ public class JaroWinklerDistance implements SimilarityScore<Double> {
                 si++;
             }
         }
-        int transpositions = 0;
+        int halfTranspositions = 0;
         for (int mi = 0; mi < ms1.length; mi++) {
             if (ms1[mi] != ms2[mi]) {
-                transpositions++;
+                halfTranspositions++;
             }
         }
         int prefix = 0;
@@ -150,7 +150,7 @@ public class JaroWinklerDistance implements SimilarityScore<Double> {
                 break;
             }
         }
-        return new int[] {matches, transpositions / 2, prefix, max.length()};
+        return new int[] {matches, halfTranspositions, prefix, max.length()};
     }
 
 }


[5/6] [text] Merge branch 'master' of https://github.com/jmkeil/commons-text

Posted by ch...@apache.org.
Merge branch 'master' of https://github.com/jmkeil/commons-text


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/6a79e117
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/6a79e117
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/6a79e117

Branch: refs/heads/master
Commit: 6a79e117f3c6eec668288ef41ab3970f3d441b22
Parents: 97d6064 5d14854
Author: Rob Tompkins <ch...@gmail.com>
Authored: Wed Aug 8 12:25:05 2018 -0400
Committer: Rob Tompkins <ch...@gmail.com>
Committed: Wed Aug 8 12:25:05 2018 -0400

----------------------------------------------------------------------
 .../text/similarity/JaroWinklerDistance.java        | 16 ++++++++--------
 .../text/similarity/JaroWinklerDistanceTest.java    | 12 +++++++-----
 2 files changed, 15 insertions(+), 13 deletions(-)
----------------------------------------------------------------------