You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ch...@apache.org on 2018/08/08 16:45:46 UTC
[1/6] [text] Testcase for [TEXT-130] JaroWinklerDistance: Wrong results due to precision of transpositions
Repository: commons-text
Updated Branches:
refs/heads/master 97d606405 -> 8d756e956
Testcase for [TEXT-130] JaroWinklerDistance: Wrong results due to precision of transpositions
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/70150fba
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/70150fba
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/70150fba
Branch: refs/heads/master
Commit: 70150fba9a0e26d944b0f649265a567309ba3af4
Parents: 802258f
Author: Jan Martin Keil <ja...@uni-jena.de>
Authored: Thu Aug 2 22:51:18 2018 +0200
Committer: Jan Martin Keil <ja...@uni-jena.de>
Committed: Thu Aug 2 22:51:18 2018 +0200
----------------------------------------------------------------------
.../org/apache/commons/text/similarity/JaroWinklerDistanceTest.java | 1 +
1 file changed, 1 insertion(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/70150fba/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
index d51135e..478557a 100644
--- a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
@@ -45,6 +45,7 @@ public class JaroWinklerDistanceTest {
distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.00001d);
assertEquals(0.882329d, distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00001d);
assertEquals(0.996598d, distance.apply("/opt/software1", "/opt/software2"), 0.00001d);
+ assertEquals(0.941666d, distance.apply("aaabcd", "aaacdb"), 0.00001d);
}
@Test
[4/6] [text] Fix [TEXT-131] JaroWinklerDistance: Calculation deviates from definition
Posted by ch...@apache.org.
Fix [TEXT-131] JaroWinklerDistance: Calculation deviates from definition
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/5d148549
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/5d148549
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/5d148549
Branch: refs/heads/master
Commit: 5d148549bc6ea8501016856547e27aed58b116c3
Parents: 4546f45
Author: Jan Martin Keil <ja...@uni-jena.de>
Authored: Thu Aug 2 23:20:21 2018 +0200
Committer: Jan Martin Keil <ja...@uni-jena.de>
Committed: Thu Aug 2 23:22:33 2018 +0200
----------------------------------------------------------------------
.../commons/text/similarity/JaroWinklerDistance.java | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/5d148549/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
index 915cd5c..74ea4f7 100644
--- a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
+++ b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
@@ -86,16 +86,16 @@ public class JaroWinklerDistance implements SimilarityScore<Double> {
return 0D;
}
final double j = ((m / left.length() + m / right.length() + (m - (double) mtp[1] / 2) / m)) / 3;
- final double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / mtp[3]) * mtp[2] * (1D - j);
+ final double jw = j < 0.7D ? j : j + defaultScalingFactor * mtp[2] * (1D - j);
return jw;
}
/**
- * This method returns the Jaro-Winkler string matches, half transpositions, prefix, max array.
+ * This method returns the Jaro-Winkler string matches, half transpositions, prefix array.
*
* @param first the first string to be matched
* @param second the second string to be matched
- * @return mtp array containing: matches, half transpositions, prefix, and max length
+ * @return mtp array containing: matches, half transpositions, and prefix
*/
protected static int[] matches(final CharSequence first, final CharSequence second) {
CharSequence max, min;
@@ -143,14 +143,14 @@ public class JaroWinklerDistance implements SimilarityScore<Double> {
}
}
int prefix = 0;
- for (int mi = 0; mi < min.length(); mi++) {
+ for (int mi = 0; mi < Math.min(4, min.length()); mi++) {
if (first.charAt(mi) == second.charAt(mi)) {
prefix++;
} else {
break;
}
}
- return new int[] {matches, halfTranspositions, prefix, max.length()};
+ return new int[] {matches, halfTranspositions, prefix};
}
}
[3/6] [text] Testcases for [TEXT-131] JaroWinklerDistance: Calculation deviates from definition
Posted by ch...@apache.org.
Testcases for [TEXT-131] JaroWinklerDistance: Calculation deviates from definition
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/4546f45c
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/4546f45c
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/4546f45c
Branch: refs/heads/master
Commit: 4546f45c7ed610b94336b7a60592ac77382f6fdb
Parents: 4d064de
Author: Jan Martin Keil <ja...@uni-jena.de>
Authored: Thu Aug 2 23:04:32 2018 +0200
Committer: Jan Martin Keil <ja...@uni-jena.de>
Committed: Thu Aug 2 23:19:09 2018 +0200
----------------------------------------------------------------------
.../commons/text/similarity/JaroWinklerDistanceTest.java | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/4546f45c/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
index 478557a..d6bfda0 100644
--- a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
@@ -39,13 +39,14 @@ public class JaroWinklerDistanceTest {
assertEquals(0.92499d, distance.apply("frog", "fog"), 0.00001d);
assertEquals(0.0d, distance.apply("fly", "ant"), 0.00000000000000000001d);
assertEquals(0.44166d, distance.apply("elephant", "hippo"), 0.00001d);
- assertEquals(0.92740d, distance.apply("ABC Corporation", "ABC Corp"), 0.00001d);
- assertEquals(0.94580d, distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.00001d);
- assertEquals(0.921458d,
+ assertEquals(0.90666d, distance.apply("ABC Corporation", "ABC Corp"), 0.00001d);
+ assertEquals(0.95251d, distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.00001d);
+ assertEquals(0.942d,
distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.00001d);
- assertEquals(0.882329d, distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00001d);
- assertEquals(0.996598d, distance.apply("/opt/software1", "/opt/software2"), 0.00001d);
+ assertEquals(0.898018d, distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00001d);
+ assertEquals(0.971428d, distance.apply("/opt/software1", "/opt/software2"), 0.00001d);
assertEquals(0.941666d, distance.apply("aaabcd", "aaacdb"), 0.00001d);
+ assertEquals(0.911111d, distance.apply("John Horn", "John Hopkins"), 0.00001d);
}
@Test
[6/6] [text] TEXT-130, TEXT-131: thanks Jan Martin Keil
Posted by ch...@apache.org.
TEXT-130, TEXT-131: thanks Jan Martin Keil
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/8d756e95
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/8d756e95
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/8d756e95
Branch: refs/heads/master
Commit: 8d756e956dcc03f9bafc628f0d212a506f8c2848
Parents: 6a79e11
Author: Rob Tompkins <ch...@gmail.com>
Authored: Wed Aug 8 12:45:37 2018 -0400
Committer: Rob Tompkins <ch...@gmail.com>
Committed: Wed Aug 8 12:45:37 2018 -0400
----------------------------------------------------------------------
pom.xml | 3 +++
src/changes/changes.xml | 4 +++-
2 files changed, 6 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/8d756e95/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index d7a3df1..77ff956 100644
--- a/pom.xml
+++ b/pom.xml
@@ -354,6 +354,9 @@
<contributor>
<name>Luciano Medallia</name>
</contributor>
+ <contributor>
+ <name>Jan Martin Keil</name>
+ </contributor>
</contributors>
<scm>
http://git-wip-us.apache.org/repos/asf/commons-text/blob/8d756e95/src/changes/changes.xml
----------------------------------------------------------------------
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 4058c0a..a24fb6d 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -45,7 +45,9 @@ The <action> type attribute can be add,update,fix,remove.
</properties>
<body>
- <release version="1.4.1" date="2018-MM-DD" description="Release 1.4.1">
+ <release version="1.5" date="2018-MM-DD" description="Release 1.5">
+ <action issue="TEXT-130" type="fix" dev="chtompki" due-to="Jan Martin Keil">Fixes JaroWinklerDistance: Wrong results due to precision of transpositions</action>
+ <action issue="TEXT-131" type="fix" dev="chtompki" due-to="Jan Martin Keil">JaroWinklerDistance: Calculation deviates from definition</action>
</release>
<release version="1.4" date="2018-06-12" description="Release 1.4">
[2/6] [text] Fix [TEXT-130] JaroWinklerDistance: Wrong results due to precision of transpositions
Posted by ch...@apache.org.
Fix [TEXT-130] JaroWinklerDistance: Wrong results due to precision of transpositions
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/4d064dec
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/4d064dec
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/4d064dec
Branch: refs/heads/master
Commit: 4d064decbf7828918ca59b70d7fca19b7da955ec
Parents: 70150fb
Author: Jan Martin Keil <ja...@uni-jena.de>
Authored: Thu Aug 2 22:55:00 2018 +0200
Committer: Jan Martin Keil <ja...@uni-jena.de>
Committed: Thu Aug 2 22:55:00 2018 +0200
----------------------------------------------------------------------
.../commons/text/similarity/JaroWinklerDistance.java | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/commons-text/blob/4d064dec/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
index 0ffb1ad..915cd5c 100644
--- a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
+++ b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
@@ -85,17 +85,17 @@ public class JaroWinklerDistance implements SimilarityScore<Double> {
if (m == 0) {
return 0D;
}
- final double j = ((m / left.length() + m / right.length() + (m - mtp[1]) / m)) / 3;
+ final double j = ((m / left.length() + m / right.length() + (m - (double) mtp[1] / 2) / m)) / 3;
final double jw = j < 0.7D ? j : j + Math.min(defaultScalingFactor, 1D / mtp[3]) * mtp[2] * (1D - j);
return jw;
}
/**
- * This method returns the Jaro-Winkler string matches, transpositions, prefix, max array.
+ * This method returns the Jaro-Winkler string matches, half transpositions, prefix, max array.
*
* @param first the first string to be matched
* @param second the second string to be matched
- * @return mtp array containing: matches, transpositions, prefix, and max length
+ * @return mtp array containing: matches, half transpositions, prefix, and max length
*/
protected static int[] matches(final CharSequence first, final CharSequence second) {
CharSequence max, min;
@@ -136,10 +136,10 @@ public class JaroWinklerDistance implements SimilarityScore<Double> {
si++;
}
}
- int transpositions = 0;
+ int halfTranspositions = 0;
for (int mi = 0; mi < ms1.length; mi++) {
if (ms1[mi] != ms2[mi]) {
- transpositions++;
+ halfTranspositions++;
}
}
int prefix = 0;
@@ -150,7 +150,7 @@ public class JaroWinklerDistance implements SimilarityScore<Double> {
break;
}
}
- return new int[] {matches, transpositions / 2, prefix, max.length()};
+ return new int[] {matches, halfTranspositions, prefix, max.length()};
}
}
[5/6] [text] Merge branch 'master' of https://github.com/jmkeil/commons-text
Posted by ch...@apache.org.
Merge branch 'master' of https://github.com/jmkeil/commons-text
Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/6a79e117
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/6a79e117
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/6a79e117
Branch: refs/heads/master
Commit: 6a79e117f3c6eec668288ef41ab3970f3d441b22
Parents: 97d6064 5d14854
Author: Rob Tompkins <ch...@gmail.com>
Authored: Wed Aug 8 12:25:05 2018 -0400
Committer: Rob Tompkins <ch...@gmail.com>
Committed: Wed Aug 8 12:25:05 2018 -0400
----------------------------------------------------------------------
.../text/similarity/JaroWinklerDistance.java | 16 ++++++++--------
.../text/similarity/JaroWinklerDistanceTest.java | 12 +++++++-----
2 files changed, 15 insertions(+), 13 deletions(-)
----------------------------------------------------------------------