You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ki...@apache.org on 2019/02/23 23:29:58 UTC
[commons-text] branch master updated: TEXT-104: deprecate JaroWinkler methods for 2.0, and fix clirr report
This is an automated email from the ASF dual-hosted git repository.
kinow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-text.git
The following commit(s) were added to refs/heads/master by this push:
new a32b1d2 TEXT-104: deprecate JaroWinkler methods for 2.0, and fix clirr report
new 56c060d Merge pull request #102 from kinow/deprecate-jaro-winkler-for-20
a32b1d2 is described below
commit a32b1d2948c7b1b4a07eda8a72fb5cd5bdf2dd00
Author: Bruno P. Kinoshita <br...@niwa.co.nz>
AuthorDate: Thu Feb 21 10:44:02 2019 +1300
TEXT-104: deprecate JaroWinkler methods for 2.0, and fix clirr report
---
.../text/similarity/JaroWinklerDistance.java | 91 +++++++++++++++++++++-
.../text/similarity/JaroWinklerDistanceTest.java | 45 +++++++----
2 files changed, 117 insertions(+), 19 deletions(-)
diff --git a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
index eaa9fb3..388d0c7 100644
--- a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
+++ b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
@@ -16,6 +16,8 @@
*/
package org.apache.commons.text.similarity;
+import java.util.Arrays;
+
/**
* Measures the Jaro-Winkler distance of two character sequences.
* It is the complementary of Jaro-Winkler similarity.
@@ -25,6 +27,12 @@ package org.apache.commons.text.similarity;
public class JaroWinklerDistance implements EditDistance<Double> {
/**
+ * @deprecated Deprecated as of 1.7. This constant will be removed in 2.0.
+ */
+ @Deprecated
+ public static final int INDEX_NOT_FOUND = -1;
+
+ /**
* Computes the Jaro Winkler Distance between two character sequences.
*
* <pre>
@@ -63,7 +71,86 @@ public class JaroWinklerDistance implements EditDistance<Double> {
throw new IllegalArgumentException("CharSequences must not be null");
}
- JaroWinklerSimilarity similarity = new JaroWinklerSimilarity();
- return 1 - similarity.apply(left, right);
+ // TODO: replace the rest of the code by this in 2.0, see TEXT-104
+ //
+ // JaroWinklerSimilarity similarity = new JaroWinklerSimilarity();
+ // return 1 - similarity.apply(left, right);
+
+ final double defaultScalingFactor = 0.1;
+ final int[] mtp = matches(left, right);
+ final double m = mtp[0];
+ if (m == 0) {
+ return 0D;
+ }
+ final double j = ((m / left.length() + m / right.length() + (m - (double) mtp[1] / 2) / m)) / 3;
+ final double jw = j < 0.7D ? j : j + defaultScalingFactor * mtp[2] * (1D - j);
+ return jw;
+ }
+
+ // TODO: remove this method in 2.0, see TEXT-104
+ /**
+ * This method returns the Jaro-Winkler string matches, half transpositions, prefix array.
+ *
+ * @param first the first string to be matched
+ * @param second the second string to be matched
+ * @return mtp array containing: matches, half transpositions, and prefix
+ * @deprecated Deprecated as of 1.7. This method will be removed in 2.0, and moved to a Jaro Winkler similarity
+ * class.
+ */
+ @Deprecated
+ protected static int[] matches(final CharSequence first, final CharSequence second) {
+ CharSequence max, min;
+ if (first.length() > second.length()) {
+ max = first;
+ min = second;
+ } else {
+ max = second;
+ min = first;
+ }
+ final int range = Math.max(max.length() / 2 - 1, 0);
+ final int[] matchIndexes = new int[min.length()];
+ Arrays.fill(matchIndexes, -1);
+ final boolean[] matchFlags = new boolean[max.length()];
+ int matches = 0;
+ for (int mi = 0; mi < min.length(); mi++) {
+ final char c1 = min.charAt(mi);
+ for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
+ if (!matchFlags[xi] && c1 == max.charAt(xi)) {
+ matchIndexes[mi] = xi;
+ matchFlags[xi] = true;
+ matches++;
+ break;
+ }
+ }
+ }
+ final char[] ms1 = new char[matches];
+ final char[] ms2 = new char[matches];
+ for (int i = 0, si = 0; i < min.length(); i++) {
+ if (matchIndexes[i] != -1) {
+ ms1[si] = min.charAt(i);
+ si++;
+ }
+ }
+ for (int i = 0, si = 0; i < max.length(); i++) {
+ if (matchFlags[i]) {
+ ms2[si] = max.charAt(i);
+ si++;
+ }
+ }
+ int halfTranspositions = 0;
+ for (int mi = 0; mi < ms1.length; mi++) {
+ if (ms1[mi] != ms2[mi]) {
+ halfTranspositions++;
+ }
+ }
+ int prefix = 0;
+ for (int mi = 0; mi < Math.min(4, min.length()); mi++) {
+ if (first.charAt(mi) == second.charAt(mi)) {
+ prefix++;
+ } else {
+ break;
+ }
+ }
+ return new int[] {matches, halfTranspositions, prefix};
}
}
diff --git a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
index eadf1a2..e56ec07 100644
--- a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
@@ -36,23 +36,34 @@ public class JaroWinklerDistanceTest {
@Test
public void testGetJaroWinklerDistance_StringString() {
- assertEquals(0d, distance.apply("", ""), 0.00001d);
- assertEquals(0d, distance.apply("foo", "foo"), 0.00001d);
- assertEquals(1 - 0.94166d, distance.apply("foo", "foo "), 0.00001d);
- assertEquals(1 - 0.90666d, distance.apply("foo", "foo "), 0.00001d);
- assertEquals(1 - 0.86666d, distance.apply("foo", " foo "), 0.00001d);
- assertEquals(1 - 0.51111d, distance.apply("foo", " foo"), 0.00001d);
- assertEquals(1 - 0.92499d, distance.apply("frog", "fog"), 0.00001d);
- assertEquals(1.0d, distance.apply("fly", "ant"), 0.00000000000000000001d);
- assertEquals(1 - 0.44166d, distance.apply("elephant", "hippo"), 0.00001d);
- assertEquals(1 - 0.90666d, distance.apply("ABC Corporation", "ABC Corp"), 0.00001d);
- assertEquals(1 - 0.95251d, distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.00001d);
- assertEquals(1 - 0.942d,
- distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.00001d);
- assertEquals(1 - 0.898018d, distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00001d);
- assertEquals(1 - 0.971428d, distance.apply("/opt/software1", "/opt/software2"), 0.00001d);
- assertEquals(1 - 0.941666d, distance.apply("aaabcd", "aaacdb"), 0.00001d);
- assertEquals(1 - 0.911111d, distance.apply("John Horn", "John Hopkins"), 0.00001d);
+ assertEquals(0.92499d, distance.apply("frog", "fog"), 0.00001d);
+ assertEquals(0.0d, distance.apply("fly", "ant"), 0.00000000000000000001d);
+ assertEquals(0.44166d, distance.apply("elephant", "hippo"), 0.00001d);
+ assertEquals(0.90666d, distance.apply("ABC Corporation", "ABC Corp"), 0.00001d);
+ assertEquals(0.95251d, distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.00001d);
+ assertEquals(0.942d, distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.00001d);
+ assertEquals(0.898018d, distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00001d);
+ assertEquals(0.971428d, distance.apply("/opt/software1", "/opt/software2"), 0.00001d);
+ assertEquals(0.941666d, distance.apply("aaabcd", "aaacdb"), 0.00001d);
+ assertEquals(0.911111d, distance.apply("John Horn", "John Hopkins"), 0.00001d);
+ // TODO: replace tests in 2.0. See TEXT-104 for more.
+ // assertEquals(0d, distance.apply("", ""), 0.00001d);
+ // assertEquals(0d, distance.apply("foo", "foo"), 0.00001d);
+ // assertEquals(1 - 0.94166d, distance.apply("foo", "foo "), 0.00001d);
+ // assertEquals(1 - 0.90666d, distance.apply("foo", "foo "), 0.00001d);
+ // assertEquals(1 - 0.86666d, distance.apply("foo", " foo "), 0.00001d);
+ // assertEquals(1 - 0.51111d, distance.apply("foo", " foo"), 0.00001d);
+ // assertEquals(1 - 0.92499d, distance.apply("frog", "fog"), 0.00001d);
+ // assertEquals(1.0d, distance.apply("fly", "ant"), 0.00000000000000000001d);
+ // assertEquals(1 - 0.44166d, distance.apply("elephant", "hippo"), 0.00001d);
+ // assertEquals(1 - 0.90666d, distance.apply("ABC Corporation", "ABC Corp"), 0.00001d);
+ // assertEquals(1 - 0.95251d, distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.00001d);
+ // assertEquals(1 - 0.942d,
+ // distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.00001d);
+ // assertEquals(1 - 0.898018d, distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00001d);
+ // assertEquals(1 - 0.971428d, distance.apply("/opt/software1", "/opt/software2"), 0.00001d);
+ // assertEquals(1 - 0.941666d, distance.apply("aaabcd", "aaacdb"), 0.00001d);
+ // assertEquals(1 - 0.911111d, distance.apply("John Horn", "John Hopkins"), 0.00001d);
}
@Test