You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by ki...@apache.org on 2019/02/23 23:29:58 UTC

[commons-text] branch master updated: TEXT-104: deprecate JaroWinkler methods for 2.0, and fix clirr report

This is an automated email from the ASF dual-hosted git repository.

kinow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-text.git


The following commit(s) were added to refs/heads/master by this push:
     new a32b1d2  TEXT-104: deprecate JaroWinkler methods for 2.0, and fix clirr report
     new 56c060d  Merge pull request #102 from kinow/deprecate-jaro-winkler-for-20
a32b1d2 is described below

commit a32b1d2948c7b1b4a07eda8a72fb5cd5bdf2dd00
Author: Bruno P. Kinoshita <br...@niwa.co.nz>
AuthorDate: Thu Feb 21 10:44:02 2019 +1300

    TEXT-104: deprecate JaroWinkler methods for 2.0, and fix clirr report
---
 .../text/similarity/JaroWinklerDistance.java       | 91 +++++++++++++++++++++-
 .../text/similarity/JaroWinklerDistanceTest.java   | 45 +++++++----
 2 files changed, 117 insertions(+), 19 deletions(-)

diff --git a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
index eaa9fb3..388d0c7 100644
--- a/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
+++ b/src/main/java/org/apache/commons/text/similarity/JaroWinklerDistance.java
@@ -16,6 +16,8 @@
  */
 package org.apache.commons.text.similarity;
 
+import java.util.Arrays;
+
 /**
  * Measures the Jaro-Winkler distance of two character sequences.
  * It is the complementary of Jaro-Winkler similarity.
@@ -25,6 +27,12 @@ package org.apache.commons.text.similarity;
 public class JaroWinklerDistance implements EditDistance<Double> {
 
     /**
+     * @deprecated Deprecated as of 1.7. This constant will be removed in 2.0.
+     */
+    @Deprecated
+    public static final int INDEX_NOT_FOUND = -1;
+
+    /**
      * Computes the Jaro Winkler Distance between two character sequences.
      *
      * <pre>
@@ -63,7 +71,86 @@ public class JaroWinklerDistance implements EditDistance<Double> {
             throw new IllegalArgumentException("CharSequences must not be null");
         }
 
-        JaroWinklerSimilarity similarity = new JaroWinklerSimilarity();
-        return 1 - similarity.apply(left, right);
+        // TODO: replace the rest of the code by this in 2.0, see TEXT-104
+        //
+        // JaroWinklerSimilarity similarity = new JaroWinklerSimilarity();
+        // return 1 - similarity.apply(left, right);
+
+        final double defaultScalingFactor = 0.1;
+        final int[] mtp = matches(left, right);
+        final double m = mtp[0];
+        if (m == 0) {
+            return 0D;
+        }
+        final double j = ((m / left.length() + m / right.length() + (m - (double) mtp[1] / 2) / m)) / 3;
+        final double jw = j < 0.7D ? j : j + defaultScalingFactor * mtp[2] * (1D - j);
+        return jw;
+    }
+
+    // TODO: remove this method in 2.0, see TEXT-104
+    /**
+     * Counts the matching characters, half transpositions and common prefix length of two character sequences.
+     *
+     * @param first the first string to be matched; must not be null (its length is read without a null check)
+     * @param second the second string to be matched; must not be null (its length is read without a null check)
+     * @return mtp array, in order: match count, half-transposition count, common prefix length (at most 4)
+     * @deprecated Deprecated as of 1.7. This method will be removed in 2.0, and moved to a Jaro Winkler similarity
+     *             class.
+     */
+    @Deprecated
+    protected static int[] matches(final CharSequence first, final CharSequence second) {
+        CharSequence max, min; // longer and shorter input; on equal lengths max is second
+        if (first.length() > second.length()) {
+            max = first;
+            min = second;
+        } else {
+            max = second;
+            min = first;
+        }
+        final int range = Math.max(max.length() / 2 - 1, 0); // search window radius around index mi
+        final int[] matchIndexes = new int[min.length()]; // per min index: matched index in max, or -1
+        Arrays.fill(matchIndexes, -1);
+        final boolean[] matchFlags = new boolean[max.length()]; // per max index: already matched
+        int matches = 0;
+        for (int mi = 0; mi < min.length(); mi++) {
+            final char c1 = min.charAt(mi);
+            for (int xi = Math.max(mi - range, 0), xn = Math.min(mi + range + 1, max.length()); xi < xn; xi++) {
+                if (!matchFlags[xi] && c1 == max.charAt(xi)) { // first unused equal char in window wins
+                    matchIndexes[mi] = xi;
+                    matchFlags[xi] = true;
+                    matches++;
+                    break;
+                }
+            }
+        }
+        final char[] ms1 = new char[matches]; // matched chars, in the order they occur in min
+        final char[] ms2 = new char[matches]; // matched chars, in the order they occur in max
+        for (int i = 0, si = 0; i < min.length(); i++) {
+            if (matchIndexes[i] != -1) {
+                ms1[si] = min.charAt(i);
+                si++;
+            }
+        }
+        for (int i = 0, si = 0; i < max.length(); i++) {
+            if (matchFlags[i]) {
+                ms2[si] = max.charAt(i);
+                si++;
+            }
+        }
+        int halfTranspositions = 0;
+        for (int mi = 0; mi < ms1.length; mi++) {
+            if (ms1[mi] != ms2[mi]) { // same rank in both match lists but different char
+                halfTranspositions++;
+            }
+        }
+        int prefix = 0;
+        for (int mi = 0; mi < Math.min(4, min.length()); mi++) { // common prefix is capped at 4 characters
+            if (first.charAt(mi) == second.charAt(mi)) {
+                prefix++;
+            } else {
+                break;
+            }
+        }
+        return new int[] {matches, halfTranspositions, prefix};
     }
 }
diff --git a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
index eadf1a2..e56ec07 100644
--- a/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
+++ b/src/test/java/org/apache/commons/text/similarity/JaroWinklerDistanceTest.java
@@ -36,23 +36,34 @@ public class JaroWinklerDistanceTest {
 
     @Test
     public void testGetJaroWinklerDistance_StringString() {
-        assertEquals(0d, distance.apply("", ""), 0.00001d);
-        assertEquals(0d, distance.apply("foo", "foo"), 0.00001d);
-        assertEquals(1 - 0.94166d, distance.apply("foo", "foo "), 0.00001d);
-        assertEquals(1 - 0.90666d, distance.apply("foo", "foo  "), 0.00001d);
-        assertEquals(1 - 0.86666d, distance.apply("foo", " foo "), 0.00001d);
-        assertEquals(1 - 0.51111d, distance.apply("foo", "  foo"), 0.00001d);
-        assertEquals(1 - 0.92499d, distance.apply("frog", "fog"), 0.00001d);
-        assertEquals(1.0d, distance.apply("fly", "ant"), 0.00000000000000000001d);
-        assertEquals(1 - 0.44166d, distance.apply("elephant", "hippo"), 0.00001d);
-        assertEquals(1 - 0.90666d, distance.apply("ABC Corporation", "ABC Corp"), 0.00001d);
-        assertEquals(1 - 0.95251d, distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.00001d);
-        assertEquals(1 - 0.942d,
-                distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.00001d);
-        assertEquals(1 - 0.898018d, distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00001d);
-        assertEquals(1 - 0.971428d, distance.apply("/opt/software1", "/opt/software2"), 0.00001d);
-        assertEquals(1 - 0.941666d, distance.apply("aaabcd", "aaacdb"), 0.00001d);
-        assertEquals(1 - 0.911111d, distance.apply("John Horn", "John Hopkins"), 0.00001d);
+        assertEquals(0.92499d, distance.apply("frog", "fog"), 0.00001d);
+        assertEquals(0.0d, distance.apply("fly", "ant"), 0.00000000000000000001d);
+        assertEquals(0.44166d, distance.apply("elephant", "hippo"), 0.00001d);
+        assertEquals(0.90666d, distance.apply("ABC Corporation", "ABC Corp"), 0.00001d);
+        assertEquals(0.95251d, distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.00001d);
+        assertEquals(0.942d, distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.00001d);
+        assertEquals(0.898018d, distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00001d);
+        assertEquals(0.971428d, distance.apply("/opt/software1", "/opt/software2"), 0.00001d);
+        assertEquals(0.941666d, distance.apply("aaabcd", "aaacdb"), 0.00001d);
+        assertEquals(0.911111d, distance.apply("John Horn", "John Hopkins"), 0.00001d);
+        // TODO: replace tests in 2.0. See TEXT-104 for more.
+        // assertEquals(0d, distance.apply("", ""), 0.00001d);
+        // assertEquals(0d, distance.apply("foo", "foo"), 0.00001d);
+        // assertEquals(1 - 0.94166d, distance.apply("foo", "foo "), 0.00001d);
+        // assertEquals(1 - 0.90666d, distance.apply("foo", "foo  "), 0.00001d);
+        // assertEquals(1 - 0.86666d, distance.apply("foo", " foo "), 0.00001d);
+        // assertEquals(1 - 0.51111d, distance.apply("foo", "  foo"), 0.00001d);
+        // assertEquals(1 - 0.92499d, distance.apply("frog", "fog"), 0.00001d);
+        // assertEquals(1.0d, distance.apply("fly", "ant"), 0.00000000000000000001d);
+        // assertEquals(1 - 0.44166d, distance.apply("elephant", "hippo"), 0.00001d);
+        // assertEquals(1 - 0.90666d, distance.apply("ABC Corporation", "ABC Corp"), 0.00001d);
+        // assertEquals(1 - 0.95251d, distance.apply("D N H Enterprises Inc", "D & H Enterprises, Inc."), 0.00001d);
+        // assertEquals(1 - 0.942d,
+        //         distance.apply("My Gym Children's Fitness Center", "My Gym. Childrens Fitness"), 0.00001d);
+        // assertEquals(1 - 0.898018d, distance.apply("PENNSYLVANIA", "PENNCISYLVNIA"), 0.00001d);
+        // assertEquals(1 - 0.971428d, distance.apply("/opt/software1", "/opt/software2"), 0.00001d);
+        // assertEquals(1 - 0.941666d, distance.apply("aaabcd", "aaacdb"), 0.00001d);
+        // assertEquals(1 - 0.911111d, distance.apply("John Horn", "John Hopkins"), 0.00001d);
     }
 
     @Test