You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by mb...@apache.org on 2011/06/16 17:52:09 UTC

svn commit: r1136496 - in /commons/proper/lang/trunk: pom.xml src/main/java/org/apache/commons/lang3/StringUtils.java src/test/java/org/apache/commons/lang3/StringUtilsTest.java

Author: mbenson
Date: Thu Jun 16 15:52:09 2011
New Revision: 1136496

URL: http://svn.apache.org/viewvc?rev=1136496&view=rev
Log:
[LANG-684] Levenshtein Distance Within a Given Threshold; submitted by Eli Lindsey

Modified:
    commons/proper/lang/trunk/pom.xml
    commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java
    commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java

Modified: commons/proper/lang/trunk/pom.xml
URL: http://svn.apache.org/viewvc/commons/proper/lang/trunk/pom.xml?rev=1136496&r1=1136495&r2=1136496&view=diff
==============================================================================
--- commons/proper/lang/trunk/pom.xml (original)
+++ commons/proper/lang/trunk/pom.xml Thu Jun 16 15:52:09 2011
@@ -288,6 +288,9 @@
             <name>Rafal Krzewski</name>
         </contributor>
         <contributor>
+            <name>Eli Lindsey</name>
+        </contributor>
+        <contributor>
             <name>Craig R. McClanahan</name>
         </contributor>
         <contributor>

Modified: commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java
URL: http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java?rev=1136496&r1=1136495&r2=1136496&view=diff
==============================================================================
--- commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java (original)
+++ commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java Thu Jun 16 15:52:09 2011
@@ -19,6 +19,7 @@ package org.apache.commons.lang3;
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Locale;
@@ -6137,6 +6138,168 @@ public class StringUtils {
         return p[n];
     }
 
+    /**
+     * <p>Find the Levenshtein distance between two Strings if it's less than or equal to a given 
+     * threshold.</p>
+     *
+     * <p>This is the number of changes needed to change one String into
+     * another, where each change is a single character modification (deletion,
+     * insertion or substitution).</p>
+     *
+     * <p>This implementation follows from Algorithms on Strings, Trees and Sequences by Dan Gusfield
+     * and Chas Emerick's implementation of the Levenshtein distance algorithm from
+     * <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
+     *
+     * <pre>
+     * StringUtils.getLevenshteinDistance(null, *, *)             = IllegalArgumentException
+     * StringUtils.getLevenshteinDistance(*, null, *)             = IllegalArgumentException
+     * StringUtils.getLevenshteinDistance(*, *, -1)               = IllegalArgumentException
+     * StringUtils.getLevenshteinDistance("","", 0)               = 0
+     * StringUtils.getLevenshteinDistance("aaapppp", "", 8)       = 7
+     * StringUtils.getLevenshteinDistance("aaapppp", "", 7)       = 7
+     * StringUtils.getLevenshteinDistance("aaapppp", "", 6))      = -1
+     * StringUtils.getLevenshteinDistance("elephant", "hippo", 7) = 7
+     * StringUtils.getLevenshteinDistance("elephant", "hippo", 6) = -1
+     * StringUtils.getLevenshteinDistance("hippo", "elephant", 7) = 7
+     * StringUtils.getLevenshteinDistance("hippo", "elephant", 6) = -1
+     * </pre>
+     *
+     * @param s  the first String, must not be null
+     * @param t  the second String, must not be null
+     * @param threshold the target threshold, must not be negative
+     * @return result distance, or -1 if the distance would be greater than the threshold
+     * @throws IllegalArgumentException if either String input {@code null} or negative threshold
+     */
+    public static int getLevenshteinDistance(CharSequence s, CharSequence t, int threshold) {
+        if(s == null || t == null) {
+            throw new IllegalArgumentException("String must not be null");
+        }
+        if(threshold < 0) {
+            throw new IllegalArgumentException("Threshold must not be negative");
+        }
+
+        /*
+        This implementation only computes the distance if it's less than or equal to the
+        threshold value, returning -1 if it's greater.  The advantage is performance: unbounded
+        distance is O(nm), but a bound of k allows us to reduce it to O(km) time by only 
+        computing a diagonal stripe of width 2k+1 of the cost table.
+        It is also possible to use this to compute the unbounded Levenshtein distance by starting
+        the threshold at 1 and doubling each time until the distance is found; this is O(dm), where
+        d is the distance.
+        
+        One subtlety comes from needing to ignore entries on the border of our stripe
+        eg.
+        p[] = |#|#|#|*
+        d[] =  *|#|#|#|
+        We must ignore the entry to the left of the leftmost member
+        We must ignore the entry above the rightmost member
+        
+        Another subtlety comes from our stripe running off the matrix if the strings aren't
+        of the same size.  Since string s is always swapped to be the shorter of the two, 
+        the stripe will always run off to the upper right instead of the lower left of the matrix.
+        
+        As a concrete example, suppose s is of length 5, t is of length 7, and our threshold is 1.
+        In this case we're going to walk a stripe of length 3.  The matrix would look like so:
+        
+           1 2 3 4 5
+        1 |#|#| | | |
+        2 |#|#|#| | |
+        3 | |#|#|#| |
+        4 | | |#|#|#|
+        5 | | | |#|#|
+        6 | | | | |#|
+        7 | | | | | |
+
+        Note how the stripe leads off the table as there is no possible way to turn a string of length 5
+        into one of length 7 in edit distance of 1.
+        
+        Additionally, this implementation decreases memory usage by using two 
+        single-dimensional arrays and swapping them back and forth instead of allocating
+        an entire n by m matrix.  This requires a few minor changes, such as immediately returning 
+        when it's detected that the stripe has run off the matrix and initially filling the arrays with
+        large values so that entries we don't compute are ignored.
+
+        See Algorithms on Strings, Trees and Sequences by Dan Gusfield for some discussion.
+         */
+
+        int n = s.length(); // length of s
+        int m = t.length(); // length of t
+
+        // if one string is empty, the edit distance is necessarily the length of the other
+        if(n == 0) {
+            return m <= threshold? m : -1;
+        } else if(m == 0) {
+            return n <= threshold? n : -1;
+        }
+
+        if(n > m) {
+            // swap the two strings to consume less memory
+            CharSequence tmp = s;
+            s = t;
+            t = tmp;
+            n = m;
+            m = t.length();
+        }
+
+        int p[] = new int[n+1]; // 'previous' cost array, horizontally
+        int d[] = new int[n+1]; // cost array, horizontally
+        int _d[]; // placeholder to assist in swapping p and d
+
+        // fill in starting table values
+        int boundary = Math.min(n, threshold) + 1;
+        for(int i = 0; i < boundary; i++) {
+            p[i] = i;
+        }
+        // these fills ensure that the value above the rightmost entry of our 
+        // stripe will be ignored in following loop iterations
+        Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE);
+        Arrays.fill(d, Integer.MAX_VALUE);
+
+        // iterates through t
+        for(int j = 1; j <= m; j++) {
+            char t_j = t.charAt(j-1); // jth character of t
+            d[0] = j;
+
+            // compute stripe indices, constrain to array size
+            int min = Math.max(1, j - threshold);
+            int max = Math.min(n, j + threshold);
+
+            // the stripe may lead off of the table if s and t are of different sizes
+            if(min > max) {
+                return -1;
+            }
+
+            // ignore entry left of leftmost
+            if(min > 1) {
+                d[min-1] = Integer.MAX_VALUE;
+            }
+
+            // iterates through [min, max] in s
+            for(int i = min; i <= max; i++) {
+                if(s.charAt(i-1) == t_j) {
+                    // diagonally left and up
+                    d[i] = p[i-1];
+                } else {
+                    // 1 + minimum of cell to the left, to the top, diagonally left and up
+                    d[i] = 1 + Math.min(Math.min(d[i-1], p[i]), p[i-1]);
+                }
+            }
+
+            // copy current distance counts to 'previous row' distance counts
+            _d = p;
+            p = d;
+            d = _d;
+        }
+
+        // if p[n] is greater than the threshold, there's no guarantee on it being the correct
+        // distance
+        if(p[n] <= threshold) {
+            return p[n];
+        } else {
+            return -1;
+        }
+    }
+
     // startsWith
     //-----------------------------------------------------------------------
 

Modified: commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java
URL: http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java?rev=1136496&r1=1136495&r2=1136496&view=diff
==============================================================================
--- commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java (original)
+++ commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java Thu Jun 16 15:52:09 2011
@@ -1671,6 +1671,85 @@ public class StringUtilsTest extends Tes
         }
     }
 
+    public void testGetLevenshteinDistance_StringStringInt() {
+        // empty strings
+        assertEquals(0, StringUtils.getLevenshteinDistance("", "", 0));
+        assertEquals(7, StringUtils.getLevenshteinDistance("aaapppp", "", 8));
+        assertEquals(7, StringUtils.getLevenshteinDistance("aaapppp", "", 7));
+        assertEquals(-1, StringUtils.getLevenshteinDistance("aaapppp", "", 6));
+
+        // unequal strings, zero threshold
+        assertEquals(-1, StringUtils.getLevenshteinDistance("b", "a", 0));
+        assertEquals(-1, StringUtils.getLevenshteinDistance("a", "b", 0));
+    
+        // equal strings
+        assertEquals(0, StringUtils.getLevenshteinDistance("aa", "aa", 0));
+        assertEquals(0, StringUtils.getLevenshteinDistance("aa", "aa", 2));
+
+        // same length
+        assertEquals(-1, StringUtils.getLevenshteinDistance("aaa", "bbb", 2));
+        assertEquals(3, StringUtils.getLevenshteinDistance("aaa", "bbb", 3));
+    
+        // big stripe
+        assertEquals(6, StringUtils.getLevenshteinDistance("aaaaaa", "b", 10));
+
+        // distance less than threshold
+        assertEquals(7, StringUtils.getLevenshteinDistance("aaapppp", "b", 8));
+        assertEquals(3, StringUtils.getLevenshteinDistance("a", "bbb", 4));
+    
+        // distance equal to threshold
+        assertEquals(7, StringUtils.getLevenshteinDistance("aaapppp", "b", 7));
+        assertEquals(3, StringUtils.getLevenshteinDistance("a", "bbb", 3));
+
+        // distance greater than threshold
+        assertEquals(-1, StringUtils.getLevenshteinDistance("a", "bbb", 2));
+        assertEquals(-1, StringUtils.getLevenshteinDistance("bbb", "a", 2));
+        assertEquals(-1, StringUtils.getLevenshteinDistance("aaapppp", "b", 6));
+
+        // stripe runs off array, strings not similar
+        assertEquals(-1, StringUtils.getLevenshteinDistance("a", "bbb", 1));
+        assertEquals(-1, StringUtils.getLevenshteinDistance("bbb", "a", 1));
+
+        // stripe runs off array, strings are similar
+        assertEquals(-1, StringUtils.getLevenshteinDistance("12345", "1234567", 1));
+        assertEquals(-1, StringUtils.getLevenshteinDistance("1234567", "12345", 1));
+
+        // old getLevenshteinDistance test cases
+        assertEquals(1, StringUtils.getLevenshteinDistance("frog", "fog",1) );
+        assertEquals(3, StringUtils.getLevenshteinDistance("fly", "ant",3) );
+        assertEquals(7, StringUtils.getLevenshteinDistance("elephant", "hippo",7) );
+        assertEquals(-1, StringUtils.getLevenshteinDistance("elephant", "hippo",6) );
+        assertEquals(7, StringUtils.getLevenshteinDistance("hippo", "elephant",7) );
+        assertEquals(-1, StringUtils.getLevenshteinDistance("hippo", "elephant",6) );
+        assertEquals(8, StringUtils.getLevenshteinDistance("hippo", "zzzzzzzz",8) );
+        assertEquals(8, StringUtils.getLevenshteinDistance("zzzzzzzz", "hippo",8) );
+        assertEquals(1, StringUtils.getLevenshteinDistance("hello", "hallo",1) );
+
+        // exceptions
+        try {
+            @SuppressWarnings("unused")
+            int d = StringUtils.getLevenshteinDistance("a", null, 0);
+            fail("expecting IllegalArgumentException");
+        } catch (IllegalArgumentException ex) {
+            // empty
+        }
+        try {
+            @SuppressWarnings("unused")
+            int d = StringUtils.getLevenshteinDistance(null, "a", 0);
+            fail("expecting IllegalArgumentException");
+        } catch (IllegalArgumentException ex) {
+            // empty
+        }
+
+        try {
+            @SuppressWarnings("unused")
+            int d = StringUtils.getLevenshteinDistance("a", "a", -1);
+            fail("expecting IllegalArgumentException");
+        } catch (IllegalArgumentException ex) {
+            // empty
+        }
+    }
+
     /**
      * A sanity check for {@link StringUtils#EMPTY}.
      */