You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by mb...@apache.org on 2011/06/16 17:52:09 UTC
svn commit: r1136496 - in /commons/proper/lang/trunk: pom.xml
src/main/java/org/apache/commons/lang3/StringUtils.java
src/test/java/org/apache/commons/lang3/StringUtilsTest.java
Author: mbenson
Date: Thu Jun 16 15:52:09 2011
New Revision: 1136496
URL: http://svn.apache.org/viewvc?rev=1136496&view=rev
Log:
[LANG-684] Levenshtein Distance Within a Given Threshold; submitted by Eli Lindsey
Modified:
commons/proper/lang/trunk/pom.xml
commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java
commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java
Modified: commons/proper/lang/trunk/pom.xml
URL: http://svn.apache.org/viewvc/commons/proper/lang/trunk/pom.xml?rev=1136496&r1=1136495&r2=1136496&view=diff
==============================================================================
--- commons/proper/lang/trunk/pom.xml (original)
+++ commons/proper/lang/trunk/pom.xml Thu Jun 16 15:52:09 2011
@@ -288,6 +288,9 @@
<name>Rafal Krzewski</name>
</contributor>
<contributor>
+ <name>Eli Lindsey</name>
+ </contributor>
+ <contributor>
<name>Craig R. McClanahan</name>
</contributor>
<contributor>
Modified: commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java
URL: http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java?rev=1136496&r1=1136495&r2=1136496&view=diff
==============================================================================
--- commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java (original)
+++ commons/proper/lang/trunk/src/main/java/org/apache/commons/lang3/StringUtils.java Thu Jun 16 15:52:09 2011
@@ -19,6 +19,7 @@ package org.apache.commons.lang3;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
@@ -6137,6 +6138,168 @@ public class StringUtils {
return p[n];
}
+ /**
+ * <p>Find the Levenshtein distance between two Strings if it's less than or equal to a given
+ * threshold.</p>
+ *
+ * <p>This is the number of changes needed to change one String into
+ * another, where each change is a single character modification (deletion,
+ * insertion or substitution).</p>
+ *
+ * <p>This implementation follows from Algorithms on Strings, Trees and Sequences by Dan Gusfield
+ * and Chas Emerick's implementation of the Levenshtein distance algorithm from
+ * <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
+ *
+ * <pre>
+ * StringUtils.getLevenshteinDistance(null, *, *) = IllegalArgumentException
+ * StringUtils.getLevenshteinDistance(*, null, *) = IllegalArgumentException
+ * StringUtils.getLevenshteinDistance(*, *, -1) = IllegalArgumentException
+ * StringUtils.getLevenshteinDistance("","", 0) = 0
+ * StringUtils.getLevenshteinDistance("aaapppp", "", 8) = 7
+ * StringUtils.getLevenshteinDistance("aaapppp", "", 7) = 7
+ * StringUtils.getLevenshteinDistance("aaapppp", "", 6)) = -1
+ * StringUtils.getLevenshteinDistance("elephant", "hippo", 7) = 7
+ * StringUtils.getLevenshteinDistance("elephant", "hippo", 6) = -1
+ * StringUtils.getLevenshteinDistance("hippo", "elephant", 7) = 7
+ * StringUtils.getLevenshteinDistance("hippo", "elephant", 6) = -1
+ * </pre>
+ *
+ * @param s the first String, must not be null
+ * @param t the second String, must not be null
+ * @param threshold the target threshold, must not be negative
+ * @return result distance, or -1 if the distance would be greater than the threshold
+ * @throws IllegalArgumentException if either String input {@code null} or negative threshold
+ */
+ public static int getLevenshteinDistance(CharSequence s, CharSequence t, int threshold) {
+ if(s == null || t == null) {
+ throw new IllegalArgumentException("String must not be null");
+ }
+ if(threshold < 0) {
+ throw new IllegalArgumentException("Threshold must not be negative");
+ }
+
+ /*
+ This implementation only computes the distance if it's less than or equal to the
+ threshold value, returning -1 if it's greater. The advantage is performance: unbounded
+ distance is O(nm), but a bound of k allows us to reduce it to O(km) time by only
+ computing a diagonal stripe of width 2k+1 of the cost table.
+ It is also possible to use this to compute the unbounded Levenshtein distance by starting
+ the threshold at 1 and doubling each time until the distance is found; this is O(dm), where
+ d is the distance.
+
+ One subtlety comes from needing to ignore entries on the border of our stripe
+ eg.
+ p[] = |#|#|#|*
+ d[] = *|#|#|#|
+ We must ignore the entry to the left of the leftmost member
+ We must ignore the entry above the rightmost member
+
+ Another subtlety comes from our stripe running off the matrix if the strings aren't
+ of the same size. Since string s is always swapped to be the shorter of the two,
+ the stripe will always run off to the upper right instead of the lower left of the matrix.
+
+ As a concrete example, suppose s is of length 5, t is of length 7, and our threshold is 1.
+ In this case we're going to walk a stripe of length 3. The matrix would look like so:
+
+ 1 2 3 4 5
+ 1 |#|#| | | |
+ 2 |#|#|#| | |
+ 3 | |#|#|#| |
+ 4 | | |#|#|#|
+ 5 | | | |#|#|
+ 6 | | | | |#|
+ 7 | | | | | |
+
+ Note how the stripe leads off the table as there is no possible way to turn a string of length 5
+ into one of length 7 in edit distance of 1.
+
+ Additionally, this implementation decreases memory usage by using two
+ single-dimensional arrays and swapping them back and forth instead of allocating
+ an entire n by m matrix. This requires a few minor changes, such as immediately returning
+ when it's detected that the stripe has run off the matrix and initially filling the arrays with
+ large values so that entries we don't compute are ignored.
+
+ See Algorithms on Strings, Trees and Sequences by Dan Gusfield for some discussion.
+ */
+
+ int n = s.length(); // length of s
+ int m = t.length(); // length of t
+
+ // if one string is empty, the edit distance is necessarily the length of the other
+ if(n == 0) {
+ return m <= threshold? m : -1;
+ } else if(m == 0) {
+ return n <= threshold? n : -1;
+ }
+
+ if(n > m) {
+ // swap the two strings to consume less memory
+ CharSequence tmp = s;
+ s = t;
+ t = tmp;
+ n = m;
+ m = t.length();
+ }
+
+ int p[] = new int[n+1]; // 'previous' cost array, horizontally
+ int d[] = new int[n+1]; // cost array, horizontally
+ int _d[]; // placeholder to assist in swapping p and d
+
+ // fill in starting table values
+ int boundary = Math.min(n, threshold) + 1;
+ for(int i = 0; i < boundary; i++) {
+ p[i] = i;
+ }
+ // these fills ensure that the value above the rightmost entry of our
+ // stripe will be ignored in following loop iterations
+ Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE);
+ Arrays.fill(d, Integer.MAX_VALUE);
+
+ // iterates through t
+ for(int j = 1; j <= m; j++) {
+ char t_j = t.charAt(j-1); // jth character of t
+ d[0] = j;
+
+ // compute stripe indices, constrain to array size
+ int min = Math.max(1, j - threshold);
+ int max = Math.min(n, j + threshold);
+
+ // the stripe may lead off of the table if s and t are of different sizes
+ if(min > max) {
+ return -1;
+ }
+
+ // ignore entry left of leftmost
+ if(min > 1) {
+ d[min-1] = Integer.MAX_VALUE;
+ }
+
+ // iterates through [min, max] in s
+ for(int i = min; i <= max; i++) {
+ if(s.charAt(i-1) == t_j) {
+ // diagonally left and up
+ d[i] = p[i-1];
+ } else {
+ // 1 + minimum of cell to the left, to the top, diagonally left and up
+ d[i] = 1 + Math.min(Math.min(d[i-1], p[i]), p[i-1]);
+ }
+ }
+
+ // copy current distance counts to 'previous row' distance counts
+ _d = p;
+ p = d;
+ d = _d;
+ }
+
+ // if p[n] is greater than the threshold, there's no guarantee on it being the correct
+ // distance
+ if(p[n] <= threshold) {
+ return p[n];
+ } else {
+ return -1;
+ }
+ }
+
// startsWith
//-----------------------------------------------------------------------
Modified: commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java
URL: http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java?rev=1136496&r1=1136495&r2=1136496&view=diff
==============================================================================
--- commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java (original)
+++ commons/proper/lang/trunk/src/test/java/org/apache/commons/lang3/StringUtilsTest.java Thu Jun 16 15:52:09 2011
@@ -1671,6 +1671,85 @@ public class StringUtilsTest extends Tes
}
}
+ public void testGetLevenshteinDistance_StringStringInt() {
+ // empty strings
+ assertEquals(0, StringUtils.getLevenshteinDistance("", "", 0));
+ assertEquals(7, StringUtils.getLevenshteinDistance("aaapppp", "", 8));
+ assertEquals(7, StringUtils.getLevenshteinDistance("aaapppp", "", 7));
+ assertEquals(-1, StringUtils.getLevenshteinDistance("aaapppp", "", 6));
+
+ // unequal strings, zero threshold
+ assertEquals(-1, StringUtils.getLevenshteinDistance("b", "a", 0));
+ assertEquals(-1, StringUtils.getLevenshteinDistance("a", "b", 0));
+
+ // equal strings
+ assertEquals(0, StringUtils.getLevenshteinDistance("aa", "aa", 0));
+ assertEquals(0, StringUtils.getLevenshteinDistance("aa", "aa", 2));
+
+ // same length
+ assertEquals(-1, StringUtils.getLevenshteinDistance("aaa", "bbb", 2));
+ assertEquals(3, StringUtils.getLevenshteinDistance("aaa", "bbb", 3));
+
+ // big stripe
+ assertEquals(6, StringUtils.getLevenshteinDistance("aaaaaa", "b", 10));
+
+ // distance less than threshold
+ assertEquals(7, StringUtils.getLevenshteinDistance("aaapppp", "b", 8));
+ assertEquals(3, StringUtils.getLevenshteinDistance("a", "bbb", 4));
+
+ // distance equal to threshold
+ assertEquals(7, StringUtils.getLevenshteinDistance("aaapppp", "b", 7));
+ assertEquals(3, StringUtils.getLevenshteinDistance("a", "bbb", 3));
+
+ // distance greater than threshold
+ assertEquals(-1, StringUtils.getLevenshteinDistance("a", "bbb", 2));
+ assertEquals(-1, StringUtils.getLevenshteinDistance("bbb", "a", 2));
+ assertEquals(-1, StringUtils.getLevenshteinDistance("aaapppp", "b", 6));
+
+ // stripe runs off array, strings not similar
+ assertEquals(-1, StringUtils.getLevenshteinDistance("a", "bbb", 1));
+ assertEquals(-1, StringUtils.getLevenshteinDistance("bbb", "a", 1));
+
+ // stripe runs off array, strings are similar
+ assertEquals(-1, StringUtils.getLevenshteinDistance("12345", "1234567", 1));
+ assertEquals(-1, StringUtils.getLevenshteinDistance("1234567", "12345", 1));
+
+ // old getLevenshteinDistance test cases
+ assertEquals(1, StringUtils.getLevenshteinDistance("frog", "fog",1) );
+ assertEquals(3, StringUtils.getLevenshteinDistance("fly", "ant",3) );
+ assertEquals(7, StringUtils.getLevenshteinDistance("elephant", "hippo",7) );
+ assertEquals(-1, StringUtils.getLevenshteinDistance("elephant", "hippo",6) );
+ assertEquals(7, StringUtils.getLevenshteinDistance("hippo", "elephant",7) );
+ assertEquals(-1, StringUtils.getLevenshteinDistance("hippo", "elephant",6) );
+ assertEquals(8, StringUtils.getLevenshteinDistance("hippo", "zzzzzzzz",8) );
+ assertEquals(8, StringUtils.getLevenshteinDistance("zzzzzzzz", "hippo",8) );
+ assertEquals(1, StringUtils.getLevenshteinDistance("hello", "hallo",1) );
+
+ // exceptions
+ try {
+ @SuppressWarnings("unused")
+ int d = StringUtils.getLevenshteinDistance("a", null, 0);
+ fail("expecting IllegalArgumentException");
+ } catch (IllegalArgumentException ex) {
+ // empty
+ }
+ try {
+ @SuppressWarnings("unused")
+ int d = StringUtils.getLevenshteinDistance(null, "a", 0);
+ fail("expecting IllegalArgumentException");
+ } catch (IllegalArgumentException ex) {
+ // empty
+ }
+
+ try {
+ @SuppressWarnings("unused")
+ int d = StringUtils.getLevenshteinDistance("a", "a", -1);
+ fail("expecting IllegalArgumentException");
+ } catch (IllegalArgumentException ex) {
+ // empty
+ }
+ }
+
/**
* A sanity check for {@link StringUtils#EMPTY}.
*/