You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by ba...@apache.org on 2006/03/14 07:09:28 UTC
svn commit: r385745 -
/jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java
Author: bayard
Date: Mon Mar 13 22:09:27 2006
New Revision: 385745
URL: http://svn.apache.org/viewcvs?rev=385745&view=rev
Log:
Finally applying Chas Emerick's improved getLevenshtein implementation
Modified:
jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java
Modified: jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java
URL: http://svn.apache.org/viewcvs/jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java?rev=385745&r1=385744&r2=385745&view=diff
==============================================================================
--- jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java (original)
+++ jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java Mon Mar 13 22:09:27 2006
@@ -4711,8 +4711,13 @@
* another, where each change is a single character modification (deletion,
* insertion or substitution).</p>
*
- * <p>This implementation of the Levenshtein distance algorithm
- * is from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
+ * <p>The previous implementation of the Levenshtein distance algorithm
+ * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
+ *
+ * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
+ * which can occur when my Java implementation is used with very large strings.<br>
+ * This implementation of the Levenshtein distance algorithm
+ * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
*
* <pre>
* StringUtils.getLevenshteinDistance(null, *) = IllegalArgumentException
@@ -4737,57 +4742,68 @@
if (s == null || t == null) {
throw new IllegalArgumentException("Strings must not be null");
}
- int d[][]; // matrix
- int n; // length of s
- int m; // length of t
- int i; // iterates through s
- int j; // iterates through t
- char s_i; // ith character of s
- char t_j; // jth character of t
- int cost; // cost
- // Step 1
- n = s.length();
- m = t.length();
+ /*
+ The difference between this impl. and the previous is that, rather
+ than creating and retaining a matrix of size s.length()+1 by t.length()+1,
+ we maintain two single-dimensional arrays of length s.length()+1. The first, d,
+ is the 'current working' distance array that maintains the newest distance cost
+ counts as we iterate through the characters of String s. Each time we increment
+ the index of String t we are comparing, d is copied to p, the second int[]. Doing so
+ allows us to retain the previous cost counts as required by the algorithm (taking
+ the minimum of the cost count to the left, up one, and diagonally up and to the left
+ of the current cost count being calculated). (Note that the arrays aren't really
+ copied anymore, just switched...this is clearly much better than cloning an array
+ or doing a System.arraycopy() each time through the outer loop.)
+
+ Effectively, the difference between the two implementations is this one does not
+ cause an out of memory condition when calculating the LD over two very large strings.
+ */
+
+ int n = s.length(); // length of s
+ int m = t.length(); // length of t
+
if (n == 0) {
return m;
- }
- if (m == 0) {
+ } else if (m == 0) {
return n;
}
- d = new int[n + 1][m + 1];
- // Step 2
- for (i = 0; i <= n; i++) {
- d[i][0] = i;
- }
+ int p[] = new int[n+1]; //'previous' cost array, horizontally
+ int d[] = new int[n+1]; // cost array, horizontally
+ int _d[]; //placeholder to assist in swapping p and d
- for (j = 0; j <= m; j++) {
- d[0][j] = j;
- }
+ // indexes into strings s and t
+ int i; // iterates through s
+ int j; // iterates through t
- // Step 3
- for (i = 1; i <= n; i++) {
- s_i = s.charAt(i - 1);
+ char t_j; // jth character of t
- // Step 4
- for (j = 1; j <= m; j++) {
- t_j = t.charAt(j - 1);
+ int cost; // cost
- // Step 5
- if (s_i == t_j) {
- cost = 0;
- } else {
- cost = 1;
- }
+ for (i = 0; i<=n; i++) {
+ p[i] = i;
+ }
- // Step 6
- d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
+ for (j = 1; j<=m; j++) {
+ t_j = t.charAt(j-1);
+ d[0] = j;
+
+ for (i=1; i<=n; i++) {
+ cost = s.charAt(i-1)==t_j ? 0 : 1;
+ // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+ d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost);
}
+
+ // copy current distance counts to 'previous row' distance counts
+ _d = p;
+ p = d;
+ d = _d;
}
- // Step 7
- return d[n][m];
+ // our last action in the above loop was to switch d and p, so p now
+ // actually has the most recent cost counts
+ return p[n];
}
/**
@@ -4798,6 +4814,7 @@
* @param c value 3
* @return the smallest of the values
*/
+/*
private static int min(int a, int b, int c) {
// Method copied from NumberUtils to avoid dependency on subpackage
if (b < a) {
@@ -4808,5 +4825,6 @@
}
return a;
}
+*/
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org