You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by ba...@apache.org on 2006/03/14 07:09:28 UTC

svn commit: r385745 - /jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java

Author: bayard
Date: Mon Mar 13 22:09:27 2006
New Revision: 385745

URL: http://svn.apache.org/viewcvs?rev=385745&view=rev
Log:
Finally applying Chas Emerick's improved getLevenshtein implementation

Modified:
    jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java

Modified: jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java
URL: http://svn.apache.org/viewcvs/jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java?rev=385745&r1=385744&r2=385745&view=diff
==============================================================================
--- jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java (original)
+++ jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringUtils.java Mon Mar 13 22:09:27 2006
@@ -4711,8 +4711,13 @@
      * another, where each change is a single character modification (deletion,
      * insertion or substitution).</p>
      *
-     * <p>This implementation of the Levenshtein distance algorithm
-     * is from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
+     * <p>The previous implementation of the Levenshtein distance algorithm
+     * was from <a href="http://www.merriampark.com/ld.htm">http://www.merriampark.com/ld.htm</a></p>
+     *
+     * <p>Chas Emerick has written an implementation in Java, which avoids an OutOfMemoryError
+     * which can occur when my Java implementation is used with very large strings.<br>
+     * This implementation of the Levenshtein distance algorithm
+     * is from <a href="http://www.merriampark.com/ldjava.htm">http://www.merriampark.com/ldjava.htm</a></p>
      *
      * <pre>
      * StringUtils.getLevenshteinDistance(null, *)             = IllegalArgumentException
@@ -4737,57 +4742,68 @@
         if (s == null || t == null) {
             throw new IllegalArgumentException("Strings must not be null");
         }
-        int d[][]; // matrix
-        int n; // length of s
-        int m; // length of t
-        int i; // iterates through s
-        int j; // iterates through t
-        char s_i; // ith character of s
-        char t_j; // jth character of t
-        int cost; // cost
 
-        // Step 1
-        n = s.length();
-        m = t.length();
+        /*
+           The difference between this impl. and the previous is that, rather 
+           than creating and retaining a matrix of size s.length()+1 by t.length()+1, 
+           we maintain two single-dimensional arrays of length s.length()+1.  The first, d,
+           is the 'current working' distance array that maintains the newest distance cost
+           counts as we iterate through the characters of String s.  Each time we increment
+           the index of String t we are comparing, d is copied to p, the second int[].  Doing so
+           allows us to retain the previous cost counts as required by the algorithm (taking 
+           the minimum of the cost count to the left, up one, and diagonally up and to the left
+           of the current cost count being calculated).  (Note that the arrays aren't really 
+           copied anymore, just switched...this is clearly much better than cloning an array 
+           or doing a System.arraycopy() each time  through the outer loop.)
+
+           Effectively, the difference between the two implementations is this one does not 
+           cause an out of memory condition when calculating the LD over two very large strings.
+         */
+
+        int n = s.length(); // length of s
+        int m = t.length(); // length of t
+
         if (n == 0) {
             return m;
-        }
-        if (m == 0) {
+        } else if (m == 0) {
             return n;
         }
-        d = new int[n + 1][m + 1];
 
-        // Step 2
-        for (i = 0; i <= n; i++) {
-            d[i][0] = i;
-        }
+        int p[] = new int[n+1]; //'previous' cost array, horizontally
+        int d[] = new int[n+1]; // cost array, horizontally
+        int _d[]; //placeholder to assist in swapping p and d
 
-        for (j = 0; j <= m; j++) {
-            d[0][j] = j;
-        }
+        // indexes into strings s and t
+        int i; // iterates through s
+        int j; // iterates through t
 
-        // Step 3
-        for (i = 1; i <= n; i++) {
-            s_i = s.charAt(i - 1);
+        char t_j; // jth character of t
 
-            // Step 4
-            for (j = 1; j <= m; j++) {
-                t_j = t.charAt(j - 1);
+        int cost; // cost
 
-                // Step 5
-                if (s_i == t_j) {
-                    cost = 0;
-                } else {
-                    cost = 1;
-                }
+        for (i = 0; i<=n; i++) {
+            p[i] = i;
+        }
 
-                // Step 6
-                d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
+        for (j = 1; j<=m; j++) {
+            t_j = t.charAt(j-1);
+            d[0] = j;
+
+            for (i=1; i<=n; i++) {
+                cost = s.charAt(i-1)==t_j ? 0 : 1;
+                // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+                d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1),  p[i-1]+cost);  
             }
+
+            // copy current distance counts to 'previous row' distance counts
+            _d = p;
+            p = d;
+            d = _d;
         }
 
-        // Step 7
-        return d[n][m];
+        // our last action in the above loop was to switch d and p, so p now 
+        // actually has the most recent cost counts
+        return p[n];
     }
 
     /**
@@ -4798,6 +4814,7 @@
      * @param c  value 3
      * @return  the smallest of the values
      */
+/*
     private static int min(int a, int b, int c) {
         // Method copied from NumberUtils to avoid dependency on subpackage
         if (b < a) {
@@ -4808,5 +4825,6 @@
         }
         return a;
     }
+*/
 
 }



---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org