You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by ot...@apache.org on 2008/05/22 08:24:56 UTC

svn commit: r659016 - in /lucene/java/trunk: CHANGES.txt contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java

Author: otis
Date: Wed May 21 23:24:55 2008
New Revision: 659016

URL: http://svn.apache.org/viewvc?rev=659016&view=rev
Log:
LUCENE-1183: Optimized TRStringDistance class (in contrib/spell) that uses less memory than the previous version

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=659016&r1=659015&r2=659016&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Wed May 21 23:24:55 2008
@@ -174,6 +174,9 @@
     runtime type checking for possible speedup of binaryValue().
     (Eks Dev via Mike McCandless)
 
+ 5. LUCENE-1183: Optimized TRStringDistance class (in contrib/spell) that uses
+    less memory than the previous version.  (Cédrik LIME via Otis Gospodnetic)
+
 Documentation
 
   1. LUCENE-1236:  Added some clarifying remarks to EdgeNGram*.java (Hiroaki Kawai via Grant Ingersoll)

Modified: lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java?rev=659016&r1=659015&r2=659016&view=diff
==============================================================================
--- lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java (original)
+++ lucene/java/trunk/contrib/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java Wed May 21 23:24:55 2008
@@ -1,6 +1,5 @@
 package org.apache.lucene.search.spell;
 
-
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -19,13 +18,16 @@
  */
 
 /**
- * Edit distance  class
+ * Edit distance class.
+ * Note: this class is not thread-safe.
  */
 final class TRStringDistance {
 
     final char[] sa;
     final int n;
-    final int[][][] cache=new int[30][][];
+    int p[]; //'previous' cost array, horizontally
+    int d[]; // cost array, horizontally
+    int _d[]; //placeholder to assist in swapping p and d
 
 
     /**
@@ -33,101 +35,73 @@
      * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
      */
     public TRStringDistance (String target) {
-        sa=target.toCharArray();
-        n=sa.length;
+        sa = target.toCharArray();
+        n = sa.length;
+        p = new int[n+1]; //'previous' cost array, horizontally
+        d = new int[n+1]; // cost array, horizontally
     }
 
 
     //*****************************
-     // Compute Levenshtein distance
-     //*****************************
-      public final int getDistance (String other) {
-          int d[][]; // matrix
-          int cost; // cost
-
-          // Step 1
-          final char[] ta=other.toCharArray();
-          final int m=ta.length;
-          if (n==0) {
-              return m;
-          }
-          if (m==0) {
-              return n;
-          }
-
-          if (m>=cache.length) {
-              d=form(n, m);
-          }
-          else if (cache[m]!=null) {
-              d=cache[m];
-          }
-          else {
-              d=cache[m]=form(n, m);
-
-              // Step 3
-
-          }
-          for (int i=1; i<=n; i++) {
-              final char s_i=sa[i-1];
-
-              // Step 4
-
-              for (int j=1; j<=m; j++) {
-                  final char t_j=ta[j-1];
-
-                  // Step 5
-
-                  if (s_i==t_j) { // same
-                      cost=0;
-                  }
-                  else { // not a match
-                      cost=1;
-
-                      // Step 6
-
-                  }
-                  d[i][j]=min3(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+cost);
-
-              }
-
-          }
+    // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
+    //*****************************
+    public final int getDistance (String other) {
+        /*
+           The difference between this impl. and the previous is that, rather
+           than creating and retaining a matrix of size s.length()+1 by t.length()+1,
+           we maintain two single-dimensional arrays of length s.length()+1.  The first, d,
+           is the 'current working' distance array that maintains the newest distance cost
+           counts as we iterate through the characters of String s.  Each time we increment
+           the index of String t we are comparing, d is copied to p, the second int[].  Doing so
+           allows us to retain the previous cost counts as required by the algorithm (taking
+           the minimum of the cost count to the left, up one, and diagonally up and to the left
+           of the current cost count being calculated).  (Note that the arrays aren't really
+           copied anymore, just switched...this is clearly much better than cloning an array
+           or doing a System.arraycopy() each time  through the outer loop.)
+
+           Effectively, the difference between the two implementations is this one does not
+           cause an out of memory condition when calculating the LD over two very large strings.
+         */
+
+        final int m = other.length();
+
+        if (n == 0) {
+            return m;
+        } else if (m == 0) {
+            return n;
+        }
 
-          // Step 7
-          return d[n][m];
+        // indexes into strings s and t
+        int i; // iterates through s
+        int j; // iterates through t
 
-      }
+        char t_j; // jth character of t
 
+        int cost; // cost
 
-    /**
-     *
-     */
-    private static int[][] form (int n, int m) {
-        int[][] d=new int[n+1][m+1];
-        // Step 2
-
-        for (int i=0; i<=n; i++) {
-            d[i][0]=i;
-
-        }
-        for (int j=0; j<=m; j++) {
-            d[0][j]=j;
+        for (i = 0; i<=n; i++) {
+            p[i] = i;
         }
-        return d;
-    }
 
+        for (j = 1; j<=m; j++) {
+            t_j = other.charAt(j-1);
+            d[0] = j;
+
+            for (i=1; i<=n; i++) {
+                cost = sa[i-1]==t_j ? 0 : 1;
+                // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+                d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1),  p[i-1]+cost);
+            }
+
+            // copy current distance counts to 'previous row' distance counts
+            _d = p;
+            p = d;
+            d = _d;
+        }
 
-    //****************************
-     // Get minimum of three values
-     //****************************
-      private static int min3 (int a, int b, int c) {
-          int mi=a;
-          if (b<mi) {
-              mi=b;
-          }
-          if (c<mi) {
-              mi=c;
-          }
-          return mi;
+        // our last action in the above loop was to switch d and p, so p now
+        // actually has the most recent cost counts
+        return p[n];
+    }
 
-      }
 }