You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2017/12/06 16:30:55 UTC

[2/2] lucene-solr:branch_7x: LUCENE-8051: Typo in LevensHtein distance.

LUCENE-8051: Typo in LevensHtein distance.

Closes #284


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/26a4ebfa
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/26a4ebfa
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/26a4ebfa

Branch: refs/heads/branch_7x
Commit: 26a4ebfad5a4398c2f5ce48d094c89bdd28e61ae
Parents: 292b30e
Author: Adrien Grand <jp...@gmail.com>
Authored: Wed Dec 6 15:16:25 2017 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Wed Dec 6 17:29:19 2017 +0100

----------------------------------------------------------------------
 lucene/CHANGES.txt                              |   6 +-
 .../search/spell/LevenshteinDistance.java       | 125 +++++++++++++++++++
 .../lucene/search/spell/LevensteinDistance.java | 110 ++--------------
 .../lucene/search/spell/SpellChecker.java       |   4 +-
 .../search/spell/TestLevenshteinDistance.java   |   2 +-
 .../lucene/search/spell/TestSpellChecker.java   |   4 +-
 .../apache/solr/search/ValueSourceParser.java   |   6 +-
 .../spelling/AbstractLuceneSpellChecker.java    |   4 +-
 .../apache/solr/spelling/SolrSpellChecker.java  |   6 +-
 .../ConjunctionSolrSpellCheckerTest.java        |   6 +-
 solr/solr-ref-guide/src/spell-checking.adoc     |   4 +-
 11 files changed, 156 insertions(+), 121 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 3823c7e..898cb3f 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -4,7 +4,11 @@ For more information on past and future Lucene versions, please see:
 http://s.apache.org/luceneversions
 
 ======================= Lucene 7.3.0 =======================
-(No Changes)
+
+API Changes
+
+* LUCENE-8051: LevensteinDistance renamed to LevenshteinDistance.
+  (Pulak Ghosh via Adrien Grand)
 
 ======================= Lucene 7.2.0 =======================
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/lucene/suggest/src/java/org/apache/lucene/search/spell/LevenshteinDistance.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/LevenshteinDistance.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/LevenshteinDistance.java
new file mode 100644
index 0000000..2aff15b
--- /dev/null
+++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/LevenshteinDistance.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spell;
+
+/**
+ * Levenshtein edit distance class.
+ */
+public class LevenshteinDistance implements StringDistance {
+
+    /**
+     * Optimized to run a bit faster than the static getDistance().
+     * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
+     */
+    public LevenshteinDistance () {
+    }
+
+
+    //*****************************
+    // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
+    //*****************************
+    @Override
+    public final float getDistance (String target, String other) {
+      char[] sa;
+      int n;
+      int p[]; //'previous' cost array, horizontally
+      int d[]; // cost array, horizontally
+      int _d[]; //placeholder to assist in swapping p and d
+      
+        /*
+           The difference between this impl. and the previous is that, rather
+           than creating and retaining a matrix of size s.length()+1 by t.length()+1,
+           we maintain two single-dimensional arrays of length s.length()+1.  The first, d,
+           is the 'current working' distance array that maintains the newest distance cost
+           counts as we iterate through the characters of String s.  Each time we increment
+           the index of String t we are comparing, d is copied to p, the second int[].  Doing so
+           allows us to retain the previous cost counts as required by the algorithm (taking
+           the minimum of the cost count to the left, up one, and diagonally up and to the left
+           of the current cost count being calculated).  (Note that the arrays aren't really
+           copied anymore, just switched...this is clearly much better than cloning an array
+           or doing a System.arraycopy() each time  through the outer loop.)
+
+           Effectively, the difference between the two implementations is this one does not
+           cause an out of memory condition when calculating the LD over two very large strings.
+         */
+
+        sa = target.toCharArray();
+        n = sa.length;
+        p = new int[n+1]; 
+        d = new int[n+1]; 
+      
+        final int m = other.length();
+        if (n == 0 || m == 0) {
+          if (n == m) {
+            return 1;
+          }
+          else {
+            return 0;
+          }
+        } 
+
+
+        // indexes into strings s and t
+        int i; // iterates through s
+        int j; // iterates through t
+
+        char t_j; // jth character of t
+
+        int cost; // cost
+
+        for (i = 0; i<=n; i++) {
+            p[i] = i;
+        }
+
+        for (j = 1; j<=m; j++) {
+            t_j = other.charAt(j-1);
+            d[0] = j;
+
+            for (i=1; i<=n; i++) {
+                cost = sa[i-1]==t_j ? 0 : 1;
+                // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+                d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1),  p[i-1]+cost);
+            }
+
+            // copy current distance counts to 'previous row' distance counts
+            _d = p;
+            p = d;
+            d = _d;
+        }
+
+        // our last action in the above loop was to switch d and p, so p now
+        // actually has the most recent cost counts
+        return 1.0f - ((float) p[n] / Math.max(other.length(), sa.length));
+    }
+
+  @Override
+  public final int hashCode() {
+    return 163 * getClass().hashCode();
+  }
+
+  @Override
+  public final boolean equals(Object obj) {
+    if (this == obj) return true;
+    if (null == obj) return false;
+    return (getClass() == obj.getClass());
+  }
+
+  @Override
+  public final String toString() {
+    return "levensthein";
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/lucene/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java
index de3f587..41e4b1a 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java
@@ -17,109 +17,15 @@
 package org.apache.lucene.search.spell;
 
 /**
- * Levenstein edit distance class.
+ * Levenshtein edit distance class.
+ * @deprecated Use {@link LevenshteinDistance} instead.
  */
-public final class LevensteinDistance implements StringDistance {
+@Deprecated
+public class LevensteinDistance extends LevenshteinDistance {
 
-    /**
-     * Optimized to run a bit faster than the static getDistance().
-     * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
-     */
-    public LevensteinDistance () {
-    }
+  /**
+   * Sole constructor.
+   */
+  public LevensteinDistance() {}
 
-
-    //*****************************
-    // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
-    //*****************************
-    @Override
-    public float getDistance (String target, String other) {
-      char[] sa;
-      int n;
-      int p[]; //'previous' cost array, horizontally
-      int d[]; // cost array, horizontally
-      int _d[]; //placeholder to assist in swapping p and d
-      
-        /*
-           The difference between this impl. and the previous is that, rather
-           than creating and retaining a matrix of size s.length()+1 by t.length()+1,
-           we maintain two single-dimensional arrays of length s.length()+1.  The first, d,
-           is the 'current working' distance array that maintains the newest distance cost
-           counts as we iterate through the characters of String s.  Each time we increment
-           the index of String t we are comparing, d is copied to p, the second int[].  Doing so
-           allows us to retain the previous cost counts as required by the algorithm (taking
-           the minimum of the cost count to the left, up one, and diagonally up and to the left
-           of the current cost count being calculated).  (Note that the arrays aren't really
-           copied anymore, just switched...this is clearly much better than cloning an array
-           or doing a System.arraycopy() each time  through the outer loop.)
-
-           Effectively, the difference between the two implementations is this one does not
-           cause an out of memory condition when calculating the LD over two very large strings.
-         */
-
-        sa = target.toCharArray();
-        n = sa.length;
-        p = new int[n+1]; 
-        d = new int[n+1]; 
-      
-        final int m = other.length();
-        if (n == 0 || m == 0) {
-          if (n == m) {
-            return 1;
-          }
-          else {
-            return 0;
-          }
-        } 
-
-
-        // indexes into strings s and t
-        int i; // iterates through s
-        int j; // iterates through t
-
-        char t_j; // jth character of t
-
-        int cost; // cost
-
-        for (i = 0; i<=n; i++) {
-            p[i] = i;
-        }
-
-        for (j = 1; j<=m; j++) {
-            t_j = other.charAt(j-1);
-            d[0] = j;
-
-            for (i=1; i<=n; i++) {
-                cost = sa[i-1]==t_j ? 0 : 1;
-                // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
-                d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1),  p[i-1]+cost);
-            }
-
-            // copy current distance counts to 'previous row' distance counts
-            _d = p;
-            p = d;
-            d = _d;
-        }
-
-        // our last action in the above loop was to switch d and p, so p now
-        // actually has the most recent cost counts
-        return 1.0f - ((float) p[n] / Math.max(other.length(), sa.length));
-    }
-
-  @Override
-  public int hashCode() {
-    return 163 * getClass().hashCode();
-  }
-
-  @Override
-  public boolean equals(Object obj) {
-    if (this == obj) return true;
-    if (null == obj) return false;
-    return (getClass() == obj.getClass());
-  }
-
-  @Override
-  public String toString() {
-    return "levenstein";
-  }
 }

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/lucene/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java
index 1f11e27..a3b103d 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java
@@ -126,7 +126,7 @@ public class SpellChecker implements java.io.Closeable {
   }
   /**
    * Use the given directory as a spell checker index with a
-   * {@link LevensteinDistance} as the default {@link StringDistance}. The
+   * {@link LevenshteinDistance} as the default {@link StringDistance}. The
    * directory is created if it doesn't exist yet.
    * 
    * @param spellIndex
@@ -135,7 +135,7 @@ public class SpellChecker implements java.io.Closeable {
    *           if spellchecker can not open the directory
    */
   public SpellChecker(Directory spellIndex) throws IOException {
-    this(spellIndex, new LevensteinDistance());
+    this(spellIndex, new LevenshteinDistance());
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/lucene/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
index 8b2cfd9..88c7587 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
@@ -20,7 +20,7 @@ import org.apache.lucene.util.LuceneTestCase;
 
 public class TestLevenshteinDistance extends LuceneTestCase {
 
-  private StringDistance sd = new LevensteinDistance();
+  private StringDistance sd = new LevenshteinDistance();
   
   public void testGetDistance() {
     float d = sd.getDistance("al", "al");

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/lucene/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
index 8428d84..ae6c972 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
@@ -149,7 +149,7 @@ public class TestSpellChecker extends LuceneTestCase {
   public void testComparator() throws Exception {
     IndexReader r = DirectoryReader.open(userindex);
     Directory compIdx = newDirectory();
-    SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
+    SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevenshteinDistance(), new SuggestWordFrequencyComparator());
     addwords(r, compareSP, "field3");
 
     String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3",
@@ -167,7 +167,7 @@ public class TestSpellChecker extends LuceneTestCase {
   public void testBogusField() throws Exception {
     IndexReader r = DirectoryReader.open(userindex);
     Directory compIdx = newDirectory();
-    SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
+    SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevenshteinDistance(), new SuggestWordFrequencyComparator());
     addwords(r, compareSP, "field3");
 
     String[] similar = compareSP.suggestSimilar("fvie", 2, r,

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java b/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java
index 5944dc5..dc6411e 100644
--- a/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java
+++ b/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java
@@ -41,7 +41,7 @@ import org.apache.lucene.search.Query;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.spell.JaroWinklerDistance;
-import org.apache.lucene.search.spell.LevensteinDistance;
+import org.apache.lucene.search.spell.LevenshteinDistance;
 import org.apache.lucene.search.spell.NGramDistance;
 import org.apache.lucene.search.spell.StringDistance;
 import org.apache.lucene.util.BytesRefBuilder;
@@ -406,7 +406,7 @@ public abstract class ValueSourceParser implements NamedListInitializedPlugin {
         if (distClass.equalsIgnoreCase("jw")) {
           dist = new JaroWinklerDistance();
         } else if (distClass.equalsIgnoreCase("edit")) {
-          dist = new LevensteinDistance();
+          dist = new LevenshteinDistance();
         } else if (distClass.equalsIgnoreCase("ngram")) {
           int ngram = 2;
           if (fp.hasMoreArguments()) {
@@ -1549,4 +1549,4 @@ class TestValueSource extends ValueSource {
   public SortField getSortField(boolean reverse) {
     return super.getSortField(reverse);
   }
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java b/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
index a03e911..8170982 100644
--- a/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
+++ b/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
@@ -26,7 +26,7 @@ import java.util.List;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.spell.Dictionary;
-import org.apache.lucene.search.spell.LevensteinDistance;
+import org.apache.lucene.search.spell.LevenshteinDistance;
 import org.apache.lucene.search.spell.SpellChecker;
 import org.apache.lucene.search.spell.StringDistance;
 import org.apache.lucene.search.spell.SuggestWord;
@@ -109,7 +109,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
       sd = core.getResourceLoader().newInstance(strDistanceName, StringDistance.class);
       //TODO: Figure out how to configure options.  Where's Spring when you need it?  Or at least BeanUtils...
     } else {
-      sd = new LevensteinDistance();
+      sd = new LevenshteinDistance();
     }
     try {
       initIndex();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java b/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java
index bb461ab..5543867 100644
--- a/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java
+++ b/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java
@@ -23,7 +23,7 @@ import java.util.Map;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
-import org.apache.lucene.search.spell.LevensteinDistance;
+import org.apache.lucene.search.spell.LevenshteinDistance;
 import org.apache.lucene.search.spell.StringDistance;
 import org.apache.lucene.search.spell.SuggestWord;
 import org.apache.lucene.search.spell.SuggestWordQueue;
@@ -89,9 +89,9 @@ public abstract class SolrSpellChecker {
     
     StringDistance sd = null;
     try {
-      sd = getStringDistance() == null ? new LevensteinDistance() : getStringDistance();    
+      sd = getStringDistance() == null ? new LevenshteinDistance() : getStringDistance();    
     } catch(UnsupportedOperationException uoe) {
-      sd = new LevensteinDistance();
+      sd = new LevenshteinDistance();
     }
     
     SpellingResult result = new SpellingResult();

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/solr/core/src/test/org/apache/solr/spelling/ConjunctionSolrSpellCheckerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/spelling/ConjunctionSolrSpellCheckerTest.java b/solr/core/src/test/org/apache/solr/spelling/ConjunctionSolrSpellCheckerTest.java
index 31f20fb..0df837f 100644
--- a/solr/core/src/test/org/apache/solr/spelling/ConjunctionSolrSpellCheckerTest.java
+++ b/solr/core/src/test/org/apache/solr/spelling/ConjunctionSolrSpellCheckerTest.java
@@ -18,7 +18,7 @@ package org.apache.solr.spelling;
 
 import java.io.IOException;
 
-import org.apache.lucene.search.spell.LevensteinDistance;
+import org.apache.lucene.search.spell.LevenshteinDistance;
 import org.apache.lucene.search.spell.NGramDistance;
 import org.apache.lucene.search.spell.StringDistance;
 import org.apache.lucene.util.LuceneTestCase;
@@ -31,8 +31,8 @@ public class ConjunctionSolrSpellCheckerTest extends LuceneTestCase {
   @Test
   public void test() throws Exception {
     ConjunctionSolrSpellChecker cssc = new ConjunctionSolrSpellChecker();
-    MockSolrSpellChecker levenstein1 = new MockSolrSpellChecker(new LevensteinDistance());
-    MockSolrSpellChecker levenstein2 = new MockSolrSpellChecker(new LevensteinDistance());
+    MockSolrSpellChecker levenstein1 = new MockSolrSpellChecker(new LevenshteinDistance());
+    MockSolrSpellChecker levenstein2 = new MockSolrSpellChecker(new LevenshteinDistance());
     MockSolrSpellChecker ngram = new MockSolrSpellChecker(new NGramDistance());
     
     cssc.addChecker(levenstein1);

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/solr/solr-ref-guide/src/spell-checking.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/src/spell-checking.adoc b/solr/solr-ref-guide/src/spell-checking.adoc
index 12818b2..26ec194 100644
--- a/solr/solr-ref-guide/src/spell-checking.adoc
+++ b/solr/solr-ref-guide/src/spell-checking.adoc
@@ -39,7 +39,7 @@ The `IndexBasedSpellChecker` uses a Solr index as the basis for a parallel index
     <str name="field">content</str>
     <str name="buildOnCommit">true</str>
     <!-- optional elements with defaults
-    <str name="distanceMeasure">org.apache.lucene.search.spell.LevensteinDistance</str>
+    <str name="distanceMeasure">org.apache.lucene.search.spell.LevenshteinDistance</str>
     <str name="accuracy">0.5</str>
     -->
  </lst>
@@ -99,7 +99,7 @@ The `FileBasedSpellChecker` uses an external file as a spelling dictionary. This
     <str name="characterEncoding">UTF-8</str>
     <str name="spellcheckIndexDir">./spellcheckerFile</str>
     <!-- optional elements with defaults
-    <str name="distanceMeasure">org.apache.lucene.search.spell.LevensteinDistance</str>
+    <str name="distanceMeasure">org.apache.lucene.search.spell.LevenshteinDistance</str>
     <str name="accuracy">0.5</str>
     -->
  </lst>