You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by jp...@apache.org on 2017/12/06 16:30:55 UTC
[2/2] lucene-solr:branch_7x: LUCENE-8051: Typo in LevensHtein
distance.
LUCENE-8051: Typo in LevensHtein distance.
Closes #284
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/26a4ebfa
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/26a4ebfa
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/26a4ebfa
Branch: refs/heads/branch_7x
Commit: 26a4ebfad5a4398c2f5ce48d094c89bdd28e61ae
Parents: 292b30e
Author: Adrien Grand <jp...@gmail.com>
Authored: Wed Dec 6 15:16:25 2017 +0100
Committer: Adrien Grand <jp...@gmail.com>
Committed: Wed Dec 6 17:29:19 2017 +0100
----------------------------------------------------------------------
lucene/CHANGES.txt | 6 +-
.../search/spell/LevenshteinDistance.java | 125 +++++++++++++++++++
.../lucene/search/spell/LevensteinDistance.java | 110 ++--------------
.../lucene/search/spell/SpellChecker.java | 4 +-
.../search/spell/TestLevenshteinDistance.java | 2 +-
.../lucene/search/spell/TestSpellChecker.java | 4 +-
.../apache/solr/search/ValueSourceParser.java | 6 +-
.../spelling/AbstractLuceneSpellChecker.java | 4 +-
.../apache/solr/spelling/SolrSpellChecker.java | 6 +-
.../ConjunctionSolrSpellCheckerTest.java | 6 +-
solr/solr-ref-guide/src/spell-checking.adoc | 4 +-
11 files changed, 156 insertions(+), 121 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 3823c7e..898cb3f 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -4,7 +4,11 @@ For more information on past and future Lucene versions, please see:
http://s.apache.org/luceneversions
======================= Lucene 7.3.0 =======================
-(No Changes)
+
+API Changes
+
+* LUCENE-8051: LevensteinDistance renamed to LevenshteinDistance.
+ (Pulak Ghosh via Adrien Grand)
======================= Lucene 7.2.0 =======================
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/lucene/suggest/src/java/org/apache/lucene/search/spell/LevenshteinDistance.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/LevenshteinDistance.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/LevenshteinDistance.java
new file mode 100644
index 0000000..2aff15b
--- /dev/null
+++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/LevenshteinDistance.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.spell;
+
+/**
+ * Levenshtein edit distance class.
+ */
+public class LevenshteinDistance implements StringDistance {
+
+ /**
+ * Optimized to run a bit faster than the static getDistance().
+ * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
+ */
+ public LevenshteinDistance () {
+ }
+
+
+ //*****************************
+ // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
+ //*****************************
+ @Override
+ public final float getDistance (String target, String other) {
+ char[] sa;
+ int n;
+ int p[]; //'previous' cost array, horizontally
+ int d[]; // cost array, horizontally
+ int _d[]; //placeholder to assist in swapping p and d
+
+ /*
+ The difference between this impl. and the previous is that, rather
+ than creating and retaining a matrix of size s.length()+1 by t.length()+1,
+ we maintain two single-dimensional arrays of length s.length()+1. The first, d,
+ is the 'current working' distance array that maintains the newest distance cost
+ counts as we iterate through the characters of String s. Each time we increment
+ the index of String t we are comparing, d is copied to p, the second int[]. Doing so
+ allows us to retain the previous cost counts as required by the algorithm (taking
+ the minimum of the cost count to the left, up one, and diagonally up and to the left
+ of the current cost count being calculated). (Note that the arrays aren't really
+ copied anymore, just switched...this is clearly much better than cloning an array
+ or doing a System.arraycopy() each time through the outer loop.)
+
+ Effectively, the difference between the two implementations is this one does not
+ cause an out of memory condition when calculating the LD over two very large strings.
+ */
+
+ sa = target.toCharArray();
+ n = sa.length;
+ p = new int[n+1];
+ d = new int[n+1];
+
+ final int m = other.length();
+ if (n == 0 || m == 0) {
+ if (n == m) {
+ return 1;
+ }
+ else {
+ return 0;
+ }
+ }
+
+
+ // indexes into strings s and t
+ int i; // iterates through s
+ int j; // iterates through t
+
+ char t_j; // jth character of t
+
+ int cost; // cost
+
+ for (i = 0; i<=n; i++) {
+ p[i] = i;
+ }
+
+ for (j = 1; j<=m; j++) {
+ t_j = other.charAt(j-1);
+ d[0] = j;
+
+ for (i=1; i<=n; i++) {
+ cost = sa[i-1]==t_j ? 0 : 1;
+ // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+ d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost);
+ }
+
+ // copy current distance counts to 'previous row' distance counts
+ _d = p;
+ p = d;
+ d = _d;
+ }
+
+ // our last action in the above loop was to switch d and p, so p now
+ // actually has the most recent cost counts
+ return 1.0f - ((float) p[n] / Math.max(other.length(), sa.length));
+ }
+
+ @Override
+ public final int hashCode() {
+ return 163 * getClass().hashCode();
+ }
+
+ @Override
+ public final boolean equals(Object obj) {
+ if (this == obj) return true;
+ if (null == obj) return false;
+ return (getClass() == obj.getClass());
+ }
+
+ @Override
+ public final String toString() {
+ return "levensthein";
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/lucene/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java
index de3f587..41e4b1a 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/LevensteinDistance.java
@@ -17,109 +17,15 @@
package org.apache.lucene.search.spell;
/**
- * Levenstein edit distance class.
+ * Levenshtein edit distance class.
+ * @deprecated Use {@link LevenshteinDistance} instead.
*/
-public final class LevensteinDistance implements StringDistance {
+@Deprecated
+public class LevensteinDistance extends LevenshteinDistance {
- /**
- * Optimized to run a bit faster than the static getDistance().
- * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
- */
- public LevensteinDistance () {
- }
+ /**
+ * Sole constructor.
+ */
+ public LevensteinDistance() {}
-
- //*****************************
- // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
- //*****************************
- @Override
- public float getDistance (String target, String other) {
- char[] sa;
- int n;
- int p[]; //'previous' cost array, horizontally
- int d[]; // cost array, horizontally
- int _d[]; //placeholder to assist in swapping p and d
-
- /*
- The difference between this impl. and the previous is that, rather
- than creating and retaining a matrix of size s.length()+1 by t.length()+1,
- we maintain two single-dimensional arrays of length s.length()+1. The first, d,
- is the 'current working' distance array that maintains the newest distance cost
- counts as we iterate through the characters of String s. Each time we increment
- the index of String t we are comparing, d is copied to p, the second int[]. Doing so
- allows us to retain the previous cost counts as required by the algorithm (taking
- the minimum of the cost count to the left, up one, and diagonally up and to the left
- of the current cost count being calculated). (Note that the arrays aren't really
- copied anymore, just switched...this is clearly much better than cloning an array
- or doing a System.arraycopy() each time through the outer loop.)
-
- Effectively, the difference between the two implementations is this one does not
- cause an out of memory condition when calculating the LD over two very large strings.
- */
-
- sa = target.toCharArray();
- n = sa.length;
- p = new int[n+1];
- d = new int[n+1];
-
- final int m = other.length();
- if (n == 0 || m == 0) {
- if (n == m) {
- return 1;
- }
- else {
- return 0;
- }
- }
-
-
- // indexes into strings s and t
- int i; // iterates through s
- int j; // iterates through t
-
- char t_j; // jth character of t
-
- int cost; // cost
-
- for (i = 0; i<=n; i++) {
- p[i] = i;
- }
-
- for (j = 1; j<=m; j++) {
- t_j = other.charAt(j-1);
- d[0] = j;
-
- for (i=1; i<=n; i++) {
- cost = sa[i-1]==t_j ? 0 : 1;
- // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
- d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost);
- }
-
- // copy current distance counts to 'previous row' distance counts
- _d = p;
- p = d;
- d = _d;
- }
-
- // our last action in the above loop was to switch d and p, so p now
- // actually has the most recent cost counts
- return 1.0f - ((float) p[n] / Math.max(other.length(), sa.length));
- }
-
- @Override
- public int hashCode() {
- return 163 * getClass().hashCode();
- }
-
- @Override
- public boolean equals(Object obj) {
- if (this == obj) return true;
- if (null == obj) return false;
- return (getClass() == obj.getClass());
- }
-
- @Override
- public String toString() {
- return "levenstein";
- }
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/lucene/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java b/lucene/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java
index 1f11e27..a3b103d 100644
--- a/lucene/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java
@@ -126,7 +126,7 @@ public class SpellChecker implements java.io.Closeable {
}
/**
* Use the given directory as a spell checker index with a
- * {@link LevensteinDistance} as the default {@link StringDistance}. The
+ * {@link LevenshteinDistance} as the default {@link StringDistance}. The
* directory is created if it doesn't exist yet.
*
* @param spellIndex
@@ -135,7 +135,7 @@ public class SpellChecker implements java.io.Closeable {
* if spellchecker can not open the directory
*/
public SpellChecker(Directory spellIndex) throws IOException {
- this(spellIndex, new LevensteinDistance());
+ this(spellIndex, new LevenshteinDistance());
}
/**
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/lucene/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
index 8b2cfd9..88c7587 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestLevenshteinDistance.java
@@ -20,7 +20,7 @@ import org.apache.lucene.util.LuceneTestCase;
public class TestLevenshteinDistance extends LuceneTestCase {
- private StringDistance sd = new LevensteinDistance();
+ private StringDistance sd = new LevenshteinDistance();
public void testGetDistance() {
float d = sd.getDistance("al", "al");
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/lucene/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
----------------------------------------------------------------------
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
index 8428d84..ae6c972 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
@@ -149,7 +149,7 @@ public class TestSpellChecker extends LuceneTestCase {
public void testComparator() throws Exception {
IndexReader r = DirectoryReader.open(userindex);
Directory compIdx = newDirectory();
- SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
+ SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevenshteinDistance(), new SuggestWordFrequencyComparator());
addwords(r, compareSP, "field3");
String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3",
@@ -167,7 +167,7 @@ public class TestSpellChecker extends LuceneTestCase {
public void testBogusField() throws Exception {
IndexReader r = DirectoryReader.open(userindex);
Directory compIdx = newDirectory();
- SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
+ SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevenshteinDistance(), new SuggestWordFrequencyComparator());
addwords(r, compareSP, "field3");
String[] similar = compareSP.suggestSimilar("fvie", 2, r,
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java b/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java
index 5944dc5..dc6411e 100644
--- a/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java
+++ b/solr/core/src/java/org/apache/solr/search/ValueSourceParser.java
@@ -41,7 +41,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spell.JaroWinklerDistance;
-import org.apache.lucene.search.spell.LevensteinDistance;
+import org.apache.lucene.search.spell.LevenshteinDistance;
import org.apache.lucene.search.spell.NGramDistance;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.util.BytesRefBuilder;
@@ -406,7 +406,7 @@ public abstract class ValueSourceParser implements NamedListInitializedPlugin {
if (distClass.equalsIgnoreCase("jw")) {
dist = new JaroWinklerDistance();
} else if (distClass.equalsIgnoreCase("edit")) {
- dist = new LevensteinDistance();
+ dist = new LevenshteinDistance();
} else if (distClass.equalsIgnoreCase("ngram")) {
int ngram = 2;
if (fp.hasMoreArguments()) {
@@ -1549,4 +1549,4 @@ class TestValueSource extends ValueSource {
public SortField getSortField(boolean reverse) {
return super.getSortField(reverse);
}
-}
\ No newline at end of file
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java b/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
index a03e911..8170982 100644
--- a/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
+++ b/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
@@ -26,7 +26,7 @@ import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.Dictionary;
-import org.apache.lucene.search.spell.LevensteinDistance;
+import org.apache.lucene.search.spell.LevenshteinDistance;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.search.spell.SuggestWord;
@@ -109,7 +109,7 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
sd = core.getResourceLoader().newInstance(strDistanceName, StringDistance.class);
//TODO: Figure out how to configure options. Where's Spring when you need it? Or at least BeanUtils...
} else {
- sd = new LevensteinDistance();
+ sd = new LevenshteinDistance();
}
try {
initIndex();
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java b/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java
index bb461ab..5543867 100644
--- a/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java
+++ b/solr/core/src/java/org/apache/solr/spelling/SolrSpellChecker.java
@@ -23,7 +23,7 @@ import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
-import org.apache.lucene.search.spell.LevensteinDistance;
+import org.apache.lucene.search.spell.LevenshteinDistance;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.SuggestWordQueue;
@@ -89,9 +89,9 @@ public abstract class SolrSpellChecker {
StringDistance sd = null;
try {
- sd = getStringDistance() == null ? new LevensteinDistance() : getStringDistance();
+ sd = getStringDistance() == null ? new LevenshteinDistance() : getStringDistance();
} catch(UnsupportedOperationException uoe) {
- sd = new LevensteinDistance();
+ sd = new LevenshteinDistance();
}
SpellingResult result = new SpellingResult();
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/solr/core/src/test/org/apache/solr/spelling/ConjunctionSolrSpellCheckerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/spelling/ConjunctionSolrSpellCheckerTest.java b/solr/core/src/test/org/apache/solr/spelling/ConjunctionSolrSpellCheckerTest.java
index 31f20fb..0df837f 100644
--- a/solr/core/src/test/org/apache/solr/spelling/ConjunctionSolrSpellCheckerTest.java
+++ b/solr/core/src/test/org/apache/solr/spelling/ConjunctionSolrSpellCheckerTest.java
@@ -18,7 +18,7 @@ package org.apache.solr.spelling;
import java.io.IOException;
-import org.apache.lucene.search.spell.LevensteinDistance;
+import org.apache.lucene.search.spell.LevenshteinDistance;
import org.apache.lucene.search.spell.NGramDistance;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.util.LuceneTestCase;
@@ -31,8 +31,8 @@ public class ConjunctionSolrSpellCheckerTest extends LuceneTestCase {
@Test
public void test() throws Exception {
ConjunctionSolrSpellChecker cssc = new ConjunctionSolrSpellChecker();
- MockSolrSpellChecker levenstein1 = new MockSolrSpellChecker(new LevensteinDistance());
- MockSolrSpellChecker levenstein2 = new MockSolrSpellChecker(new LevensteinDistance());
+ MockSolrSpellChecker levenstein1 = new MockSolrSpellChecker(new LevenshteinDistance());
+ MockSolrSpellChecker levenstein2 = new MockSolrSpellChecker(new LevenshteinDistance());
MockSolrSpellChecker ngram = new MockSolrSpellChecker(new NGramDistance());
cssc.addChecker(levenstein1);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/26a4ebfa/solr/solr-ref-guide/src/spell-checking.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/src/spell-checking.adoc b/solr/solr-ref-guide/src/spell-checking.adoc
index 12818b2..26ec194 100644
--- a/solr/solr-ref-guide/src/spell-checking.adoc
+++ b/solr/solr-ref-guide/src/spell-checking.adoc
@@ -39,7 +39,7 @@ The `IndexBasedSpellChecker` uses a Solr index as the basis for a parallel index
<str name="field">content</str>
<str name="buildOnCommit">true</str>
<!-- optional elements with defaults
- <str name="distanceMeasure">org.apache.lucene.search.spell.LevensteinDistance</str>
+ <str name="distanceMeasure">org.apache.lucene.search.spell.LevenshteinDistance</str>
<str name="accuracy">0.5</str>
-->
</lst>
@@ -99,7 +99,7 @@ The `FileBasedSpellChecker` uses an external file as a spelling dictionary. This
<str name="characterEncoding">UTF-8</str>
<str name="spellcheckIndexDir">./spellcheckerFile</str>
<!-- optional elements with defaults
- <str name="distanceMeasure">org.apache.lucene.search.spell.LevensteinDistance</str>
+ <str name="distanceMeasure">org.apache.lucene.search.spell.LevenshteinDistance</str>
<str name="accuracy">0.5</str>
-->
</lst>