You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@asterixdb.apache.org by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org> on 2017/02/02 23:56:50 UTC

Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function

Taewoo Kim has uploaded a new change for review.

  https://asterix-gerrit.ics.uci.edu/1481

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................

ASTERIXDB-1778: optimize the edit-distance-check function

 - Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
 - Terminate the calculation stpes early when it become obvious that
   the possible edit-distance value is greater than the given threshold.
   There is no reason to computes all cells in the 2*2 array.

Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
8 files changed, 173 insertions(+), 117 deletions(-)


  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/1

diff --git a/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md b/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
index 89ef0f7..cb3318f 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
@@ -47,6 +47,36 @@
 
         2
 
+### edit_distance_check ###
+* Syntax:
+
+        edit_distance_check(expression1, expression2, threshold)
+
+* Checks whether the edit distance of `expression1` and `expression2` is within a given threshold.
+
+* Arguments:
+    * `expression1` : a `string` or a homogeneous `array` of a comparable item type.
+    * `expression2` : The same type as `expression1`.
+    * `threshold` : a `bigint` that represents the distance threshold.
+* Return Value:
+    * an `array` with two items:
+        * The first item contains a `boolean` value representing whether the edit distance of `expression1` and `expression2` is within the given threshold.
+        * The second item contains an `integer` that represents the edit distance of `expression1` and `expression2` if the first item is true.
+        * If the first item is false, then the second item is set to 2147483647.
+    * `missing` if any argument is a `missing` value,
+    * `null` if any argument is a `null` value but no argument is a `missing` value,
+    * a type error will be raised if:
+        * the first or second argument is any other non-string value,
+        * or, the third argument is any other non-bigint value.
+* Note: an [n_gram index](similarity.html#UsingIndexesToSupportSimilarityQueries) can be utilized for this function.
+* Example:
+
+        edit_distance_check("happy","hapr",2);
+
+
+* The expected result is:
+
+        [ true, 2 ]
 
 ### edit_distance_contains ###
 * Syntax:
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
index ac4a3dd..751597d 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
@@ -22,8 +22,11 @@
 import org.apache.hyracks.api.exceptions.HyracksDataException;
 
 public interface IGenericSimilarityMetric {
-    // returns similarity
-    public float getSimilarity(IListIterator firstList, IListIterator secondList) throws HyracksDataException;
+    // Returns -1 if this method supports early-termination and it becomes obvious that
+    // the possible similarity value can't satisfy the given simThresh value.
+    // Else returns the calculated similarity value.
+    public float getActualSimilarityVal(IListIterator firstList, IListIterator secondList, float simThresh)
+            throws HyracksDataException;
 
     // returns -1 if does not satisfy threshold
     // else returns similarity
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
index d36d60d..70029a3 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
@@ -64,23 +64,6 @@
     }
 
     public static int getIntersectSize(int[] tokensX, int startX, int[] tokensY, int startY) {
-        // int intersectSize = 0;
-        //
-        // while (startX < tokensX.length && startY < tokensY.length) {
-        // int tokenX = tokensX[startX];
-        // int tokenY = tokensY[startY];
-        // if (tokenX > tokenY) {
-        // startY++;
-        // } else if (tokenX < tokenY) {
-        // startX++;
-        // } else {
-        // intersectSize++;
-        // startX++;
-        // startY++;
-        // }
-        // }
-        //
-        // return intersectSize;
         return getIntersectSize(tokensX, startX, tokensX.length, tokensY, startY, tokensY.length);
     }
 
@@ -129,52 +112,6 @@
 
     public static PartialIntersect getPartialIntersectSize(int[] tokensX, int[] tokensY, int tokenStop) {
         return getPartialIntersectSize(tokensX, 0, tokensX.length, tokensY, 0, tokensY.length, tokenStop);
-    }
-
-    // @SuppressWarnings("unchecked")
-    // public static int getIntersectSize(DataBag tokensX, DataBag tokensY) {
-    // int intersectSize = 0;
-    //
-    // Iterator<Tuple> iteratorX = tokensX.iterator();
-    // Iterator<Tuple> iteratorY = tokensY.iterator();
-    //
-    // Tuple nextX = null;
-    // Tuple nextY = null;
-    //
-    // while ((nextX != null || iteratorX.hasNext())
-    // && (nextY != null || iteratorY.hasNext())) {
-    // if (nextX == null) {
-    // nextX = iteratorX.next();
-    // }
-    // if (nextY == null) {
-    // nextY = iteratorY.next();
-    // }
-    //
-    // int cmp = nextX.compareTo(nextY);
-    // if (cmp > 0) {
-    // nextY = null;
-    // } else if (cmp < 0) {
-    // nextX = null;
-    // } else {
-    // intersectSize++;
-    // nextX = null;
-    // nextY = null;
-    // }
-    // }
-    //
-    // return intersectSize;
-    // }
-
-    // public abstract float getSimilarity(DataBag tokensX, DataBag tokensY);
-
-    // public abstract float getSimilarity(DataBag tokensX, int lengthX,
-    // DataBag tokensY, int lengthY);
-
-    public float getSimilarity(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException {
-        int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
-        int totalSize = tokensX.size() + tokensY.size();
-
-        return (float) intersectionSize / (totalSize - intersectionSize);
     }
 
     public abstract float getSimilarity(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY);
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
index 9dce89e..ba0453a 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
@@ -26,32 +26,50 @@
 
 public class SimilarityMetricEditDistance implements IGenericSimilarityMetric {
 
-    // dp implementation only needs 2 rows
+    // This Dynamic Programming implementation only needs 2 rows.
     private final int rows = 2;
     private int cols;
     private int[][] matrix;
 
-    // for letter count filtering
-    private final int[] fsLcCount = new int[128];
-    private final int[] ssLcCount = new int[128];
+    // for ASCII letter count filtering
+    private final int[] letterCounts = new int[128];
 
     public SimilarityMetricEditDistance() {
         cols = 100; // arbitrary default value
         matrix = new int[rows][cols];
     }
 
+    /**
+     * Gets the edit distance value for the given two lists using a Dynamic Programming approach.
+     * If a positive simThresh value is provided, this method only calculates 2 * (simThresh + 1) cells per row,
+     * not entire cells as an optimization. Refer to https://en.wikipedia.org/wiki/Wagner\u2013Fischer_algorithm
+     * for more details. Also, as one more optimization, during the calculation steps, if this method finds out
+     * that the final edit distance value cannot be less than simThresh, this method stops the calculation
+     * and immediately returns -1.
+     * If the final edit distance value is less than or equal to simThresh, then that value will be returned.
+     * If a non-positive simThresh is given, then it calculates all cells and rows and returns
+     * the final edit distance value.
+     *
+     * @return the edit distance of the two lists. -1 if a positive simThresh value is given and the edit distance
+     *         value is greater than the given simThresh.
+     */
     @Override
-    public float getSimilarity(IListIterator firstList, IListIterator secondList) throws HyracksDataException {
+    public float getActualSimilarityVal(IListIterator firstList, IListIterator secondList, float simThresh)
+            throws HyracksDataException {
         int flLen = firstList.size();
         int slLen = secondList.size();
 
-        // reuse existing matrix if possible
+        // When a positive threshold is given, then we can apply two optimizations.
+        int edThresh = (int) simThresh;
+        boolean canTerminateEarly = edThresh >= 0 ? true : false;
+
+        // Reuses the existing matrix if possible.
         if (slLen >= cols) {
             cols = slLen + 1;
             matrix = new int[rows][cols];
         }
 
-        // init matrix
+        // Inits the matrix.
         for (int i = 0; i <= slLen; i++) {
             matrix[0][i] = i;
         }
@@ -59,19 +77,53 @@
         int currRow = 1;
         int prevRow = 0;
 
-        // expand dynamic programming matrix row by row
+        int from = 1;
+        int to = slLen;
+        int minDistance = -1;
+
+        // Expands the dynamic programming matrix row by row.
         for (int i = 1; i <= flLen; i++) {
             matrix[currRow][0] = i;
 
             secondList.reset();
-            for (int j = 1; j <= slLen; j++) {
+
+            // Only calculates 2 * (simThresh + 1) cells per row as an optimization.
+            // Also keeps minDistance to see whether the possible edit distance after
+            // each row calculation is greater than the simThresh.
+            if (canTerminateEarly) {
+                minDistance = edThresh + 1;
+                from = Math.max(i - edThresh - 1, 1);
+                to = Math.min(i + edThresh + 1, slLen);
+                for (int j = 1; j < from; j++) {
+                    // Moves the pointer of the second list to the point where the calculation starts for this row.
+                    secondList.next();
+                }
+                if (from > 1) {
+                    // Sets the left Boundary cell value to make sure that the calculation is correct.
+                    matrix[currRow][from - 1] = edThresh + 1;
+                }
+                if (to < slLen) {
+                    // Sets the right Boundary cell value to make sure that the calculation is correct.
+                    matrix[currRow][to + 1] = edThresh + 1;
+                }
+            }
+
+            for (int j = from; j <= to; j++) {
 
                 matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
                         matrix[prevRow][j - 1] + (firstList.compare(secondList) == 0 ? 0 : 1));
 
+                // Replaces minDistance after each cell computation if we find a smaller value than that.
+                if (canTerminateEarly && matrix[currRow][j] < minDistance) {
+                    minDistance = matrix[currRow][j];
+                }
+
                 secondList.next();
             }
-
+            // If the minimum distance value is greater than the given threshold, no reason to process next row.
+            if (canTerminateEarly && minDistance > edThresh) {
+                return -1;
+            }
             firstList.next();
 
             int tmp = currRow;
@@ -82,6 +134,9 @@
         return matrix[prevRow][slLen];
     }
 
+    /**
+     * Gets the edit distance value for the given two lists.
+     */
     @Override
     public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh)
             throws HyracksDataException {
@@ -96,8 +151,8 @@
             return -1;
         }
 
-        float ed = getSimilarity(firstList, secondList);
-        if (ed > edThresh) {
+        float ed = getActualSimilarityVal(firstList, secondList, simThresh);
+        if (ed > edThresh || ed < 0) {
             return -1;
         } else {
             return ed;
@@ -155,7 +210,8 @@
     }
 
     // faster implementation for common case of string edit distance
-    public int UTF8StringEditDistance(byte[] leftBytes, int fsStart, byte[] rightBytes, int ssStart) {
+    public int getActualUTF8StringEditDistanceVal(byte[] leftBytes, int fsStart, byte[] rightBytes, int ssStart,
+            int edThresh) {
         int fsLen = UTF8StringUtil.getStringLength(leftBytes, fsStart);
         int ssLen = UTF8StringUtil.getStringLength(rightBytes, ssStart);
 
@@ -164,7 +220,10 @@
         int fsMetaLen = UTF8StringUtil.getNumBytesToStoreLength(fsUtfLen);
         int ssMetaLen = UTF8StringUtil.getNumBytesToStoreLength(ssUtfLen);
 
-        // reuse existing matrix if possible
+        // When a positive threshold is given, then we can apply two optimizations.
+        boolean canTerminateEarly = edThresh >= 0 ? true : false;
+
+        // Reuses the existing matrix if possible.
         if (ssLen >= cols) {
             cols = ssLen + 1;
             matrix = new int[rows][cols];
@@ -173,7 +232,7 @@
         int fsDataStart = fsStart + fsMetaLen;
         int ssDataStart = ssStart + ssMetaLen;
 
-        // init matrix
+        // Inits the matrix
         for (int i = 0; i <= ssLen; i++) {
             matrix[0][i] = i;
         }
@@ -181,19 +240,55 @@
         int currRow = 1;
         int prevRow = 0;
 
-        // expand dynamic programming matrix row by row
+        int from = 1;
+        int to = ssLen;
+        int minDistance = -1;
+
+        // Expands the dynamic programming matrix row by row.
         int fsPos = fsDataStart;
         for (int i = 1; i <= fsLen; i++) {
             matrix[currRow][0] = i;
             char fsChar = Character.toLowerCase(UTF8StringUtil.charAt(leftBytes, fsPos));
             int ssPos = ssDataStart;
-            for (int j = 1; j <= ssLen; j++) {
+
+            // Only calculates 2 * (simThresh + 1) cells per row as an optimization.
+            // Also keeps minDistance to see whether the possible edit distance after
+            // each row calculation is greater than the simThresh.
+            if (canTerminateEarly) {
+                minDistance = edThresh + 1;
+                from = Math.max(i - edThresh - 1, 1);
+                to = Math.min(i + edThresh + 1, ssLen);
+                for (int j = 1; j < from; j++) {
+                    // Moves the pointer of the second list to the point where the calculation starts for this row.
+                    ssPos += UTF8StringUtil.charSize(rightBytes, ssPos);
+                }
+                if (from > 1) {
+                    // Sets the left Boundary cell value to make sure that the calculation is correct.
+                    matrix[currRow][from - 1] = edThresh + 1;
+                }
+                if (to < ssLen) {
+                    // Sets the right Boundary cell value to make sure that the calculation is correct.
+                    matrix[currRow][to + 1] = edThresh + 1;
+                }
+            }
+
+            for (int j = from; j <= to; j++) {
                 char ssChar = Character.toLowerCase(UTF8StringUtil.charAt(rightBytes, ssPos));
 
                 matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
                         matrix[prevRow][j - 1] + (fsChar == ssChar ? 0 : 1));
 
+                // Replaces minDistance after each cell computation if we find a smaller value than that.
+                if (canTerminateEarly && matrix[currRow][j] < minDistance) {
+                    minDistance = matrix[currRow][j];
+                }
+
                 ssPos += UTF8StringUtil.charSize(rightBytes, ssPos);
+            }
+
+            // If the minimum distance value is greater than the given threshold, no reason to process next row.
+            if (canTerminateEarly && minDistance > edThresh) {
+                    return -1;
             }
             fsPos += UTF8StringUtil.charSize(leftBytes, fsPos);
             int tmp = currRow;
@@ -218,8 +313,7 @@
         }
 
         // initialize letter count filtering
-        Arrays.fill(fsLcCount, 0);
-        Arrays.fill(ssLcCount, 0);
+        Arrays.fill(letterCounts, 0);
 
         // compute letter counts for first string
         int fsPos = fsStart + fsMetaLen;
@@ -227,7 +321,7 @@
         while (fsPos < fsEnd) {
             char c = Character.toLowerCase(UTF8StringUtil.charAt(bytesLeft, fsPos));
             if (c < 128) {
-                fsLcCount[c]++;
+                letterCounts[c]++;
             }
             fsPos += UTF8StringUtil.charSize(bytesLeft, fsPos);
         }
@@ -238,30 +332,30 @@
         while (ssPos < ssEnd) {
             char c = Character.toLowerCase(UTF8StringUtil.charAt(bytesRight, ssPos));
             if (c < 128) {
-                ssLcCount[c]++;
+                letterCounts[c]--;
             }
             ssPos += UTF8StringUtil.charSize(bytesRight, ssPos);
         }
 
         // apply filter
-        int gtSum = 0;
-        int ltSum = 0;
+        int secondTofirstDiffSum = 0;
+        int firstToSecondDiffSum = 0;
         for (int i = 0; i < 128; i++) {
-            if (fsLcCount[i] > ssLcCount[i]) {
-                gtSum += fsLcCount[i] - ssLcCount[i];
-                if (gtSum > edThresh) {
+            if (letterCounts[i] >= 0) {
+                secondTofirstDiffSum += letterCounts[i];
+                if (secondTofirstDiffSum > edThresh) {
                     return -1;
                 }
             } else {
-                ltSum += ssLcCount[i] - fsLcCount[i];
-                if (ltSum > edThresh) {
+                firstToSecondDiffSum += Math.abs(letterCounts[i]);
+                if (firstToSecondDiffSum > edThresh) {
                     return -1;
                 }
             }
         }
 
-        int ed = UTF8StringEditDistance(bytesLeft, fsStart, bytesRight, ssStart);
-        if (ed > edThresh) {
+        int ed = getActualUTF8StringEditDistanceVal(bytesLeft, fsStart, bytesRight, ssStart, edThresh);
+        if (ed > edThresh || ed < 0) {
             return -1;
         } else {
             return ed;
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
index f4162c7..cafc7fb 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
@@ -44,24 +44,10 @@
         return ((float) setX.size()) / (tokensX.length + tokensY.length - setX.size());
     }
 
-    // @Override
-    // public float getSimilarity(DataBag tokensX, DataBag tokensY) {
-    // return getSimilarity(tokensX, (int) tokensX.size(), tokensY,
-    // (int) tokensY.size());
-    // }
-
-    // @Override
-    // public float getSimilarity(DataBag tokensX, int lengthX, DataBag tokensY,
-    // int lengthY) {
-    // int intersectionSize = SimilarityMetric.getIntersectSize(tokensX,
-    // tokensY);
-    // int totalSize = lengthX + lengthY;
-    //
-    // return (float) intersectionSize / (totalSize - intersectionSize);
-    // }
-
+    // SimThresh value will be ignored for this method since it doesn't provide an early termination.
     @Override
-    public float getSimilarity(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException {
+    public float getActualSimilarityVal(IListIterator tokensX, IListIterator tokensY, float simThresh)
+            throws HyracksDataException {
         int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
         int totalSize = tokensX.size() + tokensY.size();
 
@@ -81,7 +67,7 @@
             return -1f;
         }
 
-        float jacc = getSimilarity(firstList, secondList);
+        float jacc = getActualSimilarityVal(firstList, secondList, simThresh);
         if (jacc < simThresh) {
             return -1f;
         } else {
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
index fee34b9..3dd3516 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
@@ -21,6 +21,8 @@
 import java.io.IOException;
 
 import org.apache.asterix.builders.OrderedListBuilder;
+import org.apache.asterix.common.exceptions.ErrorCode;
+import org.apache.asterix.common.exceptions.RuntimeDataException;
 import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
 import org.apache.asterix.om.base.ABoolean;
 import org.apache.asterix.om.functions.BuiltinFunctions;
@@ -77,6 +79,10 @@
         try {
             edThresh = ATypeHierarchy.getIntegerValue(BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(), 2,
                     argPtrThreshold.getByteArray(), argPtrThreshold.getStartOffset());
+            if (edThresh < 0) {
+                throw new RuntimeDataException(ErrorCode.NEGATIVE_VALUE, BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(),
+                        3, edThresh);
+            }
             editDistance = computeResult(argPtr1, argPtr2, firstTypeTag);
             writeResult(editDistance);
         } catch (IOException e) {
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
index c9d3731..92f8df3 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
@@ -105,13 +105,13 @@
 
         switch (argType) {
             case STRING: {
-                return ed.UTF8StringEditDistance(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
-                        rightStartOffset + typeIndicatorSize);
+                return ed.getActualUTF8StringEditDistanceVal(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
+                        rightStartOffset + typeIndicatorSize, -1);
             }
             case ORDEREDLIST: {
                 firstOrdListIter.reset(leftBytes, leftStartOffset);
                 secondOrdListIter.reset(rightBytes, rightStartOffset);
-                return (int) ed.getSimilarity(firstOrdListIter, secondOrdListIter);
+                return (int) ed.getActualSimilarityVal(firstOrdListIter, secondOrdListIter, -1);
             }
             default: {
                 throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE, 0, argType.serialize(),
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
index d40cb67..3a60295 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
@@ -35,6 +35,6 @@
 
     @Override
     protected float computeResult() throws HyracksDataException {
-        return jaccard.getSimilarity(firstListIter, secondListIter);
+        return jaccard.getActualSimilarityVal(firstListIter, secondListIter, -1.0f);
     }
 }

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 1
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Hello Jenkins,

I'd like you to reexamine a change.  Please visit

    https://asterix-gerrit.ics.uci.edu/1481

to look at the new patch set (#7).

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................

ASTERIXDB-1778: Optimize the edit-distance-check function

 - Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
 - Terminate the calculation steps early when it become obvious that
   the possible edit-distance value is greater than the given threshold.
   There is no reason to compute all cells in the 2 dimensional array.
 - Move the location of IListIterator to Hyracks since we now have
   a CharacterIterator in a String. Change the name to ISequenceIterator.
 - Add the section for the function in the manual.
 - Remove letter counting filtering method since it is only applicable for
   the string in ASCII range (0 ~ 127).

Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/pom.xml
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
A asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
R hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
A hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
14 files changed, 360 insertions(+), 281 deletions(-)


  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/7
-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: newpatchset
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 7
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Hello Jenkins,

I'd like you to reexamine a change.  Please visit

    https://asterix-gerrit.ics.uci.edu/1481

to look at the new patch set (#8).

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................

ASTERIXDB-1778: Optimize the edit-distance-check function

 - Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
 - Terminate the calculation steps early when it become obvious that
   the possible edit-distance value is greater than the given threshold.
   There is no reason to compute all cells in the 2 dimensional array.
 - Move the location of IListIterator to Hyracks since we now have
   a CharacterIterator in a String. Change the name to ISequenceIterator.
 - Add the section for the function in the manual.
 - Remove letter counting filtering method since it is only applicable for
   the string in ASCII range (0 ~ 127).

Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/pom.xml
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
A asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
R hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
A hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
14 files changed, 355 insertions(+), 281 deletions(-)


  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/8
-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: newpatchset
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 8
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6:

BAD Compatibility Tests Started https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/456/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6: Integration-Tests+1

Integration Tests Successful

https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1788/ : SUCCESS

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 8: BAD+1

BAD Compatibility Tests Successful

https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/472/ : SUCCESS

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 8
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 5:

BAD Compatibility Tests Started https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/448/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6: BAD-1

BAD Compatibility Tests Failed

https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/449/ : FAILURE

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Taewoo Kim has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 5:

(1 comment)

Addressed Chen's comments.

https://asterix-gerrit.ics.uci.edu/#/c/1481/5/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
File asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java:

PS5, Line 109: -1
> Add a comment to the function to explain the purpose of "-1".
Done


-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: Yes

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 3: Integration-Tests+1

Integration Tests Successful

https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1773/ : SUCCESS

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 3
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Taewoo Kim has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6:

@Jianfeng: I now see what you mean. Since the main function is a private function, yes, I will add a unit test case since it is not exposed to the public interface. Makes sense.

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................


Patch Set 1:

Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1770/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 1
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jianfeng Jia (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jianfeng Jia has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6:

Looks good. But I still feel there should be some simple *JUnit* test for the edit distance, not the AQL ones. 
The AQL (or SQL++) tests are too far away and usually is very difficult to hit the corner cases.

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Hello Jenkins,

I'd like you to reexamine a change.  Please visit

    https://asterix-gerrit.ics.uci.edu/1481

to look at the new patch set (#4).

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................

ASTERIXDB-1778: Optimize the edit-distance-check function

 - Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
 - Terminate the calculation steps early when it become obvious that
   the possible edit-distance value is greater than the given threshold.
   There is no reason to compute all cells in the 2 dimensional array.
 - Move the location of IListIterator to Hyracks since we now have
   a CharacterIterator in a String. Change the name to ISequenceIterator.
 - Add the section for the function in the manual.

Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/pom.xml
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
R hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
A hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
12 files changed, 255 insertions(+), 226 deletions(-)


  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/4
-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: newpatchset
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Taewoo Kim has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6:

For your comments, edit-distance-check_strings test case already contains that corner case. The latter two queries will do the early termination. I just checked it using "println".  

let $a := "Nalini Venkatasubramanian"
let $b := "Nalini Wekatasupramanian"
let $results :=
[
  edit-distance-check($a, $b, 3),
  edit-distance-check($b, $a, 3),
  edit-distance-check($a, $b, 2),
  edit-distance-check($b, $a, 2)
]
for $i in $results
return $i

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6:

Build Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-notopic/4155/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 5: BAD-1

BAD Compatibility Tests Failed

https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/448/ : FAILURE

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................


Patch Set 2:

WARNING: THIS CHANGE CONTAINS CROSS-PRODUCT CHANGES IN:
* asterixdb
* hyracks-fullstack

PLEASE REVIEW CAREFULLY AND LOOK FOR API CHANGES!

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 2
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................


Patch Set 3:

Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1773/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 3
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 4:

BAD Compatibility Tests Started https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/442/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Hello Jenkins,

I'd like you to reexamine a change.  Please visit

    https://asterix-gerrit.ics.uci.edu/1481

to look at the new patch set (#6).

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................

ASTERIXDB-1778: Optimize the edit-distance-check function

 - Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
 - Terminate the calculation steps early when it become obvious that
   the possible edit-distance value is greater than the given threshold.
   There is no reason to compute all cells in the 2 dimensional array.
 - Move the location of IListIterator to Hyracks since we now have
   a CharacterIterator in a String. Change the name to ISequenceIterator.
 - Add the section for the function in the manual.
 - Remove letter counting filtering method since it is only applicable for
   the string in ASCII range (0 ~ 127).

Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/pom.xml
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
R hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
A hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
13 files changed, 283 insertions(+), 281 deletions(-)


  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/6
-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: newpatchset
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jianfeng Jia (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jianfeng Jia has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 4:

(10 comments)

Just some minor comments.

https://asterix-gerrit.ics.uci.edu/#/c/1481/4/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
File asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java:

PS4, Line 26:  
use javadoc syntax?


PS4, Line 29: float
this function doesn't has to be exposed.


PS4, Line 32: returns
use javadoc?


https://asterix-gerrit.ics.uci.edu/#/c/1481/3/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
File asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java:

PS3, Line 63: public
is it necessary to has an `public interface` ? 
I think it can just be a private function of this class.


Line 70:         boolean canTerminateEarly = edThresh >= 0 ? true : false;
*boolean canTerminateEarly = edThresh >= 0* is enough.

and if edThresh > min(flLen, slLen) should also be false?


PS3, Line 131: 1
can you define a static variable and give `-1` a good name?


PS3, Line 144: Gets
do we really need this comments ? :-)


PS3, Line 157: -
it worth explain the meaning of -1


PS3, Line 168: public
is it necessary to be a public method?


PS3, Line 219: public
public -> private?


-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: Yes

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 8:

WARNING: THIS CHANGE CONTAINS CROSS-PRODUCT CHANGES IN:
* asterixdb
* hyracks-fullstack

PLEASE REVIEW CAREFULLY AND LOOK FOR API CHANGES!

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 8
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 5:

Build Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-notopic/4154/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 5:

Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1784/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function

Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Hello Jenkins,

I'd like you to reexamine a change.  Please visit

    https://asterix-gerrit.ics.uci.edu/1481

to look at the new patch set (#3).

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................

ASTERIXDB-1778: optimize the edit-distance-check function

 - Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
 - Terminate the calculation stpes early when it become obvious that
   the possible edit-distance value is greater than the given threshold.
   There is no reason to computes all cells in the 2*2 array.
 - Move the location of IListIterator to Hyracks since we now have
   a CharacterIterator in a String.

Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/pom.xml
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
R hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
A hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
12 files changed, 250 insertions(+), 170 deletions(-)


  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/3
-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: newpatchset
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 3
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>

Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................


Patch Set 3:

WARNING: THIS CHANGE CONTAINS CROSS-PRODUCT CHANGES IN:
* asterixdb
* hyracks-fullstack

PLEASE REVIEW CAREFULLY AND LOOK FOR API CHANGES!

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 3
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 4:

WARNING: THIS CHANGE CONTAINS CROSS-PRODUCT CHANGES IN:
* asterixdb
* hyracks-fullstack

PLEASE REVIEW CAREFULLY AND LOOK FOR API CHANGES!

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Anon. E. Moose (Code Review)" <do...@asterixdb.incubator.apache.org>.
Anon. E. Moose #1000151 has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 5:

(1 comment)

https://asterix-gerrit.ics.uci.edu/#/c/1481/5/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
File asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java:

PS5, Line 109: -1
Add a comment to the function to explain the purpose of "-1".


-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: Yes

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 4: BAD-1

BAD Compatibility Tests Failed

https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/442/ : FAILURE

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Anon. E. Moose (Code Review)" <do...@asterixdb.incubator.apache.org>.
Anon. E. Moose #1000151 has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 4:

(5 comments)

First set of comments

https://asterix-gerrit.ics.uci.edu/#/c/1481/4/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
File asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java:

PS4, Line 29: float
> this function doesn't has to be exposed.
Is it better to rename "get" to "compute" since "get" seems to suggest it's a "getter"?


https://asterix-gerrit.ics.uci.edu/#/c/1481/4/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
File asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java:

PS4, Line 49: lists
"lists" -> "sequences" to be consistent with the parameter type?


PS4, Line 51: entire cells
"entire cells" -> "all the cells in the row"


PS4, Line 53: less than
"less than" -> "within"?


Line 99:             if (canTerminateEarly) {
Where is this "canTerminateEarly" decided?  I couldn't find it.


-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: Yes

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6: Integration-Tests+1

Integration Tests Successful

https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1798/ : SUCCESS

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 8:

BAD Compatibility Tests Started https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/472/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 8
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6: Integration-Tests-1

Integration Tests Timed Out

https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1791/ : ABORTED

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 4: Integration-Tests-1

Integration Tests Failed

https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1775/ : UNSTABLE

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................


Patch Set 1: Integration-Tests+1

Integration Tests Successful

https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1770/ : SUCCESS

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 1
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................


Patch Set 1:

BAD Compatibility Tests Started https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/437/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 1
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6:

Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1788/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Hello Jenkins,

I'd like you to reexamine a change.  Please visit

    https://asterix-gerrit.ics.uci.edu/1481

to look at the new patch set (#5).

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................

ASTERIXDB-1778: Optimize the edit-distance-check function

 - Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
 - Terminate the calculation steps early when it become obvious that
   the possible edit-distance value is greater than the given threshold.
   There is no reason to compute all cells in the 2 dimensional array.
 - Move the location of IListIterator to Hyracks since we now have
   a CharacterIterator in a String. Change the name to ISequenceIterator.
 - Add the section for the function in the manual.

Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/pom.xml
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
R hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
A hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
11 files changed, 291 insertions(+), 239 deletions(-)


  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/5
-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: newpatchset
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>

Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................


Patch Set 1:

Build Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-notopic/4137/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 1
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 4:

Build Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-notopic/4147/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Taewoo Kim has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 3:

(11 comments)

@Jianfeng: Thanks!

https://asterix-gerrit.ics.uci.edu/#/c/1481/4/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
File asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java:

PS4, Line 26:  
> use javadoc syntax?
Done


PS4, Line 29: float
> this function doesn't has to be exposed.
Done


PS4, Line 32: returns
> use javadoc?
Done


https://asterix-gerrit.ics.uci.edu/#/c/1481/2/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
File asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java:

Line 28:     public static int getIntersectSize(ISequenceIterator tokensX, ISequenceIterator tokensY)
> MAJOR SonarQube violation:
Done


https://asterix-gerrit.ics.uci.edu/#/c/1481/3/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
File asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java:

PS3, Line 63: public
> is it necessary to has an `public interface` ? 
Agreed and done.


Line 70:         boolean canTerminateEarly = edThresh >= 0 ? true : false;
> *boolean canTerminateEarly = edThresh >= 0* is enough.
The caller that is calling this function already checks your if condition.Since we change this to a private function, I think it's OK not to add the if condition.


PS3, Line 131: 1
> can you define a static variable and give `-1` a good name?
Done


PS3, Line 144: Gets
> do we really need this comments ? :-)
Done


PS3, Line 157: -
> it worth explain the meaning of -1
Done


PS3, Line 168: public
> is it necessary to be a public method?
Yes. It is being called from the outside of this class.


PS3, Line 219: public
> public -> private?
It is being called from the outside.


-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 3
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: Yes

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 4:

Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1775/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6:

Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1785/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................


Patch Set 3: BAD-1

BAD Compatibility Tests Failed

https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/440/ : FAILURE

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 3
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function

Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Taewoo Kim has posted comments on this change.

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................


Patch Set 2:

Added Steven because of BAD failure.

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 2
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Taewoo Kim has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 5:

@Jianfeng: the early termination logic is in place. We have test cases for them, too. In fact, the current test cases already cover them. (e.g., edit-distance-check_strings)

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6:

WARNING: THIS CHANGE CONTAINS CROSS-PRODUCT CHANGES IN:
* asterixdb
* hyracks-fullstack

PLEASE REVIEW CAREFULLY AND LOOK FOR API CHANGES!

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................


Patch Set 3:

Build Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-notopic/4144/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 3
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6:

Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1798/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................


Patch Set 3:

BAD Compatibility Tests Started https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/440/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 3
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 7:

WARNING: THIS CHANGE CONTAINS CROSS-PRODUCT CHANGES IN:
* asterixdb
* hyracks-fullstack

PLEASE REVIEW CAREFULLY AND LOOK FOR API CHANGES!

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 7
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6:

Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1791/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6: Integration-Tests-1

Integration Tests Failed

https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1785/ : UNSTABLE

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 5:

WARNING: THIS CHANGE CONTAINS CROSS-PRODUCT CHANGES IN:
* asterixdb
* hyracks-fullstack

PLEASE REVIEW CAREFULLY AND LOOK FOR API CHANGES!

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jianfeng Jia (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jianfeng Jia has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 4:

Oh, one more thing, do you have some test cases that actually trigger the early termination logic? I think that is necessary.

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 8:

Build Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-notopic/4180/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 8
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 8:

Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1803/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 8
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6:

BAD Compatibility Tests Started https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/449/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jianfeng Jia (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jianfeng Jia has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 8: Code-Review+2

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 8
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 7:

Build Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-notopic/4179/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 7
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function

Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Hello Jenkins,

I'd like you to reexamine a change.  Please visit

    https://asterix-gerrit.ics.uci.edu/1481

to look at the new patch set (#2).

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................

ASTERIXDB-1778: optimize the edit-distance-check function

 - Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
 - Terminate the calculation stpes early when it become obvious that
   the possible edit-distance value is greater than the given threshold.
   There is no reason to computes all cells in the 2*2 array.

Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/pom.xml
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
R hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
A hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
12 files changed, 249 insertions(+), 170 deletions(-)


  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/2
-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: newpatchset
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 2
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 6: BAD+1

BAD Compatibility Tests Successful

https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/456/ : SUCCESS

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................


Patch Set 2:

Build Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-notopic/4143/

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 2
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


Patch Set 8: Integration-Tests+1

Integration Tests Successful

https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1803/ : SUCCESS

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 8
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function

Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.

Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................


Patch Set 1: BAD-1

BAD Compatibility Tests Failed

https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/437/ : FAILURE

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 1
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-HasComments: No

Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function

Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Taewoo Kim has submitted this change and it was merged.

Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................


ASTERIXDB-1778: Optimize the edit-distance-check function

 - Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
 - Terminate the calculation steps early when it become obvious that
   the possible edit-distance value is greater than the given threshold.
   There is no reason to compute all cells in the 2 dimensional array.
 - Move the location of IListIterator to Hyracks since we now have
   a CharacterIterator in a String. Change the name to ISequenceIterator.
 - Add the section for the function in the manual.
 - Remove letter counting filtering method since it is only applicable for
   the string in ASCII range (0 ~ 127).

Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Reviewed-on: https://asterix-gerrit.ics.uci.edu/1481
Sonar-Qube: Jenkins <je...@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
BAD: Jenkins <je...@fulliautomatix.ics.uci.edu>
Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
Reviewed-by: Jianfeng Jia <ji...@gmail.com>
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/pom.xml
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
A asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
R hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
A hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
14 files changed, 355 insertions(+), 281 deletions(-)

Approvals:
  Jianfeng Jia: Looks good to me, approved
  Jenkins: Verified; No violations found; No violations found; Verified



diff --git a/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md b/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
index 89ef0f7..cb3318f 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
@@ -47,6 +47,36 @@
 
         2
 
+### edit_distance_check ###
+* Syntax:
+
+        edit_distance_check(expression1, expression2, threshold)
+
+* Checks whether the edit distance of `expression1` and `expression2` is within a given threshold.
+
+* Arguments:
+    * `expression1` : a `string` or a homogeneous `array` of a comparable item type.
+    * `expression2` : The same type as `expression1`.
+    * `threshold` : a `bigint` that represents the distance threshold.
+* Return Value:
+    * an `array` with two items:
+        * The first item contains a `boolean` value representing whether the edit distance of `expression1` and `expression2` is within the given threshold.
+        * The second item contains an `integer` that represents the edit distance of `expression1` and `expression2` if the first item is true.
+        * If the first item is false, then the second item is set to 2147483647.
+    * `missing` if any argument is a `missing` value,
+    * `null` if any argument is a `null` value but no argument is a `missing` value,
+    * a type error will be raised if:
+        * the first or second argument is any other non-string value,
+        * or, the third argument is any other non-bigint value.
+* Note: an [n_gram index](similarity.html#UsingIndexesToSupportSimilarityQueries) can be utilized for this function.
+* Example:
+
+        edit_distance_check("happy","hapr",2);
+
+
+* The expected result is:
+
+        [ true, 2 ]
 
 ### edit_distance_contains ###
 * Syntax:
diff --git a/asterixdb/asterix-fuzzyjoin/pom.xml b/asterixdb/asterix-fuzzyjoin/pom.xml
index 0539782..9485852 100644
--- a/asterixdb/asterix-fuzzyjoin/pom.xml
+++ b/asterixdb/asterix-fuzzyjoin/pom.xml
@@ -82,6 +82,10 @@
       <groupId>org.apache.hyracks</groupId>
       <artifactId>hyracks-util</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.hyracks</groupId>
+      <artifactId>hyracks-data-std</artifactId>
+    </dependency>
   </dependencies>
 
 </project>
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
index ac4a3dd..b213df2 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
@@ -20,13 +20,33 @@
 package org.apache.asterix.fuzzyjoin.similarity;
 
 import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
 
 public interface IGenericSimilarityMetric {
-    // returns similarity
-    public float getSimilarity(IListIterator firstList, IListIterator secondList) throws HyracksDataException;
+    /**
+     * Returns the similarity value for the given two lists.
+     *
+     * @param firstSequence
+     *            an instance of {@link org.apache.hyracks.data.std.util.ISequenceIterator}
+     * @param secondSequence
+     *            an instance of {@link org.apache.hyracks.data.std.util.ISequenceIterator}
+     * @return a float similarity value
+     * @throws HyracksDataException
+     */
+    public float computeSimilarity(ISequenceIterator firstSequence, ISequenceIterator secondSequence)
+            throws HyracksDataException;
 
-    // returns -1 if does not satisfy threshold
-    // else returns similarity
-    public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh)
+    /**
+     * Returns the similarity value for the given two lists. If the calculated similarity value
+     * doesn't satisfy the given simThresh value based on the function's check condition, this returns -1.
+     *
+     * @param firstSequence
+     *            an instance of {@link org.apache.hyracks.data.std.util.ISequenceIterator}
+     * @param secondSequence
+     *            an instance of {@link org.apache.hyracks.data.std.util.ISequenceIterator}
+     * @return a float similarity value.
+     * @throws HyracksDataException
+     */
+    public float computeSimilarity(ISequenceIterator firstSequence, ISequenceIterator secondSequence, float simThresh)
             throws HyracksDataException;
 }
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
index d36d60d..3348d4c 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
@@ -21,10 +21,12 @@
 
 import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
 import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
 
 public abstract class SimilarityMetric {
 
-    public static int getIntersectSize(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException {
+    public static int getIntersectSize(ISequenceIterator tokensX, ISequenceIterator tokensY)
+            throws HyracksDataException {
         int intersectSize = 0;
         while (tokensX.hasNext() && tokensY.hasNext()) {
             int cmp = tokensX.compare(tokensY);
@@ -64,23 +66,6 @@
     }
 
     public static int getIntersectSize(int[] tokensX, int startX, int[] tokensY, int startY) {
-        // int intersectSize = 0;
-        //
-        // while (startX < tokensX.length && startY < tokensY.length) {
-        // int tokenX = tokensX[startX];
-        // int tokenY = tokensY[startY];
-        // if (tokenX > tokenY) {
-        // startY++;
-        // } else if (tokenX < tokenY) {
-        // startX++;
-        // } else {
-        // intersectSize++;
-        // startX++;
-        // startY++;
-        // }
-        // }
-        //
-        // return intersectSize;
         return getIntersectSize(tokensX, startX, tokensX.length, tokensY, startY, tokensY.length);
     }
 
@@ -129,52 +114,6 @@
 
     public static PartialIntersect getPartialIntersectSize(int[] tokensX, int[] tokensY, int tokenStop) {
         return getPartialIntersectSize(tokensX, 0, tokensX.length, tokensY, 0, tokensY.length, tokenStop);
-    }
-
-    // @SuppressWarnings("unchecked")
-    // public static int getIntersectSize(DataBag tokensX, DataBag tokensY) {
-    // int intersectSize = 0;
-    //
-    // Iterator<Tuple> iteratorX = tokensX.iterator();
-    // Iterator<Tuple> iteratorY = tokensY.iterator();
-    //
-    // Tuple nextX = null;
-    // Tuple nextY = null;
-    //
-    // while ((nextX != null || iteratorX.hasNext())
-    // && (nextY != null || iteratorY.hasNext())) {
-    // if (nextX == null) {
-    // nextX = iteratorX.next();
-    // }
-    // if (nextY == null) {
-    // nextY = iteratorY.next();
-    // }
-    //
-    // int cmp = nextX.compareTo(nextY);
-    // if (cmp > 0) {
-    // nextY = null;
-    // } else if (cmp < 0) {
-    // nextX = null;
-    // } else {
-    // intersectSize++;
-    // nextX = null;
-    // nextY = null;
-    // }
-    // }
-    //
-    // return intersectSize;
-    // }
-
-    // public abstract float getSimilarity(DataBag tokensX, DataBag tokensY);
-
-    // public abstract float getSimilarity(DataBag tokensX, int lengthX,
-    // DataBag tokensY, int lengthY);
-
-    public float getSimilarity(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException {
-        int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
-        int totalSize = tokensX.size() + tokensY.size();
-
-        return (float) intersectionSize / (totalSize - intersectionSize);
     }
 
     public abstract float getSimilarity(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY);
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
index 9dce89e..8003767 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
@@ -19,39 +19,59 @@
 
 package org.apache.asterix.fuzzyjoin.similarity;
 
-import java.util.Arrays;
-
 import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
+import org.apache.hyracks.data.std.util.UTF8StringCharByCharIterator;
 import org.apache.hyracks.util.string.UTF8StringUtil;
 
 public class SimilarityMetricEditDistance implements IGenericSimilarityMetric {
 
-    // dp implementation only needs 2 rows
+    // This Dynamic Programming implementation only needs 2 rows.
     private final int rows = 2;
     private int cols;
     private int[][] matrix;
 
-    // for letter count filtering
-    private final int[] fsLcCount = new int[128];
-    private final int[] ssLcCount = new int[128];
+    // for string edit-distance calculation
+    private final UTF8StringCharByCharIterator leftIt = new UTF8StringCharByCharIterator();
+    private final UTF8StringCharByCharIterator rightIt = new UTF8StringCharByCharIterator();
+
+    public static final int SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE = -1;
 
     public SimilarityMetricEditDistance() {
         cols = 100; // arbitrary default value
         matrix = new int[rows][cols];
     }
 
-    @Override
-    public float getSimilarity(IListIterator firstList, IListIterator secondList) throws HyracksDataException {
-        int flLen = firstList.size();
-        int slLen = secondList.size();
+    /**
+     * Gets the edit distance value for the given two sequences using a Dynamic Programming approach.
+     * If a positive simThresh value is provided, this method only calculates 2 * (simThresh + 1) cells per row,
+     * not all the cells in a row as an optimization. Refer to https://en.wikipedia.org/wiki/Wagner\u2013Fischer_algorithm
+     * for more details. Also, as one more optimization, during the calculation steps, if this method finds out
+     * that the final edit distance value cannot be within simThresh, this method stops the calculation
+     * and immediately returns -1.
+     * If the final edit distance value is less than or equal to simThresh, then that value will be returned.
+     * If a non-positive simThresh is given, then it calculates all cells and rows and returns
+     * the final edit distance value.
+     *
+     * @return the edit distance of the two lists. -1 if a positive simThresh value is given and the edit distance
+     *         value is greater than the given simThresh.
+     */
+    private float computeActualSimilarity(ISequenceIterator firstSequence, ISequenceIterator secondSequence,
+            float simThresh) throws HyracksDataException {
+        int flLen = firstSequence.size();
+        int slLen = secondSequence.size();
 
-        // reuse existing matrix if possible
+        // When a positive threshold is given, then we can apply two optimizations.
+        int edThresh = (int) simThresh;
+        boolean canTerminateEarly = edThresh >= 0;
+
+        // Reuses the existing matrix if possible.
         if (slLen >= cols) {
             cols = slLen + 1;
             matrix = new int[rows][cols];
         }
 
-        // init matrix
+        // Inits the matrix.
         for (int i = 0; i <= slLen; i++) {
             matrix[0][i] = i;
         }
@@ -59,20 +79,54 @@
         int currRow = 1;
         int prevRow = 0;
 
-        // expand dynamic programming matrix row by row
+        int from = 1;
+        int to = slLen;
+        int minDistance = -1;
+
+        // Expands the dynamic programming matrix row by row.
         for (int i = 1; i <= flLen; i++) {
             matrix[currRow][0] = i;
 
-            secondList.reset();
-            for (int j = 1; j <= slLen; j++) {
+            secondSequence.reset();
 
-                matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
-                        matrix[prevRow][j - 1] + (firstList.compare(secondList) == 0 ? 0 : 1));
-
-                secondList.next();
+            // Only calculates 2 * (simThresh + 1) cells per row as an optimization.
+            // Also keeps minDistance to see whether the possible edit distance after
+            // each row calculation is greater than the simThresh.
+            if (canTerminateEarly) {
+                minDistance = edThresh + 1;
+                from = Math.max(i - edThresh - 1, 1);
+                to = Math.min(i + edThresh + 1, slLen);
+                for (int j = 1; j < from; j++) {
+                    // Moves the pointer of the second list to the point where the calculation starts for this row.
+                    secondSequence.next();
+                }
+                if (from > 1) {
+                    // Sets the left Boundary cell value to make sure that the calculation is correct.
+                    matrix[currRow][from - 1] = edThresh + 1;
+                }
+                if (to < slLen) {
+                    // Sets the right Boundary cell value to make sure that the calculation is correct.
+                    matrix[currRow][to + 1] = edThresh + 1;
+                }
             }
 
-            firstList.next();
+            for (int j = from; j <= to; j++) {
+
+                matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
+                        matrix[prevRow][j - 1] + (firstSequence.compare(secondSequence) == 0 ? 0 : 1));
+
+                // Replaces minDistance after each cell computation if we find a smaller value than that.
+                if (canTerminateEarly && matrix[currRow][j] < minDistance) {
+                    minDistance = matrix[currRow][j];
+                }
+
+                secondSequence.next();
+            }
+            // If the minimum distance value is greater than the given threshold, no reason to process next row.
+            if (canTerminateEarly && minDistance > edThresh) {
+                return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
+            }
+            firstSequence.next();
 
             int tmp = currRow;
             currRow = prevRow;
@@ -82,8 +136,12 @@
         return matrix[prevRow][slLen];
     }
 
+    /**
+     * Gets the similarity value for the given two sequences. If the value doesn't satisfy the given simThresh,
+     * this method returns -1. Else, this returns the real similarity value.
+     */
     @Override
-    public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh)
+    public float computeSimilarity(ISequenceIterator firstList, ISequenceIterator secondList, float simThresh)
             throws HyracksDataException {
 
         int edThresh = (int) simThresh;
@@ -93,18 +151,18 @@
 
         // length filter
         if (Math.abs(flLen - slLen) > edThresh) {
-            return -1;
+            return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
         }
 
-        float ed = getSimilarity(firstList, secondList);
-        if (ed > edThresh) {
-            return -1;
+        float ed = computeActualSimilarity(firstList, secondList, simThresh);
+        if (ed > edThresh || ed < 0) {
+            return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
         } else {
             return ed;
         }
     }
 
-    public int getSimilarityContains(IListIterator exprList, IListIterator patternList, int simThresh)
+    public int getSimilarityContains(ISequenceIterator exprList, ISequenceIterator patternList, int simThresh)
             throws HyracksDataException {
         int exprLen = exprList.size();
         int patternLen = patternList.size();
@@ -148,182 +206,50 @@
         }
 
         if (minEd > simThresh) {
-            return -1;
+            return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
         } else {
             return minEd;
         }
     }
 
     // faster implementation for common case of string edit distance
-    public int UTF8StringEditDistance(byte[] leftBytes, int fsStart, byte[] rightBytes, int ssStart) {
-        int fsLen = UTF8StringUtil.getStringLength(leftBytes, fsStart);
-        int ssLen = UTF8StringUtil.getStringLength(rightBytes, ssStart);
-
-        int fsUtfLen = UTF8StringUtil.getUTFLength(leftBytes, fsStart);
-        int ssUtfLen = UTF8StringUtil.getUTFLength(rightBytes, ssStart);
-        int fsMetaLen = UTF8StringUtil.getNumBytesToStoreLength(fsUtfLen);
-        int ssMetaLen = UTF8StringUtil.getNumBytesToStoreLength(ssUtfLen);
-
-        // reuse existing matrix if possible
-        if (ssLen >= cols) {
-            cols = ssLen + 1;
-            matrix = new int[rows][cols];
-        }
-
-        int fsDataStart = fsStart + fsMetaLen;
-        int ssDataStart = ssStart + ssMetaLen;
-
-        // init matrix
-        for (int i = 0; i <= ssLen; i++) {
-            matrix[0][i] = i;
-        }
-
-        int currRow = 1;
-        int prevRow = 0;
-
-        // expand dynamic programming matrix row by row
-        int fsPos = fsDataStart;
-        for (int i = 1; i <= fsLen; i++) {
-            matrix[currRow][0] = i;
-            char fsChar = Character.toLowerCase(UTF8StringUtil.charAt(leftBytes, fsPos));
-            int ssPos = ssDataStart;
-            for (int j = 1; j <= ssLen; j++) {
-                char ssChar = Character.toLowerCase(UTF8StringUtil.charAt(rightBytes, ssPos));
-
-                matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
-                        matrix[prevRow][j - 1] + (fsChar == ssChar ? 0 : 1));
-
-                ssPos += UTF8StringUtil.charSize(rightBytes, ssPos);
-            }
-            fsPos += UTF8StringUtil.charSize(leftBytes, fsPos);
-            int tmp = currRow;
-            currRow = prevRow;
-            prevRow = tmp;
-        }
-        return matrix[prevRow][ssLen];
+    public int getActualUTF8StringEditDistanceVal(byte[] leftBytes, int fsStart, byte[] rightBytes, int ssStart,
+            int edThresh) throws HyracksDataException {
+        leftIt.reset(leftBytes, fsStart);
+        rightIt.reset(rightBytes, ssStart);
+        return (int) computeActualSimilarity(leftIt, rightIt, edThresh);
     }
 
-    public int UTF8StringEditDistance(byte[] bytesLeft, int fsStart, byte[] bytesRight, int ssStart, int edThresh) {
+    public int UTF8StringEditDistance(byte[] bytesLeft, int fsStart, byte[] bytesRight, int ssStart, int edThresh)
+            throws HyracksDataException {
         int fsStrLen = UTF8StringUtil.getStringLength(bytesLeft, fsStart);
         int ssStrLen = UTF8StringUtil.getStringLength(bytesRight, ssStart);
 
-        int fsUtfLen = UTF8StringUtil.getUTFLength(bytesLeft, fsStart);
-        int ssUtfLen = UTF8StringUtil.getUTFLength(bytesRight, ssStart);
-        int fsMetaLen = UTF8StringUtil.getNumBytesToStoreLength(fsUtfLen);
-        int ssMetaLen = UTF8StringUtil.getNumBytesToStoreLength(ssUtfLen);
-
         // length filter
         if (Math.abs(fsStrLen - ssStrLen) > edThresh) {
-            return -1;
+            return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
         }
 
-        // initialize letter count filtering
-        Arrays.fill(fsLcCount, 0);
-        Arrays.fill(ssLcCount, 0);
-
-        // compute letter counts for first string
-        int fsPos = fsStart + fsMetaLen;
-        int fsEnd = fsPos + fsUtfLen;;
-        while (fsPos < fsEnd) {
-            char c = Character.toLowerCase(UTF8StringUtil.charAt(bytesLeft, fsPos));
-            if (c < 128) {
-                fsLcCount[c]++;
-            }
-            fsPos += UTF8StringUtil.charSize(bytesLeft, fsPos);
-        }
-
-        // compute letter counts for second string
-        int ssPos = ssStart + ssMetaLen;
-        int ssEnd = ssPos + ssUtfLen;
-        while (ssPos < ssEnd) {
-            char c = Character.toLowerCase(UTF8StringUtil.charAt(bytesRight, ssPos));
-            if (c < 128) {
-                ssLcCount[c]++;
-            }
-            ssPos += UTF8StringUtil.charSize(bytesRight, ssPos);
-        }
-
-        // apply filter
-        int gtSum = 0;
-        int ltSum = 0;
-        for (int i = 0; i < 128; i++) {
-            if (fsLcCount[i] > ssLcCount[i]) {
-                gtSum += fsLcCount[i] - ssLcCount[i];
-                if (gtSum > edThresh) {
-                    return -1;
-                }
-            } else {
-                ltSum += ssLcCount[i] - fsLcCount[i];
-                if (ltSum > edThresh) {
-                    return -1;
-                }
-            }
-        }
-
-        int ed = UTF8StringEditDistance(bytesLeft, fsStart, bytesRight, ssStart);
-        if (ed > edThresh) {
-            return -1;
+        int ed = getActualUTF8StringEditDistanceVal(bytesLeft, fsStart, bytesRight, ssStart, edThresh);
+        if (ed > edThresh || ed < 0) {
+            return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
         } else {
             return ed;
         }
     }
 
     // checks whether the first string contains a similar string to the second string
-    public int UTF8StringEditDistanceContains(byte[] strBytes, int stringStart, byte[] pattenBytes, int patternStart,
-            int edThresh) {
+    public int UTF8StringEditDistanceContains(byte[] strBytes, int stringStart, byte[] patternBytes, int patternStart,
+            int edThresh) throws HyracksDataException {
+        leftIt.reset(strBytes, stringStart);
+        rightIt.reset(patternBytes, patternStart);
+        return getSimilarityContains(leftIt, rightIt, edThresh);
+    }
 
-        int stringLen = UTF8StringUtil.getStringLength(strBytes, stringStart);
-        int patternLen = UTF8StringUtil.getStringLength(pattenBytes, patternStart);
-
-        int stringUTFLen = UTF8StringUtil.getUTFLength(strBytes, stringStart);
-        int stringMetaLen = UTF8StringUtil.getNumBytesToStoreLength(stringUTFLen);
-
-        int patternUTFLen = UTF8StringUtil.getUTFLength(pattenBytes, patternStart);
-        int patternMetaLen = UTF8StringUtil.getNumBytesToStoreLength(patternUTFLen);
-
-        // reuse existing matrix if possible
-        if (patternLen >= cols) {
-            cols = patternLen + 1;
-            matrix = new int[rows][cols];
-        }
-
-        int stringDataStart = stringStart + stringMetaLen;
-        int patternDataStart = patternStart + patternMetaLen;
-
-        // init matrix
-        for (int i = 0; i <= patternLen; i++) {
-            matrix[0][i] = i;
-        }
-
-        int currRow = 1;
-        int prevRow = 0;
-        int minEd = Integer.MAX_VALUE;
-        // expand dynamic programming matrix row by row
-        int stringPos = stringDataStart;
-        for (int i = 1; i <= stringLen; i++) {
-            matrix[currRow][0] = 0;
-            char stringChar = Character.toLowerCase(UTF8StringUtil.charAt(strBytes, stringPos));
-
-            int patternPos = patternDataStart;
-            for (int j = 1; j <= patternLen; j++) {
-                char patternChar = Character.toLowerCase(UTF8StringUtil.charAt(pattenBytes, patternPos));
-                matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
-                        matrix[prevRow][j - 1] + (stringChar == patternChar ? 0 : 1));
-                patternPos += UTF8StringUtil.charSize(pattenBytes, patternPos);
-                if (j == patternLen && matrix[currRow][patternLen] < minEd) {
-                    minEd = matrix[currRow][patternLen];
-                }
-            }
-
-            stringPos += UTF8StringUtil.charSize(strBytes, stringPos);
-            int tmp = currRow;
-            currRow = prevRow;
-            prevRow = tmp;
-        }
-        if (minEd > edThresh) {
-            return -1;
-        } else {
-            return minEd;
-        }
+    @Override
+    public float computeSimilarity(ISequenceIterator firstSequence, ISequenceIterator secondSequence)
+            throws HyracksDataException {
+        // Passes -1 as the simThresh to calculate the edit distance without applying any calculation optimizations.
+        return computeActualSimilarity(firstSequence, secondSequence, -1);
     }
 }
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
index f4162c7..4a31b8b 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
@@ -24,6 +24,7 @@
 
 import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
 import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
 
 public class SimilarityMetricJaccard extends SimilarityMetric implements IGenericSimilarityMetric {
 
@@ -44,24 +45,8 @@
         return ((float) setX.size()) / (tokensX.length + tokensY.length - setX.size());
     }
 
-    // @Override
-    // public float getSimilarity(DataBag tokensX, DataBag tokensY) {
-    // return getSimilarity(tokensX, (int) tokensX.size(), tokensY,
-    // (int) tokensY.size());
-    // }
-
-    // @Override
-    // public float getSimilarity(DataBag tokensX, int lengthX, DataBag tokensY,
-    // int lengthY) {
-    // int intersectionSize = SimilarityMetric.getIntersectSize(tokensX,
-    // tokensY);
-    // int totalSize = lengthX + lengthY;
-    //
-    // return (float) intersectionSize / (totalSize - intersectionSize);
-    // }
-
     @Override
-    public float getSimilarity(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException {
+    public float computeSimilarity(ISequenceIterator tokensX, ISequenceIterator tokensY) throws HyracksDataException {
         int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
         int totalSize = tokensX.size() + tokensY.size();
 
@@ -69,7 +54,7 @@
     }
 
     @Override
-    public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh)
+    public float computeSimilarity(ISequenceIterator firstList, ISequenceIterator secondList, float simThresh)
             throws HyracksDataException {
 
         // apply length filter
@@ -81,7 +66,7 @@
             return -1f;
         }
 
-        float jacc = getSimilarity(firstList, secondList);
+        float jacc = computeSimilarity(firstList, secondList);
         if (jacc < simThresh) {
             return -1f;
         } else {
diff --git a/asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java b/asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java
new file mode 100644
index 0000000..caa80bc
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.asterix.fuzzyjoin.similarity;
+
+import static org.apache.hyracks.data.std.primitive.UTF8StringPointable.generateUTF8Pointable;
+import static org.junit.Assert.assertEquals;
+
+import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
+import org.junit.Test;
+
+public class SimilarityMetricEditDistanceTest {
+
+    private static final SimilarityMetricEditDistance ed = new SimilarityMetricEditDistance();
+
+    @Test
+    public void test() throws Exception {
+        // For this case, the edit-distance of two strings is 3.
+        UTF8StringPointable leftStrPointable1 = generateUTF8Pointable("coupon not available in store");
+        UTF8StringPointable rightStrPointable1 = generateUTF8Pointable("coupon is available in store");
+
+        // The edit-distance between leftStrPointable1 and the following is 14.
+        UTF8StringPointable rightStrPointable2 = generateUTF8Pointable("coupon in store");
+
+        byte[] leftBytes1 = leftStrPointable1.getByteArray();
+        int leftStartOffset1 = leftStrPointable1.getStartOffset();
+        byte[] rightBytes1 = rightStrPointable1.getByteArray();
+        int rightStartOffset1 = rightStrPointable1.getStartOffset();
+        byte[] rightBytes2 = rightStrPointable2.getByteArray();
+        int rightStartOffset2 = rightStrPointable2.getStartOffset();
+
+        // Case 1 - normal - no early termination
+        int edThresh = 3;
+        int edVal = ed.UTF8StringEditDistance(leftBytes1, leftStartOffset1, rightBytes1, rightStartOffset1, edThresh);
+        assertEquals(edThresh, edVal);
+
+        // Case 2 - the length difference between two strings is greater than edThresh.
+        // Even without calculating the distance, the method should return -1.
+        edVal = ed.UTF8StringEditDistance(leftBytes1, leftStartOffset1, rightBytes2, rightStartOffset2, edThresh);
+        assertEquals(SimilarityMetricEditDistance.SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE, edVal);
+
+        // Case 3 - the edit distance is 14, but the threshold is 1.
+        // The early termination should happen and the returned value should be -1.
+        edThresh = 1;
+        edVal = ed.UTF8StringEditDistance(leftBytes1, leftStartOffset1, rightBytes2, rightStartOffset2, edThresh);
+        assertEquals(SimilarityMetricEditDistance.SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE, edVal);
+
+        // Case 4 - the edit distance is 14, but the threshold is 13.
+        // The early termination will not happen. But, the resulting edit distance is greater than the given threshold.
+        // So, the final returned value should be -1.
+        edThresh = 13;
+        edVal = ed.UTF8StringEditDistance(leftBytes1, leftStartOffset1, rightBytes2, rightStartOffset2, edThresh);
+        assertEquals(SimilarityMetricEditDistance.SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE, edVal);
+    }
+
+}
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
index b929854..2435047 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
@@ -20,13 +20,13 @@
 
 import org.apache.asterix.common.exceptions.AsterixException;
 import org.apache.asterix.formats.nontagged.BinaryComparatorFactoryProvider;
-import org.apache.asterix.fuzzyjoin.similarity.IListIterator;
 import org.apache.asterix.om.types.ATypeTag;
 import org.apache.asterix.om.types.EnumDeserializer;
 import org.apache.hyracks.api.dataflow.value.IBinaryComparator;
 import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
 
-public abstract class AbstractAsterixListIterator implements IListIterator {
+public abstract class AbstractAsterixListIterator implements ISequenceIterator {
 
     protected byte[] data;
     protected int count = 0;
@@ -42,7 +42,7 @@
     protected final boolean ignoreCase = true;
 
     @Override
-    public int compare(IListIterator cmpIter) throws HyracksDataException {
+    public int compare(ISequenceIterator cmpIter) throws HyracksDataException {
         return cmp.compare(data, pos, -1, cmpIter.getData(), cmpIter.getPos(), -1);
     }
 
@@ -100,6 +100,7 @@
         }
     }
 
+    @Override
     public void reset(byte[] data, int startOff) throws HyracksDataException {
         this.data = data;
         this.startOff = startOff;
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
index fee34b9..63e9e44 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
@@ -21,6 +21,8 @@
 import java.io.IOException;
 
 import org.apache.asterix.builders.OrderedListBuilder;
+import org.apache.asterix.common.exceptions.ErrorCode;
+import org.apache.asterix.common.exceptions.RuntimeDataException;
 import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
 import org.apache.asterix.om.base.ABoolean;
 import org.apache.asterix.om.functions.BuiltinFunctions;
@@ -77,6 +79,10 @@
         try {
             edThresh = ATypeHierarchy.getIntegerValue(BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(), 2,
                     argPtrThreshold.getByteArray(), argPtrThreshold.getStartOffset());
+            if (edThresh < 0) {
+                throw new RuntimeDataException(ErrorCode.NEGATIVE_VALUE, BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(),
+                        3, edThresh);
+            }
             editDistance = computeResult(argPtr1, argPtr2, firstTypeTag);
             writeResult(editDistance);
         } catch (IOException e) {
@@ -101,7 +107,7 @@
             case ORDEREDLIST: {
                 firstOrdListIter.reset(leftBytes, leftStartOffset);
                 secondOrdListIter.reset(rightBytes, rightStartOffset);
-                return (int) ed.getSimilarity(firstOrdListIter, secondOrdListIter, edThresh);
+                return (int) ed.computeSimilarity(firstOrdListIter, secondOrdListIter, edThresh);
             }
 
             default: {
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
index c9d3731..dbe99b9 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
@@ -105,13 +105,15 @@
 
         switch (argType) {
             case STRING: {
-                return ed.UTF8StringEditDistance(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
-                        rightStartOffset + typeIndicatorSize);
+                // Passes -1 as the simThresh to calculate the edit distance
+                // without applying any calculation optimizations.
+                return ed.getActualUTF8StringEditDistanceVal(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
+                        rightStartOffset + typeIndicatorSize, -1);
             }
             case ORDEREDLIST: {
                 firstOrdListIter.reset(leftBytes, leftStartOffset);
                 secondOrdListIter.reset(rightBytes, rightStartOffset);
-                return (int) ed.getSimilarity(firstOrdListIter, secondOrdListIter);
+                return (int) ed.computeSimilarity(firstOrdListIter, secondOrdListIter);
             }
             default: {
                 throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE, 0, argType.serialize(),
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
index 19e9395..0decd9e 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
@@ -34,6 +34,6 @@
 
     @Override
     protected float computeResult() throws HyracksDataException {
-        return jaccard.getSimilarity(firstListIter, secondListIter, jaccThresh);
+        return jaccard.computeSimilarity(firstListIter, secondListIter, jaccThresh);
     }
 }
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
index d40cb67..1cd32c8 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
@@ -35,6 +35,6 @@
 
     @Override
     protected float computeResult() throws HyracksDataException {
-        return jaccard.getSimilarity(firstListIter, secondListIter);
+        return jaccard.computeSimilarity(firstListIter, secondListIter);
     }
 }
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IListIterator.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
similarity index 81%
rename from asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IListIterator.java
rename to hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
index 6c3d22e..9441453 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IListIterator.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
@@ -17,12 +17,12 @@
  * under the License.
  */
 
-package org.apache.asterix.fuzzyjoin.similarity;
+package org.apache.hyracks.data.std.util;
 
 import org.apache.hyracks.api.exceptions.HyracksDataException;
 
-public interface IListIterator {
-    public int compare(IListIterator cmpIter) throws HyracksDataException;
+public interface ISequenceIterator {
+    public int compare(ISequenceIterator cmpIter) throws HyracksDataException;
 
     public byte[] getData();
 
@@ -34,5 +34,7 @@
 
     public void reset() throws HyracksDataException;
 
+    public void reset(byte[] data, int startOff) throws HyracksDataException;
+
     public int size();
 }
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
new file mode 100644
index 0000000..237d291
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.data.std.util;
+
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.util.string.UTF8StringUtil;
+
+/**
+ * An iterator class for a String. This class iterates a char by char in the given String.
+ */
+public class UTF8StringCharByCharIterator implements ISequenceIterator {
+
+    protected byte[] data;
+    protected int pos = -1;
+    protected int length = -1;
+    protected int utfByteLength = -1;
+    protected int metaLength = -1;
+    protected int startOffset = -1;
+
+    @Override
+    public int compare(ISequenceIterator cmpIter) throws HyracksDataException {
+        char thisChar = Character.toLowerCase(UTF8StringUtil.charAt(data, pos));
+        char thatChar = Character.toLowerCase(UTF8StringUtil.charAt(cmpIter.getData(), cmpIter.getPos()));
+        if (thisChar == thatChar) {
+            return 0;
+        }
+        return -1;
+    }
+
+    @Override
+    public boolean hasNext() {
+        return pos < utfByteLength;
+    }
+
+    @Override
+    public int size() {
+        return length;
+    }
+
+    @Override
+    public byte[] getData() {
+        return data;
+    }
+
+    @Override
+    public int getPos() {
+        return pos;
+    }
+
+    @Override
+    public void next() throws HyracksDataException {
+        pos += UTF8StringUtil.charSize(data, pos);
+    }
+
+    @Override
+    public void reset() throws HyracksDataException {
+        pos = startOffset + metaLength;
+    }
+
+    @Override
+    public void reset(byte[] data, int startOff) throws HyracksDataException {
+        this.data = data;
+        this.startOffset = startOff;
+        this.length = UTF8StringUtil.getStringLength(data, startOffset);
+        this.utfByteLength = UTF8StringUtil.getUTFLength(data, startOffset);
+        this.metaLength = UTF8StringUtil.getNumBytesToStoreLength(utfByteLength);
+        reset();
+    }
+
+}

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 9
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>