You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@asterixdb.apache.org by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org> on 2017/02/02 23:56:50 UTC
Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Taewoo Kim has uploaded a new change for review.
https://asterix-gerrit.ics.uci.edu/1481
Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................
ASTERIXDB-1778: optimize the edit-distance-check function
- Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
- Terminate the calculation steps early when it becomes obvious that
the possible edit-distance value is greater than the given threshold.
There is no reason to compute all cells in the 2-dimensional array.
Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
8 files changed, 173 insertions(+), 117 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/1
diff --git a/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md b/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
index 89ef0f7..cb3318f 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
@@ -47,6 +47,36 @@
2
+### edit_distance_check ###
+* Syntax:
+
+ edit_distance_check(expression1, expression2, threshold)
+
+* Checks whether the edit distance of `expression1` and `expression2` is within a given threshold.
+
+* Arguments:
+ * `expression1` : a `string` or a homogeneous `array` of a comparable item type.
+ * `expression2` : The same type as `expression1`.
+ * `threshold` : a `bigint` that represents the distance threshold.
+* Return Value:
+ * an `array` with two items:
+ * The first item contains a `boolean` value representing whether the edit distance of `expression1` and `expression2` is within the given threshold.
+ * The second item contains an `integer` that represents the edit distance of `expression1` and `expression2` if the first item is true.
+ * If the first item is false, then the second item is set to 2147483647.
+ * `missing` if any argument is a `missing` value,
+ * `null` if any argument is a `null` value but no argument is a `missing` value,
+ * a type error will be raised if:
+ * the first or second argument is any other non-string and non-array value,
+ * or, the third argument is any other non-bigint value.
+* Note: an [n_gram index](similarity.html#UsingIndexesToSupportSimilarityQueries) can be utilized for this function.
+* Example:
+
+ edit_distance_check("happy","hapr",2);
+
+
+* The expected result is:
+
+ [ true, 2 ]
### edit_distance_contains ###
* Syntax:
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
index ac4a3dd..751597d 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
@@ -22,8 +22,11 @@
import org.apache.hyracks.api.exceptions.HyracksDataException;
public interface IGenericSimilarityMetric {
- // returns similarity
- public float getSimilarity(IListIterator firstList, IListIterator secondList) throws HyracksDataException;
+ // Returns -1 if this method supports early-termination and it becomes obvious that
+ // the possible similarity value can't satisfy the given simThresh value.
+ // Else returns the calculated similarity value.
+ public float getActualSimilarityVal(IListIterator firstList, IListIterator secondList, float simThresh)
+ throws HyracksDataException;
// returns -1 if does not satisfy threshold
// else returns similarity
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
index d36d60d..70029a3 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
@@ -64,23 +64,6 @@
}
public static int getIntersectSize(int[] tokensX, int startX, int[] tokensY, int startY) {
- // int intersectSize = 0;
- //
- // while (startX < tokensX.length && startY < tokensY.length) {
- // int tokenX = tokensX[startX];
- // int tokenY = tokensY[startY];
- // if (tokenX > tokenY) {
- // startY++;
- // } else if (tokenX < tokenY) {
- // startX++;
- // } else {
- // intersectSize++;
- // startX++;
- // startY++;
- // }
- // }
- //
- // return intersectSize;
return getIntersectSize(tokensX, startX, tokensX.length, tokensY, startY, tokensY.length);
}
@@ -129,52 +112,6 @@
public static PartialIntersect getPartialIntersectSize(int[] tokensX, int[] tokensY, int tokenStop) {
return getPartialIntersectSize(tokensX, 0, tokensX.length, tokensY, 0, tokensY.length, tokenStop);
- }
-
- // @SuppressWarnings("unchecked")
- // public static int getIntersectSize(DataBag tokensX, DataBag tokensY) {
- // int intersectSize = 0;
- //
- // Iterator<Tuple> iteratorX = tokensX.iterator();
- // Iterator<Tuple> iteratorY = tokensY.iterator();
- //
- // Tuple nextX = null;
- // Tuple nextY = null;
- //
- // while ((nextX != null || iteratorX.hasNext())
- // && (nextY != null || iteratorY.hasNext())) {
- // if (nextX == null) {
- // nextX = iteratorX.next();
- // }
- // if (nextY == null) {
- // nextY = iteratorY.next();
- // }
- //
- // int cmp = nextX.compareTo(nextY);
- // if (cmp > 0) {
- // nextY = null;
- // } else if (cmp < 0) {
- // nextX = null;
- // } else {
- // intersectSize++;
- // nextX = null;
- // nextY = null;
- // }
- // }
- //
- // return intersectSize;
- // }
-
- // public abstract float getSimilarity(DataBag tokensX, DataBag tokensY);
-
- // public abstract float getSimilarity(DataBag tokensX, int lengthX,
- // DataBag tokensY, int lengthY);
-
- public float getSimilarity(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException {
- int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
- int totalSize = tokensX.size() + tokensY.size();
-
- return (float) intersectionSize / (totalSize - intersectionSize);
}
public abstract float getSimilarity(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY);
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
index 9dce89e..ba0453a 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
@@ -26,32 +26,50 @@
public class SimilarityMetricEditDistance implements IGenericSimilarityMetric {
- // dp implementation only needs 2 rows
+ // This Dynamic Programming implementation only needs 2 rows.
private final int rows = 2;
private int cols;
private int[][] matrix;
- // for letter count filtering
- private final int[] fsLcCount = new int[128];
- private final int[] ssLcCount = new int[128];
+ // for ASCII letter count filtering
+ private final int[] letterCounts = new int[128];
public SimilarityMetricEditDistance() {
cols = 100; // arbitrary default value
matrix = new int[rows][cols];
}
+ /**
+ * Gets the edit distance value for the given two lists using a Dynamic Programming approach.
+ * If a non-negative simThresh value is provided, this method only calculates about 2 * (simThresh + 1) cells
+ * per row rather than every cell, as an optimization. Refer to https://en.wikipedia.org/wiki/Wagner\u2013Fischer_algorithm
+ * for more details. As one more optimization, if this method finds out during the calculation steps
+ * that the final edit distance value cannot be less than or equal to simThresh, it stops the calculation
+ * and immediately returns -1.
+ * If the final edit distance value is less than or equal to simThresh, then that value will be returned.
+ * If a negative simThresh is given, then it calculates all cells and rows and returns
+ * the final edit distance value.
+ *
+ * @return the edit distance of the two lists. -1 if a non-negative simThresh value is given and the calculation
+ * determines early that the edit distance value is greater than the given simThresh.
+ */
@Override
- public float getSimilarity(IListIterator firstList, IListIterator secondList) throws HyracksDataException {
+ public float getActualSimilarityVal(IListIterator firstList, IListIterator secondList, float simThresh)
+ throws HyracksDataException {
int flLen = firstList.size();
int slLen = secondList.size();
- // reuse existing matrix if possible
+ // When a non-negative threshold is given, we can apply two optimizations.
+ int edThresh = (int) simThresh;
+ boolean canTerminateEarly = edThresh >= 0 ? true : false;
+
+ // Reuses the existing matrix if possible.
if (slLen >= cols) {
cols = slLen + 1;
matrix = new int[rows][cols];
}
- // init matrix
+ // Inits the matrix.
for (int i = 0; i <= slLen; i++) {
matrix[0][i] = i;
}
@@ -59,19 +77,53 @@
int currRow = 1;
int prevRow = 0;
- // expand dynamic programming matrix row by row
+ int from = 1;
+ int to = slLen;
+ int minDistance = -1;
+
+ // Expands the dynamic programming matrix row by row.
for (int i = 1; i <= flLen; i++) {
matrix[currRow][0] = i;
secondList.reset();
- for (int j = 1; j <= slLen; j++) {
+
+ // Only calculates 2 * (simThresh + 1) cells per row as an optimization.
+ // Also keeps minDistance to see whether the possible edit distance after
+ // each row calculation is greater than the simThresh.
+ if (canTerminateEarly) {
+ minDistance = edThresh + 1;
+ from = Math.max(i - edThresh - 1, 1);
+ to = Math.min(i + edThresh + 1, slLen);
+ for (int j = 1; j < from; j++) {
+ // Moves the pointer of the second list to the point where the calculation starts for this row.
+ secondList.next();
+ }
+ if (from > 1) {
+ // Sets the left Boundary cell value to make sure that the calculation is correct.
+ matrix[currRow][from - 1] = edThresh + 1;
+ }
+ if (to < slLen) {
+ // Sets the right Boundary cell value to make sure that the calculation is correct.
+ matrix[currRow][to + 1] = edThresh + 1;
+ }
+ }
+
+ for (int j = from; j <= to; j++) {
matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
matrix[prevRow][j - 1] + (firstList.compare(secondList) == 0 ? 0 : 1));
+ // Replaces minDistance after each cell computation if we find a smaller value than that.
+ if (canTerminateEarly && matrix[currRow][j] < minDistance) {
+ minDistance = matrix[currRow][j];
+ }
+
secondList.next();
}
-
+ // If the minimum distance value is greater than the given threshold, no reason to process next row.
+ if (canTerminateEarly && minDistance > edThresh) {
+ return -1;
+ }
firstList.next();
int tmp = currRow;
@@ -82,6 +134,9 @@
return matrix[prevRow][slLen];
}
+ /**
+ * Gets the edit distance value for the given two lists.
+ */
@Override
public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh)
throws HyracksDataException {
@@ -96,8 +151,8 @@
return -1;
}
- float ed = getSimilarity(firstList, secondList);
- if (ed > edThresh) {
+ float ed = getActualSimilarityVal(firstList, secondList, simThresh);
+ if (ed > edThresh || ed < 0) {
return -1;
} else {
return ed;
@@ -155,7 +210,8 @@
}
// faster implementation for common case of string edit distance
- public int UTF8StringEditDistance(byte[] leftBytes, int fsStart, byte[] rightBytes, int ssStart) {
+ public int getActualUTF8StringEditDistanceVal(byte[] leftBytes, int fsStart, byte[] rightBytes, int ssStart,
+ int edThresh) {
int fsLen = UTF8StringUtil.getStringLength(leftBytes, fsStart);
int ssLen = UTF8StringUtil.getStringLength(rightBytes, ssStart);
@@ -164,7 +220,10 @@
int fsMetaLen = UTF8StringUtil.getNumBytesToStoreLength(fsUtfLen);
int ssMetaLen = UTF8StringUtil.getNumBytesToStoreLength(ssUtfLen);
- // reuse existing matrix if possible
+ // When a non-negative threshold is given, we can apply two optimizations.
+ boolean canTerminateEarly = edThresh >= 0 ? true : false;
+
+ // Reuses the existing matrix if possible.
if (ssLen >= cols) {
cols = ssLen + 1;
matrix = new int[rows][cols];
@@ -173,7 +232,7 @@
int fsDataStart = fsStart + fsMetaLen;
int ssDataStart = ssStart + ssMetaLen;
- // init matrix
+ // Inits the matrix
for (int i = 0; i <= ssLen; i++) {
matrix[0][i] = i;
}
@@ -181,19 +240,55 @@
int currRow = 1;
int prevRow = 0;
- // expand dynamic programming matrix row by row
+ int from = 1;
+ int to = ssLen;
+ int minDistance = -1;
+
+ // Expands the dynamic programming matrix row by row.
int fsPos = fsDataStart;
for (int i = 1; i <= fsLen; i++) {
matrix[currRow][0] = i;
char fsChar = Character.toLowerCase(UTF8StringUtil.charAt(leftBytes, fsPos));
int ssPos = ssDataStart;
- for (int j = 1; j <= ssLen; j++) {
+
+ // Only calculates 2 * (simThresh + 1) cells per row as an optimization.
+ // Also keeps minDistance to see whether the possible edit distance after
+ // each row calculation is greater than the simThresh.
+ if (canTerminateEarly) {
+ minDistance = edThresh + 1;
+ from = Math.max(i - edThresh - 1, 1);
+ to = Math.min(i + edThresh + 1, ssLen);
+ for (int j = 1; j < from; j++) {
+ // Moves the pointer of the second list to the point where the calculation starts for this row.
+ ssPos += UTF8StringUtil.charSize(rightBytes, ssPos);
+ }
+ if (from > 1) {
+ // Sets the left Boundary cell value to make sure that the calculation is correct.
+ matrix[currRow][from - 1] = edThresh + 1;
+ }
+ if (to < ssLen) {
+ // Sets the right Boundary cell value to make sure that the calculation is correct.
+ matrix[currRow][to + 1] = edThresh + 1;
+ }
+ }
+
+ for (int j = from; j <= to; j++) {
char ssChar = Character.toLowerCase(UTF8StringUtil.charAt(rightBytes, ssPos));
matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
matrix[prevRow][j - 1] + (fsChar == ssChar ? 0 : 1));
+ // Replaces minDistance after each cell computation if we find a smaller value than that.
+ if (canTerminateEarly && matrix[currRow][j] < minDistance) {
+ minDistance = matrix[currRow][j];
+ }
+
ssPos += UTF8StringUtil.charSize(rightBytes, ssPos);
+ }
+
+ // If the minimum distance value is greater than the given threshold, no reason to process next row.
+ if (canTerminateEarly && minDistance > edThresh) {
+ return -1;
}
fsPos += UTF8StringUtil.charSize(leftBytes, fsPos);
int tmp = currRow;
@@ -218,8 +313,7 @@
}
// initialize letter count filtering
- Arrays.fill(fsLcCount, 0);
- Arrays.fill(ssLcCount, 0);
+ Arrays.fill(letterCounts, 0);
// compute letter counts for first string
int fsPos = fsStart + fsMetaLen;
@@ -227,7 +321,7 @@
while (fsPos < fsEnd) {
char c = Character.toLowerCase(UTF8StringUtil.charAt(bytesLeft, fsPos));
if (c < 128) {
- fsLcCount[c]++;
+ letterCounts[c]++;
}
fsPos += UTF8StringUtil.charSize(bytesLeft, fsPos);
}
@@ -238,30 +332,30 @@
while (ssPos < ssEnd) {
char c = Character.toLowerCase(UTF8StringUtil.charAt(bytesRight, ssPos));
if (c < 128) {
- ssLcCount[c]++;
+ letterCounts[c]--;
}
ssPos += UTF8StringUtil.charSize(bytesRight, ssPos);
}
// apply filter
- int gtSum = 0;
- int ltSum = 0;
+ int secondTofirstDiffSum = 0;
+ int firstToSecondDiffSum = 0;
for (int i = 0; i < 128; i++) {
- if (fsLcCount[i] > ssLcCount[i]) {
- gtSum += fsLcCount[i] - ssLcCount[i];
- if (gtSum > edThresh) {
+ if (letterCounts[i] >= 0) {
+ secondTofirstDiffSum += letterCounts[i];
+ if (secondTofirstDiffSum > edThresh) {
return -1;
}
} else {
- ltSum += ssLcCount[i] - fsLcCount[i];
- if (ltSum > edThresh) {
+ firstToSecondDiffSum += Math.abs(letterCounts[i]);
+ if (firstToSecondDiffSum > edThresh) {
return -1;
}
}
}
- int ed = UTF8StringEditDistance(bytesLeft, fsStart, bytesRight, ssStart);
- if (ed > edThresh) {
+ int ed = getActualUTF8StringEditDistanceVal(bytesLeft, fsStart, bytesRight, ssStart, edThresh);
+ if (ed > edThresh || ed < 0) {
return -1;
} else {
return ed;
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
index f4162c7..cafc7fb 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
@@ -44,24 +44,10 @@
return ((float) setX.size()) / (tokensX.length + tokensY.length - setX.size());
}
- // @Override
- // public float getSimilarity(DataBag tokensX, DataBag tokensY) {
- // return getSimilarity(tokensX, (int) tokensX.size(), tokensY,
- // (int) tokensY.size());
- // }
-
- // @Override
- // public float getSimilarity(DataBag tokensX, int lengthX, DataBag tokensY,
- // int lengthY) {
- // int intersectionSize = SimilarityMetric.getIntersectSize(tokensX,
- // tokensY);
- // int totalSize = lengthX + lengthY;
- //
- // return (float) intersectionSize / (totalSize - intersectionSize);
- // }
-
+ // SimThresh value will be ignored for this method since it doesn't provide an early termination.
@Override
- public float getSimilarity(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException {
+ public float getActualSimilarityVal(IListIterator tokensX, IListIterator tokensY, float simThresh)
+ throws HyracksDataException {
int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
int totalSize = tokensX.size() + tokensY.size();
@@ -81,7 +67,7 @@
return -1f;
}
- float jacc = getSimilarity(firstList, secondList);
+ float jacc = getActualSimilarityVal(firstList, secondList, simThresh);
if (jacc < simThresh) {
return -1f;
} else {
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
index fee34b9..3dd3516 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
@@ -21,6 +21,8 @@
import java.io.IOException;
import org.apache.asterix.builders.OrderedListBuilder;
+import org.apache.asterix.common.exceptions.ErrorCode;
+import org.apache.asterix.common.exceptions.RuntimeDataException;
import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
import org.apache.asterix.om.base.ABoolean;
import org.apache.asterix.om.functions.BuiltinFunctions;
@@ -77,6 +79,10 @@
try {
edThresh = ATypeHierarchy.getIntegerValue(BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(), 2,
argPtrThreshold.getByteArray(), argPtrThreshold.getStartOffset());
+ if (edThresh < 0) {
+ throw new RuntimeDataException(ErrorCode.NEGATIVE_VALUE, BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(),
+ 3, edThresh);
+ }
editDistance = computeResult(argPtr1, argPtr2, firstTypeTag);
writeResult(editDistance);
} catch (IOException e) {
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
index c9d3731..92f8df3 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
@@ -105,13 +105,13 @@
switch (argType) {
case STRING: {
- return ed.UTF8StringEditDistance(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
- rightStartOffset + typeIndicatorSize);
+ return ed.getActualUTF8StringEditDistanceVal(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
+ rightStartOffset + typeIndicatorSize, -1);
}
case ORDEREDLIST: {
firstOrdListIter.reset(leftBytes, leftStartOffset);
secondOrdListIter.reset(rightBytes, rightStartOffset);
- return (int) ed.getSimilarity(firstOrdListIter, secondOrdListIter);
+ return (int) ed.getActualSimilarityVal(firstOrdListIter, secondOrdListIter, -1);
}
default: {
throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE, 0, argType.serialize(),
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
index d40cb67..3a60295 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
@@ -35,6 +35,6 @@
@Override
protected float computeResult() throws HyracksDataException {
- return jaccard.getSimilarity(firstListIter, secondListIter);
+ return jaccard.getActualSimilarityVal(firstListIter, secondListIter, -1.0f);
}
}
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 1
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Hello Jenkins,
I'd like you to reexamine a change. Please visit
https://asterix-gerrit.ics.uci.edu/1481
to look at the new patch set (#7).
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
ASTERIXDB-1778: Optimize the edit-distance-check function
- Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
- Terminate the calculation steps early when it becomes obvious that
the possible edit-distance value is greater than the given threshold.
There is no reason to compute all cells in the 2 dimensional array.
- Move the location of IListIterator to Hyracks since we now have
a CharacterIterator in a String. Change the name to ISequenceIterator.
- Add the section for the function in the manual.
- Remove the letter-count filtering method since it is only applicable to
strings in the ASCII range (0 ~ 127).
Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/pom.xml
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
A asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
R hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
A hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
14 files changed, 360 insertions(+), 281 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/7
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: newpatchset
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 7
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Hello Jenkins,
I'd like you to reexamine a change. Please visit
https://asterix-gerrit.ics.uci.edu/1481
to look at the new patch set (#8).
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
ASTERIXDB-1778: Optimize the edit-distance-check function
- Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
- Terminate the calculation steps early when it becomes obvious that
the possible edit-distance value is greater than the given threshold.
There is no reason to compute all cells in the 2 dimensional array.
- Move the location of IListIterator to Hyracks since we now have
a CharacterIterator in a String. Change the name to ISequenceIterator.
- Add the section for the function in the manual.
- Remove the letter-count filtering method since it is only applicable to
strings in the ASCII range (0 ~ 127).
Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/pom.xml
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
A asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
R hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
A hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
14 files changed, 355 insertions(+), 281 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/8
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: newpatchset
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 8
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6:
BAD Compatibility Tests Started https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/456/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6: Integration-Tests+1
Integration Tests Successful
https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1788/ : SUCCESS
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 8: BAD+1
BAD Compatibility Tests Successful
https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/472/ : SUCCESS
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 8
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 5:
BAD Compatibility Tests Started https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/448/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6: BAD-1
BAD Compatibility Tests Failed
https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/449/ : FAILURE
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Taewoo Kim has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 5:
(1 comment)
Addressed Chen's comments.
https://asterix-gerrit.ics.uci.edu/#/c/1481/5/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
File asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java:
PS5, Line 109: -1
> Add a comment to the function to explain the purpose of "-1".
Done
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: Yes
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 3: Integration-Tests+1
Integration Tests Successful
https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1773/ : SUCCESS
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 3
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Taewoo Kim has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6:
@Jianfeng: I now see what you mean. Since the main function is a private function, yes, I will add a unit test case since it is not exposed to the public interface. Makes sense.
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................
Patch Set 1:
Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1770/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 1
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jianfeng Jia (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jianfeng Jia has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6:
Looks good. But I still feel there should be some simple *JUnit* test for the edit distance, not the AQL ones.
The AQL (or SQL++) tests are too far removed from the code, and it is usually very difficult to hit the corner cases with them.
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Hello Jenkins,
I'd like you to reexamine a change. Please visit
https://asterix-gerrit.ics.uci.edu/1481
to look at the new patch set (#4).
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
ASTERIXDB-1778: Optimize the edit-distance-check function
- Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
- Terminate the calculation steps early when it becomes obvious that
the possible edit-distance value is greater than the given threshold.
There is no reason to compute all cells in the 2 dimensional array.
- Move the location of IListIterator to Hyracks since we now have
a CharacterIterator in a String. Change the name to ISequenceIterator.
- Add the section for the function in the manual.
Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/pom.xml
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
R hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
A hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
12 files changed, 255 insertions(+), 226 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/4
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: newpatchset
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Taewoo Kim has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6:
Regarding your comments, the edit-distance-check_strings test case already contains that corner case. The latter two queries will do the early termination. I just checked it using "println".
let $a := "Nalini Venkatasubramanian"
let $b := "Nalini Wekatasupramanian"
let $results :=
[
edit-distance-check($a, $b, 3),
edit-distance-check($b, $a, 3),
edit-distance-check($a, $b, 2),
edit-distance-check($b, $a, 2)
]
for $i in $results
return $i
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6:
Build Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-notopic/4155/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 5: BAD-1
BAD Compatibility Tests Failed
https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/448/ : FAILURE
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................
Patch Set 2:
WARNING: THIS CHANGE CONTAINS CROSS-PRODUCT CHANGES IN:
* asterixdb
* hyracks-fullstack
PLEASE REVIEW CAREFULLY AND LOOK FOR API CHANGES!
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 2
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................
Patch Set 3:
Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1773/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 3
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 4:
BAD Compatibility Tests Started https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/442/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Hello Jenkins,
I'd like you to reexamine a change. Please visit
https://asterix-gerrit.ics.uci.edu/1481
to look at the new patch set (#6).
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
ASTERIXDB-1778: Optimize the edit-distance-check function
- Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
- Terminate the calculation steps early when it becomes obvious that
the possible edit-distance value is greater than the given threshold.
There is no reason to compute all cells in the 2 dimensional array.
- Move the location of IListIterator to Hyracks since we now have
a CharacterIterator in a String. Change the name to ISequenceIterator.
- Add the section for the function in the manual.
- Remove the letter-counting filtering method since it is only applicable to
strings in the ASCII range (0 ~ 127).
Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/pom.xml
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
R hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
A hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
13 files changed, 283 insertions(+), 281 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/6
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: newpatchset
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jianfeng Jia (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jianfeng Jia has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 4:
(10 comments)
Just some minor comments.
https://asterix-gerrit.ics.uci.edu/#/c/1481/4/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
File asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java:
PS4, Line 26:
use javadoc syntax?
PS4, Line 29: float
this function doesn't have to be exposed.
PS4, Line 32: returns
use javadoc?
https://asterix-gerrit.ics.uci.edu/#/c/1481/3/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
File asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java:
PS3, Line 63: public
is it necessary to have a `public interface`?
I think it can just be a private function of this class.
Line 70: boolean canTerminateEarly = edThresh >= 0 ? true : false;
*boolean canTerminateEarly = edThresh >= 0* is enough.
and if edThresh > min(flLen, slLen), should it also be false?
PS3, Line 131: 1
can you define a static variable and give `-1` a good name?
PS3, Line 144: Gets
do we really need this comment? :-)
PS3, Line 157: -
it is worth explaining the meaning of -1
PS3, Line 168: public
is it necessary to be a public method?
PS3, Line 219: public
public -> private?
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: Yes
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 8:
WARNING: THIS CHANGE CONTAINS CROSS-PRODUCT CHANGES IN:
* asterixdb
* hyracks-fullstack
PLEASE REVIEW CAREFULLY AND LOOK FOR API CHANGES!
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 8
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 5:
Build Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-notopic/4154/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 5:
Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1784/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Hello Jenkins,
I'd like you to reexamine a change. Please visit
https://asterix-gerrit.ics.uci.edu/1481
to look at the new patch set (#3).
Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................
ASTERIXDB-1778: optimize the edit-distance-check function
- Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
- Terminate the calculation steps early when it becomes obvious that
the possible edit-distance value is greater than the given threshold.
There is no reason to compute all cells in the 2 dimensional array.
- Move the location of IListIterator to Hyracks since we now have
a CharacterIterator in a String.
Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/pom.xml
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
R hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
A hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
12 files changed, 250 insertions(+), 170 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/3
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: newpatchset
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 3
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................
Patch Set 3:
WARNING: THIS CHANGE CONTAINS CROSS-PRODUCT CHANGES IN:
* asterixdb
* hyracks-fullstack
PLEASE REVIEW CAREFULLY AND LOOK FOR API CHANGES!
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 3
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 4:
WARNING: THIS CHANGE CONTAINS CROSS-PRODUCT CHANGES IN:
* asterixdb
* hyracks-fullstack
PLEASE REVIEW CAREFULLY AND LOOK FOR API CHANGES!
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Anon. E. Moose (Code Review)" <do...@asterixdb.incubator.apache.org>.
Anon. E. Moose #1000151 has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 5:
(1 comment)
https://asterix-gerrit.ics.uci.edu/#/c/1481/5/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
File asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java:
PS5, Line 109: -1
Add a comment to the function to explain the purpose of "-1".
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: Yes
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 4: BAD-1
BAD Compatibility Tests Failed
https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/442/ : FAILURE
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Anon. E. Moose (Code Review)" <do...@asterixdb.incubator.apache.org>.
Anon. E. Moose #1000151 has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 4:
(5 comments)
First set of comments
https://asterix-gerrit.ics.uci.edu/#/c/1481/4/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
File asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java:
PS4, Line 29: float
> this function doesn't have to be exposed.
Is it better to rename "get" to "compute" since "get" seems to suggest it's a "getter"?
https://asterix-gerrit.ics.uci.edu/#/c/1481/4/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
File asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java:
PS4, Line 49: lists
"lists" -> "sequences" to be consistent with the parameter type?
PS4, Line 51: entire cells
"entire cells" -> "all the cells in the row"
PS4, Line 53: less than
"less than" -> "within"?
Line 99: if (canTerminateEarly) {
Where is this "canTerminateEarly" decided? I couldn't find it.
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: Yes
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6: Integration-Tests+1
Integration Tests Successful
https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1798/ : SUCCESS
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 8:
BAD Compatibility Tests Started https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/472/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 8
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6: Integration-Tests-1
Integration Tests Timed Out
https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1791/ : ABORTED
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 4: Integration-Tests-1
Integration Tests Failed
https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1775/ : UNSTABLE
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................
Patch Set 1: Integration-Tests+1
Integration Tests Successful
https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1770/ : SUCCESS
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 1
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................
Patch Set 1:
BAD Compatibility Tests Started https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/437/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 1
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6:
Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1788/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Hello Jenkins,
I'd like you to reexamine a change. Please visit
https://asterix-gerrit.ics.uci.edu/1481
to look at the new patch set (#5).
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
ASTERIXDB-1778: Optimize the edit-distance-check function
- Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
- Terminate the calculation steps early when it becomes obvious that
the possible edit-distance value is greater than the given threshold.
There is no reason to compute all cells in the two-dimensional array.
- Move the location of IListIterator to Hyracks since we now have
a CharacterIterator in a String. Change the name to ISequenceIterator.
- Add the section for the function in the manual.
Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/pom.xml
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
R hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
A hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
11 files changed, 291 insertions(+), 239 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/5
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: newpatchset
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................
Patch Set 1:
Build Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-notopic/4137/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 1
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 4:
Build Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-notopic/4147/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Taewoo Kim has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 3:
(11 comments)
@Jianfeng: Thanks!
https://asterix-gerrit.ics.uci.edu/#/c/1481/4/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
File asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java:
PS4, Line 26:
> use javadoc syntax?
Done
PS4, Line 29: float
> this function doesn't have to be exposed.
Done
PS4, Line 32: returns
> use javadoc?
Done
https://asterix-gerrit.ics.uci.edu/#/c/1481/2/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
File asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java:
Line 28: public static int getIntersectSize(ISequenceIterator tokensX, ISequenceIterator tokensY)
> MAJOR SonarQube violation:
Done
https://asterix-gerrit.ics.uci.edu/#/c/1481/3/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
File asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java:
PS3, Line 63: public
> is it necessary to have a `public interface`?
Agreed and done.
Line 70: boolean canTerminateEarly = edThresh >= 0 ? true : false;
> *boolean canTerminateEarly = edThresh >= 0* is enough.
The caller that is calling this function already checks your if condition. Since we changed this to a private function, I think it's OK not to add the if condition.
PS3, Line 131: 1
> can you define a static variable and give `-1` a good name?
Done
PS3, Line 144: Gets
> do we really need this comments ? :-)
Done
PS3, Line 157: -
> it worth explain the meaning of -1
Done
PS3, Line 168: public
> is it necessary to be a public method?
Yes. It is being called from the outside of this class.
PS3, Line 219: public
> public -> private?
It is being called from the outside.
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 3
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: Yes
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 4:
Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1775/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6:
Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1785/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................
Patch Set 3: BAD-1
BAD Compatibility Tests Failed
https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/440/ : FAILURE
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 3
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Taewoo Kim has posted comments on this change.
Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................
Patch Set 2:
Added Steven because of BAD failure.
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 2
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Taewoo Kim has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 5:
@Jianfeng: the early termination logic is in place. We have test cases for them, too. In fact, the current test cases already cover them. (e.g., edit-distance-check_strings)
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6:
WARNING: THIS CHANGE CONTAINS CROSS-PRODUCT CHANGES IN:
* asterixdb
* hyracks-fullstack
PLEASE REVIEW CAREFULLY AND LOOK FOR API CHANGES!
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................
Patch Set 3:
Build Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-notopic/4144/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 3
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6:
Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1798/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................
Patch Set 3:
BAD Compatibility Tests Started https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/440/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 3
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 7:
WARNING: THIS CHANGE CONTAINS CROSS-PRODUCT CHANGES IN:
* asterixdb
* hyracks-fullstack
PLEASE REVIEW CAREFULLY AND LOOK FOR API CHANGES!
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 7
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6:
Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1791/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6: Integration-Tests-1
Integration Tests Failed
https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1785/ : UNSTABLE
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 5:
WARNING: THIS CHANGE CONTAINS CROSS-PRODUCT CHANGES IN:
* asterixdb
* hyracks-fullstack
PLEASE REVIEW CAREFULLY AND LOOK FOR API CHANGES!
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 5
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jianfeng Jia (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jianfeng Jia has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 4:
Oh, one more thing, do you have some test cases that actually trigger the early termination logic? I think that is necessary.
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 4
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 8:
Build Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-notopic/4180/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 8
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 8:
Integration Tests Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1803/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 8
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6:
BAD Compatibility Tests Started https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/449/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Anon. E. Moose #1000151
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jianfeng Jia (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jianfeng Jia has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 8: Code-Review+2
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 8
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 7:
Build Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-notopic/4179/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 7
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Hello Jenkins,
I'd like you to reexamine a change. Please visit
https://asterix-gerrit.ics.uci.edu/1481
to look at the new patch set (#2).
Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................
ASTERIXDB-1778: optimize the edit-distance-check function
- Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
- Terminate the calculation steps early when it becomes obvious that
the possible edit-distance value is greater than the given threshold.
There is no reason to compute all cells in the two-dimensional array.
Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/pom.xml
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
R hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
A hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
12 files changed, 249 insertions(+), 170 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/81/1481/2
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: newpatchset
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 2
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 6: BAD+1
BAD Compatibility Tests Successful
https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/456/ : SUCCESS
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 6
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................
Patch Set 2:
Build Started https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-notopic/4143/
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 2
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
Patch Set 8: Integration-Tests+1
Integration Tests Successful
https://asterix-jenkins.ics.uci.edu/job/asterix-gerrit-integration-tests/1803/ : SUCCESS
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 8
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: optimize the edit-distance-check function
Posted by "Jenkins (Code Review)" <do...@asterixdb.incubator.apache.org>.
Jenkins has posted comments on this change.
Change subject: ASTERIXDB-1778: optimize the edit-distance-check function
......................................................................
Patch Set 1: BAD-1
BAD Compatibility Tests Failed
https://asterix-jenkins.ics.uci.edu/job/asterixbad-compat/437/ : FAILURE
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: comment
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 1
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-HasComments: No
Change in asterixdb[master]: ASTERIXDB-1778: Optimize the edit-distance-check function
Posted by "Taewoo Kim (Code Review)" <do...@asterixdb.incubator.apache.org>.
Taewoo Kim has submitted this change and it was merged.
Change subject: ASTERIXDB-1778: Optimize the edit-distance-check function
......................................................................
ASTERIXDB-1778: Optimize the edit-distance-check function
- Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
- Terminate the calculation steps early when it becomes obvious that
the possible edit-distance value is greater than the given threshold.
There is no reason to compute all cells in the two-dimensional array.
- Move the location of IListIterator to Hyracks since we now have
a CharacterIterator in a String. Change the name to ISequenceIterator.
- Add the section for the function in the manual.
- Remove the letter-count filtering method since it is only applicable to
strings in the ASCII range (0 ~ 127).
Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Reviewed-on: https://asterix-gerrit.ics.uci.edu/1481
Sonar-Qube: Jenkins <je...@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
BAD: Jenkins <je...@fulliautomatix.ics.uci.edu>
Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
Reviewed-by: Jianfeng Jia <ji...@gmail.com>
---
M asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
M asterixdb/asterix-fuzzyjoin/pom.xml
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
M asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
A asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
M asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
R hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
A hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
14 files changed, 355 insertions(+), 281 deletions(-)
Approvals:
Jianfeng Jia: Looks good to me, approved
Jenkins: Verified; No violations found; No violations found; Verified
diff --git a/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md b/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
index 89ef0f7..cb3318f 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
@@ -47,6 +47,36 @@
2
+### edit_distance_check ###
+* Syntax:
+
+ edit_distance_check(expression1, expression2, threshold)
+
+* Checks whether the edit distance of `expression1` and `expression2` is within a given threshold.
+
+* Arguments:
+ * `expression1` : a `string` or a homogeneous `array` of a comparable item type.
+ * `expression2` : The same type as `expression1`.
+ * `threshold` : a `bigint` that represents the distance threshold.
+* Return Value:
+ * an `array` with two items:
+ * The first item contains a `boolean` value representing whether the edit distance of `expression1` and `expression2` is within the given threshold.
+ * The second item contains an `integer` that represents the edit distance of `expression1` and `expression2` if the first item is true.
+ * If the first item is false, then the second item is set to 2147483647.
+ * `missing` if any argument is a `missing` value,
+ * `null` if any argument is a `null` value but no argument is a `missing` value,
+ * a type error will be raised if:
+ * the first or second argument is any other non-string value,
+ * or, the third argument is any other non-bigint value.
+* Note: an [n_gram index](similarity.html#UsingIndexesToSupportSimilarityQueries) can be utilized for this function.
+* Example:
+
+ edit_distance_check("happy","hapr",2);
+
+
+* The expected result is:
+
+ [ true, 2 ]
### edit_distance_contains ###
* Syntax:
diff --git a/asterixdb/asterix-fuzzyjoin/pom.xml b/asterixdb/asterix-fuzzyjoin/pom.xml
index 0539782..9485852 100644
--- a/asterixdb/asterix-fuzzyjoin/pom.xml
+++ b/asterixdb/asterix-fuzzyjoin/pom.xml
@@ -82,6 +82,10 @@
<groupId>org.apache.hyracks</groupId>
<artifactId>hyracks-util</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.apache.hyracks</groupId>
+ <artifactId>hyracks-data-std</artifactId>
+ </dependency>
</dependencies>
</project>
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
index ac4a3dd..b213df2 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
@@ -20,13 +20,33 @@
package org.apache.asterix.fuzzyjoin.similarity;
import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
public interface IGenericSimilarityMetric {
- // returns similarity
- public float getSimilarity(IListIterator firstList, IListIterator secondList) throws HyracksDataException;
+ /**
+ * Returns the similarity value for the given two lists.
+ *
+ * @param firstSequence
+ * an instance of {@link org.apache.hyracks.data.std.util.ISequenceIterator}
+ * @param secondSequence
+ * an instance of {@link org.apache.hyracks.data.std.util.ISequenceIterator}
+ * @return a float similarity value
+ * @throws HyracksDataException
+ */
+ public float computeSimilarity(ISequenceIterator firstSequence, ISequenceIterator secondSequence)
+ throws HyracksDataException;
- // returns -1 if does not satisfy threshold
- // else returns similarity
- public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh)
+ /**
+ * Returns the similarity value for the given two lists. If the calculated similarity value
+ * doesn't satisfy the given simThresh value based on the function's check condition, this returns -1.
+ *
+ * @param firstSequence
+ * an instance of {@link org.apache.hyracks.data.std.util.ISequenceIterator}
+ * @param secondSequence
+ * an instance of {@link org.apache.hyracks.data.std.util.ISequenceIterator}
+ * @return a float similarity value.
+ * @throws HyracksDataException
+ */
+ public float computeSimilarity(ISequenceIterator firstSequence, ISequenceIterator secondSequence, float simThresh)
throws HyracksDataException;
}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
index d36d60d..3348d4c 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
@@ -21,10 +21,12 @@
import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
public abstract class SimilarityMetric {
- public static int getIntersectSize(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException {
+ public static int getIntersectSize(ISequenceIterator tokensX, ISequenceIterator tokensY)
+ throws HyracksDataException {
int intersectSize = 0;
while (tokensX.hasNext() && tokensY.hasNext()) {
int cmp = tokensX.compare(tokensY);
@@ -64,23 +66,6 @@
}
public static int getIntersectSize(int[] tokensX, int startX, int[] tokensY, int startY) {
- // int intersectSize = 0;
- //
- // while (startX < tokensX.length && startY < tokensY.length) {
- // int tokenX = tokensX[startX];
- // int tokenY = tokensY[startY];
- // if (tokenX > tokenY) {
- // startY++;
- // } else if (tokenX < tokenY) {
- // startX++;
- // } else {
- // intersectSize++;
- // startX++;
- // startY++;
- // }
- // }
- //
- // return intersectSize;
return getIntersectSize(tokensX, startX, tokensX.length, tokensY, startY, tokensY.length);
}
@@ -129,52 +114,6 @@
public static PartialIntersect getPartialIntersectSize(int[] tokensX, int[] tokensY, int tokenStop) {
return getPartialIntersectSize(tokensX, 0, tokensX.length, tokensY, 0, tokensY.length, tokenStop);
- }
-
- // @SuppressWarnings("unchecked")
- // public static int getIntersectSize(DataBag tokensX, DataBag tokensY) {
- // int intersectSize = 0;
- //
- // Iterator<Tuple> iteratorX = tokensX.iterator();
- // Iterator<Tuple> iteratorY = tokensY.iterator();
- //
- // Tuple nextX = null;
- // Tuple nextY = null;
- //
- // while ((nextX != null || iteratorX.hasNext())
- // && (nextY != null || iteratorY.hasNext())) {
- // if (nextX == null) {
- // nextX = iteratorX.next();
- // }
- // if (nextY == null) {
- // nextY = iteratorY.next();
- // }
- //
- // int cmp = nextX.compareTo(nextY);
- // if (cmp > 0) {
- // nextY = null;
- // } else if (cmp < 0) {
- // nextX = null;
- // } else {
- // intersectSize++;
- // nextX = null;
- // nextY = null;
- // }
- // }
- //
- // return intersectSize;
- // }
-
- // public abstract float getSimilarity(DataBag tokensX, DataBag tokensY);
-
- // public abstract float getSimilarity(DataBag tokensX, int lengthX,
- // DataBag tokensY, int lengthY);
-
- public float getSimilarity(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException {
- int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
- int totalSize = tokensX.size() + tokensY.size();
-
- return (float) intersectionSize / (totalSize - intersectionSize);
}
public abstract float getSimilarity(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY);
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
index 9dce89e..8003767 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
@@ -19,39 +19,59 @@
package org.apache.asterix.fuzzyjoin.similarity;
-import java.util.Arrays;
-
import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
+import org.apache.hyracks.data.std.util.UTF8StringCharByCharIterator;
import org.apache.hyracks.util.string.UTF8StringUtil;
public class SimilarityMetricEditDistance implements IGenericSimilarityMetric {
- // dp implementation only needs 2 rows
+ // This Dynamic Programming implementation only needs 2 rows.
private final int rows = 2;
private int cols;
private int[][] matrix;
- // for letter count filtering
- private final int[] fsLcCount = new int[128];
- private final int[] ssLcCount = new int[128];
+ // for string edit-distance calculation
+ private final UTF8StringCharByCharIterator leftIt = new UTF8StringCharByCharIterator();
+ private final UTF8StringCharByCharIterator rightIt = new UTF8StringCharByCharIterator();
+
+ public static final int SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE = -1;
public SimilarityMetricEditDistance() {
cols = 100; // arbitrary default value
matrix = new int[rows][cols];
}
- @Override
- public float getSimilarity(IListIterator firstList, IListIterator secondList) throws HyracksDataException {
- int flLen = firstList.size();
- int slLen = secondList.size();
+ /**
+ * Gets the edit distance value for the given two sequences using a Dynamic Programming approach.
+ * If a positive simThresh value is provided, this method only calculates 2 * (simThresh + 1) cells per row,
+ * not all the cells in a row as an optimization. Refer to https://en.wikipedia.org/wiki/Wagner\u2013Fischer_algorithm
+ * for more details. Also, as one more optimization, during the calculation steps, if this method finds out
+ * that the final edit distance value cannot be within simThresh, this method stops the calculation
+ * and immediately returns -1.
+ * If the final edit distance value is less than or equal to simThresh, then that value will be returned.
+ * If a non-positive simThresh is given, then it calculates all cells and rows and returns
+ * the final edit distance value.
+ *
+ * @return the edit distance of the two lists. -1 if a positive simThresh value is given and the edit distance
+ * value is greater than the given simThresh.
+ */
+ private float computeActualSimilarity(ISequenceIterator firstSequence, ISequenceIterator secondSequence,
+ float simThresh) throws HyracksDataException {
+ int flLen = firstSequence.size();
+ int slLen = secondSequence.size();
- // reuse existing matrix if possible
+ // When a positive threshold is given, then we can apply two optimizations.
+ int edThresh = (int) simThresh;
+ boolean canTerminateEarly = edThresh >= 0;
+
+ // Reuses the existing matrix if possible.
if (slLen >= cols) {
cols = slLen + 1;
matrix = new int[rows][cols];
}
- // init matrix
+ // Inits the matrix.
for (int i = 0; i <= slLen; i++) {
matrix[0][i] = i;
}
@@ -59,20 +79,54 @@
int currRow = 1;
int prevRow = 0;
- // expand dynamic programming matrix row by row
+ int from = 1;
+ int to = slLen;
+ int minDistance = -1;
+
+ // Expands the dynamic programming matrix row by row.
for (int i = 1; i <= flLen; i++) {
matrix[currRow][0] = i;
- secondList.reset();
- for (int j = 1; j <= slLen; j++) {
+ secondSequence.reset();
- matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
- matrix[prevRow][j - 1] + (firstList.compare(secondList) == 0 ? 0 : 1));
-
- secondList.next();
+ // Only calculates 2 * (simThresh + 1) cells per row as an optimization.
+ // Also keeps minDistance to see whether the possible edit distance after
+ // each row calculation is greater than the simThresh.
+ if (canTerminateEarly) {
+ minDistance = edThresh + 1;
+ from = Math.max(i - edThresh - 1, 1);
+ to = Math.min(i + edThresh + 1, slLen);
+ for (int j = 1; j < from; j++) {
+ // Moves the pointer of the second list to the point where the calculation starts for this row.
+ secondSequence.next();
+ }
+ if (from > 1) {
+ // Sets the left Boundary cell value to make sure that the calculation is correct.
+ matrix[currRow][from - 1] = edThresh + 1;
+ }
+ if (to < slLen) {
+ // Sets the right Boundary cell value to make sure that the calculation is correct.
+ matrix[currRow][to + 1] = edThresh + 1;
+ }
}
- firstList.next();
+ for (int j = from; j <= to; j++) {
+
+ matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
+ matrix[prevRow][j - 1] + (firstSequence.compare(secondSequence) == 0 ? 0 : 1));
+
+ // Replaces minDistance after each cell computation if we find a smaller value than that.
+ if (canTerminateEarly && matrix[currRow][j] < minDistance) {
+ minDistance = matrix[currRow][j];
+ }
+
+ secondSequence.next();
+ }
+ // If the minimum distance value is greater than the given threshold, no reason to process next row.
+ if (canTerminateEarly && minDistance > edThresh) {
+ return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
+ }
+ firstSequence.next();
int tmp = currRow;
currRow = prevRow;
@@ -82,8 +136,12 @@
return matrix[prevRow][slLen];
}
+ /**
+ * Gets the similarity value for the given two sequences. If the value doesn't satisfy the given simThresh,
+ * this method returns -1. Else, this returns the real similarity value.
+ */
@Override
- public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh)
+ public float computeSimilarity(ISequenceIterator firstList, ISequenceIterator secondList, float simThresh)
throws HyracksDataException {
int edThresh = (int) simThresh;
@@ -93,18 +151,18 @@
// length filter
if (Math.abs(flLen - slLen) > edThresh) {
- return -1;
+ return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
}
- float ed = getSimilarity(firstList, secondList);
- if (ed > edThresh) {
- return -1;
+ float ed = computeActualSimilarity(firstList, secondList, simThresh);
+ if (ed > edThresh || ed < 0) {
+ return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
} else {
return ed;
}
}
- public int getSimilarityContains(IListIterator exprList, IListIterator patternList, int simThresh)
+ public int getSimilarityContains(ISequenceIterator exprList, ISequenceIterator patternList, int simThresh)
throws HyracksDataException {
int exprLen = exprList.size();
int patternLen = patternList.size();
@@ -148,182 +206,50 @@
}
if (minEd > simThresh) {
- return -1;
+ return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
} else {
return minEd;
}
}
// faster implementation for common case of string edit distance
- public int UTF8StringEditDistance(byte[] leftBytes, int fsStart, byte[] rightBytes, int ssStart) {
- int fsLen = UTF8StringUtil.getStringLength(leftBytes, fsStart);
- int ssLen = UTF8StringUtil.getStringLength(rightBytes, ssStart);
-
- int fsUtfLen = UTF8StringUtil.getUTFLength(leftBytes, fsStart);
- int ssUtfLen = UTF8StringUtil.getUTFLength(rightBytes, ssStart);
- int fsMetaLen = UTF8StringUtil.getNumBytesToStoreLength(fsUtfLen);
- int ssMetaLen = UTF8StringUtil.getNumBytesToStoreLength(ssUtfLen);
-
- // reuse existing matrix if possible
- if (ssLen >= cols) {
- cols = ssLen + 1;
- matrix = new int[rows][cols];
- }
-
- int fsDataStart = fsStart + fsMetaLen;
- int ssDataStart = ssStart + ssMetaLen;
-
- // init matrix
- for (int i = 0; i <= ssLen; i++) {
- matrix[0][i] = i;
- }
-
- int currRow = 1;
- int prevRow = 0;
-
- // expand dynamic programming matrix row by row
- int fsPos = fsDataStart;
- for (int i = 1; i <= fsLen; i++) {
- matrix[currRow][0] = i;
- char fsChar = Character.toLowerCase(UTF8StringUtil.charAt(leftBytes, fsPos));
- int ssPos = ssDataStart;
- for (int j = 1; j <= ssLen; j++) {
- char ssChar = Character.toLowerCase(UTF8StringUtil.charAt(rightBytes, ssPos));
-
- matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
- matrix[prevRow][j - 1] + (fsChar == ssChar ? 0 : 1));
-
- ssPos += UTF8StringUtil.charSize(rightBytes, ssPos);
- }
- fsPos += UTF8StringUtil.charSize(leftBytes, fsPos);
- int tmp = currRow;
- currRow = prevRow;
- prevRow = tmp;
- }
- return matrix[prevRow][ssLen];
+ public int getActualUTF8StringEditDistanceVal(byte[] leftBytes, int fsStart, byte[] rightBytes, int ssStart,
+ int edThresh) throws HyracksDataException {
+ leftIt.reset(leftBytes, fsStart);
+ rightIt.reset(rightBytes, ssStart);
+ return (int) computeActualSimilarity(leftIt, rightIt, edThresh);
}
- public int UTF8StringEditDistance(byte[] bytesLeft, int fsStart, byte[] bytesRight, int ssStart, int edThresh) {
+ public int UTF8StringEditDistance(byte[] bytesLeft, int fsStart, byte[] bytesRight, int ssStart, int edThresh)
+ throws HyracksDataException {
int fsStrLen = UTF8StringUtil.getStringLength(bytesLeft, fsStart);
int ssStrLen = UTF8StringUtil.getStringLength(bytesRight, ssStart);
- int fsUtfLen = UTF8StringUtil.getUTFLength(bytesLeft, fsStart);
- int ssUtfLen = UTF8StringUtil.getUTFLength(bytesRight, ssStart);
- int fsMetaLen = UTF8StringUtil.getNumBytesToStoreLength(fsUtfLen);
- int ssMetaLen = UTF8StringUtil.getNumBytesToStoreLength(ssUtfLen);
-
// length filter
if (Math.abs(fsStrLen - ssStrLen) > edThresh) {
- return -1;
+ return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
}
- // initialize letter count filtering
- Arrays.fill(fsLcCount, 0);
- Arrays.fill(ssLcCount, 0);
-
- // compute letter counts for first string
- int fsPos = fsStart + fsMetaLen;
- int fsEnd = fsPos + fsUtfLen;;
- while (fsPos < fsEnd) {
- char c = Character.toLowerCase(UTF8StringUtil.charAt(bytesLeft, fsPos));
- if (c < 128) {
- fsLcCount[c]++;
- }
- fsPos += UTF8StringUtil.charSize(bytesLeft, fsPos);
- }
-
- // compute letter counts for second string
- int ssPos = ssStart + ssMetaLen;
- int ssEnd = ssPos + ssUtfLen;
- while (ssPos < ssEnd) {
- char c = Character.toLowerCase(UTF8StringUtil.charAt(bytesRight, ssPos));
- if (c < 128) {
- ssLcCount[c]++;
- }
- ssPos += UTF8StringUtil.charSize(bytesRight, ssPos);
- }
-
- // apply filter
- int gtSum = 0;
- int ltSum = 0;
- for (int i = 0; i < 128; i++) {
- if (fsLcCount[i] > ssLcCount[i]) {
- gtSum += fsLcCount[i] - ssLcCount[i];
- if (gtSum > edThresh) {
- return -1;
- }
- } else {
- ltSum += ssLcCount[i] - fsLcCount[i];
- if (ltSum > edThresh) {
- return -1;
- }
- }
- }
-
- int ed = UTF8StringEditDistance(bytesLeft, fsStart, bytesRight, ssStart);
- if (ed > edThresh) {
- return -1;
+ int ed = getActualUTF8StringEditDistanceVal(bytesLeft, fsStart, bytesRight, ssStart, edThresh);
+ if (ed > edThresh || ed < 0) {
+ return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
} else {
return ed;
}
}
// checks whether the first string contains a similar string to the second string
- public int UTF8StringEditDistanceContains(byte[] strBytes, int stringStart, byte[] pattenBytes, int patternStart,
- int edThresh) {
+ public int UTF8StringEditDistanceContains(byte[] strBytes, int stringStart, byte[] patternBytes, int patternStart,
+ int edThresh) throws HyracksDataException {
+ leftIt.reset(strBytes, stringStart);
+ rightIt.reset(patternBytes, patternStart);
+ return getSimilarityContains(leftIt, rightIt, edThresh);
+ }
- int stringLen = UTF8StringUtil.getStringLength(strBytes, stringStart);
- int patternLen = UTF8StringUtil.getStringLength(pattenBytes, patternStart);
-
- int stringUTFLen = UTF8StringUtil.getUTFLength(strBytes, stringStart);
- int stringMetaLen = UTF8StringUtil.getNumBytesToStoreLength(stringUTFLen);
-
- int patternUTFLen = UTF8StringUtil.getUTFLength(pattenBytes, patternStart);
- int patternMetaLen = UTF8StringUtil.getNumBytesToStoreLength(patternUTFLen);
-
- // reuse existing matrix if possible
- if (patternLen >= cols) {
- cols = patternLen + 1;
- matrix = new int[rows][cols];
- }
-
- int stringDataStart = stringStart + stringMetaLen;
- int patternDataStart = patternStart + patternMetaLen;
-
- // init matrix
- for (int i = 0; i <= patternLen; i++) {
- matrix[0][i] = i;
- }
-
- int currRow = 1;
- int prevRow = 0;
- int minEd = Integer.MAX_VALUE;
- // expand dynamic programming matrix row by row
- int stringPos = stringDataStart;
- for (int i = 1; i <= stringLen; i++) {
- matrix[currRow][0] = 0;
- char stringChar = Character.toLowerCase(UTF8StringUtil.charAt(strBytes, stringPos));
-
- int patternPos = patternDataStart;
- for (int j = 1; j <= patternLen; j++) {
- char patternChar = Character.toLowerCase(UTF8StringUtil.charAt(pattenBytes, patternPos));
- matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
- matrix[prevRow][j - 1] + (stringChar == patternChar ? 0 : 1));
- patternPos += UTF8StringUtil.charSize(pattenBytes, patternPos);
- if (j == patternLen && matrix[currRow][patternLen] < minEd) {
- minEd = matrix[currRow][patternLen];
- }
- }
-
- stringPos += UTF8StringUtil.charSize(strBytes, stringPos);
- int tmp = currRow;
- currRow = prevRow;
- prevRow = tmp;
- }
- if (minEd > edThresh) {
- return -1;
- } else {
- return minEd;
- }
+ @Override
+ public float computeSimilarity(ISequenceIterator firstSequence, ISequenceIterator secondSequence)
+ throws HyracksDataException {
+ // Passes -1 as the simThresh to calculate the edit distance without applying any calculation optimizations.
+ return computeActualSimilarity(firstSequence, secondSequence, -1);
}
}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
index f4162c7..4a31b8b 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
@@ -24,6 +24,7 @@
import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
public class SimilarityMetricJaccard extends SimilarityMetric implements IGenericSimilarityMetric {
@@ -44,24 +45,8 @@
return ((float) setX.size()) / (tokensX.length + tokensY.length - setX.size());
}
- // @Override
- // public float getSimilarity(DataBag tokensX, DataBag tokensY) {
- // return getSimilarity(tokensX, (int) tokensX.size(), tokensY,
- // (int) tokensY.size());
- // }
-
- // @Override
- // public float getSimilarity(DataBag tokensX, int lengthX, DataBag tokensY,
- // int lengthY) {
- // int intersectionSize = SimilarityMetric.getIntersectSize(tokensX,
- // tokensY);
- // int totalSize = lengthX + lengthY;
- //
- // return (float) intersectionSize / (totalSize - intersectionSize);
- // }
-
@Override
- public float getSimilarity(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException {
+ public float computeSimilarity(ISequenceIterator tokensX, ISequenceIterator tokensY) throws HyracksDataException {
int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
int totalSize = tokensX.size() + tokensY.size();
@@ -69,7 +54,7 @@
}
@Override
- public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh)
+ public float computeSimilarity(ISequenceIterator firstList, ISequenceIterator secondList, float simThresh)
throws HyracksDataException {
// apply length filter
@@ -81,7 +66,7 @@
return -1f;
}
- float jacc = getSimilarity(firstList, secondList);
+ float jacc = computeSimilarity(firstList, secondList);
if (jacc < simThresh) {
return -1f;
} else {
diff --git a/asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java b/asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java
new file mode 100644
index 0000000..caa80bc
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.asterix.fuzzyjoin.similarity;
+
+import static org.apache.hyracks.data.std.primitive.UTF8StringPointable.generateUTF8Pointable;
+import static org.junit.Assert.assertEquals;
+
+import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
+import org.junit.Test;
+
+public class SimilarityMetricEditDistanceTest {
+
+ private static final SimilarityMetricEditDistance ed = new SimilarityMetricEditDistance();
+
+ @Test
+ public void test() throws Exception {
+ // For this case, the edit-distance of two strings is 3.
+ UTF8StringPointable leftStrPointable1 = generateUTF8Pointable("coupon not available in store");
+ UTF8StringPointable rightStrPointable1 = generateUTF8Pointable("coupon is available in store");
+
+ // The edit-distance between leftStrPointable1 and the following is 14.
+ UTF8StringPointable rightStrPointable2 = generateUTF8Pointable("coupon in store");
+
+ byte[] leftBytes1 = leftStrPointable1.getByteArray();
+ int leftStartOffset1 = leftStrPointable1.getStartOffset();
+ byte[] rightBytes1 = rightStrPointable1.getByteArray();
+ int rightStartOffset1 = rightStrPointable1.getStartOffset();
+ byte[] rightBytes2 = rightStrPointable2.getByteArray();
+ int rightStartOffset2 = rightStrPointable2.getStartOffset();
+
+ // Case 1 - normal - no early termination
+ int edThresh = 3;
+ int edVal = ed.UTF8StringEditDistance(leftBytes1, leftStartOffset1, rightBytes1, rightStartOffset1, edThresh);
+ assertEquals(edThresh, edVal);
+
+ // Case 2 - the length difference between two strings is greater than edThresh.
+ // Even without calculating the distance, the method should return -1.
+ edVal = ed.UTF8StringEditDistance(leftBytes1, leftStartOffset1, rightBytes2, rightStartOffset2, edThresh);
+ assertEquals(SimilarityMetricEditDistance.SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE, edVal);
+
+ // Case 3 - the edit distance is 14, but the threshold is 1.
+ // The length difference (14) exceeds the threshold, so the length filter applies and the returned value should be -1.
+ edThresh = 1;
+ edVal = ed.UTF8StringEditDistance(leftBytes1, leftStartOffset1, rightBytes2, rightStartOffset2, edThresh);
+ assertEquals(SimilarityMetricEditDistance.SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE, edVal);
+
+ // Case 4 - the edit distance is 14, but the threshold is 13.
+ // The length difference (14) is still greater than the given threshold, so the length filter applies again.
+ // So, the final returned value should be -1.
+ edThresh = 13;
+ edVal = ed.UTF8StringEditDistance(leftBytes1, leftStartOffset1, rightBytes2, rightStartOffset2, edThresh);
+ assertEquals(SimilarityMetricEditDistance.SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE, edVal);
+ }
+
+}
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
index b929854..2435047 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
@@ -20,13 +20,13 @@
import org.apache.asterix.common.exceptions.AsterixException;
import org.apache.asterix.formats.nontagged.BinaryComparatorFactoryProvider;
-import org.apache.asterix.fuzzyjoin.similarity.IListIterator;
import org.apache.asterix.om.types.ATypeTag;
import org.apache.asterix.om.types.EnumDeserializer;
import org.apache.hyracks.api.dataflow.value.IBinaryComparator;
import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
-public abstract class AbstractAsterixListIterator implements IListIterator {
+public abstract class AbstractAsterixListIterator implements ISequenceIterator {
protected byte[] data;
protected int count = 0;
@@ -42,7 +42,7 @@
protected final boolean ignoreCase = true;
@Override
- public int compare(IListIterator cmpIter) throws HyracksDataException {
+ public int compare(ISequenceIterator cmpIter) throws HyracksDataException {
return cmp.compare(data, pos, -1, cmpIter.getData(), cmpIter.getPos(), -1);
}
@@ -100,6 +100,7 @@
}
}
+ @Override
public void reset(byte[] data, int startOff) throws HyracksDataException {
this.data = data;
this.startOff = startOff;
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
index fee34b9..63e9e44 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
@@ -21,6 +21,8 @@
import java.io.IOException;
import org.apache.asterix.builders.OrderedListBuilder;
+import org.apache.asterix.common.exceptions.ErrorCode;
+import org.apache.asterix.common.exceptions.RuntimeDataException;
import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
import org.apache.asterix.om.base.ABoolean;
import org.apache.asterix.om.functions.BuiltinFunctions;
@@ -77,6 +79,10 @@
try {
edThresh = ATypeHierarchy.getIntegerValue(BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(), 2,
argPtrThreshold.getByteArray(), argPtrThreshold.getStartOffset());
+ if (edThresh < 0) {
+ throw new RuntimeDataException(ErrorCode.NEGATIVE_VALUE, BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(),
+ 3, edThresh);
+ }
editDistance = computeResult(argPtr1, argPtr2, firstTypeTag);
writeResult(editDistance);
} catch (IOException e) {
@@ -101,7 +107,7 @@
case ORDEREDLIST: {
firstOrdListIter.reset(leftBytes, leftStartOffset);
secondOrdListIter.reset(rightBytes, rightStartOffset);
- return (int) ed.getSimilarity(firstOrdListIter, secondOrdListIter, edThresh);
+ return (int) ed.computeSimilarity(firstOrdListIter, secondOrdListIter, edThresh);
}
default: {
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
index c9d3731..dbe99b9 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
@@ -105,13 +105,15 @@
switch (argType) {
case STRING: {
- return ed.UTF8StringEditDistance(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
- rightStartOffset + typeIndicatorSize);
+ // Passes -1 as the simThresh to calculate the edit distance
+ // without applying any calculation optimizations.
+ return ed.getActualUTF8StringEditDistanceVal(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
+ rightStartOffset + typeIndicatorSize, -1);
}
case ORDEREDLIST: {
firstOrdListIter.reset(leftBytes, leftStartOffset);
secondOrdListIter.reset(rightBytes, rightStartOffset);
- return (int) ed.getSimilarity(firstOrdListIter, secondOrdListIter);
+ return (int) ed.computeSimilarity(firstOrdListIter, secondOrdListIter);
}
default: {
throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE, 0, argType.serialize(),
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
index 19e9395..0decd9e 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
@@ -34,6 +34,6 @@
@Override
protected float computeResult() throws HyracksDataException {
- return jaccard.getSimilarity(firstListIter, secondListIter, jaccThresh);
+ return jaccard.computeSimilarity(firstListIter, secondListIter, jaccThresh);
}
}
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
index d40cb67..1cd32c8 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
@@ -35,6 +35,6 @@
@Override
protected float computeResult() throws HyracksDataException {
- return jaccard.getSimilarity(firstListIter, secondListIter);
+ return jaccard.computeSimilarity(firstListIter, secondListIter);
}
}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IListIterator.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
similarity index 81%
rename from asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IListIterator.java
rename to hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
index 6c3d22e..9441453 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IListIterator.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
@@ -17,12 +17,12 @@
* under the License.
*/
-package org.apache.asterix.fuzzyjoin.similarity;
+package org.apache.hyracks.data.std.util;
import org.apache.hyracks.api.exceptions.HyracksDataException;
-public interface IListIterator {
- public int compare(IListIterator cmpIter) throws HyracksDataException;
+public interface ISequenceIterator {
+ public int compare(ISequenceIterator cmpIter) throws HyracksDataException;
public byte[] getData();
@@ -34,5 +34,7 @@
public void reset() throws HyracksDataException;
+ public void reset(byte[] data, int startOff) throws HyracksDataException;
+
public int size();
}
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
new file mode 100644
index 0000000..237d291
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.data.std.util;
+
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.util.string.UTF8StringUtil;
+
+/**
+ * An iterator class for a String. This class iterates over the given String one char at a time.
+ */
+public class UTF8StringCharByCharIterator implements ISequenceIterator {
+
+ protected byte[] data;
+ protected int pos = -1;
+ protected int length = -1;
+ protected int utfByteLength = -1;
+ protected int metaLength = -1;
+ protected int startOffset = -1;
+
+ @Override
+ public int compare(ISequenceIterator cmpIter) throws HyracksDataException {
+ char thisChar = Character.toLowerCase(UTF8StringUtil.charAt(data, pos));
+ char thatChar = Character.toLowerCase(UTF8StringUtil.charAt(cmpIter.getData(), cmpIter.getPos()));
+ if (thisChar == thatChar) {
+ return 0;
+ }
+ return -1;
+ }
+
+ @Override
+ public boolean hasNext() {
+ return pos < utfByteLength;
+ }
+
+ @Override
+ public int size() {
+ return length;
+ }
+
+ @Override
+ public byte[] getData() {
+ return data;
+ }
+
+ @Override
+ public int getPos() {
+ return pos;
+ }
+
+ @Override
+ public void next() throws HyracksDataException {
+ pos += UTF8StringUtil.charSize(data, pos);
+ }
+
+ @Override
+ public void reset() throws HyracksDataException {
+ pos = startOffset + metaLength;
+ }
+
+ @Override
+ public void reset(byte[] data, int startOff) throws HyracksDataException {
+ this.data = data;
+ this.startOffset = startOff;
+ this.length = UTF8StringUtil.getStringLength(data, startOffset);
+ this.utfByteLength = UTF8StringUtil.getUTFLength(data, startOffset);
+ this.metaLength = UTF8StringUtil.getNumBytesToStoreLength(utfByteLength);
+ reset();
+ }
+
+}
--
To view, visit https://asterix-gerrit.ics.uci.edu/1481
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Gerrit-PatchSet: 9
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <wa...@gmail.com>
Gerrit-Reviewer: Chen Li <ch...@gmail.com>
Gerrit-Reviewer: Jenkins <je...@fulliautomatix.ics.uci.edu>
Gerrit-Reviewer: Jianfeng Jia <ji...@gmail.com>
Gerrit-Reviewer: Steven Jacobs <sj...@ucr.edu>
Gerrit-Reviewer: Taewoo Kim <wa...@gmail.com>