You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by mb...@apache.org on 2018/03/14 04:08:20 UTC

[7/7] asterixdb git commit: [ASTERIXDB-2330][*DB][RT] Add IFunctionRegistrant for dynamic function registration

[ASTERIXDB-2330][*DB][RT] Add IFunctionRegistrant for dynamic function registration

- user model changes: no
- storage format changes: no
- interface changes: yes

Details:

- Look for IFunctionRegistrant service instances at runtime, and use these to dynamically
register non-core functions with *DB
- Extract fuzzyjoin functions from core runtime

Change-Id: Ia88590280cbf476e08b905d9e1d62c68667a2569
Reviewed-on: https://asterix-gerrit.ics.uci.edu/2480
Reviewed-by: abdullah alamoudi <ba...@gmail.com>
Tested-by: Michael Blow <mb...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/asterixdb/repo
Commit: http://git-wip-us.apache.org/repos/asf/asterixdb/commit/b8307794
Tree: http://git-wip-us.apache.org/repos/asf/asterixdb/tree/b8307794
Diff: http://git-wip-us.apache.org/repos/asf/asterixdb/diff/b8307794

Branch: refs/heads/master
Commit: b8307794c3483199a88266e8646bfbad79cc70f9
Parents: 38c41ac
Author: Michael Blow <mb...@apache.org>
Authored: Tue Mar 13 20:45:51 2018 -0700
Committer: Michael Blow <mb...@apache.org>
Committed: Tue Mar 13 21:07:36 2018 -0700

----------------------------------------------------------------------
 asterixdb/asterix-app/pom.xml                   |    7 +
 .../apache/asterix/runtime/NullMissingTest.java |    5 +-
 asterixdb/asterix-fuzzyjoin/pom.xml             |   64 +-
 .../runtime/FuzzyJoinFunctionRegistrant.java    |   72 ++
 .../common/EditDistanceCheckEvaluator.java      |  134 +++
 .../common/EditDistanceContainsEvaluator.java   |   59 +
 .../common/EditDistanceEvaluator.java           |  156 +++
 .../evaluators/common/GramTokensEvaluator.java  |   95 ++
 .../common/SimilarityFiltersCache.java          |   53 +
 .../common/SimilarityJaccardCheckEvaluator.java |  166 +++
 .../common/SimilarityJaccardEvaluator.java      |  277 +++++
 .../SimilarityJaccardPrefixEvaluator.java       |  230 ++++
 .../SimilarityJaccardSortedCheckEvaluator.java  |   39 +
 .../SimilarityJaccardSortedEvaluator.java       |   40 +
 .../evaluators/common/WordTokensEvaluator.java  |   71 ++
 .../CountHashedGramTokensDescriptor.java        |   68 ++
 .../CountHashedWordTokensDescriptor.java        |   66 ++
 .../functions/EditDistanceCheckDescriptor.java  |   58 +
 .../EditDistanceContainsDescriptor.java         |   58 +
 .../functions/EditDistanceDescriptor.java       |   59 +
 .../EditDistanceListIsFilterableDescriptor.java |  149 +++
 ...ditDistanceStringIsFilterableDescriptor.java |   65 ++
 ...EditDistanceStringIsFilterableEvaluator.java |  121 ++
 .../functions/GramTokensDescriptor.java         |   66 ++
 .../functions/HashedGramTokensDescriptor.java   |   66 ++
 .../functions/HashedWordTokensDescriptor.java   |   66 ++
 .../functions/PrefixLenDescriptor.java          |  141 +++
 .../functions/PrefixLenJaccardDescriptor.java   |  126 ++
 .../functions/SimilarityDescriptor.java         |  268 +++++
 .../SimilarityJaccardCheckDescriptor.java       |   58 +
 .../functions/SimilarityJaccardDescriptor.java  |   59 +
 .../SimilarityJaccardPrefixCheckDescriptor.java |  110 ++
 .../SimilarityJaccardPrefixDescriptor.java      |   59 +
 .../SimilarityJaccardSortedCheckDescriptor.java |   59 +
 .../SimilarityJaccardSortedDescriptor.java      |   60 +
 .../functions/SpatialIntersectDescriptor.java   | 1091 ++++++++++++++++++
 .../functions/WordTokensDescriptor.java         |   66 ++
 ...che.asterix.om.functions.IFunctionRegistrant |   20 +
 .../om/functions/IFunctionCollection.java       |   27 +
 .../om/functions/IFunctionRegistrant.java       |   25 +
 ...AbstractScalarFunctionDynamicDescriptor.java |   33 +
 asterixdb/asterix-runtime/pom.xml               |    6 -
 ...AbstractScalarFunctionDynamicDescriptor.java |   33 -
 .../common/EditDistanceCheckEvaluator.java      |  134 ---
 .../common/EditDistanceContainsEvaluator.java   |   59 -
 .../common/EditDistanceEvaluator.java           |  156 ---
 .../evaluators/common/GramTokensEvaluator.java  |   95 --
 .../common/SimilarityFiltersCache.java          |   53 -
 .../common/SimilarityJaccardCheckEvaluator.java |  166 ---
 .../common/SimilarityJaccardEvaluator.java      |  277 -----
 .../SimilarityJaccardPrefixEvaluator.java       |  230 ----
 .../SimilarityJaccardSortedCheckEvaluator.java  |   39 -
 .../SimilarityJaccardSortedEvaluator.java       |   40 -
 .../evaluators/common/WordTokensEvaluator.java  |   71 --
 .../CountHashedGramTokensDescriptor.java        |   68 --
 .../CountHashedWordTokensDescriptor.java        |   66 --
 .../functions/EditDistanceCheckDescriptor.java  |   58 -
 .../EditDistanceContainsDescriptor.java         |   58 -
 .../functions/EditDistanceDescriptor.java       |   59 -
 .../EditDistanceListIsFilterableDescriptor.java |  149 ---
 ...ditDistanceStringIsFilterableDescriptor.java |   65 --
 ...EditDistanceStringIsFilterableEvaluator.java |  121 --
 .../functions/GramTokensDescriptor.java         |   66 --
 .../functions/HashedGramTokensDescriptor.java   |   66 --
 .../functions/HashedWordTokensDescriptor.java   |   66 --
 .../functions/PrefixLenDescriptor.java          |  141 ---
 .../functions/PrefixLenJaccardDescriptor.java   |  126 --
 .../functions/SimilarityDescriptor.java         |  268 -----
 .../SimilarityJaccardCheckDescriptor.java       |   58 -
 .../functions/SimilarityJaccardDescriptor.java  |   59 -
 .../SimilarityJaccardPrefixCheckDescriptor.java |  110 --
 .../SimilarityJaccardPrefixDescriptor.java      |   59 -
 .../SimilarityJaccardSortedCheckDescriptor.java |   59 -
 .../SimilarityJaccardSortedDescriptor.java      |   60 -
 .../functions/SpatialIntersectDescriptor.java   | 1091 ------------------
 .../functions/WordTokensDescriptor.java         |   66 --
 .../runtime/functions/FunctionCollection.java   |   55 +-
 asterixdb/asterix-server/pom.xml                |    6 +
 78 files changed, 4513 insertions(+), 4358 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-app/pom.xml
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/pom.xml b/asterixdb/asterix-app/pom.xml
index e75ad42..b51d499 100644
--- a/asterixdb/asterix-app/pom.xml
+++ b/asterixdb/asterix-app/pom.xml
@@ -152,6 +152,7 @@
           </ignoredUsedUndeclaredDependencies>
           <usedDependencies combine.children="append">
             <usedDependency>org.apache.hadoop:hadoop-common</usedDependency>
+            <usedDependency>org.apache.asterix:asterix-fuzzyjoin</usedDependency>
           </usedDependencies>
           <ignoredUnusedDeclaredDependencies>
             <ignoredUnusedDeclaredDependency>org.apache.asterix:asterix-external-data:zip:*</ignoredUnusedDeclaredDependency>
@@ -643,5 +644,11 @@
       <artifactId>log4j-jul</artifactId>
       <version>2.10.0</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.asterix</groupId>
+      <artifactId>asterix-fuzzyjoin</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 </project>

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-app/src/test/java/org/apache/asterix/runtime/NullMissingTest.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/runtime/NullMissingTest.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/runtime/NullMissingTest.java
index 3b33868..d96fec9 100644
--- a/asterixdb/asterix-app/src/test/java/org/apache/asterix/runtime/NullMissingTest.java
+++ b/asterixdb/asterix-app/src/test/java/org/apache/asterix/runtime/NullMissingTest.java
@@ -55,8 +55,9 @@ public class NullMissingTest {
                 ++testedFunctions;
             }
         }
-        // 208 is the current number of functions with generated code.
-        Assert.assertTrue(testedFunctions >= 208);
+        // 217 is the current number of functions with generated code.
+        Assert.assertTrue("expected >= 217 generated functions to be tested, but was " + testedFunctions,
+                testedFunctions >= 217);
     }
 
     private void testFunction(IFunctionDescriptorFactory funcFactory) throws Exception {

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/pom.xml
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/pom.xml b/asterixdb/asterix-fuzzyjoin/pom.xml
index 236bcc7..8056cdf 100644
--- a/asterixdb/asterix-fuzzyjoin/pom.xml
+++ b/asterixdb/asterix-fuzzyjoin/pom.xml
@@ -23,7 +23,6 @@
     <groupId>org.apache.asterix</groupId>
     <version>0.9.4-SNAPSHOT</version>
   </parent>
-  <groupId>org.apache.asterix</groupId>
   <artifactId>asterix-fuzzyjoin</artifactId>
 
   <licenses>
@@ -42,6 +41,23 @@
   <build>
     <plugins>
       <plugin>
+        <groupId>org.apache.asterix</groupId>
+        <artifactId>asterix-evaluator-generator-maven-plugin</artifactId>
+        <version>${project.version}</version>
+        <configuration>
+          <evaluatorPackagePrefix>org.apache.asterix.runtime.evaluators</evaluatorPackagePrefix>
+        </configuration>
+        <executions>
+          <execution>
+            <id>generate-evaluator</id>
+            <phase>process-classes</phase>
+            <goals>
+              <goal>generate-evaluator</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-jar-plugin</artifactId>
         <executions>
@@ -52,17 +68,6 @@
             <phase>test-compile</phase>
           </execution>
         </executions>
-        <configuration>
-          <outputDirectory>${basedir}/target</outputDirectory>
-          <includes>
-            <include>**/*.class</include>
-            <include>**/*.txt</include>
-            <include>**/README*</include>
-            <include>**/NOTICE*</include>
-            <include>**/LICENSE*</include>
-            <include>**/DEPENDENCIES*</include>
-          </includes>
-        </configuration>
       </plugin>
       <plugin>
         <groupId>org.apache.rat</groupId>
@@ -102,6 +107,41 @@
       <groupId>org.apache.hyracks</groupId>
       <artifactId>hyracks-data-std</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.asterix</groupId>
+      <artifactId>asterix-om</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.asterix</groupId>
+      <artifactId>asterix-runtime</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hyracks</groupId>
+      <artifactId>algebricks-common</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hyracks</groupId>
+      <artifactId>algebricks-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hyracks</groupId>
+      <artifactId>hyracks-storage-am-lsm-invertedindex</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hyracks</groupId>
+      <artifactId>algebricks-runtime</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.asterix</groupId>
+      <artifactId>asterix-common</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.hyracks</groupId>
+      <artifactId>hyracks-dataflow-common</artifactId>
+    </dependency>
   </dependencies>
 
 </project>

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/FuzzyJoinFunctionRegistrant.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/FuzzyJoinFunctionRegistrant.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/FuzzyJoinFunctionRegistrant.java
new file mode 100644
index 0000000..35d8727
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/FuzzyJoinFunctionRegistrant.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime;
+
+import org.apache.asterix.om.functions.IFunctionCollection;
+import org.apache.asterix.om.functions.IFunctionRegistrant;
+import org.apache.asterix.runtime.evaluators.functions.CountHashedGramTokensDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.CountHashedWordTokensDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.EditDistanceCheckDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.EditDistanceContainsDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.EditDistanceDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.EditDistanceListIsFilterableDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.EditDistanceStringIsFilterableDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.GramTokensDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.HashedGramTokensDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.HashedWordTokensDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.PrefixLenJaccardDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.SimilarityJaccardCheckDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.SimilarityJaccardDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.SimilarityJaccardPrefixCheckDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.SimilarityJaccardPrefixDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.SimilarityJaccardSortedCheckDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.SimilarityJaccardSortedDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.SpatialIntersectDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.WordTokensDescriptor;
+
+public class FuzzyJoinFunctionRegistrant implements IFunctionRegistrant { // registrant for asterix-fuzzyjoin
+    @Override
+    public void register(IFunctionCollection fc) { // adds each descriptor factory listed below to fc
+        // TODO: decide how we should deal with these two unusual functions, since the number of
+        // arguments each one takes depends on its first few arguments.
+        fc.add(SimilarityJaccardPrefixDescriptor.FACTORY); // plain add(), not addGenerated()
+        fc.add(SimilarityJaccardPrefixCheckDescriptor.FACTORY);
+
+        // Spatial function
+        fc.addGenerated(SpatialIntersectDescriptor.FACTORY);
+
+        // Fuzzyjoin functions
+        fc.addGenerated(PrefixLenJaccardDescriptor.FACTORY);
+        fc.addGenerated(WordTokensDescriptor.FACTORY);
+        fc.addGenerated(HashedWordTokensDescriptor.FACTORY);
+        fc.addGenerated(CountHashedWordTokensDescriptor.FACTORY);
+        fc.addGenerated(GramTokensDescriptor.FACTORY);
+        fc.addGenerated(HashedGramTokensDescriptor.FACTORY);
+        fc.addGenerated(CountHashedGramTokensDescriptor.FACTORY);
+        fc.addGenerated(EditDistanceDescriptor.FACTORY);
+        fc.addGenerated(EditDistanceCheckDescriptor.FACTORY);
+        fc.addGenerated(EditDistanceStringIsFilterableDescriptor.FACTORY);
+        fc.addGenerated(EditDistanceListIsFilterableDescriptor.FACTORY);
+        fc.addGenerated(EditDistanceContainsDescriptor.FACTORY);
+        fc.addGenerated(SimilarityJaccardDescriptor.FACTORY);
+        fc.addGenerated(SimilarityJaccardCheckDescriptor.FACTORY);
+        fc.addGenerated(SimilarityJaccardSortedDescriptor.FACTORY);
+        fc.addGenerated(SimilarityJaccardSortedCheckDescriptor.FACTORY);
+    }
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
new file mode 100644
index 0000000..dece292
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import java.io.IOException;
+
+import org.apache.asterix.builders.OrderedListBuilder;
+import org.apache.asterix.common.exceptions.ErrorCode;
+import org.apache.asterix.common.exceptions.RuntimeDataException;
+import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
+import org.apache.asterix.om.base.ABoolean;
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.types.AOrderedListType;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.om.types.EnumDeserializer;
+import org.apache.asterix.om.types.hierachy.ATypeHierarchy;
+import org.apache.asterix.runtime.exceptions.TypeMismatchException;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+
+public class EditDistanceCheckEvaluator extends EditDistanceEvaluator {
+    // Edit-distance evaluator that takes a threshold as args[2] and writes a [matched, distance] list.
+    protected final IScalarEvaluator edThreshEval; // evaluator created from args[2]
+    protected int edThresh; // threshold value, re-read on every evaluate() call
+    private final IPointable argPtrThreshold = new VoidPointable(); // receives the evaluated threshold
+    protected final OrderedListBuilder listBuilder; // builds the two-item result list
+    protected ArrayBackedValueStorage listItemVal; // scratch storage for one list item at a time
+    @SuppressWarnings("unchecked")
+    protected final ISerializerDeserializer<ABoolean> booleanSerde =
+            SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.ABOOLEAN);
+    // Passes args to the superclass and additionally creates the threshold evaluator from args[2].
+    public EditDistanceCheckEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
+            throws HyracksDataException {
+        super(args, context);
+        edThreshEval = args[2].createScalarEvaluator(context);
+        listBuilder = new OrderedListBuilder();
+        listItemVal = new ArrayBackedValueStorage();
+    }
+    // Evaluates both inputs and the threshold, then writes the check result into result.
+    @Override
+    public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+        resultStorage.reset();
+        firstStringEval.evaluate(tuple, argPtr1);
+        firstTypeTag = // the type tag is the first byte of the evaluated argument
+                EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr1.getByteArray()[argPtr1.getStartOffset()]);
+        secondStringEval.evaluate(tuple, argPtr2);
+        secondTypeTag =
+                EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr2.getByteArray()[argPtr2.getStartOffset()]);
+        edThreshEval.evaluate(tuple, argPtrThreshold);
+        // checkArgTypes is inherited (not shown here); on false, result is set without computing a distance.
+        if (!checkArgTypes(firstTypeTag, secondTypeTag)) {
+            result.set(resultStorage);
+            return;
+        }
+        try {
+            edThresh = ATypeHierarchy.getIntegerValue(BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(), 2,
+                    argPtrThreshold.getByteArray(), argPtrThreshold.getStartOffset());
+            if (edThresh < 0) { // a negative threshold is rejected
+                throw new RuntimeDataException(ErrorCode.NEGATIVE_VALUE, BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(),
+                        3, edThresh); // NOTE(review): 3 vs. 2 above; presumably 1-based position, confirm
+            }
+            editDistance = computeResult(argPtr1, argPtr2, firstTypeTag);
+            writeResult(editDistance);
+        } catch (IOException e) {
+            throw new HyracksDataException(e);
+        }
+        result.set(resultStorage);
+    }
+    // Computes the thresholded edit distance for STRING or ARRAY inputs; other type tags are a type mismatch.
+    @Override
+    protected int computeResult(IPointable left, IPointable right, ATypeTag argType) throws HyracksDataException {
+        byte[] leftBytes = left.getByteArray();
+        int leftStartOffset = left.getStartOffset();
+        byte[] rightBytes = right.getByteArray();
+        int rightStartOffset = right.getStartOffset();
+        switch (argType) {
+            case STRING: { // offsets skip typeIndicatorSize bytes (presumably the type tag; confirm)
+                return ed.UTF8StringEditDistance(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
+                        rightStartOffset + typeIndicatorSize, edThresh);
+            }
+
+            case ARRAY: { // both list iterators are reset onto the argument bytes before comparing
+                firstOrdListIter.reset(leftBytes, leftStartOffset);
+                secondOrdListIter.reset(rightBytes, rightStartOffset);
+                return (int) ed.computeSimilarity(firstOrdListIter, secondOrdListIter, edThresh);
+            }
+
+            default: {
+                throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE_CHECK, 0, argType.serialize(),
+                        ATypeTag.SERIALIZED_STRING_TYPE_TAG, ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG);
+            }
+
+        }
+    }
+    // Writes an ordered list [matched, distance]; distance is Integer.MAX_VALUE when ed is negative.
+    @Override
+    protected void writeResult(int ed) throws IOException {
+        listBuilder.reset(new AOrderedListType(BuiltinType.ANY, "list"));
+        boolean matches = (ed < 0) ? false : true; // a negative ed is treated as "no match"
+        listItemVal.reset();
+        booleanSerde.serialize(matches ? ABoolean.TRUE : ABoolean.FALSE, listItemVal.getDataOutput());
+        listBuilder.addItem(listItemVal);
+
+        listItemVal.reset(); // reuse the same storage for the second item
+        aInt64.setValue((matches) ? ed : Integer.MAX_VALUE);
+        int64Serde.serialize(aInt64, listItemVal.getDataOutput());
+        listBuilder.addItem(listItemVal);
+        listBuilder.write(out, true);
+    }
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceContainsEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceContainsEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceContainsEvaluator.java
new file mode 100644
index 0000000..eaf3368
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceContainsEvaluator.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import org.apache.asterix.runtime.exceptions.TypeMismatchException;
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+
+public class EditDistanceContainsEvaluator extends EditDistanceCheckEvaluator {
+    // Overrides only computeResult(), swapping in the "contains" calls on ed; the rest is inherited.
+    public EditDistanceContainsEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
+            throws HyracksDataException {
+        super(args, context);
+    }
+    // Computes the "contains" result for STRING or ARRAY inputs using the inherited threshold edThresh.
+    @Override
+    protected int computeResult(IPointable left, IPointable right, ATypeTag argType) throws HyracksDataException {
+        byte[] leftBytes = left.getByteArray();
+        int leftStartOffset = left.getStartOffset();
+        byte[] rightBytes = right.getByteArray();
+        int rightStartOffset = right.getStartOffset();
+
+        switch (argType) {
+            case STRING: { // string offsets are advanced by typeIndicatorSize; list offsets below are not
+                return ed.UTF8StringEditDistanceContains(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
+                        rightStartOffset + typeIndicatorSize, edThresh);
+            }
+            case ARRAY: {
+                firstOrdListIter.reset(leftBytes, leftStartOffset);
+                secondOrdListIter.reset(rightBytes, rightStartOffset);
+                return ed.getSimilarityContains(firstOrdListIter, secondOrdListIter, edThresh);
+            }
+            default: { // any other type tag is reported against EDIT_DISTANCE_CONTAINS, argument index 0
+                throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE_CONTAINS, 0, argType.serialize(),
+                        ATypeTag.SERIALIZED_STRING_TYPE_TAG, ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG);
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
new file mode 100644
index 0000000..85fd334
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityMetricEditDistance;
+import org.apache.asterix.om.base.AInt64;
+import org.apache.asterix.om.base.AMutableInt64;
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.om.types.EnumDeserializer;
+import org.apache.asterix.runtime.exceptions.IncompatibleTypeException;
+import org.apache.asterix.runtime.exceptions.TypeMismatchException;
+import org.apache.asterix.runtime.exceptions.UnsupportedItemTypeException;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+
+/**
+ * Scalar evaluator for the edit-distance function: evaluates two arguments against a tuple and
+ * writes their edit distance to the result as a serialized int64.
+ * <p>
+ * Both arguments must carry the same type tag, and that tag must be STRING or ARRAY; any other
+ * combination is rejected with an exception. The evaluator reuses its fields as scratch state
+ * from one call to the next.
+ */
+public class EditDistanceEvaluator implements IScalarEvaluator {
+
+    // Number of bytes occupied by the type indicator in front of a serialized value.
+    protected final int typeIndicatorSize = 1;
+
+    // Output buffer, reset at the start of every evaluate() call.
+    protected final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
+    protected final DataOutput out = resultStorage.getDataOutput();
+    // Receive the serialized values of the first and second argument.
+    protected final IPointable argPtr1 = new VoidPointable();
+    protected final IPointable argPtr2 = new VoidPointable();
+    protected final IScalarEvaluator firstStringEval;
+    protected final IScalarEvaluator secondStringEval;
+    protected final SimilarityMetricEditDistance ed = new SimilarityMetricEditDistance();
+    protected final OrderedListIterator firstOrdListIter = new OrderedListIterator();
+    protected final OrderedListIterator secondOrdListIter = new OrderedListIterator();
+    // Distance computed by the most recent evaluate() call.
+    protected int editDistance = 0;
+    protected final AMutableInt64 aInt64 = new AMutableInt64(-1);
+    @SuppressWarnings("unchecked")
+    protected final ISerializerDeserializer<AInt64> int64Serde =
+            SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.AINT64);
+    // Item type tag of the list inspected last by checkArgTypes().
+    protected ATypeTag itemTypeTag;
+
+    protected ATypeTag firstTypeTag;
+    protected ATypeTag secondTypeTag;
+
+    /**
+     * Creates the evaluators for the two arguments.
+     *
+     * @param args    evaluator factories; entries 0 and 1 produce the two arguments
+     * @param context task context used to instantiate the argument evaluators
+     * @throws HyracksDataException if an argument evaluator cannot be created
+     */
+    public EditDistanceEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
+            throws HyracksDataException {
+        firstStringEval = args[0].createScalarEvaluator(context);
+        secondStringEval = args[1].createScalarEvaluator(context);
+    }
+
+    /**
+     * Evaluates both arguments, validates their types, and stores the serialized distance in
+     * {@code result}. If {@link #checkArgTypes} reports {@code false}, the (empty) result storage
+     * is returned without computing anything.
+     */
+    @Override
+    public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+        resultStorage.reset();
+
+        firstStringEval.evaluate(tuple, argPtr1);
+        firstTypeTag = tagAt(argPtr1, 0);
+        secondStringEval.evaluate(tuple, argPtr2);
+        secondTypeTag = tagAt(argPtr2, 0);
+
+        if (checkArgTypes(firstTypeTag, secondTypeTag)) {
+            editDistance = computeResult(argPtr1, argPtr2, firstTypeTag);
+            try {
+                writeResult(editDistance);
+            } catch (IOException e) {
+                throw new HyracksDataException(e);
+            }
+        }
+        result.set(resultStorage);
+    }
+
+    /**
+     * Computes the edit distance of the two serialized arguments.
+     *
+     * @param left    pointable holding the first serialized argument
+     * @param right   pointable holding the second serialized argument
+     * @param argType type tag of the arguments
+     * @return the edit distance reported by the metric
+     * @throws HyracksDataException (as {@link TypeMismatchException}) if {@code argType} is neither
+     *                              STRING nor ARRAY
+     */
+    protected int computeResult(IPointable left, IPointable right, ATypeTag argType) throws HyracksDataException {
+        final byte[] lhsData = left.getByteArray();
+        final int lhsStart = left.getStartOffset();
+        final byte[] rhsData = right.getByteArray();
+        final int rhsStart = right.getStartOffset();
+
+        if (argType == ATypeTag.STRING) {
+            // A simThresh of -1 requests the plain edit distance,
+            // with none of the calculation optimizations applied.
+            return ed.getActualUTF8StringEditDistanceVal(lhsData, lhsStart + typeIndicatorSize, rhsData,
+                    rhsStart + typeIndicatorSize, -1);
+        }
+        if (argType == ATypeTag.ARRAY) {
+            firstOrdListIter.reset(lhsData, lhsStart);
+            secondOrdListIter.reset(rhsData, rhsStart);
+            return (int) ed.computeSimilarity(firstOrdListIter, secondOrdListIter);
+        }
+        throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE, 0, argType.serialize(),
+                ATypeTag.SERIALIZED_STRING_TYPE_TAG, ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG);
+    }
+
+    /**
+     * Validates the argument type tags: they must be identical, must be STRING or ARRAY, and for
+     * ARRAY neither list may declare ANY as its item type.
+     *
+     * @return {@code true} whenever the checks pass; every violation is reported by an exception
+     * @throws HyracksDataException describing the first violated rule
+     */
+    protected boolean checkArgTypes(ATypeTag typeTag1, ATypeTag typeTag2) throws HyracksDataException {
+        if (typeTag1 != typeTag2) {
+            throw new IncompatibleTypeException(BuiltinFunctions.EDIT_DISTANCE, typeTag1.serialize(),
+                    typeTag2.serialize());
+        }
+
+        // The tags are equal at this point, so looking at the first one covers both.
+        final boolean listArgs = typeTag1 == ATypeTag.ARRAY;
+        if (!listArgs && typeTag1 != ATypeTag.STRING) {
+            throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE, 0, typeTag1.serialize(),
+                    ATypeTag.SERIALIZED_STRING_TYPE_TAG, ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG);
+        }
+
+        if (listArgs) {
+            rejectAnyItemType(argPtr1);
+            rejectAnyItemType(argPtr2);
+        }
+        return true;
+    }
+
+    /**
+     * Serializes {@code ed} as an int64 into the result storage.
+     *
+     * @param ed the distance to write
+     * @throws IOException if serialization fails
+     */
+    protected void writeResult(int ed) throws IOException {
+        aInt64.setValue(ed);
+        int64Serde.serialize(aInt64, out);
+    }
+
+    // Records the item type tag of the given serialized list and fails if it is ANY.
+    private void rejectAnyItemType(IPointable listPtr) throws HyracksDataException {
+        itemTypeTag = tagAt(listPtr, 1);
+        if (itemTypeTag == ATypeTag.ANY) {
+            throw new UnsupportedItemTypeException(BuiltinFunctions.EDIT_DISTANCE, itemTypeTag.serialize());
+        }
+    }
+
+    // Decodes the type tag stored 'skip' bytes after the start of the pointed-to value.
+    private static ATypeTag tagAt(IPointable ptr, int skip) {
+        return EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(ptr.getByteArray()[ptr.getStartOffset() + skip]);
+    }
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/GramTokensEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/GramTokensEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/GramTokensEvaluator.java
new file mode 100644
index 0000000..ef727c9
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/GramTokensEvaluator.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.asterix.builders.OrderedListBuilder;
+import org.apache.asterix.dataflow.data.nontagged.serde.ABooleanSerializerDeserializer;
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.types.AOrderedListType;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.om.types.hierachy.ATypeHierarchy;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;
+
+/**
+ * Scalar evaluator that tokenizes a string into n-grams and writes the tokens as an ordered list.
+ * <p>
+ * It takes three arguments: the string to tokenize, the gram length, and a boolean that is passed
+ * to the tokenizer's pre/post setting.
+ */
+public class GramTokensEvaluator implements IScalarEvaluator {
+
+    // Number of bytes occupied by the type indicator in front of a serialized value.
+    private static final int TYPE_INDICATOR_SIZE = 1;
+
+    private final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
+    private final DataOutput out = resultStorage.getDataOutput();
+    private final IPointable stringArg = new VoidPointable();
+    private final IPointable gramLengthArg = new VoidPointable();
+    private final IPointable prePostArg = new VoidPointable();
+    private final IScalarEvaluator stringEval;
+    private final IScalarEvaluator gramLengthEval;
+    private final IScalarEvaluator prePostEval;
+
+    private final NGramUTF8StringBinaryTokenizer tokenizer;
+    private final OrderedListBuilder listBuilder = new OrderedListBuilder();
+    private final AOrderedListType listType;
+
+    /**
+     * Creates the three argument evaluators and remembers the tokenizer and the list type.
+     *
+     * @param args      evaluator factories for the string, gram-length and pre/post arguments
+     * @param context   task context used to instantiate the argument evaluators
+     * @param tokenizer tokenizer to use; it is cast to {@link NGramUTF8StringBinaryTokenizer}
+     * @param itemType  item type of the ordered list that is produced
+     * @throws HyracksDataException if an argument evaluator cannot be created
+     */
+    public GramTokensEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context, IBinaryTokenizer tokenizer,
+            BuiltinType itemType) throws HyracksDataException {
+        this.stringEval = args[0].createScalarEvaluator(context);
+        this.gramLengthEval = args[1].createScalarEvaluator(context);
+        this.prePostEval = args[2].createScalarEvaluator(context);
+        this.tokenizer = (NGramUTF8StringBinaryTokenizer) tokenizer;
+        this.listType = new AOrderedListType(itemType, null);
+    }
+
+    /**
+     * Evaluates the three arguments, configures the tokenizer from them, and stores the serialized
+     * list of tokens in {@code result}.
+     */
+    @Override
+    public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+        resultStorage.reset();
+        stringEval.evaluate(tuple, stringArg);
+        gramLengthEval.evaluate(tuple, gramLengthArg);
+        prePostEval.evaluate(tuple, prePostArg);
+
+        prepareTokenizer();
+        try {
+            writeTokenList();
+        } catch (IOException e) {
+            throw new HyracksDataException(e);
+        }
+        result.set(resultStorage);
+    }
+
+    // Applies gram length and pre/post flag, then points the tokenizer at the string argument.
+    private void prepareTokenizer() throws HyracksDataException {
+        final int gramLength = ATypeHierarchy.getIntegerValue(BuiltinFunctions.GRAM_TOKENS.getName(), 1,
+                gramLengthArg.getByteArray(), gramLengthArg.getStartOffset());
+        tokenizer.setGramlength(gramLength);
+
+        // The boolean payload follows the type indicator.
+        final boolean prePost = ABooleanSerializerDeserializer.getBoolean(prePostArg.getByteArray(),
+                prePostArg.getStartOffset() + TYPE_INDICATOR_SIZE);
+        tokenizer.setPrePost(prePost);
+
+        tokenizer.reset(stringArg.getByteArray(), stringArg.getStartOffset(), stringArg.getLength());
+    }
+
+    // Drains the tokenizer into a fresh ordered list and serializes that list to the output.
+    private void writeTokenList() throws IOException {
+        listBuilder.reset(listType);
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
+            listBuilder.addItem(tokenizer.getToken());
+        }
+        listBuilder.write(out, true);
+    }
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityFiltersCache.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityFiltersCache.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityFiltersCache.java
new file mode 100644
index 0000000..4f5fb69
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityFiltersCache.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import java.io.DataInputStream;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityFilters;
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityFiltersFactory;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.dataflow.common.comm.util.ByteBufferInputStream;
+import org.apache.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+
+/**
+ * Single-entry cache for {@link SimilarityFilters}: remembers the filters created for the most
+ * recent (threshold, similarity name) pair and hands them out again while the pair is unchanged.
+ */
+public class SimilarityFiltersCache {
+    private final UTF8StringSerializerDeserializer utf8SerDer = new UTF8StringSerializerDeserializer();
+
+    private final ByteBufferInputStream bbis = new ByteBufferInputStream();
+    private final DataInputStream dis = new DataInputStream(bbis);
+
+    // State of the single cache entry; the name bytes are null until the first successful lookup.
+    private float similarityThresholdCached = 0;
+    private byte[] similarityNameBytesCached = null;
+    private SimilarityFilters similarityFiltersCached = null;
+
+    /**
+     * Returns the filters for the given threshold and serialized similarity name, creating them
+     * only when the request differs from the cached one.
+     *
+     * @param similarityThreshold threshold the filters are built for
+     * @param similarityNameBytes array containing the serialized similarity name
+     * @param startOffset         index of the first byte of the serialized name (its type indicator)
+     * @param len                 number of bytes of the serialized name, counted from {@code startOffset}
+     * @return the filters matching the request
+     * @throws HyracksDataException if the similarity name cannot be deserialized
+     */
+    public SimilarityFilters get(float similarityThreshold, byte[] similarityNameBytes, int startOffset, int len)
+            throws HyracksDataException {
+        if (similarityThreshold != similarityThresholdCached
+                || !isCachedName(similarityNameBytes, startOffset, len)) {
+            // Skip the type indicator in front of the serialized string.
+            bbis.setByteBuffer(ByteBuffer.wrap(similarityNameBytes), startOffset + 1);
+            String similarityName = utf8SerDer.deserialize(dis);
+            SimilarityFilters filters =
+                    SimilarityFiltersFactory.getSimilarityFilters(similarityName, similarityThreshold);
+
+            // Update the entry only after the filters exist, so a failure leaves the cache unchanged.
+            // copyOfRange expects an exclusive end index, not a length.
+            similarityNameBytesCached = Arrays.copyOfRange(similarityNameBytes, startOffset, startOffset + len);
+            similarityThresholdCached = similarityThreshold;
+            similarityFiltersCached = filters;
+        }
+        return similarityFiltersCached;
+    }
+
+    // True when bytes[startOffset, startOffset + len) equals the cached serialized name.
+    private boolean isCachedName(byte[] bytes, int startOffset, int len) {
+        if (similarityNameBytesCached == null || similarityNameBytesCached.length != len) {
+            return false;
+        }
+        for (int i = 0; i < len; i++) {
+            if (bytes[startOffset + i] != similarityNameBytesCached[i]) {
+                return false;
+            }
+        }
+        return true;
+    }
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardCheckEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardCheckEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardCheckEvaluator.java
new file mode 100644
index 0000000..60b5592
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardCheckEvaluator.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import java.io.IOException;
+
+import org.apache.asterix.builders.OrderedListBuilder;
+import org.apache.asterix.dataflow.data.nontagged.serde.AFloatSerializerDeserializer;
+import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
+import org.apache.asterix.om.base.ABoolean;
+import org.apache.asterix.om.types.AOrderedListType;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.om.types.EnumDeserializer;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.IntegerPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.data.std.util.BinaryEntry;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+
+/**
+ * "Check" variant of the Jaccard similarity evaluator: takes a third argument, the similarity
+ * threshold, and writes an ordered list holding a boolean (does the similarity reach the
+ * threshold?) followed by a float (the similarity if it does, 0.0 otherwise).
+ * <p>
+ * The threshold is also used while probing to stop early once it can no longer be reached.
+ */
+public class SimilarityJaccardCheckEvaluator extends SimilarityJaccardEvaluator {
+
+    protected final IScalarEvaluator jaccThreshEval;
+    // Threshold read from the third argument on every evaluate() call.
+    protected float jaccThresh = -1f;
+    protected IPointable jaccThreshPointable = new VoidPointable();
+
+    protected OrderedListBuilder listBuilder;
+    // Scratch buffer used to serialize each item of the result list.
+    protected ArrayBackedValueStorage inputVal;
+    @SuppressWarnings("unchecked")
+    protected final ISerializerDeserializer<ABoolean> booleanSerde =
+            SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.ABOOLEAN);
+    protected final AOrderedListType listType = new AOrderedListType(BuiltinType.ANY, "list");
+
+    /**
+     * Creates the evaluator; on top of what the superclass sets up, the third argument is
+     * evaluated to obtain the threshold.
+     *
+     * @param args    evaluator factories; entry 2 produces the threshold
+     * @param context task context used to instantiate the argument evaluators
+     * @throws HyracksDataException if an argument evaluator cannot be created
+     */
+    public SimilarityJaccardCheckEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
+            throws HyracksDataException {
+        super(args, context);
+        jaccThreshEval = args[2].createScalarEvaluator(context);
+        listBuilder = new OrderedListBuilder();
+        inputVal = new ArrayBackedValueStorage();
+    }
+
+    /**
+     * Evaluates the two lists and the threshold, computes the similarity (0.0 when
+     * {@code prepareLists} reports that there is nothing to compare) and stores the serialized
+     * [boolean, float] list in {@code result}.
+     */
+    @Override
+    public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+        resultStorage.reset();
+
+        firstOrdListEval.evaluate(tuple, argPtr1);
+        secondOrdListEval.evaluate(tuple, argPtr2);
+        jaccThreshEval.evaluate(tuple, jaccThreshPointable);
+
+        // Type tag of each argument, followed by the type tag of its items.
+        firstTypeTag =
+                EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr1.getByteArray()[argPtr1.getStartOffset()]);
+        secondTypeTag =
+                EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr2.getByteArray()[argPtr2.getStartOffset()]);
+
+        firstItemTypeTag =
+                EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr1.getByteArray()[argPtr1.getStartOffset() + 1]);
+        secondItemTypeTag =
+                EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr2.getByteArray()[argPtr2.getStartOffset() + 1]);
+
+        // The float payload follows the type indicator.
+        jaccThresh = AFloatSerializerDeserializer.getFloat(jaccThreshPointable.getByteArray(),
+                jaccThreshPointable.getStartOffset() + TYPE_INDICATOR_SIZE);
+
+        if (!checkArgTypes(firstTypeTag, secondTypeTag)) {
+            result.set(resultStorage);
+            return;
+        }
+        if (prepareLists(argPtr1, argPtr2)) {
+            jaccSim = computeResult();
+        } else {
+            jaccSim = 0.0f;
+        }
+        try {
+            writeResult(jaccSim);
+        } catch (IOException e) {
+            throw new HyracksDataException(e);
+        }
+        result.set(resultStorage);
+    }
+
+    /**
+     * Probes the hash map with the items of {@code probeIter} and counts the intersection size,
+     * giving up as soon as the threshold is known to be unreachable.
+     *
+     * @param probeIter     iterator over the probe list
+     * @param buildListSize number of items in the build list
+     * @param probeListSize number of items in the probe list
+     * @return the intersection size, or -1 if the threshold cannot be satisfied
+     * @throws HyracksDataException if a hash map lookup fails
+     */
+    @Override
+    protected int probeHashMap(AbstractAsterixListIterator probeIter, int buildListSize, int probeListSize)
+            throws HyracksDataException {
+        // Apply length filter.
+        int lengthLowerBound = (int) Math.ceil(jaccThresh * probeListSize);
+        if ((lengthLowerBound > buildListSize)
+                || (buildListSize > (int) Math.floor(1.0f / jaccThresh * probeListSize))) {
+            return -1;
+        }
+        // Probe phase: Probe items from second list, and compute intersection size.
+        int intersectionSize = 0;
+        int probeListCount = 0;
+        int minUnionSize = buildListSize;
+        while (probeIter.hasNext()) {
+            probeListCount++;
+            keyEntry.set(probeIter.getData(), probeIter.getPos(), probeIter.getItemLen());
+            BinaryEntry entry = hashMap.get(keyEntry);
+            if (entry != null) {
+                int firstValInt = IntegerPointable.getInteger(entry.getBuf(), entry.getOffset());
+                // A zero first value is irrelevant for the intersection size. The entry is skipped,
+                // but control must still reach probeIter.next() below; jumping back to the loop
+                // head without advancing would re-read the same item forever.
+                if (firstValInt != 0) {
+                    // Increment second value.
+                    int secondValInt = IntegerPointable.getInteger(entry.getBuf(), entry.getOffset() + 4);
+                    // Subtract old min value.
+                    intersectionSize -= Math.min(firstValInt, secondValInt);
+                    secondValInt++;
+                    // Add new min value.
+                    intersectionSize += Math.min(firstValInt, secondValInt);
+                    IntegerPointable.setInteger(entry.getBuf(), entry.getOffset() + 4, secondValInt);
+                }
+            } else {
+                // Could not find element in other set. Increase min union size by 1.
+                minUnionSize++;
+                // Check whether jaccThresh can still be satisfied if there was a mismatch.
+                int maxIntersectionSize = Math.min(buildListSize, intersectionSize + (probeListSize - probeListCount));
+                int lowerBound = (int) Math.floor(jaccThresh * minUnionSize);
+                if (maxIntersectionSize < lowerBound) {
+                    // Cannot satisfy jaccThresh.
+                    return -1;
+                }
+            }
+            probeIter.next();
+        }
+        return intersectionSize;
+    }
+
+    /**
+     * Serializes the result as an ordered list of two items: whether {@code jacc} reaches the
+     * threshold, and the similarity itself (0.0 when it does not).
+     *
+     * @param jacc the computed similarity
+     * @throws IOException if serialization fails
+     */
+    @Override
+    protected void writeResult(float jacc) throws IOException {
+        listBuilder.reset(listType);
+        // Written as a negated '<' on purpose: it keeps the original outcome for NaN operands.
+        boolean matches = !(jacc < jaccThresh);
+        inputVal.reset();
+        booleanSerde.serialize(matches ? ABoolean.TRUE : ABoolean.FALSE, inputVal.getDataOutput());
+        listBuilder.addItem(inputVal);
+
+        inputVal.reset();
+        aFloat.setValue(matches ? jacc : 0.0f);
+        floatSerde.serialize(aFloat, inputVal.getDataOutput());
+        listBuilder.addItem(inputVal);
+
+        listBuilder.write(out, true);
+    }
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardEvaluator.java
new file mode 100644
index 0000000..1e5ad3c
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardEvaluator.java
@@ -0,0 +1,277 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.asterix.dataflow.data.nontagged.comparators.ListItemBinaryComparatorFactory;
+import org.apache.asterix.dataflow.data.nontagged.hash.ListItemBinaryHashFunctionFactory;
+import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
+import org.apache.asterix.om.base.AFloat;
+import org.apache.asterix.om.base.AMutableFloat;
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.om.types.EnumDeserializer;
+import org.apache.asterix.runtime.evaluators.functions.BinaryHashMap;
+import org.apache.asterix.runtime.exceptions.TypeMismatchException;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.dataflow.value.IBinaryComparator;
+import org.apache.hyracks.api.dataflow.value.IBinaryHashFunction;
+import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.IntegerPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.data.std.util.BinaryEntry;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+
+public class SimilarityJaccardEvaluator implements IScalarEvaluator {
+
+    // Parameters for hash table.
+    protected static final int MIN_TABLE_SIZE = 100;
+    protected static final int TABLE_FRAME_SIZE = 32768;
+
+    // Assuming type indicator in serde format.
+    protected static final int TYPE_INDICATOR_SIZE = 1;
+
+    protected final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
+    protected final DataOutput out = resultStorage.getDataOutput();
+    protected final IPointable argPtr1 = new VoidPointable();
+    protected final IPointable argPtr2 = new VoidPointable();
+    protected final IScalarEvaluator firstOrdListEval;
+    protected final IScalarEvaluator secondOrdListEval;
+
+    protected final OrderedListIterator fstOrdListIter = new OrderedListIterator();
+    protected final OrderedListIterator sndOrdListIter = new OrderedListIterator();
+    protected final UnorderedListIterator fstUnordListIter = new UnorderedListIterator();
+    protected final UnorderedListIterator sndUnordListIter = new UnorderedListIterator();
+
+    protected AbstractAsterixListIterator firstListIter;
+    protected AbstractAsterixListIterator secondListIter;
+
+    protected final AMutableFloat aFloat = new AMutableFloat(0);
+    @SuppressWarnings("unchecked")
+    protected final ISerializerDeserializer<AFloat> floatSerde =
+            SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.AFLOAT);
+
+    protected ATypeTag firstTypeTag;
+    protected ATypeTag secondTypeTag;
+    protected float jaccSim = 0.0f;
+    protected ATypeTag firstItemTypeTag;
+    protected ATypeTag secondItemTypeTag;
+
+    protected BinaryHashMap hashMap;
+    protected BinaryEntry keyEntry = new BinaryEntry();
+    protected BinaryEntry valEntry = new BinaryEntry();
+
+    // Ignore case for strings. Defaults to true.
+    protected final boolean ignoreCase = true;
+
+    protected int hashTableSize = MIN_TABLE_SIZE;
+
+    public SimilarityJaccardEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
+            throws HyracksDataException {
+        firstOrdListEval = args[0].createScalarEvaluator(context);
+        secondOrdListEval = args[1].createScalarEvaluator(context);
+        byte[] emptyValBuf = new byte[8];
+        Arrays.fill(emptyValBuf, (byte) 0);
+        valEntry.set(emptyValBuf, 0, 8);
+    }
+
+    @Override
+    public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+        resultStorage.reset();
+
+        firstOrdListEval.evaluate(tuple, argPtr1);
+        secondOrdListEval.evaluate(tuple, argPtr2);
+
+        firstTypeTag =
+                EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr1.getByteArray()[argPtr1.getStartOffset()]);
+        secondTypeTag =
+                EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr2.getByteArray()[argPtr2.getStartOffset()]);
+
+        firstItemTypeTag =
+                EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr1.getByteArray()[argPtr1.getStartOffset() + 1]);
+        secondItemTypeTag =
+                EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr2.getByteArray()[argPtr2.getStartOffset() + 1]);
+
+        if (!checkArgTypes(firstTypeTag, secondTypeTag)) {
+            result.set(resultStorage);
+            return;
+        }
+        if (prepareLists(argPtr1, argPtr2)) {
+            jaccSim = computeResult();
+        } else {
+            jaccSim = 0.0f;
+        }
+        try {
+            writeResult(jaccSim);
+        } catch (IOException e) {
+            throw new HyracksDataException(e);
+        }
+        result.set(resultStorage);
+    }
+
+    /**
+     * Positions both list iterators on the serialized arguments and chooses the hash table size.
+     *
+     * @param left pointable holding the first serialized list
+     * @param right pointable holding the second serialized list
+     * @return {@code false} if either list is empty, {@code true} otherwise
+     * @throws HyracksDataException if resetting an iterator fails
+     */
+    protected boolean prepareLists(IPointable left, IPointable right) throws HyracksDataException {
+        firstListIter.reset(left.getByteArray(), left.getStartOffset());
+        secondListIter.reset(right.getByteArray(), right.getStartOffset());
+
+        // Empty lists are a special case for the caller, since list types won't match.
+        boolean eitherListEmpty = firstListIter.size() == 0 || secondListIter.size() == 0;
+        if (eitherListEmpty) {
+            return false;
+        }
+
+        // Size the table from the longer list, but never below MIN_TABLE_SIZE.
+        int longerListSize = Math.max(firstListIter.size(), secondListIter.size());
+        hashTableSize = Math.max(longerListSize, MIN_TABLE_SIZE);
+
+        // TODO: Check item types are compatible.
+        return true;
+    }
+
+    protected float computeResult() throws HyracksDataException {
+        // We will subtract the intersection size later to get the real union size.
+        int firstListSize = firstListIter.size();
+        int secondListSize = secondListIter.size();
+        int unionSize = firstListSize + secondListSize;
+        // Choose smaller list as build, and larger one as probe.
+        AbstractAsterixListIterator buildList = (firstListSize < secondListSize) ? firstListIter : secondListIter;
+        AbstractAsterixListIterator probeList = (buildList == firstListIter) ? secondListIter : firstListIter;
+        int buildListSize = (buildList == firstListIter) ? firstListSize : secondListSize;
+        int probeListSize = (probeList == firstListIter) ? firstListSize : secondListSize;
+        ATypeTag buildItemTypeTag = (buildList == firstListIter) ? firstItemTypeTag : secondItemTypeTag;
+        ATypeTag probeItemTypeTag = (probeList == firstListIter) ? firstItemTypeTag : secondItemTypeTag;
+
+        setHashMap(buildItemTypeTag, probeItemTypeTag);
+        buildHashMap(buildList);
+        int intersectionSize = probeHashMap(probeList, buildListSize, probeListSize);
+        // Special indicator for the "check" version of jaccard.
+        if (intersectionSize < 0) {
+            return -1;
+        }
+        unionSize -= intersectionSize;
+        return (float) intersectionSize / (float) unionSize;
+    }
+
+    /**
+     * Build phase: inserts every item of {@code buildIter} into the hash map, counting duplicates.
+     * <p>
+     * The value stored per key is a pair of integers; this phase maintains the first one as the number of times
+     * the key occurs in the build list.
+     *
+     * @param buildIter iterator over the list chosen as build side
+     * @throws HyracksDataException if iterating the list or inserting into the map fails
+     */
+    protected void buildHashMap(AbstractAsterixListIterator buildIter) throws HyracksDataException {
+        // Template value for new keys: first integer of the pair is 1.
+        IntegerPointable.setInteger(valEntry.getBuf(), 0, 1);
+        for (; buildIter.hasNext(); buildIter.next()) {
+            keyEntry.set(buildIter.getData(), buildIter.getPos(), buildIter.getItemLen());
+            BinaryEntry existing = hashMap.put(keyEntry, valEntry);
+            if (existing == null) {
+                // Nothing was returned for this key, so there is no earlier count to bump.
+                continue;
+            }
+            // An entry came back for this key: bump its build-side count.
+            int buildCount = IntegerPointable.getInteger(existing.getBuf(), existing.getOffset());
+            IntegerPointable.setInteger(existing.getBuf(), existing.getOffset(), buildCount + 1);
+        }
+    }
+
+    /**
+     * Probe phase: looks up every item of {@code probeIter} in the hash map filled by the build phase and
+     * accumulates the size of the multiset intersection.
+     * <p>
+     * Each map value is a pair of integers: the first is the occurrence count from the build list, the second
+     * (at offset + 4) is the occurrence count seen so far in the probe list. The intersection contribution of a
+     * key is {@code min(first, second)}, which is updated incrementally for every probed item.
+     *
+     * @param probeIter iterator over the list chosen as probe side
+     * @param buildListSize number of items in the build list (not used by this implementation)
+     * @param probeListSize number of items in the probe list (not used by this implementation)
+     * @return the intersection size of the two lists
+     * @throws HyracksDataException if iterating the list or reading the map fails
+     */
+    protected int probeHashMap(AbstractAsterixListIterator probeIter, int buildListSize, int probeListSize)
+            throws HyracksDataException {
+        int intersectionSize = 0;
+        // The iterator is advanced in the loop header so that every path through the body, including the
+        // early "continue" branches, makes progress; skipping an entry must not leave the iterator in place.
+        for (; probeIter.hasNext(); probeIter.next()) {
+            byte[] buf = probeIter.getData();
+            int off = probeIter.getPos();
+            int len = probeIter.getItemLen();
+            keyEntry.set(buf, off, len);
+            BinaryEntry entry = hashMap.get(keyEntry);
+            if (entry == null) {
+                // Item does not occur in the build list.
+                continue;
+            }
+            // Occurrence count from the build list.
+            int firstValInt = IntegerPointable.getInteger(entry.getBuf(), entry.getOffset());
+            // Irrelevant for the intersection size.
+            if (firstValInt == 0) {
+                continue;
+            }
+            int secondValInt = IntegerPointable.getInteger(entry.getBuf(), entry.getOffset() + 4);
+            // Subtract old min value.
+            intersectionSize -= (firstValInt < secondValInt) ? firstValInt : secondValInt;
+            secondValInt++;
+            // Add new min value.
+            intersectionSize += (firstValInt < secondValInt) ? firstValInt : secondValInt;
+            IntegerPointable.setInteger(entry.getBuf(), entry.getOffset() + 4, secondValInt);
+        }
+        return intersectionSize;
+    }
+
+    /**
+     * Makes {@code hashMap} ready for a new build/probe pass: creates it on first use, otherwise clears it.
+     * <p>
+     * NOTE(review): an existing map is only cleared, so it keeps the hash functions, comparator and table size
+     * chosen on the first call - confirm this is intended when item type tags or list sizes vary between tuples.
+     *
+     * @param buildItemTypeTag item type tag of the build list, used for the put-side hash function
+     * @param probeItemTypeTag item type tag of the probe list, used for the get-side hash function
+     */
+    protected void setHashMap(ATypeTag buildItemTypeTag, ATypeTag probeItemTypeTag) {
+        if (hashMap == null) {
+            IBinaryHashFunction putHashFunc =
+                    ListItemBinaryHashFunctionFactory.INSTANCE.createBinaryHashFunction(buildItemTypeTag, ignoreCase);
+            IBinaryHashFunction getHashFunc =
+                    ListItemBinaryHashFunctionFactory.INSTANCE.createBinaryHashFunction(probeItemTypeTag, ignoreCase);
+            // The comparator receives both item type tags, build side first.
+            IBinaryComparator cmp = ListItemBinaryComparatorFactory.INSTANCE.createBinaryComparator(buildItemTypeTag,
+                    probeItemTypeTag, ignoreCase);
+            hashMap = new BinaryHashMap(hashTableSize, TABLE_FRAME_SIZE, putHashFunc, getHashFunc, cmp);
+        } else {
+            hashMap.clear();
+        }
+    }
+
+    /**
+     * Verifies that both arguments are lists and selects the matching iterator for each of them.
+     *
+     * @param typeTag1 outer type tag of the first argument
+     * @param typeTag2 outer type tag of the second argument
+     * @return always {@code true} in this implementation; invalid input is reported by exception
+     * @throws HyracksDataException a {@link TypeMismatchException} if an argument is neither ARRAY nor MULTISET
+     */
+    protected boolean checkArgTypes(ATypeTag typeTag1, ATypeTag typeTag2) throws HyracksDataException {
+        // First argument (reported as argument 0 on mismatch).
+        if (typeTag1 == ATypeTag.ARRAY) {
+            firstListIter = fstOrdListIter;
+        } else if (typeTag1 == ATypeTag.MULTISET) {
+            firstListIter = fstUnordListIter;
+        } else {
+            throw new TypeMismatchException(BuiltinFunctions.SIMILARITY_JACCARD, 0, typeTag1.serialize(),
+                    ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG, ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG);
+        }
+
+        // Second argument (reported as argument 1 on mismatch).
+        if (typeTag2 == ATypeTag.ARRAY) {
+            secondListIter = sndOrdListIter;
+        } else if (typeTag2 == ATypeTag.MULTISET) {
+            secondListIter = sndUnordListIter;
+        } else {
+            throw new TypeMismatchException(BuiltinFunctions.SIMILARITY_JACCARD, 1, typeTag2.serialize(),
+                    ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG, ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG);
+        }
+        return true;
+    }
+
+    /**
+     * Serializes the given similarity value into the result output.
+     *
+     * @param jacc the similarity value to write
+     * @throws IOException if serialization to {@code out} fails
+     */
+    protected void writeResult(float jacc) throws IOException {
+        // Store the value in the mutable float holder, then serialize that holder to the output.
+        aFloat.setValue(jacc);
+        floatSerde.serialize(aFloat, out);
+    }
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardPrefixEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardPrefixEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardPrefixEvaluator.java
new file mode 100644
index 0000000..b70c6ad
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardPrefixEvaluator.java
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.asterix.dataflow.data.nontagged.serde.AFloatSerializerDeserializer;
+import org.apache.asterix.dataflow.data.nontagged.serde.AOrderedListSerializerDeserializer;
+import org.apache.asterix.dataflow.data.nontagged.serde.AUnorderedListSerializerDeserializer;
+import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
+import org.apache.asterix.fuzzyjoin.IntArray;
+import org.apache.asterix.fuzzyjoin.similarity.PartialIntersect;
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityFiltersJaccard;
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityMetric;
+import org.apache.asterix.om.base.AFloat;
+import org.apache.asterix.om.base.AMutableFloat;
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.om.types.hierachy.ATypeHierarchy;
+import org.apache.asterix.runtime.exceptions.TypeMismatchException;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+
+/**
+ * Scalar evaluator for the prefix-filtered Jaccard similarity of two integer token lists.
+ * <p>
+ * The six arguments are, in order: length of the first token set, first token list, length of the second token
+ * set, second token list, token prefix, and similarity threshold. The length filter, the position filter and the
+ * suffix filter of {@link SimilarityFiltersJaccard} are applied in turn; the result is the float returned by the
+ * final similarity filter, or {@code 0} as soon as one of the filters rejects the pair.
+ */
+public class SimilarityJaccardPrefixEvaluator implements IScalarEvaluator {
+    // assuming type indicator in serde format
+    protected final int typeIndicatorSize = 1;
+
+    protected final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
+    protected final DataOutput out = resultStorage.getDataOutput();
+    // Scratch pointable shared by all argument evaluations; each evaluation overwrites the previous one.
+    protected final IPointable inputVal = new VoidPointable();
+    protected final IScalarEvaluator evalLen1;
+    protected final IScalarEvaluator evalTokens1;
+    protected final IScalarEvaluator evalLen2;
+    protected final IScalarEvaluator evalTokens2;
+    protected final IScalarEvaluator evalTokenPrefix;
+    protected final IScalarEvaluator evalThreshold;
+
+    // Threshold for which similarityFilters was created.
+    protected float similarityThresholdCache;
+    protected SimilarityFiltersJaccard similarityFilters;
+    protected final IntArray tokens1 = new IntArray();
+    protected final IntArray tokens2 = new IntArray();
+    protected final PartialIntersect parInter = new PartialIntersect();
+
+    protected float sim = 0.0f;
+
+    // result
+    protected final AMutableFloat res = new AMutableFloat(0);
+    @SuppressWarnings("unchecked")
+    protected final ISerializerDeserializer<AFloat> reusSerde =
+            SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.AFLOAT);
+
+    /**
+     * Creates one scalar evaluator per argument.
+     *
+     * @param args factories for the six arguments: length 1, tokens 1, length 2, tokens 2, token prefix, threshold
+     * @param context task context passed to every factory
+     * @throws HyracksDataException if an argument evaluator cannot be created
+     */
+    public SimilarityJaccardPrefixEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
+            throws HyracksDataException {
+        evalLen1 = args[0].createScalarEvaluator(context);
+        evalTokens1 = args[1].createScalarEvaluator(context);
+        evalLen2 = args[2].createScalarEvaluator(context);
+        evalTokens2 = args[3].createScalarEvaluator(context);
+        evalTokenPrefix = args[4].createScalarEvaluator(context);
+        evalThreshold = args[5].createScalarEvaluator(context);
+    }
+
+    /**
+     * Computes the similarity for {@code tuple} and stores the serialized result in {@code result}.
+     *
+     * @param tuple the input tuple handed to the argument evaluators
+     * @param result receives the contents of {@code resultStorage}
+     * @throws HyracksDataException if an argument cannot be evaluated, a token list argument is not a list, or
+     *             writing the result raises an {@link IOException}
+     */
+    @Override
+    public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+        resultStorage.reset();
+        // Stays 0 unless every filter below accepts the pair.
+        sim = 0;
+
+        // -- - similarity threshold - --
+        evalThreshold.evaluate(tuple, inputVal);
+        float similarityThreshold =
+                AFloatSerializerDeserializer.getFloat(inputVal.getByteArray(), inputVal.getStartOffset() + 1);
+
+        // The filters object is rebuilt only when the threshold differs from the cached one.
+        if (similarityThreshold != similarityThresholdCache || similarityFilters == null) {
+            similarityFilters = new SimilarityFiltersJaccard(similarityThreshold);
+            similarityThresholdCache = similarityThreshold;
+        }
+
+        evalLen1.evaluate(tuple, inputVal);
+        int length1 = ATypeHierarchy.getIntegerValue(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 0,
+                inputVal.getByteArray(), inputVal.getStartOffset());
+        evalLen2.evaluate(tuple, inputVal);
+        int length2 = ATypeHierarchy.getIntegerValue(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 2,
+                inputVal.getByteArray(), inputVal.getStartOffset());
+
+        //
+        // -- - length filter - --
+        //
+        if (similarityFilters.passLengthFilter(length1, length2)) {
+            // -- - tokens1 / tokens2 - -- (argument positions 1 and 3)
+            readTokens(evalTokens1, tuple, tokens1, 1, length1);
+            readTokens(evalTokens2, tuple, tokens2, 3, length2);
+
+            // -- - token prefix - --
+            evalTokenPrefix.evaluate(tuple, inputVal);
+            int tokenPrefix = ATypeHierarchy.getIntegerValue(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 4,
+                    inputVal.getByteArray(), inputVal.getStartOffset());
+
+            //
+            // -- - position filter - --
+            //
+            SimilarityMetric.getPartialIntersectSize(tokens1.get(), 0, tokens1.length(), tokens2.get(), 0,
+                    tokens2.length(), tokenPrefix, parInter);
+            if (similarityFilters.passPositionFilter(parInter.intersectSize, parInter.posXStop, length1,
+                    parInter.posYStop, length2)) {
+
+                //
+                // -- - suffix filter - --
+                //
+                if (similarityFilters.passSuffixFilter(tokens1.get(), 0, tokens1.length(), parInter.posXStart,
+                        tokens2.get(), 0, tokens2.length(), parInter.posYStart)) {
+
+                    sim = similarityFilters.passSimilarityFilter(tokens1.get(), 0, tokens1.length(),
+                            parInter.posXStop + 1, tokens2.get(), 0, tokens2.length(), parInter.posYStop + 1,
+                            parInter.intersectSize);
+                }
+            }
+        }
+
+        try {
+            writeResult();
+        } catch (IOException e) {
+            throw new HyracksDataException(e);
+        }
+        result.set(resultStorage);
+    }
+
+    /**
+     * Evaluates one token-list argument and loads its integer tokens into {@code tokens}, padding the array with
+     * {@link Integer#MAX_VALUE} until it holds {@code paddedLength} entries.
+     * <p>
+     * NOTE(review): type errors are reported against {@code BuiltinFunctions.SIMILARITY_JACCARD}, exactly as the
+     * inlined code did - confirm whether a prefix-specific function identifier would be more accurate.
+     *
+     * @param tokensEval evaluator producing the serialized list
+     * @param tuple the input tuple
+     * @param tokens destination array; it is reset before being filled
+     * @param argIndex position of the list argument, used in error reporting
+     * @param paddedLength minimum number of entries {@code tokens} holds afterwards
+     * @throws HyracksDataException if the argument is neither an ordered nor an unordered list, or an item cannot
+     *             be read as an integer
+     */
+    private void readTokens(IScalarEvaluator tokensEval, IFrameTupleReference tuple, IntArray tokens, int argIndex,
+            int paddedLength) throws HyracksDataException {
+        tokens.reset();
+        tokensEval.evaluate(tuple, inputVal);
+
+        byte[] serList = inputVal.getByteArray();
+        int startOffset = inputVal.getStartOffset();
+        byte listTag = serList[startOffset];
+        if (listTag != ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG
+                && listTag != ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG) {
+            throw new TypeMismatchException(BuiltinFunctions.SIMILARITY_JACCARD, argIndex, listTag,
+                    ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG, ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG);
+        }
+
+        // The item type tag is read from the list header (startOffset + 1), not from each item.
+        int itemTagPosition = startOffset + 1;
+        int numTokens;
+        if (listTag == ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG) {
+            numTokens = AOrderedListSerializerDeserializer.getNumberOfItems(serList, startOffset);
+            for (int i = 0; i < numTokens; i++) {
+                int itemOffset = AOrderedListSerializerDeserializer.getItemOffset(serList, startOffset, i);
+                int token = ATypeHierarchy.getIntegerValueWithDifferentTypeTagPosition(
+                        BuiltinFunctions.SIMILARITY_JACCARD.getName(), argIndex, serList, itemOffset,
+                        itemTagPosition);
+                tokens.add(token);
+            }
+        } else {
+            numTokens = AUnorderedListSerializerDeserializer.getNumberOfItems(serList, startOffset);
+            for (int i = 0; i < numTokens; i++) {
+                int itemOffset = AUnorderedListSerializerDeserializer.getItemOffset(serList, startOffset, i);
+                int token = ATypeHierarchy.getIntegerValueWithDifferentTypeTagPosition(
+                        BuiltinFunctions.SIMILARITY_JACCARD.getName(), argIndex, serList, itemOffset,
+                        itemTagPosition);
+                tokens.add(token);
+            }
+        }
+
+        // pad tokens
+        for (int i = numTokens; i < paddedLength; i++) {
+            tokens.add(Integer.MAX_VALUE);
+        }
+    }
+
+    /**
+     * Serializes the current value of {@code sim} as a float into the result output.
+     *
+     * @throws IOException if serialization to {@code out} fails
+     */
+    public void writeResult() throws IOException {
+        res.setValue(sim);
+        reusSerde.serialize(res, out);
+    }
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
new file mode 100644
index 0000000..0decd9e
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityMetricJaccard;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+
+/**
+ * Variant of {@link SimilarityJaccardCheckEvaluator} whose similarity computation is delegated to
+ * {@link SimilarityMetricJaccard}, passing the inherited threshold {@code jaccThresh}.
+ * <p>
+ * NOTE(review): presumably both arguments must be sorted by the same ordering, as the class name suggests -
+ * confirm against {@code SimilarityMetricJaccard.computeSimilarity}.
+ */
+public class SimilarityJaccardSortedCheckEvaluator extends SimilarityJaccardCheckEvaluator {
+
+    // Metric implementation that performs the actual computation.
+    protected final SimilarityMetricJaccard jaccard = new SimilarityMetricJaccard();
+
+    /**
+     * @param args factories for the argument evaluators, forwarded to the superclass
+     * @param context task context, forwarded to the superclass
+     * @throws HyracksDataException if the superclass constructor fails
+     */
+    public SimilarityJaccardSortedCheckEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
+            throws HyracksDataException {
+        super(args, context);
+    }
+
+    /**
+     * Replaces the inherited computation with a call to the metric on the two prepared list iterators.
+     */
+    @Override
+    protected float computeResult() throws HyracksDataException {
+        final float similarity = jaccard.computeSimilarity(firstListIter, secondListIter, jaccThresh);
+        return similarity;
+    }
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
new file mode 100644
index 0000000..1cd32c8
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityMetricJaccard;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+
+/**
+ * Variant of {@link SimilarityJaccardEvaluator} whose similarity computation is delegated to
+ * {@link SimilarityMetricJaccard}.
+ * <p>
+ * Assumes that both arguments are sorted by the same ordering.
+ */
+public class SimilarityJaccardSortedEvaluator extends SimilarityJaccardEvaluator {
+
+    // Metric implementation that performs the actual computation.
+    protected final SimilarityMetricJaccard jaccard = new SimilarityMetricJaccard();
+
+    /**
+     * @param args factories for the argument evaluators, forwarded to the superclass
+     * @param context task context, forwarded to the superclass
+     * @throws HyracksDataException if the superclass constructor fails
+     */
+    public SimilarityJaccardSortedEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
+            throws HyracksDataException {
+        super(args, context);
+    }
+
+    /**
+     * Replaces the inherited computation with a call to the metric on the two prepared list iterators.
+     */
+    @Override
+    protected float computeResult() throws HyracksDataException {
+        final float similarity = jaccard.computeSimilarity(firstListIter, secondListIter);
+        return similarity;
+    }
+}

http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/WordTokensEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/WordTokensEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/WordTokensEvaluator.java
new file mode 100644
index 0000000..e51d5cf
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/WordTokensEvaluator.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.asterix.builders.OrderedListBuilder;
+import org.apache.asterix.om.types.AOrderedListType;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
+
+/**
+ * Scalar evaluator that runs a tokenizer over its single argument and returns the tokens as an ordered list.
+ */
+public class WordTokensEvaluator implements IScalarEvaluator {
+    private final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
+    private final DataOutput out = resultStorage.getDataOutput();
+    private final IPointable argPtr = new VoidPointable();
+    private final IScalarEvaluator stringEval;
+
+    private final IBinaryTokenizer tokenizer;
+    private final OrderedListBuilder listBuilder = new OrderedListBuilder();
+    private final AOrderedListType listType;
+
+    /**
+     * @param args argument evaluator factories; only {@code args[0]} is used
+     * @param context task context used to create the argument evaluator
+     * @param tokenizer tokenizer applied to the evaluated argument
+     * @param itemType item type of the ordered list that is produced
+     * @throws HyracksDataException if the argument evaluator cannot be created
+     */
+    public WordTokensEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context, IBinaryTokenizer tokenizer,
+            BuiltinType itemType) throws HyracksDataException {
+        this.stringEval = args[0].createScalarEvaluator(context);
+        this.tokenizer = tokenizer;
+        this.listType = new AOrderedListType(itemType, null);
+    }
+
+    /**
+     * Evaluates the argument, tokenizes its bytes and stores the serialized token list in {@code result}.
+     *
+     * @throws HyracksDataException if evaluating the argument fails or building the list raises an
+     *             {@link IOException}
+     */
+    @Override
+    public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+        resultStorage.reset();
+        stringEval.evaluate(tuple, argPtr);
+        tokenizer.reset(argPtr.getByteArray(), argPtr.getStartOffset(), argPtr.getLength());
+        try {
+            writeTokenList();
+        } catch (IOException e) {
+            throw new HyracksDataException(e);
+        }
+        result.set(resultStorage);
+    }
+
+    /**
+     * Drains the tokenizer into a fresh list of type {@code listType} and writes that list to {@code out}.
+     */
+    private void writeTokenList() throws IOException {
+        listBuilder.reset(listType);
+        while (tokenizer.hasNext()) {
+            // Advance first, then read the token the tokenizer is now positioned on.
+            tokenizer.next();
+            listBuilder.addItem(tokenizer.getToken());
+        }
+        listBuilder.write(out, true);
+    }
+}