You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by mb...@apache.org on 2018/03/14 04:08:20 UTC
[7/7] asterixdb git commit: [ASTERIXDB-2330][*DB][RT] Add IFunctionRegistrant for dynamic function registration
[ASTERIXDB-2330][*DB][RT] Add IFunctionRegistrant for dynamic function registration
- user model changes: no
- storage format changes: no
- interface changes: yes
Details:
- Look for IFunctionRegistrant service instances at runtime, and use these to dynamically
register non-core functions with *DB
- Extract fuzzyjoin functions from core runtime
Change-Id: Ia88590280cbf476e08b905d9e1d62c68667a2569
Reviewed-on: https://asterix-gerrit.ics.uci.edu/2480
Reviewed-by: abdullah alamoudi <ba...@gmail.com>
Tested-by: Michael Blow <mb...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/asterixdb/repo
Commit: http://git-wip-us.apache.org/repos/asf/asterixdb/commit/b8307794
Tree: http://git-wip-us.apache.org/repos/asf/asterixdb/tree/b8307794
Diff: http://git-wip-us.apache.org/repos/asf/asterixdb/diff/b8307794
Branch: refs/heads/master
Commit: b8307794c3483199a88266e8646bfbad79cc70f9
Parents: 38c41ac
Author: Michael Blow <mb...@apache.org>
Authored: Tue Mar 13 20:45:51 2018 -0700
Committer: Michael Blow <mb...@apache.org>
Committed: Tue Mar 13 21:07:36 2018 -0700
----------------------------------------------------------------------
asterixdb/asterix-app/pom.xml | 7 +
.../apache/asterix/runtime/NullMissingTest.java | 5 +-
asterixdb/asterix-fuzzyjoin/pom.xml | 64 +-
.../runtime/FuzzyJoinFunctionRegistrant.java | 72 ++
.../common/EditDistanceCheckEvaluator.java | 134 +++
.../common/EditDistanceContainsEvaluator.java | 59 +
.../common/EditDistanceEvaluator.java | 156 +++
.../evaluators/common/GramTokensEvaluator.java | 95 ++
.../common/SimilarityFiltersCache.java | 53 +
.../common/SimilarityJaccardCheckEvaluator.java | 166 +++
.../common/SimilarityJaccardEvaluator.java | 277 +++++
.../SimilarityJaccardPrefixEvaluator.java | 230 ++++
.../SimilarityJaccardSortedCheckEvaluator.java | 39 +
.../SimilarityJaccardSortedEvaluator.java | 40 +
.../evaluators/common/WordTokensEvaluator.java | 71 ++
.../CountHashedGramTokensDescriptor.java | 68 ++
.../CountHashedWordTokensDescriptor.java | 66 ++
.../functions/EditDistanceCheckDescriptor.java | 58 +
.../EditDistanceContainsDescriptor.java | 58 +
.../functions/EditDistanceDescriptor.java | 59 +
.../EditDistanceListIsFilterableDescriptor.java | 149 +++
...ditDistanceStringIsFilterableDescriptor.java | 65 ++
...EditDistanceStringIsFilterableEvaluator.java | 121 ++
.../functions/GramTokensDescriptor.java | 66 ++
.../functions/HashedGramTokensDescriptor.java | 66 ++
.../functions/HashedWordTokensDescriptor.java | 66 ++
.../functions/PrefixLenDescriptor.java | 141 +++
.../functions/PrefixLenJaccardDescriptor.java | 126 ++
.../functions/SimilarityDescriptor.java | 268 +++++
.../SimilarityJaccardCheckDescriptor.java | 58 +
.../functions/SimilarityJaccardDescriptor.java | 59 +
.../SimilarityJaccardPrefixCheckDescriptor.java | 110 ++
.../SimilarityJaccardPrefixDescriptor.java | 59 +
.../SimilarityJaccardSortedCheckDescriptor.java | 59 +
.../SimilarityJaccardSortedDescriptor.java | 60 +
.../functions/SpatialIntersectDescriptor.java | 1091 ++++++++++++++++++
.../functions/WordTokensDescriptor.java | 66 ++
...che.asterix.om.functions.IFunctionRegistrant | 20 +
.../om/functions/IFunctionCollection.java | 27 +
.../om/functions/IFunctionRegistrant.java | 25 +
...AbstractScalarFunctionDynamicDescriptor.java | 33 +
asterixdb/asterix-runtime/pom.xml | 6 -
...AbstractScalarFunctionDynamicDescriptor.java | 33 -
.../common/EditDistanceCheckEvaluator.java | 134 ---
.../common/EditDistanceContainsEvaluator.java | 59 -
.../common/EditDistanceEvaluator.java | 156 ---
.../evaluators/common/GramTokensEvaluator.java | 95 --
.../common/SimilarityFiltersCache.java | 53 -
.../common/SimilarityJaccardCheckEvaluator.java | 166 ---
.../common/SimilarityJaccardEvaluator.java | 277 -----
.../SimilarityJaccardPrefixEvaluator.java | 230 ----
.../SimilarityJaccardSortedCheckEvaluator.java | 39 -
.../SimilarityJaccardSortedEvaluator.java | 40 -
.../evaluators/common/WordTokensEvaluator.java | 71 --
.../CountHashedGramTokensDescriptor.java | 68 --
.../CountHashedWordTokensDescriptor.java | 66 --
.../functions/EditDistanceCheckDescriptor.java | 58 -
.../EditDistanceContainsDescriptor.java | 58 -
.../functions/EditDistanceDescriptor.java | 59 -
.../EditDistanceListIsFilterableDescriptor.java | 149 ---
...ditDistanceStringIsFilterableDescriptor.java | 65 --
...EditDistanceStringIsFilterableEvaluator.java | 121 --
.../functions/GramTokensDescriptor.java | 66 --
.../functions/HashedGramTokensDescriptor.java | 66 --
.../functions/HashedWordTokensDescriptor.java | 66 --
.../functions/PrefixLenDescriptor.java | 141 ---
.../functions/PrefixLenJaccardDescriptor.java | 126 --
.../functions/SimilarityDescriptor.java | 268 -----
.../SimilarityJaccardCheckDescriptor.java | 58 -
.../functions/SimilarityJaccardDescriptor.java | 59 -
.../SimilarityJaccardPrefixCheckDescriptor.java | 110 --
.../SimilarityJaccardPrefixDescriptor.java | 59 -
.../SimilarityJaccardSortedCheckDescriptor.java | 59 -
.../SimilarityJaccardSortedDescriptor.java | 60 -
.../functions/SpatialIntersectDescriptor.java | 1091 ------------------
.../functions/WordTokensDescriptor.java | 66 --
.../runtime/functions/FunctionCollection.java | 55 +-
asterixdb/asterix-server/pom.xml | 6 +
78 files changed, 4513 insertions(+), 4358 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-app/pom.xml
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/pom.xml b/asterixdb/asterix-app/pom.xml
index e75ad42..b51d499 100644
--- a/asterixdb/asterix-app/pom.xml
+++ b/asterixdb/asterix-app/pom.xml
@@ -152,6 +152,7 @@
</ignoredUsedUndeclaredDependencies>
<usedDependencies combine.children="append">
<usedDependency>org.apache.hadoop:hadoop-common</usedDependency>
+ <usedDependency>org.apache.asterix:asterix-fuzzyjoin</usedDependency>
</usedDependencies>
<ignoredUnusedDeclaredDependencies>
<ignoredUnusedDeclaredDependency>org.apache.asterix:asterix-external-data:zip:*</ignoredUnusedDeclaredDependency>
@@ -643,5 +644,11 @@
<artifactId>log4j-jul</artifactId>
<version>2.10.0</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.asterix</groupId>
+ <artifactId>asterix-fuzzyjoin</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
</project>
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-app/src/test/java/org/apache/asterix/runtime/NullMissingTest.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/runtime/NullMissingTest.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/runtime/NullMissingTest.java
index 3b33868..d96fec9 100644
--- a/asterixdb/asterix-app/src/test/java/org/apache/asterix/runtime/NullMissingTest.java
+++ b/asterixdb/asterix-app/src/test/java/org/apache/asterix/runtime/NullMissingTest.java
@@ -55,8 +55,9 @@ public class NullMissingTest {
++testedFunctions;
}
}
- // 208 is the current number of functions with generated code.
- Assert.assertTrue(testedFunctions >= 208);
+ // 217 is the current number of functions with generated code.
+ Assert.assertTrue("expected >= 217 generated functions to be tested, but was " + testedFunctions,
+ testedFunctions >= 217);
}
private void testFunction(IFunctionDescriptorFactory funcFactory) throws Exception {
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/pom.xml
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/pom.xml b/asterixdb/asterix-fuzzyjoin/pom.xml
index 236bcc7..8056cdf 100644
--- a/asterixdb/asterix-fuzzyjoin/pom.xml
+++ b/asterixdb/asterix-fuzzyjoin/pom.xml
@@ -23,7 +23,6 @@
<groupId>org.apache.asterix</groupId>
<version>0.9.4-SNAPSHOT</version>
</parent>
- <groupId>org.apache.asterix</groupId>
<artifactId>asterix-fuzzyjoin</artifactId>
<licenses>
@@ -42,6 +41,23 @@
<build>
<plugins>
<plugin>
+ <groupId>org.apache.asterix</groupId>
+ <artifactId>asterix-evaluator-generator-maven-plugin</artifactId>
+ <version>${project.version}</version>
+ <configuration>
+ <evaluatorPackagePrefix>org.apache.asterix.runtime.evaluators</evaluatorPackagePrefix>
+ </configuration>
+ <executions>
+ <execution>
+ <id>generate-evaluator</id>
+ <phase>process-classes</phase>
+ <goals>
+ <goal>generate-evaluator</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
@@ -52,17 +68,6 @@
<phase>test-compile</phase>
</execution>
</executions>
- <configuration>
- <outputDirectory>${basedir}/target</outputDirectory>
- <includes>
- <include>**/*.class</include>
- <include>**/*.txt</include>
- <include>**/README*</include>
- <include>**/NOTICE*</include>
- <include>**/LICENSE*</include>
- <include>**/DEPENDENCIES*</include>
- </includes>
- </configuration>
</plugin>
<plugin>
<groupId>org.apache.rat</groupId>
@@ -102,6 +107,41 @@
<groupId>org.apache.hyracks</groupId>
<artifactId>hyracks-data-std</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.apache.asterix</groupId>
+ <artifactId>asterix-om</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.asterix</groupId>
+ <artifactId>asterix-runtime</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hyracks</groupId>
+ <artifactId>algebricks-common</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hyracks</groupId>
+ <artifactId>algebricks-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hyracks</groupId>
+ <artifactId>hyracks-storage-am-lsm-invertedindex</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hyracks</groupId>
+ <artifactId>algebricks-runtime</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.asterix</groupId>
+ <artifactId>asterix-common</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.hyracks</groupId>
+ <artifactId>hyracks-dataflow-common</artifactId>
+ </dependency>
</dependencies>
</project>
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/FuzzyJoinFunctionRegistrant.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/FuzzyJoinFunctionRegistrant.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/FuzzyJoinFunctionRegistrant.java
new file mode 100644
index 0000000..35d8727
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/FuzzyJoinFunctionRegistrant.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime;
+
+import org.apache.asterix.om.functions.IFunctionCollection;
+import org.apache.asterix.om.functions.IFunctionRegistrant;
+import org.apache.asterix.runtime.evaluators.functions.CountHashedGramTokensDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.CountHashedWordTokensDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.EditDistanceCheckDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.EditDistanceContainsDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.EditDistanceDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.EditDistanceListIsFilterableDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.EditDistanceStringIsFilterableDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.GramTokensDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.HashedGramTokensDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.HashedWordTokensDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.PrefixLenJaccardDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.SimilarityJaccardCheckDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.SimilarityJaccardDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.SimilarityJaccardPrefixCheckDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.SimilarityJaccardPrefixDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.SimilarityJaccardSortedCheckDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.SimilarityJaccardSortedDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.SpatialIntersectDescriptor;
+import org.apache.asterix.runtime.evaluators.functions.WordTokensDescriptor;
+
+public class FuzzyJoinFunctionRegistrant implements IFunctionRegistrant {
+ @Override
+ public void register(IFunctionCollection fc) {
+ // TODO: decide how we should deal with these two weird functions, as
+ // the number of arguments of each function depends on its first few arguments.
+ fc.add(SimilarityJaccardPrefixDescriptor.FACTORY);
+ fc.add(SimilarityJaccardPrefixCheckDescriptor.FACTORY);
+
+ // Spatial
+ fc.addGenerated(SpatialIntersectDescriptor.FACTORY);
+
+ // fuzzyjoin function
+ fc.addGenerated(PrefixLenJaccardDescriptor.FACTORY);
+ fc.addGenerated(WordTokensDescriptor.FACTORY);
+ fc.addGenerated(HashedWordTokensDescriptor.FACTORY);
+ fc.addGenerated(CountHashedWordTokensDescriptor.FACTORY);
+ fc.addGenerated(GramTokensDescriptor.FACTORY);
+ fc.addGenerated(HashedGramTokensDescriptor.FACTORY);
+ fc.addGenerated(CountHashedGramTokensDescriptor.FACTORY);
+ fc.addGenerated(EditDistanceDescriptor.FACTORY);
+ fc.addGenerated(EditDistanceCheckDescriptor.FACTORY);
+ fc.addGenerated(EditDistanceStringIsFilterableDescriptor.FACTORY);
+ fc.addGenerated(EditDistanceListIsFilterableDescriptor.FACTORY);
+ fc.addGenerated(EditDistanceContainsDescriptor.FACTORY);
+ fc.addGenerated(SimilarityJaccardDescriptor.FACTORY);
+ fc.addGenerated(SimilarityJaccardCheckDescriptor.FACTORY);
+ fc.addGenerated(SimilarityJaccardSortedDescriptor.FACTORY);
+ fc.addGenerated(SimilarityJaccardSortedCheckDescriptor.FACTORY);
+ }
+}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
new file mode 100644
index 0000000..dece292
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import java.io.IOException;
+
+import org.apache.asterix.builders.OrderedListBuilder;
+import org.apache.asterix.common.exceptions.ErrorCode;
+import org.apache.asterix.common.exceptions.RuntimeDataException;
+import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
+import org.apache.asterix.om.base.ABoolean;
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.types.AOrderedListType;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.om.types.EnumDeserializer;
+import org.apache.asterix.om.types.hierachy.ATypeHierarchy;
+import org.apache.asterix.runtime.exceptions.TypeMismatchException;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+
+public class EditDistanceCheckEvaluator extends EditDistanceEvaluator {
+
+ protected final IScalarEvaluator edThreshEval;
+ protected int edThresh;
+ private final IPointable argPtrThreshold = new VoidPointable();
+ protected final OrderedListBuilder listBuilder;
+ protected ArrayBackedValueStorage listItemVal;
+ @SuppressWarnings("unchecked")
+ protected final ISerializerDeserializer<ABoolean> booleanSerde =
+ SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.ABOOLEAN);
+
+ public EditDistanceCheckEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
+ throws HyracksDataException {
+ super(args, context);
+ edThreshEval = args[2].createScalarEvaluator(context);
+ listBuilder = new OrderedListBuilder();
+ listItemVal = new ArrayBackedValueStorage();
+ }
+
+ @Override
+ public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+ resultStorage.reset();
+ firstStringEval.evaluate(tuple, argPtr1);
+ firstTypeTag =
+ EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr1.getByteArray()[argPtr1.getStartOffset()]);
+ secondStringEval.evaluate(tuple, argPtr2);
+ secondTypeTag =
+ EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr2.getByteArray()[argPtr2.getStartOffset()]);
+ edThreshEval.evaluate(tuple, argPtrThreshold);
+
+ if (!checkArgTypes(firstTypeTag, secondTypeTag)) {
+ result.set(resultStorage);
+ return;
+ }
+ try {
+ edThresh = ATypeHierarchy.getIntegerValue(BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(), 2,
+ argPtrThreshold.getByteArray(), argPtrThreshold.getStartOffset());
+ if (edThresh < 0) {
+ throw new RuntimeDataException(ErrorCode.NEGATIVE_VALUE, BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(),
+ 3, edThresh);
+ }
+ editDistance = computeResult(argPtr1, argPtr2, firstTypeTag);
+ writeResult(editDistance);
+ } catch (IOException e) {
+ throw new HyracksDataException(e);
+ }
+ result.set(resultStorage);
+ }
+
+ @Override
+ protected int computeResult(IPointable left, IPointable right, ATypeTag argType) throws HyracksDataException {
+ byte[] leftBytes = left.getByteArray();
+ int leftStartOffset = left.getStartOffset();
+ byte[] rightBytes = right.getByteArray();
+ int rightStartOffset = right.getStartOffset();
+ switch (argType) {
+ case STRING: {
+ return ed.UTF8StringEditDistance(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
+ rightStartOffset + typeIndicatorSize, edThresh);
+ }
+
+ case ARRAY: {
+ firstOrdListIter.reset(leftBytes, leftStartOffset);
+ secondOrdListIter.reset(rightBytes, rightStartOffset);
+ return (int) ed.computeSimilarity(firstOrdListIter, secondOrdListIter, edThresh);
+ }
+
+ default: {
+ throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE_CHECK, 0, argType.serialize(),
+ ATypeTag.SERIALIZED_STRING_TYPE_TAG, ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG);
+ }
+
+ }
+ }
+
+ @Override
+ protected void writeResult(int ed) throws IOException {
+ listBuilder.reset(new AOrderedListType(BuiltinType.ANY, "list"));
+ boolean matches = (ed < 0) ? false : true;
+ listItemVal.reset();
+ booleanSerde.serialize(matches ? ABoolean.TRUE : ABoolean.FALSE, listItemVal.getDataOutput());
+ listBuilder.addItem(listItemVal);
+
+ listItemVal.reset();
+ aInt64.setValue((matches) ? ed : Integer.MAX_VALUE);
+ int64Serde.serialize(aInt64, listItemVal.getDataOutput());
+ listBuilder.addItem(listItemVal);
+ listBuilder.write(out, true);
+ }
+}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceContainsEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceContainsEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceContainsEvaluator.java
new file mode 100644
index 0000000..eaf3368
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceContainsEvaluator.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import org.apache.asterix.runtime.exceptions.TypeMismatchException;
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+
+public class EditDistanceContainsEvaluator extends EditDistanceCheckEvaluator {
+
+ public EditDistanceContainsEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
+ throws HyracksDataException {
+ super(args, context);
+ }
+
+ @Override
+ protected int computeResult(IPointable left, IPointable right, ATypeTag argType) throws HyracksDataException {
+ byte[] leftBytes = left.getByteArray();
+ int leftStartOffset = left.getStartOffset();
+ byte[] rightBytes = right.getByteArray();
+ int rightStartOffset = right.getStartOffset();
+
+ switch (argType) {
+ case STRING: {
+ return ed.UTF8StringEditDistanceContains(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
+ rightStartOffset + typeIndicatorSize, edThresh);
+ }
+ case ARRAY: {
+ firstOrdListIter.reset(leftBytes, leftStartOffset);
+ secondOrdListIter.reset(rightBytes, rightStartOffset);
+ return ed.getSimilarityContains(firstOrdListIter, secondOrdListIter, edThresh);
+ }
+ default: {
+ throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE_CONTAINS, 0, argType.serialize(),
+ ATypeTag.SERIALIZED_STRING_TYPE_TAG, ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG);
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
new file mode 100644
index 0000000..85fd334
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityMetricEditDistance;
+import org.apache.asterix.om.base.AInt64;
+import org.apache.asterix.om.base.AMutableInt64;
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.om.types.EnumDeserializer;
+import org.apache.asterix.runtime.exceptions.IncompatibleTypeException;
+import org.apache.asterix.runtime.exceptions.TypeMismatchException;
+import org.apache.asterix.runtime.exceptions.UnsupportedItemTypeException;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+
+public class EditDistanceEvaluator implements IScalarEvaluator {
+
+ // assuming type indicator in serde format
+ protected final int typeIndicatorSize = 1;
+
+ protected final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
+ protected final DataOutput out = resultStorage.getDataOutput();
+ protected final IPointable argPtr1 = new VoidPointable();
+ protected final IPointable argPtr2 = new VoidPointable();
+ protected final IScalarEvaluator firstStringEval;
+ protected final IScalarEvaluator secondStringEval;
+ protected final SimilarityMetricEditDistance ed = new SimilarityMetricEditDistance();
+ protected final OrderedListIterator firstOrdListIter = new OrderedListIterator();
+ protected final OrderedListIterator secondOrdListIter = new OrderedListIterator();
+ protected int editDistance = 0;
+ protected final AMutableInt64 aInt64 = new AMutableInt64(-1);
+ @SuppressWarnings("unchecked")
+ protected final ISerializerDeserializer<AInt64> int64Serde =
+ SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.AINT64);
+ protected ATypeTag itemTypeTag;
+
+ protected ATypeTag firstTypeTag;
+ protected ATypeTag secondTypeTag;
+
+ public EditDistanceEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
+ throws HyracksDataException {
+ firstStringEval = args[0].createScalarEvaluator(context);
+ secondStringEval = args[1].createScalarEvaluator(context);
+ }
+
+ @Override
+ public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+ resultStorage.reset();
+ firstStringEval.evaluate(tuple, argPtr1);
+ firstTypeTag =
+ EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr1.getByteArray()[argPtr1.getStartOffset()]);
+ secondStringEval.evaluate(tuple, argPtr2);
+ secondTypeTag =
+ EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr2.getByteArray()[argPtr2.getStartOffset()]);
+
+ if (!checkArgTypes(firstTypeTag, secondTypeTag)) {
+ result.set(resultStorage);
+ return;
+ }
+
+ editDistance = computeResult(argPtr1, argPtr2, firstTypeTag);
+ try {
+ writeResult(editDistance);
+ } catch (IOException e) {
+ throw new HyracksDataException(e);
+ }
+ result.set(resultStorage);
+ }
+
+ protected int computeResult(IPointable left, IPointable right, ATypeTag argType) throws HyracksDataException {
+ byte[] leftBytes = left.getByteArray();
+ int leftStartOffset = left.getStartOffset();
+ byte[] rightBytes = right.getByteArray();
+ int rightStartOffset = right.getStartOffset();
+
+ switch (argType) {
+ case STRING: {
+ // Passes -1 as the simThresh to calculate the edit distance
+ // without applying any calculation optimizations.
+ return ed.getActualUTF8StringEditDistanceVal(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
+ rightStartOffset + typeIndicatorSize, -1);
+ }
+ case ARRAY: {
+ firstOrdListIter.reset(leftBytes, leftStartOffset);
+ secondOrdListIter.reset(rightBytes, rightStartOffset);
+ return (int) ed.computeSimilarity(firstOrdListIter, secondOrdListIter);
+ }
+ default: {
+ throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE, 0, argType.serialize(),
+ ATypeTag.SERIALIZED_STRING_TYPE_TAG, ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG);
+ }
+
+ }
+ }
+
+ protected boolean checkArgTypes(ATypeTag typeTag1, ATypeTag typeTag2) throws HyracksDataException {
+ if (typeTag1 != typeTag2) {
+ throw new IncompatibleTypeException(BuiltinFunctions.EDIT_DISTANCE, typeTag1.serialize(),
+ typeTag2.serialize());
+ }
+
+ // Since they are equal, checking one tag is enough.
+ if (typeTag1 != ATypeTag.STRING && typeTag1 != ATypeTag.ARRAY) { // could be a list
+ throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE, 0, typeTag1.serialize(),
+ ATypeTag.SERIALIZED_STRING_TYPE_TAG, ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG);
+ }
+
+ if (typeTag1 == ATypeTag.ARRAY) {
+ itemTypeTag = EnumDeserializer.ATYPETAGDESERIALIZER
+ .deserialize(argPtr1.getByteArray()[argPtr1.getStartOffset() + 1]);
+ if (itemTypeTag == ATypeTag.ANY) {
+ throw new UnsupportedItemTypeException(BuiltinFunctions.EDIT_DISTANCE, itemTypeTag.serialize());
+ }
+ itemTypeTag = EnumDeserializer.ATYPETAGDESERIALIZER
+ .deserialize(argPtr2.getByteArray()[argPtr2.getStartOffset() + 1]);
+ if (itemTypeTag == ATypeTag.ANY) {
+ throw new UnsupportedItemTypeException(BuiltinFunctions.EDIT_DISTANCE, itemTypeTag.serialize());
+ }
+ }
+ return true;
+ }
+
+ protected void writeResult(int ed) throws IOException {
+ aInt64.setValue(ed);
+ int64Serde.serialize(aInt64, out);
+ }
+}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/GramTokensEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/GramTokensEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/GramTokensEvaluator.java
new file mode 100644
index 0000000..ef727c9
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/GramTokensEvaluator.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.asterix.builders.OrderedListBuilder;
+import org.apache.asterix.dataflow.data.nontagged.serde.ABooleanSerializerDeserializer;
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.types.AOrderedListType;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.om.types.hierachy.ATypeHierarchy;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;
+
+ /**
+  * Scalar evaluator that tokenizes a string with an {@link NGramUTF8StringBinaryTokenizer} and returns
+  * the tokens as an ordered list.
+  * <p>
+  * It evaluates three arguments per tuple, in this order: the string to tokenize, the gram length, and a
+  * boolean "pre/post" flag that is handed to the tokenizer unchanged.
+  */
+ public class GramTokensEvaluator implements IScalarEvaluator {
+
+     // Assuming a one-byte type indicator in serde format.
+     private static final int TYPE_INDICATOR_SIZE = 1;
+
+     // Per-tuple result buffer and the stream that writes into it.
+     private final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
+     private final DataOutput out = resultStorage.getDataOutput();
+
+     // Argument evaluators, each paired with the pointable that receives its value.
+     private final IScalarEvaluator stringEval;
+     private final IPointable stringArg = new VoidPointable();
+     private final IScalarEvaluator gramLengthEval;
+     private final IPointable gramLengthArg = new VoidPointable();
+     private final IScalarEvaluator prePostEval;
+     private final IPointable prePostArg = new VoidPointable();
+
+     private final NGramUTF8StringBinaryTokenizer tokenizer;
+     private final OrderedListBuilder listBuilder = new OrderedListBuilder();
+     private final AOrderedListType listType;
+
+     /**
+      * @param args factories for the string, gram-length and pre/post argument evaluators (indexes 0..2)
+      * @param context task context used to instantiate the argument evaluators
+      * @param tokenizer tokenizer to use; it is cast to {@link NGramUTF8StringBinaryTokenizer}
+      * @param itemType item type of the ordered list that is produced
+      * @throws HyracksDataException declared for the creation of the argument evaluators
+      */
+     public GramTokensEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context, IBinaryTokenizer tokenizer,
+             BuiltinType itemType) throws HyracksDataException {
+         this.stringEval = args[0].createScalarEvaluator(context);
+         this.gramLengthEval = args[1].createScalarEvaluator(context);
+         this.prePostEval = args[2].createScalarEvaluator(context);
+         this.tokenizer = (NGramUTF8StringBinaryTokenizer) tokenizer;
+         this.listType = new AOrderedListType(itemType, null);
+     }
+
+     /**
+      * Evaluates the three arguments against {@code tuple}, tokenizes the string and points
+      * {@code result} at the serialized token list.
+      */
+     @Override
+     public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+         resultStorage.reset();
+         stringEval.evaluate(tuple, stringArg);
+         gramLengthEval.evaluate(tuple, gramLengthArg);
+         prePostEval.evaluate(tuple, prePostArg);
+
+         configureTokenizer();
+
+         try {
+             listBuilder.reset(listType);
+             while (tokenizer.hasNext()) {
+                 tokenizer.next();
+                 listBuilder.addItem(tokenizer.getToken());
+             }
+             listBuilder.write(out, true);
+         } catch (IOException e) {
+             throw new HyracksDataException(e);
+         }
+         result.set(resultStorage);
+     }
+
+     /** Applies the gram length and pre/post flag of the current tuple and resets the tokenizer on its string. */
+     private void configureTokenizer() throws HyracksDataException {
+         final int gramLength = ATypeHierarchy.getIntegerValue(BuiltinFunctions.GRAM_TOKENS.getName(), 1,
+                 gramLengthArg.getByteArray(), gramLengthArg.getStartOffset());
+         tokenizer.setGramlength(gramLength);
+
+         // The boolean payload follows the one-byte type indicator.
+         final boolean prePost = ABooleanSerializerDeserializer.getBoolean(prePostArg.getByteArray(),
+                 prePostArg.getStartOffset() + TYPE_INDICATOR_SIZE);
+         tokenizer.setPrePost(prePost);
+
+         tokenizer.reset(stringArg.getByteArray(), stringArg.getStartOffset(), stringArg.getLength());
+     }
+ }
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityFiltersCache.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityFiltersCache.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityFiltersCache.java
new file mode 100644
index 0000000..4f5fb69
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityFiltersCache.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import java.io.DataInputStream;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityFilters;
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityFiltersFactory;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.dataflow.common.comm.util.ByteBufferInputStream;
+import org.apache.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+
+public class SimilarityFiltersCache {
+ private final UTF8StringSerializerDeserializer utf8SerDer = new UTF8StringSerializerDeserializer();
+
+ private final ByteBufferInputStream bbis = new ByteBufferInputStream();
+ private final DataInputStream dis = new DataInputStream(bbis);
+
+ private float similarityThresholdCached = 0;
+ private byte[] similarityNameBytesCached = null;
+ private SimilarityFilters similarityFiltersCached = null;
+
+ public SimilarityFilters get(float similarityThreshold, byte[] similarityNameBytes, int startOffset, int len)
+ throws HyracksDataException {
+ if (similarityThreshold != similarityThresholdCached || similarityNameBytesCached == null
+ || !Arrays.equals(similarityNameBytes, similarityNameBytesCached)) {
+ bbis.setByteBuffer(ByteBuffer.wrap(similarityNameBytes), startOffset + 1);
+ String similarityName = utf8SerDer.deserialize(dis);
+ similarityNameBytesCached = Arrays.copyOfRange(similarityNameBytes, startOffset, len);
+ similarityFiltersCached =
+ SimilarityFiltersFactory.getSimilarityFilters(similarityName, similarityThreshold);
+ }
+ return similarityFiltersCached;
+ }
+}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardCheckEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardCheckEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardCheckEvaluator.java
new file mode 100644
index 0000000..60b5592
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardCheckEvaluator.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import java.io.IOException;
+
+import org.apache.asterix.builders.OrderedListBuilder;
+import org.apache.asterix.dataflow.data.nontagged.serde.AFloatSerializerDeserializer;
+import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
+import org.apache.asterix.om.base.ABoolean;
+import org.apache.asterix.om.types.AOrderedListType;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.om.types.EnumDeserializer;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.IntegerPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.data.std.util.BinaryEntry;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+
+ /**
+  * "Check" variant of {@link SimilarityJaccardEvaluator}: takes a third argument, the Jaccard threshold,
+  * and returns an ordered list of two items - a boolean telling whether the similarity reaches the
+  * threshold, and the similarity itself (0.0 when it does not).
+  * <p>
+  * The probe phase is overridden to stop early (returning -1) as soon as the threshold can no longer be
+  * satisfied.
+  */
+ public class SimilarityJaccardCheckEvaluator extends SimilarityJaccardEvaluator {
+
+     protected final IScalarEvaluator jaccThreshEval;
+     // Threshold of the tuple currently being evaluated; refreshed in evaluate().
+     protected float jaccThresh = -1f;
+     protected IPointable jaccThreshPointable = new VoidPointable();
+
+     protected OrderedListBuilder listBuilder;
+     // Scratch buffer used to serialize each item of the result list.
+     protected ArrayBackedValueStorage inputVal;
+     @SuppressWarnings("unchecked")
+     protected final ISerializerDeserializer<ABoolean> booleanSerde =
+             SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.ABOOLEAN);
+     protected final AOrderedListType listType = new AOrderedListType(BuiltinType.ANY, "list");
+
+     /**
+      * @param args factories for the two list arguments (indexes 0 and 1) and the threshold (index 2)
+      * @param context task context used to instantiate the argument evaluators
+      * @throws HyracksDataException declared for the creation of the argument evaluators
+      */
+     public SimilarityJaccardCheckEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
+             throws HyracksDataException {
+         super(args, context);
+         jaccThreshEval = args[2].createScalarEvaluator(context);
+         listBuilder = new OrderedListBuilder();
+         inputVal = new ArrayBackedValueStorage();
+     }
+
+     /**
+      * Evaluates both lists and the threshold for {@code tuple}, computes the (possibly short-circuited)
+      * similarity and points {@code result} at the serialized [matches, similarity] list.
+      */
+     @Override
+     public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+         resultStorage.reset();
+
+         firstOrdListEval.evaluate(tuple, argPtr1);
+         secondOrdListEval.evaluate(tuple, argPtr2);
+         jaccThreshEval.evaluate(tuple, jaccThreshPointable);
+
+         firstTypeTag =
+                 EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr1.getByteArray()[argPtr1.getStartOffset()]);
+         secondTypeTag =
+                 EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr2.getByteArray()[argPtr2.getStartOffset()]);
+
+         // NOTE(review): the item tags are read before checkArgTypes() has confirmed that the arguments
+         // are lists; for other types the byte at offset + 1 is not an item tag - confirm this is harmless.
+         firstItemTypeTag =
+                 EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr1.getByteArray()[argPtr1.getStartOffset() + 1]);
+         secondItemTypeTag =
+                 EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr2.getByteArray()[argPtr2.getStartOffset() + 1]);
+
+         // NOTE(review): the threshold's own type tag is skipped, not checked - assumes a float argument.
+         jaccThresh = AFloatSerializerDeserializer.getFloat(jaccThreshPointable.getByteArray(),
+                 jaccThreshPointable.getStartOffset() + TYPE_INDICATOR_SIZE);
+
+         if (!checkArgTypes(firstTypeTag, secondTypeTag)) {
+             result.set(resultStorage);
+             return;
+         }
+         if (prepareLists(argPtr1, argPtr2)) {
+             jaccSim = computeResult();
+         } else {
+             jaccSim = 0.0f;
+         }
+         try {
+             writeResult(jaccSim);
+         } catch (IOException e) {
+             throw new HyracksDataException(e);
+         }
+         result.set(resultStorage);
+     }
+
+     /**
+      * Probes the hash map with the items of the probe list and returns the intersection size, or -1 as
+      * soon as it is clear that {@code jaccThresh} cannot be reached.
+      *
+      * @param probeIter iterator over the probe list
+      * @param buildListSize number of items in the list the hash map was built from
+      * @param probeListSize number of items in the probe list
+      * @return the intersection size, or -1 when the threshold cannot be satisfied
+      */
+     @Override
+     protected int probeHashMap(AbstractAsterixListIterator probeIter, int buildListSize, int probeListSize)
+             throws HyracksDataException {
+         // Apply length filter.
+         int lengthLowerBound = (int) Math.ceil(jaccThresh * probeListSize);
+         if ((lengthLowerBound > buildListSize)
+                 || (buildListSize > (int) Math.floor(1.0f / jaccThresh * probeListSize))) {
+             return -1;
+         }
+         // Probe phase: Probe items from second list, and compute intersection size.
+         int intersectionSize = 0;
+         int probeListCount = 0;
+         int minUnionSize = buildListSize;
+         while (probeIter.hasNext()) {
+             probeListCount++;
+             byte[] buf = probeIter.getData();
+             int off = probeIter.getPos();
+             int len = probeIter.getItemLen();
+             keyEntry.set(buf, off, len);
+             BinaryEntry entry = hashMap.get(keyEntry);
+             if (entry != null) {
+                 int firstValInt = IntegerPointable.getInteger(entry.getBuf(), entry.getOffset());
+                 // A zero build count is irrelevant for the intersection size. Skip the bookkeeping but
+                 // still reach next() below: the previous 'continue' here never advanced the iterator.
+                 if (firstValInt != 0) {
+                     int secondValInt = IntegerPointable.getInteger(entry.getBuf(), entry.getOffset() + 4);
+                     // Subtract old min value.
+                     intersectionSize -= Math.min(firstValInt, secondValInt);
+                     // Increment second value.
+                     secondValInt++;
+                     // Add new min value.
+                     intersectionSize += Math.min(firstValInt, secondValInt);
+                     IntegerPointable.setInteger(entry.getBuf(), entry.getOffset() + 4, secondValInt);
+                 }
+             } else {
+                 // Could not find element in other set. Increase min union size by 1.
+                 minUnionSize++;
+                 // Check whether jaccThresh can still be satisfied if there was a mismatch.
+                 int maxIntersectionSize = Math.min(buildListSize, intersectionSize + (probeListSize - probeListCount));
+                 int lowerBound = (int) Math.floor(jaccThresh * minUnionSize);
+                 if (maxIntersectionSize < lowerBound) {
+                     // Cannot satisfy jaccThresh.
+                     return -1;
+                 }
+             }
+             probeIter.next();
+         }
+         return intersectionSize;
+     }
+
+     /**
+      * Serializes the result as an ordered list: first whether {@code jacc} reaches {@code jaccThresh},
+      * then {@code jacc} itself, or 0.0 when it does not.
+      *
+      * @param jacc the computed similarity
+      * @throws IOException declared for the serialization calls below
+      */
+     @Override
+     protected void writeResult(float jacc) throws IOException {
+         listBuilder.reset(listType);
+         // Written as a negated '<' on purpose: unlike '>=', it keeps the original outcome for NaN.
+         boolean matches = !(jacc < jaccThresh);
+         inputVal.reset();
+         booleanSerde.serialize(matches ? ABoolean.TRUE : ABoolean.FALSE, inputVal.getDataOutput());
+         listBuilder.addItem(inputVal);
+
+         inputVal.reset();
+         aFloat.setValue(matches ? jacc : 0.0f);
+         floatSerde.serialize(aFloat, inputVal.getDataOutput());
+         listBuilder.addItem(inputVal);
+
+         listBuilder.write(out, true);
+     }
+ }
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardEvaluator.java
new file mode 100644
index 0000000..1e5ad3c
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardEvaluator.java
@@ -0,0 +1,277 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.asterix.dataflow.data.nontagged.comparators.ListItemBinaryComparatorFactory;
+import org.apache.asterix.dataflow.data.nontagged.hash.ListItemBinaryHashFunctionFactory;
+import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
+import org.apache.asterix.om.base.AFloat;
+import org.apache.asterix.om.base.AMutableFloat;
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.om.types.EnumDeserializer;
+import org.apache.asterix.runtime.evaluators.functions.BinaryHashMap;
+import org.apache.asterix.runtime.exceptions.TypeMismatchException;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.dataflow.value.IBinaryComparator;
+import org.apache.hyracks.api.dataflow.value.IBinaryHashFunction;
+import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.IntegerPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.data.std.util.BinaryEntry;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+
+ /**
+  * Scalar evaluator computing the Jaccard similarity of two list arguments (ARRAY or MULTISET).
+  * <p>
+  * The smaller list is loaded into a hash map that counts the occurrences of each item; the larger list
+  * then probes it. For every item the intersection counts the smaller of its two occurrence counts, and
+  * the result is {@code intersection / (size1 + size2 - intersection)}, serialized as a float. If either
+  * list is empty the result is 0.0.
+  * <p>
+  * Most members are {@code protected} so that subclasses can replace individual phases.
+  */
+ public class SimilarityJaccardEvaluator implements IScalarEvaluator {
+
+     // Parameters for hash table.
+     protected static final int MIN_TABLE_SIZE = 100;
+     protected static final int TABLE_FRAME_SIZE = 32768;
+
+     // Assuming type indicator in serde format.
+     protected static final int TYPE_INDICATOR_SIZE = 1;
+
+     protected final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
+     protected final DataOutput out = resultStorage.getDataOutput();
+     protected final IPointable argPtr1 = new VoidPointable();
+     protected final IPointable argPtr2 = new VoidPointable();
+     protected final IScalarEvaluator firstOrdListEval;
+     protected final IScalarEvaluator secondOrdListEval;
+
+     // One iterator per argument and list kind; checkArgTypes() selects the pair that is used.
+     protected final OrderedListIterator fstOrdListIter = new OrderedListIterator();
+     protected final OrderedListIterator sndOrdListIter = new OrderedListIterator();
+     protected final UnorderedListIterator fstUnordListIter = new UnorderedListIterator();
+     protected final UnorderedListIterator sndUnordListIter = new UnorderedListIterator();
+
+     protected AbstractAsterixListIterator firstListIter;
+     protected AbstractAsterixListIterator secondListIter;
+
+     protected final AMutableFloat aFloat = new AMutableFloat(0);
+     @SuppressWarnings("unchecked")
+     protected final ISerializerDeserializer<AFloat> floatSerde =
+             SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.AFLOAT);
+
+     protected ATypeTag firstTypeTag;
+     protected ATypeTag secondTypeTag;
+     protected float jaccSim = 0.0f;
+     protected ATypeTag firstItemTypeTag;
+     protected ATypeTag secondItemTypeTag;
+
+     protected BinaryHashMap hashMap;
+     protected BinaryEntry keyEntry = new BinaryEntry();
+     // Map value template: two 4-byte integers (build-side count, probe-side count).
+     protected BinaryEntry valEntry = new BinaryEntry();
+
+     // Ignore case for strings. Defaults to true.
+     protected final boolean ignoreCase = true;
+
+     protected int hashTableSize = MIN_TABLE_SIZE;
+
+     /**
+      * @param args factories for the two list argument evaluators (indexes 0 and 1)
+      * @param context task context used to instantiate the argument evaluators
+      * @throws HyracksDataException declared for the creation of the argument evaluators
+      */
+     public SimilarityJaccardEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
+             throws HyracksDataException {
+         firstOrdListEval = args[0].createScalarEvaluator(context);
+         secondOrdListEval = args[1].createScalarEvaluator(context);
+         byte[] emptyValBuf = new byte[8];
+         Arrays.fill(emptyValBuf, (byte) 0);
+         valEntry.set(emptyValBuf, 0, 8);
+     }
+
+     /**
+      * Evaluates both list arguments for {@code tuple}, computes their similarity and points
+      * {@code result} at the serialized value.
+      */
+     @Override
+     public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+         resultStorage.reset();
+
+         firstOrdListEval.evaluate(tuple, argPtr1);
+         secondOrdListEval.evaluate(tuple, argPtr2);
+
+         firstTypeTag =
+                 EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr1.getByteArray()[argPtr1.getStartOffset()]);
+         secondTypeTag =
+                 EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr2.getByteArray()[argPtr2.getStartOffset()]);
+
+         // NOTE(review): the item tags are read before checkArgTypes() has confirmed that the arguments
+         // are lists; for other types the byte at offset + 1 is not an item tag - confirm this is harmless.
+         firstItemTypeTag =
+                 EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr1.getByteArray()[argPtr1.getStartOffset() + 1]);
+         secondItemTypeTag =
+                 EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argPtr2.getByteArray()[argPtr2.getStartOffset() + 1]);
+
+         if (!checkArgTypes(firstTypeTag, secondTypeTag)) {
+             result.set(resultStorage);
+             return;
+         }
+         if (prepareLists(argPtr1, argPtr2)) {
+             jaccSim = computeResult();
+         } else {
+             jaccSim = 0.0f;
+         }
+         try {
+             writeResult(jaccSim);
+         } catch (IOException e) {
+             throw new HyracksDataException(e);
+         }
+         result.set(resultStorage);
+     }
+
+     /**
+      * Resets the selected list iterators onto the two arguments and sizes the hash table.
+      *
+      * @return {@code false} if either list is empty (the caller then uses similarity 0.0),
+      *         {@code true} otherwise
+      */
+     protected boolean prepareLists(IPointable left, IPointable right) throws HyracksDataException {
+         firstListIter.reset(left.getByteArray(), left.getStartOffset());
+         secondListIter.reset(right.getByteArray(), right.getStartOffset());
+         // Check for special case where one of the lists is empty, since list
+         // types won't match.
+         if (firstListIter.size() == 0 || secondListIter.size() == 0) {
+             return false;
+         }
+
+         // Set the size of the table dynamically
+         hashTableSize = Math.max(Math.max(firstListIter.size(), secondListIter.size()), MIN_TABLE_SIZE);
+
+         // TODO: Check item types are compatible.
+         return true;
+     }
+
+     /**
+      * Runs the build and probe phases and derives the similarity from the intersection size.
+      *
+      * @return the Jaccard similarity, or -1 when {@link #probeHashMap} reported a negative
+      *         intersection size
+      */
+     protected float computeResult() throws HyracksDataException {
+         // We will subtract the intersection size later to get the real union size.
+         int firstListSize = firstListIter.size();
+         int secondListSize = secondListIter.size();
+         int unionSize = firstListSize + secondListSize;
+         // Choose smaller list as build, and larger one as probe.
+         AbstractAsterixListIterator buildList = (firstListSize < secondListSize) ? firstListIter : secondListIter;
+         AbstractAsterixListIterator probeList = (buildList == firstListIter) ? secondListIter : firstListIter;
+         int buildListSize = (buildList == firstListIter) ? firstListSize : secondListSize;
+         int probeListSize = (probeList == firstListIter) ? firstListSize : secondListSize;
+         ATypeTag buildItemTypeTag = (buildList == firstListIter) ? firstItemTypeTag : secondItemTypeTag;
+         ATypeTag probeItemTypeTag = (probeList == firstListIter) ? firstItemTypeTag : secondItemTypeTag;
+
+         setHashMap(buildItemTypeTag, probeItemTypeTag);
+         buildHashMap(buildList);
+         int intersectionSize = probeHashMap(probeList, buildListSize, probeListSize);
+         // Special indicator for the "check" version of jaccard.
+         if (intersectionSize < 0) {
+             return -1;
+         }
+         unionSize -= intersectionSize;
+         return (float) intersectionSize / (float) unionSize;
+     }
+
+     /**
+      * Build phase: inserts every item of {@code buildIter} into the hash map, counting repeated items
+      * in the first integer of the map value.
+      */
+     protected void buildHashMap(AbstractAsterixListIterator buildIter) throws HyracksDataException {
+         // Build phase: Add items into hash map, starting with first list.
+         // Value in map is a pair of integers. Set first integer to 1.
+         IntegerPointable.setInteger(valEntry.getBuf(), 0, 1);
+         while (buildIter.hasNext()) {
+             byte[] buf = buildIter.getData();
+             int off = buildIter.getPos();
+             int len = buildIter.getItemLen();
+             keyEntry.set(buf, off, len);
+             BinaryEntry entry = hashMap.put(keyEntry, valEntry);
+             if (entry != null) {
+                 // put() handed back an entry for this key: increment its build-side count.
+                 int firstValInt = IntegerPointable.getInteger(entry.getBuf(), entry.getOffset());
+                 IntegerPointable.setInteger(entry.getBuf(), entry.getOffset(), firstValInt + 1);
+             }
+             buildIter.next();
+         }
+     }
+
+     /**
+      * Probe phase: looks up every item of {@code probeIter} in the hash map and maintains the
+      * intersection size as the sum, over all items, of min(build count, probe count).
+      *
+      * @param probeIter iterator over the probe list
+      * @param buildListSize number of items in the build list (unused here)
+      * @param probeListSize number of items in the probe list (unused here)
+      * @return the intersection size
+      */
+     protected int probeHashMap(AbstractAsterixListIterator probeIter, int buildListSize, int probeListSize)
+             throws HyracksDataException {
+         // Probe phase: Probe items from second list, and compute intersection size.
+         int intersectionSize = 0;
+         while (probeIter.hasNext()) {
+             byte[] buf = probeIter.getData();
+             int off = probeIter.getPos();
+             int len = probeIter.getItemLen();
+             keyEntry.set(buf, off, len);
+             BinaryEntry entry = hashMap.get(keyEntry);
+             if (entry != null) {
+                 int firstValInt = IntegerPointable.getInteger(entry.getBuf(), entry.getOffset());
+                 // A zero build count is irrelevant for the intersection size. Skip the bookkeeping but
+                 // still reach next() below: the previous 'continue' here never advanced the iterator.
+                 if (firstValInt != 0) {
+                     int secondValInt = IntegerPointable.getInteger(entry.getBuf(), entry.getOffset() + 4);
+                     // Subtract old min value.
+                     intersectionSize -= Math.min(firstValInt, secondValInt);
+                     // Increment second value.
+                     secondValInt++;
+                     // Add new min value.
+                     intersectionSize += Math.min(firstValInt, secondValInt);
+                     IntegerPointable.setInteger(entry.getBuf(), entry.getOffset() + 4, secondValInt);
+                 }
+             }
+             probeIter.next();
+         }
+         return intersectionSize;
+     }
+
+     /**
+      * Clears the hash map, creating it on first use.
+      * <p>
+      * NOTE(review): the hash functions, the comparator and the initial size are fixed when the map is
+      * first created; on later calls the map is only cleared, so the item type tags passed in and the
+      * current {@code hashTableSize} are not applied. Confirm that this is intended.
+      */
+     protected void setHashMap(ATypeTag buildItemTypeTag, ATypeTag probeItemTypeTag) {
+         if (hashMap != null) {
+             hashMap.clear();
+             return;
+         }
+
+         IBinaryHashFunction putHashFunc =
+                 ListItemBinaryHashFunctionFactory.INSTANCE.createBinaryHashFunction(buildItemTypeTag, ignoreCase);
+         IBinaryHashFunction getHashFunc =
+                 ListItemBinaryHashFunctionFactory.INSTANCE.createBinaryHashFunction(probeItemTypeTag, ignoreCase);
+         IBinaryComparator cmp = ListItemBinaryComparatorFactory.INSTANCE.createBinaryComparator(buildItemTypeTag,
+                 probeItemTypeTag, ignoreCase);
+         hashMap = new BinaryHashMap(hashTableSize, TABLE_FRAME_SIZE, putHashFunc, getHashFunc, cmp);
+     }
+
+     /**
+      * Selects the list iterator for each argument from its type tag.
+      *
+      * @param typeTag1 type tag of the first argument
+      * @param typeTag2 type tag of the second argument
+      * @return always {@code true}; an unsupported tag is reported by throwing
+      * @throws HyracksDataException when an argument is neither ARRAY nor MULTISET
+      */
+     protected boolean checkArgTypes(ATypeTag typeTag1, ATypeTag typeTag2) throws HyracksDataException {
+         switch (typeTag1) {
+             case ARRAY: {
+                 firstListIter = fstOrdListIter;
+                 break;
+             }
+             case MULTISET: {
+                 firstListIter = fstUnordListIter;
+                 break;
+             }
+             default: {
+                 throw new TypeMismatchException(BuiltinFunctions.SIMILARITY_JACCARD, 0, typeTag1.serialize(),
+                         ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG, ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG);
+             }
+         }
+         switch (typeTag2) {
+             case ARRAY: {
+                 secondListIter = sndOrdListIter;
+                 break;
+             }
+             case MULTISET: {
+                 secondListIter = sndUnordListIter;
+                 break;
+             }
+             default: {
+                 throw new TypeMismatchException(BuiltinFunctions.SIMILARITY_JACCARD, 1, typeTag2.serialize(),
+                         ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG, ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG);
+             }
+         }
+         return true;
+     }
+
+     /**
+      * Serializes the similarity as a float to the result output.
+      *
+      * @param jacc the similarity to emit
+      * @throws IOException declared for the serialization call below
+      */
+     protected void writeResult(float jacc) throws IOException {
+         aFloat.setValue(jacc);
+         floatSerde.serialize(aFloat, out);
+     }
+ }
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardPrefixEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardPrefixEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardPrefixEvaluator.java
new file mode 100644
index 0000000..b70c6ad
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardPrefixEvaluator.java
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.asterix.dataflow.data.nontagged.serde.AFloatSerializerDeserializer;
+import org.apache.asterix.dataflow.data.nontagged.serde.AOrderedListSerializerDeserializer;
+import org.apache.asterix.dataflow.data.nontagged.serde.AUnorderedListSerializerDeserializer;
+import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
+import org.apache.asterix.fuzzyjoin.IntArray;
+import org.apache.asterix.fuzzyjoin.similarity.PartialIntersect;
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityFiltersJaccard;
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityMetric;
+import org.apache.asterix.om.base.AFloat;
+import org.apache.asterix.om.base.AMutableFloat;
+import org.apache.asterix.om.functions.BuiltinFunctions;
+import org.apache.asterix.om.types.ATypeTag;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.asterix.om.types.hierachy.ATypeHierarchy;
+import org.apache.asterix.runtime.exceptions.TypeMismatchException;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.dataflow.value.ISerializerDeserializer;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+
+public class SimilarityJaccardPrefixEvaluator implements IScalarEvaluator {
+ // assuming type indicator in serde format
+ protected final int typeIndicatorSize = 1;
+
+ protected final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage();
+ protected final DataOutput out = resultStorage.getDataOutput();
+ protected final IPointable inputVal = new VoidPointable();
+ protected final IScalarEvaluator evalLen1;
+ protected final IScalarEvaluator evalTokens1;
+ protected final IScalarEvaluator evalLen2;
+ protected final IScalarEvaluator evalTokens2;
+ protected final IScalarEvaluator evalTokenPrefix;
+ protected final IScalarEvaluator evalThreshold;
+
+ protected float similarityThresholdCache;
+ protected SimilarityFiltersJaccard similarityFilters;
+ protected final IntArray tokens1 = new IntArray();
+ protected final IntArray tokens2 = new IntArray();
+ protected final PartialIntersect parInter = new PartialIntersect();
+
+ protected float sim = 0.0f;
+
+ // result
+ protected final AMutableFloat res = new AMutableFloat(0);
+ @SuppressWarnings("unchecked")
+ protected final ISerializerDeserializer<AFloat> reusSerde =
+ SerializerDeserializerProvider.INSTANCE.getSerializerDeserializer(BuiltinType.AFLOAT);
+
+ public SimilarityJaccardPrefixEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context)
+ throws HyracksDataException {
+ evalLen1 = args[0].createScalarEvaluator(context);
+ evalTokens1 = args[1].createScalarEvaluator(context);
+ evalLen2 = args[2].createScalarEvaluator(context);
+ evalTokens2 = args[3].createScalarEvaluator(context);
+ evalTokenPrefix = args[4].createScalarEvaluator(context);
+ evalThreshold = args[5].createScalarEvaluator(context);
+ }
+
+ @Override
+ public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
+ resultStorage.reset();
+ // similarity threshold
+ sim = 0;
+ evalThreshold.evaluate(tuple, inputVal);
+ float similarityThreshold =
+ AFloatSerializerDeserializer.getFloat(inputVal.getByteArray(), inputVal.getStartOffset() + 1);
+
+ if (similarityThreshold != similarityThresholdCache || similarityFilters == null) {
+ similarityFilters = new SimilarityFiltersJaccard(similarityThreshold);
+ similarityThresholdCache = similarityThreshold;
+ }
+
+ evalLen1.evaluate(tuple, inputVal);
+ int length1 = ATypeHierarchy.getIntegerValue(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 0,
+ inputVal.getByteArray(), inputVal.getStartOffset());
+ evalLen2.evaluate(tuple, inputVal);
+ int length2 = ATypeHierarchy.getIntegerValue(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 2,
+ inputVal.getByteArray(), inputVal.getStartOffset());
+
+ //
+ // -- - length filter - --
+ //
+ if (similarityFilters.passLengthFilter(length1, length2)) {
+
+ // -- - tokens1 - --
+ int i;
+ tokens1.reset();
+ evalTokens1.evaluate(tuple, inputVal);
+
+ byte[] serList = inputVal.getByteArray();
+ int startOffset = inputVal.getStartOffset();
+ if (serList[startOffset] != ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG
+ && serList[startOffset] != ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG) {
+ throw new TypeMismatchException(BuiltinFunctions.SIMILARITY_JACCARD, 1, serList[startOffset],
+ ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG, ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG);
+ }
+
+ int lengthTokens1;
+ if (serList[startOffset] == ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG) {
+ lengthTokens1 =
+ AOrderedListSerializerDeserializer.getNumberOfItems(inputVal.getByteArray(), startOffset);
+ // read tokens
+ for (i = 0; i < lengthTokens1; i++) {
+ int itemOffset = AOrderedListSerializerDeserializer.getItemOffset(serList, startOffset, i);
+ int token = ATypeHierarchy.getIntegerValueWithDifferentTypeTagPosition(
+ BuiltinFunctions.SIMILARITY_JACCARD.getName(), 1, serList, itemOffset, startOffset + 1);
+ tokens1.add(token);
+ }
+ } else {
+ lengthTokens1 =
+ AUnorderedListSerializerDeserializer.getNumberOfItems(inputVal.getByteArray(), startOffset);
+ // read tokens
+ for (i = 0; i < lengthTokens1; i++) {
+ int itemOffset = AUnorderedListSerializerDeserializer.getItemOffset(serList, startOffset, i);
+ int token = ATypeHierarchy.getIntegerValueWithDifferentTypeTagPosition(
+ BuiltinFunctions.SIMILARITY_JACCARD.getName(), 1, serList, itemOffset, startOffset + 1);
+ tokens1.add(token);
+ }
+ }
+ // pad tokens
+ for (; i < length1; i++) {
+ tokens1.add(Integer.MAX_VALUE);
+ }
+
+ // -- - tokens2 - --
+ tokens2.reset();
+ evalTokens2.evaluate(tuple, inputVal);
+
+ serList = inputVal.getByteArray();
+ startOffset = inputVal.getStartOffset();
+ if (serList[startOffset] != ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG
+ && serList[startOffset] != ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG) {
+ throw new TypeMismatchException(BuiltinFunctions.SIMILARITY_JACCARD, 3, serList[startOffset],
+ ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG, ATypeTag.SERIALIZED_UNORDEREDLIST_TYPE_TAG);
+ }
+
+ int lengthTokens2;
+ if (serList[startOffset] == ATypeTag.SERIALIZED_ORDEREDLIST_TYPE_TAG) {
+ lengthTokens2 =
+ AOrderedListSerializerDeserializer.getNumberOfItems(inputVal.getByteArray(), startOffset);
+ // read tokens
+ for (i = 0; i < lengthTokens2; i++) {
+ int itemOffset = AOrderedListSerializerDeserializer.getItemOffset(serList, startOffset, i);
+ int token = ATypeHierarchy.getIntegerValueWithDifferentTypeTagPosition(
+ BuiltinFunctions.SIMILARITY_JACCARD.getName(), 3, serList, itemOffset, startOffset + 1);
+ tokens2.add(token);
+ }
+ } else {
+ lengthTokens2 =
+ AUnorderedListSerializerDeserializer.getNumberOfItems(inputVal.getByteArray(), startOffset);
+ // read tokens
+ for (i = 0; i < lengthTokens2; i++) {
+ int itemOffset = AUnorderedListSerializerDeserializer.getItemOffset(serList, startOffset, i);
+ int token = ATypeHierarchy.getIntegerValueWithDifferentTypeTagPosition(
+ BuiltinFunctions.SIMILARITY_JACCARD.getName(), 3, serList, itemOffset, startOffset + 1);
+ tokens2.add(token);
+ }
+ }
+ // pad tokens
+ for (; i < length2; i++) {
+ tokens2.add(Integer.MAX_VALUE);
+ }
+
+ // -- - token prefix - --
+ evalTokenPrefix.evaluate(tuple, inputVal);
+ int tokenPrefix = ATypeHierarchy.getIntegerValue(BuiltinFunctions.SIMILARITY_JACCARD.getName(), 4,
+ inputVal.getByteArray(), inputVal.getStartOffset());
+
+ //
+ // -- - position filter - --
+ //
+ SimilarityMetric.getPartialIntersectSize(tokens1.get(), 0, tokens1.length(), tokens2.get(), 0,
+ tokens2.length(), tokenPrefix, parInter);
+ if (similarityFilters.passPositionFilter(parInter.intersectSize, parInter.posXStop, length1,
+ parInter.posYStop, length2)) {
+
+ //
+ // -- - suffix filter - --
+ //
+ if (similarityFilters.passSuffixFilter(tokens1.get(), 0, tokens1.length(), parInter.posXStart,
+ tokens2.get(), 0, tokens2.length(), parInter.posYStart)) {
+
+ sim = similarityFilters.passSimilarityFilter(tokens1.get(), 0, tokens1.length(),
+ parInter.posXStop + 1, tokens2.get(), 0, tokens2.length(), parInter.posYStop + 1,
+ parInter.intersectSize);
+ }
+ }
+ }
+
+ try {
+ writeResult();
+ } catch (IOException e) {
+ throw new HyracksDataException(e);
+ }
+ result.set(resultStorage);
+ }
+
+ public void writeResult() throws IOException {
+ res.setValue(sim);
+ reusSerde.serialize(res, out);
+ }
+}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
new file mode 100644
index 0000000..0decd9e
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityMetricJaccard;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+
+ public class SimilarityJaccardSortedCheckEvaluator extends SimilarityJaccardCheckEvaluator { // Check-evaluator variant whose result comes from SimilarityMetricJaccard.computeSimilarity with a threshold.
+
+ protected final SimilarityMetricJaccard jaccard = new SimilarityMetricJaccard(); // metric instance owned by this evaluator
+
+ public SimilarityJaccardSortedCheckEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context) // forwards both parameters unchanged to the superclass constructor
+ throws HyracksDataException {
+ super(args, context);
+ }
+
+ @Override
+ protected float computeResult() throws HyracksDataException { // Returns the metric's similarity for the two list iterators.
+ return jaccard.computeSimilarity(firstListIter, secondListIter, jaccThresh); // firstListIter, secondListIter and jaccThresh are not declared in this class (presumably inherited)
+ }
+}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
new file mode 100644
index 0000000..1cd32c8
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import org.apache.asterix.fuzzyjoin.similarity.SimilarityMetricJaccard;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+
+// Assumes that both arguments are sorted by the same ordering.
+ public class SimilarityJaccardSortedEvaluator extends SimilarityJaccardEvaluator { // Evaluator variant whose result comes from SimilarityMetricJaccard.computeSimilarity over the two list iterators.
+
+ protected final SimilarityMetricJaccard jaccard = new SimilarityMetricJaccard(); // metric instance owned by this evaluator
+
+ public SimilarityJaccardSortedEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context) // forwards both parameters unchanged to the superclass constructor
+ throws HyracksDataException {
+ super(args, context);
+ }
+
+ @Override
+ protected float computeResult() throws HyracksDataException { // Returns the metric's similarity for the two list iterators.
+ return jaccard.computeSimilarity(firstListIter, secondListIter); // no threshold is passed; the iterators are not declared in this class (presumably inherited)
+ }
+}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/b8307794/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/WordTokensEvaluator.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/WordTokensEvaluator.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/WordTokensEvaluator.java
new file mode 100644
index 0000000..e51d5cf
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/runtime/evaluators/common/WordTokensEvaluator.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.asterix.runtime.evaluators.common;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.asterix.builders.OrderedListBuilder;
+import org.apache.asterix.om.types.AOrderedListType;
+import org.apache.asterix.om.types.BuiltinType;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluator;
+import org.apache.hyracks.algebricks.runtime.base.IScalarEvaluatorFactory;
+import org.apache.hyracks.api.context.IHyracksTaskContext;
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.primitive.VoidPointable;
+import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
+import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
+
+ public class WordTokensEvaluator implements IScalarEvaluator { // Scalar evaluator that tokenizes its first argument with a supplied tokenizer and emits the tokens as an ordered list.
+ private final ArrayBackedValueStorage resultStorage = new ArrayBackedValueStorage(); // backing storage for the serialized result
+ private final DataOutput out = resultStorage.getDataOutput();
+ private final IPointable argPtr = new VoidPointable(); // holds the evaluated argument bytes
+ private final IScalarEvaluator stringEval; // evaluator for args[0]
+
+ private final IBinaryTokenizer tokenizer;
+ private final OrderedListBuilder listBuilder = new OrderedListBuilder();
+ private final AOrderedListType listType; // ordered-list type built from the constructor's itemType
+
+ public WordTokensEvaluator(IScalarEvaluatorFactory[] args, IHyracksTaskContext context, IBinaryTokenizer tokenizer, // only args[0] is used
+ BuiltinType itemType) throws HyracksDataException {
+ stringEval = args[0].createScalarEvaluator(context);
+ this.tokenizer = tokenizer;
+ this.listType = new AOrderedListType(itemType, null); // second constructor argument is null (presumably the type name — confirm)
+ }
+
+ @Override
+ public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException { // Tokenizes the argument for this tuple and sets result to the serialized token list.
+ resultStorage.reset();
+ stringEval.evaluate(tuple, argPtr);
+ tokenizer.reset(argPtr.getByteArray(), argPtr.getStartOffset(), argPtr.getLength()); // the argument's type tag is not inspected here
+ try {
+ listBuilder.reset(listType);
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
+ listBuilder.addItem(tokenizer.getToken()); // append each token in the order the tokenizer produces it
+ }
+ listBuilder.write(out, true); // NOTE(review): 'true' presumably requests that the list type tag be written — confirm
+ } catch (IOException e) {
+ throw new HyracksDataException(e); // keep the IOException as the cause
+ }
+ result.set(resultStorage);
+ }
+}