You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by wa...@apache.org on 2018/10/10 00:59:15 UTC
[06/36] asterixdb git commit: [NO ISSUE][COMP][RT] Enable multiway
similarity joins
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.6.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.6.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.6.query.aql
new file mode 100644
index 0000000..310caa9
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.6.query.aql
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+
+// Stage 1: global token order -- every word token of left.authors, ordered by count($id) and then by the token itself.
+let $rankedTokens := (
+ for $paper in dataset left
+ let $id := $paper.id
+ for $token in word-tokens($paper.authors)
+ /*+ hash */ group by $tokenGrouped := $token with $id
+ /*+ inmem 34 198608 */ order by count($id), $tokenGrouped
+ return $tokenGrouped
+)
+
+// Stage 2_2 of right (verification view): each right record with its raw tokens and their positions in $rankedTokens.
+let $tokenRightVerify := (
+ for $right in dataset right
+ let $tokenUnrankedRight := word-tokens($right.authors)
+ let $tokens := (
+ for $token in $tokenUnrankedRight
+ let $index := // positions in $rankedTokens whose token equals $token (empty when the token never occurs in left)
+ for $tokenRanked at $i in $rankedTokens
+ where $token = /*+ bcast */ $tokenRanked
+ return $i
+ order by $index
+ return $index
+ )
+ order by $right.authors
+ return {"authors": $right.authors, "tokens": $tokenUnrankedRight, "ranked": $tokens}
+)
+let $tokenRight := ( // NOTE(review): $tokenRight is never referenced by the final for/return below -- confirm it is still needed
+ for $right in dataset right
+ let $tokenUnrankedRight := word-tokens($right.authors)
+ for $token in $tokenUnrankedRight
+ for $tokenRanked at $i in $rankedTokens
+ where $token = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+)
+
+for $r in $tokenRightVerify
+return $r
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.7.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.7.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.7.query.aql
new file mode 100644
index 0000000..4e51613
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_1/basic-1_2_1.7.query.aql
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+// For each record of dataset('right'): its word tokens and, per token, the first matching position in the Stage 1 ordering.
+for $paperDBLP in dataset('right') // NOTE(review): the DBLP-suffixed names iterate dataset('right') here -- confirm the naming
+ let $tokensUnrankedDBLP := word-tokens($paperDBLP.authors)
+ let $lenDBLP := len($tokensUnrankedDBLP) // not referenced again in this query
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ let $index :=
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - -- (inlined) tokens of dataset('left'), ordered by count($id) and then by token
+ //
+ for $paper in dataset('left')
+ let $id := $paper.id
+ for $token in word-tokens($paper.authors)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ return $i
+ order by $index[0]
+ return $index[0]
+order by $paperDBLP.authors
+return {"tokens": $tokensUnrankedDBLP, "ranks": $tokensDBLP}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.1.ddl.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.1.ddl.aql
new file mode 100644
index 0000000..45cc975
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.1.ddl.aql
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+drop dataverse fuzzyjoin_basic if exists; // remove any existing fuzzyjoin_basic dataverse before re-creating it
+
+create dataverse fuzzyjoin_basic;
+
+use dataverse fuzzyjoin_basic;
+// Closed record type shared by both join inputs: a uuid id plus the authors string.
+create type BasicType as closed {
+ id: uuid,
+ authors: string
+}
+// Two datasets of the same type; id is the primary key and is declared autogenerated.
+create dataset left(BasicType) primary key id autogenerated;
+create dataset right(BasicType) primary key id autogenerated;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.2.update.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.2.update.aql
new file mode 100644
index 0000000..c9aceb2
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.2.update.aql
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+// left <- data/pub-small/dblpauthors.adm, loaded through the localfs adapter in adm format.
+load dataset left
+using localfs
+(("path"="asterix_nc1://data/pub-small/dblpauthors.adm"),("format"="adm"));
+// right <- data/pub-small/csxauthors.adm, same adapter and format.
+load dataset right
+using localfs
+(("path"="asterix_nc1://data/pub-small/csxauthors.adm"),("format"="adm"));
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.3.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.3.query.aql
new file mode 100644
index 0000000..510c1f0
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.3.query.aql
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+// Returns the number of (idDBLP, idCSX) pairs that share a prefix token and whose similarity-jaccard-prefix result is >= .8f.
+let $r := count(
+ for $paperDBLP in dataset('left')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := word-tokens($paperDBLP.authors)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - -- (inlined) tokens of dataset('left'), ordered by count($id) and then by token
+ //
+ for $paper in dataset('left')
+ let $id := $paper.id
+ for $token in word-tokens($paper.authors)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, prefix-len-jaccard(len($tokensDBLP), .8f))
+ // Right side: same ranking pipeline; ranks still come from the ordering derived from dataset('left').
+ for $paperCSX in dataset('right')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := word-tokens($paperCSX.authors)
+ let $lenCSX := len($tokensUnrankedCSX)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - -- (inlined, repeated) tokens of dataset('left'), ordered by count($id) and then by token
+ //
+ for $paper in dataset('left')
+ let $id := $paper.id
+ for $token in word-tokens($paper.authors)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenCSX in subset-collection($tokensCSX, 0, prefix-len-jaccard(len($tokensCSX), .8f))
+ // NOTE(review): the right prefix length uses len($tokensCSX), i.e. only tokens that found a rank in the left-derived ordering.
+ where $prefixTokenDBLP = $prefixTokenCSX
+ let $sim := similarity-jaccard-prefix($lenDBLP, $tokensDBLP, $lenCSX, $tokensCSX, $prefixTokenCSX, .8f)
+ where $sim >= .8f
+ /*+ hash*/
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, "sim": $sim[0]}
+)
+return $r
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.4.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.4.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.4.query.aql
new file mode 100644
index 0000000..b903881
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.4.query.aql
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+// Binds $s to the number of (idDBLP, idCSX) pairs that share a prefix token and reach similarity .8f.
+let $s := count( // NOTE(review): $s is never read -- the query returns the literal 0 on its last line; confirm this is intentional
+ for $paperDBLP in dataset('left')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := word-tokens($paperDBLP.authors)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - -- (inlined) tokens of dataset('left'), ordered by count($id) and then by token
+ //
+ for $paper in dataset('left')
+ let $id := $paper.id
+ for $token in word-tokens($paper.authors)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, prefix-len-jaccard(len($tokensDBLP), .8f))
+ // Right side: same ranking; $actualPrefixLen below shrinks the prefix by the number of tokens that found no rank.
+ for $paperCSX in dataset('right')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := word-tokens($paperCSX.authors)
+ let $lenCSX := len($tokensUnrankedCSX)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - -- (inlined, repeated) tokens of dataset('left'), ordered by count($id) and then by token
+ //
+ for $paper in dataset('left')
+ let $id := $paper.id
+ for $token in word-tokens($paper.authors)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ let $actualPrefixLen := prefix-len-jaccard(len($tokensUnrankedCSX), .8f) - len($tokensUnrankedCSX) + len($tokensCSX)
+ for $prefixTokenCSX in subset-collection($tokensCSX, 0, $actualPrefixLen)
+ // Join on a common prefix token, then keep pairs whose similarity-jaccard-prefix result is >= .8f.
+ where $prefixTokenDBLP = $prefixTokenCSX
+ let $sim := similarity-jaccard-prefix($lenDBLP, $tokensDBLP, $lenCSX, $tokensCSX, $prefixTokenCSX, .8f)
+ where $sim >= .8f
+ /*+ hash*/
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, "sim": $sim[0]}
+)
+return 0
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.5.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.5.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.5.query.aql
new file mode 100644
index 0000000..119520a
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.5.query.aql
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+// Returns the number of (idDBLP, idCSX) candidate pairs that pass similarity-jaccard-check on the raw token lists at .8f.
+let $t := count(
+ for $paperDBLP in dataset('left')
+ let $idDBLP := $paperDBLP.id
+ let $tokensUnrankedDBLP := word-tokens($paperDBLP.authors)
+ let $lenDBLP := len($tokensUnrankedDBLP)
+ let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - -- (inlined) tokens of dataset('left'), ordered by count($id) and then by token
+ //
+ for $paper in dataset('left')
+ let $id := $paper.id
+ for $token in word-tokens($paper.authors)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, prefix-len-jaccard(len($tokensDBLP), .8f))
+ // Right side: same ranking; $actualPrefixLen below shrinks the prefix by the number of tokens that found no rank.
+ for $paperCSX in dataset('right')
+ let $idCSX := $paperCSX.id
+ let $tokensUnrankedCSX := word-tokens($paperCSX.authors)
+ let $lenCSX := len($tokensUnrankedCSX)
+ let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - -- (inlined, repeated) tokens of dataset('left'), ordered by count($id) and then by token
+ //
+ for $paper in dataset('left')
+ let $id := $paper.id
+ for $token in word-tokens($paper.authors)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ let $actualPrefixLen := prefix-len-jaccard(len($tokensUnrankedCSX), .8f) - len($tokensUnrankedCSX) + len($tokensCSX)
+ for $prefixTokenCSX in subset-collection($tokensCSX, 0, $actualPrefixLen)
+
+ where $prefixTokenDBLP = $prefixTokenCSX
+ // One group per (idDBLP, idCSX); the with-variables are indexed with [0] below. $sim[1] is presumably the similarity value -- TODO confirm.
+ /*+ hash*/
+ group by $idDBLP := $idDBLP, $idCSX := $idCSX with $tokensUnrankedDBLP, $tokensUnrankedCSX
+ let $sim := similarity-jaccard-check($tokensUnrankedDBLP[0], $tokensUnrankedCSX[0], .8f)
+ where $sim[1] >= .8f
+ return {'idDBLP': $idDBLP, 'idCSX': $idCSX, 'sim': $sim[1]}
+)
+return $t
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.6.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.6.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.6.query.aql
new file mode 100644
index 0000000..465cda9
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_2/basic-1_2_2.6.query.aql
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+// Lists the token lists of each distinct (idDBLP, idCSX) candidate pair that passes similarity-jaccard-check at .8f.
+for $paperDBLP in dataset('left')
+let $idDBLP := $paperDBLP.id
+let $tokensUnrankedDBLP := word-tokens($paperDBLP.authors)
+let $lenDBLP := len($tokensUnrankedDBLP)
+let $tokensDBLP :=
+ for $tokenUnranked in $tokensUnrankedDBLP
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - -- (inlined) tokens of dataset('left'), ordered by count($id) and then by token
+ //
+ for $paper in dataset('left')
+ let $id := $paper.id
+ for $token in word-tokens($paper.authors)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+for $prefixTokenDBLP in subset-collection($tokensDBLP, 0, prefix-len-jaccard(len($tokensDBLP), .8f))
+// Right side: same ranking; $actualPrefixLen below shrinks the prefix by the number of tokens that found no rank.
+for $paperCSX in dataset('right')
+let $idCSX := $paperCSX.id
+let $tokensUnrankedCSX := word-tokens($paperCSX.authors)
+let $lenCSX := len($tokensUnrankedCSX)
+let $tokensCSX :=
+ for $tokenUnranked in $tokensUnrankedCSX
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - -- (inlined, repeated) tokens of dataset('left'), ordered by count($id) and then by token
+ //
+ for $paper in dataset('left')
+ let $id := $paper.id
+ for $token in word-tokens($paper.authors)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+let $actualPrefixLen := prefix-len-jaccard(len($tokensUnrankedCSX), .8f) - len($tokensUnrankedCSX) + len($tokensCSX)
+for $prefixTokenCSX in subset-collection($tokensCSX, 0, $actualPrefixLen)
+
+where $prefixTokenDBLP = $prefixTokenCSX
+// Deduplicate candidate pairs, then verify on the raw token lists. $sim[1] is presumably the similarity value -- TODO confirm.
+/*+ hash*/
+distinct by $idDBLP, $idCSX
+let $sim := similarity-jaccard-check($tokensUnrankedDBLP, $tokensUnrankedCSX, .8f)
+where $sim[1] >= .8f
+order by $tokensUnrankedDBLP, $tokensUnrankedCSX
+return {'DBLP': $tokensUnrankedDBLP, 'CSX': $tokensUnrankedCSX, 'sim': $sim[1]}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.1.ddl.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.1.ddl.aql
new file mode 100644
index 0000000..45cc975
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.1.ddl.aql
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+drop dataverse fuzzyjoin_basic if exists; // remove any existing fuzzyjoin_basic dataverse before re-creating it
+
+create dataverse fuzzyjoin_basic;
+
+use dataverse fuzzyjoin_basic;
+// Closed record type shared by both join inputs: a uuid id plus the authors string.
+create type BasicType as closed {
+ id: uuid,
+ authors: string
+}
+// Two datasets of the same type; id is the primary key and is declared autogenerated.
+create dataset left(BasicType) primary key id autogenerated;
+create dataset right(BasicType) primary key id autogenerated;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.2.update.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.2.update.aql
new file mode 100644
index 0000000..c9aceb2
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.2.update.aql
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+// left <- data/pub-small/dblpauthors.adm, loaded through the localfs adapter in adm format.
+load dataset left
+using localfs
+(("path"="asterix_nc1://data/pub-small/dblpauthors.adm"),("format"="adm"));
+// right <- data/pub-small/csxauthors.adm, same adapter and format.
+load dataset right
+using localfs
+(("path"="asterix_nc1://data/pub-small/csxauthors.adm"),("format"="adm"));
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.3.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.3.query.aql
new file mode 100644
index 0000000..009c2b9
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_3/basic-1_2_3.3.query.aql
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+
+set simthreshold '.2f';
+// $pj: fuzzy join (~=, simthreshold '.2f') of left and right on the word tokens of authors, written without a join hint.
+let $pj := (
+ for $r in dataset left
+ for $s in dataset right
+ where word-tokens($r.authors) ~= word-tokens($s.authors)
+ return {"rid": $r.id, "sid": $s.id, "rstr": $r.authors, "sstr": $s.authors}
+)
+// $nl: the same join, but with the indexnl hint attached to the ~= operator.
+let $nl := (
+ for $r in dataset left
+ for $s in dataset right
+ where word-tokens($r.authors) /*+ indexnl */ ~= word-tokens($s.authors)
+ return {"rid": $r.id, "sid": $s.id, "rstr": $r.authors, "sstr": $s.authors}
+)
+// $orderedTokens: tokens of dataset('right'), ordered by count($id) and then by token.
+let $orderedTokens := (
+for $paper in dataset('right')
+ let $id := $paper.id
+ for $token in word-tokens($paper.authors)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+)
+// $simpairs: every $nl pair, together with the $pj pairs that have the same rid and sid.
+let $simpairs := (
+ for $r in $nl
+ return {
+ "rid": $r.rid,
+ "sid": $r.sid,
+ "rstr": $r.rstr,
+ "sstr": $r.sstr,
+ "simpairs":
+ for $s in $pj
+ where $r.rid = $s.rid and $r.sid = $s.sid
+ return {"rid": $s.rid, "sid": $s.sid}
+ }
+)
+// Report the $nl pairs with no counterpart in $pj: sorted tokens, token ranks, their lengths, and a recomputed similarity.
+for $d in $simpairs
+where count($d.simpairs) = 0
+ let $rid := $d.rid
+ let $sid := $d.sid
+ let $rstr := for $t in word-tokens($d.rstr) order by $t return $t
+ let $sstr := for $t in word-tokens($d.sstr) order by $t return $t
+ let $rlen := len(for $t in word-tokens($d.rstr) order by $t return $t)
+ let $slen := len(for $t in word-tokens($d.sstr) order by $t return $t)
+ let $orstr := for $t in word-tokens($d.rstr) for $token at $i in $orderedTokens where $t /*+ bcast */ = $token order by $i return $i
+ let $osstr := for $t in word-tokens($d.sstr) for $token at $i in $orderedTokens where $t /*+ bcast */ = $token order by $i return $i
+ let $lorstr := len(for $t in word-tokens($d.rstr) for $token at $i in $orderedTokens where $t /*+ bcast */ = $token order by $i return $i)
+ let $losstr := len(for $t in word-tokens($d.sstr) for $token at $i in $orderedTokens where $t /*+ bcast */ = $token order by $i return $i)
+return {
+ "rid": $rid,
+ "sid": $sid,
+ "rstr": $rstr,
+ "sstr": $sstr,
+ "rlen": $rlen,
+ "slen": $slen,
+ "orstr": $orstr,
+ "osstr": $osstr,
+ "lorstr": $lorstr,
+ "losstr": $losstr,
+ "simpairs": $d, // emits the whole $d record, not only $d.simpairs
+ "sim": let $cmmon := for $r in $orstr for $s in $osstr where $r = $s return $r // $cmmon (sic): ranks present on both sides; its first element is passed as the prefix token
+ return similarity-jaccard-prefix($rlen, $orstr, $slen, $osstr, $cmmon[0], 0.2f)}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.1.ddl.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.1.ddl.aql
new file mode 100644
index 0000000..45cc975
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.1.ddl.aql
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+drop dataverse fuzzyjoin_basic if exists; // remove any existing fuzzyjoin_basic dataverse before re-creating it
+
+create dataverse fuzzyjoin_basic;
+
+use dataverse fuzzyjoin_basic;
+// Closed record type shared by both join inputs: a uuid id plus the authors string.
+create type BasicType as closed {
+ id: uuid,
+ authors: string
+}
+// Two datasets of the same type; id is the primary key and is declared autogenerated.
+create dataset left(BasicType) primary key id autogenerated;
+create dataset right(BasicType) primary key id autogenerated;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.2.update.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.2.update.aql
new file mode 100644
index 0000000..c9aceb2
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.2.update.aql
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+// Fill `left` from dblpauthors.adm (ADM format, read through the localfs adapter).
+load dataset left
+using localfs
+(("path"="asterix_nc1://data/pub-small/dblpauthors.adm"),("format"="adm"));
+// Fill `right` from csxauthors.adm with the same adapter and format.
+load dataset right
+using localfs
+(("path"="asterix_nc1://data/pub-small/csxauthors.adm"),("format"="adm"));
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.3.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.3.query.aql
new file mode 100644
index 0000000..b6976bd
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_4/basic-1_2_4.3.query.aql
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+// Fixed inputs: two word-token lists; the right list is the left list plus "bedrijfskunde", "en" and "gent".
+let $tokensUnrankedLeft := [ "a", "baesens", "baestaens", "bart", "den", "dirk", "dirk", "emma", "gestel", "johan", "k",
+"marleen", "poel", "suykens", "tony", "van", "van", "willekens" ]
+
+let $tokensUnrankedRight := [ "a", "baesens", "baestaens", "bart", "bedrijfskunde", "den", "dirk", "dirk", "emma", "en",
+"gent", "gestel", "johan", "k", "marleen", "poel", "suykens", "tony", "van", "van", "willekens" ]
+// Map every left token to its position $i in the Stage 1 ranking (ordered by $i); tokens absent from the ranking yield nothing.
+let $lenLeft := len($tokensUnrankedLeft) // NOTE: $lenLeft is not referenced again in this query
+let $tokensLeft :=
+ for $tokenUnranked in $tokensUnrankedLeft
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Rank the word tokens of left.authors by ascending number of ids per
+ // token, breaking ties by the token text.
+ for $orders in dataset left
+ let $id := $orders.id
+ for $token in word-tokens($orders.authors)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+// Same lookup for the right token list; note that the ranking is again built from dataset left.
+let $lenRight := len($tokensUnrankedRight) // NOTE: $lenRight is not referenced again in this query
+let $tokensRight :=
+ for $tokenUnranked in $tokensUnrankedRight
+ for $tokenRanked at $i in
+ // -- - Stage 1 - --
+ // Identical ranking subquery to the one used for $tokensLeft:
+ // tokens of left.authors by ascending id count, then by token text.
+ for $orders in dataset left
+ let $id := $orders.id
+ for $token in word-tokens($orders.authors)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+// Emit both ranked lists, the ranked and unranked lengths, and Jaccard checks at threshold .8f on ranked and unranked tokens.
+return {
+ "leftTokens": $tokensLeft, "rightTokens": $tokensRight, "lenLeftTrue": len($tokensUnrankedLeft),
+ "lenLeft": len($tokensLeft), "lenRightTrue": len($tokensUnrankedRight), "lenRight": len($tokensRight),
+ "full_sim": similarity-jaccard-check($tokensLeft, $tokensRight, .8f),
+ "true_sim": similarity-jaccard-check($tokensUnrankedLeft, $tokensUnrankedRight, .8f),
+ "pref_sim": similarity-jaccard-prefix(len($tokensLeft), $tokensLeft, len($tokensRight),
+ $tokensRight, $tokensLeft[0], .8f),
+ "fast_sim": similarity-jaccard-prefix(len($tokensUnrankedLeft), $tokensLeft, len($tokensUnrankedRight),
+ $tokensRight, $tokensLeft[0], .8f)
+ }
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.1.ddl.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.1.ddl.aql
new file mode 100644
index 0000000..45cc975
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.1.ddl.aql
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+drop dataverse fuzzyjoin_basic if exists;
+// Recreate the dataverse after dropping any existing copy above.
+create dataverse fuzzyjoin_basic;
+
+use dataverse fuzzyjoin_basic;
+// Closed record type used by both datasets: an id and an authors string.
+create type BasicType as closed {
+ id: uuid,
+ authors: string
+}
+// Two datasets of that type, each keyed on an autogenerated id.
+create dataset left(BasicType) primary key id autogenerated;
+create dataset right(BasicType) primary key id autogenerated;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.2.update.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.2.update.aql
new file mode 100644
index 0000000..c9aceb2
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.2.update.aql
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+// Fill `left` from dblpauthors.adm (ADM format, read through the localfs adapter).
+load dataset left
+using localfs
+(("path"="asterix_nc1://data/pub-small/dblpauthors.adm"),("format"="adm"));
+// Fill `right` from csxauthors.adm with the same adapter and format.
+load dataset right
+using localfs
+(("path"="asterix_nc1://data/pub-small/csxauthors.adm"),("format"="adm"));
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.3.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.3.query.aql
new file mode 100644
index 0000000..3b5d44b
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_5/basic-1_2_5.3.query.aql
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+set simthreshold '.2f';
+// $cpj: number of left/right pairs whose authors word-tokens satisfy ~=, with no join hint.
+let $cpj := count(
+ for $r in dataset left
+ for $s in dataset right
+ where word-tokens($r.authors) ~= word-tokens($s.authors)
+ return {"rid": $r.id, "sid": $s.id}
+)
+// $cnl: the same count, with the indexnl hint placed on the ~= predicate.
+let $cnl := count(
+ for $r in dataset left
+ for $s in dataset right
+ where word-tokens($r.authors) /*+ indexnl */ ~= word-tokens($s.authors)
+ return {"rid": $r.id, "sid": $s.id}
+)
+// Return both counts in one list, hinted count first.
+return [$cnl, $cpj]
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.1.ddl.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.1.ddl.aql
new file mode 100644
index 0000000..a72efb5
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.1.ddl.aql
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+drop dataverse fuzzyjoin if exists;
+// Recreate the dataverse after dropping any existing copy above.
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+// Closed record type for a DBLP entry: an int64 id plus four string fields.
+create type DBLPType as closed {
+ id: int64,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+// Single dataset of that type, keyed on id.
+create dataset DBLP(DBLPType) primary key id;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.2.update.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.2.update.aql
new file mode 100644
index 0000000..d3d02d4
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.2.update.aql
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin;
+// Fill DBLP from dblp-small-id.txt, read as ':'-delimited text through the localfs adapter.
+load dataset DBLP
+using localfs
+(("path"="asterix_nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.3.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.3.query.aql
new file mode 100644
index 0000000..c53475f
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_6/basic-1_2_6.3.query.aql
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin;
+
+set simthreshold '.15f';
+// $cpj: number of DBLP x DBLP pairs (self-join) whose title word-tokens satisfy ~=, with no join hint.
+let $cpj := count(
+ for $dblp in dataset('DBLP')
+ for $dblp2 in dataset('DBLP')
+ where word-tokens($dblp.title) ~= word-tokens($dblp2.title)
+ order by $dblp.id, $dblp2.id
+ return {'dblp': $dblp.id, 'dblp2': $dblp2.id}
+)
+// $cnl: the same count, with the indexnl hint placed on the ~= predicate.
+let $cnl := count(
+ for $dblp in dataset('DBLP')
+ for $dblp2 in dataset('DBLP')
+ where word-tokens($dblp.title) /*+indexnl*/ ~= word-tokens($dblp2.title)
+ order by $dblp.id, $dblp2.id
+ return {'dblp': $dblp.id, 'dblp2': $dblp2.id}
+)
+// Return both counts in one list, hinted count first.
+return [$cnl, $cpj]
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.1.ddl.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.1.ddl.aql
new file mode 100644
index 0000000..bd84097
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.1.ddl.aql
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+drop dataverse fuzzyjoin if exists;
+// Recreate the dataverse after dropping any existing copy above.
+create dataverse fuzzyjoin;
+
+use dataverse fuzzyjoin;
+// Closed record type for a DBLP entry: an int64 id plus four string fields.
+create type DBLPType as closed {
+ id: int64,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+// Two datasets of that type (DBLP and TO), both keyed on id.
+create dataset DBLP(DBLPType) primary key id;
+
+create dataset TO(DBLPType) primary key id;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.2.update.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.2.update.aql
new file mode 100644
index 0000000..7674827
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.2.update.aql
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin;
+// Fill DBLP from dblp-small-id.txt, read as ':'-delimited text through the localfs adapter.
+load dataset DBLP
+using localfs
+(("path"="asterix_nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+// Fill TO from the very same file, so both datasets are loaded with the same input.
+load dataset TO
+using localfs
+(("path"="asterix_nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.3.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.3.query.aql
new file mode 100644
index 0000000..597e8a1
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_2_7/basic-1_2_7.3.query.aql
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin;
+
+set import-private-functions 'true'
+
+set simthreshold '.61f';
+// $pj: DBLP x TO pairs whose title word-tokens satisfy ~=, with no join hint; ids and titles are kept for reporting.
+let $pj := (
+ for $dblp in dataset DBLP
+ for $dblp2 in dataset TO
+ where word-tokens($dblp.title) ~= word-tokens($dblp2.title)
+ return {"rid": $dblp.id, "sid": $dblp2.id, "rstr": $dblp.title, "sstr": $dblp2.title}
+)
+
+let $nl := ( // same DBLP x TO join as $pj, but with the indexnl hint on the ~= predicate
+ for $dblp in dataset DBLP
+ for $dblp2 in dataset TO
+ where word-tokens($dblp.title) /*+ indexnl */ ~= word-tokens($dblp2.title)
+ return {"rid": $dblp.id, "sid": $dblp2.id, "rstr": $dblp.title, "sstr": $dblp2.title}
+)
+
+let $orderedTokens := ( // title word tokens of TO, ranked by ascending id count, then by token text
+ for $paper in dataset TO
+ let $id := $paper.id
+ for $token in word-tokens($paper.title)
+ /*+ hash */
+ group by $tokenGrouped := $token with $id
+ /*+ inmem 1 302 */
+ order by count($id), $tokenGrouped
+ return $tokenGrouped
+)
+// $simpairs: every $nl pair, together with the $pj pairs that have the same (rid, sid).
+let $simpairs := (
+ for $r in $nl
+ return {
+ "rid": $r.rid,
+ "sid": $r.sid,
+ "rstr": $r.rstr,
+ "sstr": $r.sstr,
+ "simpairs":
+ for $s in $pj
+ where $r.rid = $s.rid and $r.sid = $s.sid
+ return {"rid": $s.rid, "sid": $s.sid}
+ }
+)
+// Keep only the $nl pairs that have no counterpart in $pj and emit diagnostic fields for each of them.
+for $d in $simpairs
+where count($d.simpairs) = 0
+ let $rid := $d.rid
+ let $sid := $d.sid
+ let $rstr := for $t in word-tokens($d.rstr) order by $t return $t
+ let $sstr := for $t in word-tokens($d.sstr) order by $t return $t
+ let $rlen := len(for $t in word-tokens($d.rstr) order by $t return $t) // length of the same sorted list as $rstr
+ let $slen := len(for $t in word-tokens($d.sstr) order by $t return $t) // length of the same sorted list as $sstr
+ let $orstr := for $t in word-tokens($d.rstr) for $token at $i in $orderedTokens where $t /*+ bcast */ = $token order by $i return $i // NOTE(review): bcast hint precedes `=` here; confirm this placement is honoured
+ let $osstr := for $t in word-tokens($d.sstr) for $token at $i in $orderedTokens where $t /*+ bcast */ = $token order by $i return $i
+ let $lorstr := len(for $t in word-tokens($d.rstr) for $token at $i in $orderedTokens where $t /*+ bcast */ = $token order by $i return $i)
+ let $losstr := len(for $t in word-tokens($d.sstr) for $token at $i in $orderedTokens where $t /*+ bcast */ = $token order by $i return $i)
+return {
+ "rid": $rid,
+ "sid": $sid,
+ "rstr": $rstr,
+ "sstr": $sstr,
+ "rlen": $rlen,
+ "slen": $slen,
+ "orstr": $orstr,
+ "osstr": $osstr,
+ "lorstr": $lorstr,
+ "losstr": $losstr,
+ "simpairs": $d,
+ "sim": let $cmmon := for $r in $orstr for $s in $osstr where $r = $s return $r // ranks present in both $orstr and $osstr
+ return similarity-jaccard-prefix($rlen, $orstr, $slen, $osstr, $cmmon[0], 0.61f)}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.1.ddl.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.1.ddl.aql
new file mode 100644
index 0000000..45cc975
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.1.ddl.aql
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+drop dataverse fuzzyjoin_basic if exists;
+// Recreate the dataverse after dropping any existing copy above.
+create dataverse fuzzyjoin_basic;
+
+use dataverse fuzzyjoin_basic;
+// Closed record type used by both datasets: an id and an authors string.
+create type BasicType as closed {
+ id: uuid,
+ authors: string
+}
+// Two datasets of that type, each keyed on an autogenerated id.
+create dataset left(BasicType) primary key id autogenerated;
+create dataset right(BasicType) primary key id autogenerated;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.2.update.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.2.update.aql
new file mode 100644
index 0000000..c9aceb2
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.2.update.aql
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+// Fill `left` from dblpauthors.adm (ADM format, read through the localfs adapter).
+load dataset left
+using localfs
+(("path"="asterix_nc1://data/pub-small/dblpauthors.adm"),("format"="adm"));
+// Fill `right` from csxauthors.adm with the same adapter and format.
+load dataset right
+using localfs
+(("path"="asterix_nc1://data/pub-small/csxauthors.adm"),("format"="adm"));
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.3.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.3.query.aql
new file mode 100644
index 0000000..f91e841
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.3.query.aql
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+// -- - Stage 1 - --
+// Token frequency table for right.authors: one [token, count] pair per
+// distinct word token, ordered by ascending count and then by token text.
+for $orderRight in dataset('right')
+let $rightId := $orderRight.id
+for $orderTokenRight in word-tokens($orderRight.authors)
+ /*+ hash */ group by $tokenRightGrouped := $orderTokenRight with $rightId
+/*+ inmem 1 302 */ order by count($rightId), $tokenRightGrouped
+return [ $tokenRightGrouped, count($rightId) ]
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.4.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.4.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.4.query.aql
new file mode 100644
index 0000000..66dbbbc
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.4.query.aql
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+// -- - Stage 1 - --
+// Per-token id counts are computed separately for right ($r) and for
+// left ($l), then joined on the token text.
+for $r in
+for $orderRight in dataset('right')
+let $rightId := $orderRight.id
+for $orderTokenRight in word-tokens($orderRight.authors)
+ /*+ hash */ group by $tokenRightGrouped := $orderTokenRight with $rightId
+return {"rt": $tokenRightGrouped, "rc": count($rightId)}
+
+for $l in
+for $orderLeft in dataset('left')
+let $leftId := $orderLeft.id
+for $orderTokenLeft in word-tokens($orderLeft.authors)
+ /*+ hash */ group by $tokenLeftGrouped := $orderTokenLeft with $leftId
+return {"lt": $tokenLeftGrouped, "lc": count($leftId)}
+// Only tokens found on both sides are kept; they are ordered by the product of the two counts, then by token text.
+where $r.rt = $l.lt
+/*+ inmem 1 302 */ order by $r.rc * $l.lc, $r.rt
+return [ $r.rt, $r.rc * $l.lc ]
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.5.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.5.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.5.query.aql
new file mode 100644
index 0000000..ad93db1
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.5.query.aql
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+
+let $r := count(
+ for $right in dataset('right')
+ let $idRight := $right.id
+ let $tokensUnrankedRight := word-tokens($right.authors)
+ let $lenRight := len($tokensUnrankedRight)
+ let $tokensRight :=
+ for $tokenUnranked in $tokensUnrankedRight
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - --
+ //
+ for $orderRight in dataset('right')
+ let $rightId := $orderRight.id
+ for $orderTokenRight in word-tokens($orderRight.authors)
+ /*+ hash */ group by $tokenRightGrouped := $orderTokenRight with $rightId
+ /*+ inmem 1 302 */ order by count($rightId)
+ return $tokenRightGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenRight in subset-collection($tokensRight, 0, prefix-len-jaccard(len($tokensRight), .8f))
+
+ for $left in dataset('left')
+ let $idLeft := $left.id
+ let $tokensUnrankedLeft := word-tokens($left.authors)
+ let $lenLeft := len($tokensUnrankedLeft)
+ let $tokensLeft :=
+ for $tokenUnranked in $tokensUnrankedLeft
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - --
+ //
+ for $orderRight in dataset('right')
+ let $rightId := $orderRight.id
+ for $orderTokenRight in word-tokens($orderRight.authors)
+ /*+ hash */ group by $tokenRightGrouped := $orderTokenRight with $rightId
+ /*+ inmem 1 302 */ order by count($rightId)
+ return $tokenRightGrouped
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ let $actualPrefixLen := prefix-len-jaccard(len($tokensUnrankedLeft), .8f) - len($tokensUnrankedLeft) + len($tokensLeft)
+ for $prefixTokenLeft in subset-collection($tokensLeft, 0, $actualPrefixLen)
+
+ where $prefixTokenRight = $prefixTokenLeft
+ let $sim := similarity-jaccard-prefix($lenRight, $tokensRight, $lenLeft, $tokensLeft, $prefixTokenLeft, .8f)
+ where $sim >= .8f
+ /*+ hash*/ group by $idRight := $idRight, $idLeft := $idLeft with $sim
+ return {'idDBLP': $idRight, 'idCSX': $idLeft, "sim": $sim[0]}
+)
+return $r
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.6.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.6.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.6.query.aql
new file mode 100644
index 0000000..5594de3
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/basic-1_3_1/basic-1_3_1.6.query.aql
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+use dataverse fuzzyjoin_basic;
+
+set import-private-functions 'true'
+
+let $r := count(
+ for $right in dataset('right')
+ let $idRight := $right.id
+ let $tokensUnrankedRight := word-tokens($right.authors)
+ let $lenRight := len($tokensUnrankedRight)
+ let $tokensRight :=
+ for $tokenUnranked in $tokensUnrankedRight
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - --
+ //
+ for $r in
+ for $orderRight in dataset('right')
+ let $rightId := $orderRight.id
+ for $orderTokenRight in word-tokens($orderRight.authors)
+ /*+ hash */ group by $tokenRightGrouped := $orderTokenRight with $rightId
+ return {"rt": $tokenRightGrouped, "rc": count($rightId)}
+ for $l in
+ for $orderLeft in dataset('left')
+ let $leftId := $orderLeft.id
+ for $orderTokenLeft in word-tokens($orderLeft.authors)
+ /*+ hash */ group by $tokenLeftGrouped := $orderTokenLeft with $leftId
+ return {"lt": $tokenLeftGrouped, "lc": count($leftId)}
+ where $r.rt = $l.lt
+ /*+ inmem 1 302 */ order by $r.rc * $l.lc
+ return $r.rt
+
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ for $prefixTokenRight in subset-collection($tokensRight, 0, prefix-len-jaccard(len($tokensRight), .8f))
+
+ for $left in dataset('left')
+ let $idLeft := $left.id
+ let $tokensUnrankedLeft := word-tokens($left.authors)
+ let $lenLeft := len($tokensUnrankedLeft)
+ let $tokensLeft :=
+ for $tokenUnranked in $tokensUnrankedLeft
+ for $tokenRanked at $i in
+ //
+ // -- - Stage 1 - --
+ //
+ for $r in
+ for $orderRight in dataset('right')
+ let $rightId := $orderRight.id
+ for $orderTokenRight in word-tokens($orderRight.authors)
+ /*+ hash */ group by $tokenRightGrouped := $orderTokenRight with $rightId
+ return {"rt": $tokenRightGrouped, "rc": count($rightId)}
+ for $l in
+ for $orderLeft in dataset('left')
+ let $leftId := $orderLeft.id
+ for $orderTokenLeft in word-tokens($orderLeft.authors)
+ /*+ hash */ group by $tokenLeftGrouped := $orderTokenLeft with $leftId
+ return {"lt": $tokenLeftGrouped, "lc": count($leftId)}
+ where $r.rt = $l.lt
+ /*+ inmem 1 302 */ order by $r.rc * $l.lc
+ return $r.rt
+
+ where $tokenUnranked = /*+ bcast */ $tokenRanked
+ order by $i
+ return $i
+ let $actualPrefixLen := prefix-len-jaccard(len($tokensUnrankedLeft), .8f) - len($tokensUnrankedLeft) + len($tokensLeft)
+ for $prefixTokenLeft in subset-collection($tokensLeft, 0, $actualPrefixLen)
+
+ where $prefixTokenRight = $prefixTokenLeft
+ let $sim := similarity-jaccard-prefix($lenRight, $tokensRight, $lenLeft, $tokensLeft, $prefixTokenLeft, .8f)
+ where $sim >= .8f
+ /*+ hash*/ group by $idRight := $idRight, $idLeft := $idLeft with $sim
+ return {'idDBLP': $idRight, 'idCSX': $idLeft, "sim": $sim[0]}
+)
+return $r
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2/dblp-csx-2_5.2.3.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2/dblp-csx-2_5.2.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2/dblp-csx-2_5.2.3.query.aql
index 1cff8fc..7ecca70 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2/dblp-csx-2_5.2.3.query.aql
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2/dblp-csx-2_5.2.3.query.aql
@@ -71,9 +71,7 @@ set import-private-functions 'true';
$tokensCSX,
0,
prefix-len-jaccard(len($tokensCSX), .5f))
-
where $prefixTokenDBLP = $prefixTokenCSX
-
let $sim := similarity-jaccard-prefix(
$lenDBLP,
$tokensDBLP,
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.1.ddl.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.1.ddl.aql
new file mode 100644
index 0000000..3573f47
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.1.ddl.aql
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Description : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard function of their titles' word tokens.
+ * We expect the join to be transformed into a prefix-based fuzzy join followed by a select on the < condition.
+ * Success : Yes
+ */
+
+drop dataverse test if exists;
+create dataverse test;
+use dataverse test;
+
+create type DBLPNestedType as closed {
+ id: int64,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type DBLPType as closed {
+ nested: DBLPNestedType
+}
+
+create type CSXNestedType as closed {
+ id: int64,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as closed {
+ nested: CSXNestedType
+}
+
+create dataset DBLPtmp(DBLPNestedType) primary key id;
+create dataset CSXtmp(CSXNestedType) primary key id;
+
+create dataset DBLP(DBLPType) primary key nested.id;
+create dataset CSX(CSXType) primary key nested.id;
+
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.2.update.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.2.update.aql
new file mode 100644
index 0000000..a2633b1
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.2.update.aql
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use dataverse test;
+
+load dataset DBLPtmp
+using localfs
+(("path"="asterix_nc1://data/dblp-small/dblp-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000")) pre-sorted;
+
+load dataset CSXtmp
+using localfs
+(("path"="asterix_nc1://data/pub-small/csx-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
+
+insert into dataset DBLP(
+ for $x in dataset DBLPtmp
+ return {
+ "nested": $x
+ }
+);
+
+insert into dataset CSX(
+ for $x in dataset CSXtmp
+ return {
+ "nested": $x
+ }
+);
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.3.ddl.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.3.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.3.ddl.aql
new file mode 100644
index 0000000..0359448
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.3.ddl.aql
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use dataverse test;
+
+create index keyword_index on DBLP(nested.title) type keyword;
+
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.4.query.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.4.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.4.query.aql
new file mode 100644
index 0000000..65e2576
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.1/word-jaccard.4.query.aql
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use dataverse test;
+
+for $a in dataset('DBLP')
+for $b in dataset('CSX')
+where word-tokens($a.nested.title) ~= word-tokens($b.nested.title)
+ and $a.nested.id < $b.nested.id
+order by $a.nested.id, $b.nested.id
+return { "arec": $a.nested, "brec": $b.nested }
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.1.ddl.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.1.ddl.aql
new file mode 100644
index 0000000..72458b9
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.1.ddl.aql
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+ * Description : Fuzzy joins two datasets, DBLP and CSX, based on the similarity-jaccard function of their titles' 3-gram tokens.
+ * We expect the join to be transformed into an indexed prefix-based fuzzy join.
+ * We treat the < condition as a select over the fuzzy join results.
+ * Success : Yes
+ */
+
+drop dataverse test if exists;
+create dataverse test;
+use dataverse test;
+
+create type DBLPNestedType as closed {
+ id: int64,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type DBLPType as closed {
+ nested: DBLPNestedType
+}
+
+create type CSXNestedType as closed {
+ id: int64,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as closed {
+ nested: CSXNestedType
+}
+
+create dataset DBLPtmp(DBLPNestedType) primary key id;
+create dataset CSXtmp(CSXNestedType) primary key id;
+
+create dataset DBLP(DBLPType) primary key nested.id;
+create dataset CSX(CSXType) primary key nested.id;
+
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.2.update.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.2.update.aql
new file mode 100644
index 0000000..a2633b1
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.2.update.aql
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use dataverse test;
+
+load dataset DBLPtmp
+using localfs
+(("path"="asterix_nc1://data/dblp-small/dblp-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000")) pre-sorted;
+
+load dataset CSXtmp
+using localfs
+(("path"="asterix_nc1://data/pub-small/csx-small-multi-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
+
+insert into dataset DBLP(
+ for $x in dataset DBLPtmp
+ return {
+ "nested": $x
+ }
+);
+
+insert into dataset CSX(
+ for $x in dataset CSXtmp
+ return {
+ "nested": $x
+ }
+);
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.3.ddl.aql
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.3.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.3.ddl.aql
new file mode 100644
index 0000000..9307af9
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-4.1.2/ngram-jaccard-inline.3.ddl.aql
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use dataverse test;
+
+create index ngram_index on DBLP(nested.title) type ngram(3);
+