You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by wa...@apache.org on 2018/10/10 00:59:45 UTC
[36/36] asterixdb git commit: [NO ISSUE][COMP][RT] Enable multiway
similarity joins
[NO ISSUE][COMP][RT] Enable multiway similarity joins
- Enable the FuzzyJoinRule that transforms
a nested-loop-similarity-join plan to a three-stage-similarity join.
- Modify FuzzyJoinRuleCollections.
- Add the ExtractCommonExpressionRule to extract common expressions
in the star-like multiple similarity join substitutions.
- Add the InlineSubplanInputForNestedTupleSourceRule to translate
the generated subplan from the similarity function-derived
substitution into join in case of nested schemas.
- Use similarity-jaccard-prefix to enable the pp+ join strategy.
- Use the right side to build the heavy hash join on
the prefix tokens from both sides.
- Add RemoveAssign/Variables/AggRules to iteratively remove unused
assign/vars once FuzzyJoinRule is applied in each round.
- Add three new optimization cases for multiway similarity joins.
- link-like multiway similarity joins
- star-like multiway similarity joins
- hybrid multiway similarity joins with the both styles of similarity joins.
- Add a check whether a similarity function is on
a select over an existing similarity join.
- Change the inverted-index-based similarity join to the three-stage-similarity join
due to efficiency considerations.
Change-Id: I8736f104905eeda763d39709e002c2b9629278cc
Reviewed-on: https://asterix-gerrit.ics.uci.edu/1076
Sonar-Qube: Jenkins <je...@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
Contrib: Jenkins <je...@fulliautomatix.ics.uci.edu>
Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
Reviewed-by: Dmitry Lychagin <dm...@couchbase.com>
Reviewed-by: Taewoo Kim <wa...@gmail.com>
Project: http://git-wip-us.apache.org/repos/asf/asterixdb/repo
Commit: http://git-wip-us.apache.org/repos/asf/asterixdb/commit/d906bd89
Tree: http://git-wip-us.apache.org/repos/asf/asterixdb/tree/d906bd89
Diff: http://git-wip-us.apache.org/repos/asf/asterixdb/diff/d906bd89
Branch: refs/heads/master
Commit: d906bd89e48351156262c1c22096d23269bf0f0a
Parents: e7fa4b3
Author: Taewoo Kim <wa...@yahoo.com>
Authored: Mon Oct 8 18:01:23 2018 -0700
Committer: Taewoo Kim <wa...@gmail.com>
Committed: Tue Oct 9 17:58:47 2018 -0700
----------------------------------------------------------------------
.../provider/DefaultRuleSetFactory.java | 8 +-
.../asterix/optimizer/base/FuzzyUtils.java | 12 +
.../asterix/optimizer/base/RuleCollections.java | 12 +-
.../asterix/optimizer/rules/FuzzyJoinRule.java | 508 ++--
.../data/dblp-small/csx-small-multi-id.txt | 100 +
.../data/dblp-small/dblp-small-multi-id.txt | 100 +
.../data/pub-small/csx-small-multi-id.txt | 100 +
.../asterix-app/data/pub-small/csxauthors.adm | 196 ++
.../data/pub-small/dblp-small-multi-id.txt | 100 +
.../asterix-app/data/pub-small/dblpauthors.adm | 194 ++
.../src/test/resources/optimizerts/ignore.txt | 1 -
.../optimizerts/queries/fj-dblp-csx-hybrid.aql | 69 +
.../queries/fj-dblp-csx-selflink.aql | 66 +
.../optimizerts/queries/fj-dblp-csx-simple.aql | 54 +
.../optimizerts/queries/fj-dblp-csx-star.aql | 69 +
.../optimizerts/queries/fj-dblp-csx.aql | 101 +-
.../jaccard-similarity-join-dual-order.aql | 109 +
.../jaccard-similarity-join-right-ahead.aql | 89 +
.../optimizerts/results/fj-dblp-csx-hybrid.plan | 613 +++++
.../results/fj-dblp-csx-selflink.plan | 2436 +++++++++++++++++
.../optimizerts/results/fj-dblp-csx-simple.plan | 146 ++
.../optimizerts/results/fj-dblp-csx-star.plan | 2481 ++++++++++++++++++
.../optimizerts/results/fj-dblp-csx.plan | 134 +
.../ngram-jaccard-inline.plan | 170 +-
.../word-jaccard-inline.plan | 170 +-
.../results/inverted-index-join/issue741.plan | 155 +-
...obe-pidx-with-join-jaccard-check-idx_01.plan | 166 +-
.../ngram-fuzzyeq-jaccard_01.plan | 157 +-
.../ngram-fuzzyeq-jaccard_02.plan | 157 +-
.../ngram-fuzzyeq-jaccard_03.plan | 158 +-
.../ngram-jaccard-check_01.plan | 157 +-
.../ngram-jaccard-check_02.plan | 157 +-
.../ngram-jaccard-check_03.plan | 158 +-
.../ngram-jaccard-check_04.plan | 160 +-
.../inverted-index-join/ngram-jaccard_01.plan | 157 +-
.../inverted-index-join/ngram-jaccard_02.plan | 157 +-
.../inverted-index-join/ngram-jaccard_03.plan | 158 +-
.../inverted-index-join/ngram-jaccard_04.plan | 160 +-
.../word-fuzzyeq-jaccard_01.plan | 157 +-
.../word-fuzzyeq-jaccard_02.plan | 157 +-
.../word-fuzzyeq-jaccard_03.plan | 158 +-
.../word-jaccard-check-after-btree-access.plan | 178 +-
.../word-jaccard-check_01.plan | 157 +-
.../word-jaccard-check_02.plan | 157 +-
.../word-jaccard-check_03.plan | 158 +-
.../word-jaccard-check_04.plan | 160 +-
.../inverted-index-join/word-jaccard_01.plan | 157 +-
.../inverted-index-join/word-jaccard_02.plan | 157 +-
.../inverted-index-join/word-jaccard_03.plan | 158 +-
.../inverted-index-join/word-jaccard_04.plan | 160 +-
.../ngram-jaccard-check-multi-let.plan | 2 +-
.../word-jaccard-check-multi-let.plan | 2 +-
...obe-pidx-with-join-jaccard-check-idx_01.plan | 171 +-
.../ngram-fuzzyeq-jaccard_01.plan | 164 +-
.../ngram-jaccard-check_01.plan | 164 +-
.../ngram-jaccard-inline.plan | 199 +-
.../inverted-index-join/ngram-jaccard_01.plan | 164 +-
.../olist-jaccard-inline.plan | 2 +-
.../ulist-jaccard-inline.plan | 2 +-
.../word-fuzzyeq-jaccard_01.plan | 164 +-
.../word-jaccard-check-after-btree-access.plan | 202 +-
.../word-jaccard-check_01.plan | 164 +-
.../word-jaccard-inline.plan | 199 +-
.../inverted-index-join/word-jaccard_01.plan | 164 +-
.../ngram-jaccard-check-multi-let.plan | 2 +-
.../word-jaccard-check-multi-let.plan | 2 +-
.../ngram-fuzzyeq-jaccard_01.plan | 164 +-
.../ngram-fuzzyeq-jaccard_02.plan | 164 +-
.../ngram-fuzzyeq-jaccard_03.plan | 163 +-
.../ngram-fuzzyeq-jaccard_04.plan | 164 +-
.../ngram-jaccard-check_01.plan | 164 +-
.../ngram-jaccard-check_02.plan | 164 +-
.../ngram-jaccard-check_03.plan | 163 +-
.../ngram-jaccard-check_04.plan | 164 +-
.../ngram-jaccard-inline.plan | 199 +-
.../inverted-index-join/ngram-jaccard_01.plan | 164 +-
.../inverted-index-join/ngram-jaccard_02.plan | 164 +-
.../inverted-index-join/ngram-jaccard_03.plan | 163 +-
.../inverted-index-join/ngram-jaccard_04.plan | 164 +-
.../word-fuzzyeq-jaccard_01.plan | 164 +-
.../word-fuzzyeq-jaccard_02.plan | 164 +-
.../word-fuzzyeq-jaccard_03.plan | 163 +-
.../word-fuzzyeq-jaccard_04.plan | 164 +-
.../word-jaccard-check-after-btree-access.plan | 202 +-
.../word-jaccard-check_01.plan | 164 +-
.../word-jaccard-check_02.plan | 164 +-
.../word-jaccard-check_03.plan | 163 +-
.../word-jaccard-check_04.plan | 164 +-
.../word-jaccard-inline.plan | 199 +-
.../inverted-index-join/word-jaccard_01.plan | 164 +-
.../inverted-index-join/word-jaccard_02.plan | 164 +-
.../inverted-index-join/word-jaccard_03.plan | 163 +-
.../inverted-index-join/word-jaccard_04.plan | 164 +-
.../ngram-fuzzyeq-jaccard_01.plan | 157 +-
.../ngram-fuzzyeq-jaccard_02.plan | 157 +-
.../ngram-fuzzyeq-jaccard_03.plan | 158 +-
.../ngram-fuzzyeq-jaccard_04.plan | 157 +-
.../ngram-jaccard-check_01.plan | 157 +-
.../ngram-jaccard-check_02.plan | 157 +-
.../ngram-jaccard-check_03.plan | 158 +-
.../ngram-jaccard-check_04.plan | 157 +-
.../ngram-jaccard-check_inline_03.plan | 160 +-
.../inverted-index-join/ngram-jaccard_01.plan | 157 +-
.../inverted-index-join/ngram-jaccard_02.plan | 157 +-
.../inverted-index-join/ngram-jaccard_03.plan | 158 +-
.../inverted-index-join/ngram-jaccard_04.plan | 157 +-
.../ngram-jaccard_inline_03.plan | 160 +-
.../word-fuzzyeq-jaccard_01.plan | 157 +-
.../word-fuzzyeq-jaccard_02.plan | 157 +-
.../word-fuzzyeq-jaccard_03.plan | 158 +-
.../word-fuzzyeq-jaccard_04.plan | 157 +-
.../word-jaccard-check-after-btree-access.plan | 178 +-
.../word-jaccard-check_01.plan | 157 +-
.../word-jaccard-check_02.plan | 157 +-
.../word-jaccard-check_03.plan | 158 +-
.../word-jaccard-check_04.plan | 157 +-
.../word-jaccard-check_inline_03.plan | 160 +-
.../inverted-index-join/word-jaccard_01.plan | 157 +-
.../inverted-index-join/word-jaccard_02.plan | 157 +-
.../inverted-index-join/word-jaccard_03.plan | 158 +-
.../inverted-index-join/word-jaccard_04.plan | 157 +-
.../word-jaccard_inline_03.plan | 160 +-
.../jaccard-similarity-join-dual-order.plan | 191 ++
.../jaccard-similarity-join-right-ahead.plan | 128 +
.../fuzzyjoin/basic-1_1/basic-1_1.1.ddl.aql | 40 +
.../fuzzyjoin/basic-1_1/basic-1_1.2.update.aql | 34 +
.../fuzzyjoin/basic-1_1/basic-1_1.3.query.aql | 81 +
.../fuzzyjoin/basic-1_1_1/basic-1_1_1.1.ddl.aql | 31 +
.../basic-1_1_1/basic-1_1_1.2.update.aql | 27 +
.../basic-1_1_1/basic-1_1_1.3.query.aql | 61 +
.../fuzzyjoin/basic-1_1_2/basic-1_1_2.1.ddl.aql | 23 +
.../basic-1_1_2/basic-1_1_2.10.query.aql | 29 +
.../basic-1_1_2/basic-1_1_2.2.update.aql | 18 +
.../basic-1_1_2/basic-1_1_2.3.query.aql | 30 +
.../basic-1_1_2/basic-1_1_2.4.query.aql | 30 +
.../basic-1_1_2/basic-1_1_2.5.query.aql | 30 +
.../basic-1_1_2/basic-1_1_2.6.query.aql | 30 +
.../basic-1_1_2/basic-1_1_2.7.query.aql | 30 +
.../basic-1_1_2/basic-1_1_2.8.query.aql | 30 +
.../basic-1_1_2/basic-1_1_2.9.query.aql | 29 +
.../fuzzyjoin/basic-1_1_3/basic-1_1_3.1.ddl.aql | 23 +
.../basic-1_1_3/basic-1_1_3.2.update.aql | 18 +
.../basic-1_1_3/basic-1_1_3.3.query.aql | 30 +
.../basic-1_1_3/basic-1_1_3.4.query.aql | 38 +
.../fuzzyjoin/basic-1_2_1/basic-1_2_1.1.ddl.aql | 31 +
.../basic-1_2_1/basic-1_2_1.2.update.aql | 27 +
.../basic-1_2_1/basic-1_2_1.3.query.aql | 123 +
.../basic-1_2_1/basic-1_2_1.4.query.aql | 51 +
.../basic-1_2_1/basic-1_2_1.5.query.aql | 63 +
.../basic-1_2_1/basic-1_2_1.6.query.aql | 60 +
.../basic-1_2_1/basic-1_2_1.7.query.aql | 46 +
.../fuzzyjoin/basic-1_2_2/basic-1_2_2.1.ddl.aql | 31 +
.../basic-1_2_2/basic-1_2_2.2.update.aql | 27 +
.../basic-1_2_2/basic-1_2_2.3.query.aql | 77 +
.../basic-1_2_2/basic-1_2_2.4.query.aql | 78 +
.../basic-1_2_2/basic-1_2_2.5.query.aql | 79 +
.../basic-1_2_2/basic-1_2_2.6.query.aql | 77 +
.../fuzzyjoin/basic-1_2_3/basic-1_2_3.1.ddl.aql | 31 +
.../basic-1_2_3/basic-1_2_3.2.update.aql | 27 +
.../basic-1_2_3/basic-1_2_3.3.query.aql | 89 +
.../fuzzyjoin/basic-1_2_4/basic-1_2_4.1.ddl.aql | 31 +
.../basic-1_2_4/basic-1_2_4.2.update.aql | 27 +
.../basic-1_2_4/basic-1_2_4.3.query.aql | 76 +
.../fuzzyjoin/basic-1_2_5/basic-1_2_5.1.ddl.aql | 31 +
.../basic-1_2_5/basic-1_2_5.2.update.aql | 27 +
.../basic-1_2_5/basic-1_2_5.3.query.aql | 37 +
.../fuzzyjoin/basic-1_2_6/basic-1_2_6.1.ddl.aql | 33 +
.../basic-1_2_6/basic-1_2_6.2.update.aql | 23 +
.../basic-1_2_6/basic-1_2_6.3.query.aql | 39 +
.../fuzzyjoin/basic-1_2_7/basic-1_2_7.1.ddl.aql | 35 +
.../basic-1_2_7/basic-1_2_7.2.update.aql | 27 +
.../basic-1_2_7/basic-1_2_7.3.query.aql | 89 +
.../fuzzyjoin/basic-1_3_1/basic-1_3_1.1.ddl.aql | 31 +
.../basic-1_3_1/basic-1_3_1.2.update.aql | 27 +
.../basic-1_3_1/basic-1_3_1.3.query.aql | 29 +
.../basic-1_3_1/basic-1_3_1.4.query.aql | 40 +
.../basic-1_3_1/basic-1_3_1.5.query.aql | 73 +
.../basic-1_3_1/basic-1_3_1.6.query.aql | 93 +
.../dblp-csx-2_5.2/dblp-csx-2_5.2.3.query.aql | 2 -
.../dblp-csx-4.1.1/word-jaccard.1.ddl.aql | 58 +
.../dblp-csx-4.1.1/word-jaccard.2.update.aql | 42 +
.../dblp-csx-4.1.1/word-jaccard.3.ddl.aql | 23 +
.../dblp-csx-4.1.1/word-jaccard.4.query.aql | 27 +
.../ngram-jaccard-inline.1.ddl.aql | 59 +
.../ngram-jaccard-inline.2.update.aql | 42 +
.../ngram-jaccard-inline.3.ddl.aql | 23 +
.../ngram-jaccard-inline.4.query.aql | 27 +
.../dblp-csx-4.2.1/word-jaccard.1.ddl.aql | 58 +
.../dblp-csx-4.2.1/word-jaccard.2.update.aql | 42 +
.../dblp-csx-4.2.1/word-jaccard.3.ddl.aql | 23 +
.../dblp-csx-4.2.1/word-jaccard.4.query.aql | 28 +
.../ngram-jaccard-inline.1.ddl.aql | 58 +
.../ngram-jaccard-inline.2.update.aql | 42 +
.../ngram-jaccard-inline.3.ddl.aql | 23 +
.../ngram-jaccard-inline.4.query.aql | 29 +
.../dblp-csx-4.3.1/word-jaccard.1.ddl.aql | 58 +
.../dblp-csx-4.3.1/word-jaccard.2.update.aql | 43 +
.../dblp-csx-4.3.1/word-jaccard.3.ddl.aql | 23 +
.../dblp-csx-4.3.1/word-jaccard.4.query.aql | 29 +
.../ngram-jaccard-inline.1.ddl.aql | 58 +
.../ngram-jaccard-inline.2.update.aql | 43 +
.../ngram-jaccard-inline.3.ddl.aql | 23 +
.../ngram-jaccard-inline.4.query.aql | 30 +
.../dblp-csx-4.4.1/word-jaccard.1.ddl.aql | 57 +
.../dblp-csx-4.4.1/word-jaccard.2.update.aql | 42 +
.../dblp-csx-4.4.1/word-jaccard.3.ddl.aql | 23 +
.../dblp-csx-4.4.1/word-jaccard.4.query.aql | 32 +
.../dblp-csx-4.4.2/word-jaccard.1.ddl.aql | 58 +
.../dblp-csx-4.4.2/word-jaccard.2.update.aql | 42 +
.../dblp-csx-4.4.2/word-jaccard.3.ddl.aql | 18 +
.../dblp-csx-4.4.2/word-jaccard.4.query.aql | 32 +
.../dblp-csx-aqlplus_6.1.ddl.aql | 53 +
.../dblp-csx-aqlplus_6.2.update.aql | 41 +
.../dblp-csx-aqlplus_6.3.query.aql | 26 +
.../dblp-csx-dblp-aqlplus_1.3.query.aql | 2 +-
.../results/fuzzyjoin/basic-1_1/basic-1_1.1.adm | 1 +
.../fuzzyjoin/basic-1_1_1/basic-1_1_1.1.adm | 13 +
.../fuzzyjoin/basic-1_1_2/basic-1_1_2.10.adm | 1 +
.../fuzzyjoin/basic-1_1_2/basic-1_1_2.3.adm | 1 +
.../fuzzyjoin/basic-1_1_2/basic-1_1_2.4.adm | 1 +
.../fuzzyjoin/basic-1_1_2/basic-1_1_2.5.adm | 1 +
.../fuzzyjoin/basic-1_1_2/basic-1_1_2.6.adm | 1 +
.../fuzzyjoin/basic-1_1_2/basic-1_1_2.7.adm | 1 +
.../fuzzyjoin/basic-1_1_2/basic-1_1_2.8.adm | 1 +
.../fuzzyjoin/basic-1_1_2/basic-1_1_2.9.adm | 1 +
.../fuzzyjoin/basic-1_1_3/basic-1_1_2.3.adm | 1 +
.../fuzzyjoin/basic-1_1_3/basic-1_1_2.4.adm | 1 +
.../fuzzyjoin/basic-1_2_1/basic-1_2_1.3.adm | 1 +
.../fuzzyjoin/basic-1_2_1/basic-1_2_1.4.adm | 4 +
.../fuzzyjoin/basic-1_2_1/basic-1_2_1.5.adm | 0
.../fuzzyjoin/basic-1_2_1/basic-1_2_1.6.adm | 196 ++
.../fuzzyjoin/basic-1_2_1/basic-1_2_1.7.adm | 196 ++
.../fuzzyjoin/basic-1_2_2/basic-1_2_2.3.adm | 1 +
.../fuzzyjoin/basic-1_2_2/basic-1_2_2.4.adm | 1 +
.../fuzzyjoin/basic-1_2_2/basic-1_2_2.5.adm | 1 +
.../fuzzyjoin/basic-1_2_2/basic-1_2_2.6.adm | 120 +
.../fuzzyjoin/basic-1_2_3/basic-1_2_3.3.adm | 0
.../fuzzyjoin/basic-1_2_4/basic-1_2_4.3.adm | 1 +
.../fuzzyjoin/basic-1_2_5/basic-1_2_5.3.adm | 1 +
.../fuzzyjoin/basic-1_2_6/basic-1_2_6.3.adm | 1 +
.../fuzzyjoin/basic-1_2_7/basic-1_2_7.3.adm | 0
.../fuzzyjoin/basic-1_3_1/basic-1_3_1.3.adm | 834 ++++++
.../fuzzyjoin/basic-1_3_1/basic-1_3_1.4.adm | 547 ++++
.../fuzzyjoin/basic-1_3_1/basic-1_3_1.5.adm | 1 +
.../fuzzyjoin/basic-1_3_1/basic-1_3_1.6.adm | 1 +
.../fuzzyjoin/dblp-csx-4.1.1/dblp-csx-4.1.1.adm | 7 +
.../fuzzyjoin/dblp-csx-4.1.2/dblp-csx-4.1.2.adm | 7 +
.../fuzzyjoin/dblp-csx-4.2.1/dblp-csx-4.2.1.adm | 2 +
.../fuzzyjoin/dblp-csx-4.2.2/dblp-csx-4.2.2.adm | 5 +
.../fuzzyjoin/dblp-csx-4.3.1/dblp-csx-4.3.1.adm | 2 +
.../fuzzyjoin/dblp-csx-4.3.2/dblp-csx-4.3.2.adm | 5 +
.../fuzzyjoin/dblp-csx-4.4.1/dblp-csx-4.4.1.adm | 5 +
.../fuzzyjoin/dblp-csx-4.4.2/dblp-csx-4.4.2.adm | 5 +
.../dblp-csx-aqlplus_6/dblp-csx-aqlplus_6.1.adm | 9 +
.../src/test/resources/runtimets/testsuite.xml | 107 +-
.../asterix/common/exceptions/ErrorCode.java | 1 +
.../main/resources/asx_errormsg/en.properties | 1 +
.../fuzzyjoin/similarity/SimilarityFilters.java | 21 +-
.../similarity/SimilarityFiltersJaccard.java | 141 +-
.../SimilarityJaccardPrefixEvaluator.java | 22 +-
.../logical/visitors/IsomorphismUtilities.java | 64 +
261 files changed, 26850 insertions(+), 2387 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/compiler/provider/DefaultRuleSetFactory.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/compiler/provider/DefaultRuleSetFactory.java b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/compiler/provider/DefaultRuleSetFactory.java
index 2db5d9a..eb41204 100644
--- a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/compiler/provider/DefaultRuleSetFactory.java
+++ b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/compiler/provider/DefaultRuleSetFactory.java
@@ -25,6 +25,7 @@ import org.apache.asterix.common.dataflow.ICcApplicationContext;
import org.apache.asterix.optimizer.base.RuleCollections;
import org.apache.hyracks.algebricks.common.exceptions.AlgebricksException;
import org.apache.hyracks.algebricks.common.utils.Pair;
+import org.apache.hyracks.algebricks.compiler.rewriter.rulecontrollers.SequentialFirstRuleCheckFixpointRuleController;
import org.apache.hyracks.algebricks.compiler.rewriter.rulecontrollers.SequentialFixpointRuleController;
import org.apache.hyracks.algebricks.compiler.rewriter.rulecontrollers.SequentialOnceRuleController;
import org.apache.hyracks.algebricks.core.rewriter.base.AbstractRuleController;
@@ -50,6 +51,8 @@ public class DefaultRuleSetFactory implements IRuleSetFactory {
SequentialFixpointRuleController seqCtrlNoDfs = new SequentialFixpointRuleController(false);
SequentialFixpointRuleController seqCtrlFullDfs = new SequentialFixpointRuleController(true);
SequentialOnceRuleController seqOnceCtrl = new SequentialOnceRuleController(true);
+ SequentialFirstRuleCheckFixpointRuleController seqFirstRuleGateKeeperDfs =
+ new SequentialFirstRuleCheckFixpointRuleController(true);
defaultLogicalRewrites.add(new Pair<>(seqOnceCtrl, RuleCollections.buildInitialTranslationRuleCollection()));
defaultLogicalRewrites.add(new Pair<>(seqOnceCtrl, RuleCollections.buildTypeInferenceRuleCollection()));
defaultLogicalRewrites.add(new Pair<>(seqOnceCtrl, RuleCollections.buildAutogenerateIDRuleCollection()));
@@ -58,9 +61,8 @@ public class DefaultRuleSetFactory implements IRuleSetFactory {
defaultLogicalRewrites
.add(new Pair<>(seqCtrlNoDfs, RuleCollections.buildCondPushDownAndJoinInferenceRuleCollection()));
defaultLogicalRewrites.add(new Pair<>(seqCtrlFullDfs, RuleCollections.buildLoadFieldsRuleCollection(appCtx)));
- // fj
- defaultLogicalRewrites.add(new Pair<>(seqCtrlFullDfs, RuleCollections.buildFuzzyJoinRuleCollection()));
- //
+ defaultLogicalRewrites
+ .add(new Pair<>(seqFirstRuleGateKeeperDfs, RuleCollections.buildFuzzyJoinRuleCollection()));
defaultLogicalRewrites
.add(new Pair<>(seqCtrlFullDfs, RuleCollections.buildNormalizationRuleCollection(appCtx)));
defaultLogicalRewrites
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/FuzzyUtils.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/FuzzyUtils.java b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/FuzzyUtils.java
index d7e05b3..1e64c91 100644
--- a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/FuzzyUtils.java
+++ b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/FuzzyUtils.java
@@ -52,6 +52,7 @@ public class FuzzyUtils {
return BuiltinFunctions.COUNTHASHED_WORD_TOKENS;
case MULTISET:
case ARRAY:
+ case UNION:
case ANY:
return null;
default:
@@ -119,4 +120,15 @@ public class FuzzyUtils {
simFunction = simFunction.toLowerCase();
return simFunction;
}
+
+ public static String getSimFunction(FunctionIdentifier simFuncId) {
+ if (simFuncId.equals(BuiltinFunctions.SIMILARITY_JACCARD)
+ || simFuncId.equals(BuiltinFunctions.SIMILARITY_JACCARD_CHECK)) {
+ return JACCARD_FUNCTION_NAME;
+ } else if (simFuncId.equals(BuiltinFunctions.EDIT_DISTANCE)
+ || simFuncId.equals(BuiltinFunctions.EDIT_DISTANCE_CHECK)) {
+ return EDIT_DISTANCE_FUNCTION_NAME;
+ }
+ return null;
+ }
}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/RuleCollections.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/RuleCollections.java b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/RuleCollections.java
index cf01573..3c981d4 100644
--- a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/RuleCollections.java
+++ b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/base/RuleCollections.java
@@ -41,6 +41,7 @@ import org.apache.asterix.optimizer.rules.FeedScanCollectionToUnnest;
import org.apache.asterix.optimizer.rules.FixReplicateOperatorOutputsRule;
import org.apache.asterix.optimizer.rules.FullTextContainsParameterCheckRule;
import org.apache.asterix.optimizer.rules.FuzzyEqRule;
+import org.apache.asterix.optimizer.rules.FuzzyJoinRule;
import org.apache.asterix.optimizer.rules.InjectTypeCastForFunctionArgumentsRule;
import org.apache.asterix.optimizer.rules.InjectTypeCastForUnionRule;
import org.apache.asterix.optimizer.rules.InlineUnnestFunctionRule;
@@ -259,8 +260,15 @@ public final class RuleCollections {
public static final List<IAlgebraicRewriteRule> buildFuzzyJoinRuleCollection() {
List<IAlgebraicRewriteRule> fuzzy = new LinkedList<>();
- // fuzzy.add(new FuzzyJoinRule()); -- The non-indexed fuzzy join will be temporarily disabled. It should be enabled some time in the near future.
- fuzzy.add(new InferTypesRule());
+ fuzzy.add(new FuzzyJoinRule());
+ fuzzy.add(new ExtractCommonExpressionsRule());
+ fuzzy.add(new NestedSubplanToJoinRule());
+ fuzzy.add(new PushSelectIntoJoinRule());
+ fuzzy.add(new RemoveUnusedAssignAndAggregateRule());
+ fuzzy.add(new InlineSubplanInputForNestedTupleSourceRule());
+ fuzzy.add(new RemoveRedundantVariablesRule());
+ fuzzy.add(new AsterixInlineVariablesRule());
+ fuzzy.add(new RemoveUnusedAssignAndAggregateRule());
return fuzzy;
}
http://git-wip-us.apache.org/repos/asf/asterixdb/blob/d906bd89/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/FuzzyJoinRule.java
----------------------------------------------------------------------
diff --git a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/FuzzyJoinRule.java b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/FuzzyJoinRule.java
index d814ba0..dabfb82 100644
--- a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/FuzzyJoinRule.java
+++ b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/FuzzyJoinRule.java
@@ -19,6 +19,7 @@
package org.apache.asterix.optimizer.rules;
import java.io.StringReader;
+import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
@@ -26,11 +27,17 @@ import java.util.Locale;
import org.apache.asterix.aqlplus.parser.AQLPlusParser;
import org.apache.asterix.aqlplus.parser.ParseException;
+import org.apache.asterix.common.exceptions.CompilationException;
+import org.apache.asterix.common.exceptions.ErrorCode;
import org.apache.asterix.lang.common.base.Clause;
-import org.apache.asterix.lang.common.struct.Identifier;
+import org.apache.asterix.lang.common.struct.VarIdentifier;
+import org.apache.asterix.lang.common.util.FunctionUtil;
import org.apache.asterix.metadata.declared.MetadataProvider;
+import org.apache.asterix.om.base.AFloat;
+import org.apache.asterix.om.constants.AsterixConstantValue;
import org.apache.asterix.om.functions.BuiltinFunctions;
import org.apache.asterix.om.typecomputer.impl.TypeComputeUtils;
+import org.apache.asterix.om.types.BuiltinType;
import org.apache.asterix.om.types.IAType;
import org.apache.asterix.optimizer.base.FuzzyUtils;
import org.apache.asterix.translator.AqlPlusExpressionToPlanTranslator;
@@ -48,6 +55,7 @@ import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
import org.apache.hyracks.algebricks.core.algebra.expressions.AbstractFunctionCallExpression;
import org.apache.hyracks.algebricks.core.algebra.expressions.ConstantExpression;
import org.apache.hyracks.algebricks.core.algebra.expressions.IndexedNLJoinExpressionAnnotation;
+import org.apache.hyracks.algebricks.core.algebra.expressions.ScalarFunctionCallExpression;
import org.apache.hyracks.algebricks.core.algebra.expressions.VariableReferenceExpression;
import org.apache.hyracks.algebricks.core.algebra.functions.AlgebricksBuiltinFunctions;
import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
@@ -55,6 +63,7 @@ import org.apache.hyracks.algebricks.core.algebra.operators.logical.AbstractBina
import org.apache.hyracks.algebricks.core.algebra.operators.logical.AbstractLogicalOperator;
import org.apache.hyracks.algebricks.core.algebra.operators.logical.LeftOuterJoinOperator;
import org.apache.hyracks.algebricks.core.algebra.operators.logical.SelectOperator;
+import org.apache.hyracks.algebricks.core.algebra.operators.logical.visitors.IsomorphismUtilities;
import org.apache.hyracks.algebricks.core.algebra.operators.logical.visitors.LogicalOperatorDeepCopyWithNewVariablesVisitor;
import org.apache.hyracks.algebricks.core.algebra.operators.logical.visitors.VariableUtilities;
import org.apache.hyracks.algebricks.core.algebra.util.OperatorPropertiesUtil;
@@ -62,118 +71,132 @@ import org.apache.hyracks.algebricks.core.rewriter.base.IAlgebraicRewriteRule;
public class FuzzyJoinRule implements IAlgebraicRewriteRule {
- private static HashSet<FunctionIdentifier> simFuncs = new HashSet<FunctionIdentifier>();
+ private static HashSet<FunctionIdentifier> simFuncs = new HashSet<>();
static {
simFuncs.add(BuiltinFunctions.SIMILARITY_JACCARD_CHECK);
}
+ private final List<Collection<LogicalVariable>> previousPKs = new ArrayList<>();
+
+ // Please correspond the value to the anchors $$RIGHT_##_0 and ##RIGHT_3 in AQLPLUS.
+ private static final int SUBSET_RIGHT_INDEX = 1; // Corresponds to $$RIGHT_1_0 and ##RIGHT_1
+ // private static final int LENGTH_LEFT_INDEX = 2; // Corresponds to $$RIGHT_2_0 and ##RIGHT_2
+ private static final int LENGTH_RIGHT_INDEX = 3; // Corresponds to $$RIGHT_3_0 and ##RIGHT_3
+
+ // Step1: Initialize the host embedding aql to substitute the fuzzy equal condition such as $r.a ~= $s.b
private static final String AQLPLUS = ""
//
// -- - Stage 3 - --
//
- + "((#RIGHT), " + " (join((#LEFT), "
+ + "((##LEFT_0), " + " (join((##RIGHT_0), "
//
// -- -- - Stage 2 - --
//
- + " (" + " join( " + " ( " + " #LEFT_1 " + " let $tokensUnrankedLeft := %s($$LEFT_1) "
- + " let $lenLeft := len($tokensUnrankedLeft) " + " let $tokensLeft := "
- + " for $token in $tokensUnrankedLeft " + " for $tokenRanked at $i in "
+ + " (" + "join( " + "( " + "##RIGHT_1 " + " let $tokensUnrankedRight := %s($$RIGHT_1) "
+ + " let $lenRight := len($tokensUnrankedRight) " + " let $tokensRight := "
+ + " for $token in $tokensUnrankedRight " + "for $tokenRanked at $i in "
//
// -- -- -- - Stage 1 - --
- //
- // + " #LEFT_2 "
- // + " let $id := $$LEFTPK_2 "
- // + " for $token in %s($$LEFT_2) "
- + " #RIGHT_2 " + " let $id := $$RIGHTPK_2 " + " for $token in %s($$RIGHT_2) "
- + " /*+ hash */ " + " group by $tokenGroupped := $token with $id "
- + " /*+ inmem 34 198608 */ " + " order by count($id), $tokenGroupped "
- + " return $tokenGroupped "
+ // Since we use the right join branch R to generate our token order, this can shorten the prefix length of S
+ // in case of R ~= S, where some tokens are in S but not in R.
+ + " ##RIGHT_3 " + "let $id := $$RIGHTPK_3_0 " + "for $token in %s($$RIGHT_3) "
+ + " /*+ hash */ " + "group by $tokenGroupped := $token with $id "
+ + " order by count($id), $tokenGroupped return $tokenGroupped "
//
// -- -- -- -
//
- + " where $token = /*+ bcast */ $tokenRanked " + " order by $i " + " return $i "
- + " for $prefixTokenLeft in subset-collection($tokensLeft, 0, prefix-len-%s(len($tokensLeft), %ff)) "
- + " ),( " + " #RIGHT_1 " + " let $tokensUnrankedRight := %s($$RIGHT_1) "
- + " let $lenRight := len($tokensUnrankedRight) " + " let $tokensRight := "
- + " for $token in $tokensUnrankedRight " + " for $tokenRanked at $i in "
+ + " where $token = /*+ bcast */ $tokenRanked " + "order by $i " + "return $i "
+ + " for $prefixTokenRight in subset-collection($tokensRight, 0, prefix-len-%s(len($tokensRight), %ff)) "
+ + " ), " + "( " + "##LEFT_1 " + "let $tokensUnrankedLeft := %s($$LEFT_1) "
+ + " let $lenLeft := len($tokensUnrankedLeft) " + "let $tokensLeft := "
+ + " for $token in $tokensUnrankedLeft " + "for $tokenRanked at $i in "
//
// -- -- -- - Stage 1 - --
- //
- // + " #LEFT_3 "
- // + " let $id := $$LEFTPK_3 "
- // + " for $token in %s($$LEFT_3) "
- + " #RIGHT_3 " + " let $id := $$RIGHTPK_3 " + " for $token in %s($$RIGHT_3) "
- + " /*+ hash */ " + " group by $tokenGroupped := $token with $id "
- + " /*+ inmem 34 198608 */ " + " order by count($id), $tokenGroupped "
- + " return $tokenGroupped "
+ + " ##RIGHT_2 " + "let $id := $$RIGHTPK_2_0 " + "for $token in %s($$RIGHT_2) "
+ + " /*+ hash */ " + "group by $tokenGroupped := $token with $id "
+ + " order by count($id), $tokenGroupped return $tokenGroupped "
//
// -- -- -- -
//
- + " where $token = /*+ bcast */ $tokenRanked " + " order by $i " + " return $i "
- + " for $prefixTokenRight in subset-collection($tokensRight, 0, prefix-len-%s(len($tokensRight), %ff)) "
- + " ), $prefixTokenLeft = $prefixTokenRight) "
- + " let $sim := similarity-%s-prefix($lenLeft, $tokensLeft, $lenRight, $tokensRight, $prefixTokenLeft, %ff) "
- + " where $sim >= %ff " + " /*+ hash*/ "
- + " group by $idLeft := $$LEFTPK_1, $idRight := $$RIGHTPK_1 with $sim "
+ + " where $token = /*+ bcast */ $tokenRanked " + "order by $i " + "return $i "
+ // We use the input string $tokensUnrankedLeft instead of $tokensLeft to ensure it will not miss similar
+ // pairs when the prefix of S has been reduced in case of R ~= S, where some tokens are in S but not in R.
+ + " let $actualPreLen := prefix-len-%s(len($tokensUnrankedLeft), %ff) - $lenLeft + len($tokensLeft) "
+ + " for $prefixTokenLeft in subset-collection($tokensLeft, 0, $actualPreLen)) "
+ + " , $prefixTokenLeft = $prefixTokenRight) "
+ + "let $sim := similarity-%s-prefix($lenRight, $tokensRight, $lenLeft, $tokensLeft, $prefixTokenLeft, %ff) "
+ + "where $sim >= %ff " + "/*+ hash*/ " + "group by %s, %s with $sim "
//
// -- -- -
//
- + " ), $$LEFTPK = $idLeft)), $$RIGHTPK = $idRight)";
+ + " ), %s)), %s)";
- private Collection<LogicalVariable> liveVars = new HashSet<LogicalVariable>();
+ private static final String GROUPBY_LEFT = "$idLeft_%d := $$LEFTPK_1_%d";
+ private static final String GROUPBY_RIGHT = "$idRight_%d := $$RIGHTPK_1_%d";
+ private static final String JOIN_COND_LEFT = "$$LEFTPK_0_%d = $idLeft_%d";
+ private static final String JOIN_COND_RIGHT = "$$RIGHTPK_0_%d = $idRight_%d";
+ private static final String AQLPLUS_INNER_JOIN = "join";
+ private static final String AQLPLUS_LEFTOUTER_JOIN = "loj";
@Override
public boolean rewritePost(Mutable<ILogicalOperator> opRef, IOptimizationContext context)
throws AlgebricksException {
+
AbstractLogicalOperator op = (AbstractLogicalOperator) opRef.getValue();
- // current opperator is join
+ // current operator should be a join.
if (op.getOperatorTag() != LogicalOperatorTag.INNERJOIN
&& op.getOperatorTag() != LogicalOperatorTag.LEFTOUTERJOIN) {
return false;
}
- // Find GET_ITEM function.
+ // Finds GET_ITEM function in the join condition.
AbstractBinaryJoinOperator joinOp = (AbstractBinaryJoinOperator) op;
- Mutable<ILogicalExpression> expRef = joinOp.getCondition();
- Mutable<ILogicalExpression> getItemExprRef = getSimilarityExpression(expRef);
+ Mutable<ILogicalExpression> exprRef = joinOp.getCondition();
+ Mutable<ILogicalExpression> getItemExprRef = getSimilarityExpression(exprRef);
if (getItemExprRef == null) {
return false;
}
- // Check if the GET_ITEM function is on one of the supported similarity-check functions.
+ // Checks if the GET_ITEM function is on the one of the supported similarity-check functions.
AbstractFunctionCallExpression getItemFuncExpr = (AbstractFunctionCallExpression) getItemExprRef.getValue();
Mutable<ILogicalExpression> argRef = getItemFuncExpr.getArguments().get(0);
AbstractFunctionCallExpression simFuncExpr = (AbstractFunctionCallExpression) argRef.getValue();
if (!simFuncs.contains(simFuncExpr.getFunctionIdentifier())) {
return false;
}
- // Skip this rule based on annotations.
+ // Skips this rule based on annotations.
if (simFuncExpr.getAnnotations().containsKey(IndexedNLJoinExpressionAnnotation.INSTANCE)) {
return false;
}
+ // Gets both input branches of fuzzy join.
List<Mutable<ILogicalOperator>> inputOps = joinOp.getInputs();
ILogicalOperator leftInputOp = inputOps.get(0).getValue();
ILogicalOperator rightInputOp = inputOps.get(1).getValue();
- List<Mutable<ILogicalExpression>> inputExps = simFuncExpr.getArguments();
+ List<Mutable<ILogicalExpression>> inputExprs = simFuncExpr.getArguments();
+ if (inputExprs.size() != 3) {
+ return false;
+ }
- ILogicalExpression inputExp0 = inputExps.get(0).getValue();
- ILogicalExpression inputExp1 = inputExps.get(1).getValue();
+ // Extracts Fuzzy similarity function.
+ ILogicalExpression leftOperatingExpr = inputExprs.get(0).getValue();
+ ILogicalExpression rightOperatingExpr = inputExprs.get(1).getValue();
+ ILogicalExpression thresholdConstantExpr = inputExprs.get(2).getValue();
- // left and right expressions are variables
- if (inputExp0.getExpressionTag() != LogicalExpressionTag.VARIABLE
- || inputExp1.getExpressionTag() != LogicalExpressionTag.VARIABLE) {
+ // left and right expressions should be variables.
+ if (leftOperatingExpr.getExpressionTag() != LogicalExpressionTag.VARIABLE
+ || rightOperatingExpr.getExpressionTag() != LogicalExpressionTag.VARIABLE
+ || thresholdConstantExpr.getExpressionTag() != LogicalExpressionTag.CONSTANT) {
return false;
}
- LogicalVariable inputVar0 = ((VariableReferenceExpression) inputExp0).getVariableReference();
- LogicalVariable inputVar1 = ((VariableReferenceExpression) inputExp1).getVariableReference();
+ LogicalVariable inputVar0 = ((VariableReferenceExpression) leftOperatingExpr).getVariableReference();
+ LogicalVariable inputVar1 = ((VariableReferenceExpression) rightOperatingExpr).getVariableReference();
LogicalVariable leftInputVar;
LogicalVariable rightInputVar;
-
- liveVars.clear();
+ Collection<LogicalVariable> liveVars = new HashSet<>();
VariableUtilities.getLiveVariables(leftInputOp, liveVars);
if (liveVars.contains(inputVar0)) {
leftInputVar = inputVar0;
@@ -182,72 +205,217 @@ public class FuzzyJoinRule implements IAlgebraicRewriteRule {
leftInputVar = inputVar1;
rightInputVar = inputVar0;
}
-
- List<LogicalVariable> leftInputPKs = context.findPrimaryKey(leftInputVar);
- List<LogicalVariable> rightInputPKs = context.findPrimaryKey(rightInputVar);
- // Bail if primary keys could not be inferred.
- if (leftInputPKs == null || rightInputPKs == null) {
- return false;
- }
- // primary key has only one variable
- if (leftInputPKs.size() != 1 || rightInputPKs.size() != 1) {
- return false;
- }
+ // leftInputPKs in currrentPKs keeps all PKs derived from the left branch in the current similarity fuzzyjoin.
+ List<LogicalVariable> leftInputPKs = findPrimaryKeysInSubplan(liveVars, context);
+ liveVars.clear();
+ VariableUtilities.getLiveVariables(rightInputOp, liveVars);
+ // rightInputPKs in currentPKs keeps all PKs derived from the right branch in the current similarity fuzzyjoin.
+ List<LogicalVariable> rightInputPKs = findPrimaryKeysInSubplan(liveVars, context);
IAType leftType = (IAType) context.getOutputTypeEnvironment(leftInputOp).getVarType(leftInputVar);
- IAType rightType = (IAType) context.getOutputTypeEnvironment(rightInputOp).getVarType(rightInputVar);
- // left-hand side and right-hand side of "~=" has the same type
- IAType left2 = TypeComputeUtils.getActualType(leftType);
- IAType right2 = TypeComputeUtils.getActualType(rightType);
- if (!left2.deepEqual(right2)) {
+ if (!isPrefixFuzzyJoin(context, leftInputOp, rightInputOp, rightInputVar, leftInputPKs, rightInputPKs,
+ leftType)) {
return false;
}
+
//
// -- - FIRE - --
//
MetadataProvider metadataProvider = ((MetadataProvider) context.getMetadataProvider());
+ // Steps 1 and 2. Generate the prefix-based fuzzy jon template.
+ String aqlPlus = generateAqlTemplate(metadataProvider, joinOp, simFuncExpr, leftInputPKs, leftType,
+ rightInputPKs, thresholdConstantExpr);
+ // Steps 3 and 4. Generate the prefix-based fuzzy join subplan.
+ ILogicalOperator outputOp = generatePrefixFuzzyJoinSubplan(context, metadataProvider, aqlPlus, leftInputOp,
+ leftInputPKs, leftInputVar, rightInputOp, rightInputPKs, rightInputVar);
+
+ // Step 5. Bind the plan to the parent op referred by the following opRef.
+ SelectOperator extraSelect;
+ if (getItemExprRef != exprRef) {
+ // more than one join condition
+ getItemExprRef.setValue(ConstantExpression.TRUE);
+ switch (joinOp.getJoinKind()) {
+ case INNER: {
+ extraSelect = new SelectOperator(exprRef, false, null);
+ extraSelect.setSourceLocation(exprRef.getValue().getSourceLocation());
+ extraSelect.getInputs().add(new MutableObject<>(outputOp));
+ outputOp = extraSelect;
+ break;
+ }
+ case LEFT_OUTER: {
+ LeftOuterJoinOperator topJoin = (LeftOuterJoinOperator) outputOp;
+ setConditionForLeftOuterJoin(topJoin, exprRef);
+ break;
+ }
+ default: {
+ throw new IllegalStateException();
+ }
+ }
+ }
+ opRef.setValue(outputOp);
+ OperatorPropertiesUtil.typeOpRec(opRef, context);
+ return true;
+ }
+
+ /**
+ * To handle multiple fuzzy-join conditions on a same pair of datasets, this rule checks the PKs in bottom-up way.
+ * The previousPKs list incrementally maintains the PKs from a previous fuzzy-join operator's input branches.
+ * In addition, the given fuzzy-join operator has been successfully translated into a prefix-based fuzzy join
+ * sub-plan of the current fuzzy-join operator. There are two cases:
+ * 1. If the previousPKs list contains the currentPKs list (the PKs from the input branches of the current
+ * fuzzy-join operator), this means that the current fuzzy-join condition has no new input branch. This case
+ * SHOULD BE regarded as a SELECT over one of the previous fuzzy-joins.
+ * 2. Otherwise, we can apply this rule to the current fuzzy-join operator to a new prefix-based fuzzy-join plan.
+ */
+ private boolean isPrefixFuzzyJoin(IOptimizationContext context, ILogicalOperator leftInputOp,
+ ILogicalOperator rightInputOp, LogicalVariable rightInputVar, List<LogicalVariable> leftInputPKs,
+ List<LogicalVariable> rightInputPKs, IAType leftType) throws AlgebricksException {
+ Collection<LogicalVariable> currentPKs = new HashSet<>();
+ currentPKs.addAll(leftInputPKs);
+ currentPKs.addAll(rightInputPKs);
+ // If PKs derived from the both branches are SAME as that of a previous fuzzyjoin, we treat this fuzzyjoin
+ // condition as a select over that fuzzyjoin.
+ for (int i = 0; i < previousPKs.size(); i++) {
+ if (previousPKs.get(i).containsAll(currentPKs) && currentPKs.containsAll(previousPKs.get(i))) {
+ return false;
+ }
+ }
+
+ //Suppose we want to query on the same dataset on the different fields, i.e. A.a1 ~= B.b1 AND A.a2 ~= B.b2
+ //We conduct this query as a select over a fuzzyjoin based on the PK inclusion relationship.
+ previousPKs.add(currentPKs);
+ // Avoids the duplicated PK generation in findPrimaryKeysInSubplan, especially for multiway fuzzy join.
+ // Refer to fj-dblp-csx-hybrid.aql in the optimized tests for example.
+ IsomorphismUtilities.mergeHomogeneousPK(leftInputOp, leftInputPKs);
+ // Fails if primary keys could not be inferred.
+ if (leftInputPKs.isEmpty() || rightInputPKs.isEmpty()) {
+ return false;
+ }
+
+ IAType rightType = (IAType) context.getOutputTypeEnvironment(rightInputOp).getVarType(rightInputVar);
+ // left-hand side and right-hand side of fuzzyjoin should be the same type
+ IAType actualLeftType = TypeComputeUtils.getActualType(leftType);
+ IAType actualRightType = TypeComputeUtils.getActualType(rightType);
+ if (!actualLeftType.deepEqual(actualRightType)) {
+ return false;
+ }
+ return true;
+ }
+
+ private String generateAqlTemplate(MetadataProvider metadataProvider, AbstractBinaryJoinOperator joinOp,
+ AbstractFunctionCallExpression simFuncExpr, List<LogicalVariable> leftInputPKs, IAType leftType,
+ List<LogicalVariable> rightInputPKs, ILogicalExpression thresholdConstantExpr) throws AlgebricksException {
FunctionIdentifier funcId = FuzzyUtils.getTokenizer(leftType.getTypeTag());
- String tokenizer;
- if (funcId == null) {
- tokenizer = "";
- } else {
+ String tokenizer = "";
+ if (funcId != null) {
tokenizer = funcId.getName();
}
- float simThreshold = FuzzyUtils.getSimThreshold(metadataProvider);
- String simFunction = FuzzyUtils.getSimFunction(metadataProvider);
+ String simFunction = FuzzyUtils.getSimFunction(simFuncExpr.getFunctionIdentifier());
+ float simThreshold;
+ ConstantExpression constExpr = (ConstantExpression) thresholdConstantExpr;
+ AsterixConstantValue constVal = (AsterixConstantValue) constExpr.getValue();
+ if (constVal.getObject().getType().equals(BuiltinType.AFLOAT)) {
+ simThreshold = ((AFloat) constVal.getObject()).getFloatValue();
+ } else {
+ simThreshold = FuzzyUtils.getSimThreshold(metadataProvider);
+ }
// finalize AQL+ query
String prepareJoin;
switch (joinOp.getJoinKind()) {
- case INNER: {
- prepareJoin = "join" + AQLPLUS;
+ case INNER:
+ prepareJoin = AQLPLUS_INNER_JOIN + AQLPLUS;
break;
+ case LEFT_OUTER:
+ prepareJoin = AQLPLUS_LEFTOUTER_JOIN + AQLPLUS;
+ break;
+ default:
+ throw new CompilationException(ErrorCode.COMPILATION_INVALID_EXPRESSION);
+ }
+ String groupByLeft = "";
+ String joinCondLeft = "";
+ // Step2. By default, we triggered the prefix fuzzy join strategy, which needs to initialize the mapping from
+ // the shared token to the actual tuples of the both sides. left(right)InputPKs is used to extract those
+ // mappings from prefix tokens to tuples.
+ for (int i = 0; i < leftInputPKs.size(); i++) {
+ if (i > 0) {
+ groupByLeft += ", ";
+ joinCondLeft += " and ";
}
- case LEFT_OUTER: {
- // TODO To make it work for Left Outer Joins, we should permute
- // the #LEFT and #RIGHT at the top of the AQL+ query. But, when
- // doing this, the
- // fuzzyjoin/user-vis-int-vis-user-lot-aqlplus_1.aql (the one
- // doing 3-way fuzzy joins) gives a different result. But even
- // if we don't change the FuzzyJoinRule, permuting the for
- // clauses in fuzzyjoin/user-vis-int-vis-user-lot-aqlplus_1.aql
- // leads to different results, which suggests there is some
- // other sort of bug.
- return false;
- // prepareJoin = "loj" + AQLPLUS;
- // break;
- }
- default: {
- throw new IllegalStateException();
- }
+ groupByLeft += String.format(Locale.US, GROUPBY_LEFT, i, i);
+ joinCondLeft += String.format(Locale.US, JOIN_COND_LEFT, i, i);
}
- String aqlPlus = String.format(Locale.US, prepareJoin, tokenizer, tokenizer, simFunction, simThreshold,
- tokenizer, tokenizer, simFunction, simThreshold, simFunction, simThreshold, simThreshold);
- LogicalVariable leftPKVar = leftInputPKs.get(0);
- LogicalVariable rightPKVar = rightInputPKs.get(0);
+ String groupByRight = "";
+ String joinCondRight = "";
+ for (int i = 0; i < rightInputPKs.size(); i++) {
+ if (i > 0) {
+ groupByRight += ", ";
+ joinCondRight += " and ";
+ }
+ groupByRight += String.format(Locale.US, GROUPBY_RIGHT, i, i);
+ joinCondRight += String.format(Locale.US, JOIN_COND_RIGHT, i, i);
+ }
+ return String.format(Locale.US, prepareJoin, tokenizer, tokenizer, simFunction, simThreshold, tokenizer,
+ tokenizer, simFunction, simThreshold, simFunction, simThreshold, simThreshold, groupByLeft,
+ groupByRight, joinCondRight, joinCondLeft);
+ }
+ private ILogicalOperator generatePrefixFuzzyJoinSubplan(IOptimizationContext context,
+ MetadataProvider metadataProvider, String aqlPlus, ILogicalOperator leftInputOp,
+ List<LogicalVariable> leftInputPKs, LogicalVariable leftInputVar, ILogicalOperator rightInputOp,
+ List<LogicalVariable> rightInputPKs, LogicalVariable rightInputVar) throws AlgebricksException {
+ // Step3. Translate the tokenizer, join condition and group by (shared token) as shown
+ // in the above AQLPLUS template.
Counter counter = new Counter(context.getVarCounter());
+ // The translator will compile metadata internally. Run this compilation
+ // under the same transaction id as the "outer" compilation.
+ AqlPlusExpressionToPlanTranslator translator = new AqlPlusExpressionToPlanTranslator(metadataProvider, counter);
+
+ LogicalOperatorDeepCopyWithNewVariablesVisitor copyVisitor =
+ new LogicalOperatorDeepCopyWithNewVariablesVisitor(context, context);
+
+ // Step3.1. Substitute the variable references of the above AQLPLUS template with
+ // the actually attached variables.
+ translator.addOperatorToMetaScope(new VarIdentifier("##LEFT_0"), leftInputOp);
+ translator.addVariableToMetaScope(new VarIdentifier("$$LEFT_0"), leftInputVar);
+ for (int i = 0; i < leftInputPKs.size(); i++) {
+ translator.addVariableToMetaScope(new VarIdentifier("$$LEFTPK_0_" + i), leftInputPKs.get(i));
+ }
+
+ // Step3.2. right side again.
+ translator.addOperatorToMetaScope(new VarIdentifier("##RIGHT_0"), rightInputOp);
+ translator.addVariableToMetaScope(new VarIdentifier("$$RIGHT_0"), rightInputVar);
+ for (int i = 0; i < rightInputPKs.size(); i++) {
+ translator.addVariableToMetaScope(new VarIdentifier("$$RIGHTPK_0_" + i), rightInputPKs.get(i));
+ }
+
+ // Step3.3. the suffix 0-3 is used for identifying the different level of variable references.
+ ILogicalOperator leftInputOpCopy = copyVisitor.deepCopy(leftInputOp);
+ translator.addOperatorToMetaScope(new VarIdentifier("##LEFT_1"), leftInputOpCopy);
+ LogicalVariable leftInputVarCopy = copyVisitor.varCopy(leftInputVar);
+ translator.addVariableToMetaScope(new VarIdentifier("$$LEFT_1"), leftInputVarCopy);
+ for (int i = 0; i < leftInputPKs.size(); i++) {
+ leftInputVarCopy = copyVisitor.varCopy(leftInputPKs.get(i));
+ translator.addVariableToMetaScope(new VarIdentifier("$$LEFTPK_1_" + i), leftInputVarCopy);
+ }
+ copyVisitor.updatePrimaryKeys(context);
+ copyVisitor.reset();
+
+ // Notice: pick side to run Stage 1, currently always picks RIGHT side. It means that the right side will
+ // produce the token order as well as its own token list.
+ for (int i = SUBSET_RIGHT_INDEX; i <= LENGTH_RIGHT_INDEX; i++) {
+ translator.addOperatorToMetaScope(new VarIdentifier("##RIGHT_" + i), copyVisitor.deepCopy(rightInputOp));
+ LogicalVariable rightInputVarCopy = copyVisitor.varCopy(rightInputVar);
+ translator.addVariableToMetaScope(new VarIdentifier("$$RIGHT_" + i), rightInputVarCopy);
+ for (int j = 0; j < rightInputPKs.size(); j++) {
+ rightInputVarCopy = copyVisitor.varCopy(rightInputPKs.get(j));
+ translator.addVariableToMetaScope(new VarIdentifier("$$RIGHTPK_" + i + "_" + j), rightInputVarCopy);
+ }
+ copyVisitor.updatePrimaryKeys(context);
+ copyVisitor.reset();
+ }
+ counter.set(context.getVarCounter());
AQLPlusParser parser = new AQLPlusParser(new StringReader(aqlPlus));
parser.initScope();
@@ -256,115 +424,59 @@ public class FuzzyJoinRule implements IAlgebraicRewriteRule {
try {
clauses = parser.Clauses();
} catch (ParseException e) {
- throw new AlgebricksException(e);
+ throw CompilationException.create(ErrorCode.COMPILATION_TRANSLATION_ERROR, e);
}
- // The translator will compile metadata internally. Run this compilation
- // under the same transaction id as the "outer" compilation.
- AqlPlusExpressionToPlanTranslator translator = new AqlPlusExpressionToPlanTranslator(metadataProvider, counter);
- context.setVarCounter(counter.get());
-
- LogicalOperatorDeepCopyWithNewVariablesVisitor deepCopyVisitor =
- new LogicalOperatorDeepCopyWithNewVariablesVisitor(context, context);
- translator.addOperatorToMetaScope(new Identifier("#LEFT"), leftInputOp);
- translator.addVariableToMetaScope(new Identifier("$$LEFT"), leftInputVar);
- translator.addVariableToMetaScope(new Identifier("$$LEFTPK"), leftPKVar);
-
- translator.addOperatorToMetaScope(new Identifier("#RIGHT"), rightInputOp);
- translator.addVariableToMetaScope(new Identifier("$$RIGHT"), rightInputVar);
- translator.addVariableToMetaScope(new Identifier("$$RIGHTPK"), rightPKVar);
-
- translator.addOperatorToMetaScope(new Identifier("#LEFT_1"), deepCopyVisitor.deepCopy(leftInputOp));
- translator.addVariableToMetaScope(new Identifier("$$LEFT_1"), deepCopyVisitor.varCopy(leftInputVar));
- translator.addVariableToMetaScope(new Identifier("$$LEFTPK_1"), deepCopyVisitor.varCopy(leftPKVar));
- deepCopyVisitor.updatePrimaryKeys(context);
- deepCopyVisitor.reset();
-
- // translator.addOperatorToMetaScope(new Identifier("#LEFT_2"),
- // deepCopyVisitor.deepCopy(leftInputOp, null));
- // translator.addVariableToMetaScope(new Identifier("$$LEFT_2"),
- // deepCopyVisitor.varCopy(leftInputVar));
- // translator.addVariableToMetaScope(new Identifier("$$LEFTPK_2"),
- // deepCopyVisitor.varCopy(leftPKVar));
- // deepCopyVisitor.updatePrimaryKeys(context);
- // deepCopyVisitor.reset();
- //
- // translator.addOperatorToMetaScope(new Identifier("#LEFT_3"),
- // deepCopyVisitor.deepCopy(leftInputOp, null));
- // translator.addVariableToMetaScope(new Identifier("$$LEFT_3"),
- // deepCopyVisitor.varCopy(leftInputVar));
- // translator.addVariableToMetaScope(new Identifier("$$LEFTPK_3"),
- // deepCopyVisitor.varCopy(leftPKVar));
- // deepCopyVisitor.updatePrimaryKeys(context);
- // deepCopyVisitor.reset();
-
- translator.addOperatorToMetaScope(new Identifier("#RIGHT_1"), deepCopyVisitor.deepCopy(rightInputOp));
- translator.addVariableToMetaScope(new Identifier("$$RIGHT_1"), deepCopyVisitor.varCopy(rightInputVar));
- translator.addVariableToMetaScope(new Identifier("$$RIGHTPK_1"), deepCopyVisitor.varCopy(rightPKVar));
- deepCopyVisitor.updatePrimaryKeys(context);
- deepCopyVisitor.reset();
-
- // TODO pick side to run Stage 1, currently always picks RIGHT side
- translator.addOperatorToMetaScope(new Identifier("#RIGHT_2"), deepCopyVisitor.deepCopy(rightInputOp));
- translator.addVariableToMetaScope(new Identifier("$$RIGHT_2"), deepCopyVisitor.varCopy(rightInputVar));
- translator.addVariableToMetaScope(new Identifier("$$RIGHTPK_2"), deepCopyVisitor.varCopy(rightPKVar));
- deepCopyVisitor.updatePrimaryKeys(context);
- deepCopyVisitor.reset();
-
- translator.addOperatorToMetaScope(new Identifier("#RIGHT_3"), deepCopyVisitor.deepCopy(rightInputOp));
- translator.addVariableToMetaScope(new Identifier("$$RIGHT_3"), deepCopyVisitor.varCopy(rightInputVar));
- translator.addVariableToMetaScope(new Identifier("$$RIGHTPK_3"), deepCopyVisitor.varCopy(rightPKVar));
- deepCopyVisitor.updatePrimaryKeys(context);
- deepCopyVisitor.reset();
-
- ILogicalPlan plan = translator.translate(clauses);
+ // Step 4. The essential substitution with translator.
+ ILogicalPlan plan;
+ try {
+ plan = translator.translate(clauses);
+ } catch (CompilationException e) {
+ throw CompilationException.create(ErrorCode.COMPILATION_TRANSLATION_ERROR, e);
+ }
context.setVarCounter(counter.get());
- ILogicalOperator outputOp = plan.getRoots().get(0).getValue();
+ return plan.getRoots().get(0).getValue();
+ }
- SelectOperator extraSelect = null;
- if (getItemExprRef != expRef) {
- // more than one join condition
- getItemExprRef.setValue(ConstantExpression.TRUE);
- switch (joinOp.getJoinKind()) {
- case INNER: {
- extraSelect = new SelectOperator(expRef, false, null);
- extraSelect.setSourceLocation(expRef.getValue().getSourceLocation());
- extraSelect.getInputs().add(new MutableObject<ILogicalOperator>(outputOp));
- outputOp = extraSelect;
- break;
- }
- case LEFT_OUTER: {
- if (outputOp.getOperatorTag() != LogicalOperatorTag.LEFTOUTERJOIN) {
- throw new IllegalStateException();
- }
- LeftOuterJoinOperator topJoin = (LeftOuterJoinOperator) outputOp;
- topJoin.getCondition().setValue(expRef.getValue());
- break;
- }
- default: {
- throw new IllegalStateException();
- }
- }
+ // Since the generatePrefixFuzzyJoinSubplan generates the prefix-based join operators for the partial simJoin
+ // of expRef, we need to add the full condition expRef\getItemExprRef into the top-level operator of the plan.
+ // Notice: Any composite select on leftOuterJoin with fuzzyjoin condition inlined can be regarded as its example.
+ // Example: leftouterjoin-probe-pidx-with-join-edit-distance-check-idx_01.aql or with more extra conditions inlined.
+ private void setConditionForLeftOuterJoin(LeftOuterJoinOperator topJoin, Mutable<ILogicalExpression> expRef) {
+ // Combine the conditions of top join of aqlplus plan and the original join
+ AbstractFunctionCallExpression andFunc =
+ new ScalarFunctionCallExpression(FunctionUtil.getFunctionInfo(AlgebricksBuiltinFunctions.AND));
+
+ List<Mutable<ILogicalExpression>> conjs = new ArrayList<>();
+ if (topJoin.getCondition().getValue().splitIntoConjuncts(conjs)) {
+ andFunc.getArguments().addAll(conjs);
+ } else {
+ andFunc.getArguments().add(new MutableObject<>(topJoin.getCondition().getValue()));
}
- opRef.setValue(outputOp);
- OperatorPropertiesUtil.typeOpRec(opRef, context);
- return true;
+
+ List<Mutable<ILogicalExpression>> conjs2 = new ArrayList<>();
+ if (expRef.getValue().splitIntoConjuncts(conjs2)) {
+ andFunc.getArguments().addAll(conjs2);
+ } else {
+ andFunc.getArguments().add(expRef);
+ }
+ topJoin.getCondition().setValue(andFunc);
}
/**
* Look for GET_ITEM function call.
*/
- private Mutable<ILogicalExpression> getSimilarityExpression(Mutable<ILogicalExpression> expRef) {
- ILogicalExpression exp = expRef.getValue();
+ private Mutable<ILogicalExpression> getSimilarityExpression(Mutable<ILogicalExpression> exprRef) {
+ ILogicalExpression exp = exprRef.getValue();
if (exp.getExpressionTag() == LogicalExpressionTag.FUNCTION_CALL) {
AbstractFunctionCallExpression funcExpr = (AbstractFunctionCallExpression) exp;
if (funcExpr.getFunctionIdentifier().equals(BuiltinFunctions.GET_ITEM)) {
- return expRef;
+ return exprRef;
}
if (funcExpr.getFunctionIdentifier().equals(AlgebricksBuiltinFunctions.AND)) {
- for (int i = 0; i < 2; i++) {
- Mutable<ILogicalExpression> expRefRet = getSimilarityExpression(funcExpr.getArguments().get(i));
+ for (Mutable<ILogicalExpression> arg : funcExpr.getArguments()) {
+ Mutable<ILogicalExpression> expRefRet = getSimilarityExpression(arg);
if (expRefRet != null) {
return expRefRet;
}
@@ -374,9 +486,19 @@ public class FuzzyJoinRule implements IAlgebraicRewriteRule {
return null;
}
- @Override
- public boolean rewritePre(Mutable<ILogicalOperator> opRef, IOptimizationContext context)
- throws AlgebricksException {
- return false;
+ // To extract all the PKs of the liveVars referenced by on of the fuzzyjoin branch branch.
+ private List<LogicalVariable> findPrimaryKeysInSubplan(Collection<LogicalVariable> liveVars,
+ IOptimizationContext context) {
+ Collection<LogicalVariable> primaryKeys = new HashSet<>();
+ for (LogicalVariable var : liveVars) {
+ List<LogicalVariable> pks = context.findPrimaryKey(var);
+ if (pks != null) {
+ primaryKeys.addAll(pks);
+ }
+ }
+ if (primaryKeys.isEmpty()) {
+ return new ArrayList<>();
+ }
+ return new ArrayList<>(primaryKeys);
}
}