You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@asterixdb.apache.org by "Wenhai (JIRA)" <ji...@apache.org> on 2017/03/23 15:34:42 UTC
[jira] [Created] (ASTERIXDB-1852) Nested function poses wrong
nested loop plan.
Wenhai created ASTERIXDB-1852:
---------------------------------
Summary: Nested function poses wrong nested loop plan.
Key: ASTERIXDB-1852
URL: https://issues.apache.org/jira/browse/ASTERIXDB-1852
Project: Apache AsterixDB
Issue Type: Bug
Components: AsterixDB
Environment: MAC Linux
Reporter: Wenhai
Assignee: Wenhai
In a binary join query, the optimizer should generate a hash join plan. Due to the two functions has been used before the join condition, we get a wrong plan that employs the nested loop join.
Schema
{noformat}
drop dataverse test if exists;
create dataverse test;
use dataverse test;
create type DBLPType as
closed {
id : bigint,
dblpid : string,
title : string,
authors : string,
misc : string
}
create type CSXType as
closed {
id : bigint,
csxid : string,
title : string,
authors : string,
misc : string
}
create dataset DBLP(DBLPType) primary key id;
create dataset CSX(CSXType) primary key id;
{noformat}
One function-nested query
{noformat}
use dataverse test
set import-private-functions 'true'
for $paperDBLP in dataset('DBLP')
let $idDBLP := $paperDBLP.id
let $unrankedTokensDBLP := word-tokens($paperDBLP.title)
for $prefixTokenDBLP in subset-collection($unrankedTokensDBLP, 0, len($unrankedTokensDBLP))
for $paperCSX in dataset('CSX')
let $idCSX := $paperCSX.id
let $unrankedTokensCSX := word-tokens($paperCSX.title)
for $prefixTokenCSX in subset-collection($unrankedTokensCSX, 0, len($unrankedTokensCSX))
where $prefixTokenDBLP = $prefixTokenCSX
let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX, .8f)
where $sim[1] >= .8f
/*+ hash */
group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
{noformat}
will generate the hash-based join plan
{noformat}
distribute result [%0->$$29]
-- DISTRIBUTE_RESULT |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
project ([$$29])
-- STREAM_PROJECT |PARTITIONED|
assign [$$29] <- [function-call: asterix:closed-object-constructor, Args:[AString: {idDBLP}, %0->$$9, AString: {idCSX}, %0->$$10]]
-- ASSIGN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
group by ([$$9 := %0->$$30; $$10 := %0->$$31]) decor ([]) {
aggregate [] <- []
-- AGGREGATE |LOCAL|
nested tuple source
-- NESTED_TUPLE_SOURCE |LOCAL|
}
-- EXTERNAL_GROUP_BY[$$30, $$31] |PARTITIONED|
exchange
-- HASH_PARTITION_EXCHANGE [$$30, $$31] |PARTITIONED|
project ([$$30, $$31])
-- STREAM_PROJECT |PARTITIONED|
select (function-call: algebricks:ge, Args:[function-call: asterix:get-item, Args:[function-call: asterix:similarity-jaccard-check, Args:[%0->$$2, %0->$$6, AFloat: {0.8}], AInt64: {1}], AFloat: {0.8}])
-- STREAM_SELECT |PARTITIONED|
project ([$$2, $$6, $$30, $$31])
-- STREAM_PROJECT |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
join (function-call: algebricks:eq, Args:[%0->$$3, %0->$$7])
-- HYBRID_HASH_JOIN [$$3][$$7] |PARTITIONED|
exchange
-- HASH_PARTITION_EXCHANGE [$$3] |PARTITIONED|
unnest $$3 <- function-call: asterix:subset-collection, Args:[%0->$$2, AInt64: {0}, function-call: asterix:len, Args:[%0->$$2]]
-- UNNEST |PARTITIONED|
project ([$$2, $$30])
-- STREAM_PROJECT |PARTITIONED|
assign [$$2] <- [function-call: asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, Args:[%0->$$0, AInt32: {2}]]]
-- ASSIGN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
data-scan []<-[$$30, $$0] <- test:DBLP
-- DATASOURCE_SCAN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
empty-tuple-source
-- EMPTY_TUPLE_SOURCE |PARTITIONED|
exchange
-- HASH_PARTITION_EXCHANGE [$$7] |PARTITIONED|
unnest $$7 <- function-call: asterix:subset-collection, Args:[%0->$$6, AInt64: {0}, function-call: asterix:len, Args:[%0->$$6]]
-- UNNEST |PARTITIONED|
project ([$$6, $$31])
-- STREAM_PROJECT |PARTITIONED|
assign [$$6] <- [function-call: asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, Args:[%0->$$4, AInt32: {2}]]]
-- ASSIGN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
data-scan []<-[$$31, $$4] <- test:CSX
-- DATASOURCE_SCAN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
empty-tuple-source
-- EMPTY_TUPLE_SOURCE |PARTITIONED|
{noformat}
In contrast, when we conduct two functions in the query like
{noformat}
use dataverse test
set import-private-functions 'true'
for $paperDBLP in dataset('DBLP')
let $idDBLP := $paperDBLP.id
let $unrankedTokensDBLP := word-tokens($paperDBLP.title)
for $prefixTokenDBLP in subset-collection($unrankedTokensDBLP, 0, abs(len($unrankedTokensDBLP)))
for $paperCSX in dataset('CSX')
let $idCSX := $paperCSX.id
let $unrankedTokensCSX := word-tokens($paperCSX.title)
for $prefixTokenCSX in subset-collection($unrankedTokensCSX, 0, abs(len($unrankedTokensDBLP)))
where $prefixTokenDBLP = $prefixTokenCSX
let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX, .8f)
where $sim[1] >= .8f
/*+ hash */
group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
{noformat}
We got a wrong plan with a nested loop join as follows
{noformat}
distribute result [%0->$$31]
-- DISTRIBUTE_RESULT |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
project ([$$31])
-- STREAM_PROJECT |PARTITIONED|
assign [$$31] <- [function-call: asterix:closed-object-constructor, Args:[AString: {idDBLP}, %0->$$9, AString: {idCSX}, %0->$$10]]
-- ASSIGN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
group by ([$$9 := %0->$$32; $$10 := %0->$$33]) decor ([]) {
aggregate [] <- []
-- AGGREGATE |LOCAL|
nested tuple source
-- NESTED_TUPLE_SOURCE |LOCAL|
}
-- EXTERNAL_GROUP_BY[$$32, $$33] |PARTITIONED|
exchange
-- HASH_PARTITION_EXCHANGE [$$32, $$33] |PARTITIONED|
project ([$$32, $$33])
-- STREAM_PROJECT |PARTITIONED|
select (function-call: algebricks:eq, Args:[%0->$$3, %0->$$7])
-- STREAM_SELECT |PARTITIONED|
project ([$$32, $$33, $$3, $$7])
-- STREAM_PROJECT |PARTITIONED|
unnest $$7 <- function-call: asterix:subset-collection, Args:[%0->$$6, AInt64: {0}, function-call: asterix:abs, Args:[function-call: asterix:len, Args:[%0->$$2]]]
-- UNNEST |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
join (function-call: algebricks:ge, Args:[function-call: asterix:get-item, Args:[function-call: asterix:similarity-jaccard-check, Args:[%0->$$2, %0->$$6, AFloat: {0.8}], AInt64: {1}], AFloat: {0.8}])
-- NESTED_LOOP |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
unnest $$3 <- function-call: asterix:subset-collection, Args:[%0->$$2, AInt64: {0}, function-call: asterix:abs, Args:[function-call: asterix:len, Args:[%0->$$2]]]
-- UNNEST |PARTITIONED|
project ([$$32, $$2])
-- STREAM_PROJECT |PARTITIONED|
assign [$$2] <- [function-call: asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, Args:[%0->$$0, AInt32: {2}]]]
-- ASSIGN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
data-scan []<-[$$32, $$0] <- test:DBLP
-- DATASOURCE_SCAN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
empty-tuple-source
-- EMPTY_TUPLE_SOURCE |PARTITIONED|
exchange
-- BROADCAST_EXCHANGE |PARTITIONED|
project ([$$33, $$6])
-- STREAM_PROJECT |PARTITIONED|
assign [$$6] <- [function-call: asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, Args:[%0->$$4, AInt32: {2}]]]
-- ASSIGN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
data-scan []<-[$$33, $$4] <- test:CSX
-- DATASOURCE_SCAN |PARTITIONED|
exchange
-- ONE_TO_ONE_EXCHANGE |PARTITIONED|
empty-tuple-source
-- EMPTY_TUPLE_SOURCE |PARTITIONED|
{noformat}
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)