You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@asterixdb.apache.org by "Wenhai (JIRA)" <ji...@apache.org> on 2017/03/23 15:34:42 UTC

[jira] [Created] (ASTERIXDB-1852) Nested function poses wrong nested loop plan.

Wenhai created ASTERIXDB-1852:
---------------------------------

             Summary: Nested function poses wrong nested loop plan.
                 Key: ASTERIXDB-1852
                 URL: https://issues.apache.org/jira/browse/ASTERIXDB-1852
             Project: Apache AsterixDB
          Issue Type: Bug
          Components: AsterixDB
         Environment: MAC Linux
            Reporter: Wenhai
            Assignee: Wenhai


In a binary join query, the optimizer should generate a hash join plan. However, when two nested functions are used before the join condition, we get a wrong plan that employs a nested loop join instead.
Schema
{noformat}
drop  dataverse test if exists;
create  dataverse test;
use dataverse test;
create type DBLPType as
 closed {
  id : bigint,
  dblpid : string,
  title : string,
  authors : string,
  misc : string
}
create type CSXType as
 closed {
  id : bigint,
  csxid : string,
  title : string,
  authors : string,
  misc : string
}
create  dataset DBLP(DBLPType) primary key id;
create  dataset CSX(CSXType) primary key id;
{noformat}
A query with a single level of function nesting (len(...) inside subset-collection):
{noformat}
use dataverse test
set import-private-functions 'true'

for $paperDBLP in dataset('DBLP')
let $idDBLP := $paperDBLP.id
let $unrankedTokensDBLP := word-tokens($paperDBLP.title)
for $prefixTokenDBLP in subset-collection($unrankedTokensDBLP, 0, len($unrankedTokensDBLP))

for $paperCSX in dataset('CSX')
let $idCSX := $paperCSX.id
let $unrankedTokensCSX := word-tokens($paperCSX.title)
for $prefixTokenCSX in subset-collection($unrankedTokensCSX, 0, len($unrankedTokensCSX))

where $prefixTokenDBLP = $prefixTokenCSX
let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX, .8f)
where $sim[1] >= .8f
/*+ hash */
group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
{noformat}
This query generates the expected hash-based join plan:
{noformat}
distribute result [%0->$$29]
-- DISTRIBUTE_RESULT  |PARTITIONED|
  exchange
  -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
    project ([$$29])
    -- STREAM_PROJECT  |PARTITIONED|
      assign [$$29] <- [function-call: asterix:closed-object-constructor, Args:[AString: {idDBLP}, %0->$$9, AString: {idCSX}, %0->$$10]]
      -- ASSIGN  |PARTITIONED|
        exchange
        -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
          group by ([$$9 := %0->$$30; $$10 := %0->$$31]) decor ([]) {
                    aggregate [] <- []
                    -- AGGREGATE  |LOCAL|
                      nested tuple source
                      -- NESTED_TUPLE_SOURCE  |LOCAL|
                 }
          -- EXTERNAL_GROUP_BY[$$30, $$31]  |PARTITIONED|
            exchange
            -- HASH_PARTITION_EXCHANGE [$$30, $$31]  |PARTITIONED|
              project ([$$30, $$31])
              -- STREAM_PROJECT  |PARTITIONED|
                select (function-call: algebricks:ge, Args:[function-call: asterix:get-item, Args:[function-call: asterix:similarity-jaccard-check, Args:[%0->$$2, %0->$$6, AFloat: {0.8}], AInt64: {1}], AFloat: {0.8}])
                -- STREAM_SELECT  |PARTITIONED|
                  project ([$$2, $$6, $$30, $$31])
                  -- STREAM_PROJECT  |PARTITIONED|
                    exchange
                    -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                      join (function-call: algebricks:eq, Args:[%0->$$3, %0->$$7])
                      -- HYBRID_HASH_JOIN [$$3][$$7]  |PARTITIONED|
                        exchange
                        -- HASH_PARTITION_EXCHANGE [$$3]  |PARTITIONED|
                          unnest $$3 <- function-call: asterix:subset-collection, Args:[%0->$$2, AInt64: {0}, function-call: asterix:len, Args:[%0->$$2]]
                          -- UNNEST  |PARTITIONED|
                            project ([$$2, $$30])
                            -- STREAM_PROJECT  |PARTITIONED|
                              assign [$$2] <- [function-call: asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, Args:[%0->$$0, AInt32: {2}]]]
                              -- ASSIGN  |PARTITIONED|
                                exchange
                                -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                  data-scan []<-[$$30, $$0] <- test:DBLP
                                  -- DATASOURCE_SCAN  |PARTITIONED|
                                    exchange
                                    -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                      empty-tuple-source
                                      -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
                        exchange
                        -- HASH_PARTITION_EXCHANGE [$$7]  |PARTITIONED|
                          unnest $$7 <- function-call: asterix:subset-collection, Args:[%0->$$6, AInt64: {0}, function-call: asterix:len, Args:[%0->$$6]]
                          -- UNNEST  |PARTITIONED|
                            project ([$$6, $$31])
                            -- STREAM_PROJECT  |PARTITIONED|
                              assign [$$6] <- [function-call: asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, Args:[%0->$$4, AInt32: {2}]]]
                              -- ASSIGN  |PARTITIONED|
                                exchange
                                -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                  data-scan []<-[$$31, $$4] <- test:CSX
                                  -- DATASOURCE_SCAN  |PARTITIONED|
                                    exchange
                                    -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                      empty-tuple-source
                                      -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
{noformat}
In contrast, consider a query that nests two functions (abs(len(...)) inside subset-collection):
{noformat}
use dataverse test
set import-private-functions 'true'

for $paperDBLP in dataset('DBLP')
let $idDBLP := $paperDBLP.id
let $unrankedTokensDBLP := word-tokens($paperDBLP.title)
for $prefixTokenDBLP in subset-collection($unrankedTokensDBLP, 0, abs(len($unrankedTokensDBLP)))

for $paperCSX in dataset('CSX')
let $idCSX := $paperCSX.id
let $unrankedTokensCSX := word-tokens($paperCSX.title)
for $prefixTokenCSX in subset-collection($unrankedTokensCSX, 0, abs(len($unrankedTokensDBLP)))

where $prefixTokenDBLP = $prefixTokenCSX
let $sim := similarity-jaccard-check($unrankedTokensDBLP, $unrankedTokensCSX, .8f)
where $sim[1] >= .8f
/*+ hash */
group by $idDBLP := $idDBLP, $idCSX := $idCSX with $sim
return {'idDBLP': $idDBLP, 'idCSX': $idCSX}
{noformat}
For this query, we get a wrong plan with a nested loop join, as follows:
{noformat}
distribute result [%0->$$31]
-- DISTRIBUTE_RESULT  |PARTITIONED|
  exchange
  -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
    project ([$$31])
    -- STREAM_PROJECT  |PARTITIONED|
      assign [$$31] <- [function-call: asterix:closed-object-constructor, Args:[AString: {idDBLP}, %0->$$9, AString: {idCSX}, %0->$$10]]
      -- ASSIGN  |PARTITIONED|
        exchange
        -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
          group by ([$$9 := %0->$$32; $$10 := %0->$$33]) decor ([]) {
                    aggregate [] <- []
                    -- AGGREGATE  |LOCAL|
                      nested tuple source
                      -- NESTED_TUPLE_SOURCE  |LOCAL|
                 }
          -- EXTERNAL_GROUP_BY[$$32, $$33]  |PARTITIONED|
            exchange
            -- HASH_PARTITION_EXCHANGE [$$32, $$33]  |PARTITIONED|
              project ([$$32, $$33])
              -- STREAM_PROJECT  |PARTITIONED|
                select (function-call: algebricks:eq, Args:[%0->$$3, %0->$$7])
                -- STREAM_SELECT  |PARTITIONED|
                  project ([$$32, $$33, $$3, $$7])
                  -- STREAM_PROJECT  |PARTITIONED|
                    unnest $$7 <- function-call: asterix:subset-collection, Args:[%0->$$6, AInt64: {0}, function-call: asterix:abs, Args:[function-call: asterix:len, Args:[%0->$$2]]]
                    -- UNNEST  |PARTITIONED|
                      exchange
                      -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                        join (function-call: algebricks:ge, Args:[function-call: asterix:get-item, Args:[function-call: asterix:similarity-jaccard-check, Args:[%0->$$2, %0->$$6, AFloat: {0.8}], AInt64: {1}], AFloat: {0.8}])
                        -- NESTED_LOOP  |PARTITIONED|
                          exchange
                          -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                            unnest $$3 <- function-call: asterix:subset-collection, Args:[%0->$$2, AInt64: {0}, function-call: asterix:abs, Args:[function-call: asterix:len, Args:[%0->$$2]]]
                            -- UNNEST  |PARTITIONED|
                              project ([$$32, $$2])
                              -- STREAM_PROJECT  |PARTITIONED|
                                assign [$$2] <- [function-call: asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, Args:[%0->$$0, AInt32: {2}]]]
                                -- ASSIGN  |PARTITIONED|
                                  exchange
                                  -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                    data-scan []<-[$$32, $$0] <- test:DBLP
                                    -- DATASOURCE_SCAN  |PARTITIONED|
                                      exchange
                                      -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                        empty-tuple-source
                                        -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
                          exchange
                          -- BROADCAST_EXCHANGE  |PARTITIONED|
                            project ([$$33, $$6])
                            -- STREAM_PROJECT  |PARTITIONED|
                              assign [$$6] <- [function-call: asterix:word-tokens, Args:[function-call: asterix:field-access-by-index, Args:[%0->$$4, AInt32: {2}]]]
                              -- ASSIGN  |PARTITIONED|
                                exchange
                                -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                  data-scan []<-[$$33, $$4] <- test:CSX
                                  -- DATASOURCE_SCAN  |PARTITIONED|
                                    exchange
                                    -- ONE_TO_ONE_EXCHANGE  |PARTITIONED|
                                      empty-tuple-source
                                      -- EMPTY_TUPLE_SOURCE  |PARTITIONED|
{noformat}



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)