You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@hive.apache.org by "Vineet Garg (JIRA)" <ji...@apache.org> on 2019/01/17 23:10:00 UTC
[jira] [Commented] (HIVE-21132) Semi join edge is not being removed despite max bloomfilter entries set to 1

    [ https://issues.apache.org/jira/browse/HIVE-21132?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16745608#comment-16745608 ] 

Vineet Garg commented on HIVE-21132:
------------------------------------

Root cause is that Reduce de-duplication ends up messing up pattern of Semi join branches (GB-RS-GB-RS)

> Semi join edge is not being removed despite max bloomfilter entries set to 1
> ----------------------------------------------------------------------------
>
>                 Key: HIVE-21132
>                 URL: https://issues.apache.org/jira/browse/HIVE-21132
>             Project: Hive
>          Issue Type: Bug
>          Components: Query Planning
>    Affects Versions: 4.0.0
>            Reporter: Vineet Garg
>            Assignee: Vineet Garg
>            Priority: Major
>
> * Reproducer
> {code:sql}
> --! qt:dataset:lineitem
> --! qt:dataset:part
> --! qt:dataset:src
> set hive.support.concurrency=true;
> set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
> --set hive.compute.query.using.stats=false;
> set hive.mapred.mode=nonstrict;
> set hive.explain.user=false;
> set hive.optimize.ppd=true;
> set hive.ppd.remove.duplicatefilters=true;
> set hive.tez.dynamic.partition.pruning=true;
> set hive.tez.dynamic.semijoin.reduction=true;
> set hive.optimize.metadataonly=false;
> set hive.optimize.index.filter=true;
> set hive.stats.autogather=true;
> set hive.tez.bigtable.minsize.semijoin.reduction=1;
> set hive.tez.min.bloom.filter.entries=1;
> set hive.stats.fetch.column.stats=true;
> set hive.tez.bloom.filter.factor=1.0f;
> set hive.auto.convert.join=false;
> set hive.optimize.shared.work=false;
> create database tpch_test;
> use tpch_test;
> CREATE TABLE `customer`(
>   `c_custkey` bigint, 
>   `c_name` string, 
>   `c_address` string, 
>   `c_nationkey` bigint, 
>   `c_phone` string, 
>   `c_acctbal` double, 
>   `c_mktsegment` string, 
>   `c_comment` string)
> ROW FORMAT SERDE 
>   'org.apache.hadoop.hive.ql.io.orc.OrcSerde' 
> STORED AS INPUTFORMAT 
>   'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' 
> OUTPUTFORMAT 
>   'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
> TBLPROPERTIES (
>   'bucketing_version'='2', 
>   'transactional'='true', 
>   'transactional_properties'='default', 
>   'transient_lastDdlTime'='1543026723');
> CREATE TABLE `lineitem`(
>   `l_orderkey` bigint, 
>   `l_partkey` bigint, 
>   `l_suppkey` bigint, 
>   `l_linenumber` int, 
>   `l_quantity` double, 
>   `l_extendedprice` double, 
>   `l_discount` double, 
>   `l_tax` double, 
>   `l_returnflag` string, 
>   `l_linestatus` string, 
>   `l_shipdate` string, 
>   `l_commitdate` string, 
>   `l_receiptdate` string, 
>   `l_shipinstruct` string, 
>   `l_shipmode` string, 
>   `l_comment` string)
> ROW FORMAT SERDE 
>   'org.apache.hadoop.hive.ql.io.orc.OrcSerde' 
> STORED AS INPUTFORMAT 
>   'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' 
> OUTPUTFORMAT 
>   'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
> TBLPROPERTIES (
>   'bucketing_version'='2', 
>   'transactional'='true', 
>   'transactional_properties'='default', 
>   'transient_lastDdlTime'='1543027179');
> CREATE TABLE `orders`(
>   `o_orderkey` bigint, 
>   `o_custkey` bigint, 
>   `o_orderstatus` string, 
>   `o_totalprice` double, 
>   `o_orderdate` string, 
>   `o_orderpriority` string, 
>   `o_clerk` string, 
>   `o_shippriority` int, 
>   `o_comment` string)
> ROW FORMAT SERDE 
>   'org.apache.hadoop.hive.ql.io.orc.OrcSerde' 
> STORED AS INPUTFORMAT 
>   'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' 
> OUTPUTFORMAT 
>   'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
> TBLPROPERTIES (
>   'bucketing_version'='2', 
>   'transactional'='true', 
>   'transactional_properties'='default', 
>   'transient_lastDdlTime'='1543026824');
> alter table customer update statistics set('numRows'='150000000','rawDataSize'='8633707142');
> alter table lineitem update statistics set('numRows'='5999989709','rawDataSize'='184245066955');
> alter table orders update statistics set('numRows'='1500000000','rawDataSize'='46741318253');
> create view q18_tmp_cached as
> select l_orderkey, sum(l_quantity) as t_sum_quantity
> from lineitem
> where l_orderkey is not null
> group by l_orderkey;
> -- Set bloom filter size to huge number so we get any possible semijoin reductions
> set hive.tez.min.bloom.filter.entries=0;
> set hive.tez.max.bloom.filter.entries=1;
> create table q18_large_volume_customer_cached stored as orc tblproperties ('transactional'='true', 'transactional_properties'='default') as
> select c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice, sum(l_quantity)
> from customer, orders, q18_tmp_cached t, lineitem l
> where
>   c_custkey = o_custkey and o_orderkey = t.l_orderkey
>   and o_orderkey is not null and t.t_sum_quantity > 300
>   and o_orderkey = l.l_orderkey and l.l_orderkey is not null
> group by c_name, c_custkey, o_orderkey, o_orderdate, o_totalprice
> order by o_totalprice desc, o_orderdate
> limit 100;
> drop database tpch_test cascade;
> {code}
> To reproduce run the above as TestMiniLlapLocalCliDriver test



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)