You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@hive.apache.org by "Zoltan Haindrich (Jira)" <ji...@apache.org> on 2022/01/18 10:21:00 UTC

[jira] [Updated] (HIVE-25874) Slow filter evaluation of nest struct fields in vectorized executions

     [ https://issues.apache.org/jira/browse/HIVE-25874?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Zoltan Haindrich updated HIVE-25874:
------------------------------------
    Description: 
time is spent at resizing vectors around [here|https://github.com/apache/hive/blob/200c0bf1feb259f4d95bf065a2ab38fe684383da/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java#L252] or in some other "ensureSize" method

{code:java}

create table t as
select
named_struct('id',13,'str','string','nest',named_struct('id',12,'str','string','arr',array('value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value')))
s;

-- go up to 1M rows
insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;
insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;
insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;
insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;
insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;
-- insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;


set hive.fetch.task.conversion=none;

select count(1) from t;
--explain
select s
.id from t
where 
s
.nest
.id  > 0;

 {code}


interestingly; the issue is not present:
* for a query not looking into the nested struct
* and in case the struct with the array is at the top level

{code}
select count(1) from t;
--explain
select s
.id from t
where 
s
-- .nest
.id  > 0;
{code}

  was:
{code:java}

create table t as
select
named_struct('id',13,'str','string','nest',named_struct('id',12,'str','string','arr',array('value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value')))
s;

-- go up to 1M rows
insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;
insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;
insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;
insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;
insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;
-- insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;


set hive.fetch.task.conversion=none;

select count(1) from t;
--explain
select s
.id from t
where 
s
.nest
.id  > 0;

 {code}


interestingly; the issue is not present:
* for a query not looking into the nested struct
* and in case the struct with the array is at the top level

{code}
select count(1) from t;
--explain
select s
.id from t
where 
s
-- .nest
.id  > 0;
{code}


> Slow filter evaluation of nest struct fields in vectorized executions
> ---------------------------------------------------------------------
>
>                 Key: HIVE-25874
>                 URL: https://issues.apache.org/jira/browse/HIVE-25874
>             Project: Hive
>          Issue Type: Improvement
>            Reporter: Zoltan Haindrich
>            Priority: Major
>
> time is spent at resizing vectors around [here|https://github.com/apache/hive/blob/200c0bf1feb259f4d95bf065a2ab38fe684383da/storage-api/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java#L252] or in some other "ensureSize" method
> {code:java}
> create table t as
> select
> named_struct('id',13,'str','string','nest',named_struct('id',12,'str','string','arr',array('value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value','value')))
> s;
> -- go up to 1M rows
> insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;
> insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;
> insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;
> insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;
> insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;
> -- insert into table t select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t union all select * from t;
> set hive.fetch.task.conversion=none;
> select count(1) from t;
> --explain
> select s
> .id from t
> where 
> s
> .nest
> .id  > 0;
>  {code}
> interestingly; the issue is not present:
> * for a query not looking into the nested struct
> * and in case the struct with the array is at the top level
> {code}
> select count(1) from t;
> --explain
> select s
> .id from t
> where 
> s
> -- .nest
> .id  > 0;
> {code}



--
This message was sent by Atlassian Jira
(v8.20.1#820001)