You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by dk...@apache.org on 2023/04/19 10:01:36 UTC
[hive] branch master updated: HIVE-27158: Store hive columns stats in puffin files for iceberg tables (Simhadri Govindappa, reviewed by Ayush Saxena, Denys Kuzmenko, Rajesh Balamohan, Zsolt Miskolczi)
This is an automated email from the ASF dual-hosted git repository.
dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new a8a0ae782be HIVE-27158: Store hive columns stats in puffin files for iceberg tables (Simhadri Govindappa, reviewed by Ayush Saxena, Denys Kuzmenko, Rajesh Balamohan, Zsolt Miskolczi)
a8a0ae782be is described below
commit a8a0ae782be87d1198006ea3cb508f14070231b7
Author: SimhadriGovindappa <si...@gmail.com>
AuthorDate: Wed Apr 19 15:31:29 2023 +0530
HIVE-27158: Store hive columns stats in puffin files for iceberg tables (Simhadri Govindappa, reviewed by Ayush Saxena, Denys Kuzmenko, Rajesh Balamohan, Zsolt Miskolczi)
Closes #4131
---
.../java/org/apache/hadoop/hive/conf/HiveConf.java | 5 +-
.../iceberg/mr/hive/HiveIcebergStorageHandler.java | 103 +++-
.../src/test/queries/positive/col_stats.q | 58 ++
.../positive/use_basic_stats_from_iceberg.q | 4 +-
.../positive/vectorized_iceberg_read_mixed.q | 8 +
.../src/test/results/positive/col_stats.q.out | 615 +++++++++++++++++++++
.../positive/dynamic_partition_writes.q.out | 22 +-
.../llap/vectorized_iceberg_read_mixed.q.out | 110 +++-
.../positive/vectorized_iceberg_read_mixed.q.out | 71 ++-
.../hive/ql/metadata/HiveStorageHandler.java | 41 ++
.../hadoop/hive/ql/stats/ColStatsProcessor.java | 3 +
.../apache/hadoop/hive/ql/stats/StatsUtils.java | 8 +-
12 files changed, 1022 insertions(+), 26 deletions(-)
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 8b666164212..7e6903a39d6 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2205,9 +2205,8 @@ public class HiveConf extends Configuration {
"padding tolerance config (hive.exec.orc.block.padding.tolerance)."),
HIVE_ORC_CODEC_POOL("hive.use.orc.codec.pool", false,
"Whether to use codec pool in ORC. Disable if there are bugs with codec reuse."),
- HIVE_USE_STATS_FROM("hive.use.stats.from","iceberg","Use stats from iceberg table snapshot for query " +
- "planning. This has three values metastore, puffin and iceberg"),
-
+ HIVE_ICEBERG_STATS_SOURCE("hive.iceberg.stats.source", "iceberg",
+ "Use stats from iceberg table snapshot for query planning. This has two values metastore and iceberg"),
HIVEUSEEXPLICITRCFILEHEADER("hive.exec.rcfile.use.explicit.header", true,
"If this is set the header for RCFiles will simply be RCF. If this is not\n" +
"set the header will be that borrowed from sequence files, e.g. SEQ- followed\n" +
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
index bcadebbf4c0..db69d6c34c8 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -23,6 +23,7 @@ import java.io.IOException;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
+import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
@@ -35,14 +36,20 @@ import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.commons.collections4.ListUtils;
+import org.apache.commons.lang3.SerializationUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.common.type.Date;
import org.apache.hadoop.hive.common.type.SnapshotContext;
import org.apache.hadoop.hive.common.type.Timestamp;
+import org.apache.hadoop.hive.conf.Constants;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaHook;
+import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.LockType;
@@ -112,6 +119,12 @@ import org.apache.iceberg.exceptions.NoSuchTableException;
import org.apache.iceberg.hadoop.HadoopConfigurable;
import org.apache.iceberg.mr.Catalogs;
import org.apache.iceberg.mr.InputFormatConfig;
+import org.apache.iceberg.puffin.Blob;
+import org.apache.iceberg.puffin.BlobMetadata;
+import org.apache.iceberg.puffin.Puffin;
+import org.apache.iceberg.puffin.PuffinCompressionCodec;
+import org.apache.iceberg.puffin.PuffinReader;
+import org.apache.iceberg.puffin.PuffinWriter;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.base.Splitter;
@@ -121,7 +134,10 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.relocated.com.google.common.collect.Streams;
import org.apache.iceberg.types.Types;
+import org.apache.iceberg.util.ByteBuffers;
+import org.apache.iceberg.util.Pair;
import org.apache.iceberg.util.SerializationUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -136,6 +152,7 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
private static final String PUFFIN = "puffin";
public static final String COPY_ON_WRITE = "copy-on-write";
public static final String MERGE_ON_READ = "merge-on-read";
+ public static final String STATS = "/stats/";
/**
* Function template for producing a custom sort expression function:
* Takes the source column index and the bucket count to creat a function where Iceberg bucket UDF is used to build
@@ -318,7 +335,7 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable();
// For write queries where rows got modified, don't fetch from cache as values could have changed.
Table table = getTable(hmsTable);
- String statsSource = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_USE_STATS_FROM).toLowerCase();
+ String statsSource = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_ICEBERG_STATS_SOURCE).toLowerCase();
Map<String, String> stats = Maps.newHashMap();
switch (statsSource) {
case ICEBERG:
@@ -361,6 +378,90 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
return table;
}
+  /**
+   * Tells whether column statistics can be written for the given table: the Iceberg table must
+   * already have a current snapshot and the configured stats source must equal {@code ICEBERG}.
+   *
+   * @param hmsTable the Hive table to check
+   * @return {@code true} when both conditions hold, {@code false} otherwise
+   */
+  @Override
+  public boolean canSetColStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
+    Table icebergTable = IcebergTableUtil.getTable(conf, hmsTable.getTTable());
+    if (icebergTable.currentSnapshot() == null) {
+      // Nothing has been committed yet, so there is no snapshot to attach stats to.
+      return false;
+    }
+    return getStatsSource().equals(ICEBERG);
+  }
+
+  /**
+   * Serializes the given column statistics and writes them as a single blob into a Puffin file
+   * located at {@code getStatsPath(tbl)} for the table's current snapshot. Any file already
+   * present at that path is removed first.
+   *
+   * @param hmsTable the Hive table whose Iceberg counterpart receives the statistics
+   * @param colStats the column statistics to persist
+   * @return {@code true} if the Puffin file was written, {@code false} if an {@link IOException}
+   *         prevented it
+   */
+  @Override
+  public boolean setColStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable,
+      List<ColumnStatistics> colStats) {
+    Table tbl = IcebergTableUtil.getTable(conf, hmsTable.getTTable());
+    long currentSnapshotId = tbl.currentSnapshot().snapshotId();
+    long currentSequenceNumber = tbl.currentSnapshot().sequenceNumber();
+    String snapshotId = String.format("%s-STATS-%d", tbl.name(), currentSnapshotId);
+    // Resolve the target path once; it is used for both the cleanup and the write.
+    Path statsPath = getStatsPath(tbl);
+    invalidateStats(statsPath);
+    byte[] serializeColStats = SerializationUtils.serialize((Serializable) colStats);
+    try (PuffinWriter writer = Puffin.write(tbl.io().newOutputFile(statsPath.toString()))
+        .createdBy(Constants.HIVE_ENGINE).build()) {
+      writer.add(
+          new Blob(
+              tbl.name() + "-" + snapshotId,
+              ImmutableList.of(1),
+              currentSnapshotId,
+              currentSequenceNumber,
+              ByteBuffer.wrap(serializeColStats),
+              PuffinCompressionCodec.NONE,
+              ImmutableMap.of()));
+      writer.finish();
+    } catch (IOException e) {
+      // Pass the exception as the trailing argument so the stack trace is logged.
+      LOG.error("Unable to write column stats to puffin file {}", statsPath, e);
+      return false;
+    }
+    // Report success; previously false was returned even when the write went through.
+    return true;
+  }
+
+  /**
+   * Tells whether column statistics are available for the given table, i.e. column stats may be
+   * set for it (see {@code canSetColStatistics}) and a stats file exists at the path derived
+   * from the table's current snapshot.
+   *
+   * @param hmsTable the Hive table to check
+   * @return {@code true} if the stats file exists; {@code false} if it does not, if column stats
+   *         cannot be set for the table, or if the existence check failed with an
+   *         {@link IOException}
+   */
+  @Override
+  public boolean canProvideColStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
+    Table table = IcebergTableUtil.getTable(conf, hmsTable.getTTable());
+    if (canSetColStatistics(hmsTable)) {
+      Path statsPath = getStatsPath(table);
+      try {
+        // Path#getFileSystem normally returns an instance from Hadoop's shared FileSystem cache.
+        // It must not be closed here: closing it would break every other user of that instance.
+        FileSystem fs = statsPath.getFileSystem(conf);
+        return fs.exists(statsPath);
+      } catch (IOException e) {
+        // The exception is the trailing argument, so SLF4J logs it with its stack trace.
+        LOG.warn("Exception when trying to find Iceberg column stats for table:{} , snapshot:{} , " +
+            "statsPath: {}", table.name(), table.currentSnapshot(), statsPath, e);
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Reads the column statistics stored in the Puffin file at the path derived from the table's
+   * current snapshot. Only the first blob of the file, and the first {@link ColumnStatistics}
+   * entry inside it, are used.
+   *
+   * @param hmsTable the Hive table whose statistics are requested
+   * @return the statistics objects, or {@code null} if the file could not be read or holds no
+   *         statistics
+   */
+  @Override
+  public List<ColumnStatisticsObj> getColStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
+    Table table = IcebergTableUtil.getTable(conf, hmsTable.getTTable());
+    String statsPath = getStatsPath(table).toString();
+    LOG.info("Using stats from puffin file at: {}", statsPath);
+    try (PuffinReader reader = Puffin.read(table.io().newInputFile(statsPath)).build()) {
+      List<BlobMetadata> blobMetadata = reader.fileMetadata().blobs();
+      if (blobMetadata.isEmpty()) {
+        // A Puffin file without blobs carries no statistics; avoid failing on get(0) below.
+        LOG.warn("No blobs found in puffin file at: {}", statsPath);
+        return null;
+      }
+      // NOTE(review): Java-native deserialization trusts the file content completely; this is
+      // only safe as long as the stats file can be written by trusted engines alone.
+      Map<BlobMetadata, List<ColumnStatistics>> collect =
+          Streams.stream(reader.readAll(blobMetadata)).collect(Collectors.toMap(Pair::first,
+              blobMetadataByteBufferPair -> SerializationUtils.deserialize(
+                  ByteBuffers.toByteArray(blobMetadataByteBufferPair.second()))));
+      List<ColumnStatistics> colStats = collect.get(blobMetadata.get(0));
+      if (colStats == null || colStats.isEmpty()) {
+        LOG.warn("No column stats found in puffin file at: {}", statsPath);
+        return null;
+      }
+      return colStats.get(0).getStatsObj();
+    } catch (IOException e) {
+      // The exception is the trailing argument, so SLF4J logs it with its stack trace.
+      LOG.error("Error when trying to read iceberg col stats from puffin file: {}", statsPath, e);
+    }
+    return null;
+  }
+
+  /**
+   * Reads the configured stats source ({@code HIVE_ICEBERG_STATS_SOURCE}), passing
+   * {@code ICEBERG} as the default value, and normalizes it to lower case.
+   *
+   * @return the lower-cased stats source
+   */
+  private String getStatsSource() {
+    String configured = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_ICEBERG_STATS_SOURCE, ICEBERG);
+    // Locale.ROOT keeps the result independent of the JVM default locale; with e.g. a Turkish
+    // locale "ICEBERG" would otherwise lower-case to a dotless i and never match the constant.
+    return configured.toLowerCase(java.util.Locale.ROOT);
+  }
+
+  /**
+   * Builds the location of the stats file for the table's current snapshot:
+   * table location, followed by {@code STATS}, the table name and the current snapshot id.
+   *
+   * @param table the Iceberg table; its current snapshot is dereferenced without a null check
+   * @return the path of the stats file
+   */
+  private Path getStatsPath(Table table) {
+    String statsDir = table.location() + STATS;
+    String statsFileName = table.name() + table.currentSnapshot().snapshotId();
+    return new Path(statsDir + statsFileName);
+  }
+
+  /**
+   * Deletes the stats file at the given path if it exists. Failures are logged and otherwise
+   * ignored, so this is a best-effort cleanup.
+   *
+   * @param statsPath location of the stats file to remove
+   */
+  private void invalidateStats(Path statsPath) {
+    try {
+      // Path#getFileSystem normally returns an instance from Hadoop's shared FileSystem cache.
+      // It must not be closed here: closing it would break every other user of that instance.
+      FileSystem fs = statsPath.getFileSystem(conf);
+      if (fs.exists(statsPath)) {
+        fs.delete(statsPath, true);
+      }
+    } catch (IOException e) {
+      // The exception is the trailing argument, so SLF4J logs it with its stack trace.
+      LOG.error("Failed to invalidate stale column stats at {}", statsPath, e);
+    }
+  }
+
/**
* No need for exclusive locks when writing, since Iceberg tables use optimistic concurrency when writing
* and only lock the table during the commit operation.
diff --git a/iceberg/iceberg-handler/src/test/queries/positive/col_stats.q b/iceberg/iceberg-handler/src/test/queries/positive/col_stats.q
new file mode 100644
index 00000000000..d7c4d811a8b
--- /dev/null
+++ b/iceberg/iceberg-handler/src/test/queries/positive/col_stats.q
@@ -0,0 +1,58 @@
+-- Mask random uuid
+--! qt:replace:/(\s+uuid\s+)\S+(\s*)/$1#Masked#$2/
+-- Gather basic and column stats automatically on every write.
+set hive.stats.autogather=true;
+set hive.stats.column.autogather=true;
+
+-- Scenario 1: stats source = iceberg.
+-- Insert the same 9 rows twice, inspect the plan and the column stats of b, then update b,
+-- re-run analyze and inspect the plan and the column stats of b again.
+set hive.iceberg.stats.source=iceberg;
+drop table if exists tbl_ice_puffin;
+create external table tbl_ice_puffin(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2');
+insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56);
+insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56);
+explain select * from tbl_ice_puffin order by a, b, c;
+select * from tbl_ice_puffin order by a, b, c;
+desc formatted tbl_ice_puffin b;
+update tbl_ice_puffin set b='two' where b='one' or b='three';
+analyze table tbl_ice_puffin compute statistics for columns;
+explain select * from tbl_ice_puffin order by a, b, c;
+select * from tbl_ice_puffin order by a, b, c;
+select count(*) from tbl_ice_puffin ;
+desc formatted tbl_ice_puffin b;
+
+
+-- Test if hive.iceberg.stats.source is empty
+set hive.iceberg.stats.source= ;
+drop table if exists tbl_ice_puffin;
+create external table tbl_ice_puffin(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2');
+insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56);
+explain select * from tbl_ice_puffin order by a, b, c;
+
+
+-- Scenario 3: stats source = iceberg again, on a freshly created table with a single insert;
+-- inspect the plan, the data and the column stats of the int column a.
+set hive.iceberg.stats.source=iceberg;
+drop table if exists tbl_ice_puffin;
+create external table tbl_ice_puffin(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2');
+insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56);
+explain select * from tbl_ice_puffin order by a, b, c;
+select * from tbl_ice_puffin order by a, b, c;
+select count(*) from tbl_ice_puffin ;
+desc formatted tbl_ice_puffin a;
+
+
+-- Scenario 4: stats source = metastore, using a separate table tbl_ice.
+set hive.iceberg.stats.source=metastore;
+
+drop table if exists tbl_ice;
+create external table tbl_ice(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2');
+insert into tbl_ice values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56);
+explain select * from tbl_ice order by a, b, c;
+select * from tbl_ice order by a, b, c;
+select count(*) from tbl_ice ;
+
+-- Scenario 5: switch back to iceberg as the stats source after a delete.
+-- NOTE(review): the delete targets tbl_ice_puffin while the explain and count below read
+-- tbl_ice, so they do not observe the delete - confirm this is intended.
+set hive.iceberg.stats.source=iceberg;
+delete from tbl_ice_puffin where a = 2;
+explain select * from tbl_ice order by a, b, c;
+select count(*) from tbl_ice ;
+
+-- Scenario 6: two tables that never receive data; describe them and plan a join between them.
+create table t1 (a int) stored by iceberg tblproperties ('format-version'='2');
+create table t2 (b int) stored by iceberg tblproperties ('format-version'='2');
+describe formatted t1;
+describe formatted t2;
+explain select * from t1 join t2 on t1.a = t2.b;
\ No newline at end of file
diff --git a/iceberg/iceberg-handler/src/test/queries/positive/use_basic_stats_from_iceberg.q b/iceberg/iceberg-handler/src/test/queries/positive/use_basic_stats_from_iceberg.q
index 90e2d95d1df..d80f420c42c 100644
--- a/iceberg/iceberg-handler/src/test/queries/positive/use_basic_stats_from_iceberg.q
+++ b/iceberg/iceberg-handler/src/test/queries/positive/use_basic_stats_from_iceberg.q
@@ -4,13 +4,13 @@ set hive.stats.autogather=true;
set hive.stats.column.autogather=true;
drop table if exists tbl_ice;
-set hive.use.stats.from = metastore;
+set hive.iceberg.stats.source=metastore;
create external table tbl_ice(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2');
insert into tbl_ice values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56);
explain select * from tbl_ice order by a, b, c;
drop table if exists tbl_ice;
-set hive.use.stats.from = iceberg;
+set hive.iceberg.stats.source = iceberg;
create external table tbl_ice(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2');
insert into tbl_ice values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56);
explain select * from tbl_ice order by a, b, c;
diff --git a/iceberg/iceberg-handler/src/test/queries/positive/vectorized_iceberg_read_mixed.q b/iceberg/iceberg-handler/src/test/queries/positive/vectorized_iceberg_read_mixed.q
index b630b1f802d..99069bc266a 100644
--- a/iceberg/iceberg-handler/src/test/queries/positive/vectorized_iceberg_read_mixed.q
+++ b/iceberg/iceberg-handler/src/test/queries/positive/vectorized_iceberg_read_mixed.q
@@ -46,6 +46,14 @@ explain select max(t_float), t_double, t_boolean, t_int, t_bigint, t_binary, t_s
select max(t_float), t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal;
+create external table t1 stored as orc as select * from tbl_ice_mixed_all_types ;
+
+explain select max(t_float), t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal;
+select max(t_float), t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal;
+
+
create external table tbl_ice_mixed_parted (
a int,
b string
diff --git a/iceberg/iceberg-handler/src/test/results/positive/col_stats.q.out b/iceberg/iceberg-handler/src/test/results/positive/col_stats.q.out
new file mode 100644
index 00000000000..b1f13fa76b5
--- /dev/null
+++ b/iceberg/iceberg-handler/src/test/results/positive/col_stats.q.out
@@ -0,0 +1,615 @@
+PREHOOK: query: drop table if exists tbl_ice_puffin
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists tbl_ice_puffin
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create external table tbl_ice_puffin(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: create external table tbl_ice_puffin(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: explain select * from tbl_ice_puffin order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select * from tbl_ice_puffin order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized
+ File Output Operator [FS_8]
+ Select Operator [SEL_7] (rows=18 width=95)
+ Output:["_col0","_col1","_col2"]
+ <-Map 1 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_6]
+ Select Operator [SEL_5] (rows=18 width=95)
+ Output:["_col0","_col1","_col2"]
+ TableScan [TS_0] (rows=18 width=95)
+ default@tbl_ice_puffin,tbl_ice_puffin,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
+
+PREHOOK: query: select * from tbl_ice_puffin order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from tbl_ice_puffin order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1 one 50
+1 one 50
+2 two 51
+2 two 51
+2 two 51
+2 two 51
+2 two 51
+2 two 51
+3 three 52
+3 three 52
+4 four 53
+4 four 53
+5 five 54
+5 five 54
+111 one 55
+111 one 55
+333 two 56
+333 two 56
+PREHOOK: query: desc formatted tbl_ice_puffin b
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: query: desc formatted tbl_ice_puffin b
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@tbl_ice_puffin
+col_name b
+data_type string
+min
+max
+num_nulls 0
+distinct_count 5
+avg_col_len 3.4444444444444446
+max_col_len 5
+num_trues
+num_falses
+bit_vector HL
+comment
+COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}}
+PREHOOK: query: update tbl_ice_puffin set b='two' where b='one' or b='three'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: default@tbl_ice_puffin
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: update tbl_ice_puffin set b='two' where b='one' or b='three'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: analyze table tbl_ice_puffin compute statistics for columns
+PREHOOK: type: ANALYZE_TABLE
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: analyze table tbl_ice_puffin compute statistics for columns
+POSTHOOK: type: ANALYZE_TABLE
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+PREHOOK: query: explain select * from tbl_ice_puffin order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select * from tbl_ice_puffin order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized
+ File Output Operator [FS_8]
+ Select Operator [SEL_7] (rows=24 width=95)
+ Output:["_col0","_col1","_col2"]
+ <-Map 1 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_6]
+ Select Operator [SEL_5] (rows=24 width=95)
+ Output:["_col0","_col1","_col2"]
+ TableScan [TS_0] (rows=24 width=95)
+ default@tbl_ice_puffin,tbl_ice_puffin,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
+
+PREHOOK: query: select * from tbl_ice_puffin order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from tbl_ice_puffin order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1 two 50
+1 two 50
+2 two 51
+2 two 51
+2 two 51
+2 two 51
+2 two 51
+2 two 51
+3 two 52
+3 two 52
+4 four 53
+4 four 53
+5 five 54
+5 five 54
+111 two 55
+111 two 55
+333 two 56
+333 two 56
+PREHOOK: query: select count(*) from tbl_ice_puffin
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from tbl_ice_puffin
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+18
+PREHOOK: query: desc formatted tbl_ice_puffin b
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: query: desc formatted tbl_ice_puffin b
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@tbl_ice_puffin
+col_name b
+data_type string
+min
+max
+num_nulls 0
+distinct_count 3
+avg_col_len 3.2222222222222223
+max_col_len 4
+num_trues
+num_falses
+bit_vector HL
+comment
+COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}}
+PREHOOK: query: drop table if exists tbl_ice_puffin
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: drop table if exists tbl_ice_puffin
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: create external table tbl_ice_puffin(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: create external table tbl_ice_puffin(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: explain select * from tbl_ice_puffin order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select * from tbl_ice_puffin order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized
+ File Output Operator [FS_8]
+ Select Operator [SEL_7] (rows=9 width=95)
+ Output:["_col0","_col1","_col2"]
+ <-Map 1 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_6]
+ Select Operator [SEL_5] (rows=9 width=95)
+ Output:["_col0","_col1","_col2"]
+ TableScan [TS_0] (rows=9 width=95)
+ default@tbl_ice_puffin,tbl_ice_puffin,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
+
+PREHOOK: query: drop table if exists tbl_ice_puffin
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: drop table if exists tbl_ice_puffin
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: create external table tbl_ice_puffin(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: create external table tbl_ice_puffin(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: explain select * from tbl_ice_puffin order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select * from tbl_ice_puffin order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized
+ File Output Operator [FS_8]
+ Select Operator [SEL_7] (rows=9 width=95)
+ Output:["_col0","_col1","_col2"]
+ <-Map 1 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_6]
+ Select Operator [SEL_5] (rows=9 width=95)
+ Output:["_col0","_col1","_col2"]
+ TableScan [TS_0] (rows=9 width=95)
+ default@tbl_ice_puffin,tbl_ice_puffin,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
+
+PREHOOK: query: select * from tbl_ice_puffin order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from tbl_ice_puffin order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1 one 50
+2 two 51
+2 two 51
+2 two 51
+3 three 52
+4 four 53
+5 five 54
+111 one 55
+333 two 56
+PREHOOK: query: select count(*) from tbl_ice_puffin
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from tbl_ice_puffin
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+9
+PREHOOK: query: desc formatted tbl_ice_puffin a
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: query: desc formatted tbl_ice_puffin a
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@tbl_ice_puffin
+col_name a
+data_type int
+min 1
+max 333
+num_nulls 0
+distinct_count 7
+avg_col_len
+max_col_len
+num_trues
+num_falses
+bit_vector HL
+comment
+COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}}
+PREHOOK: query: drop table if exists tbl_ice
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists tbl_ice
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create external table tbl_ice(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl_ice
+POSTHOOK: query: create external table tbl_ice(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl_ice
+PREHOOK: query: insert into tbl_ice values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tbl_ice
+POSTHOOK: query: insert into tbl_ice values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tbl_ice
+PREHOOK: query: explain select * from tbl_ice order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select * from tbl_ice order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized
+ File Output Operator [FS_8]
+ Select Operator [SEL_7] (rows=9 width=95)
+ Output:["_col0","_col1","_col2"]
+ <-Map 1 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_6]
+ Select Operator [SEL_5] (rows=9 width=95)
+ Output:["_col0","_col1","_col2"]
+ TableScan [TS_0] (rows=9 width=95)
+ default@tbl_ice,tbl_ice,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
+
+PREHOOK: query: select * from tbl_ice order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from tbl_ice order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1 one 50
+2 two 51
+2 two 51
+2 two 51
+3 three 52
+4 four 53
+5 five 54
+111 one 55
+333 two 56
+PREHOOK: query: select count(*) from tbl_ice
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from tbl_ice
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+9
+PREHOOK: query: delete from tbl_ice_puffin where a = 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: delete from tbl_ice_puffin where a = 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: explain select * from tbl_ice order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select * from tbl_ice order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized
+ File Output Operator [FS_8]
+ Select Operator [SEL_7] (rows=9 width=95)
+ Output:["_col0","_col1","_col2"]
+ <-Map 1 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_6]
+ Select Operator [SEL_5] (rows=9 width=95)
+ Output:["_col0","_col1","_col2"]
+ TableScan [TS_0] (rows=9 width=95)
+ default@tbl_ice,tbl_ice,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
+
+PREHOOK: query: select count(*) from tbl_ice
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from tbl_ice
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+9
+PREHOOK: query: create table t1 (a int) stored by iceberg tblproperties ('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t1
+POSTHOOK: query: create table t1 (a int) stored by iceberg tblproperties ('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1
+PREHOOK: query: create table t2 (b int) stored by iceberg tblproperties ('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t2
+POSTHOOK: query: create table t2 (b int) stored by iceberg tblproperties ('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t2
+PREHOOK: query: describe formatted t1
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@t1
+POSTHOOK: query: describe formatted t1
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@t1
+# col_name data_type comment
+a int
+
+# Detailed Table Information
+Database: default
+#### A masked pattern was here ####
+Retention: 0
+#### A masked pattern was here ####
+Table Type: MANAGED_TABLE
+Table Parameters:
+ COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\"}}
+ bucketing_version 2
+ current-schema {\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"a\",\"required\":false,\"type\":\"int\"}]}
+ engine.hive.enabled true
+ format-version 2
+ iceberg.orc.files.only false
+ metadata_location hdfs://### HDFS PATH ###
+ numFiles 0
+ numRows 0
+ rawDataSize 0
+ serialization.format 1
+ snapshot-count 0
+ storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
+ table_type ICEBERG
+ totalSize 0
+#### A masked pattern was here ####
+ uuid #Masked#
+ write.delete.mode merge-on-read
+ write.merge.mode merge-on-read
+ write.update.mode merge-on-read
+
+# Storage Information
+SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe
+InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
+OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
+Compressed: No
+Sort Columns: []
+PREHOOK: query: describe formatted t2
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@t2
+POSTHOOK: query: describe formatted t2
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@t2
+# col_name data_type comment
+b int
+
+# Detailed Table Information
+Database: default
+#### A masked pattern was here ####
+Retention: 0
+#### A masked pattern was here ####
+Table Type: MANAGED_TABLE
+Table Parameters:
+ COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"b\":\"true\"}}
+ bucketing_version 2
+ current-schema {\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"b\",\"required\":false,\"type\":\"int\"}]}
+ engine.hive.enabled true
+ format-version 2
+ iceberg.orc.files.only false
+ metadata_location hdfs://### HDFS PATH ###
+ numFiles 0
+ numRows 0
+ rawDataSize 0
+ serialization.format 1
+ snapshot-count 0
+ storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
+ table_type ICEBERG
+ totalSize 0
+#### A masked pattern was here ####
+ uuid #Masked#
+ write.delete.mode merge-on-read
+ write.merge.mode merge-on-read
+ write.update.mode merge-on-read
+
+# Storage Information
+SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe
+InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
+OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
+Compressed: No
+Sort Columns: []
+PREHOOK: query: explain select * from t1 join t2 on t1.a = t2.b
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select * from t1 join t2 on t1.a = t2.b
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2
+ File Output Operator [FS_10]
+ Merge Join Operator [MERGEJOIN_25] (rows=1 width=4)
+ Conds:RS_28._col0=RS_31._col0(Inner),Output:["_col0","_col1"]
+ <-Map 1 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_28]
+ PartitionCols:_col0
+ Select Operator [SEL_27] (rows=1 width=4)
+ Output:["_col0"]
+ Filter Operator [FIL_26] (rows=1 width=4)
+ predicate:a is not null
+ TableScan [TS_0] (rows=1 width=4)
+ default@t1,t1,Tbl:COMPLETE,Col:NONE,Output:["a"]
+ <-Map 3 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_31]
+ PartitionCols:_col0
+ Select Operator [SEL_30] (rows=1 width=4)
+ Output:["_col0"]
+ Filter Operator [FIL_29] (rows=1 width=4)
+ predicate:b is not null
+ TableScan [TS_3] (rows=1 width=4)
+ default@t2,t2,Tbl:COMPLETE,Col:NONE,Output:["b"]
+
diff --git a/iceberg/iceberg-handler/src/test/results/positive/dynamic_partition_writes.q.out b/iceberg/iceberg-handler/src/test/results/positive/dynamic_partition_writes.q.out
index 2cf955f898c..7e7a5eab1e3 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/dynamic_partition_writes.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/dynamic_partition_writes.q.out
@@ -76,9 +76,9 @@ Stage-3
<-Map 1 [SIMPLE_EDGE] vectorized
PARTITION_ONLY_SHUFFLE [RS_13]
PartitionCols:_col1
- Select Operator [SEL_12] (rows=22 width=87)
+ Select Operator [SEL_12] (rows=22 width=91)
Output:["_col0","_col1"]
- TableScan [TS_0] (rows=22 width=87)
+ TableScan [TS_0] (rows=22 width=91)
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b"]
Reducer 3 vectorized
File Output Operator [FS_21]
@@ -90,7 +90,7 @@ Stage-3
PARTITION_ONLY_SHUFFLE [RS_16]
Group By Operator [GBY_15] (rows=1 width=400)
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)"]
- Select Operator [SEL_14] (rows=22 width=87)
+ Select Operator [SEL_14] (rows=22 width=91)
Output:["a","ccy"]
Please refer to the previous Select Operator [SEL_12]
@@ -170,9 +170,9 @@ Stage-3
<-Map 1 [SIMPLE_EDGE] vectorized
PARTITION_ONLY_SHUFFLE [RS_13]
PartitionCols:iceberg_bucket(_col1, 2)
- Select Operator [SEL_12] (rows=22 width=87)
+ Select Operator [SEL_12] (rows=22 width=91)
Output:["_col0","_col1"]
- TableScan [TS_0] (rows=22 width=87)
+ TableScan [TS_0] (rows=22 width=91)
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b"]
Reducer 3 vectorized
File Output Operator [FS_21]
@@ -184,7 +184,7 @@ Stage-3
PARTITION_ONLY_SHUFFLE [RS_16]
Group By Operator [GBY_15] (rows=1 width=400)
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)"]
- Select Operator [SEL_14] (rows=22 width=87)
+ Select Operator [SEL_14] (rows=22 width=91)
Output:["a","ccy"]
Please refer to the previous Select Operator [SEL_12]
@@ -264,9 +264,9 @@ Stage-3
<-Map 1 [SIMPLE_EDGE] vectorized
PARTITION_ONLY_SHUFFLE [RS_13]
PartitionCols:_col1, iceberg_bucket(_col2, 3)
- Select Operator [SEL_12] (rows=22 width=94)
+ Select Operator [SEL_12] (rows=22 width=99)
Output:["_col0","_col1","_col2"]
- TableScan [TS_0] (rows=22 width=94)
+ TableScan [TS_0] (rows=22 width=99)
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
Reducer 3 vectorized
File Output Operator [FS_21]
@@ -278,7 +278,7 @@ Stage-3
PARTITION_ONLY_SHUFFLE [RS_16]
Group By Operator [GBY_15] (rows=1 width=568)
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)","min(c)","max(c)","count(c)","compute_bit_vector_hll(c)"]
- Select Operator [SEL_14] (rows=22 width=94)
+ Select Operator [SEL_14] (rows=22 width=99)
Output:["a","ccy","c"]
Please refer to the previous Select Operator [SEL_12]
@@ -403,7 +403,7 @@ Stage-3
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_14] (rows=4 width=99)
predicate:(b = 'EUR')
- TableScan [TS_0] (rows=22 width=94)
+ TableScan [TS_0] (rows=22 width=99)
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
Reducer 3 vectorized
File Output Operator [FS_24]
@@ -461,7 +461,7 @@ Stage-3
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_12] (rows=1 width=99)
predicate:((c = 100L) and (b = 'USD'))
- TableScan [TS_0] (rows=22 width=94)
+ TableScan [TS_0] (rows=22 width=99)
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
PARTITION_ONLY_SHUFFLE [RS_17]
Group By Operator [GBY_16] (rows=1 width=568)
diff --git a/iceberg/iceberg-handler/src/test/results/positive/llap/vectorized_iceberg_read_mixed.q.out b/iceberg/iceberg-handler/src/test/results/positive/llap/vectorized_iceberg_read_mixed.q.out
index bdef92e60f6..50ce82dc248 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/llap/vectorized_iceberg_read_mixed.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/llap/vectorized_iceberg_read_mixed.q.out
@@ -589,13 +589,13 @@ STAGE PLANS:
minReductionHashAggr: 0.99
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9
- Statistics: Num rows: 2 Data size: 746 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: double), _col1 (type: boolean), _col2 (type: int), _col3 (type: bigint), _col4 (type: binary), _col5 (type: string), _col6 (type: timestamp), _col7 (type: date), _col8 (type: decimal(4,2))
null sort order: zzzzzzzzz
sort order: +++++++++
Map-reduce partition columns: _col0 (type: double), _col1 (type: boolean), _col2 (type: int), _col3 (type: bigint), _col4 (type: binary), _col5 (type: string), _col6 (type: timestamp), _col7 (type: date), _col8 (type: decimal(4,2))
- Statistics: Num rows: 2 Data size: 746 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col9 (type: float)
Execution mode: vectorized, llap
LLAP IO: all inputs (cache only)
@@ -607,14 +607,116 @@ STAGE PLANS:
keys: KEY._col0 (type: double), KEY._col1 (type: boolean), KEY._col2 (type: int), KEY._col3 (type: bigint), KEY._col4 (type: binary), KEY._col5 (type: string), KEY._col6 (type: timestamp), KEY._col7 (type: date), KEY._col8 (type: decimal(4,2))
mode: mergepartial
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9
- Statistics: Num rows: 2 Data size: 746 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: COMPLETE
Select Operator
expressions: _col9 (type: float), _col0 (type: double), _col1 (type: boolean), _col2 (type: int), _col3 (type: bigint), _col4 (type: binary), _col5 (type: string), _col6 (type: timestamp), _col7 (type: date), _col8 (type: decimal(4,2))
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9
+ Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select max(t_float), t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_mixed_all_types
+#### A masked pattern was here ####
+POSTHOOK: query: select max(t_float), t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_mixed_all_types
+#### A masked pattern was here ####
+1.1 1.2 false 4 567890123456789 6 col7 2012-10-03 19:58:08 1234-09-02 10.01
+5.1 6.2 true 40 567890123456780 8 col07 2012-10-03 19:58:09 1234-09-03 10.02
+PREHOOK: query: create external table t1 stored as orc as select * from tbl_ice_mixed_all_types
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@tbl_ice_mixed_all_types
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t1
+POSTHOOK: query: create external table t1 stored as orc as select * from tbl_ice_mixed_all_types
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@tbl_ice_mixed_all_types
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.t_bigint SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_bigint, type:bigint, comment:null), ]
+POSTHOOK: Lineage: t1.t_binary SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_binary, type:binary, comment:null), ]
+POSTHOOK: Lineage: t1.t_boolean SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_boolean, type:boolean, comment:null), ]
+POSTHOOK: Lineage: t1.t_date SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_date, type:date, comment:null), ]
+POSTHOOK: Lineage: t1.t_decimal SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_decimal, type:decimal(4,2), comment:null), ]
+POSTHOOK: Lineage: t1.t_double SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_double, type:double, comment:null), ]
+POSTHOOK: Lineage: t1.t_float SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_float, type:float, comment:null), ]
+POSTHOOK: Lineage: t1.t_int SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_int, type:int, comment:null), ]
+POSTHOOK: Lineage: t1.t_string SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_string, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.t_timestamp SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_timestamp, type:timestamp, comment:null), ]
+PREHOOK: query: explain select max(t_float), t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_mixed_all_types
+#### A masked pattern was here ####
+POSTHOOK: query: explain select max(t_float), t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_mixed_all_types
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: tbl_ice_mixed_all_types
Statistics: Num rows: 2 Data size: 746 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: max(t_float)
+ keys: t_double (type: double), t_boolean (type: boolean), t_int (type: int), t_bigint (type: bigint), t_binary (type: binary), t_string (type: string), t_timestamp (type: timestamp), t_date (type: date), t_decimal (type: decimal(4,2))
+ minReductionHashAggr: 0.99
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9
+ Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: double), _col1 (type: boolean), _col2 (type: int), _col3 (type: bigint), _col4 (type: binary), _col5 (type: string), _col6 (type: timestamp), _col7 (type: date), _col8 (type: decimal(4,2))
+ null sort order: zzzzzzzzz
+ sort order: +++++++++
+ Map-reduce partition columns: _col0 (type: double), _col1 (type: boolean), _col2 (type: int), _col3 (type: bigint), _col4 (type: binary), _col5 (type: string), _col6 (type: timestamp), _col7 (type: date), _col8 (type: decimal(4,2))
+ Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col9 (type: float)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs (cache only)
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: max(VALUE._col0)
+ keys: KEY._col0 (type: double), KEY._col1 (type: boolean), KEY._col2 (type: int), KEY._col3 (type: bigint), KEY._col4 (type: binary), KEY._col5 (type: string), KEY._col6 (type: timestamp), KEY._col7 (type: date), KEY._col8 (type: decimal(4,2))
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9
+ Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ expressions: _col9 (type: float), _col0 (type: double), _col1 (type: boolean), _col2 (type: int), _col3 (type: bigint), _col4 (type: binary), _col5 (type: string), _col6 (type: timestamp), _col7 (type: date), _col8 (type: decimal(4,2))
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9
+ Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 2 Data size: 746 Basic stats: COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE Column stats: COMPLETE
table:
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/iceberg/iceberg-handler/src/test/results/positive/vectorized_iceberg_read_mixed.q.out b/iceberg/iceberg-handler/src/test/results/positive/vectorized_iceberg_read_mixed.q.out
index a43950aa6ac..34696284306 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/vectorized_iceberg_read_mixed.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/vectorized_iceberg_read_mixed.q.out
@@ -538,14 +538,79 @@ Stage-0
Stage-1
Reducer 2 vectorized
File Output Operator [FS_11]
- Select Operator [SEL_10] (rows=2 width=373)
+ Select Operator [SEL_10] (rows=1 width=373)
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"]
- Group By Operator [GBY_9] (rows=2 width=373)
+ Group By Operator [GBY_9] (rows=1 width=373)
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["max(VALUE._col0)"],keys:KEY._col0, KEY._col1, KEY._col2, KEY._col3, KEY._col4, KEY._col5, KEY._col6, KEY._col7, KEY._col8
<-Map 1 [SIMPLE_EDGE] vectorized
SHUFFLE [RS_8]
PartitionCols:_col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
- Group By Operator [GBY_7] (rows=2 width=373)
+ Group By Operator [GBY_7] (rows=1 width=373)
+ Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["max(t_float)"],keys:t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal
+ TableScan [TS_0] (rows=2 width=373)
+ default@tbl_ice_mixed_all_types,tbl_ice_mixed_all_types,Tbl:COMPLETE,Col:COMPLETE,Output:["t_float","t_double","t_boolean","t_int","t_bigint","t_binary","t_string","t_timestamp","t_date","t_decimal"]
+
+PREHOOK: query: select max(t_float), t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_mixed_all_types
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select max(t_float), t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_mixed_all_types
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1.1 1.2 false 4 567890123456789 6 col7 2012-10-03 19:58:08 1234-09-02 10.01
+5.1 6.2 true 40 567890123456780 8 col07 2012-10-03 19:58:09 1234-09-03 10.02
+PREHOOK: query: create external table t1 stored as orc as select * from tbl_ice_mixed_all_types
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@tbl_ice_mixed_all_types
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t1
+POSTHOOK: query: create external table t1 stored as orc as select * from tbl_ice_mixed_all_types
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@tbl_ice_mixed_all_types
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.t_bigint SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_bigint, type:bigint, comment:null), ]
+POSTHOOK: Lineage: t1.t_binary SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_binary, type:binary, comment:null), ]
+POSTHOOK: Lineage: t1.t_boolean SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_boolean, type:boolean, comment:null), ]
+POSTHOOK: Lineage: t1.t_date SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_date, type:date, comment:null), ]
+POSTHOOK: Lineage: t1.t_decimal SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_decimal, type:decimal(4,2), comment:null), ]
+POSTHOOK: Lineage: t1.t_double SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_double, type:double, comment:null), ]
+POSTHOOK: Lineage: t1.t_float SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_float, type:float, comment:null), ]
+POSTHOOK: Lineage: t1.t_int SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_int, type:int, comment:null), ]
+POSTHOOK: Lineage: t1.t_string SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_string, type:string, comment:null), ]
+POSTHOOK: Lineage: t1.t_timestamp SIMPLE [(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_timestamp, type:timestamp, comment:null), ]
+PREHOOK: query: explain select max(t_float), t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_mixed_all_types
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select max(t_float), t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_mixed_all_types
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized
+ File Output Operator [FS_11]
+ Select Operator [SEL_10] (rows=1 width=373)
+ Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"]
+ Group By Operator [GBY_9] (rows=1 width=373)
+ Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["max(VALUE._col0)"],keys:KEY._col0, KEY._col1, KEY._col2, KEY._col3, KEY._col4, KEY._col5, KEY._col6, KEY._col7, KEY._col8
+ <-Map 1 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_8]
+ PartitionCols:_col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8
+ Group By Operator [GBY_7] (rows=1 width=373)
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["max(t_float)"],keys:t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal
TableScan [TS_0] (rows=2 width=373)
default@tbl_ice_mixed_all_types,tbl_ice_mixed_all_types,Tbl:COMPLETE,Col:COMPLETE,Output:["t_float","t_double","t_boolean","t_int","t_bigint","t_binary","t_string","t_timestamp","t_date","t_decimal"]
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java
index aff2f51cbc1..65e14af478a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java
@@ -29,6 +29,8 @@ import org.apache.hadoop.hive.common.classification.InterfaceStability;
import org.apache.hadoop.hive.common.type.SnapshotContext;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaHook;
+import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.LockType;
@@ -42,6 +44,7 @@ import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.parse.AlterTableExecuteSpec;
import org.apache.hadoop.hive.ql.parse.TransformSpec;
import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
@@ -245,6 +248,44 @@ public interface HiveStorageHandler extends Configurable {
return false;
}
+ /**
+ * Returns column statistics (lower bounds, upper bounds, null value counts, NaN, total counts) calculated by
+ * the underlying storage handler implementation.
+ * @param table the table whose column statistics are requested
+ * @return A List of Column Statistics Objects, can be null (this default implementation always returns null)
+ */
+ default List<ColumnStatisticsObj>getColStatistics(org.apache.hadoop.hive.ql.metadata.Table table) {
+ return null;
+ }
+
+ /**
+ * Set column stats for non-native tables. This default implementation stores nothing and returns false.
+ * @param table the table the column statistics belong to
+ * @param colStats the column statistics to store
+ * @return false by default; presumably true once an implementation has stored the stats (TODO confirm)
+ */
+ default boolean setColStatistics(org.apache.hadoop.hive.ql.metadata.Table table,
+ List<ColumnStatistics> colStats) {
+ return false;
+ }
+
+ /**
+ * Check if the storage handler can provide col statistics.
+ * @param tbl the table to check
+ * @return true if the storage handler can supply the col statistics; false in this default implementation
+ */
+ default boolean canProvideColStatistics(org.apache.hadoop.hive.ql.metadata.Table tbl) {
+ return false;
+ }
+
+ /**
+ * Check if the storage handler can set col statistics for the given table ({@code tbl}).
+ * @return true if the storage handler can set the col statistics; false in this default implementation
+ */
+ default boolean canSetColStatistics(org.apache.hadoop.hive.ql.metadata.Table tbl) {
+ return false;
+ }
+
/**
* Check if CTAS and CMV operations should behave in a direct-insert manner (i.e. no move task).
* <p>
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java
index 95d4b439d16..e2ee8ae07b4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java
@@ -218,6 +218,9 @@ public class ColStatsProcessor implements IStatsProcessor {
}
start = System. currentTimeMillis();
+ if (tbl != null && tbl.isNonNative() && tbl.getStorageHandler().canSetColStatistics(tbl)) {
+ tbl.getStorageHandler().setColStatistics(tbl, colStats);
+ }
db.setPartitionColumnStatistics(request);
end = System.currentTimeMillis();
LOG.info("Time taken to update " + colStats.size() + " stats : " + ((end - start)/1000F) + " seconds.");
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 9c1926a747e..a758bcdecd1 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -1069,8 +1069,12 @@ public class StatsUtils {
}
if (fetchColStats && !colStatsToRetrieve.isEmpty()) {
try {
- List<ColumnStatisticsObj> colStat = Hive.get().getTableColumnStatistics(
- dbName, tabName, colStatsToRetrieve, false);
+ List<ColumnStatisticsObj> colStat;
+ if (table.isNonNative() && table.getStorageHandler().canProvideColStatistics(table)) {
+ colStat = table.getStorageHandler().getColStatistics(table);
+ } else {
+ colStat = Hive.get().getTableColumnStatistics(dbName, tabName, colStatsToRetrieve, false);
+ }
stats = convertColStats(colStat, tabName);
} catch (HiveException e) {
LOG.error("Failed to retrieve table statistics: ", e);