Posted to commits@hive.apache.org by dk...@apache.org on 2023/05/23 07:55:38 UTC
[hive] branch master updated: HIVE-27347: Addendum: count(*) should be using Iceberg stats only if there are no deletes (Denys Kuzmenko, reviewed by Ayush Saxena)
This is an automated email from the ASF dual-hosted git repository.
dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 36f5a09a997 HIVE-27347: Addendum: count(*) should be using Iceberg stats only if there are no deletes (Denys Kuzmenko, reviewed by Ayush Saxena)
36f5a09a997 is described below
commit 36f5a09a9975415154320c33cec4cf3ca5757f6b
Author: Denys Kuzmenko <de...@gmail.com>
AuthorDate: Tue May 23 10:55:28 2023 +0300
HIVE-27347: Addendum: count(*) should be using Iceberg stats only if there are no deletes (Denys Kuzmenko, reviewed by Ayush Saxena)
Closes #4343
---
.../iceberg/mr/hive/HiveIcebergStorageHandler.java | 25 ++++++--
.../src/test/queries/positive/iceberg_stats.q | 6 ++
.../src/test/results/positive/iceberg_stats.q.out | 73 +++++++++++++++++++++-
.../hive/ql/metadata/HiveStorageHandler.java | 8 +++
.../hadoop/hive/ql/optimizer/StatsOptimizer.java | 2 +-
5 files changed, 107 insertions(+), 7 deletions(-)
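
[Editor's note] For context, a minimal sketch (illustrative only, not part of this commit) of the gate the addendum introduces: count(*) may be answered from Iceberg snapshot stats only when the current snapshot reports zero equality and position deletes. It assumes an already loaded org.apache.iceberg.Table; the class and method names here are invented, but the summary keys are the same SnapshotSummary constants used in the diff below.

import java.util.Map;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.SnapshotSummary;
import org.apache.iceberg.Table;

public class DeleteAwareStatsCheck {
  // Mirrors the check added in HiveIcebergStorageHandler#canComputeQueryUsingStats:
  // only trust snapshot-level stats when no delete records exist.
  static boolean hasNoDeletes(Table table) {
    Snapshot snapshot = table.currentSnapshot();
    if (snapshot == null || snapshot.summary() == null) {
      return false;
    }
    Map<String, String> summary = snapshot.summary();
    if (!summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) ||
        !summary.containsKey(SnapshotSummary.TOTAL_POS_DELETES_PROP)) {
      return false;
    }
    long eqDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_EQ_DELETES_PROP));
    long posDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_POS_DELETES_PROP));
    return eqDeletes + posDeletes == 0;
  }
}
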
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
index ee7fbfaeb0d..66d336a03c9 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -352,9 +352,8 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable();
// For write queries where rows got modified, don't fetch from cache as values could have changed.
Table table = getTable(hmsTable);
- String statsSource = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_ICEBERG_STATS_SOURCE).toLowerCase();
Map<String, String> stats = Maps.newHashMap();
- if (statsSource.equals(ICEBERG)) {
+ if (getStatsSource().equals(ICEBERG)) {
if (table.currentSnapshot() != null) {
Map<String, String> summary = table.currentSnapshot().summary();
if (summary != null) {
@@ -405,7 +404,7 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
@Override
public boolean canSetColStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
Table table = IcebergTableUtil.getTable(conf, hmsTable.getTTable());
- return table.currentSnapshot() != null ? getStatsSource().equals(ICEBERG) : false;
+ return table.currentSnapshot() != null && getStatsSource().equals(ICEBERG);
}
@Override
@@ -463,11 +462,29 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
ByteBuffers.toByteArray(blobMetadataByteBufferPair.second()))));
return collect.get(blobMetadata.get(0)).get(0).getStatsObj();
} catch (IOException e) {
- LOG.error("Error when trying to read iceberg col stats from puffin files: {}", e);
+ LOG.error("Error when trying to read iceberg col stats from puffin files: ", e);
}
return null;
}
+ @Override
+ public boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
+ if (getStatsSource().equals(ICEBERG)) {
+ Table table = getTable(hmsTable);
+ if (table.currentSnapshot() != null) {
+ Map<String, String> summary = table.currentSnapshot().summary();
+ if (summary != null && summary.containsKey(SnapshotSummary.TOTAL_EQ_DELETES_PROP) &&
+ summary.containsKey(SnapshotSummary.TOTAL_POS_DELETES_PROP)) {
+
+ long totalEqDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_EQ_DELETES_PROP));
+ long totalPosDeletes = Long.parseLong(summary.get(SnapshotSummary.TOTAL_POS_DELETES_PROP));
+ return totalEqDeletes + totalPosDeletes == 0;
+ }
+ }
+ }
+ return false;
+ }
+
private String getStatsSource() {
return HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_ICEBERG_STATS_SOURCE, ICEBERG).toLowerCase();
}
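
[Editor's note] Why deletes matter here: the snapshot summary's row counter tallies records in data files and does not subtract rows voided by equality or position delete files, so with deletes present it over-counts. A hedged sketch of the metadata-only read, assuming SnapshotSummary.TOTAL_RECORDS_PROP (a standard Iceberg summary key, not shown in this diff) and a loaded table; the class name is invented.

import org.apache.iceberg.SnapshotSummary;
import org.apache.iceberg.Table;

final class RowCountFromStats {
  // Reads the row count Hive would serve for count(*) straight from snapshot
  // metadata. With equality or position deletes present this figure includes
  // deleted rows, which is why canComputeQueryUsingStats() must return false
  // in that case and force a real scan.
  static long totalRecords(Table table) {
    return Long.parseLong(
        table.currentSnapshot().summary().get(SnapshotSummary.TOTAL_RECORDS_PROP));
  }
}
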
diff --git a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q
index 4238cbba4b9..8276cef8aae 100644
--- a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q
+++ b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_stats.q
@@ -1,3 +1,5 @@
+--! qt:replace:/(\s+Statistics\: Num rows\: \d+ Data size\:\s+)\S+(\s+Basic stats\: \S+ Column stats\: \S+)/$1#Masked#$2/
+
set hive.compute.query.using.stats=true;
set hive.explain.user=false;
@@ -14,4 +16,8 @@ delete from ice01 where id in (2,4);
explain select count(*) from ice01;
select count(*) from ice01;
+-- iow
+insert overwrite table ice01 select * from ice01;
+explain select count(*) from ice01;
+
drop table ice01;
\ No newline at end of file
diff --git a/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out b/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out
index 150c48f2a16..60726e60008 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/iceberg_stats.q.out
@@ -60,12 +60,55 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@ice01
POSTHOOK: Output: hdfs://### HDFS PATH ###
STAGE DEPENDENCIES:
- Stage-0 is a root stage
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: ice01
+ Statistics: Num rows: 3 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+ Statistics: Num rows: 3 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: count()
+ minReductionHashAggr: 0.6666666
+ mode: hash
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ null sort order:
+ sort order:
+ Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col0 (type: bigint)
+ Execution mode: vectorized
+ Reducer 2
+ Execution mode: vectorized
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(VALUE._col0)
+ mode: mergepartial
+ outputColumnNames: _col0
+ Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: #Masked# Basic stats: COMPLETE Column stats: COMPLETE
+ table:
+ input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
Stage: Stage-0
Fetch Operator
- limit: 1
+ limit: -1
Processor Tree:
ListSink
@@ -78,6 +121,32 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@ice01
POSTHOOK: Output: hdfs://### HDFS PATH ###
3
+PREHOOK: query: insert overwrite table ice01 select * from ice01
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice01
+PREHOOK: Output: default@ice01
+POSTHOOK: query: insert overwrite table ice01 select * from ice01
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice01
+POSTHOOK: Output: default@ice01
+PREHOOK: query: explain select count(*) from ice01
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice01
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select count(*) from ice01
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice01
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+STAGE DEPENDENCIES:
+ Stage-0 is a root stage
+
+STAGE PLANS:
+ Stage: Stage-0
+ Fetch Operator
+ limit: 1
+ Processor Tree:
+ ListSink
+
PREHOOK: query: drop table ice01
PREHOOK: type: DROPTABLE
PREHOOK: Input: default@ice01
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java
index fb5a1066e1c..1ebbe508e0c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java
@@ -287,6 +287,14 @@ public interface HiveStorageHandler extends Configurable {
return false;
}
+ /**
+ * Check if the storage handler can answer a few queries like count(1) purely using stats.
+ * @return true if the storage handler can answer the query using statistics
+ */
+ default boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Table tbl) {
+ return false;
+ }
+
/**
*
* Gets the storage format descriptor to be used for temp table for LOAD data.
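
[Editor's note] The new interface hook defaults to false, so storage handlers other than Iceberg keep their current behavior. A hedged sketch of how a hypothetical handler could opt in; ExampleStorageHandler and statsAreExact are invented names, not Hive APIs. The class is abstract so the remaining HiveStorageHandler methods need not be spelled out here.

import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.Table;

public abstract class ExampleStorageHandler implements HiveStorageHandler {
  @Override
  public boolean canComputeQueryUsingStats(Table tbl) {
    // Return true only when row counts are exact for this table; answering
    // count(*) from approximate or stale stats would yield wrong results.
    return statsAreExact(tbl);  // hypothetical helper, not a Hive API
  }

  protected abstract boolean statsAreExact(Table tbl);
}
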
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
index fb5a3824deb..e6cc04c555e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
@@ -300,7 +300,7 @@ public class StatsOptimizer extends Transform {
return null;
}
if (MetaStoreUtils.isNonNativeTable(tbl.getTTable())
- && !tbl.getStorageHandler().canProvideBasicStatistics() ) {
+ && !tbl.getStorageHandler().canComputeQueryUsingStats(tbl)) {
Logger.info("Table " + tbl.getTableName() + " is non Native table. Skip StatsOptimizer.");
return null;
}
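
[Editor's note] The net effect on the optimizer gate, shown as a standalone truth-table demo (plain Java, no Hive dependencies; all names invented): a non-native table now passes StatsOptimizer only when its handler vouches that the query is answerable from stats alone.

public class StatsGateDemo {
  static boolean skipStatsOptimizer(boolean nonNative, boolean canComputeUsingStats) {
    // After HIVE-27347 the gate is the handler's per-table answer, not the
    // blanket canProvideBasicStatistics() capability.
    return nonNative && !canComputeUsingStats;
  }

  public static void main(String[] args) {
    // Iceberg table with live deletes: handler returns false -> optimizer is
    // skipped, so count(*) runs a real scan (first explain in the q.out).
    System.out.println(skipStatsOptimizer(true, false));  // true
    // After insert overwrite, deletes are gone: the stats path is safe again
    // (second explain collapses to a single Fetch stage).
    System.out.println(skipStatsOptimizer(true, true));   // false
  }
}
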