You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2023/09/26 02:22:59 UTC
[impala] 01/02: IMPALA-12462: Update only changed partitions after COMPUTE STATS
This is an automated email from the ASF dual-hosted git repository.
joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 45d6815821a29b83c7a3daa3d380a40e0e4f3836
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Sun Sep 24 14:21:01 2023 +0200
IMPALA-12462: Update only changed partitions after COMPUTE STATS
This is mainly a revert of https://gerrit.cloudera.org/#/c/640/ but
some parts had to be updated due to changes in Impala.
See IMPALA-2201 for details about why this optimization was removed.
The patch can massively speed up COMPUTE STATS statement when the
majority of partitions has no changes.
COMPUTE STATS tpcds_parquet.store_sales;
before: 12s
after: 1s
Besides the DDL speed up the number of HMS events generated is also
reduced.
Testing:
- added test to verify COMPUTE STATS output
- correctness of cases when something is modified should be covered
by existing tests
- core tests passed
Change-Id: If2703e0790d5c25db98ed26f26f6d96281c366a3
Reviewed-on: http://gerrit.cloudera.org:8080/20505
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Reviewed-by: Wenzhe Zhou <wz...@cloudera.com>
---
.../apache/impala/service/CatalogOpExecutor.java | 31 +++++++++++++++++-----
.../QueryTest/compute-stats-incremental.test | 8 ++++++
.../queries/QueryTest/compute-stats.test | 8 ++++++
3 files changed, 41 insertions(+), 6 deletions(-)
diff --git a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
index 256666d1b..d263222db 100644
--- a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
+++ b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
@@ -1881,15 +1881,34 @@ public class CatalogOpExecutor {
partitionStats.stats.setNum_rows(0L);
}
- // Unconditionally update the partition stats and row count, even if the partition
- // already has identical ones. This behavior results in possibly redundant work,
- // but it is predictable and easy to reason about because it does not depend on the
- // existing state of the metadata. See IMPALA-2201.
+ // Update the partition in HMS only if something has changed.
+ // Note that previously all partitions were updated to avoid an HMS bug.
+ // See IMPALA-2201.
+ boolean updatedPartition = false;
+ TPartitionStats existingPartStats = partition.getPartitionStats();
+ // Update the partition stats if: either there are no existing stats for this
+ // partition, or they're different.
+ if (existingPartStats == null || !existingPartStats.equals(partitionStats)) {
+ updatedPartition = true;
+ }
+
long numRows = partitionStats.stats.num_rows;
+ String existingRowCount =
+ partition.getParameters().get(StatsSetupConst.ROW_COUNT);
+ if (existingRowCount == null ||
+ !existingRowCount.equals(String.valueOf(numRows))) {
+ // The existing row count value wasn't set or has changed.
+ updatedPartition = true;
+ }
+
if (LOG.isTraceEnabled()) {
- LOG.trace(String.format("Updating stats for partition %s: numRows=%d",
- partition.getValuesAsString(), numRows));
+ LOG.trace("{} stats for partition {}: numRows={}",
+ updatedPartition ? "Updating" : "Skip updating",
+ partition.getValuesAsString(), numRows);
}
+
+ if (!updatedPartition) continue;
+
HdfsPartition.Builder partBuilder = new HdfsPartition.Builder(partition);
PartitionStatsUtil.partStatsToPartition(partitionStats, partBuilder);
partBuilder.setRowCountParam(numRows);
diff --git a/testdata/workloads/functional-query/queries/QueryTest/compute-stats-incremental.test b/testdata/workloads/functional-query/queries/QueryTest/compute-stats-incremental.test
index 2eb659fc7..6a8b90740 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/compute-stats-incremental.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/compute-stats-incremental.test
@@ -207,6 +207,14 @@ compute incremental stats alltypes_incremental partition(year=2010, month=2);
STRING
====
---- QUERY
+# Test that no partitions are updated if there are no changes (IMPALA-12462).
+compute incremental stats alltypes_incremental partition(year=2010, month=2);
+---- RESULTS
+'Updated 0 partition(s) and 11 column(s).'
+---- TYPES
+STRING
+====
+---- QUERY
show table stats alltypes_incremental;
---- RESULTS
'2009','1',310,1,'24.56KB','NOT CACHED','NOT CACHED','TEXT','true',regex:.*,'$ERASURECODE_POLICY'
diff --git a/testdata/workloads/functional-query/queries/QueryTest/compute-stats.test b/testdata/workloads/functional-query/queries/QueryTest/compute-stats.test
index e9623425c..22e58bc14 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/compute-stats.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/compute-stats.test
@@ -13,6 +13,14 @@ compute stats alltypes
STRING
====
---- QUERY
+# Test that no partitions are updated if there are no changes (IMPALA-12462).
+compute stats alltypes
+---- RESULTS
+'Updated 0 partition(s) and 11 column(s).'
+---- TYPES
+STRING
+====
+---- QUERY
show table stats alltypes
---- LABELS
YEAR, MONTH, #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL STATS, LOCATION, EC POLICY