You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2023/09/26 02:22:59 UTC

[impala] 01/02: IMPALA-12462: Update only changed partitions after COMPUTE STATS

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 45d6815821a29b83c7a3daa3d380a40e0e4f3836
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Sun Sep 24 14:21:01 2023 +0200

    IMPALA-12462: Update only changed partitions after COMPUTE STATS
    
    This is mainly a revert of https://gerrit.cloudera.org/#/c/640/ but
    some parts had to be updated due to changes in Impala.
    See IMPALA-2201 for details about why this optimization was removed.
    
    The patch can massively speed up COMPUTE STATS statement when the
    majority of partitions has no changes.
    COMPUTE STATS tpcds_parquet.store_sales;
    before: 12s
    after:   1s
    
    Besides the DDL speed up the number of HMS events generated is also
    reduced.
    
    Testing:
    - added test to verify COMPUTE STATS output
    - correctness of cases when something is modified should be covered
      by existing tests
    - core tests passed
    
    Change-Id: If2703e0790d5c25db98ed26f26f6d96281c366a3
    Reviewed-on: http://gerrit.cloudera.org:8080/20505
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Wenzhe Zhou <wz...@cloudera.com>
---
 .../apache/impala/service/CatalogOpExecutor.java   | 31 +++++++++++++++++-----
 .../QueryTest/compute-stats-incremental.test       |  8 ++++++
 .../queries/QueryTest/compute-stats.test           |  8 ++++++
 3 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
index 256666d1b..d263222db 100644
--- a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
+++ b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
@@ -1881,15 +1881,34 @@ public class CatalogOpExecutor {
         partitionStats.stats.setNum_rows(0L);
       }
 
-      // Unconditionally update the partition stats and row count, even if the partition
-      // already has identical ones. This behavior results in possibly redundant work,
-      // but it is predictable and easy to reason about because it does not depend on the
-      // existing state of the metadata. See IMPALA-2201.
+      // Update the partition in HMS only if something has changed.
+      // Note that previously all partitions were updated to avoid an HMS bug.
+      // See IMPALA-2201.
+      boolean updatedPartition = false;
+      TPartitionStats existingPartStats = partition.getPartitionStats();
+      // Update the partition stats if: either there are no existing stats for this
+      // partition, or they're different.
+      if (existingPartStats == null || !existingPartStats.equals(partitionStats)) {
+        updatedPartition = true;
+      }
+
       long numRows = partitionStats.stats.num_rows;
+      String existingRowCount =
+          partition.getParameters().get(StatsSetupConst.ROW_COUNT);
+      if (existingRowCount == null ||
+          !existingRowCount.equals(String.valueOf(numRows))) {
+        // The existing row count value wasn't set or has changed.
+        updatedPartition = true;
+      }
+
       if (LOG.isTraceEnabled()) {
-        LOG.trace(String.format("Updating stats for partition %s: numRows=%d",
-            partition.getValuesAsString(), numRows));
+        LOG.trace("{} stats for partition {}: numRows={}",
+            updatedPartition ? "Updating" : "Skip updating",
+            partition.getValuesAsString(), numRows);
       }
+
+      if (!updatedPartition) continue;
+
       HdfsPartition.Builder partBuilder = new HdfsPartition.Builder(partition);
       PartitionStatsUtil.partStatsToPartition(partitionStats, partBuilder);
       partBuilder.setRowCountParam(numRows);
diff --git a/testdata/workloads/functional-query/queries/QueryTest/compute-stats-incremental.test b/testdata/workloads/functional-query/queries/QueryTest/compute-stats-incremental.test
index 2eb659fc7..6a8b90740 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/compute-stats-incremental.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/compute-stats-incremental.test
@@ -207,6 +207,14 @@ compute incremental stats alltypes_incremental partition(year=2010, month=2);
 STRING
 ====
 ---- QUERY
+# Test that no partitions are updated if there are no changes (IMPALA-12462).
+compute incremental stats alltypes_incremental partition(year=2010, month=2);
+---- RESULTS
+'Updated 0 partition(s) and 11 column(s).'
+---- TYPES
+STRING
+====
+---- QUERY
 show table stats alltypes_incremental;
 ---- RESULTS
 '2009','1',310,1,'24.56KB','NOT CACHED','NOT CACHED','TEXT','true',regex:.*,'$ERASURECODE_POLICY'
diff --git a/testdata/workloads/functional-query/queries/QueryTest/compute-stats.test b/testdata/workloads/functional-query/queries/QueryTest/compute-stats.test
index e9623425c..22e58bc14 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/compute-stats.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/compute-stats.test
@@ -13,6 +13,14 @@ compute stats alltypes
 STRING
 ====
 ---- QUERY
+# Test that no partitions are updated if there are no changes (IMPALA-12462).
+compute stats alltypes
+---- RESULTS
+'Updated 0 partition(s) and 11 column(s).'
+---- TYPES
+STRING
+====
+---- QUERY
 show table stats alltypes
 ---- LABELS
 YEAR, MONTH, #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL STATS, LOCATION, EC POLICY