Posted to commits@impala.apache.org by bo...@apache.org on 2022/12/13 12:15:26 UTC

[impala] branch master updated: IMPALA-11784: Don't call Iceberg's planFiles redundantly during table load

This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new da304c1fe IMPALA-11784: Don't call Iceberg's planFiles redundantly during table load
da304c1fe is described below

commit da304c1feddd24ce5042d38365c9357d61608962
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Fri Dec 9 15:34:39 2022 +0100

    IMPALA-11784: Don't call Iceberg's planFiles redundantly during table load
    
    Iceberg's planFiles() API is very expensive because it involves reading
    the Avro manifest files. It's especially expensive on object stores,
    though manifest caching can help here.
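    
    A minimal sketch, assuming a plain org.apache.iceberg.Table handle, of
    the kind of scan that IcebergUtil.getIcebergFiles() wraps; the Avro
    manifest reads happen inside planFiles(), which is why each extra call
    is costly (the wrapper class and method name here are illustrative):
    
        import java.io.IOException;
        import java.util.List;
        
        import org.apache.iceberg.DataFile;
        import org.apache.iceberg.DeleteFile;
        import org.apache.iceberg.FileScanTask;
        import org.apache.iceberg.Table;
        import org.apache.iceberg.io.CloseableIterable;
        
        final class PlanFilesSketch {
          static void listContentFiles(Table icebergTable) throws IOException {
            // planFiles() reads every manifest referenced by the current
            // snapshot, so its cost scales with the number of manifests.
            try (CloseableIterable<FileScanTask> tasks =
                     icebergTable.newScan().planFiles()) {
              for (FileScanTask task : tasks) {
                DataFile dataFile = task.file();            // data file of this task
                List<DeleteFile> deletes = task.deletes();  // V2 delete files
                // ... collect dataFile and deletes ...
              }
            }
          }
        }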
    
    Currently we invoke this API twice during table loading (via
    IcebergUtil.getIcebergFiles()): once in loadAllPartition() and once in
    loadPartitionStats().
    
    With this patch we invoke IcebergUtil.getIcebergFiles() once, then pass
    the result object to loadAllPartition() and loadPartitionStats().
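    
    Condensed, the caller-side change in IcebergTable.load() looks like the
    following (full context in the diff below; the comments are editorial):
    
        // Before: each loader planned the table's files on its own,
        // so planFiles() effectively ran twice per table load:
        //   fileStore_ = Utils.loadAllPartition(this);
        //   partitionStats_ = Utils.loadPartitionStats(this);
        
        // After: plan once, then share the result with both loaders:
        Pair<List<DataFile>, Set<DeleteFile>> icebergFiles = IcebergUtil
            .getIcebergFiles(this, new ArrayList<>(), /*timeTravelSpec=*/null);
        fileStore_ = Utils.loadAllPartition(this, icebergFiles);
        partitionStats_ = Utils.loadPartitionStats(this, icebergFiles);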
    
    Change-Id: I72575c722e65a91b14926cf24b4622b4499a4e20
    Reviewed-on: http://gerrit.cloudera.org:8080/19334
    Reviewed-by: <li...@sensorsdata.cn>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Tamas Mate <tm...@apache.org>
---
 .../main/java/org/apache/impala/catalog/FeIcebergTable.java | 13 ++++++-------
 .../main/java/org/apache/impala/catalog/IcebergTable.java   |  9 +++++++--
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java b/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
index eaf4d24a2..529dd3177 100644
--- a/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
@@ -669,7 +669,8 @@ public interface FeIcebergTable extends FeFsTable {
      * path, using org.apache.hadoop.fs.Path to normalize the paths.
      */
     public static IcebergContentFileStore loadAllPartition(
-        IcebergTable table) throws IOException, TableLoadingException {
+        IcebergTable table, Pair<List<DataFile>, Set<DeleteFile>> icebergFiles)
+        throws IOException, TableLoadingException {
       Map<String, HdfsPartition.FileDescriptor> hdfsFileDescMap = new HashMap<>();
       Collection<HdfsPartition> partitions =
           ((HdfsTable)table.getFeFsTable()).partitionMap_.values();
@@ -680,9 +681,8 @@ public interface FeIcebergTable extends FeFsTable {
         }
       }
       IcebergContentFileStore fileStore = new IcebergContentFileStore();
-      Pair<List<DataFile>, Set<DeleteFile>> allFiles = IcebergUtil.getIcebergFiles(
-          table, new ArrayList<>(), /*timeTravelSpecl=*/null);
-      for (ContentFile contentFile : Iterables.concat(allFiles.first, allFiles.second)) {
+      for (ContentFile contentFile : Iterables.concat(icebergFiles.first,
+                                                      icebergFiles.second)) {
         addContentFileToFileStore(contentFile, fileStore, table, hdfsFileDescMap);
       }
       return fileStore;
@@ -736,9 +736,8 @@ public interface FeIcebergTable extends FeFsTable {
      * TODO(IMPALA-11516): Return better partition stats for V2 tables.
      */
     public static Map<String, TIcebergPartitionStats> loadPartitionStats(
-        IcebergTable table) throws TableLoadingException {
-      Pair<List<DataFile>, Set<DeleteFile>> icebergFiles = IcebergUtil
-          .getIcebergFiles(table, new ArrayList<>(), /*timeTravelSpec=*/null);
+        IcebergTable table, Pair<List<DataFile>, Set<DeleteFile>> icebergFiles)
+        throws TableLoadingException {
       List<DataFile> dataFileList = icebergFiles.first;
       Set<DeleteFile> deleteFileList = icebergFiles.second;
       Map<String, TIcebergPartitionStats> nameToStats = new HashMap<>();
diff --git a/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java b/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java
index 04e418e1c..a7333d342 100644
--- a/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java
@@ -29,10 +29,13 @@ import org.apache.hadoop.hive.common.StatsSetupConst;
 import org.apache.hadoop.hive.metastore.IMetaStoreClient;
 import org.apache.hadoop.hive.metastore.TableType;
 import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.DeleteFile;
 import org.apache.impala.analysis.IcebergPartitionField;
 import org.apache.impala.analysis.IcebergPartitionSpec;
 import org.apache.impala.analysis.IcebergPartitionTransform;
 import org.apache.impala.catalog.HdfsPartition.FileDescriptor;
+import org.apache.impala.common.Pair;
 import org.apache.impala.thrift.TCatalogObjectType;
 import org.apache.impala.thrift.TCompressionCodec;
 import org.apache.impala.thrift.TGetPartialCatalogObjectRequest;
@@ -355,8 +358,10 @@ public class IcebergTable extends Table implements FeIcebergTable {
         icebergParquetDictPageSize_ = Utils.getIcebergParquetDictPageSize(msTbl);
         hdfsTable_
             .load(false, msClient, msTable_, true, true, false, null, null,null, reason);
-        fileStore_ = Utils.loadAllPartition(this);
-        partitionStats_ = Utils.loadPartitionStats(this);
+        Pair<List<DataFile>, Set<DeleteFile>> icebergFiles = IcebergUtil
+            .getIcebergFiles(this, new ArrayList<>(), /*timeTravelSpec=*/null);
+        fileStore_ = Utils.loadAllPartition(this, icebergFiles);
+        partitionStats_ = Utils.loadPartitionStats(this, icebergFiles);
         setIcebergTableStats();
         loadAllColumnStats(msClient);
         setAvroSchema(msClient, msTbl, fileStore_);