You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by bo...@apache.org on 2024/01/18 19:43:54 UTC

(impala) branch master updated: IMPALA-11710: Table properties are not updated in Iceberg metadata files

This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 47e2cc3bb IMPALA-11710: Table properties are not updated in Iceberg metadata files
47e2cc3bb is described below

commit 47e2cc3bb9b982e6b9f30535fa7374f2da1b8dd3
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Tue Jan 16 18:03:24 2024 +0100

    IMPALA-11710: Table properties are not updated in Iceberg metadata files
    
    Impala has some troubles with Iceberg tables that don't specify
    'external.table.purge'='true'. Schema changes like ADD COLUMN is not
    working, also table properties that have been set by an ALTER TABLE
    SET TBLPROPERTIES statement can be reset by subsequent INSERT
    statements.
    
    There was a bug in CatalogOpExecutor.alterIcebergTables(). Its return
    value determines whether we need to also update the table definition in
    HMS, or it was already done by the Iceberg library. In fact, the HMS
    table definition is always updated by the Iceberg library if the table
    is handled by the HiveCatalog. In every other case we need to update
    the HMS table definition ourselves (unless the change won't affect
    HMS).
    
    The issue was that CatalogOpExecutor.alterIcebergTables() returned true
    (which means we need to update HMS) in case of Iceberg tables that
    didn't specify 'external.table.purge'='true'. This was a problem,
    because Iceberg already modified the HMS table and set
    'metadata_location' to a new metadata file. But then Impala came and
    modified some properties of the HMS table definition, but also reset
    'metadata_location' to the original value. Therefore subsequent
    operations/refreshes used the earlier state of the Iceberg table and
    they reset the modifications.
    
    Testing:
     * added e2e tests for Iceberg tables in different catalogs with
       'external.table.merge'='false'
    
    Change-Id: I2a82d022534e1e212d542fbd7916ae033c381c20
    Reviewed-on: http://gerrit.cloudera.org:8080/20907
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../apache/impala/service/CatalogOpExecutor.java   |  5 +-
 .../queries/QueryTest/iceberg-external.test        | 71 ++++++++++++++++++++++
 tests/query_test/test_iceberg.py                   |  3 +
 3 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
index edd68215c..3e620aa51 100644
--- a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
+++ b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
@@ -1405,13 +1405,16 @@ public class CatalogOpExecutor {
 
   /**
    * Executes the ALTER TABLE command for an Iceberg table and reloads its metadata.
+   * Returns true if we also need to update the table definition in HMS. Returns false
+   * if the HMS table is already updated by Iceberg, or there is nothing to update in
+   * HMS (the change is internal to Iceberg).
    */
   private boolean alterIcebergTable(TAlterTableParams params, TDdlExecResponse response,
       IcebergTable tbl, long newCatalogVersion, boolean wantMinimalResult,
       String debugAction)
       throws ImpalaException {
     Preconditions.checkState(tbl.isWriteLockedByCurrentThread());
-    boolean needsToUpdateHms = !isIcebergHmsIntegrationEnabled(tbl.getMetaStoreTable());
+    boolean needsToUpdateHms = !IcebergUtil.isHiveCatalog(tbl.getMetaStoreTable());
     try {
       org.apache.iceberg.Transaction iceTxn = IcebergUtil.getIcebergTransaction(tbl);
       switch (params.getAlter_type()) {
diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-external.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-external.test
new file mode 100644
index 000000000..e57dc2a19
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-external.test
@@ -0,0 +1,71 @@
+====
+---- QUERY
+create table ice_hive_ext (i int)
+stored by iceberg
+tblproperties ('external.table.purge'='false');
+insert into ice_hive_ext values (4);
+---- DML_RESULTS: ice_hive_ext
+4
+---- TYPES
+INT
+====
+---- QUERY
+alter table ice_hive_ext add column j int;
+select * from ice_hive_ext;
+---- RESULTS
+4,NULL
+---- TYPES
+INT,INT
+====
+---- QUERY
+insert into ice_hive_ext values (5,5);
+---- DML_RESULTS: ice_hive_ext
+4,NULL
+5,5
+---- TYPES
+INT,INT
+====
+---- QUERY
+alter table ice_hive_ext set tblproperties ('external.table.purge'='true');
+insert into ice_hive_ext values (6,6);
+describe formatted ice_hive_ext
+---- RESULTS: VERIFY_IS_SUBSET
+'','external.table.purge','true                '
+---- TYPES
+STRING,STRING,STRING
+====
+---- QUERY
+create table ice_hadoop_tables_ext (i int)
+stored by iceberg
+tblproperties ('iceberg.catalog'='hadoop.tables', 'external.table.purge'='false');
+insert into ice_hadoop_tables_ext values (4);
+---- DML_RESULTS: ice_hadoop_tables_ext
+4
+---- TYPES
+INT
+====
+---- QUERY
+alter table ice_hadoop_tables_ext add column j int;
+select * from ice_hadoop_tables_ext;
+---- RESULTS
+4,NULL
+---- TYPES
+INT,INT
+====
+---- QUERY
+insert into ice_hadoop_tables_ext values (5,5);
+---- DML_RESULTS: ice_hadoop_tables_ext
+4,NULL
+5,5
+---- TYPES
+INT,INT
+====
+---- QUERY
+alter table ice_hadoop_tables_ext set tblproperties ('external.table.purge'='true');
+insert into ice_hadoop_tables_ext values (6,6);
+describe formatted ice_hadoop_tables_ext
+---- RESULTS: VERIFY_IS_SUBSET
+'','external.table.purge','true                '
+---- TYPES
+STRING,STRING,STRING
+====
diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py
index dd87a006b..d1737df78 100644
--- a/tests/query_test/test_iceberg.py
+++ b/tests/query_test/test_iceberg.py
@@ -72,6 +72,9 @@ class TestIcebergTable(IcebergTestSuite):
   def test_alter_iceberg_tables(self, vector, unique_database):
     self.run_test_case('QueryTest/iceberg-alter', vector, use_db=unique_database)
 
+  def test_external_iceberg_tables(self, vector, unique_database):
+    self.run_test_case('QueryTest/iceberg-external', vector, unique_database)
+
   def test_expire_snapshots(self, unique_database):
     tbl_name = unique_database + ".expire_snapshots"
     iceberg_catalogs = IcebergCatalogs(unique_database)