Posted to commits@impala.apache.org by st...@apache.org on 2023/09/05 01:59:54 UTC

[impala] 02/03: IMPALA-12409: Don't allow EXTERNAL Iceberg tables to point to another Iceberg table in the Hive catalog

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit a9aaeaa4896ce025bd9fff2fd60f230a1c1e3733
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Mon Aug 28 19:38:03 2023 +0200

    IMPALA-12409: Don't allow EXTERNAL Iceberg tables to point to another Iceberg table in the Hive catalog
    
    This patch forbids creating an EXTERNAL Iceberg table that points
    to another Iceberg table in the Hive Catalog. That is, the following
    is now forbidden:
    
      CREATE EXTERNAL TABLE ice_ext
      STORED BY ICEBERG
      TBLPROPERTIES ('iceberg.table_identifier'='db.tbl');
    
    Loading such tables now also raises an error. Users need to query
    the original Iceberg tables instead. Alternatively, they can create
    VIEWs if they want to query a table under a different name.
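    
    For example, a minimal sketch of such a view (not part of this
    patch; 'ice_view' is a placeholder name and 'db.tbl' stands for the
    original table from the statement above):
    
      CREATE VIEW ice_view AS
      SELECT * FROM db.tbl;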
    
    Testing:
     * added e2e tests for CREATE EXTERNAL TABLE
 * added e2e test for loading such a table
    
    Change-Id: Ifb0d7f0e7ec40fba356bd58b43f68d070432de71
    Reviewed-on: http://gerrit.cloudera.org:8080/20429
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../analysis/AlterTableSetTblProperties.java       |  4 ++
 .../apache/impala/analysis/CreateTableStmt.java    | 17 ++---
 .../org/apache/impala/catalog/IcebergTable.java    | 29 +++++++++
 .../queries/QueryTest/iceberg-catalogs.test        | 38 ------------
 .../queries/QueryTest/iceberg-insert.test          | 72 ----------------------
 .../queries/QueryTest/iceberg-negative.test        | 60 +++++++++++++++++-
 tests/query_test/test_iceberg.py                   | 27 ++++++++
 7 files changed, 123 insertions(+), 124 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/AlterTableSetTblProperties.java b/fe/src/main/java/org/apache/impala/analysis/AlterTableSetTblProperties.java
index 85ed35108..46d6d271e 100644
--- a/fe/src/main/java/org/apache/impala/analysis/AlterTableSetTblProperties.java
+++ b/fe/src/main/java/org/apache/impala/analysis/AlterTableSetTblProperties.java
@@ -25,6 +25,8 @@ import org.apache.avro.SchemaParseException;
 import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
 import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils;
 import org.apache.iceberg.DataFile;
+import org.apache.iceberg.mr.Catalogs;
+import org.apache.iceberg.mr.InputFormatConfig;
 import org.apache.impala.authorization.AuthorizationConfig;
 import org.apache.impala.catalog.FeFsTable;
 import org.apache.impala.catalog.FeHBaseTable;
@@ -155,6 +157,8 @@ public class AlterTableSetTblProperties extends AlterTableSetStmt {
     icebergPropertyCheck(IcebergTable.ICEBERG_CATALOG);
     icebergPropertyCheck(IcebergTable.ICEBERG_CATALOG_LOCATION);
     icebergPropertyCheck(IcebergTable.ICEBERG_TABLE_IDENTIFIER);
+    icebergPropertyCheck(Catalogs.NAME);
+    icebergPropertyCheck(InputFormatConfig.TABLE_IDENTIFIER);
     icebergPropertyCheck(IcebergTable.METADATA_LOCATION);
     if (tblProperties_.containsKey(IcebergTable.ICEBERG_FILE_FORMAT)) {
       icebergTableFormatCheck(tblProperties_.get(IcebergTable.ICEBERG_FILE_FORMAT));
diff --git a/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java b/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
index bc378c5d5..8f8422555 100644
--- a/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
@@ -775,6 +775,10 @@ public class CreateTableStmt extends StatementBase {
       analyzer_.addWarning("The table property 'external.table.purge' will be set "
           + "to 'TRUE' on newly created managed Iceberg tables.");
     }
+    if (isExternalWithNoPurge() && IcebergUtil.isHiveCatalog(getTblProperties())) {
+        throw new AnalysisException("Cannot create EXTERNAL Iceberg table in the " +
+            "Hive Catalog.");
+    }
   }
 
   private void validateTableInHiveCatalog() throws AnalysisException {
@@ -782,19 +786,6 @@ public class CreateTableStmt extends StatementBase {
       throw new AnalysisException(String.format("%s cannot be set for Iceberg table " +
           "stored in hive.catalog", IcebergTable.ICEBERG_CATALOG_LOCATION));
     }
-    if (isExternalWithNoPurge()) {
-      String tableId = getTblProperties().get(IcebergTable.ICEBERG_TABLE_IDENTIFIER);
-      if (tableId == null || tableId.isEmpty()) {
-        tableId = getTblProperties().get(Catalogs.NAME);
-      }
-      if (tableId == null || tableId.isEmpty()) {
-        throw new AnalysisException(String.format("Table property '%s' is necessary " +
-            "for external Iceberg tables stored in hive.catalog. " +
-            "For creating a completely new Iceberg table, use 'CREATE TABLE' " +
-            "(no EXTERNAL keyword).",
-            IcebergTable.ICEBERG_TABLE_IDENTIFIER));
-      }
-    }
   }
 
   private void validateTableInHadoopCatalog() throws AnalysisException {
diff --git a/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java b/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java
index 4c83acf02..2a9f7ade8 100644
--- a/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/IcebergTable.java
@@ -29,6 +29,10 @@ import org.apache.hadoop.hive.common.StatsSetupConst;
 import org.apache.hadoop.hive.metastore.IMetaStoreClient;
 import org.apache.hadoop.hive.metastore.TableType;
 import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
+import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.catalog.TableIdentifier;
+import org.apache.iceberg.mr.Catalogs;
+import org.apache.iceberg.mr.InputFormatConfig;
 import org.apache.impala.analysis.IcebergPartitionField;
 import org.apache.impala.analysis.IcebergPartitionSpec;
 import org.apache.impala.analysis.IcebergPartitionTransform;
@@ -339,6 +343,7 @@ public class IcebergTable extends Table implements FeIcebergTable {
       throws TableLoadingException {
     final Timer.Context context =
         getMetrics().getTimer(Table.LOAD_DURATION_METRIC).time();
+    verifyTable(msTbl);
     try {
       // Copy the table to check later if anything has changed.
       msTable_ = msTbl.deepCopy();
@@ -394,6 +399,30 @@ public class IcebergTable extends Table implements FeIcebergTable {
     }
   }
 
+  /**
+   * @throws TableLoadingException when it is unsafe to load the table.
+   */
+  private void verifyTable(org.apache.hadoop.hive.metastore.api.Table msTbl)
+      throws TableLoadingException {
+    if (IcebergUtil.isHiveCatalog(msTbl.getParameters())) {
+      String tableId = IcebergUtil.getIcebergTableIdentifier(
+          msTbl.getDbName(), msTbl.getTableName()).toString();
+      Map<String, String> params = msTbl.getParameters();
+      if (!tableId.equalsIgnoreCase(
+              params.getOrDefault(IcebergTable.ICEBERG_TABLE_IDENTIFIER, tableId)) ||
+          !tableId.equalsIgnoreCase(
+              params.getOrDefault(Catalogs.NAME, tableId)) ||
+          !tableId.equalsIgnoreCase(
+              params.getOrDefault(InputFormatConfig.TABLE_IDENTIFIER, tableId))) {
+        throw new TableLoadingException(String.format(
+            "Table %s cannot be loaded because it is an " +
+            "EXTERNAL table in the HiveCatalog that points to another table. " +
+            "Query the original table instead.",
+            getFullName()));
+      }
+    }
+  }
+
   /**
    * Load schema and partitioning schemes directly from Iceberg.
    */
diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-catalogs.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-catalogs.test
index c9430a306..843c12754 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-catalogs.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-catalogs.test
@@ -122,41 +122,3 @@ row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/iceberg_hive_catalogs/data/labe
 ---- TYPES
 STRING, STRING, STRING, STRING
 ====
----- QUERY
-CREATE EXTERNAL TABLE iceberg_hive_catalogs_ext
-STORED AS ICEBERG
-TBLPROPERTIES('iceberg.catalog'='ice_hive_cat',
-'iceberg.table_identifier'='$DATABASE.iceberg_hive_catalogs');
----- RESULTS
-'Table has been created.'
-====
----- QUERY
-DESCRIBE FORMATTED iceberg_hive_catalogs_ext;
----- RESULTS: VERIFY_IS_SUBSET
-'Location:           ','$NAMENODE/test-warehouse/$DATABASE.db/iceberg_hive_catalogs','NULL'
-'','write.format.default','parquet             '
-'','iceberg.table_identifier','$DATABASE.iceberg_hive_catalogs'
-'','name                ','$DATABASE.iceberg_hive_catalogs'
----- TYPES
-string, string, string
-====
----- QUERY
-SELECT * FROM iceberg_hive_catalogs_ext;
----- RESULTS
-'ice',3.14
----- TYPES
-STRING,DECIMAL
-====
----- QUERY
-DROP TABLE iceberg_hive_catalogs_ext;
----- RESULTS
-'Table has been dropped.'
-====
----- QUERY
-REFRESH iceberg_hive_catalogs;
-SELECT * FROM iceberg_hive_catalogs;
----- RESULTS
-'ice',3.14
----- TYPES
-STRING,DECIMAL
-====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-insert.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-insert.test
index 5d72d2cdb..0773f0b63 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-insert.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-insert.test
@@ -167,84 +167,12 @@ select * from iceberg_hive_cat;
 INT
 ====
 ---- QUERY
-# Query external Iceberg table
-create external table iceberg_hive_cat_ext (i int)
-stored as iceberg
-location '$WAREHOUSE_LOCATION_PREFIX/test-warehouse/$DATABASE.db/iceberg_hive_cat'
-tblproperties('iceberg.catalog'='hive.catalog',
-    'iceberg.table_identifier'='$DATABASE.iceberg_hive_cat');
----- RESULTS
-'Table has been created.'
-====
----- QUERY
-select * from iceberg_hive_cat_ext;
----- RESULTS
-7
----- TYPES
-INT
-====
----- QUERY
-# INSET INTO external Iceberg table stored in HiveCatalog.
-insert into iceberg_hive_cat_ext values (8);
-select * from iceberg_hive_cat_ext;
----- RESULTS
-7
-8
----- TYPES
-INT
-====
----- QUERY
-# Query original table
-refresh iceberg_hive_cat;
-select * from iceberg_hive_cat;
----- RESULTS
-7
-8
----- TYPES
-INT
-====
----- QUERY
-# DROP external Iceberg table
-drop table iceberg_hive_cat_ext
----- RESULTS
-'Table has been dropped.'
-====
----- QUERY
-# Original table is not affected after external table drop.
-refresh iceberg_hive_cat;
-select * from iceberg_hive_cat;
----- RESULTS
-7
-8
----- TYPES
-INT
-====
----- QUERY
-# Create another external Iceberg table
-create external table iceberg_hive_cat_ext_2 (i int)
-stored as iceberg
-location '$WAREHOUSE_LOCATION_PREFIX/test-warehouse/$DATABASE.db/iceberg_hive_cat'
-tblproperties('iceberg.catalog'='hive.catalog',
-    'iceberg.table_identifier'='$DATABASE.iceberg_hive_cat');
-select * from iceberg_hive_cat_ext_2
----- RESULTS
-7
-8
-====
----- QUERY
 # DROP the synchronized Iceberg table (data is purged).
 drop table iceberg_hive_cat
 ---- RESULTS
 'Table has been dropped.'
 ====
 ---- QUERY
-# The data has been purged, so querying the external table fails.
-refresh iceberg_hive_cat_ext_2;
-select * from iceberg_hive_cat_ext_2
----- CATCH
-Table does not exist
-====
----- QUERY
 # Insert into hive catalog with custom location.
 create table iceberg_hive_cat_custom_loc (i int)
 stored as iceberg
diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-negative.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-negative.test
index a3b55ee67..99ac5d090 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-negative.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-negative.test
@@ -93,7 +93,55 @@ CREATE EXTERNAL TABLE iceberg_hive_tbls_external(
 )
 STORED AS ICEBERG;
 ---- CATCH
-Table property 'iceberg.table_identifier' is necessary for external Iceberg tables stored in hive.catalog. For creating a completely new Iceberg table, use 'CREATE TABLE' (no EXTERNAL keyword).
+Cannot create EXTERNAL Iceberg table in the Hive Catalog.
+====
+---- QUERY
+CREATE EXTERNAL TABLE iceberg_hive_tbls_external(
+  level STRING
+)
+STORED AS ICEBERG
+TBLPROPERTIES ('iceberg.table_identifier'='functional_parquet.iceberg_partitioned');
+---- CATCH
+Cannot create EXTERNAL Iceberg table in the Hive Catalog.
+====
+---- QUERY
+CREATE EXTERNAL TABLE iceberg_hive_tbls_external(
+  level STRING
+)
+STORED AS ICEBERG
+TBLPROPERTIES ('iceberg.catalog'='hive.catalog',
+'iceberg.table_identifier'='functional_parquet.iceberg_partitioned');
+---- CATCH
+Cannot create EXTERNAL Iceberg table in the Hive Catalog.
+====
+---- QUERY
+CREATE TABLE iceberg_hive_alias(
+  level STRING
+)
+STORED AS ICEBERG
+TBLPROPERTIES ('iceberg.table_identifier'='functional_parquet.iceberg_mixed_file_format');
+---- CATCH
+AlreadyExistsException: Table already exists: functional_parquet.iceberg_mixed_file_format
+====
+---- QUERY
+CREATE EXTERNAL TABLE iceberg_hive_tbls_external(
+  level STRING
+)
+STORED AS ICEBERG
+TBLPROPERTIES ('iceberg.catalog'='hive.catalog',
+'iceberg.table_identifier'='functional_parquet.iceberg_partitioned');
+---- CATCH
+Cannot create EXTERNAL Iceberg table in the Hive Catalog.
+====
+---- QUERY
+CREATE EXTERNAL TABLE iceberg_hive_tbls_external(
+  level STRING
+)
+STORED AS ICEBERG
+TBLPROPERTIES ('iceberg.catalog'='ice_hive_catalog',
+'iceberg.table_identifier'='functional_parquet.iceberg_partitioned');
+---- CATCH
+Cannot create EXTERNAL Iceberg table in the Hive Catalog.
 ====
 ---- QUERY
 CREATE EXTERNAL TABLE fake_iceberg_table_hadoop_catalog
@@ -249,6 +297,16 @@ ALTER TABLE iceberg_table_hadoop_catalog set TBLPROPERTIES('iceberg.table_identi
 AnalysisException: Changing the 'iceberg.table_identifier' table property is not supported for Iceberg table.
 ====
 ---- QUERY
+ALTER TABLE iceberg_table_hadoop_catalog set TBLPROPERTIES('name'='fake_db.fake_table');
+---- CATCH
+AnalysisException: Changing the 'name' table property is not supported for Iceberg table.
+====
+---- QUERY
+ALTER TABLE iceberg_table_hadoop_catalog set TBLPROPERTIES('iceberg.mr.table.identifier'='fake_db.fake_table');
+---- CATCH
+AnalysisException: Changing the 'iceberg.mr.table.identifier' table property is not supported for Iceberg table.
+====
+---- QUERY
 ALTER TABLE iceberg_table_hadoop_catalog unset TBLPROPERTIES('iceberg.table_identifier');
 ---- CATCH
 AnalysisException: Unsetting the 'iceberg.table_identifier' table property is not supported for Iceberg table.
diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py
index 43f7974ba..02cf985d9 100644
--- a/tests/query_test/test_iceberg.py
+++ b/tests/query_test/test_iceberg.py
@@ -1086,6 +1086,33 @@ class TestIcebergTable(IcebergTestSuite):
 
     assert parquet_column_name_type_list == iceberg_column_name_type_list
 
+  @SkipIfFS.hive
+  def test_hive_external_forbidden(self, vector, unique_database):
+    tbl_name = unique_database + ".hive_ext"
+    error_msg = ("cannot be loaded because it is an EXTERNAL table in the HiveCatalog "
+        "that points to another table. Query the original table instead.")
+    self.execute_query("create table {0} (i int) stored by iceberg".
+        format(tbl_name))
+    # 'iceberg.table_identifier' can refer to another table
+    self.run_stmt_in_hive("""alter table {0} set tblproperties
+        ('external.table.purge'='false',
+         'iceberg.table_identifier'='functional_iceberg.iceberg_partitioned')""".
+         format(tbl_name))
+    ex = self.execute_query_expect_failure(self.client, "refresh {0}".format(tbl_name))
+    assert error_msg in str(ex)
+    # 'iceberg.mr.table.identifier' can refer to another table
+    self.run_stmt_in_hive("""
+        alter table {0} unset tblproperties('iceberg.table_identifier')""".
+        format(tbl_name))
+    self.run_stmt_in_hive("""alter table {0} set tblproperties
+        ('iceberg.mr.table.identifier'='functional_iceberg.iceberg_partitioned')""".
+        format(tbl_name))
+    ex = self.execute_query_expect_failure(self.client, "refresh {0}".format(tbl_name))
+    assert error_msg in str(ex)
+    # 'name' can also refer to another table, but it cannot be set by Hive/Impala.
+    # Also, during table migration both Impala and Hive clear existing table
+    # properties. See IMPALA-12410.
+
   @SkipIfFS.incorrent_reported_ec
   def test_compute_stats(self, vector, unique_database):
     self.run_test_case('QueryTest/iceberg-compute-stats', vector, unique_database)