You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by bo...@apache.org on 2024/01/20 16:56:16 UTC

(impala) 02/02: Addendum: IMPALA-12584: Enable strict data file access by default

This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 7e5bb385e1d391c790c6645e5abc3d38d37f39f0
Author: Peter Rozsa <pr...@cloudera.com>
AuthorDate: Tue Jan 9 16:37:22 2024 +0100

    Addendum: IMPALA-12584: Enable strict data file access by default
    
    This change sets the default value to 'true' for
    'iceberg_restrict_data_file_location' and changes the flag name to
    'iceberg_allow_datafiles_in_table_location_only'. Tests related to
    multiple storage locations in Iceberg tables are moved out to custom
    cluster tests. During test data loading, the flag is set to 'false'
    to make the creation of 'iceberg_multiple_storage_locations' table
    possible.
    
    Change-Id: Ifec84c86132a8a44d7e161006dcf51be2e7c7e57
    Reviewed-on: http://gerrit.cloudera.org:8080/20874
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/common/global-flags.cc                      |  5 +++--
 be/src/util/backend-gflag-util.cc                  |  6 +++---
 common/thrift/BackendGflags.thrift                 |  2 +-
 .../org/apache/impala/catalog/FeIcebergTable.java  |  2 +-
 .../org/apache/impala/service/BackendConfig.java   |  8 ++++++--
 .../impala/catalog/FileMetadataLoaderTest.java     |  2 ++
 testdata/bin/create-load-data.sh                   |  3 +++
 tests/custom_cluster/test_iceberg_strict_data.py   | 24 ++++++++++++----------
 tests/query_test/test_iceberg.py                   |  4 ----
 9 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/be/src/common/global-flags.cc b/be/src/common/global-flags.cc
index 8f4d07dd9..20fdab4a3 100644
--- a/be/src/common/global-flags.cc
+++ b/be/src/common/global-flags.cc
@@ -421,8 +421,9 @@ DEFINE_int32(iceberg_reload_new_files_threshold, 100, "(Advanced) If during a ta
     "reload all file metadata. If number of new files are less or equal to this, "
     "catalogd will only load the metadata of the newly added files.");
 
-DEFINE_bool(iceberg_restrict_data_file_location, false, "If true, Impala does not allow "
-    "Iceberg data file locations outside of the table directory during reads");
+DEFINE_bool(iceberg_allow_datafiles_in_table_location_only, true, "If true, Impala "
+    "does not allow Iceberg data file locations outside of the table directory during "
+    "reads");
 
 // Host and port of Statestore Service
 DEFINE_string(state_store_host, "localhost",
diff --git a/be/src/util/backend-gflag-util.cc b/be/src/util/backend-gflag-util.cc
index 64f2fe0be..40414c871 100644
--- a/be/src/util/backend-gflag-util.cc
+++ b/be/src/util/backend-gflag-util.cc
@@ -111,7 +111,7 @@ DECLARE_string(java_weigher);
 DECLARE_int32(iceberg_reload_new_files_threshold);
 DECLARE_bool(enable_skipping_older_events);
 DECLARE_bool(enable_json_scanner);
-DECLARE_bool(iceberg_restrict_data_file_location);
+DECLARE_bool(iceberg_allow_datafiles_in_table_location_only);
 DECLARE_int32(catalog_operation_log_size);
 DECLARE_string(hostname);
 DECLARE_bool(allow_catalog_cache_op_from_masked_users);
@@ -443,8 +443,8 @@ Status PopulateThriftBackendGflags(TBackendGflags& cfg) {
   cfg.__set_iceberg_reload_new_files_threshold(FLAGS_iceberg_reload_new_files_threshold);
   cfg.__set_enable_skipping_older_events(FLAGS_enable_skipping_older_events);
   cfg.__set_enable_json_scanner(FLAGS_enable_json_scanner);
-  cfg.__set_iceberg_restrict_data_file_location(
-    FLAGS_iceberg_restrict_data_file_location);
+  cfg.__set_iceberg_allow_datafiles_in_table_location_only(
+      FLAGS_iceberg_allow_datafiles_in_table_location_only);
   cfg.__set_max_filter_error_rate_from_full_scan(
       FLAGS_max_filter_error_rate_from_full_scan);
   cfg.__set_catalog_operation_log_size(FLAGS_catalog_operation_log_size);
diff --git a/common/thrift/BackendGflags.thrift b/common/thrift/BackendGflags.thrift
index fb1ef7fc8..35aecb63d 100644
--- a/common/thrift/BackendGflags.thrift
+++ b/common/thrift/BackendGflags.thrift
@@ -275,5 +275,5 @@ struct TBackendGflags {
 
   122: required bool allow_catalog_cache_op_from_masked_users
 
-  123: required bool iceberg_restrict_data_file_location
+  123: required bool iceberg_allow_datafiles_in_table_location_only
 }
diff --git a/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java b/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
index 16c68a382..d04cf2530 100644
--- a/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
@@ -1027,7 +1027,7 @@ public interface FeIcebergTable extends FeFsTable {
       Table icebergApiTable = icebergTable.getIcebergApiTable();
       Preconditions.checkNotNull(icebergApiTable);
       Map<String, String> properties = icebergApiTable.properties();
-      if (BackendConfig.INSTANCE.icebergRestrictDataFileLocation()) {
+      if (BackendConfig.INSTANCE.icebergAllowDatafileInTableLocationOnly()) {
         return true;
       }
       return !(PropertyUtil.propertyAsBoolean(properties,
diff --git a/fe/src/main/java/org/apache/impala/service/BackendConfig.java b/fe/src/main/java/org/apache/impala/service/BackendConfig.java
index 570669216..c7b4a0bef 100644
--- a/fe/src/main/java/org/apache/impala/service/BackendConfig.java
+++ b/fe/src/main/java/org/apache/impala/service/BackendConfig.java
@@ -429,8 +429,12 @@ public class BackendConfig {
     return backendCfg_.iceberg_reload_new_files_threshold;
   }
 
-  public boolean icebergRestrictDataFileLocation() {
-    return backendCfg_.iceberg_restrict_data_file_location;
+  public boolean icebergAllowDatafileInTableLocationOnly() {
+    return backendCfg_.iceberg_allow_datafiles_in_table_location_only;
+  }
+
+  public void setIcebergAllowDatafileInTableLocationOnly(boolean flag) {
+    backendCfg_.iceberg_allow_datafiles_in_table_location_only = flag;
   }
 
   public boolean isJsonScannerEnabled() {
diff --git a/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java b/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java
index d84e04ff2..a0f5129e2 100644
--- a/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java
+++ b/fe/src/test/java/org/apache/impala/catalog/FileMetadataLoaderTest.java
@@ -34,6 +34,7 @@ import org.apache.hadoop.hive.metastore.api.MetaException;
 import org.apache.impala.catalog.HdfsPartition.FileDescriptor;
 import org.apache.impala.catalog.iceberg.GroupedContentFiles;
 import org.apache.impala.compat.MetastoreShim;
+import org.apache.impala.service.BackendConfig;
 import org.apache.impala.testutil.CatalogServiceTestCatalog;
 import org.apache.impala.thrift.TNetworkAddress;
 import org.apache.impala.util.IcebergUtil;
@@ -291,6 +292,7 @@ public class FileMetadataLoaderTest {
   @Test
   public void testIcebergMultipleStorageLocations() throws IOException, CatalogException {
     CatalogServiceCatalog catalog = CatalogServiceTestCatalog.create();
+    BackendConfig.INSTANCE.setIcebergAllowDatafileInTableLocationOnly(false);
     IcebergFileMetadataLoader fml1 = getLoaderForIcebergTable(catalog,
         "functional_parquet", "iceberg_multiple_storage_locations",
         /* oldFds = */ Collections.emptyList(),
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index dfb8e8d58..3571af516 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -165,6 +165,9 @@ function start-impala {
   : ${START_CLUSTER_ARGS=""}
   # Use a fast statestore update so that DDL operations run faster.
   START_CLUSTER_ARGS_INT="--state_store_args=--statestore_update_frequency_ms=50"
+  # Disable strict datafile location checks for Iceberg tables
+  DATAFILE_LOCATION_CHECK="-iceberg_allow_datafiles_in_table_location_only=false"
+  START_CLUSTER_ARGS_INT+=("--catalogd_args=$DATAFILE_LOCATION_CHECK")
   if [[ "${TARGET_FILESYSTEM}" == "local" ]]; then
     START_CLUSTER_ARGS_INT+=("--impalad_args=--abort_on_config_error=false -s 1")
   else
diff --git a/tests/custom_cluster/test_iceberg_strict_data.py b/tests/custom_cluster/test_iceberg_strict_data.py
index 52f8f5a76..2426dd602 100644
--- a/tests/custom_cluster/test_iceberg_strict_data.py
+++ b/tests/custom_cluster/test_iceberg_strict_data.py
@@ -20,33 +20,35 @@ import pytest
 
 from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
 
-SELECT_STATEMENT = "SELECT COUNT(1) FROM " \
-  "functional_parquet.iceberg_multiple_storage_locations"
-EXCEPTION = "IcebergTableLoadingException: " \
-  "Error loading metadata for Iceberg table"
-
 
 class TestIcebergStrictDataFileLocation(CustomClusterTestSuite):
   """Tests for checking the behaviour of startup flag
-   'iceberg_restrict_data_file_location'."""
+   'iceberg_allow_datafiles_in_table_location_only'."""
+
+  SELECT_STATEMENT = "SELECT COUNT(1) FROM " \
+    "functional_parquet.iceberg_multiple_storage_locations"
+  EXCEPTION = "IcebergTableLoadingException: " \
+    "Error loading metadata for Iceberg table"
+
   @classmethod
   def get_workload(self):
     return 'functional-query'
 
   @CustomClusterTestSuite.with_args(
-      catalogd_args='--iceberg_restrict_data_file_location=true')
+      catalogd_args='--iceberg_allow_datafiles_in_table_location_only=true')
   @pytest.mark.execute_serially
   def test_restricted_location(self, vector):
     """If the flag is enabled, tables with multiple storage locations will fail
     to load their datafiles."""
-    result = self.execute_query_expect_failure(self.client, SELECT_STATEMENT)
-    assert EXCEPTION in str(result)
+    result = self.execute_query_expect_failure(self.client, self.SELECT_STATEMENT)
+    assert self.EXCEPTION in str(result)
 
   @CustomClusterTestSuite.with_args(
-      catalogd_args='--iceberg_restrict_data_file_location=false')
+      catalogd_args='--iceberg_allow_datafiles_in_table_location_only=false')
   @pytest.mark.execute_serially
   def test_disabled(self, vector):
     """If the flag is disabled, and tables with multiple storage locations
     are configured properly, the tables load successfully."""
-    result = self.execute_query_expect_success(self.client, SELECT_STATEMENT)
+    result = self.execute_query_expect_success(self.client, self.SELECT_STATEMENT)
     assert '9' in result.data
+    self.run_test_case('QueryTest/iceberg-multiple-storage-locations-table', vector)
diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py
index 814f328c6..769850b17 100644
--- a/tests/query_test/test_iceberg.py
+++ b/tests/query_test/test_iceberg.py
@@ -1092,10 +1092,6 @@ class TestIcebergTable(IcebergTestSuite):
     args = ['-q', 'DROP TABLE {0}.{1}'.format(db_name, tbl_name)]
     results = run_impala_shell_cmd(vector, args)
 
-  def test_multiple_storage_locations(self, vector, unique_database):
-    self.run_test_case('QueryTest/iceberg-multiple-storage-locations-table',
-                       vector, unique_database)
-
   def test_mixed_file_format(self, vector, unique_database):
     self.run_test_case('QueryTest/iceberg-mixed-file-format', vector,
                       unique_database)