You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by cs...@apache.org on 2023/12/18 16:05:05 UTC

(impala) 01/02: IMPALA-12584: Add backend config to restrict data file locations for Iceberg tables

This is an automated email from the ASF dual-hosted git repository.

csringhofer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit bd0ba644e6b8953028f9061e8f457a4ad33154aa
Author: Peter Rozsa <pr...@cloudera.com>
AuthorDate: Tue Dec 12 08:56:35 2023 +0100

    IMPALA-12584: Add backend config to restrict data file locations for Iceberg tables
    
    This change adds backend flag 'iceberg_restrict_data_file_location',
    when the flag is set to 'true', Impala will raise an error when at least
    one data file of an Iceberg table is outside of the table directory.
    The default value of the flag is 'false'.
    
    Tests:
     - custom-cluster test added to validate both states of the flag
    
    Change-Id: I60e3d93b5039dc977417e7b097b3d6ddeda52de4
    Reviewed-on: http://gerrit.cloudera.org:8080/20786
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/common/global-flags.cc                      |  3 ++
 be/src/util/backend-gflag-util.cc                  |  3 ++
 common/thrift/BackendGflags.thrift                 |  2 +
 .../org/apache/impala/catalog/FeIcebergTable.java  |  4 ++
 .../org/apache/impala/service/BackendConfig.java   |  4 ++
 tests/custom_cluster/test_iceberg_strict_data.py   | 52 ++++++++++++++++++++++
 6 files changed, 68 insertions(+)

diff --git a/be/src/common/global-flags.cc b/be/src/common/global-flags.cc
index 906a99ab3..8f4d07dd9 100644
--- a/be/src/common/global-flags.cc
+++ b/be/src/common/global-flags.cc
@@ -421,6 +421,9 @@ DEFINE_int32(iceberg_reload_new_files_threshold, 100, "(Advanced) If during a ta
     "reload all file metadata. If number of new files are less or equal to this, "
     "catalogd will only load the metadata of the newly added files.");
 
+DEFINE_bool(iceberg_restrict_data_file_location, false, "If true, Impala does not allow "
+    "Iceberg data file locations outside of the table directory during reads");
+
 // Host and port of Statestore Service
 DEFINE_string(state_store_host, "localhost",
     "hostname where StatestoreService is running");
diff --git a/be/src/util/backend-gflag-util.cc b/be/src/util/backend-gflag-util.cc
index adaa47ac5..8c349aac3 100644
--- a/be/src/util/backend-gflag-util.cc
+++ b/be/src/util/backend-gflag-util.cc
@@ -111,6 +111,7 @@ DECLARE_string(java_weigher);
 DECLARE_int32(iceberg_reload_new_files_threshold);
 DECLARE_bool(enable_skipping_older_events);
 DECLARE_bool(enable_json_scanner);
+DECLARE_bool(iceberg_restrict_data_file_location);
 DECLARE_int32(catalog_operation_log_size);
 DECLARE_string(hostname);
 DECLARE_bool(allow_catalog_cache_op_from_masked_users);
@@ -441,6 +442,8 @@ Status PopulateThriftBackendGflags(TBackendGflags& cfg) {
   cfg.__set_iceberg_reload_new_files_threshold(FLAGS_iceberg_reload_new_files_threshold);
   cfg.__set_enable_skipping_older_events(FLAGS_enable_skipping_older_events);
   cfg.__set_enable_json_scanner(FLAGS_enable_json_scanner);
+  cfg.__set_iceberg_restrict_data_file_location(
+    FLAGS_iceberg_restrict_data_file_location);
   cfg.__set_max_filter_error_rate_from_full_scan(
       FLAGS_max_filter_error_rate_from_full_scan);
   cfg.__set_catalog_operation_log_size(FLAGS_catalog_operation_log_size);
diff --git a/common/thrift/BackendGflags.thrift b/common/thrift/BackendGflags.thrift
index 0f7818267..fb1ef7fc8 100644
--- a/common/thrift/BackendGflags.thrift
+++ b/common/thrift/BackendGflags.thrift
@@ -274,4 +274,6 @@ struct TBackendGflags {
   121: required string hostname
 
   122: required bool allow_catalog_cache_op_from_masked_users
+
+  123: required bool iceberg_restrict_data_file_location
 }
diff --git a/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java b/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
index cd2fd0af5..553428617 100644
--- a/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
@@ -71,6 +71,7 @@ import org.apache.impala.common.Pair;
 import org.apache.impala.common.PrintUtils;
 import org.apache.impala.common.Reference;
 import org.apache.impala.fb.FbFileBlock;
+import org.apache.impala.service.BackendConfig;
 import org.apache.impala.thrift.TColumn;
 import org.apache.impala.thrift.TCompressionCodec;
 import org.apache.impala.thrift.THdfsCompression;
@@ -1028,6 +1029,9 @@ public interface FeIcebergTable extends FeFsTable {
       Table icebergApiTable = icebergTable.getIcebergApiTable();
       Preconditions.checkNotNull(icebergApiTable);
       Map<String, String> properties = icebergApiTable.properties();
+      if (BackendConfig.INSTANCE.icebergRestrictDataFileLocation()) {
+        return true;
+      }
       return !(PropertyUtil.propertyAsBoolean(properties,
           TableProperties.OBJECT_STORE_ENABLED,
           TableProperties.OBJECT_STORE_ENABLED_DEFAULT)
diff --git a/fe/src/main/java/org/apache/impala/service/BackendConfig.java b/fe/src/main/java/org/apache/impala/service/BackendConfig.java
index fb776dcaf..570669216 100644
--- a/fe/src/main/java/org/apache/impala/service/BackendConfig.java
+++ b/fe/src/main/java/org/apache/impala/service/BackendConfig.java
@@ -429,6 +429,10 @@ public class BackendConfig {
     return backendCfg_.iceberg_reload_new_files_threshold;
   }
 
+  public boolean icebergRestrictDataFileLocation() {
+    return backendCfg_.iceberg_restrict_data_file_location;
+  }
+
   public boolean isJsonScannerEnabled() {
     return backendCfg_.enable_json_scanner;
   }
diff --git a/tests/custom_cluster/test_iceberg_strict_data.py b/tests/custom_cluster/test_iceberg_strict_data.py
new file mode 100644
index 000000000..52f8f5a76
--- /dev/null
+++ b/tests/custom_cluster/test_iceberg_strict_data.py
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import absolute_import, division, print_function
+import pytest
+
+from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
+
+SELECT_STATEMENT = "SELECT COUNT(1) FROM " \
+  "functional_parquet.iceberg_multiple_storage_locations"
+EXCEPTION = "IcebergTableLoadingException: " \
+  "Error loading metadata for Iceberg table"
+
+
+class TestIcebergStrictDataFileLocation(CustomClusterTestSuite):
+  """Tests for checking the behaviour of startup flag
+   'iceberg_restrict_data_file_location'."""
+  @classmethod
+  def get_workload(self):
+    return 'functional-query'
+
+  @CustomClusterTestSuite.with_args(
+      catalogd_args='--iceberg_restrict_data_file_location=true')
+  @pytest.mark.execute_serially
+  def test_restricted_location(self, vector):
+    """If the flag is enabled, tables with multiple storage locations will fail
+    to load their datafiles."""
+    result = self.execute_query_expect_failure(self.client, SELECT_STATEMENT)
+    assert EXCEPTION in str(result)
+
+  @CustomClusterTestSuite.with_args(
+      catalogd_args='--iceberg_restrict_data_file_location=false')
+  @pytest.mark.execute_serially
+  def test_disabled(self, vector):
+    """If the flag is disabled, and tables with multiple storage locations
+    are configured properly, the tables load successfully."""
+    result = self.execute_query_expect_success(self.client, SELECT_STATEMENT)
+    assert '9' in result.data