You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by cs...@apache.org on 2023/12/18 16:05:04 UTC

(impala) branch master updated (4114fe8db -> 0aa1f3a5f)

This is an automated email from the ASF dual-hosted git repository.

csringhofer pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


    from 4114fe8db IMPALA-12632: Use Atomics for CpuUsageRatio counters
     new bd0ba644e IMPALA-12584: Add backend config to restrict data file locations for Iceberg tables
     new 0aa1f3a5f IMPALA-12628: iceberg negative test fail in non-HDFS environments

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 be/src/common/global-flags.cc                      |  3 ++
 be/src/util/backend-gflag-util.cc                  |  3 ++
 common/thrift/BackendGflags.thrift                 |  2 +
 .../org/apache/impala/catalog/FeIcebergTable.java  |  4 ++
 .../org/apache/impala/service/BackendConfig.java   |  4 ++
 .../queries/QueryTest/iceberg-negative.test        |  2 +
 tests/custom_cluster/test_iceberg_strict_data.py   | 52 ++++++++++++++++++++++
 7 files changed, 70 insertions(+)
 create mode 100644 tests/custom_cluster/test_iceberg_strict_data.py


(impala) 02/02: IMPALA-12628: iceberg negative test fail in non-HDFS environments

Posted by cs...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

csringhofer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 0aa1f3a5feb052f7c1bccb771f40fdbc28b8ed05
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Mon Dec 18 16:47:42 2023 +0100

    IMPALA-12628: iceberg negative test fail in non-HDFS environments
    
    There's a HIVE_QUERY section in iceberg-negative.test. Such sections
    use Hive to execute statements. In non-HDFS environments like S3,
    Ozone, etc. Hive is not configured properly, so we cannot use
    it.
    
    This patch adds the '---- IS_HDFS_ONLY' section to the relevant
    test blocks (The HIVE_QUERY that creates the table, and the
    Impala test that checks an error message).
    
    Change-Id: I7987724f11d0c88d09577f52294ecc10d8ce39ce
    Reviewed-on: http://gerrit.cloudera.org:8080/20812
    Reviewed-by: Csaba Ringhofer <cs...@cloudera.com>
    Tested-by: Zoltan Borok-Nagy <bo...@cloudera.com>
---
 .../workloads/functional-query/queries/QueryTest/iceberg-negative.test  | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-negative.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-negative.test
index cd22079b2..c7ecc8785 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/iceberg-negative.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-negative.test
@@ -840,11 +840,13 @@ update functional_parquet.iceberg_int_partitioned set file__position = 42;
 ---- CATCH
 AnalysisException: Left-hand side in assignment expression 'file__position=42' must be a column reference
 ====
+---- IS_HDFS_ONLY
 ---- HIVE_QUERY
 CREATE TABLE $DATABASE.ice_v2_timestamptz (i int, ts TIMESTAMP WITH LOCAL TIME ZONE)
 STORED BY ICEBERG
 TBLPROPERTIES ('format-version'='2');
 ====
+---- IS_HDFS_ONLY
 ---- QUERY
 invalidate metadata $DATABASE.ice_v2_timestamptz;
 update ice_v2_timestamptz set i = 3;


(impala) 01/02: IMPALA-12584: Add backend config to restrict data file locations for Iceberg tables

Posted by cs...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

csringhofer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit bd0ba644e6b8953028f9061e8f457a4ad33154aa
Author: Peter Rozsa <pr...@cloudera.com>
AuthorDate: Tue Dec 12 08:56:35 2023 +0100

    IMPALA-12584: Add backend config to restrict data file locations for Iceberg tables
    
    This change adds backend flag 'iceberg_restrict_data_file_location',
    when the flag is set to 'true', Impala will raise an error when at least
    one data file of an Iceberg table is outside of the table directory.
    The default value of the flag is 'false'.
    
    Tests:
     - custom-cluster test added to validate both states of the flag
    
    Change-Id: I60e3d93b5039dc977417e7b097b3d6ddeda52de4
    Reviewed-on: http://gerrit.cloudera.org:8080/20786
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/common/global-flags.cc                      |  3 ++
 be/src/util/backend-gflag-util.cc                  |  3 ++
 common/thrift/BackendGflags.thrift                 |  2 +
 .../org/apache/impala/catalog/FeIcebergTable.java  |  4 ++
 .../org/apache/impala/service/BackendConfig.java   |  4 ++
 tests/custom_cluster/test_iceberg_strict_data.py   | 52 ++++++++++++++++++++++
 6 files changed, 68 insertions(+)

diff --git a/be/src/common/global-flags.cc b/be/src/common/global-flags.cc
index 906a99ab3..8f4d07dd9 100644
--- a/be/src/common/global-flags.cc
+++ b/be/src/common/global-flags.cc
@@ -421,6 +421,9 @@ DEFINE_int32(iceberg_reload_new_files_threshold, 100, "(Advanced) If during a ta
     "reload all file metadata. If number of new files are less or equal to this, "
     "catalogd will only load the metadata of the newly added files.");
 
+DEFINE_bool(iceberg_restrict_data_file_location, false, "If true, Impala does not allow "
+    "Iceberg data file locations outside of the table directory during reads");
+
 // Host and port of Statestore Service
 DEFINE_string(state_store_host, "localhost",
     "hostname where StatestoreService is running");
diff --git a/be/src/util/backend-gflag-util.cc b/be/src/util/backend-gflag-util.cc
index adaa47ac5..8c349aac3 100644
--- a/be/src/util/backend-gflag-util.cc
+++ b/be/src/util/backend-gflag-util.cc
@@ -111,6 +111,7 @@ DECLARE_string(java_weigher);
 DECLARE_int32(iceberg_reload_new_files_threshold);
 DECLARE_bool(enable_skipping_older_events);
 DECLARE_bool(enable_json_scanner);
+DECLARE_bool(iceberg_restrict_data_file_location);
 DECLARE_int32(catalog_operation_log_size);
 DECLARE_string(hostname);
 DECLARE_bool(allow_catalog_cache_op_from_masked_users);
@@ -441,6 +442,8 @@ Status PopulateThriftBackendGflags(TBackendGflags& cfg) {
   cfg.__set_iceberg_reload_new_files_threshold(FLAGS_iceberg_reload_new_files_threshold);
   cfg.__set_enable_skipping_older_events(FLAGS_enable_skipping_older_events);
   cfg.__set_enable_json_scanner(FLAGS_enable_json_scanner);
+  cfg.__set_iceberg_restrict_data_file_location(
+    FLAGS_iceberg_restrict_data_file_location);
   cfg.__set_max_filter_error_rate_from_full_scan(
       FLAGS_max_filter_error_rate_from_full_scan);
   cfg.__set_catalog_operation_log_size(FLAGS_catalog_operation_log_size);
diff --git a/common/thrift/BackendGflags.thrift b/common/thrift/BackendGflags.thrift
index 0f7818267..fb1ef7fc8 100644
--- a/common/thrift/BackendGflags.thrift
+++ b/common/thrift/BackendGflags.thrift
@@ -274,4 +274,6 @@ struct TBackendGflags {
   121: required string hostname
 
   122: required bool allow_catalog_cache_op_from_masked_users
+
+  123: required bool iceberg_restrict_data_file_location
 }
diff --git a/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java b/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
index cd2fd0af5..553428617 100644
--- a/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/FeIcebergTable.java
@@ -71,6 +71,7 @@ import org.apache.impala.common.Pair;
 import org.apache.impala.common.PrintUtils;
 import org.apache.impala.common.Reference;
 import org.apache.impala.fb.FbFileBlock;
+import org.apache.impala.service.BackendConfig;
 import org.apache.impala.thrift.TColumn;
 import org.apache.impala.thrift.TCompressionCodec;
 import org.apache.impala.thrift.THdfsCompression;
@@ -1028,6 +1029,9 @@ public interface FeIcebergTable extends FeFsTable {
       Table icebergApiTable = icebergTable.getIcebergApiTable();
       Preconditions.checkNotNull(icebergApiTable);
       Map<String, String> properties = icebergApiTable.properties();
+      if (BackendConfig.INSTANCE.icebergRestrictDataFileLocation()) {
+        return true;
+      }
       return !(PropertyUtil.propertyAsBoolean(properties,
           TableProperties.OBJECT_STORE_ENABLED,
           TableProperties.OBJECT_STORE_ENABLED_DEFAULT)
diff --git a/fe/src/main/java/org/apache/impala/service/BackendConfig.java b/fe/src/main/java/org/apache/impala/service/BackendConfig.java
index fb776dcaf..570669216 100644
--- a/fe/src/main/java/org/apache/impala/service/BackendConfig.java
+++ b/fe/src/main/java/org/apache/impala/service/BackendConfig.java
@@ -429,6 +429,10 @@ public class BackendConfig {
     return backendCfg_.iceberg_reload_new_files_threshold;
   }
 
+  public boolean icebergRestrictDataFileLocation() {
+    return backendCfg_.iceberg_restrict_data_file_location;
+  }
+
   public boolean isJsonScannerEnabled() {
     return backendCfg_.enable_json_scanner;
   }
diff --git a/tests/custom_cluster/test_iceberg_strict_data.py b/tests/custom_cluster/test_iceberg_strict_data.py
new file mode 100644
index 000000000..52f8f5a76
--- /dev/null
+++ b/tests/custom_cluster/test_iceberg_strict_data.py
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import absolute_import, division, print_function
+import pytest
+
+from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
+
+SELECT_STATEMENT = "SELECT COUNT(1) FROM " \
+  "functional_parquet.iceberg_multiple_storage_locations"
+EXCEPTION = "IcebergTableLoadingException: " \
+  "Error loading metadata for Iceberg table"
+
+
+class TestIcebergStrictDataFileLocation(CustomClusterTestSuite):
+  """Tests for checking the behaviour of startup flag
+   'iceberg_restrict_data_file_location'."""
+  @classmethod
+  def get_workload(self):
+    return 'functional-query'
+
+  @CustomClusterTestSuite.with_args(
+      catalogd_args='--iceberg_restrict_data_file_location=true')
+  @pytest.mark.execute_serially
+  def test_restricted_location(self, vector):
+    """If the flag is enabled, tables with multiple storage locations will fail
+    to load their datafiles."""
+    result = self.execute_query_expect_failure(self.client, SELECT_STATEMENT)
+    assert EXCEPTION in str(result)
+
+  @CustomClusterTestSuite.with_args(
+      catalogd_args='--iceberg_restrict_data_file_location=false')
+  @pytest.mark.execute_serially
+  def test_disabled(self, vector):
+    """If the flag is disabled, and tables with multiple storage locations
+    are configured properly, the tables load successfully."""
+    result = self.execute_query_expect_success(self.client, SELECT_STATEMENT)
+    assert '9' in result.data