Posted to commits@impala.apache.org by bo...@apache.org on 2022/09/26 15:58:11 UTC

[impala] 02/02: IMPALA-11582: Implement table sampling for Iceberg tables

This is an automated email from the ASF dual-hosted git repository.

boroknagyz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit b91aa065377a1d154cd9eb5f5cd9ffb6da919b65
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Thu Sep 15 14:04:34 2022 +0200

    IMPALA-11582: Implement table sampling for Iceberg tables
    
    This patch adds table sampling functionality for Iceberg tables.
    From now on it is possible to execute SELECT and COMPUTE STATS
    statements with table sampling.
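    
    For illustration, queries of the following shape exercise the new
    code path (the table name and sampling percentages are examples
    only, not part of this patch):
    
      -- Scan a ~10% sample of the table's data files.
      SELECT count(*) FROM iceberg_tbl TABLESAMPLE SYSTEM(10);
    
      -- Compute statistics from a ~20% sample of the data files.
      COMPUTE STATS iceberg_tbl TABLESAMPLE SYSTEM(20);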
    
    Predicates in the WHERE clause affect the results of table sampling
    the same way they do for legacy tables: sampling is applied after
    static partition and file pruning.
    
    Sampling is repeatable via the REPEATABLE clause.
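    
    For example, a sketch combining both (identifiers are illustrative
    only): file pruning based on the predicate happens first, roughly
    50% of the remaining data files are then sampled, and re-running
    with the same seed selects the same files.
    
      SELECT * FROM iceberg_tbl TABLESAMPLE SYSTEM(50) REPEATABLE(1234)
      WHERE action = 'click';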
    
    Testing
     * planner tests
     * e2e tests for V1 and V2 tables
    
    Change-Id: I5de151747c0e9d9379a4051252175fccf42efd7d
    Reviewed-on: http://gerrit.cloudera.org:8080/18989
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../org/apache/impala/analysis/SelectStmt.java     |   2 +
 .../org/apache/impala/planner/HdfsScanNode.java    |  15 ++-
 .../org/apache/impala/planner/IcebergScanNode.java |  53 ++++++++
 .../queries/PlannerTest/tablesample.test           | 150 ++++++++++++++++++++-
 .../queries/QueryTest/iceberg-tablesample-v2.test  | 105 +++++++++++++++
 .../queries/QueryTest/iceberg-tablesample.test     |  43 ++++++
 tests/query_test/test_iceberg.py                   |  10 ++
 7 files changed, 371 insertions(+), 7 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/analysis/SelectStmt.java b/fe/src/main/java/org/apache/impala/analysis/SelectStmt.java
index 2ca45c8b9..4164f21ec 100644
--- a/fe/src/main/java/org/apache/impala/analysis/SelectStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/SelectStmt.java
@@ -1320,6 +1320,7 @@ public class SelectStmt extends QueryStmt {
    *  - stmt does not have GROUP BY clause
    *  - stmt does not have HAVING clause
    *  - tableRefs contains only one BaseTableRef
+   *  - tableRef doesn't have sampling param
    *  - table is the Iceberg table
    *  - SelectList must contains 'count(*)' or 'count(constant)'
    *  - SelectList can contain other agg functions, e.g. min, sum, etc
@@ -1336,6 +1337,7 @@ public class SelectStmt extends QueryStmt {
     if (tables.size() != 1) return;
     TableRef tableRef = tables.get(0);
     if (!(tableRef instanceof BaseTableRef)) return;
+    if (tableRef.getSampleParams() != null) return;
 
     TableName tableName = tableRef.getDesc().getTableName();
     FeTable table;
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
index 8c236d0f3..1958595ad 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
@@ -1147,8 +1147,7 @@ public class HdfsScanNode extends ScanNode {
       // Pass a minimum sample size of 0 because users cannot set a minimum sample size
       // for scans directly. For compute stats, a minimum sample size can be set, and
       // the sampling percent is adjusted to reflect it.
-      sampledFiles = FeFsTable.Utils.getFilesSample(tbl_, partitions_, percentBytes, 0,
-          randomSeed);
+      sampledFiles = getFilesSample(percentBytes, 0, randomSeed);
     }
 
     long scanRangeBytesLimit = analyzer.getQueryCtx().client_request.getQuery_options()
@@ -1305,6 +1304,18 @@ public class HdfsScanNode extends ScanNode {
     return fileDescs;
   }
 
+  /**
+   * Returns a sample of file descriptors associated with this scan node.
+   * @param percentBytes must be between 0 and 100.
+   * @param minSampleBytes minimum number of bytes to read.
+   * @param randomSeed used for random number generation.
+   */
+  protected Map<SampledPartitionMetadata, List<FileDescriptor>> getFilesSample(
+      long percentBytes, long minSampleBytes, long randomSeed) {
+    return FeFsTable.Utils.getFilesSample(tbl_, partitions_, percentBytes, minSampleBytes,
+        randomSeed);
+  }
+
   /**
    * Update the estimate of maximum number of rows per scan range based on the fraction
    * of bytes of the scan range relative to the total bytes per partition or table.
diff --git a/fe/src/main/java/org/apache/impala/planner/IcebergScanNode.java b/fe/src/main/java/org/apache/impala/planner/IcebergScanNode.java
index 4532a8fe0..c023c417d 100644
--- a/fe/src/main/java/org/apache/impala/planner/IcebergScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/IcebergScanNode.java
@@ -19,7 +19,11 @@ package org.apache.impala.planner;
 
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
+import java.util.Random;
 
 import org.apache.impala.analysis.Expr;
 import org.apache.impala.analysis.MultiAggregateInfo;
@@ -35,6 +39,7 @@ import org.apache.impala.common.ImpalaRuntimeException;
 import org.apache.impala.fb.FbIcebergDataFileFormat;
 
 import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
 
 /**
  * Scan of a single iceberg table.
@@ -108,4 +113,52 @@ public class IcebergScanNode extends HdfsScanNode {
       return fileDescs_;
     }
   }
+
+  /**
+   * Returns a sample of file descriptors associated with this scan node.
+   * The algorithm is based on FeFsTable.Utils.getFilesSample()
+   */
+  @Override
+  protected Map<SampledPartitionMetadata, List<FileDescriptor>> getFilesSample(
+      long percentBytes, long minSampleBytes, long randomSeed) {
+    Preconditions.checkState(percentBytes >= 0 && percentBytes <= 100);
+    Preconditions.checkState(minSampleBytes >= 0);
+
+    // Ensure a consistent ordering of files for repeatable runs.
+    List<FileDescriptor> orderedFds = Lists.newArrayList(fileDescs_);
+    Collections.sort(orderedFds);
+
+    Preconditions.checkState(partitions_.size() == 1);
+    FeFsPartition part = partitions_.get(0);
+    SampledPartitionMetadata sampledPartitionMetadata =
+        new SampledPartitionMetadata(part.getId(), part.getFsType());
+
+    long totalBytes = 0;
+    for (FileDescriptor fd : orderedFds) {
+      totalBytes += fd.getFileLength();
+    }
+
+    int numFilesRemaining = orderedFds.size();
+    double fracPercentBytes = (double) percentBytes / 100;
+    long targetBytes = (long) Math.round(totalBytes * fracPercentBytes);
+    targetBytes = Math.max(targetBytes, minSampleBytes);
+
+    // Randomly select files until targetBytes has been reached or all files have been
+    // selected.
+    Random rnd = new Random(randomSeed);
+    long selectedBytes = 0;
+    List<FileDescriptor> sampleFiles = Lists.newArrayList();
+    while (selectedBytes < targetBytes && numFilesRemaining > 0) {
+      int selectedIdx = rnd.nextInt(numFilesRemaining);
+      FileDescriptor fd = orderedFds.get(selectedIdx);
+      sampleFiles.add(fd);
+      selectedBytes += fd.getFileLength();
+      // Avoid selecting the same file multiple times.
+      orderedFds.set(selectedIdx, orderedFds.get(numFilesRemaining - 1));
+      --numFilesRemaining;
+    }
+    Map<SampledPartitionMetadata, List<FileDescriptor>> result = new HashMap<>();
+    result.put(sampledPartitionMetadata, sampleFiles);
+    return result;
+  }
 }
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test b/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test
index 39b44bda0..2b61a833e 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/tablesample.test
@@ -169,20 +169,20 @@ PLAN-ROOT SINK
 select id from functional_parquet.alltypes tablesample system(10) repeatable(1234)
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=20.00MB mem-reservation=4.02MB thread-reservation=2
+|  Per-Host Resources: mem-estimate=20.00MB mem-reservation=4.01MB thread-reservation=2
 PLAN-ROOT SINK
 |  output exprs: id
 |  mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB thread-reservation=0
 |
 00:SCAN HDFS [functional_parquet.alltypes]
-   HDFS partitions=3/24 files=3 size=25.50KB
+   HDFS partitions=3/24 files=3 size=23.71KB
    stored statistics:
      table: rows=unavailable size=unavailable
-     partitions: 0/24 rows=12.84K
+     partitions: 0/24 rows=11.95K
      columns: unavailable
    extrapolated-rows=disabled max-scan-range-rows=unavailable
-   mem-estimate=16.00MB mem-reservation=16.00KB thread-reservation=1
-   tuple-ids=0 row-size=4B cardinality=1.27K
+   mem-estimate=16.00MB mem-reservation=8.00KB thread-reservation=1
+   tuple-ids=0 row-size=4B cardinality=1.20K
    in pipelines: 00(GETNEXT)
 ====
 # Sampling in a subquery.
@@ -247,3 +247,143 @@ PLAN-ROOT SINK
    tuple-ids=0 row-size=4B cardinality=730
    in pipelines: 00(GETNEXT)
 ====
+# Sampling Iceberg tables.
+select * from functional_parquet.iceberg_non_partitioned tablesample system(10) repeatable(1234)
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+|  Per-Host Resources: mem-estimate=68.00MB mem-reservation=4.03MB thread-reservation=2
+PLAN-ROOT SINK
+|  output exprs: functional_parquet.iceberg_non_partitioned.id, functional_parquet.iceberg_non_partitioned.user, functional_parquet.iceberg_non_partitioned.action, functional_parquet.iceberg_non_partitioned.event_time
+|  mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB thread-reservation=0
+|
+00:SCAN HDFS [functional_parquet.iceberg_non_partitioned]
+   HDFS partitions=1/1 files=3 size=3.41KB
+   stored statistics:
+     table: rows=unavailable size=unavailable
+     columns: unavailable
+   extrapolated-rows=disabled max-scan-range-rows=unavailable
+   mem-estimate=64.00MB mem-reservation=32.00KB thread-reservation=1
+   tuple-ids=0 row-size=44B cardinality=380
+   in pipelines: 00(GETNEXT)
+====
+# Sampling Iceberg tables. Count(*) is not optimized.
+select count(*) from functional_parquet.iceberg_non_partitioned tablesample system(10) repeatable(1234)
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+|  Per-Host Resources: mem-estimate=32.02MB mem-reservation=4.00MB thread-reservation=2
+PLAN-ROOT SINK
+|  output exprs: count(*)
+|  mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB thread-reservation=0
+|
+01:AGGREGATE [FINALIZE]
+|  output: count(*)
+|  mem-estimate=16.00KB mem-reservation=0B spill-buffer=2.00MB thread-reservation=0
+|  tuple-ids=1 row-size=8B cardinality=1
+|  in pipelines: 01(GETNEXT), 00(OPEN)
+|
+00:SCAN HDFS [functional_parquet.iceberg_non_partitioned]
+   HDFS partitions=1/1 files=3 size=3.41KB
+   stored statistics:
+     table: rows=unavailable size=unavailable
+     columns: all
+   extrapolated-rows=disabled max-scan-range-rows=unavailable
+   mem-estimate=32.00MB mem-reservation=8.00KB thread-reservation=1
+   tuple-ids=0 row-size=0B cardinality=380
+   in pipelines: 00(GETNEXT)
+====
+# Sampling partitioned Iceberg tables.
+select * from functional_parquet.iceberg_partitioned tablesample system(50) repeatable(1234)
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+|  Per-Host Resources: mem-estimate=68.00MB mem-reservation=4.03MB thread-reservation=2
+PLAN-ROOT SINK
+|  output exprs: functional_parquet.iceberg_partitioned.id, functional_parquet.iceberg_partitioned.user, functional_parquet.iceberg_partitioned.action, functional_parquet.iceberg_partitioned.event_time
+|  mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB thread-reservation=0
+|
+00:SCAN HDFS [functional_parquet.iceberg_partitioned]
+   HDFS partitions=1/1 files=10 size=11.46KB
+   stored statistics:
+     table: rows=unavailable size=unavailable
+     columns: unavailable
+   extrapolated-rows=disabled max-scan-range-rows=unavailable
+   mem-estimate=64.00MB mem-reservation=32.00KB thread-reservation=1
+   tuple-ids=0 row-size=44B cardinality=1.98K
+   in pipelines: 00(GETNEXT)
+====
+# Sampling Iceberg tables with predicates. Predicate pushdown to Iceberg happens
+# before sampling (similarly to static partition pruning).
+select * from functional_parquet.iceberg_partitioned tablesample system(50) repeatable(1234)
+where action = 'click'
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+|  Per-Host Resources: mem-estimate=68.00MB mem-reservation=4.03MB thread-reservation=2
+PLAN-ROOT SINK
+|  output exprs: functional_parquet.iceberg_partitioned.id, functional_parquet.iceberg_partitioned.user, functional_parquet.iceberg_partitioned.action, functional_parquet.iceberg_partitioned.event_time
+|  mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB thread-reservation=0
+|
+00:SCAN HDFS [functional_parquet.iceberg_partitioned]
+   HDFS partitions=1/1 files=4 size=4.57KB
+   predicates: action = 'click'
+   stored statistics:
+     table: rows=unavailable size=unavailable
+     columns: unavailable
+   extrapolated-rows=disabled max-scan-range-rows=unavailable
+   parquet statistics predicates: action = 'click'
+   parquet dictionary predicates: action = 'click'
+   mem-estimate=64.00MB mem-reservation=32.00KB thread-reservation=1
+   tuple-ids=0 row-size=44B cardinality=198
+   in pipelines: 00(GETNEXT)
+====
+# Sampling Iceberg V2 tables. Delete files are not sampled, only the data files. So we
+# don't return rows that are deleted.
+select * from functional_parquet.iceberg_v2_positional_not_all_data_files_have_delete_files
+tablesample system(10) repeatable(1234)
+---- PLAN
+F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
+|  Per-Host Resources: mem-estimate=101.94MB mem-reservation=5.98MB thread-reservation=3
+PLAN-ROOT SINK
+|  output exprs: functional_parquet.iceberg_v2_positional_not_all_data_files_have_delete_files.i, functional_parquet.iceberg_v2_positional_not_all_data_files_have_delete_files.s
+|  mem-estimate=4.00MB mem-reservation=4.00MB spill-buffer=2.00MB thread-reservation=0
+|
+04:UNION
+|  pass-through-operands: all
+|  mem-estimate=0B mem-reservation=0B thread-reservation=0
+|  tuple-ids=0 row-size=36B cardinality=4.85K
+|  in pipelines: 03(GETNEXT), 00(GETNEXT)
+|
+|--02:DELETE EVENTS HASH JOIN [LEFT ANTI JOIN]
+|  |  hash predicates: functional_parquet.iceberg_v2_positional_not_all_data_files_have_delete_files.file__position = functional_parquet.iceberg_v2_positional_not_all_data_files_have_delete_files-position-delete.pos, functional_parquet.iceberg_v2_positional_not_all_data_files_have_delete_files.input__file__name = functional_parquet.iceberg_v2_positional_not_all_data_files_have_delete_files-position-delete.file_path
+|  |  mem-estimate=1.94MB mem-reservation=1.94MB spill-buffer=64.00KB thread-reservation=0
+|  |  tuple-ids=0 row-size=36B cardinality=2.42K
+|  |  in pipelines: 00(GETNEXT), 01(OPEN)
+|  |
+|  |--01:SCAN HDFS [functional_parquet.iceberg_v2_positional_not_all_data_files_have_delete_files-POSITION-DELETE-01 functional_parquet.iceberg_v2_positional_not_all_data_files_have_delete_files-position-delete]
+|  |     HDFS partitions=1/1 files=2 size=5.33KB
+|  |     stored statistics:
+|  |       table: rows=4 size=5.33KB
+|  |       columns: all
+|  |     extrapolated-rows=disabled max-scan-range-rows=2
+|  |     mem-estimate=32.00MB mem-reservation=16.00KB thread-reservation=1
+|  |     tuple-ids=1 row-size=245B cardinality=4
+|  |     in pipelines: 01(GETNEXT)
+|  |
+|  00:SCAN HDFS [functional_parquet.iceberg_v2_positional_not_all_data_files_have_delete_files]
+|     HDFS partitions=1/1 files=1 size=625B
+|     stored statistics:
+|       table: rows=unavailable size=unavailable
+|       columns missing stats: i, s
+|     extrapolated-rows=disabled max-scan-range-rows=unavailable
+|     mem-estimate=64.00MB mem-reservation=32.00KB thread-reservation=1
+|     tuple-ids=0 row-size=36B cardinality=2.42K
+|     in pipelines: 00(GETNEXT)
+|
+03:SCAN HDFS [functional_parquet.iceberg_v2_positional_not_all_data_files_have_delete_files]
+   HDFS partitions=1/1 files=1 size=620B
+   stored statistics:
+     table: rows=unavailable size=unavailable
+     columns missing stats: i, s
+   extrapolated-rows=disabled max-scan-range-rows=unavailable
+   mem-estimate=64.00MB mem-reservation=32.00KB thread-reservation=1
+   tuple-ids=0 row-size=36B cardinality=2.42K
+   in pipelines: 03(GETNEXT)
+====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-tablesample-v2.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-tablesample-v2.test
new file mode 100644
index 000000000..ab0b034b9
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-tablesample-v2.test
@@ -0,0 +1,105 @@
+====
+---- QUERY
+select * from iceberg_v2_no_deletes tablesample system(10) repeatable(1234)
+---- RESULTS
+1,'x'
+2,'y'
+3,'z'
+---- TYPES
+INT, STRING
+====
+---- QUERY
+select count(*) from iceberg_v2_no_deletes tablesample system(10) repeatable(1234)
+---- RESULTS
+3
+---- TYPES
+BIGINT
+====
+---- QUERY
+select * from iceberg_v2_positional_not_all_data_files_have_delete_files tablesample system(10) repeatable(1234)
+---- RESULTS
+5,'X'
+---- TYPES
+INT, STRING
+====
+---- QUERY
+select count(*) from iceberg_v2_positional_not_all_data_files_have_delete_files tablesample system(10) repeatable(1234)
+---- RESULTS
+1
+---- TYPES
+BIGINT
+====
+---- QUERY
+select * from iceberg_v2_partitioned_position_deletes tablesample system(50) repeatable(1234);
+---- RESULTS
+18,'Alan','click',2020-01-01 10:00:00
+12,'Alan','click',2020-01-01 10:00:00
+10,'Alan','click',2020-01-01 10:00:00
+6,'Alex','view',2020-01-01 09:00:00
+20,'Alex','view',2020-01-01 09:00:00
+4,'Alex','view',2020-01-01 09:00:00
+---- TYPES
+INT, STRING, STRING, TIMESTAMP
+====
+---- QUERY
+select * from iceberg_v2_partitioned_position_deletes tablesample system(50) repeatable(1234)
+where action = 'click';
+---- RESULTS
+18,'Alan','click',2020-01-01 10:00:00
+12,'Alan','click',2020-01-01 10:00:00
+10,'Alan','click',2020-01-01 10:00:00
+---- TYPES
+INT, STRING, STRING, TIMESTAMP
+====
+---- QUERY
+select * from iceberg_v2_no_deletes_orc tablesample system(10) repeatable(1234)
+---- RESULTS
+1,'x'
+2,'y'
+3,'z'
+---- TYPES
+INT, STRING
+====
+---- QUERY
+select count(*) from iceberg_v2_no_deletes_orc tablesample system(10) repeatable(1234)
+---- RESULTS
+3
+---- TYPES
+BIGINT
+====
+---- QUERY
+select * from iceberg_v2_positional_not_all_data_files_have_delete_files_orc tablesample system(10) repeatable(1234)
+---- RESULTS
+5,'X'
+---- TYPES
+INT, STRING
+====
+---- QUERY
+select count(*) from iceberg_v2_positional_not_all_data_files_have_delete_files tablesample system(10) repeatable(1234)
+---- RESULTS
+1
+---- TYPES
+BIGINT
+====
+---- QUERY
+select * from iceberg_v2_partitioned_position_deletes_orc tablesample system(50) repeatable(1234);
+---- RESULTS
+20,'Alex','view',2020-01-01 09:00:00
+6,'Alex','view',2020-01-01 09:00:00
+4,'Alex','view',2020-01-01 09:00:00
+12,'Alan','click',2020-01-01 10:00:00
+18,'Alan','click',2020-01-01 10:00:00
+10,'Alan','click',2020-01-01 10:00:00
+---- TYPES
+INT, STRING, STRING, TIMESTAMP
+====
+---- QUERY
+select * from iceberg_v2_partitioned_position_deletes_orc tablesample system(50) repeatable(1234)
+where action = 'click';
+---- RESULTS
+12,'Alan','click',2020-01-01 10:00:00
+18,'Alan','click',2020-01-01 10:00:00
+10,'Alan','click',2020-01-01 10:00:00
+---- TYPES
+INT, STRING, STRING, TIMESTAMP
+====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/iceberg-tablesample.test b/testdata/workloads/functional-query/queries/QueryTest/iceberg-tablesample.test
new file mode 100644
index 000000000..0c3dd048d
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/iceberg-tablesample.test
@@ -0,0 +1,43 @@
+====
+---- QUERY
+set TIMEZONE='Europe/Budapest';
+select * from iceberg_non_partitioned tablesample system(10) repeatable(1234)
+---- RESULTS
+20,'Alex','view',2020-01-01 09:00:00
+9,'Alan','click',2020-01-01 10:00:00
+6,'Alex','view',2020-01-01 09:00:00
+---- TYPES
+INT, STRING, STRING, TIMESTAMP
+====
+---- QUERY
+select count(*) from iceberg_non_partitioned tablesample system(10) repeatable(1234)
+---- RESULTS
+3
+---- TYPES
+BIGINT
+====
+---- QUERY
+set TIMEZONE='Europe/Budapest';
+select * from iceberg_partitioned tablesample system(10) repeatable(1234)
+---- RESULTS
+16,'Lisa','download',2020-01-01 11:00:00
+3,'Alan','click',2020-01-01 10:00:00
+---- TYPES
+INT, STRING, STRING, TIMESTAMP
+====
+---- QUERY
+select count(*) from iceberg_partitioned tablesample system(10) repeatable(1234)
+---- RESULTS
+2
+---- TYPES
+BIGINT
+====
+---- QUERY
+set TIMEZONE='Europe/Budapest';
+select * from iceberg_partitioned tablesample system(10) repeatable(1234)
+where action = 'click';
+---- RESULTS
+10,'Alan','click',2020-01-01 10:00:00
+---- TYPES
+INT, STRING, STRING, TIMESTAMP
+====
diff --git a/tests/query_test/test_iceberg.py b/tests/query_test/test_iceberg.py
index 8ee20efa2..85f4d39e6 100644
--- a/tests/query_test/test_iceberg.py
+++ b/tests/query_test/test_iceberg.py
@@ -792,6 +792,10 @@ class TestIcebergTable(IcebergTestSuite):
     self.run_test_case('QueryTest/iceberg-mixed-file-format', vector,
                       unique_database)
 
+  def test_table_sampling(self, vector):
+    self.run_test_case('QueryTest/iceberg-tablesample', vector,
+        use_db="functional_parquet")
+
   def _create_table_like_parquet_helper(self, vector, unique_database, tbl_name,
                                         expect_success):
     create_table_from_parquet(self.client, unique_database, tbl_name)
@@ -873,3 +877,9 @@ class TestIcebergV2Table(IcebergTestSuite):
   @SkipIf.not_hdfs
   def test_read_position_deletes_orc(self, vector):
     self.run_test_case('QueryTest/iceberg-v2-read-position-deletes-orc', vector)
+
+  @SkipIfDockerizedCluster.internal_hostname
+  @SkipIf.not_hdfs
+  def test_table_sampling_v2(self, vector):
+    self.run_test_case('QueryTest/iceberg-tablesample-v2', vector,
+        use_db="functional_parquet")