You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2022/08/08 23:37:17 UTC
[impala] 20/27: IMPALA-11414: Off-by-one error in Parquet late materialization
This is an automated email from the ASF dual-hosted git repository.
stigahuang pushed a commit to branch branch-4.1.1
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 83e9cfc1d57827f72121d70781aa5e70ae39d7b1
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
AuthorDate: Mon Jul 4 16:22:03 2022 +0200
IMPALA-11414: Off-by-one error in Parquet late materialization
With PARQUET_LATE_MATERIALIZATION we can set the minimum number of
consecutive rows that, if filtered out, let us avoid materializing the
corresponding rows of the other columns in Parquet.
E.g. if PARQUET_LATE_MATERIALIZATION is 10, and in a filtered column we
find at least 10 consecutive rows that don't pass the predicates we
avoid materializing the corresponding rows in the other columns.
But due to an off-by-one error we actually only need
(PARQUET_LATE_MATERIALIZATION - 1) consecutive elements. This means if
we set PARQUET_LATE_MATERIALIZATION to one, then we need zero
consecutive filtered-out elements, which leads to a crash/DCHECK. The bug
is in the GetMicroBatches() algorithm when we produce the micro batches
based on the selected rows.
Setting PARQUET_LATE_MATERIALIZATION to 0 doesn't make sense so it
shouldn't be allowed.
Testing:
* e2e test with PARQUET_LATE_MATERIALIZATION=1
* e2e test for checking SET PARQUET_LATE_MATERIALIZATION=N
Change-Id: I38f95ad48c4ac8c1e06651565ab5c496283b29fa
Reviewed-on: http://gerrit.cloudera.org:8080/18700
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/exec/scratch-tuple-batch-test.cc | 2 +-
be/src/exec/scratch-tuple-batch.h | 11 ++++++-----
be/src/service/query-options.cc | 6 ++++--
.../parquet-late-materialization-unique-db.test | 15 +++++++++++++++
.../functional-query/queries/QueryTest/set.test | 16 ++++++++++++++++
tests/query_test/test_parquet_late_materialization.py | 4 ++++
6 files changed, 46 insertions(+), 8 deletions(-)
diff --git a/be/src/exec/scratch-tuple-batch-test.cc b/be/src/exec/scratch-tuple-batch-test.cc
index fc730bc78..df9d3d2ff 100644
--- a/be/src/exec/scratch-tuple-batch-test.cc
+++ b/be/src/exec/scratch-tuple-batch-test.cc
@@ -64,7 +64,7 @@ class ScratchTupleBatchTest : public testing::Test {
int last_true_idx = batch.start;
for (int j = batch.start + 1; j < batch.end; j++) {
if (selected_rows[j]) {
- EXPECT_TRUE(j - last_true_idx + 1 <= gap);
+ EXPECT_LE(j - last_true_idx, gap);
last_true_idx = j;
}
}
diff --git a/be/src/exec/scratch-tuple-batch.h b/be/src/exec/scratch-tuple-batch.h
index 31edd2bb9..6513c5c70 100644
--- a/be/src/exec/scratch-tuple-batch.h
+++ b/be/src/exec/scratch-tuple-batch.h
@@ -169,10 +169,11 @@ struct ScratchTupleBatch {
/// Creates micro batches that needs to be scanned.
/// Bits set in 'selected_rows' are the rows that needs to be scanned. Consecutive
/// bits set are used to create micro batches. Micro batches that differ by less than
- /// 'skip_length', are merged together. E.g., for micro batches 1-8, 11-20, 35-100
- /// derived from 'selected_rows' and 'skip_length' as 10, first two micro batches would
- /// be merged into 1-20 as they differ by 3 (11 - 8) which is less than 10
- /// ('skip_length'). Precondition for the function is there is at least one micro batch
+ /// or equal to 'skip_length', are merged together.
+ /// E.g., for micro batches 1-8, 11-20, 35-100 derived from 'selected_rows' and
+ /// 'skip_length' as 10, first two micro batches would be merged into 1-20 as there is a
+ /// non-matching run of 2 rows (9, 10) which is <= 10 ('skip_length').
+ /// Precondition for the function is there is at least one micro batch
/// present i.e., atleast one of the 'selected_rows' is true.
int GetMicroBatches(int skip_length, ScratchMicroBatch* batches) {
int batch_idx = 0;
@@ -185,7 +186,7 @@ struct ScratchTupleBatch {
// start the first ever micro batch.
start = i;
last = i;
- } else if (i - last < skip_length) {
+ } else if (i - last <= skip_length) {
// continue the old micro batch as 'last' is within 'skip_length' of last
// micro batch.
last = i;
diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc
index 49bb06476..c5a1206ff 100644
--- a/be/src/service/query-options.cc
+++ b/be/src/service/query-options.cc
@@ -1160,9 +1160,11 @@ Status impala::SetQueryOption(const string& key, const string& value,
StringParser::ParseResult result;
const int32_t threshold =
StringParser::StringToInt<int32_t>(value.c_str(), value.length(), &result);
- if (value == nullptr || result != StringParser::PARSE_SUCCESS || threshold < -1) {
+ if (value == nullptr || result != StringParser::PARSE_SUCCESS || threshold < -1 ||
+ threshold == 0) {
return Status(Substitute("Invalid parquet late materialization threshold: "
- "'$0'. Only integer value -1 and above is allowed.", value));
+ "'$0'. Use -1 to disable the feature, and values > 0 to specify the "
+ "minimum skip length.", value));
}
query_options->__set_parquet_late_materialization_threshold(threshold);
break;
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-late-materialization-unique-db.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-late-materialization-unique-db.test
new file mode 100644
index 000000000..8097bb781
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-late-materialization-unique-db.test
@@ -0,0 +1,15 @@
+# This tests pages skipped by parquet late materialization.
+====
+---- QUERY
+create table late_mat(i int, j int)
+stored as parquet;
+insert into late_mat values (1,1), (0,0), (1,1), (0,0), (0,0), (1,1);
+set PARQUET_LATE_MATERIALIZATION_THRESHOLD=1;
+select i, j from late_mat where j = 0;
+---- RESULTS
+0,0
+0,0
+0,0
+---- TYPES
+INT, INT
+====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/set.test b/testdata/workloads/functional-query/queries/QueryTest/set.test
index b899f8656..f3a49aaeb 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/set.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/set.test
@@ -284,3 +284,19 @@ set SCAN_NODE_CODEGEN_THRESHOLD = "foo";
set max_io_buffers="foo";
---- RESULTS
====
+---- QUERY
+set PARQUET_LATE_MATERIALIZATION_THRESHOLD=-1;
+set PARQUET_LATE_MATERIALIZATION_THRESHOLD=1;
+set PARQUET_LATE_MATERIALIZATION_THRESHOLD=10000;
+---- RESULTS
+====
+---- QUERY
+set PARQUET_LATE_MATERIALIZATION_THRESHOLD=-2;
+---- CATCH
+Invalid parquet late materialization threshold: '-2'. Use -1 to disable the feature, and values > 0 to specify the minimum skip length.
+====
+---- QUERY
+set PARQUET_LATE_MATERIALIZATION_THRESHOLD=0;
+---- CATCH
+Invalid parquet late materialization threshold: '0'. Use -1 to disable the feature, and values > 0 to specify the minimum skip length.
+====
diff --git a/tests/query_test/test_parquet_late_materialization.py b/tests/query_test/test_parquet_late_materialization.py
index a1f64d92c..670daa4d6 100644
--- a/tests/query_test/test_parquet_late_materialization.py
+++ b/tests/query_test/test_parquet_late_materialization.py
@@ -35,3 +35,7 @@ class TestParquetLateMaterialization(ImpalaTestSuite):
def test_parquet_late_materialization(self, vector):
self.run_test_case('QueryTest/parquet-late-materialization', vector)
+
+ def test_parquet_late_materialization_unique_db(self, vector, unique_database):
+ self.run_test_case('QueryTest/parquet-late-materialization-unique-db', vector,
+ unique_database)