You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2020/02/10 17:20:54 UTC
[impala] branch master updated: IMPALA-8110: Fix Parquet min/max
filters for narrowed integer types
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new ebc2c36 IMPALA-8110: Fix Parquet min/max filters for narrowed integer types
ebc2c36 is described below
commit ebc2c366f5780a89f09eb2014ca94f9d970f50b4
Author: wzhou-code <wz...@cloudera.com>
AuthorDate: Tue Jan 21 14:56:14 2020 -0800
IMPALA-8110: Fix Parquet min/max filters for narrowed integer types
This patch adds validation for the paired stats values of tinyint
and smallint column data type when reading min/max column stats
value from Parquet file.
Testing:
- Added automatic test cases in parquet-stats.test for column data
type been changed from int to tinyint, from smallint to tinyint
and from int to smallint.
- Passed EE tests.
- Passed all core tests.
Change-Id: Id8bdaf4c4b2d0c6ea26d6e9bf013afca647e53a1
Reviewed-on: http://gerrit.cloudera.org:8080/15087
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/exec/parquet/hdfs-parquet-scanner.cc | 6 +-
be/src/exec/parquet/parquet-column-stats.cc | 38 ++++++++--
be/src/exec/parquet/parquet-column-stats.h | 3 +
.../queries/QueryTest/parquet-stats.test | 82 ++++++++++++++++++++++
4 files changed, 121 insertions(+), 8 deletions(-)
diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.cc b/be/src/exec/parquet/hdfs-parquet-scanner.cc
index 0a74f34..9760aff 100644
--- a/be/src/exec/parquet/hdfs-parquet-scanner.cc
+++ b/be/src/exec/parquet/hdfs-parquet-scanner.cc
@@ -697,10 +697,12 @@ bool HdfsParquetScanner::ReadStatFromIndex(const ColumnStatsReader& stats_reader
switch (stats_field) {
case ColumnStatsReader::StatsField::MIN:
return stats_reader.ReadFromString(
- stats_field, column_index.min_values[page_idx], slot);
+ stats_field, column_index.min_values[page_idx],
+ &column_index.max_values[page_idx], slot);
case ColumnStatsReader::StatsField::MAX:
return stats_reader.ReadFromString(
- stats_field, column_index.max_values[page_idx], slot);
+ stats_field, column_index.max_values[page_idx],
+ &column_index.min_values[page_idx], slot);
default:
DCHECK(false);
}
diff --git a/be/src/exec/parquet/parquet-column-stats.cc b/be/src/exec/parquet/parquet-column-stats.cc
index 7dca501..6073860 100644
--- a/be/src/exec/parquet/parquet-column-stats.cc
+++ b/be/src/exec/parquet/parquet-column-stats.cc
@@ -48,23 +48,32 @@ bool ColumnStatsReader::ReadFromThrift(StatsField stats_field, void* slot) const
// Try to read the requested stats field. If it is not set, we may fall back to reading
// the old stats, based on the column type.
const string* stat_value = nullptr;
+ const string* paired_stats_value = nullptr;
switch (stats_field) {
case StatsField::MIN:
if (stats.__isset.min_value && CanUseStats()) {
stat_value = &stats.min_value;
+ if (stats.__isset.max_value)
+ paired_stats_value = &stats.max_value;
break;
}
if (stats.__isset.min && CanUseDeprecatedStats()) {
stat_value = &stats.min;
+ if (stats.__isset.max)
+ paired_stats_value = &stats.max;
}
break;
case StatsField::MAX:
if (stats.__isset.max_value && CanUseStats()) {
stat_value = &stats.max_value;
+ if (stats.__isset.min_value)
+ paired_stats_value = &stats.min_value;
break;
}
if (stats.__isset.max && CanUseDeprecatedStats()) {
stat_value = &stats.max;
+ if (stats.__isset.min)
+ paired_stats_value = &stats.min;
}
break;
default:
@@ -72,11 +81,11 @@ bool ColumnStatsReader::ReadFromThrift(StatsField stats_field, void* slot) const
}
if (stat_value == nullptr) return false;
- return ReadFromString(stats_field, *stat_value, slot);
+ return ReadFromString(stats_field, *stat_value, paired_stats_value, slot);
}
bool ColumnStatsReader::ReadFromString(StatsField stats_field,
- const string& encoded_value, void* slot) const {
+ const string& encoded_value, const string* paired_stats_value, void* slot) const {
switch (col_type_.type) {
case TYPE_BOOLEAN:
return ColumnStats<bool>::DecodePlainValue(encoded_value, slot,
@@ -84,10 +93,20 @@ bool ColumnStatsReader::ReadFromString(StatsField stats_field,
case TYPE_TINYINT: {
// parquet::Statistics encodes INT_8 values using 4 bytes.
int32_t col_stats;
+ int32_t paired_stats_val = 0;
bool ret = ColumnStats<int32_t>::DecodePlainValue(encoded_value, &col_stats,
parquet::Type::INT32);
- if (!ret || col_stats < std::numeric_limits<int8_t>::min() ||
- col_stats > std::numeric_limits<int8_t>::max()) {
+ if (!ret || paired_stats_value == nullptr) return false;
+ ret = ColumnStats<int32_t>::DecodePlainValue(*paired_stats_value,
+ &paired_stats_val, parquet::Type::INT32);
+ // Check if the values of the column stats and paired stats are in valid range.
+ // The column stats values could be invalid if the column data type
+ // has been changed.
+ if (!ret ||
+ col_stats < std::numeric_limits<int8_t>::min() ||
+ col_stats > std::numeric_limits<int8_t>::max() ||
+ paired_stats_val < std::numeric_limits<int8_t>::min() ||
+ paired_stats_val > std::numeric_limits<int8_t>::max()) {
return false;
}
*static_cast<int8_t*>(slot) = col_stats;
@@ -96,10 +115,17 @@ bool ColumnStatsReader::ReadFromString(StatsField stats_field,
case TYPE_SMALLINT: {
// parquet::Statistics encodes INT_16 values using 4 bytes.
int32_t col_stats;
+ int32_t paired_stats_val = 0;
bool ret = ColumnStats<int32_t>::DecodePlainValue(encoded_value, &col_stats,
parquet::Type::INT32);
- if (!ret || col_stats < std::numeric_limits<int16_t>::min() ||
- col_stats > std::numeric_limits<int16_t>::max()) {
+ if (!ret || paired_stats_value == nullptr) return false;
+ ret = ColumnStats<int32_t>::DecodePlainValue(*paired_stats_value,
+ &paired_stats_val, parquet::Type::INT32);
+ if (!ret ||
+ col_stats < std::numeric_limits<int16_t>::min() ||
+ col_stats > std::numeric_limits<int16_t>::max() ||
+ paired_stats_val < std::numeric_limits<int16_t>::min() ||
+ paired_stats_val > std::numeric_limits<int16_t>::max()) {
return false;
}
*static_cast<int16_t*>(slot) = col_stats;
diff --git a/be/src/exec/parquet/parquet-column-stats.h b/be/src/exec/parquet/parquet-column-stats.h
index 9eb08ff..89d1ad6 100644
--- a/be/src/exec/parquet/parquet-column-stats.h
+++ b/be/src/exec/parquet/parquet-column-stats.h
@@ -271,7 +271,10 @@ public:
bool ReadFromThrift(StatsField stats_field, void* slot) const;
/// Read plain encoded value from a string 'encoded_value' into 'slot'.
+ /// Set paired_stats_value as nullptr if there is no corresponding paired stats,
+ /// or paired stats value is not set.
bool ReadFromString(StatsField stats_field, const std::string& encoded_value,
+ const std::string* paired_stats_value,
void* slot) const;
// Gets the null_count statistics from the column chunk's metadata and returns
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
index 0d01933..8c9585b 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
@@ -681,3 +681,85 @@ NaN
select * from test_multiple_nans where val > 20
---- RESULTS
====
+---- QUERY
+# Test the cases when column data types are changed from 32 bit integer to 8 bit integer,
+# from 16 bit integer to 8 bit integer, and from 32 bit integer to 16 bit integer.
+# Some existing data values in the table will be overflow and cause the min/max stats
+# values to be invalid.
+set PARQUET_READ_STATISTICS=1;
+create table tnarrow(i32_to_8 int, i16_to_8 smallint, i32_to_16 int) stored as parquet;
+insert into tnarrow values (1, 2, 3), (200, 201, 40000);
+alter table tnarrow change column i32_to_8 i32_to_8 tinyint;
+alter table tnarrow change column i16_to_8 i16_to_8 tinyint;
+alter table tnarrow change column i32_to_16 i32_to_16 smallint;
+insert into tnarrow values (5, 6, 7), (-5, -6, -7);
+select count(*) from tnarrow;
+---- RESULTS
+4
+====
+---- QUERY
+# Test the case when column data type is changed from 32 bit integer to 8 bit integer.
+select count(*) from tnarrow where i32_to_8 < 0;
+---- RESULTS
+2
+====
+---- QUERY
+select count(*) from tnarrow where i32_to_8 > 0;
+---- RESULTS
+2
+====
+---- QUERY
+select count(*) from tnarrow where i32_to_8 = 1;
+---- RESULTS
+1
+====
+---- QUERY
+# When the column data type is changed, 200 is overflown as value -56.
+select count(*) from tnarrow where i32_to_8 = -56;
+---- RESULTS
+1
+====
+---- QUERY
+# Test the case when column data type is changed from 16 bit integer to 8 bit integer.
+select count(*) from tnarrow where i16_to_8 < 0;
+---- RESULTS
+2
+====
+---- QUERY
+select count(*) from tnarrow where i16_to_8 > 0;
+---- RESULTS
+2
+====
+---- QUERY
+select count(*) from tnarrow where i16_to_8 = 2;
+---- RESULTS
+1
+====
+---- QUERY
+# When the column data type is changed, 201 is overflown as value -55.
+select count(*) from tnarrow where i16_to_8 = -55;
+---- RESULTS
+1
+====
+---- QUERY
+# Test the case when column data type is changed from 32 bit integer to 16 bit integer.
+select count(*) from tnarrow where i32_to_16 < 0;
+---- RESULTS
+2
+====
+---- QUERY
+select count(*) from tnarrow where i32_to_16 > 0;
+---- RESULTS
+2
+====
+---- QUERY
+select count(*) from tnarrow where i32_to_16 = 3;
+---- RESULTS
+1
+====
+---- QUERY
+# When the column data type is changed, 40000 is overflown as value -25536.
+select count(*) from tnarrow where i32_to_16 = -25536;
+---- RESULTS
+1
+====