You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2020/02/10 17:20:54 UTC

[impala] branch master updated: IMPALA-8110: Fix Parquet min/max filters for narrowed integer types

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new ebc2c36  IMPALA-8110: Fix Parquet min/max filters for narrowed integer types
ebc2c36 is described below

commit ebc2c366f5780a89f09eb2014ca94f9d970f50b4
Author: wzhou-code <wz...@cloudera.com>
AuthorDate: Tue Jan 21 14:56:14 2020 -0800

    IMPALA-8110: Fix Parquet min/max filters for narrowed integer types
    
    This patch adds validation for the paired stats values of tinyint
    and smallint column data type when reading min/max column stats
    value from Parquet file.
    
    Testing:
     - Added automatic test cases in parquet-stats.test for column data
       type been changed from int to tinyint, from smallint to tinyint
       and from int to smallint.
     - Passed EE tests.
     - Passed all core tests.
    
    Change-Id: Id8bdaf4c4b2d0c6ea26d6e9bf013afca647e53a1
    Reviewed-on: http://gerrit.cloudera.org:8080/15087
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/parquet/hdfs-parquet-scanner.cc        |  6 +-
 be/src/exec/parquet/parquet-column-stats.cc        | 38 ++++++++--
 be/src/exec/parquet/parquet-column-stats.h         |  3 +
 .../queries/QueryTest/parquet-stats.test           | 82 ++++++++++++++++++++++
 4 files changed, 121 insertions(+), 8 deletions(-)

diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.cc b/be/src/exec/parquet/hdfs-parquet-scanner.cc
index 0a74f34..9760aff 100644
--- a/be/src/exec/parquet/hdfs-parquet-scanner.cc
+++ b/be/src/exec/parquet/hdfs-parquet-scanner.cc
@@ -697,10 +697,12 @@ bool HdfsParquetScanner::ReadStatFromIndex(const ColumnStatsReader& stats_reader
   switch (stats_field) {
     case ColumnStatsReader::StatsField::MIN:
       return stats_reader.ReadFromString(
-          stats_field, column_index.min_values[page_idx], slot);
+          stats_field, column_index.min_values[page_idx],
+          &column_index.max_values[page_idx], slot);
     case ColumnStatsReader::StatsField::MAX:
       return stats_reader.ReadFromString(
-          stats_field, column_index.max_values[page_idx], slot);
+          stats_field, column_index.max_values[page_idx],
+          &column_index.min_values[page_idx], slot);
     default:
       DCHECK(false);
   }
diff --git a/be/src/exec/parquet/parquet-column-stats.cc b/be/src/exec/parquet/parquet-column-stats.cc
index 7dca501..6073860 100644
--- a/be/src/exec/parquet/parquet-column-stats.cc
+++ b/be/src/exec/parquet/parquet-column-stats.cc
@@ -48,23 +48,32 @@ bool ColumnStatsReader::ReadFromThrift(StatsField stats_field, void* slot) const
   // Try to read the requested stats field. If it is not set, we may fall back to reading
   // the old stats, based on the column type.
   const string* stat_value = nullptr;
+  const string* paired_stats_value = nullptr;
   switch (stats_field) {
     case StatsField::MIN:
       if (stats.__isset.min_value && CanUseStats()) {
         stat_value = &stats.min_value;
+        if (stats.__isset.max_value)
+          paired_stats_value = &stats.max_value;
         break;
       }
       if (stats.__isset.min && CanUseDeprecatedStats()) {
         stat_value = &stats.min;
+        if (stats.__isset.max)
+          paired_stats_value = &stats.max;
       }
       break;
     case StatsField::MAX:
       if (stats.__isset.max_value && CanUseStats()) {
         stat_value = &stats.max_value;
+        if (stats.__isset.min_value)
+          paired_stats_value = &stats.min_value;
         break;
       }
       if (stats.__isset.max && CanUseDeprecatedStats()) {
         stat_value = &stats.max;
+        if (stats.__isset.min)
+          paired_stats_value = &stats.min;
       }
       break;
     default:
@@ -72,11 +81,11 @@ bool ColumnStatsReader::ReadFromThrift(StatsField stats_field, void* slot) const
   }
   if (stat_value == nullptr) return false;
 
-  return ReadFromString(stats_field, *stat_value, slot);
+  return ReadFromString(stats_field, *stat_value, paired_stats_value, slot);
 }
 
 bool ColumnStatsReader::ReadFromString(StatsField stats_field,
-    const string& encoded_value, void* slot) const {
+    const string& encoded_value, const string* paired_stats_value, void* slot) const {
   switch (col_type_.type) {
     case TYPE_BOOLEAN:
       return ColumnStats<bool>::DecodePlainValue(encoded_value, slot,
@@ -84,10 +93,20 @@ bool ColumnStatsReader::ReadFromString(StatsField stats_field,
     case TYPE_TINYINT: {
       // parquet::Statistics encodes INT_8 values using 4 bytes.
       int32_t col_stats;
+      int32_t paired_stats_val = 0;
       bool ret = ColumnStats<int32_t>::DecodePlainValue(encoded_value, &col_stats,
           parquet::Type::INT32);
-      if (!ret || col_stats < std::numeric_limits<int8_t>::min() ||
-          col_stats > std::numeric_limits<int8_t>::max()) {
+      if (!ret || paired_stats_value == nullptr) return false;
+      ret = ColumnStats<int32_t>::DecodePlainValue(*paired_stats_value,
+          &paired_stats_val, parquet::Type::INT32);
+      // Check if the values of the column stats and paired stats are in valid range.
+      // The column stats values could be invalid if the column data type
+      // has been changed.
+      if (!ret ||
+          col_stats < std::numeric_limits<int8_t>::min() ||
+          col_stats > std::numeric_limits<int8_t>::max() ||
+          paired_stats_val < std::numeric_limits<int8_t>::min() ||
+          paired_stats_val > std::numeric_limits<int8_t>::max()) {
         return false;
       }
       *static_cast<int8_t*>(slot) = col_stats;
@@ -96,10 +115,17 @@ bool ColumnStatsReader::ReadFromString(StatsField stats_field,
     case TYPE_SMALLINT: {
       // parquet::Statistics encodes INT_16 values using 4 bytes.
       int32_t col_stats;
+      int32_t paired_stats_val = 0;
       bool ret = ColumnStats<int32_t>::DecodePlainValue(encoded_value, &col_stats,
           parquet::Type::INT32);
-      if (!ret || col_stats < std::numeric_limits<int16_t>::min() ||
-          col_stats > std::numeric_limits<int16_t>::max()) {
+      if (!ret || paired_stats_value == nullptr) return false;
+      ret = ColumnStats<int32_t>::DecodePlainValue(*paired_stats_value,
+          &paired_stats_val, parquet::Type::INT32);
+      if (!ret ||
+          col_stats < std::numeric_limits<int16_t>::min() ||
+          col_stats > std::numeric_limits<int16_t>::max() ||
+          paired_stats_val < std::numeric_limits<int16_t>::min() ||
+          paired_stats_val > std::numeric_limits<int16_t>::max()) {
         return false;
       }
       *static_cast<int16_t*>(slot) = col_stats;
diff --git a/be/src/exec/parquet/parquet-column-stats.h b/be/src/exec/parquet/parquet-column-stats.h
index 9eb08ff..89d1ad6 100644
--- a/be/src/exec/parquet/parquet-column-stats.h
+++ b/be/src/exec/parquet/parquet-column-stats.h
@@ -271,7 +271,10 @@ public:
   bool ReadFromThrift(StatsField stats_field, void* slot) const;
 
   /// Read plain encoded value from a string 'encoded_value' into 'slot'.
+  /// Set paired_stats_value as nullptr if there is no corresponding paired stats,
+  /// or paired stats value is not set.
   bool ReadFromString(StatsField stats_field, const std::string& encoded_value,
+      const std::string* paired_stats_value,
       void* slot) const;
 
   // Gets the null_count statistics from the column chunk's metadata and returns
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
index 0d01933..8c9585b 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
@@ -681,3 +681,85 @@ NaN
 select * from test_multiple_nans where val > 20
 ---- RESULTS
 ====
+---- QUERY
+# Test the cases when column data types are changed from 32 bit integer to 8 bit integer,
+# from 16 bit integer to 8 bit integer, and from 32 bit integer to 16 bit integer.
+# Some existing data values in the table will be overflow and cause the min/max stats
+# values to be invalid.
+set PARQUET_READ_STATISTICS=1;
+create table tnarrow(i32_to_8 int, i16_to_8 smallint, i32_to_16 int) stored as parquet;
+insert into tnarrow values (1, 2, 3), (200, 201, 40000);
+alter table tnarrow change column i32_to_8 i32_to_8 tinyint;
+alter table tnarrow change column i16_to_8 i16_to_8 tinyint;
+alter table tnarrow change column i32_to_16 i32_to_16 smallint;
+insert into tnarrow values (5, 6, 7), (-5, -6, -7);
+select count(*) from tnarrow;
+---- RESULTS
+4
+====
+---- QUERY
+# Test the case when column data type is changed from 32 bit integer to 8 bit integer.
+select count(*) from tnarrow where i32_to_8 < 0;
+---- RESULTS
+2
+====
+---- QUERY
+select count(*) from tnarrow where i32_to_8 > 0;
+---- RESULTS
+2
+====
+---- QUERY
+select count(*) from tnarrow where i32_to_8 = 1;
+---- RESULTS
+1
+====
+---- QUERY
+# When the column data type is changed, 200 is overflown as value -56.
+select count(*) from tnarrow where i32_to_8 = -56;
+---- RESULTS
+1
+====
+---- QUERY
+# Test the case when column data type is changed from 16 bit integer to 8 bit integer.
+select count(*) from tnarrow where i16_to_8 < 0;
+---- RESULTS
+2
+====
+---- QUERY
+select count(*) from tnarrow where i16_to_8 > 0;
+---- RESULTS
+2
+====
+---- QUERY
+select count(*) from tnarrow where i16_to_8 = 2;
+---- RESULTS
+1
+====
+---- QUERY
+# When the column data type is changed, 201 is overflown as value -55.
+select count(*) from tnarrow where i16_to_8 = -55;
+---- RESULTS
+1
+====
+---- QUERY
+# Test the case when column data type is changed from 32 bit integer to 16 bit integer.
+select count(*) from tnarrow where i32_to_16 < 0;
+---- RESULTS
+2
+====
+---- QUERY
+select count(*) from tnarrow where i32_to_16 > 0;
+---- RESULTS
+2
+====
+---- QUERY
+select count(*) from tnarrow where i32_to_16 = 3;
+---- RESULTS
+1
+====
+---- QUERY
+# When the column data type is changed, 40000 is overflown as value -25536.
+select count(*) from tnarrow where i32_to_16 = -25536;
+---- RESULTS
+1
+====