You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2018/03/08 16:15:05 UTC
[4/5] impala git commit: IMPALA-6542: Fix inconsistent write path of Parquet min/max statistics

IMPALA-6542: Fix inconsistent write path of Parquet min/max statistics

Quick fix of Parquet write path until the Parquet community
agrees on the ordering of floating point numbers.

The behavior follows the way fmax()/fmin() works, ie. Impala
will only write NaN into the stats when all the values are NaNs.
This behavior is aligned with the quick fix of Parquet-CPP.

Added e2e tests as well.

Change-Id: I3957806948f7c661af4be5495f2ec92d1e9fc9d6
Reviewed-on: http://gerrit.cloudera.org:8080/9381
Reviewed-by: Lars Volker <lv...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/1201f284
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/1201f284
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/1201f284

Branch: refs/heads/2.x
Commit: 1201f284738ebc29a66a522591a1feca4f7a505e
Parents: d006dcc
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
Authored: Wed Feb 21 17:29:02 2018 +0100
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Thu Mar 8 10:24:38 2018 +0000

----------------------------------------------------------------------
 be/src/exec/parquet-column-stats.h              | 14 +++
 be/src/exec/parquet-column-stats.inline.h       |  4 +-
 .../queries/QueryTest/parquet-stats.test        | 89 ++++++++++++++++----
 3 files changed, 90 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/1201f284/be/src/exec/parquet-column-stats.h
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-stats.h b/be/src/exec/parquet-column-stats.h
index e9cf801..44d4a65 100644
--- a/be/src/exec/parquet-column-stats.h
+++ b/be/src/exec/parquet-column-stats.h
@@ -62,6 +62,20 @@ class ColumnStatsBase {
   /// the minimum or maximum value.
   enum class StatsField { MIN, MAX };
 
+  /// min and max functions for types that are not floating point numbers
+  template <typename T, typename Enable = void>
+  struct MinMaxTrait {
+    static decltype(auto) MinValue(const T& a, const T& b) { return std::min(a, b); }
+    static decltype(auto) MaxValue(const T& a, const T& b) { return std::max(a, b); }
+  };
+
+  /// min and max functions for floating point types
+  template <typename T>
+  struct MinMaxTrait<T, std::enable_if_t<std::is_floating_point<T>::value>> {
+    static decltype(auto) MinValue(const T& a, const T& b) { return std::fmin(a, b); }
+    static decltype(auto) MaxValue(const T& a, const T& b) { return std::fmax(a, b); }
+  };
+
   ColumnStatsBase() : has_min_max_values_(false), null_count_(0) {}
   virtual ~ColumnStatsBase() {}
 

http://git-wip-us.apache.org/repos/asf/impala/blob/1201f284/be/src/exec/parquet-column-stats.inline.h
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-stats.inline.h b/be/src/exec/parquet-column-stats.inline.h
index 5b67ee7..0b618f9 100644
--- a/be/src/exec/parquet-column-stats.inline.h
+++ b/be/src/exec/parquet-column-stats.inline.h
@@ -37,8 +37,8 @@ inline void ColumnStats<T>::Update(const T& min_value, const T& max_value) {
     min_value_ = min_value;
     max_value_ = max_value;
   } else {
-    min_value_ = std::min(min_value_, min_value);
-    max_value_ = std::max(max_value_, max_value);
+    min_value_ = MinMaxTrait<T>::MinValue(min_value_, min_value);
+    max_value_ = MinMaxTrait<T>::MaxValue(max_value_, max_value);
   }
 }
 

http://git-wip-us.apache.org/repos/asf/impala/blob/1201f284/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
index 273dff8..73ad3e4 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
@@ -492,9 +492,9 @@ aggregation(SUM, NumRowGroups): 1
 aggregation(SUM, NumStatsFilteredRowGroups): 0
 ====
 ---- QUERY
-# IMPALA-6527, IMPALA-6538: NaN values lead to incorrect filtering.
-# When the first value is NaN in a column chunk, Impala chooses it as min_value and
-# max_value for statistics. In this case the min/max filter should be ignored.
+# IMPALA-6527: NaN values lead to incorrect filtering.
+# When the first value is NaN in a column chunk, Impala chose it as min_value and
+# max_value for statistics. Test if it is no longer the case.
 create table test_nan(val double) stored as parquet;
 insert into test_nan values (cast('NaN' as double)), (42);
 select * from test_nan where val > 0
@@ -502,29 +502,39 @@ select * from test_nan where val > 0
 42
 ====
 ---- QUERY
-# IMPALA-6527, IMPALA-6538: NaN values lead to incorrect filtering
-# test equality predicate
-select * from test_nan where val = 42
+# IMPALA-6527: NaN values lead to incorrect filtering
+# Test if '<' predicate produces expected results as well.
+select * from test_nan where val < 100
 ---- RESULTS
 42
 ====
 ---- QUERY
 # IMPALA-6527: NaN values lead to incorrect filtering
-# test predicate that is true for NaN
+# Test if valid statistics are written. The column chunk should be filtered out by
+# the min filter.
+select * from test_nan where val < 10
+---- RESULTS
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+aggregation(SUM, NumStatsFilteredRowGroups): 1
+====
+---- QUERY
+# IMPALA-6527: NaN values lead to incorrect filtering
+# Test predicate that is true for NaN.
 select * from test_nan where not val >= 0
 ---- RESULTS
 NaN
 ====
 ---- QUERY
 # IMPALA-6527: NaN values lead to incorrect filtering
-# test predicate that is true for NaN
+# Test predicate that is true for NaN.
 select * from test_nan where val != 0
 ---- RESULTS
 NaN
 42
 ====
 ---- QUERY
-# Statistics filtering must not filter out row groups if predicate can be true for NaN
+# Statistics filtering must not filter out row groups if predicate can be true for NaN.
 create table test_nan_true_predicate(val double) stored as parquet;
 insert into test_nan_true_predicate values (10), (20), (cast('NaN' as double));
 select * from test_nan_true_predicate where not val >= 0
@@ -532,22 +542,22 @@ select * from test_nan_true_predicate where not val >= 0
 NaN
 ====
 ---- QUERY
-# NaN is the last element, predicate is true for NaN and value
+# NaN is the last element, predicate is true for NaN and value.
 select * from test_nan_true_predicate where not val >= 20
 ---- RESULTS
 10
 NaN
 ====
 ---- QUERY
-# NaN is the last element, predicate is true for NaN and value
+# NaN is the last element, predicate is true for NaN and value.
 select * from test_nan_true_predicate where val != 10
 ---- RESULTS
 20
 NaN
 ====
 ---- QUERY
-# Test the case when NaN is inserted between two values
-# Test predicate true for NaN and false for the values
+# Test the case when NaN is inserted between two values.
+# Test predicate true for NaN and false for the values.
 create table test_nan_in_the_middle(val double) stored as parquet;
 insert into test_nan_in_the_middle values (10), (cast('NaN' as double)), (20);
 select * from test_nan_in_the_middle where not val >= 0
@@ -555,16 +565,65 @@ select * from test_nan_in_the_middle where not val >= 0
 NaN
 ====
 ---- QUERY
-# NaN in the middle, predicate true for NaN and value
+# NaN in the middle, predicate true for NaN and value.
 select * from test_nan_in_the_middle where not val >= 20
 ---- RESULTS
 10
 NaN
 ====
 ---- QUERY
-# NaN in the middle, '!=' should return NaN and value
+# NaN in the middle, '!=' should return NaN and value.
 select * from test_nan_in_the_middle where val != 10
 ---- RESULTS
 NaN
 20
 ====
+---- QUERY
+# Test the case when there are only NaNs in the column chunk.
+# Test predicate true for NaN
+create table test_nan_only(val double) stored as parquet;
+insert into test_nan_only values (cast('NaN' as double)), (cast('NaN' as double)),
+    (cast('NaN' as double));
+select * from test_nan_only where not val >= 0
+---- RESULTS
+NaN
+NaN
+NaN
+====
+---- QUERY
+# There are only NaN values, predicate is false for NaN
+select * from test_nan_only where val >= 20
+---- RESULTS
+====
+---- QUERY
+# Test the case when a number is following multiple NaNs.
+# Test predicate true for NaN, false for the inserted number
+create table test_multiple_nans(val double) stored as parquet;
+insert into test_multiple_nans values (cast('NaN' as double)), (cast('NaN' as double)),
+    (cast('NaN' as double)), (20);
+select * from test_multiple_nans where not val >= 0
+---- RESULTS
+NaN
+NaN
+NaN
+====
+---- QUERY
+# Multiple NaNs followed by a number, predicate is false for NaN and true for the number
+select * from test_multiple_nans where val >= 20
+---- RESULTS
+20
+====
+---- QUERY
+# Multiple NaNs followed by a number, predicate is true for NaN and true for the number
+select * from test_multiple_nans where not val > 20
+---- RESULTS
+NaN
+NaN
+NaN
+20
+====
+---- QUERY
+# Multiple NaNs followed by a number, predicate is false for NaN and false for the number
+select * from test_multiple_nans where val > 20
+---- RESULTS
+====