You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2018/02/09 16:59:21 UTC
[02/21] impala git commit: IMPALA-6113: Skip row groups with
predicates on NULL columns
IMPALA-6113: Skip row groups with predicates on NULL columns
Based on the existing column-chunk-level Parquet statistics field null_count,
Impala's Parquet scanner is enhanced to skip an entire row group when the
null_count statistic indicates that all values in the predicated column
are NULL, since no result rows could be produced from that row group
anyway.
Change-Id: I141317af0e0df30da8f220b29b0bfba364f40ddf
Reviewed-on: http://gerrit.cloudera.org:8080/9140
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/b59a7846
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/b59a7846
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/b59a7846
Branch: refs/heads/2.x
Commit: b59a784657c9d6903d5125b686f5c95483d574e4
Parents: 4c6a83a
Author: Gabor Kaszab <ga...@cloudera.com>
Authored: Wed Jan 24 17:01:34 2018 +0100
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Thu Feb 8 07:01:52 2018 +0000
----------------------------------------------------------------------
be/src/exec/hdfs-parquet-scanner.cc | 7 +++++
be/src/exec/parquet-column-stats.cc | 13 ++++++++
be/src/exec/parquet-column-stats.h | 6 ++++
.../queries/QueryTest/parquet-stats.test | 33 ++++++++++++++++++++
4 files changed, 59 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/impala/blob/b59a7846/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index c14edd7..7a10f3c 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -584,6 +584,13 @@ Status HdfsParquetScanner::EvaluateStatsConjuncts(
DCHECK(false) << "Unsupported function name for statistics evaluation: " << fn_name;
}
+ int64_t null_count = 0;
+ bool null_count_result = ColumnStatsBase::ReadNullCountStat(col_chunk, &null_count);
+ if (null_count_result && null_count == col_chunk.meta_data.num_values) {
+ *skip_row_group = true;
+ break;
+ }
+
if (stats_read) {
TupleRow row;
row.SetTuple(0, min_max_tuple_);
http://git-wip-us.apache.org/repos/asf/impala/blob/b59a7846/be/src/exec/parquet-column-stats.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-stats.cc b/be/src/exec/parquet-column-stats.cc
index 4443309..a1d1155 100644
--- a/be/src/exec/parquet-column-stats.cc
+++ b/be/src/exec/parquet-column-stats.cc
@@ -129,6 +129,19 @@ bool ColumnStatsBase::ReadFromThrift(const parquet::ColumnChunk& col_chunk,
return false;
}
+bool ColumnStatsBase::ReadNullCountStat(const parquet::ColumnChunk& col_chunk,
+ int64_t* null_count) {
+ if (!(col_chunk.__isset.meta_data && col_chunk.meta_data.__isset.statistics)) {
+ return false;
+ }
+ const parquet::Statistics& stats = col_chunk.meta_data.statistics;
+ if (stats.__isset.null_count) {
+ *null_count = stats.null_count;
+ return true;
+ }
+ return false;
+}
+
Status ColumnStatsBase::CopyToBuffer(StringBuffer* buffer, StringValue* value) {
if (value->ptr == buffer->buffer()) return Status::OK();
buffer->Clear();
http://git-wip-us.apache.org/repos/asf/impala/blob/b59a7846/be/src/exec/parquet-column-stats.h
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-stats.h b/be/src/exec/parquet-column-stats.h
index 0ff277c..e9cf801 100644
--- a/be/src/exec/parquet-column-stats.h
+++ b/be/src/exec/parquet-column-stats.h
@@ -73,6 +73,12 @@ class ColumnStatsBase {
const ColumnType& col_type, const parquet::ColumnOrder* col_order,
StatsField stats_field, void* slot);
+ /// Reads the null_count statistic from the given column chunk's metadata and
+ /// returns it via an output parameter.
+ /// Returns true if the null_count stat was read successfully, false otherwise.
+ static bool ReadNullCountStat(const parquet::ColumnChunk& col_chunk,
+ int64_t* null_count);
+
/// Merges this statistics object with values from 'other'. If other has not been
/// initialized, then this object will not be changed.
virtual void Merge(const ColumnStatsBase& other) = 0;
http://git-wip-us.apache.org/repos/asf/impala/blob/b59a7846/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
index d03b4c9..70b5f27 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
@@ -458,3 +458,36 @@ select count(*) from functional_parquet.alltypes where id < 0;
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
+---- QUERY
+# Check that all the row groups are skipped using null_count stat
+create table table_for_null_count_test (i int, j int) stored as parquet;
+insert into table_for_null_count_test values (1, NULL), (2, NULL), (3, NULL);
+select count(*) from table_for_null_count_test where j < 3;
+---- RESULTS
+0
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+aggregation(SUM, NumStatsFilteredRowGroups): 1
+====
+---- QUERY
+# Insert another row group where not all the 'j' values are NULL
+insert into table_for_null_count_test values (4, 1), (5, NULL);
+select i from table_for_null_count_test where j < 3;
+---- RESULTS
+4
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 1
+====
+---- QUERY
+# Turning off parquet stats and verifying that no row groups are skipped
+set PARQUET_READ_STATISTICS=0;
+create table table_for_null_count_test2 (i int, j int) stored as parquet;
+insert into table_for_null_count_test2 values (1, NULL), (2, NULL), (3, NULL);
+select count(*) from table_for_null_count_test2 where j < 3;
+---- RESULTS
+0
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+aggregation(SUM, NumStatsFilteredRowGroups): 0
+====