You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2020/03/31 19:04:41 UTC

[impala] branch master updated: IMPALA-9572: Fix DCHECK in nested Parquet scanning

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new e8f604a  IMPALA-9572: Fix DCHECK in nested Parquet scanning
e8f604a is described below

commit e8f604a2139be2ee3f011e6f2ce71fa0dde26492
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Mon Mar 30 20:15:09 2020 +0200

    IMPALA-9572: Fix DCHECK in nested Parquet scanning
    
    The issue occurred when there were skipped pages and a column
    inside a collection was scanned, but its position was not needed.
    The repetition level still needs to be read in this case, as the
    skipped ranges are set in top level rows, so collection items
    need to know which top level row do they belong to.
    
    A DCHECK in StrideWriter's constructor was hit, otherwise the
    code ran correctly in release mode. The DCHECK is moved to
    functions where the condition would actually cause problems.
    
    Testing:
    - added and ran a regression test
    
    Change-Id: I5e8ef514ead71f732c73f910af7fd1aecd37bb81
    Reviewed-on: http://gerrit.cloudera.org:8080/15598
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/parquet/parquet-column-readers.cc                    | 2 +-
 be/src/util/mem-util.h                                           | 9 ++++++++-
 .../queries/QueryTest/nested-types-parquet-page-index.test       | 9 +++++++++
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/be/src/exec/parquet/parquet-column-readers.cc b/be/src/exec/parquet/parquet-column-readers.cc
index 512632b..58cc12f 100644
--- a/be/src/exec/parquet/parquet-column-readers.cc
+++ b/be/src/exec/parquet/parquet-column-readers.cc
@@ -1323,7 +1323,7 @@ int BaseScalarColumnReader::FillPositionsInCandidateRange(int rows_remaining,
     int rep_level = rep_levels_.CacheGetNext();
     if (rep_level == 0) ++row_count;
     ++val_count;
-    if (pos_slot_desc_ != nullptr) {
+    if (pos_writer.IsValid()) {
       if (rep_level <= max_rep_level() - 1) pos_current_value_ = 0;
       *pos_writer.Advance() = pos_current_value_++;
     }
diff --git a/be/src/util/mem-util.h b/be/src/util/mem-util.h
index a7125a9..16e8085 100644
--- a/be/src/util/mem-util.h
+++ b/be/src/util/mem-util.h
@@ -31,17 +31,21 @@ namespace impala {
 template <typename T>
 struct StrideWriter {
   // The next element to write to. May not be aligned to the natural alignment of T.
+  // Can be null for convenience, but the StrideWriter cannot be used in that case.
   T* current;
 
   // The stride in bytes between subsequent values.
   int64_t stride;
 
   explicit StrideWriter(T* current, int64_t stride) : current(current), stride(stride) {
-    DCHECK(current != nullptr);
   }
 
+  /// No other functions can be called if false.
+  ALWAYS_INLINE bool IsValid() const { return current != nullptr; }
+
   /// Set the next element to 'val' and advance 'current' to the next element.
   ALWAYS_INLINE void SetNext(T& val) {
+    DCHECK(IsValid());
     // memcpy() is necessary because 'current' may not be aligned.
     memcpy(current, &val, sizeof(T));
     SkipNext(1);
@@ -49,6 +53,7 @@ struct StrideWriter {
 
   /// Return a pointer to the current element and advance 'current' to the next element.
   ALWAYS_INLINE T* Advance() {
+    DCHECK(IsValid());
     T* curr = current;
     SkipNext(1);
     return curr;
@@ -56,11 +61,13 @@ struct StrideWriter {
 
   /// Set the next 'count' elements to 'val', advancing 'current' by 'count' values.
   void SetNext(T& val, int64_t count) {
+    DCHECK(IsValid());
     for (int64_t i = 0; i < count; ++i) SetNext(val);
   }
 
   /// Advance 'current' by 'count' values.
   ALWAYS_INLINE void SkipNext(int64_t count) {
+    DCHECK(IsValid());
     DCHECK_GE(count, 0);
     current = reinterpret_cast<T*>(reinterpret_cast<uint8_t*>(current) + stride * count);
   }
diff --git a/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-page-index.test b/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-page-index.test
index 8c85b57..8a372f7 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-page-index.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-page-index.test
@@ -702,3 +702,12 @@ BIGINT, DECIMAL
 ---- RUNTIME_PROFILE
 aggregation(SUM, NumStatsFilteredPages): 6
 ====
+---- QUERY
+# Regression test for IMPALA-9572.
+select count(l_partkey) from tpch_nested_parquet.customer.c_orders.o_lineitems
+where l_partkey < 10
+---- RESULTS
+263
+---- TYPES
+BIGINT
+====