Posted to commits@impala.apache.org by ta...@apache.org on 2019/07/16 15:26:43 UTC

[impala] 05/06: IMPALA-5031: Out-of-range enums are undefined behavior

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit a852b9fb749860221d8919ac94efc39115d8a65b
Author: Jim Apple <jb...@apache.org>
AuthorDate: Thu Jul 4 11:32:22 2019 -0700

    IMPALA-5031: Out-of-range enums are undefined behavior
    
    This eliminates an out-of-range enum value in the end-to-end
    tests. The interesting part of the backtrace is:
    
    exec/parquet/parquet-column-readers.cc:1530:112: runtime error: load
      of value 38, which is not a valid value for type 'Encoding::type'
        #0 BaseScalarColumnReader::ReadDataPage()
           exec/parquet/parquet-column-readers.cc:1530:112
        #1 BaseScalarColumnReader::NextPage()
           exec/parquet/parquet-column-readers.cc:1769:28
        #2 bool ScalarColumnReader<long, (parquet::Type::type)2, true>
           ::ReadValueBatch<false>(int, int, unsigned char*, int*)
           exec/parquet/parquet-column-readers.cc:459:12
        #3 ScalarColumnReader<long, (parquet::Type::type)2, true>
           ::ReadNonRepeatedValueBatch(MemPool*, int, int, unsigned char*,
           int*) exec/parquet/parquet-column-readers.cc:106:12
        #4 HdfsParquetScanner::AssembleRows(vector<ParquetColumnReader*>
           const&, RowBatch*, bool*)
           exec/parquet/hdfs-parquet-scanner.cc:1113:42
        #5 HdfsParquetScanner::GetNextInternal(RowBatch*)
           exec/parquet/hdfs-parquet-scanner.cc:456:19
        #6 HdfsParquetScanner::ProcessSplit()
           exec/parquet/hdfs-parquet-scanner.cc:353:21
        #7 HdfsScanNode::ProcessSplit(vector<FilterContext> const&,
           MemPool*, io::ScanRange*, long*) exec/hdfs-scan-node.cc:514:21
        #8 HdfsScanNode::ScannerThread(bool, long)
           exec/hdfs-scan-node.cc:415:7
        #9 HdfsScanNode::ThreadTokenAvailableCb(ThreadResourcePool*)::$_0
           ::operator()() const exec/hdfs-scan-node.cc:337:13
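    
    As context for readers of this log: the report above is a UBSan diagnostic, and a
    minimal standalone reproduction of this class of error (illustrative only; the
    file name, enum, and values below are made up and are not Impala or Parquet code)
    looks like this:
    
      // ub_enum.cc: compile with, e.g., clang++ -std=c++14 -fsanitize=undefined ub_enum.cc
      #include <cstring>
      
      // An unscoped enum whose largest enumerator is 8 can only represent values in
      // [0, 15]; loading any other value through it is undefined behavior.
      enum Small { A = 0, B = 8 };
      
      int main() {
        int raw = 38;  // e.g. an unexpected value read from a file
        Small s;
        std::memcpy(&s, &raw, sizeof(s));  // writes the bytes but never loads them as Small
        return s == B;  // this load is what UBSan reports: "load of value 38, which
                        // is not a valid value for type 'Small'"
      }
    
    The change below therefore reads the raw integer behind the page header's encoding
    fields and range-checks it before the value is ever used as an Encoding::type.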
    
    Change-Id: Ia86de44daaf56a941fb95b15d5dfd7b5a2752129
    Reviewed-on: http://gerrit.cloudera.org:8080/13804
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/parquet/parquet-column-readers.cc |  4 ++--
 be/src/exec/parquet/parquet-level-decoder.cc  | 12 +++++++++---
 be/src/exec/parquet/parquet-level-decoder.h   |  2 +-
 common/thrift/parquet.thrift                  |  9 +++++++++
 4 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/be/src/exec/parquet/parquet-column-readers.cc b/be/src/exec/parquet/parquet-column-readers.cc
index 0ed8cd8..f115002 100644
--- a/be/src/exec/parquet/parquet-column-readers.cc
+++ b/be/src/exec/parquet/parquet-column-readers.cc
@@ -1522,13 +1522,13 @@ Status BaseScalarColumnReader::ReadDataPage() {
 
     // Initialize the repetition level data
     RETURN_IF_ERROR(rep_levels_.Init(filename(),
-        current_page_header_.data_page_header.repetition_level_encoding,
+        &current_page_header_.data_page_header.repetition_level_encoding,
         parent_->perm_pool_.get(), parent_->state_->batch_size(), max_rep_level(), &data_,
         &data_size));
 
     // Initialize the definition level data
     RETURN_IF_ERROR(def_levels_.Init(filename(),
-        current_page_header_.data_page_header.definition_level_encoding,
+        &current_page_header_.data_page_header.definition_level_encoding,
         parent_->perm_pool_.get(), parent_->state_->batch_size(), max_def_level(), &data_,
         &data_size));
 
diff --git a/be/src/exec/parquet/parquet-level-decoder.cc b/be/src/exec/parquet/parquet-level-decoder.cc
index 166230c..e4c2abc 100644
--- a/be/src/exec/parquet/parquet-level-decoder.cc
+++ b/be/src/exec/parquet/parquet-level-decoder.cc
@@ -21,6 +21,7 @@
 #include "runtime/mem-pool.h"
 #include "runtime/mem-tracker.h"
 #include "util/bit-util.h"
+#include "util/ubsan.h"
 
 #include "common/names.h"
 
@@ -32,7 +33,7 @@ const int16_t ParquetLevel::ROW_GROUP_END;
 const int16_t ParquetLevel::INVALID_LEVEL;
 const int16_t ParquetLevel::INVALID_POS;
 
-Status ParquetLevelDecoder::Init(const string& filename, Encoding::type encoding,
+Status ParquetLevelDecoder::Init(const string& filename, const Encoding::type* encoding,
     MemPool* cache_pool, int cache_size, int max_level, uint8_t** data, int* data_size) {
   DCHECK(*data != nullptr);
   DCHECK_GE(*data_size, 0);
@@ -46,7 +47,12 @@ Status ParquetLevelDecoder::Init(const string& filename, Encoding::type encoding
   if (max_level == 0) return Status::OK();
 
   int32_t num_bytes = 0;
-  switch (encoding) {
+  if (Ubsan::EnumToInt(encoding) > Encoding::MAX_ENUM_VALUE) {
+    stringstream ss;
+    ss << "Unsupported encoding: " << Ubsan::EnumToInt(encoding);
+    return Status(ss.str());
+  }
+  switch (*encoding) {
     case Encoding::RLE: {
       Status status;
       if (!ReadWriteUtil::Read(data, data_size, &num_bytes, &status)) {
@@ -63,7 +69,7 @@ Status ParquetLevelDecoder::Init(const string& filename, Encoding::type encoding
       return Status(TErrorCode::PARQUET_BIT_PACKED_LEVELS, filename);
     default: {
       stringstream ss;
-      ss << "Unsupported encoding: " << encoding;
+      ss << "Unsupported encoding: " << *encoding;
       return Status(ss.str());
     }
   }
diff --git a/be/src/exec/parquet/parquet-level-decoder.h b/be/src/exec/parquet/parquet-level-decoder.h
index 8626b4d..58bda02 100644
--- a/be/src/exec/parquet/parquet-level-decoder.h
+++ b/be/src/exec/parquet/parquet-level-decoder.h
@@ -52,7 +52,7 @@ class ParquetLevelDecoder {
   /// Initialize the LevelDecoder. Reads and advances the provided data buffer if the
   /// encoding requires reading metadata from the page header. 'cache_size' will be
   /// rounded up to a multiple of 32 internally.
-  Status Init(const string& filename, parquet::Encoding::type encoding,
+  Status Init(const string& filename, const parquet::Encoding::type* encoding,
       MemPool* cache_pool, int cache_size, int max_level, uint8_t** data, int* data_size);
 
   /// Returns the next level or INVALID_LEVEL if there was an error. Not as efficient
diff --git a/common/thrift/parquet.thrift b/common/thrift/parquet.thrift
index 6c9011b..1197c2e 100644
--- a/common/thrift/parquet.thrift
+++ b/common/thrift/parquet.thrift
@@ -450,6 +450,15 @@ enum Encoding {
   /** Dictionary encoding: the ids are encoded using the RLE encoding
    */
   RLE_DICTIONARY = 8;
+
+  /**
+   * Useful for checking an integer's value before casting it to an enum of this type.
+   * That check has value in avoiding undefined behavior in the [expr] section of the
+   * C++14 standard: "If during the evaluation of an expression, the result is not
+   * mathematically defined or not in the range of representable values for its type,
+   * the behavior is undefined."
+   */
+  MAX_ENUM_VALUE = 8;
 }
 
 /**
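
The changes converge on a single pattern: read the integer behind the enum field,
range-check it against the new MAX_ENUM_VALUE sentinel, and only then use it as an
Encoding::type. A rough standalone sketch of that pattern follows (a hedged
illustration, not the committed code: the EnumToInt helper below only approximates
what Ubsan::EnumToInt from util/ubsan.h is assumed to do, DescribeLevelEncoding is
invented for the example, and only a few Encoding enumerators are shown).

  #include <cstdint>
  #include <cstring>
  #include <string>

  // Stand-in for the Thrift-generated parquet::Encoding::type. MAX_ENUM_VALUE
  // duplicates the largest real enumerator so callers can range-check raw integers.
  namespace Encoding {
  enum type { PLAIN = 0, RLE = 3, BIT_PACKED = 4, RLE_DICTIONARY = 8, MAX_ENUM_VALUE = 8 };
  }

  // Approximation of Ubsan::EnumToInt: copy the bytes behind the enum into a plain
  // int32_t so an out-of-range value is never loaded through the enum type itself.
  static int32_t EnumToInt(const Encoding::type* e) {
    static_assert(sizeof(*e) == sizeof(int32_t), "assumes a 32-bit underlying type");
    int32_t raw;
    std::memcpy(&raw, e, sizeof(raw));
    return raw;
  }

  // Invented caller mirroring the shape of ParquetLevelDecoder::Init: reject
  // out-of-range values first; only then is it safe to switch on *encoding.
  std::string DescribeLevelEncoding(const Encoding::type* encoding) {
    int32_t raw = EnumToInt(encoding);
    if (raw < 0 || raw > Encoding::MAX_ENUM_VALUE) {
      return "Unsupported encoding: " + std::to_string(raw);
    }
    switch (*encoding) {  // in range now, so this load is well defined
      case Encoding::RLE: return "RLE";
      case Encoding::BIT_PACKED: return "BIT_PACKED (rejected for level data)";
      default: return "Unsupported encoding: " + std::to_string(raw);
    }
  }

This is also why ReadDataPage() now passes the address of the page header's encoding
fields instead of their values: passing by value would itself load the possibly
invalid enum before Init() had a chance to check it.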