You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/07/16 15:26:43 UTC
[impala] 05/06: IMPALA-5031: Out-of-range enums are undefined
behavior
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit a852b9fb749860221d8919ac94efc39115d8a65b
Author: Jim Apple <jb...@apache.org>
AuthorDate: Thu Jul 4 11:32:22 2019 -0700
IMPALA-5031: Out-of-range enums are undefined behavior
This eliminates an out-of-range enum value in the end-to-end
tests. The interesting part of the backtrace is:
exec/parquet/parquet-column-readers.cc:1530:112: runtime error: load
of value 38, which is not a valid value for type 'Encoding::type'
#0 BaseScalarColumnReader::ReadDataPage()
exec/parquet/parquet-column-readers.cc:1530:112
#1 BaseScalarColumnReader::NextPage()
exec/parquet/parquet-column-readers.cc:1769:28
#2 bool ScalarColumnReader<long, (parquet::Type::type)2, true>
::ReadValueBatch<false>(int, int, unsigned char*, int*)
exec/parquet/parquet-column-readers.cc:459:12
#3 ScalarColumnReader<long, (parquet::Type::type)2, true>
::ReadNonRepeatedValueBatch(MemPool*, int, int, unsigned char*,
int*) exec/parquet/parquet-column-readers.cc:106:12
#4 HdfsParquetScanner::AssembleRows(vector<ParquetColumnReader*>
const&, RowBatch*, bool*)
exec/parquet/hdfs-parquet-scanner.cc:1113:42
#5 HdfsParquetScanner::GetNextInternal(RowBatch*)
exec/parquet/hdfs-parquet-scanner.cc:456:19
#6 HdfsParquetScanner::ProcessSplit()
exec/parquet/hdfs-parquet-scanner.cc:353:21
#7 HdfsScanNode::ProcessSplit(vector<FilterContext> const&,
MemPool*, io::ScanRange*, long*) exec/hdfs-scan-node.cc:514:21
#8 HdfsScanNode::ScannerThread(bool, long)
exec/hdfs-scan-node.cc:415:7
#9 HdfsScanNode::ThreadTokenAvailableCb(ThreadResourcePool*)::$_0
::operator()() const exec/hdfs-scan-node.cc:337:13
Change-Id: Ia86de44daaf56a941fb95b15d5dfd7b5a2752129
Reviewed-on: http://gerrit.cloudera.org:8080/13804
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/exec/parquet/parquet-column-readers.cc | 4 ++--
be/src/exec/parquet/parquet-level-decoder.cc | 12 +++++++++---
be/src/exec/parquet/parquet-level-decoder.h | 2 +-
common/thrift/parquet.thrift | 9 +++++++++
4 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/be/src/exec/parquet/parquet-column-readers.cc b/be/src/exec/parquet/parquet-column-readers.cc
index 0ed8cd8..f115002 100644
--- a/be/src/exec/parquet/parquet-column-readers.cc
+++ b/be/src/exec/parquet/parquet-column-readers.cc
@@ -1522,13 +1522,13 @@ Status BaseScalarColumnReader::ReadDataPage() {
// Initialize the repetition level data
RETURN_IF_ERROR(rep_levels_.Init(filename(),
- current_page_header_.data_page_header.repetition_level_encoding,
+ ¤t_page_header_.data_page_header.repetition_level_encoding,
parent_->perm_pool_.get(), parent_->state_->batch_size(), max_rep_level(), &data_,
&data_size));
// Initialize the definition level data
RETURN_IF_ERROR(def_levels_.Init(filename(),
- current_page_header_.data_page_header.definition_level_encoding,
+ ¤t_page_header_.data_page_header.definition_level_encoding,
parent_->perm_pool_.get(), parent_->state_->batch_size(), max_def_level(), &data_,
&data_size));
diff --git a/be/src/exec/parquet/parquet-level-decoder.cc b/be/src/exec/parquet/parquet-level-decoder.cc
index 166230c..e4c2abc 100644
--- a/be/src/exec/parquet/parquet-level-decoder.cc
+++ b/be/src/exec/parquet/parquet-level-decoder.cc
@@ -21,6 +21,7 @@
#include "runtime/mem-pool.h"
#include "runtime/mem-tracker.h"
#include "util/bit-util.h"
+#include "util/ubsan.h"
#include "common/names.h"
@@ -32,7 +33,7 @@ const int16_t ParquetLevel::ROW_GROUP_END;
const int16_t ParquetLevel::INVALID_LEVEL;
const int16_t ParquetLevel::INVALID_POS;
-Status ParquetLevelDecoder::Init(const string& filename, Encoding::type encoding,
+Status ParquetLevelDecoder::Init(const string& filename, const Encoding::type* encoding,
MemPool* cache_pool, int cache_size, int max_level, uint8_t** data, int* data_size) {
DCHECK(*data != nullptr);
DCHECK_GE(*data_size, 0);
@@ -46,7 +47,12 @@ Status ParquetLevelDecoder::Init(const string& filename, Encoding::type encoding
if (max_level == 0) return Status::OK();
int32_t num_bytes = 0;
- switch (encoding) {
+ if (Ubsan::EnumToInt(encoding) > Encoding::MAX_ENUM_VALUE) {
+ stringstream ss;
+ ss << "Unsupported encoding: " << Ubsan::EnumToInt(encoding);
+ return Status(ss.str());
+ }
+ switch (*encoding) {
case Encoding::RLE: {
Status status;
if (!ReadWriteUtil::Read(data, data_size, &num_bytes, &status)) {
@@ -63,7 +69,7 @@ Status ParquetLevelDecoder::Init(const string& filename, Encoding::type encoding
return Status(TErrorCode::PARQUET_BIT_PACKED_LEVELS, filename);
default: {
stringstream ss;
- ss << "Unsupported encoding: " << encoding;
+ ss << "Unsupported encoding: " << *encoding;
return Status(ss.str());
}
}
diff --git a/be/src/exec/parquet/parquet-level-decoder.h b/be/src/exec/parquet/parquet-level-decoder.h
index 8626b4d..58bda02 100644
--- a/be/src/exec/parquet/parquet-level-decoder.h
+++ b/be/src/exec/parquet/parquet-level-decoder.h
@@ -52,7 +52,7 @@ class ParquetLevelDecoder {
/// Initialize the LevelDecoder. Reads and advances the provided data buffer if the
/// encoding requires reading metadata from the page header. 'cache_size' will be
/// rounded up to a multiple of 32 internally.
- Status Init(const string& filename, parquet::Encoding::type encoding,
+ Status Init(const string& filename, const parquet::Encoding::type* encoding,
MemPool* cache_pool, int cache_size, int max_level, uint8_t** data, int* data_size);
/// Returns the next level or INVALID_LEVEL if there was an error. Not as efficient
diff --git a/common/thrift/parquet.thrift b/common/thrift/parquet.thrift
index 6c9011b..1197c2e 100644
--- a/common/thrift/parquet.thrift
+++ b/common/thrift/parquet.thrift
@@ -450,6 +450,15 @@ enum Encoding {
/** Dictionary encoding: the ids are encoded using the RLE encoding
*/
RLE_DICTIONARY = 8;
+
+ /**
+ * Useful for checking an integer's value before casting it to an enum of this type.
+ * That check has value in avoiding undefined behavior in the [expr] section of the
+ * C++14 standard: "If during the evaluation of an expression, the result is not
+ * mathematically defined or not in the range of representable values for its type,
+ * the behavior is undefined."
+ */
+ MAX_ENUM_VALUE = 8;
}
/**