You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/01/30 13:20:27 UTC
[impala] 01/05: IMPALA-5031: out-of-range enum values are undefined
behavior
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit e111627888dd1072aceabb81c4a88fec63a1e345
Author: Jim Apple <jb...@apache.org>
AuthorDate: Fri Jan 25 20:15:29 2019 -0800
IMPALA-5031: out-of-range enum values are undefined behavior
This patch adds a utility, Ubsan::EnumToInt(), that can take a pointer
to an enumeration value and return its integral value without invoking
undefined behavior. This is not as simple as "int i = *pe" because (a)
the underlying integral type might not be int, and (b) if pe points to
a value outside of the range of the enum as defined in the standard
(section dcl.enum), then dereferencing it is undefined behavior.
This fixes errors found in the end-to-end tests:
exec/parquet/parquet-metadata-utils.cc:215:26: runtime error: load
of value 5000, which is not a valid value for type
'CompressionCodec::type'
exec/parquet/parquet-metadata-utils.cc:216:26: runtime error: load
of value 5000, which is not a valid value for type
'CompressionCodec::type'
exec/parquet/parquet-metadata-utils.cc:217:26: runtime error: load
of value 5000, which is not a valid value for type
'CompressionCodec::type'
exec/parquet/parquet-metadata-utils.cc:219:47: runtime error: load
of value 5000, which is not a valid value for type
'CompressionCodec::type'
exec/parquet/parquet-metadata-utils.cc:501:22: runtime error: load
of value 4294967278, which is not a valid value for type
'FieldRepetitionType::type'
exec/parquet/parquet-metadata-utils.cc:503:29: runtime error: load
of value 4294967278, which is not a valid value for type
'FieldRepetitionType::type'
Change-Id: Iecdf301065da8f091a274e7a0585a11fcc79da7d
Reviewed-on: http://gerrit.cloudera.org:8080/12280
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/exec/parquet/parquet-metadata-utils.cc | 15 +++++----
be/src/util/ubsan.h | 45 +++++++++++++++++++++++++++
2 files changed, 54 insertions(+), 6 deletions(-)
diff --git a/be/src/exec/parquet/parquet-metadata-utils.cc b/be/src/exec/parquet/parquet-metadata-utils.cc
index 3e369af..301e308 100644
--- a/be/src/exec/parquet/parquet-metadata-utils.cc
+++ b/be/src/exec/parquet/parquet-metadata-utils.cc
@@ -31,6 +31,7 @@
#include "exec/parquet/parquet-common.h"
#include "runtime/runtime-state.h"
#include "util/debug-util.h"
+#include "util/ubsan.h"
#include "common/names.h"
@@ -212,11 +213,12 @@ Status ParquetMetadataUtils::ValidateRowGroupColumn(
}
// Check the compression is supported.
- if (col_chunk_metadata.codec != parquet::CompressionCodec::UNCOMPRESSED &&
- col_chunk_metadata.codec != parquet::CompressionCodec::SNAPPY &&
- col_chunk_metadata.codec != parquet::CompressionCodec::GZIP) {
+ const auto codec = Ubsan::EnumToInt(&col_chunk_metadata.codec);
+ if (codec != parquet::CompressionCodec::UNCOMPRESSED &&
+ codec != parquet::CompressionCodec::SNAPPY &&
+ codec != parquet::CompressionCodec::GZIP) {
return Status(Substitute("File '$0' uses an unsupported compression: $1 for column "
- "'$2'.", filename, col_chunk_metadata.codec, schema_element.name));
+ "'$2'.", filename, codec, schema_element.name));
}
if (col_chunk_metadata.type != schema_element.type) {
@@ -498,9 +500,10 @@ Status ParquetSchemaResolver::CreateSchemaTree(
// updating ira_def_level
node->def_level_of_immediate_repeated_ancestor = ira_def_level;
- if (node->element->repetition_type == parquet::FieldRepetitionType::OPTIONAL) {
+ const auto repetition_type = Ubsan::EnumToInt(&node->element->repetition_type);
+ if (repetition_type == parquet::FieldRepetitionType::OPTIONAL) {
++max_def_level;
- } else if (node->element->repetition_type == parquet::FieldRepetitionType::REPEATED &&
+ } else if (repetition_type == parquet::FieldRepetitionType::REPEATED &&
!is_root_schema /*PARQUET-843*/) {
++max_rep_level;
// Repeated fields add a definition level. This is used to distinguish between an
diff --git a/be/src/util/ubsan.h b/be/src/util/ubsan.h
index d1f45eb..08628fb 100644
--- a/be/src/util/ubsan.h
+++ b/be/src/util/ubsan.h
@@ -22,6 +22,7 @@
// undefined behavior.
#include <cstring>
+#include <type_traits>
#include "common/logging.h"
@@ -48,6 +49,50 @@ class Ubsan {
}
return std::memcmp(s1, s2, n);
}
+ // Convert a potential enum value (that may be out of range) into its underlying integer
+ // implementation. This is required because, according to the standard, enum values that
+ // are out of range induce undefined behavior. For instance,
+ //
+ // enum A {B = 0, C = 5};
+ // int d = 6;
+ // A e;
+ // memcpy(&e, &d, sizeof(e));
+ // if (e == B)
+ // ; // undefined behavior: load of value 6, which is not a valid value for type 'A'
+ // if (EnumToInt(&e) == B)
+ // ; // OK: the type of EnumToInt(&e) is int, and B is converted to int for the
+ // // comparison
+ //
+ // EnumToInt() is a worse alternative to not treating a pointer to arbitrary memory as a
+ // pointer to an enum. To put it another way, the first block below is a better way to
+ // handle possibly-out-of-range enum values:
+ //
+ // extern char * v;
+ // enum A { B = 0, C = 45 };
+ // A x;
+ // if (good_way) {
+ // std::underlying_type_t<T> i;
+ // std::memcpy(&i, v, sizeof(i));
+ // if (B <= i && i <= C) x = i;
+ // } else {
+ // A * y = reinterpret_cast<A *>(v);
+ // int i = EnumToInt(y);
+ // if (B <= i && i <= C) x = *y;
+ // }
+ //
+ // The second block is worse because y is masquerading as a legitimate pointer to an A
+ // and could get dereferenced illegally as the code evolves. Unfortunately,
+ // deserialization methods don't always make the better way an option - sometimes the
+ // possibly invalid pointer to A (like y) is created externally.
+ template<typename T>
+ static auto EnumToInt(const T * e) {
+ std::underlying_type_t<T> i;
+ static_assert(sizeof(i) == sizeof(*e), "enum underlying type is the wrong size");
+ // We have to memcpy, rather than directly assigning i = *e, because dereferencing e
+ // creates undefined behavior.
+ memcpy(&i, e, sizeof(i));
+ return i;
+ }
};
#endif // UTIL_UBSAN_H_