You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/01/30 13:20:27 UTC

[impala] 01/05: IMPALA-5031: out-of-range enum values are undefined behavior

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit e111627888dd1072aceabb81c4a88fec63a1e345
Author: Jim Apple <jb...@apache.org>
AuthorDate: Fri Jan 25 20:15:29 2019 -0800

    IMPALA-5031: out-of-range enum values are undefined behavior
    
    This patch adds a utility, Ubsan::EnumToInt(), that can take a pointer
    to an enumeration value and return its integral value without invoking
    undefined behavior. This is not as simple as "int i = *pe" because (a)
    the underlying integral type might not be int, and (b) if pe points to
    a value outside of the range of the enum as defined in the standard
    (section dcl.enum), then dereferencing it is undefined behavior.
    
    This fixes errors found in the end-to-end tests:
    
        exec/parquet/parquet-metadata-utils.cc:215:26: runtime error: load
          of value 5000, which is not a valid value for type
         'CompressionCodec::type'
        exec/parquet/parquet-metadata-utils.cc:216:26: runtime error: load
          of value 5000, which is not a valid value for type
          'CompressionCodec::type'
        exec/parquet/parquet-metadata-utils.cc:217:26: runtime error: load
          of value 5000, which is not a valid value for type
          'CompressionCodec::type'
        exec/parquet/parquet-metadata-utils.cc:219:47: runtime error: load
          of value 5000, which is not a valid value for type
          'CompressionCodec::type'
        exec/parquet/parquet-metadata-utils.cc:501:22: runtime error: load
          of value 4294967278, which is not a valid value for type
          'FieldRepetitionType::type'
        exec/parquet/parquet-metadata-utils.cc:503:29: runtime error: load
          of value 4294967278, which is not a valid value for type
          'FieldRepetitionType::type'
    
    Change-Id: Iecdf301065da8f091a274e7a0585a11fcc79da7d
    Reviewed-on: http://gerrit.cloudera.org:8080/12280
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/parquet/parquet-metadata-utils.cc | 15 +++++----
 be/src/util/ubsan.h                           | 45 +++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/be/src/exec/parquet/parquet-metadata-utils.cc b/be/src/exec/parquet/parquet-metadata-utils.cc
index 3e369af..301e308 100644
--- a/be/src/exec/parquet/parquet-metadata-utils.cc
+++ b/be/src/exec/parquet/parquet-metadata-utils.cc
@@ -31,6 +31,7 @@
 #include "exec/parquet/parquet-common.h"
 #include "runtime/runtime-state.h"
 #include "util/debug-util.h"
+#include "util/ubsan.h"
 
 #include "common/names.h"
 
@@ -212,11 +213,12 @@ Status ParquetMetadataUtils::ValidateRowGroupColumn(
   }
 
   // Check the compression is supported.
-  if (col_chunk_metadata.codec != parquet::CompressionCodec::UNCOMPRESSED &&
-      col_chunk_metadata.codec != parquet::CompressionCodec::SNAPPY &&
-      col_chunk_metadata.codec != parquet::CompressionCodec::GZIP) {
+  const auto codec = Ubsan::EnumToInt(&col_chunk_metadata.codec);
+  if (codec != parquet::CompressionCodec::UNCOMPRESSED &&
+      codec != parquet::CompressionCodec::SNAPPY &&
+      codec != parquet::CompressionCodec::GZIP) {
     return Status(Substitute("File '$0' uses an unsupported compression: $1 for column "
-        "'$2'.", filename, col_chunk_metadata.codec, schema_element.name));
+        "'$2'.", filename, codec, schema_element.name));
   }
 
   if (col_chunk_metadata.type != schema_element.type) {
@@ -498,9 +500,10 @@ Status ParquetSchemaResolver::CreateSchemaTree(
   // updating ira_def_level
   node->def_level_of_immediate_repeated_ancestor = ira_def_level;
 
-  if (node->element->repetition_type == parquet::FieldRepetitionType::OPTIONAL) {
+  const auto repetition_type = Ubsan::EnumToInt(&node->element->repetition_type);
+  if (repetition_type == parquet::FieldRepetitionType::OPTIONAL) {
     ++max_def_level;
-  } else if (node->element->repetition_type == parquet::FieldRepetitionType::REPEATED &&
+  } else if (repetition_type == parquet::FieldRepetitionType::REPEATED &&
              !is_root_schema /*PARQUET-843*/) {
     ++max_rep_level;
     // Repeated fields add a definition level. This is used to distinguish between an
diff --git a/be/src/util/ubsan.h b/be/src/util/ubsan.h
index d1f45eb..08628fb 100644
--- a/be/src/util/ubsan.h
+++ b/be/src/util/ubsan.h
@@ -22,6 +22,7 @@
 // undefined behavior.
 
 #include <cstring>
+#include <type_traits>
 
 #include "common/logging.h"
 
@@ -48,6 +49,50 @@ class Ubsan {
     }
     return std::memcmp(s1, s2, n);
   }
+  // Convert a potential enum value (that may be out of range) into its underlying integer
+  // implementation. This is required because, according to the standard, enum values that
+  // are out of range induce undefined behavior. For instance,
+  //
+  //     enum A {B = 0, C = 5};
+  //     int d = 6;
+  //     A e;
+  //     memcpy(&e, &d, sizeof(e));
+  //     if (e == B)
+  //     ; // undefined behavior: load of value 6, which is not a valid value for type 'A'
+  //     if (EnumToInt(&e) == B)
+  //     ; // OK: the type of EnumToInt(&e) is int, and B is converted to int for the
+  //       // comparison
+  //
+  // EnumToInt() is a worse alternative to not treating a pointer to arbitrary memory as a
+  // pointer to an enum. To put it another way, the first block below is a better way to
+  // handle possibly-out-of-range enum values:
+  //
+  // extern char * v;
+  // enum A { B = 0, C = 45 };
+  // A x;
+  // if (good_way) {
+  //   std::underlying_type_t<T> i;
+  //   std::memcpy(&i, v, sizeof(i));
+  //   if (B <= i && i <= C) x = i;
+  // } else {
+  //   A * y = reinterpret_cast<A *>(v);
+  //   int i = EnumToInt(y);
+  //   if (B <= i && i <= C) x = *y;
+  // }
+  //
+  // The second block is worse because y is masquerading as a legitimate pointer to an A
+  // and could get dereferenced illegally as the code evolves. Unfortunately,
+  // deserialization methods don't always make the better way an option - sometimes the
+  // possibly invalid pointer to A (like y) is created externally.
+  template<typename T>
+  static auto EnumToInt(const T * e) {
+    std::underlying_type_t<T> i;
+    static_assert(sizeof(i) == sizeof(*e), "enum underlying type is the wrong size");
+    // We have to memcpy, rather than directly assigning i = *e, because dereferencing e
+    // creates undefined behavior.
+    memcpy(&i, e, sizeof(i));
+    return i;
+  }
 };
 
 #endif // UTIL_UBSAN_H_