You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2019/04/23 15:13:34 UTC

[arrow] branch master updated: PARQUET-1565: [C++] Add default case to catch all unhandled physical types

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 5f8f08b  PARQUET-1565: [C++]  Add default case to catch all unhandled physical types
5f8f08b is described below

commit 5f8f08bd4a706c7f04f6a8136d213707d7fe5c11
Author: Hatem Helal <hh...@mathworks.com>
AuthorDate: Tue Apr 23 17:13:25 2019 +0200

    PARQUET-1565: [C++]  Add default case to catch all unhandled physical types
    
    This should prevent the SEGV observed when calling `FromParquetSchema` while reading the corrupt file [PARQUET-1481](https://github.com/apache/parquet-testing/blob/master/bad_data/PARQUET-1481.parquet)
    
    Author: Hatem Helal <hh...@mathworks.com>
    Author: Hatem Helal <ha...@gmail.com>
    
    Closes #4171 from hatemhelal/parquet-1565 and squashes the following commits:
    
    8f6286ad6 <Hatem Helal> reworked TryReadDataFile to accept an expected status code
    f2fb6160a <Hatem Helal> remove unnecessary try..catch
    81ffcb2c0 <Hatem Helal> clang-format changes
    22adb4ac5 <Hatem Helal> use google style function argument comments
    ac12dce81 <Hatem Helal> rename function parameter
    d928095b8 <Hatem Helal> Fix IWYU failure
    584995bb9 <Hatem Helal> Add test util for looking up good and bad PARQUET_TEST_DATA files
    a8c832bff <Hatem Helal> Fix regression test for PARQUET-1481
    0945fc03d <Hatem Helal> Add unittest for IOError when reading a corrupt Parquet file
    9cbe77fb9 <Hatem Helal> Add default case to catch all unhandled physical types
---
 cpp/src/parquet/arrow/arrow-reader-writer-test.cc | 31 ++++++++++-------------
 cpp/src/parquet/arrow/arrow-schema-test.cc        | 14 ++++++++++
 cpp/src/parquet/arrow/schema.cc                   |  5 ++++
 cpp/src/parquet/util/test-common.h                | 23 +++++++++++++++++
 4 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
index b54c3f4..3d0c647 100644
--- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -2334,34 +2334,31 @@ TEST(TestImpalaConversion, ArrowTimestampToImpalaTimestamp) {
   ASSERT_EQ(expected, calculated);
 }
 
-void TryReadDataFile(const std::string& testing_file_path, bool should_succeed = true) {
-  std::string dir_string(test::get_data_dir());
-  std::stringstream ss;
-  ss << dir_string << "/" << testing_file_path;
-  auto path = ss.str();
-
+void TryReadDataFile(const std::string& path,
+                     ::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) {
   auto pool = ::arrow::default_memory_pool();
 
   std::unique_ptr<FileReader> arrow_reader;
-  try {
-    arrow_reader.reset(new FileReader(pool, ParquetFileReader::OpenFile(path, false)));
-    std::shared_ptr<::arrow::Table> table;
-    ASSERT_OK(arrow_reader->ReadTable(&table));
-  } catch (const ParquetException& e) {
-    if (should_succeed) {
-      FAIL() << "Exception thrown when reading file: " << e.what();
-    }
-  }
+
+  arrow_reader.reset(new FileReader(pool, ParquetFileReader::OpenFile(path, false)));
+  std::shared_ptr<::arrow::Table> table;
+  auto status = arrow_reader->ReadTable(&table);
+
+  ASSERT_TRUE(status.code() == expected_code)
+      << "Expected reading file to return "
+      << arrow::Status(expected_code, "").CodeAsString() << ", but got "
+      << status.ToString();
 }
 
 TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) {
   // PARQUET-995
-  TryReadDataFile("alltypes_plain.parquet");
+  TryReadDataFile(test::get_data_file("alltypes_plain.parquet"));
 }
 
 TEST(TestArrowReaderAdHoc, CorruptedSchema) {
   // PARQUET-1481
-  TryReadDataFile("bad_data/PARQUET-1481.parquet", false /* should_succeed */);
+  auto path = test::get_data_file("PARQUET-1481.parquet", /*is_good=*/false);
+  TryReadDataFile(path, ::arrow::StatusCode::IOError);
 }
 
 class TestArrowReaderAdHocSparkAndHvr
diff --git a/cpp/src/parquet/arrow/arrow-schema-test.cc b/cpp/src/parquet/arrow/arrow-schema-test.cc
index ce189e1..8a371ff 100644
--- a/cpp/src/parquet/arrow/arrow-schema-test.cc
+++ b/cpp/src/parquet/arrow/arrow-schema-test.cc
@@ -21,7 +21,9 @@
 #include "gtest/gtest.h"
 
 #include "parquet/arrow/schema.h"
+#include "parquet/file_reader.h"
 #include "parquet/schema.h"
+#include "parquet/util/test-common.h"
 
 #include "arrow/api.h"
 #include "arrow/testing/gtest_util.h"
@@ -849,5 +851,17 @@ TEST(InvalidSchema, ParquetNegativeDecimalScale) {
                 ToParquetSchema(arrow_schema.get(), *properties.get(), &result_schema));
 }
 
+TEST(TestFromParquetSchema, CorruptMetadata) {
+  // PARQUET-1565: ensure that an IOError is returned when the parquet file contains
+  // corrupted metadata.
+  auto path = test::get_data_file("PARQUET-1481.parquet", /*is_good=*/false);
+
+  std::unique_ptr<parquet::ParquetFileReader> reader =
+      parquet::ParquetFileReader::OpenFile(path);
+  const auto parquet_schema = reader->metadata()->schema();
+  std::shared_ptr<::arrow::Schema> arrow_schema;
+  ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, &arrow_schema));
+}
+
 }  // namespace arrow
 }  // namespace parquet
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index f1ebad0..1b03398 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -196,6 +196,11 @@ Status FromPrimitive(const PrimitiveNode& primitive, std::shared_ptr<ArrowType>*
     case ParquetType::FIXED_LEN_BYTE_ARRAY:
       RETURN_NOT_OK(FromFLBA(primitive, out));
       break;
+    default: {
+      // PARQUET-1565: This can occur if the file is corrupt
+      return Status::IOError("Invalid physical column type: ",
+                             TypeToString(primitive.physical_type()));
+    }
   }
   return Status::OK();
 }
diff --git a/cpp/src/parquet/util/test-common.h b/cpp/src/parquet/util/test-common.h
index 11f2d39..9e9b8c5 100644
--- a/cpp/src/parquet/util/test-common.h
+++ b/cpp/src/parquet/util/test-common.h
@@ -22,6 +22,7 @@
 #include <iostream>
 #include <limits>
 #include <random>
+#include <string>
 #include <vector>
 
 #include "parquet/exception.h"
@@ -49,6 +50,28 @@ const char* get_data_dir() {
   return result;
 }
 
+std::string get_bad_data_dir() {
+  // PARQUET_TEST_DATA should point to ARROW_HOME/cpp/submodules/parquet-testing/data
+  // so need to reach one folder up to access the "bad_data" folder.
+  std::string data_dir(get_data_dir());
+  std::stringstream ss;
+  ss << data_dir << "/../bad_data";
+  return ss.str();
+}
+
+std::string get_data_file(const std::string& filename, bool is_good = true) {
+  std::stringstream ss;
+
+  if (is_good) {
+    ss << get_data_dir();
+  } else {
+    ss << get_bad_data_dir();
+  }
+
+  ss << "/" << filename;
+  return ss.str();
+}
+
 template <typename T>
 static inline void assert_vector_equal(const std::vector<T>& left,
                                        const std::vector<T>& right) {