You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2019/04/23 15:13:34 UTC
[arrow] branch master updated: PARQUET-1565: [C++] Add default case
to catch all unhandled physical types
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5f8f08b PARQUET-1565: [C++] Add default case to catch all unhandled physical types
5f8f08b is described below
commit 5f8f08bd4a706c7f04f6a8136d213707d7fe5c11
Author: Hatem Helal <hh...@mathworks.com>
AuthorDate: Tue Apr 23 17:13:25 2019 +0200
PARQUET-1565: [C++] Add default case to catch all unhandled physical types
This should prevent the SEGV observed when calling `FromParquetSchema` while reading the corrupt file [PARQUET-1481](https://github.com/apache/parquet-testing/blob/master/bad_data/PARQUET-1481.parquet)
Author: Hatem Helal <hh...@mathworks.com>
Author: Hatem Helal <ha...@gmail.com>
Closes #4171 from hatemhelal/parquet-1565 and squashes the following commits:
8f6286ad6 <Hatem Helal> reworked TryReadDataFile to accept an expected status code
f2fb6160a <Hatem Helal> remove unnecessary try..catch
81ffcb2c0 <Hatem Helal> clang-format changes
22adb4ac5 <Hatem Helal> use google style function argument comments
ac12dce81 <Hatem Helal> rename function parameter
d928095b8 <Hatem Helal> Fix IWYU failure
584995bb9 <Hatem Helal> Add test util for looking up good and bad PARQUET_TEST_DATA files
a8c832bff <Hatem Helal> Fix regression test for PARQUET-1481
0945fc03d <Hatem Helal> Add unittest for IOError when reading a corrupt Parquet file
9cbe77fb9 <Hatem Helal> Add default case to catch all unhandled physical types
---
cpp/src/parquet/arrow/arrow-reader-writer-test.cc | 31 ++++++++++-------------
cpp/src/parquet/arrow/arrow-schema-test.cc | 14 ++++++++++
cpp/src/parquet/arrow/schema.cc | 5 ++++
cpp/src/parquet/util/test-common.h | 23 +++++++++++++++++
4 files changed, 56 insertions(+), 17 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
index b54c3f4..3d0c647 100644
--- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -2334,34 +2334,31 @@ TEST(TestImpalaConversion, ArrowTimestampToImpalaTimestamp) {
ASSERT_EQ(expected, calculated);
}
-void TryReadDataFile(const std::string& testing_file_path, bool should_succeed = true) {
- std::string dir_string(test::get_data_dir());
- std::stringstream ss;
- ss << dir_string << "/" << testing_file_path;
- auto path = ss.str();
-
+void TryReadDataFile(const std::string& path,
+ ::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) {
auto pool = ::arrow::default_memory_pool();
std::unique_ptr<FileReader> arrow_reader;
- try {
- arrow_reader.reset(new FileReader(pool, ParquetFileReader::OpenFile(path, false)));
- std::shared_ptr<::arrow::Table> table;
- ASSERT_OK(arrow_reader->ReadTable(&table));
- } catch (const ParquetException& e) {
- if (should_succeed) {
- FAIL() << "Exception thrown when reading file: " << e.what();
- }
- }
+
+ arrow_reader.reset(new FileReader(pool, ParquetFileReader::OpenFile(path, false)));
+ std::shared_ptr<::arrow::Table> table;
+ auto status = arrow_reader->ReadTable(&table);
+
+ ASSERT_TRUE(status.code() == expected_code)
+ << "Expected reading file to return "
+ << arrow::Status(expected_code, "").CodeAsString() << ", but got "
+ << status.ToString();
}
TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) {
// PARQUET-995
- TryReadDataFile("alltypes_plain.parquet");
+ TryReadDataFile(test::get_data_file("alltypes_plain.parquet"));
}
TEST(TestArrowReaderAdHoc, CorruptedSchema) {
// PARQUET-1481
- TryReadDataFile("bad_data/PARQUET-1481.parquet", false /* should_succeed */);
+ auto path = test::get_data_file("PARQUET-1481.parquet", /*is_good=*/false);
+ TryReadDataFile(path, ::arrow::StatusCode::IOError);
}
class TestArrowReaderAdHocSparkAndHvr
diff --git a/cpp/src/parquet/arrow/arrow-schema-test.cc b/cpp/src/parquet/arrow/arrow-schema-test.cc
index ce189e1..8a371ff 100644
--- a/cpp/src/parquet/arrow/arrow-schema-test.cc
+++ b/cpp/src/parquet/arrow/arrow-schema-test.cc
@@ -21,7 +21,9 @@
#include "gtest/gtest.h"
#include "parquet/arrow/schema.h"
+#include "parquet/file_reader.h"
#include "parquet/schema.h"
+#include "parquet/util/test-common.h"
#include "arrow/api.h"
#include "arrow/testing/gtest_util.h"
@@ -849,5 +851,17 @@ TEST(InvalidSchema, ParquetNegativeDecimalScale) {
ToParquetSchema(arrow_schema.get(), *properties.get(), &result_schema));
}
+TEST(TestFromParquetSchema, CorruptMetadata) {
+ // PARQUET-1565: ensure that an IOError is returned when the parquet file contains
+ // corrupted metadata.
+ auto path = test::get_data_file("PARQUET-1481.parquet", /*is_good=*/false);
+
+ std::unique_ptr<parquet::ParquetFileReader> reader =
+ parquet::ParquetFileReader::OpenFile(path);
+ const auto parquet_schema = reader->metadata()->schema();
+ std::shared_ptr<::arrow::Schema> arrow_schema;
+ ASSERT_RAISES(IOError, FromParquetSchema(parquet_schema, &arrow_schema));
+}
+
} // namespace arrow
} // namespace parquet
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index f1ebad0..1b03398 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -196,6 +196,11 @@ Status FromPrimitive(const PrimitiveNode& primitive, std::shared_ptr<ArrowType>*
case ParquetType::FIXED_LEN_BYTE_ARRAY:
RETURN_NOT_OK(FromFLBA(primitive, out));
break;
+ default: {
+ // PARQUET-1565: This can occur if the file is corrupt
+ return Status::IOError("Invalid physical column type: ",
+ TypeToString(primitive.physical_type()));
+ }
}
return Status::OK();
}
diff --git a/cpp/src/parquet/util/test-common.h b/cpp/src/parquet/util/test-common.h
index 11f2d39..9e9b8c5 100644
--- a/cpp/src/parquet/util/test-common.h
+++ b/cpp/src/parquet/util/test-common.h
@@ -22,6 +22,7 @@
#include <iostream>
#include <limits>
#include <random>
+#include <string>
#include <vector>
#include "parquet/exception.h"
@@ -49,6 +50,28 @@ const char* get_data_dir() {
return result;
}
+std::string get_bad_data_dir() {
+ // PARQUET_TEST_DATA should point to ARROW_HOME/cpp/submodules/parquet-testing/data
+ // so need to reach one folder up to access the "bad_data" folder.
+ std::string data_dir(get_data_dir());
+ std::stringstream ss;
+ ss << data_dir << "/../bad_data";
+ return ss.str();
+}
+
+std::string get_data_file(const std::string& filename, bool is_good = true) {
+ std::stringstream ss;
+
+ if (is_good) {
+ ss << get_data_dir();
+ } else {
+ ss << get_bad_data_dir();
+ }
+
+ ss << "/" << filename;
+ return ss.str();
+}
+
template <typename T>
static inline void assert_vector_equal(const std::vector<T>& left,
const std::vector<T>& right) {