You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2020/06/28 14:41:58 UTC
[arrow] branch master updated: ARROW-7273: [Python][C++][Parquet]
Do not permit constructing a non-nullable null field in Python,
catch this case in Arrow->Parquet schema conversion
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new d57d8a2 ARROW-7273: [Python][C++][Parquet] Do not permit constructing a non-nullable null field in Python, catch this case in Arrow->Parquet schema conversion
d57d8a2 is described below
commit d57d8a2993c35eff8ac1d3814ee740eb5a72e89c
Author: Wes McKinney <we...@apache.org>
AuthorDate: Sun Jun 28 09:41:40 2020 -0500
ARROW-7273: [Python][C++][Parquet] Do not permit constructing a non-nullable null field in Python, catch this case in Arrow->Parquet schema conversion
This was the simplest triage I could think of. Fixes a DCHECK failure in debug / segfault in release builds
Closes #7562 from wesm/ARROW-7273
Authored-by: Wes McKinney <we...@apache.org>
Signed-off-by: Wes McKinney <we...@apache.org>
---
cpp/src/parquet/arrow/arrow_schema_test.cc | 13 +++++++++++--
cpp/src/parquet/arrow/schema.cc | 7 +++++--
python/pyarrow/tests/test_types.py | 6 ++++++
python/pyarrow/types.pxi | 3 +++
4 files changed, 25 insertions(+), 4 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc
index 2f1dcda..b61694b 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -813,8 +813,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
-1},
{"timestamp(nanosecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "CET"),
LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64,
- -1},
- {"null", ::arrow::null(), LogicalType::Null(), ParquetType::INT32, -1}};
+ -1}};
std::vector<std::shared_ptr<Field>> arrow_fields;
std::vector<NodePtr> parquet_fields;
@@ -1118,6 +1117,16 @@ TEST(InvalidSchema, ParquetNegativeDecimalScale) {
ToParquetSchema(arrow_schema.get(), *properties.get(), &result_schema));
}
+TEST(InvalidSchema, NonNullableNullType) {
+ const auto& field = ::arrow::field("f0", ::arrow::null(), /*nullable=*/false);
+ const auto& arrow_schema = ::arrow::schema({field});
+ std::shared_ptr<::parquet::WriterProperties> properties =
+ ::parquet::default_writer_properties();
+ std::shared_ptr<SchemaDescriptor> result_schema;
+ ASSERT_RAISES(Invalid,
+ ToParquetSchema(arrow_schema.get(), *properties.get(), &result_schema));
+}
+
TEST(TestFromParquetSchema, CorruptMetadata) {
// PARQUET-1565: ensure that an IOError is returned when the parquet file contains
// corrupted metadata.
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index de3f527..eca443c 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -226,10 +226,13 @@ Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
int scale = -1;
switch (field->type()->id()) {
- case ArrowTypeId::NA:
+ case ArrowTypeId::NA: {
type = ParquetType::INT32;
logical_type = LogicalType::Null();
- break;
+ if (repetition != Repetition::OPTIONAL) {
+ return Status::Invalid("NullType Arrow field must be nullable");
+ }
+ } break;
case ArrowTypeId::BOOL:
type = ParquetType::BOOLEAN;
break;
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index 80f881d..eb6dd52 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -112,6 +112,12 @@ def test_is_null():
assert not types.is_null(pa.list_(pa.int32()))
+def test_null_field_may_not_be_non_nullable():
+ # ARROW-7273
+ with pytest.raises(ValueError):
+ pa.field('f0', pa.null(), nullable=False)
+
+
def test_is_decimal():
assert types.is_decimal(pa.decimal128(19, 4))
assert not types.is_decimal(pa.int32())
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 76f2abb..e541510 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -1704,6 +1704,9 @@ def field(name, type, bint nullable=True, metadata=None):
metadata = ensure_metadata(metadata, allow_none=True)
c_meta = pyarrow_unwrap_metadata(metadata)
+ if _type.type.id() == _Type_NA and not nullable:
+ raise ValueError("A null type field may not be non-nullable")
+
result.sp_field.reset(
new CField(tobytes(name), _type.sp_type, nullable, c_meta)
)