You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2020/06/28 14:41:58 UTC

[arrow] branch master updated: ARROW-7273: [Python][C++][Parquet] Do not permit constructing a non-nullable null field in Python, catch this case in Arrow->Parquet schema conversion

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new d57d8a2  ARROW-7273: [Python][C++][Parquet] Do not permit constructing a non-nullable null field in Python, catch this case in Arrow->Parquet schema conversion
d57d8a2 is described below

commit d57d8a2993c35eff8ac1d3814ee740eb5a72e89c
Author: Wes McKinney <we...@apache.org>
AuthorDate: Sun Jun 28 09:41:40 2020 -0500

    ARROW-7273: [Python][C++][Parquet] Do not permit constructing a non-nullable null field in Python, catch this case in Arrow->Parquet schema conversion
    
    This was the simplest triage I could think of. Fixes a DCHECK failure in debug / segfault in release builds
    
    Closes #7562 from wesm/ARROW-7273
    
    Authored-by: Wes McKinney <we...@apache.org>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/src/parquet/arrow/arrow_schema_test.cc | 13 +++++++++++--
 cpp/src/parquet/arrow/schema.cc            |  7 +++++--
 python/pyarrow/tests/test_types.py         |  6 ++++++
 python/pyarrow/types.pxi                   |  3 +++
 4 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc
index 2f1dcda..b61694b 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -813,8 +813,7 @@ TEST_F(TestConvertArrowSchema, ArrowFields) {
        -1},
       {"timestamp(nanosecond, CET)", ::arrow::timestamp(::arrow::TimeUnit::NANO, "CET"),
        LogicalType::Timestamp(true, LogicalType::TimeUnit::MICROS), ParquetType::INT64,
-       -1},
-      {"null", ::arrow::null(), LogicalType::Null(), ParquetType::INT32, -1}};
+       -1}};
 
   std::vector<std::shared_ptr<Field>> arrow_fields;
   std::vector<NodePtr> parquet_fields;
@@ -1118,6 +1117,16 @@ TEST(InvalidSchema, ParquetNegativeDecimalScale) {
                 ToParquetSchema(arrow_schema.get(), *properties.get(), &result_schema));
 }
 
+TEST(InvalidSchema, NonNullableNullType) {
+  const auto& field = ::arrow::field("f0", ::arrow::null(), /*nullable=*/false);
+  const auto& arrow_schema = ::arrow::schema({field});
+  std::shared_ptr<::parquet::WriterProperties> properties =
+      ::parquet::default_writer_properties();
+  std::shared_ptr<SchemaDescriptor> result_schema;
+  ASSERT_RAISES(Invalid,
+                ToParquetSchema(arrow_schema.get(), *properties.get(), &result_schema));
+}
+
 TEST(TestFromParquetSchema, CorruptMetadata) {
   // PARQUET-1565: ensure that an IOError is returned when the parquet file contains
   // corrupted metadata.
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index de3f527..eca443c 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -226,10 +226,13 @@ Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
   int scale = -1;
 
   switch (field->type()->id()) {
-    case ArrowTypeId::NA:
+    case ArrowTypeId::NA: {
       type = ParquetType::INT32;
       logical_type = LogicalType::Null();
-      break;
+      if (repetition != Repetition::OPTIONAL) {
+        return Status::Invalid("NullType Arrow field must be nullable");
+      }
+    } break;
     case ArrowTypeId::BOOL:
       type = ParquetType::BOOLEAN;
       break;
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index 80f881d..eb6dd52 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -112,6 +112,12 @@ def test_is_null():
     assert not types.is_null(pa.list_(pa.int32()))
 
 
+def test_null_field_may_not_be_non_nullable():
+    # ARROW-7273
+    with pytest.raises(ValueError):
+        pa.field('f0', pa.null(), nullable=False)
+
+
 def test_is_decimal():
     assert types.is_decimal(pa.decimal128(19, 4))
     assert not types.is_decimal(pa.int32())
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 76f2abb..e541510 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -1704,6 +1704,9 @@ def field(name, type, bint nullable=True, metadata=None):
     metadata = ensure_metadata(metadata, allow_none=True)
     c_meta = pyarrow_unwrap_metadata(metadata)
 
+    if _type.type.id() == _Type_NA and not nullable:
+        raise ValueError("A null type field may not be non-nullable")
+
     result.sp_field.reset(
         new CField(tobytes(name), _type.sp_type, nullable, c_meta)
     )