You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ra...@apache.org on 2023/01/18 08:34:23 UTC

[arrow] 07/10: GH-14875: [C++] C Data Interface: check imported buffer for non-null (#14814)

This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch maint-11.0.0
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 8d1e357b7721af971ff02a0325fced448b78a26b
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue Jan 17 18:53:02 2023 +0100

    GH-14875: [C++] C Data Interface: check imported buffer for non-null (#14814)
    
    The C data interface may expose null data pointers for zero-sized buffers.
    Make sure that all buffer pointers remain non-null internally.
    
    Followup to GH-14805
    
    * Closes: #14875
    
    Authored-by: Antoine Pitrou <an...@python.org>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/arrow/array/validate.cc |  11 ++--
 cpp/src/arrow/c/bridge.cc       |  40 +++++++++---
 cpp/src/arrow/c/bridge_test.cc  | 132 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 167 insertions(+), 16 deletions(-)

diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index 56470ac74b..c1a37c4234 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -459,14 +459,17 @@ struct ValidateArrayImpl {
       if (buffer == nullptr) {
         continue;
       }
-      int64_t min_buffer_size = -1;
+      int64_t min_buffer_size = 0;
       switch (spec.kind) {
         case DataTypeLayout::BITMAP:
-          min_buffer_size = bit_util::BytesForBits(length_plus_offset);
+          // If length == 0, buffer size can be 0 regardless of offset
+          if (data.length > 0) {
+            min_buffer_size = bit_util::BytesForBits(length_plus_offset);
+          }
           break;
         case DataTypeLayout::FIXED_WIDTH:
-          if (MultiplyWithOverflow(length_plus_offset, spec.byte_width,
-                                   &min_buffer_size)) {
+          if (data.length > 0 && MultiplyWithOverflow(length_plus_offset, spec.byte_width,
+                                                      &min_buffer_size)) {
             return Status::Invalid("Array of type ", type.ToString(),
                                    " has impossibly large length and offset");
           }
diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc
index 7579a1c489..d6ea60f520 100644
--- a/cpp/src/arrow/c/bridge.cc
+++ b/cpp/src/arrow/c/bridge.cc
@@ -31,6 +31,7 @@
 #include "arrow/c/util_internal.h"
 #include "arrow/extension_type.h"
 #include "arrow/memory_pool.h"
+#include "arrow/memory_pool_internal.h"  // for kZeroSizeArea
 #include "arrow/record_batch.h"
 #include "arrow/result.h"
 #include "arrow/stl_allocator.h"
@@ -60,6 +61,8 @@ using internal::SchemaExportTraits;
 
 using internal::ToChars;
 
+using memory_pool::internal::kZeroSizeArea;
+
 namespace {
 
 Status ExportingNotImplemented(const DataType& type) {
@@ -1265,7 +1268,8 @@ class ImportedBuffer : public Buffer {
 };
 
 struct ArrayImporter {
-  explicit ArrayImporter(const std::shared_ptr<DataType>& type) : type_(type) {}
+  explicit ArrayImporter(const std::shared_ptr<DataType>& type)
+      : type_(type), zero_size_buffer_(std::make_shared<Buffer>(kZeroSizeArea, 0)) {}
 
   Status Import(struct ArrowArray* src) {
     if (ArrowArrayIsReleased(src)) {
@@ -1529,7 +1533,7 @@ struct ArrayImporter {
   }
 
   Status ImportNullBitmap(int32_t buffer_id = 0) {
-    RETURN_NOT_OK(ImportBitsBuffer(buffer_id));
+    RETURN_NOT_OK(ImportBitsBuffer(buffer_id, /*is_null_bitmap=*/true));
     if (data_->null_count > 0 && data_->buffers[buffer_id] == nullptr) {
       return Status::Invalid(
           "ArrowArray struct has null bitmap buffer but non-zero null_count ",
@@ -1538,15 +1542,20 @@ struct ArrayImporter {
     return Status::OK();
   }
 
-  Status ImportBitsBuffer(int32_t buffer_id) {
+  Status ImportBitsBuffer(int32_t buffer_id, bool is_null_bitmap = false) {
     // Compute visible size of buffer
-    int64_t buffer_size = bit_util::BytesForBits(c_struct_->length + c_struct_->offset);
-    return ImportBuffer(buffer_id, buffer_size);
+    int64_t buffer_size =
+        (c_struct_->length > 0)
+            ? bit_util::BytesForBits(c_struct_->length + c_struct_->offset)
+            : 0;
+    return ImportBuffer(buffer_id, buffer_size, is_null_bitmap);
   }
 
   Status ImportFixedSizeBuffer(int32_t buffer_id, int64_t byte_width) {
     // Compute visible size of buffer
-    int64_t buffer_size = byte_width * (c_struct_->length + c_struct_->offset);
+    int64_t buffer_size = (c_struct_->length > 0)
+                              ? byte_width * (c_struct_->length + c_struct_->offset)
+                              : 0;
     return ImportBuffer(buffer_id, buffer_size);
   }
 
@@ -1563,17 +1572,27 @@ struct ArrayImporter {
                                   int64_t byte_width = 1) {
     auto offsets = data_->GetValues<OffsetType>(offsets_buffer_id);
     // Compute visible size of buffer
-    int64_t buffer_size = byte_width * offsets[c_struct_->length];
+    int64_t buffer_size =
+        (c_struct_->length > 0) ? byte_width * offsets[c_struct_->length] : 0;
     return ImportBuffer(buffer_id, buffer_size);
   }
 
-  Status ImportBuffer(int32_t buffer_id, int64_t buffer_size) {
+  Status ImportBuffer(int32_t buffer_id, int64_t buffer_size,
+                      bool is_null_bitmap = false) {
     std::shared_ptr<Buffer>* out = &data_->buffers[buffer_id];
     auto data = reinterpret_cast<const uint8_t*>(c_struct_->buffers[buffer_id]);
     if (data != nullptr) {
       *out = std::make_shared<ImportedBuffer>(data, buffer_size, import_);
-    } else {
+    } else if (is_null_bitmap) {
       out->reset();
+    } else {
+      // Ensure that imported buffers are never null (except for the null bitmap)
+      if (buffer_size != 0) {
+        return Status::Invalid(
+            "ArrowArrayStruct contains null data pointer "
+            "for a buffer with non-zero computed size");
+      }
+      *out = zero_size_buffer_;
     }
     return Status::OK();
   }
@@ -1585,6 +1604,9 @@ struct ArrayImporter {
   std::shared_ptr<ImportedArrayData> import_;
   std::shared_ptr<ArrayData> data_;
   std::vector<ArrayImporter> child_importers_;
+
+  // For imported null buffer pointers
+  std::shared_ptr<Buffer> zero_size_buffer_;
 };
 
 }  // namespace
diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc
index 813d429533..90fe9d5965 100644
--- a/cpp/src/arrow/c/bridge_test.cc
+++ b/cpp/src/arrow/c/bridge_test.cc
@@ -124,6 +124,24 @@ class ReleaseCallback {
 using SchemaReleaseCallback = ReleaseCallback<SchemaExportTraits>;
 using ArrayReleaseCallback = ReleaseCallback<ArrayExportTraits>;
 
+// Whether c_struct or any of its descendents have non-null data pointers.
+bool HasData(const ArrowArray* c_struct) {
+  for (int64_t i = 0; i < c_struct->n_buffers; ++i) {
+    if (c_struct->buffers[i] != nullptr) {
+      return true;
+    }
+  }
+  if (c_struct->dictionary && HasData(c_struct->dictionary)) {
+    return true;
+  }
+  for (int64_t i = 0; i < c_struct->n_children; ++i) {
+    if (HasData(c_struct->children[i])) {
+      return true;
+    }
+  }
+  return false;
+}
+
 static const std::vector<std::string> kMetadataKeys1{"key1", "key2"};
 static const std::vector<std::string> kMetadataValues1{"", "bar"};
 
@@ -1659,6 +1677,8 @@ static const uint8_t bits_buffer1[] = {0xed, 0xed};
 static const void* buffers_no_nulls_no_data[1] = {nullptr};
 static const void* buffers_nulls_no_data1[1] = {bits_buffer1};
 
+static const void* all_buffers_omitted[3] = {nullptr, nullptr, nullptr};
+
 static const uint8_t data_buffer1[] = {1, 2,  3,  4,  5,  6,  7,  8,
                                        9, 10, 11, 12, 13, 14, 15, 16};
 static const uint8_t data_buffer2[] = "abcdefghijklmnopqrstuvwxyz";
@@ -1724,10 +1744,13 @@ static const uint8_t string_data_buffer1[] = "foobarquuxxyzzy";
 static const int32_t string_offsets_buffer1[] = {0, 3, 3, 6, 10, 15};
 static const void* string_buffers_no_nulls1[3] = {nullptr, string_offsets_buffer1,
                                                   string_data_buffer1};
+static const void* string_buffers_omitted[3] = {nullptr, string_offsets_buffer1, nullptr};
 
 static const int64_t large_string_offsets_buffer1[] = {0, 3, 3, 6, 10};
 static const void* large_string_buffers_no_nulls1[3] = {
     nullptr, large_string_offsets_buffer1, string_data_buffer1};
+static const void* large_string_buffers_omitted[3] = {
+    nullptr, large_string_offsets_buffer1, nullptr};
 
 static const int32_t list_offsets_buffer1[] = {0, 2, 2, 5, 6, 8};
 static const void* list_buffers_no_nulls1[2] = {nullptr, list_offsets_buffer1};
@@ -1901,9 +1924,9 @@ class TestArrayImport : public ::testing::Test {
     Reset();                                        // for further tests
 
     ASSERT_OK(array->ValidateFull());
-    // Special case: Null array doesn't have any data, so it needn't
-    // keep the ArrowArray struct alive.
-    if (type->id() != Type::NA) {
+    // Special case: arrays without data (such as Null arrays) needn't keep
+    // the ArrowArray struct alive.
+    if (HasData(&c_struct_)) {
       cb.AssertNotCalled();
     }
     AssertArraysEqual(*expected, *array, true);
@@ -1990,6 +2013,10 @@ TEST_F(TestArrayImport, Primitive) {
   CheckImport(ArrayFromJSON(boolean(), "[true, null, false]"));
   FillPrimitive(3, 1, 0, primitive_buffers_nulls1_8);
   CheckImport(ArrayFromJSON(boolean(), "[true, null, false]"));
+
+  // Empty array with null data pointers
+  FillPrimitive(0, 0, 0, all_buffers_omitted);
+  CheckImport(ArrayFromJSON(int32(), "[]"));
 }
 
 TEST_F(TestArrayImport, Temporal) {
@@ -2070,6 +2097,12 @@ TEST_F(TestArrayImport, PrimitiveWithOffset) {
 
   FillPrimitive(4, 0, 7, primitive_buffers_no_nulls1_8);
   CheckImport(ArrayFromJSON(boolean(), "[false, false, true, false]"));
+
+  // Empty array with null data pointers
+  FillPrimitive(0, 0, 2, all_buffers_omitted);
+  CheckImport(ArrayFromJSON(int32(), "[]"));
+  FillPrimitive(0, 0, 3, all_buffers_omitted);
+  CheckImport(ArrayFromJSON(boolean(), "[]"));
 }
 
 TEST_F(TestArrayImport, NullWithOffset) {
@@ -2092,10 +2125,48 @@ TEST_F(TestArrayImport, String) {
   FillStringLike(4, 0, 0, large_string_buffers_no_nulls1);
   CheckImport(ArrayFromJSON(large_binary(), R"(["foo", "", "bar", "quux"])"));
 
+  // Empty array with null data pointers
+  FillStringLike(0, 0, 0, string_buffers_omitted);
+  CheckImport(ArrayFromJSON(utf8(), "[]"));
+  FillStringLike(0, 0, 0, large_string_buffers_omitted);
+  CheckImport(ArrayFromJSON(large_binary(), "[]"));
+}
+
+TEST_F(TestArrayImport, StringWithOffset) {
+  FillStringLike(3, 0, 1, string_buffers_no_nulls1);
+  CheckImport(ArrayFromJSON(utf8(), R"(["", "bar", "quux"])"));
+  FillStringLike(2, 0, 2, large_string_buffers_no_nulls1);
+  CheckImport(ArrayFromJSON(large_utf8(), R"(["bar", "quux"])"));
+
+  // Empty array with null data pointers
+  FillStringLike(0, 0, 1, string_buffers_omitted);
+  CheckImport(ArrayFromJSON(utf8(), "[]"));
+}
+
+TEST_F(TestArrayImport, FixedSizeBinary) {
   FillPrimitive(2, 0, 0, primitive_buffers_no_nulls2);
   CheckImport(ArrayFromJSON(fixed_size_binary(3), R"(["abc", "def"])"));
   FillPrimitive(2, 0, 0, primitive_buffers_no_nulls3);
   CheckImport(ArrayFromJSON(decimal(15, 4), R"(["12345.6789", "98765.4321"])"));
+
+  // Empty array with null data pointers
+  FillPrimitive(0, 0, 0, all_buffers_omitted);
+  CheckImport(ArrayFromJSON(fixed_size_binary(3), "[]"));
+  FillPrimitive(0, 0, 0, all_buffers_omitted);
+  CheckImport(ArrayFromJSON(decimal(15, 4), "[]"));
+}
+
+TEST_F(TestArrayImport, FixedSizeBinaryWithOffset) {
+  FillPrimitive(1, 0, 1, primitive_buffers_no_nulls2);
+  CheckImport(ArrayFromJSON(fixed_size_binary(3), R"(["def"])"));
+  FillPrimitive(1, 0, 1, primitive_buffers_no_nulls3);
+  CheckImport(ArrayFromJSON(decimal(15, 4), R"(["98765.4321"])"));
+
+  // Empty array with null data pointers
+  FillPrimitive(0, 0, 1, all_buffers_omitted);
+  CheckImport(ArrayFromJSON(fixed_size_binary(3), "[]"));
+  FillPrimitive(0, 0, 1, all_buffers_omitted);
+  CheckImport(ArrayFromJSON(decimal(15, 4), "[]"));
 }
 
 TEST_F(TestArrayImport, List) {
@@ -2117,6 +2188,11 @@ TEST_F(TestArrayImport, List) {
   FillFixedSizeListLike(3, 0, 0, buffers_no_nulls_no_data);
   CheckImport(
       ArrayFromJSON(fixed_size_list(int8(), 3), "[[1, 2, 3], [4, 5, 6], [7, 8, 9]]"));
+
+  // Empty child array with null data pointers
+  FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted);
+  FillFixedSizeListLike(0, 0, 0, buffers_no_nulls_no_data);
+  CheckImport(ArrayFromJSON(fixed_size_list(int8(), 3), "[]"));
 }
 
 TEST_F(TestArrayImport, NestedList) {
@@ -2205,6 +2281,15 @@ TEST_F(TestArrayImport, SparseUnion) {
   FillUnionLike(UnionMode::SPARSE, 4, 0, 0, 2, sparse_union_buffers1_legacy,
                 /*legacy=*/true);
   CheckImport(expected);
+
+  // Empty array with null data pointers
+  expected = ArrayFromJSON(type, "[]");
+  FillStringLike(AddChild(), 0, 0, 0, string_buffers_omitted);
+  FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted);
+  FillUnionLike(UnionMode::SPARSE, 0, 0, 0, 2, all_buffers_omitted, /*legacy=*/false);
+  FillStringLike(AddChild(), 0, 0, 0, string_buffers_omitted);
+  FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted);
+  FillUnionLike(UnionMode::SPARSE, 0, 0, 3, 2, all_buffers_omitted, /*legacy=*/false);
 }
 
 TEST_F(TestArrayImport, DenseUnion) {
@@ -2223,6 +2308,15 @@ TEST_F(TestArrayImport, DenseUnion) {
   FillUnionLike(UnionMode::DENSE, 5, 0, 0, 2, dense_union_buffers1_legacy,
                 /*legacy=*/true);
   CheckImport(expected);
+
+  // Empty array with null data pointers
+  expected = ArrayFromJSON(type, "[]");
+  FillStringLike(AddChild(), 0, 0, 0, string_buffers_omitted);
+  FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted);
+  FillUnionLike(UnionMode::DENSE, 0, 0, 0, 2, all_buffers_omitted, /*legacy=*/false);
+  FillStringLike(AddChild(), 0, 0, 0, string_buffers_omitted);
+  FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted);
+  FillUnionLike(UnionMode::DENSE, 0, 0, 3, 2, all_buffers_omitted, /*legacy=*/false);
 }
 
 TEST_F(TestArrayImport, StructWithOffset) {
@@ -2359,6 +2453,29 @@ TEST_F(TestArrayImport, PrimitiveError) {
   // Zero null bitmap but non-zero null_count
   FillPrimitive(3, 1, 0, primitive_buffers_no_nulls1_8);
   CheckImportError(int8());
+
+  // Null data pointers with non-zero length
+  FillPrimitive(1, 0, 0, all_buffers_omitted);
+  CheckImportError(int8());
+  FillPrimitive(1, 0, 0, all_buffers_omitted);
+  CheckImportError(boolean());
+  FillPrimitive(1, 0, 0, all_buffers_omitted);
+  CheckImportError(fixed_size_binary(3));
+}
+
+TEST_F(TestArrayImport, StringError) {
+  // Bad number of buffers
+  FillStringLike(4, 0, 0, string_buffers_no_nulls1);
+  c_struct_.n_buffers = 2;
+  CheckImportError(utf8());
+
+  // Null data pointers with non-zero length
+  FillStringLike(4, 0, 0, string_buffers_omitted);
+  CheckImportError(utf8());
+
+  // Null offsets pointer
+  FillStringLike(0, 0, 0, all_buffers_omitted);
+  CheckImportError(utf8());
 }
 
 TEST_F(TestArrayImport, StructError) {
@@ -2369,6 +2486,13 @@ TEST_F(TestArrayImport, StructError) {
   CheckImportError(struct_({field("strs", utf8())}));
 }
 
+TEST_F(TestArrayImport, ListError) {
+  // Null offsets pointer
+  FillPrimitive(AddChild(), 0, 0, 0, primitive_buffers_no_nulls1_8);
+  FillListLike(0, 0, 0, all_buffers_omitted);
+  CheckImportError(list(int8()));
+}
+
 TEST_F(TestArrayImport, MapError) {
   // Bad number of (struct) children in map child
   FillStringLike(AddChild(), 5, 0, 0, string_buffers_no_nulls1);
@@ -2859,8 +2983,10 @@ TEST_F(TestArrayRoundtrip, UnknownNullCount) {
 TEST_F(TestArrayRoundtrip, List) {
   TestWithJSON(list(int32()), "[]");
   TestWithJSON(list(int32()), "[[4, 5], [6, null], null]");
+  TestWithJSON(fixed_size_list(int32(), 3), "[[4, 5, 6], null, [7, 8, null]]");
 
   TestWithJSONSliced(list(int32()), "[[4, 5], [6, null], null]");
+  TestWithJSONSliced(fixed_size_list(int32(), 3), "[[4, 5, 6], null, [7, 8, null]]");
 }
 
 TEST_F(TestArrayRoundtrip, Struct) {