You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ra...@apache.org on 2023/01/18 08:34:23 UTC
[arrow] 07/10: GH-14875: [C++] C Data Interface: check imported buffer for non-null (#14814)
This is an automated email from the ASF dual-hosted git repository.
raulcd pushed a commit to branch maint-11.0.0
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 8d1e357b7721af971ff02a0325fced448b78a26b
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue Jan 17 18:53:02 2023 +0100
GH-14875: [C++] C Data Interface: check imported buffer for non-null (#14814)
The C data interface may expose null data pointers for zero-sized buffers.
Make sure that all buffer pointers remain non-null internally.
Followup to GH-14805
* Closes: #14875
Authored-by: Antoine Pitrou <an...@python.org>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/arrow/array/validate.cc | 11 ++--
cpp/src/arrow/c/bridge.cc | 40 +++++++++---
cpp/src/arrow/c/bridge_test.cc | 132 +++++++++++++++++++++++++++++++++++++++-
3 files changed, 167 insertions(+), 16 deletions(-)
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index 56470ac74b..c1a37c4234 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -459,14 +459,17 @@ struct ValidateArrayImpl {
if (buffer == nullptr) {
continue;
}
- int64_t min_buffer_size = -1;
+ int64_t min_buffer_size = 0;
switch (spec.kind) {
case DataTypeLayout::BITMAP:
- min_buffer_size = bit_util::BytesForBits(length_plus_offset);
+ // If length == 0, buffer size can be 0 regardless of offset
+ if (data.length > 0) {
+ min_buffer_size = bit_util::BytesForBits(length_plus_offset);
+ }
break;
case DataTypeLayout::FIXED_WIDTH:
- if (MultiplyWithOverflow(length_plus_offset, spec.byte_width,
- &min_buffer_size)) {
+ if (data.length > 0 && MultiplyWithOverflow(length_plus_offset, spec.byte_width,
+ &min_buffer_size)) {
return Status::Invalid("Array of type ", type.ToString(),
" has impossibly large length and offset");
}
diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc
index 7579a1c489..d6ea60f520 100644
--- a/cpp/src/arrow/c/bridge.cc
+++ b/cpp/src/arrow/c/bridge.cc
@@ -31,6 +31,7 @@
#include "arrow/c/util_internal.h"
#include "arrow/extension_type.h"
#include "arrow/memory_pool.h"
+#include "arrow/memory_pool_internal.h" // for kZeroSizeArea
#include "arrow/record_batch.h"
#include "arrow/result.h"
#include "arrow/stl_allocator.h"
@@ -60,6 +61,8 @@ using internal::SchemaExportTraits;
using internal::ToChars;
+using memory_pool::internal::kZeroSizeArea;
+
namespace {
Status ExportingNotImplemented(const DataType& type) {
@@ -1265,7 +1268,8 @@ class ImportedBuffer : public Buffer {
};
struct ArrayImporter {
- explicit ArrayImporter(const std::shared_ptr<DataType>& type) : type_(type) {}
+ explicit ArrayImporter(const std::shared_ptr<DataType>& type)
+ : type_(type), zero_size_buffer_(std::make_shared<Buffer>(kZeroSizeArea, 0)) {}
Status Import(struct ArrowArray* src) {
if (ArrowArrayIsReleased(src)) {
@@ -1529,7 +1533,7 @@ struct ArrayImporter {
}
Status ImportNullBitmap(int32_t buffer_id = 0) {
- RETURN_NOT_OK(ImportBitsBuffer(buffer_id));
+ RETURN_NOT_OK(ImportBitsBuffer(buffer_id, /*is_null_bitmap=*/true));
if (data_->null_count > 0 && data_->buffers[buffer_id] == nullptr) {
return Status::Invalid(
"ArrowArray struct has null bitmap buffer but non-zero null_count ",
@@ -1538,15 +1542,20 @@ struct ArrayImporter {
return Status::OK();
}
- Status ImportBitsBuffer(int32_t buffer_id) {
+ Status ImportBitsBuffer(int32_t buffer_id, bool is_null_bitmap = false) {
// Compute visible size of buffer
- int64_t buffer_size = bit_util::BytesForBits(c_struct_->length + c_struct_->offset);
- return ImportBuffer(buffer_id, buffer_size);
+ int64_t buffer_size =
+ (c_struct_->length > 0)
+ ? bit_util::BytesForBits(c_struct_->length + c_struct_->offset)
+ : 0;
+ return ImportBuffer(buffer_id, buffer_size, is_null_bitmap);
}
Status ImportFixedSizeBuffer(int32_t buffer_id, int64_t byte_width) {
// Compute visible size of buffer
- int64_t buffer_size = byte_width * (c_struct_->length + c_struct_->offset);
+ int64_t buffer_size = (c_struct_->length > 0)
+ ? byte_width * (c_struct_->length + c_struct_->offset)
+ : 0;
return ImportBuffer(buffer_id, buffer_size);
}
@@ -1563,17 +1572,27 @@ struct ArrayImporter {
int64_t byte_width = 1) {
auto offsets = data_->GetValues<OffsetType>(offsets_buffer_id);
// Compute visible size of buffer
- int64_t buffer_size = byte_width * offsets[c_struct_->length];
+ int64_t buffer_size =
+ (c_struct_->length > 0) ? byte_width * offsets[c_struct_->length] : 0;
return ImportBuffer(buffer_id, buffer_size);
}
- Status ImportBuffer(int32_t buffer_id, int64_t buffer_size) {
+ Status ImportBuffer(int32_t buffer_id, int64_t buffer_size,
+ bool is_null_bitmap = false) {
std::shared_ptr<Buffer>* out = &data_->buffers[buffer_id];
auto data = reinterpret_cast<const uint8_t*>(c_struct_->buffers[buffer_id]);
if (data != nullptr) {
*out = std::make_shared<ImportedBuffer>(data, buffer_size, import_);
- } else {
+ } else if (is_null_bitmap) {
out->reset();
+ } else {
+ // Ensure that imported buffers are never null (except for the null bitmap)
+ if (buffer_size != 0) {
+ return Status::Invalid(
+ "ArrowArrayStruct contains null data pointer "
+ "for a buffer with non-zero computed size");
+ }
+ *out = zero_size_buffer_;
}
return Status::OK();
}
@@ -1585,6 +1604,9 @@ struct ArrayImporter {
std::shared_ptr<ImportedArrayData> import_;
std::shared_ptr<ArrayData> data_;
std::vector<ArrayImporter> child_importers_;
+
+ // For imported null buffer pointers
+ std::shared_ptr<Buffer> zero_size_buffer_;
};
} // namespace
diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc
index 813d429533..90fe9d5965 100644
--- a/cpp/src/arrow/c/bridge_test.cc
+++ b/cpp/src/arrow/c/bridge_test.cc
@@ -124,6 +124,24 @@ class ReleaseCallback {
using SchemaReleaseCallback = ReleaseCallback<SchemaExportTraits>;
using ArrayReleaseCallback = ReleaseCallback<ArrayExportTraits>;
+// Whether c_struct or any of its descendents have non-null data pointers.
+bool HasData(const ArrowArray* c_struct) {
+ for (int64_t i = 0; i < c_struct->n_buffers; ++i) {
+ if (c_struct->buffers[i] != nullptr) {
+ return true;
+ }
+ }
+ if (c_struct->dictionary && HasData(c_struct->dictionary)) {
+ return true;
+ }
+ for (int64_t i = 0; i < c_struct->n_children; ++i) {
+ if (HasData(c_struct->children[i])) {
+ return true;
+ }
+ }
+ return false;
+}
+
static const std::vector<std::string> kMetadataKeys1{"key1", "key2"};
static const std::vector<std::string> kMetadataValues1{"", "bar"};
@@ -1659,6 +1677,8 @@ static const uint8_t bits_buffer1[] = {0xed, 0xed};
static const void* buffers_no_nulls_no_data[1] = {nullptr};
static const void* buffers_nulls_no_data1[1] = {bits_buffer1};
+static const void* all_buffers_omitted[3] = {nullptr, nullptr, nullptr};
+
static const uint8_t data_buffer1[] = {1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16};
static const uint8_t data_buffer2[] = "abcdefghijklmnopqrstuvwxyz";
@@ -1724,10 +1744,13 @@ static const uint8_t string_data_buffer1[] = "foobarquuxxyzzy";
static const int32_t string_offsets_buffer1[] = {0, 3, 3, 6, 10, 15};
static const void* string_buffers_no_nulls1[3] = {nullptr, string_offsets_buffer1,
string_data_buffer1};
+static const void* string_buffers_omitted[3] = {nullptr, string_offsets_buffer1, nullptr};
static const int64_t large_string_offsets_buffer1[] = {0, 3, 3, 6, 10};
static const void* large_string_buffers_no_nulls1[3] = {
nullptr, large_string_offsets_buffer1, string_data_buffer1};
+static const void* large_string_buffers_omitted[3] = {
+ nullptr, large_string_offsets_buffer1, nullptr};
static const int32_t list_offsets_buffer1[] = {0, 2, 2, 5, 6, 8};
static const void* list_buffers_no_nulls1[2] = {nullptr, list_offsets_buffer1};
@@ -1901,9 +1924,9 @@ class TestArrayImport : public ::testing::Test {
Reset(); // for further tests
ASSERT_OK(array->ValidateFull());
- // Special case: Null array doesn't have any data, so it needn't
- // keep the ArrowArray struct alive.
- if (type->id() != Type::NA) {
+ // Special case: arrays without data (such as Null arrays) needn't keep
+ // the ArrowArray struct alive.
+ if (HasData(&c_struct_)) {
cb.AssertNotCalled();
}
AssertArraysEqual(*expected, *array, true);
@@ -1990,6 +2013,10 @@ TEST_F(TestArrayImport, Primitive) {
CheckImport(ArrayFromJSON(boolean(), "[true, null, false]"));
FillPrimitive(3, 1, 0, primitive_buffers_nulls1_8);
CheckImport(ArrayFromJSON(boolean(), "[true, null, false]"));
+
+ // Empty array with null data pointers
+ FillPrimitive(0, 0, 0, all_buffers_omitted);
+ CheckImport(ArrayFromJSON(int32(), "[]"));
}
TEST_F(TestArrayImport, Temporal) {
@@ -2070,6 +2097,12 @@ TEST_F(TestArrayImport, PrimitiveWithOffset) {
FillPrimitive(4, 0, 7, primitive_buffers_no_nulls1_8);
CheckImport(ArrayFromJSON(boolean(), "[false, false, true, false]"));
+
+ // Empty array with null data pointers
+ FillPrimitive(0, 0, 2, all_buffers_omitted);
+ CheckImport(ArrayFromJSON(int32(), "[]"));
+ FillPrimitive(0, 0, 3, all_buffers_omitted);
+ CheckImport(ArrayFromJSON(boolean(), "[]"));
}
TEST_F(TestArrayImport, NullWithOffset) {
@@ -2092,10 +2125,48 @@ TEST_F(TestArrayImport, String) {
FillStringLike(4, 0, 0, large_string_buffers_no_nulls1);
CheckImport(ArrayFromJSON(large_binary(), R"(["foo", "", "bar", "quux"])"));
+ // Empty array with null data pointers
+ FillStringLike(0, 0, 0, string_buffers_omitted);
+ CheckImport(ArrayFromJSON(utf8(), "[]"));
+ FillStringLike(0, 0, 0, large_string_buffers_omitted);
+ CheckImport(ArrayFromJSON(large_binary(), "[]"));
+}
+
+TEST_F(TestArrayImport, StringWithOffset) {
+ FillStringLike(3, 0, 1, string_buffers_no_nulls1);
+ CheckImport(ArrayFromJSON(utf8(), R"(["", "bar", "quux"])"));
+ FillStringLike(2, 0, 2, large_string_buffers_no_nulls1);
+ CheckImport(ArrayFromJSON(large_utf8(), R"(["bar", "quux"])"));
+
+ // Empty array with null data pointers
+ FillStringLike(0, 0, 1, string_buffers_omitted);
+ CheckImport(ArrayFromJSON(utf8(), "[]"));
+}
+
+TEST_F(TestArrayImport, FixedSizeBinary) {
FillPrimitive(2, 0, 0, primitive_buffers_no_nulls2);
CheckImport(ArrayFromJSON(fixed_size_binary(3), R"(["abc", "def"])"));
FillPrimitive(2, 0, 0, primitive_buffers_no_nulls3);
CheckImport(ArrayFromJSON(decimal(15, 4), R"(["12345.6789", "98765.4321"])"));
+
+ // Empty array with null data pointers
+ FillPrimitive(0, 0, 0, all_buffers_omitted);
+ CheckImport(ArrayFromJSON(fixed_size_binary(3), "[]"));
+ FillPrimitive(0, 0, 0, all_buffers_omitted);
+ CheckImport(ArrayFromJSON(decimal(15, 4), "[]"));
+}
+
+TEST_F(TestArrayImport, FixedSizeBinaryWithOffset) {
+ FillPrimitive(1, 0, 1, primitive_buffers_no_nulls2);
+ CheckImport(ArrayFromJSON(fixed_size_binary(3), R"(["def"])"));
+ FillPrimitive(1, 0, 1, primitive_buffers_no_nulls3);
+ CheckImport(ArrayFromJSON(decimal(15, 4), R"(["98765.4321"])"));
+
+ // Empty array with null data pointers
+ FillPrimitive(0, 0, 1, all_buffers_omitted);
+ CheckImport(ArrayFromJSON(fixed_size_binary(3), "[]"));
+ FillPrimitive(0, 0, 1, all_buffers_omitted);
+ CheckImport(ArrayFromJSON(decimal(15, 4), "[]"));
}
TEST_F(TestArrayImport, List) {
@@ -2117,6 +2188,11 @@ TEST_F(TestArrayImport, List) {
FillFixedSizeListLike(3, 0, 0, buffers_no_nulls_no_data);
CheckImport(
ArrayFromJSON(fixed_size_list(int8(), 3), "[[1, 2, 3], [4, 5, 6], [7, 8, 9]]"));
+
+ // Empty child array with null data pointers
+ FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted);
+ FillFixedSizeListLike(0, 0, 0, buffers_no_nulls_no_data);
+ CheckImport(ArrayFromJSON(fixed_size_list(int8(), 3), "[]"));
}
TEST_F(TestArrayImport, NestedList) {
@@ -2205,6 +2281,15 @@ TEST_F(TestArrayImport, SparseUnion) {
FillUnionLike(UnionMode::SPARSE, 4, 0, 0, 2, sparse_union_buffers1_legacy,
/*legacy=*/true);
CheckImport(expected);
+
+ // Empty array with null data pointers
+ expected = ArrayFromJSON(type, "[]");
+ FillStringLike(AddChild(), 0, 0, 0, string_buffers_omitted);
+ FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted);
+ FillUnionLike(UnionMode::SPARSE, 0, 0, 0, 2, all_buffers_omitted, /*legacy=*/false);
+ FillStringLike(AddChild(), 0, 0, 0, string_buffers_omitted);
+ FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted);
+ FillUnionLike(UnionMode::SPARSE, 0, 0, 3, 2, all_buffers_omitted, /*legacy=*/false);
}
TEST_F(TestArrayImport, DenseUnion) {
@@ -2223,6 +2308,15 @@ TEST_F(TestArrayImport, DenseUnion) {
FillUnionLike(UnionMode::DENSE, 5, 0, 0, 2, dense_union_buffers1_legacy,
/*legacy=*/true);
CheckImport(expected);
+
+ // Empty array with null data pointers
+ expected = ArrayFromJSON(type, "[]");
+ FillStringLike(AddChild(), 0, 0, 0, string_buffers_omitted);
+ FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted);
+ FillUnionLike(UnionMode::DENSE, 0, 0, 0, 2, all_buffers_omitted, /*legacy=*/false);
+ FillStringLike(AddChild(), 0, 0, 0, string_buffers_omitted);
+ FillPrimitive(AddChild(), 0, 0, 0, all_buffers_omitted);
+ FillUnionLike(UnionMode::DENSE, 0, 0, 3, 2, all_buffers_omitted, /*legacy=*/false);
}
TEST_F(TestArrayImport, StructWithOffset) {
@@ -2359,6 +2453,29 @@ TEST_F(TestArrayImport, PrimitiveError) {
// Zero null bitmap but non-zero null_count
FillPrimitive(3, 1, 0, primitive_buffers_no_nulls1_8);
CheckImportError(int8());
+
+ // Null data pointers with non-zero length
+ FillPrimitive(1, 0, 0, all_buffers_omitted);
+ CheckImportError(int8());
+ FillPrimitive(1, 0, 0, all_buffers_omitted);
+ CheckImportError(boolean());
+ FillPrimitive(1, 0, 0, all_buffers_omitted);
+ CheckImportError(fixed_size_binary(3));
+}
+
+TEST_F(TestArrayImport, StringError) {
+ // Bad number of buffers
+ FillStringLike(4, 0, 0, string_buffers_no_nulls1);
+ c_struct_.n_buffers = 2;
+ CheckImportError(utf8());
+
+ // Null data pointers with non-zero length
+ FillStringLike(4, 0, 0, string_buffers_omitted);
+ CheckImportError(utf8());
+
+ // Null offsets pointer
+ FillStringLike(0, 0, 0, all_buffers_omitted);
+ CheckImportError(utf8());
}
TEST_F(TestArrayImport, StructError) {
@@ -2369,6 +2486,13 @@ TEST_F(TestArrayImport, StructError) {
CheckImportError(struct_({field("strs", utf8())}));
}
+TEST_F(TestArrayImport, ListError) {
+ // Null offsets pointer
+ FillPrimitive(AddChild(), 0, 0, 0, primitive_buffers_no_nulls1_8);
+ FillListLike(0, 0, 0, all_buffers_omitted);
+ CheckImportError(list(int8()));
+}
+
TEST_F(TestArrayImport, MapError) {
// Bad number of (struct) children in map child
FillStringLike(AddChild(), 5, 0, 0, string_buffers_no_nulls1);
@@ -2859,8 +2983,10 @@ TEST_F(TestArrayRoundtrip, UnknownNullCount) {
TEST_F(TestArrayRoundtrip, List) {
TestWithJSON(list(int32()), "[]");
TestWithJSON(list(int32()), "[[4, 5], [6, null], null]");
+ TestWithJSON(fixed_size_list(int32(), 3), "[[4, 5, 6], null, [7, 8, null]]");
TestWithJSONSliced(list(int32()), "[[4, 5], [6, null], null]");
+ TestWithJSONSliced(fixed_size_list(int32(), 3), "[[4, 5, 6], null, [7, 8, null]]");
}
TEST_F(TestArrayRoundtrip, Struct) {