You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2020/04/02 17:39:22 UTC

[arrow] branch master updated: ARROW-7008: [C++] Check binary offsets and data buffers for nullness in validation. Produce valid arrays in DictionaryEncode on zero-length arrays

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 15936cf  ARROW-7008: [C++] Check binary offsets and data buffers for nullness in validation. Produce valid arrays in DictionaryEncode on zero-length arrays
15936cf is described below

commit 15936cf4f88622d0adf62c53b102f803558a41b6
Author: Wes McKinney <we...@apache.org>
AuthorDate: Thu Apr 2 12:38:43 2020 -0500

    ARROW-7008: [C++] Check binary offsets and data buffers for nullness in validation. Produce valid arrays in DictionaryEncode on zero-length arrays
    
    A couple of related issues:
    
    * Validate() and ValidateFull() did not check the offsets and data buffers of BinaryArray for nullness. We might relax the requirements of the columnar format docs to allow them to be null, but that should be a separate matter
    * DictionaryEncode (and other hash kernels) were producing invalid dictionaries for zero-length BinaryArray input
    
    Closes #6803 from wesm/ARROW-7008
    
    Authored-by: Wes McKinney <we...@apache.org>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/src/arrow/array/dict_internal.h        | 12 +++++-------
 cpp/src/arrow/array/validate.cc            |  8 +++++++-
 cpp/src/arrow/array_dict_test.cc           | 13 +++++++++++++
 cpp/src/arrow/compute/kernels/hash_test.cc | 16 ++++++++++++++++
 python/pyarrow/tests/test_array.py         |  8 ++++++++
 5 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/cpp/src/arrow/array/dict_internal.h b/cpp/src/arrow/array/dict_internal.h
index 0125dfb..a9ac06c 100644
--- a/cpp/src/arrow/array/dict_internal.h
+++ b/cpp/src/arrow/array/dict_internal.h
@@ -133,17 +133,15 @@ struct DictionaryTraits<T, enable_if_base_binary<T>> {
 
     // Create the offsets buffer
     auto dict_length = static_cast<int64_t>(memo_table.size() - start_offset);
-    if (dict_length > 0) {
-      RETURN_NOT_OK(
-          AllocateBuffer(pool, sizeof(offset_type) * (dict_length + 1), &dict_offsets));
-      auto raw_offsets = reinterpret_cast<offset_type*>(dict_offsets->mutable_data());
-      memo_table.CopyOffsets(static_cast<int32_t>(start_offset), raw_offsets);
-    }
+    RETURN_NOT_OK(
+        AllocateBuffer(pool, sizeof(offset_type) * (dict_length + 1), &dict_offsets));
+    auto raw_offsets = reinterpret_cast<offset_type*>(dict_offsets->mutable_data());
+    memo_table.CopyOffsets(static_cast<int32_t>(start_offset), raw_offsets);
 
     // Create the data buffer
     auto values_size = memo_table.values_size();
+    RETURN_NOT_OK(AllocateBuffer(pool, values_size, &dict_data));
     if (values_size > 0) {
-      RETURN_NOT_OK(AllocateBuffer(pool, values_size, &dict_data));
       memo_table.CopyValues(static_cast<int32_t>(start_offset), dict_data->size(),
                             dict_data->mutable_data());
     }
diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc
index 6a687fa..c6ad260 100644
--- a/cpp/src/arrow/array/validate.cc
+++ b/cpp/src/arrow/array/validate.cc
@@ -207,6 +207,9 @@ struct ValidateArrayVisitor {
     if (array.data()->buffers.size() != 3) {
       return Status::Invalid("number of buffers is != 3");
     }
+    if (array.value_data() == nullptr) {
+      return Status::Invalid("value data buffer is null");
+    }
     return ValidateOffsets(array);
   }
 
@@ -457,7 +460,7 @@ struct ValidateArrayDataVisitor {
     if (!indices_status.ok()) {
       return Status::Invalid("Dictionary indices invalid: ", indices_status.ToString());
     }
-    return Status::OK();
+    return ValidateArrayData(*array.dictionary());
   }
 
   Status Visit(const ExtensionArray& array) {
@@ -467,6 +470,9 @@ struct ValidateArrayDataVisitor {
  protected:
   template <typename BinaryArrayType>
   Status ValidateBinaryArray(const BinaryArrayType& array) {
+    if (array.value_data() == nullptr) {
+      return Status::Invalid("value data buffer is null");
+    }
     return ValidateOffsets(array, array.value_data()->size());
   }
 
diff --git a/cpp/src/arrow/array_dict_test.cc b/cpp/src/arrow/array_dict_test.cc
index a1120dc..52e5e98 100644
--- a/cpp/src/arrow/array_dict_test.cc
+++ b/cpp/src/arrow/array_dict_test.cc
@@ -980,6 +980,19 @@ TEST(TestDictionary, Validate) {
   // Only checking index type for now
   ASSERT_OK(arr->ValidateFull());
 
+  // ARROW-7008: Invalid dict was not being validated
+  std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, nullptr, nullptr};
+  auto invalid_data = std::make_shared<ArrayData>(utf8(), 0, buffers);
+
+  indices = ArrayFromJSON(int16(), "[]");
+  arr = std::make_shared<DictionaryArray>(dict_type, indices, MakeArray(invalid_data));
+  ASSERT_RAISES(Invalid, arr->ValidateFull());
+
+  // Make the data buffer non-null
+  ASSERT_OK(AllocateBuffer(0, &buffers[2]));
+  arr = std::make_shared<DictionaryArray>(dict_type, indices, MakeArray(invalid_data));
+  ASSERT_RAISES(Invalid, arr->ValidateFull());
+
   ASSERT_DEATH(
       {
         std::shared_ptr<Array> null_dict_arr =
diff --git a/cpp/src/arrow/compute/kernels/hash_test.cc b/cpp/src/arrow/compute/kernels/hash_test.cc
index 0797f61..7ab5223 100644
--- a/cpp/src/arrow/compute/kernels/hash_test.cc
+++ b/cpp/src/arrow/compute/kernels/hash_test.cc
@@ -37,6 +37,7 @@
 #include "arrow/testing/util.h"
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
 #include "arrow/util/decimal.h"
 
 #include "arrow/compute/context.h"
@@ -48,6 +49,9 @@
 #include "arrow/ipc/json_simple.h"
 
 namespace arrow {
+
+using internal::checked_cast;
+
 namespace compute {
 
 using StringTypes =
@@ -644,6 +648,18 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) {
   AssertChunkedEqual(*dict_carr, *encoded_out.chunked_array());
 }
 
+TEST_F(TestHashKernel, ZeroLengthDictionaryEncode) {
+  // ARROW-7008
+  auto values = ArrayFromJSON(utf8(), "[]");
+  Datum datum_result;
+  ASSERT_OK(DictionaryEncode(&this->ctx_, values, &datum_result));
+
+  std::shared_ptr<Array> result = datum_result.make_array();
+  const auto& dict_result = checked_cast<const DictionaryArray&>(*result);
+  ASSERT_OK(dict_result.Validate());
+  ASSERT_OK(dict_result.ValidateFull());
+}
+
 TEST_F(TestHashKernel, ChunkedArrayZeroChunk) {
   // ARROW-6857
   auto chunked_array = std::make_shared<ChunkedArray>(ArrayVector{}, utf8());
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 18f28f6..d7cf989 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -1168,6 +1168,14 @@ def test_dictionary_encode_sliced():
         assert result.type == expected.type
 
 
+def test_dictionary_encode_zero_length():
+    # User-facing experience of ARROW-7008
+    arr = pa.array([], type=pa.string())
+    encoded = arr.dictionary_encode()
+    assert len(encoded.dictionary) == 0
+    encoded.validate(full=True)
+
+
 def test_cast_time32_to_int():
     arr = pa.array(np.array([0, 1, 2], dtype='int32'),
                    type=pa.time32('s'))