You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2019/03/20 08:48:47 UTC

[arrow] branch master updated: ARROW-3208: [C++] Fix Cast dictionary to numeric segfault

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 37f898f  ARROW-3208: [C++] Fix Cast dictionary to numeric segfault
37f898f is described below

commit 37f898f9ca7d76448223407d436aeee6a81a8f7d
Author: François Saint-Jacques <fs...@gmail.com>
AuthorDate: Wed Mar 20 09:48:37 2019 +0100

    ARROW-3208: [C++] Fix Cast dictionary to numeric segfault
    
    Author: François Saint-Jacques <fs...@gmail.com>
    
    Closes #3978 from fsaintjacques/ARROW-3208-python-parquet-segfault-partition and squashes the following commits:
    
    8a4c2613 <François Saint-Jacques> ARROW-3208:  Fix Cast dictionary to numeric segfault with nullptr Buffer
---
 cpp/src/arrow/compute/kernels/cast-test.cc | 17 +++++++++++++++++
 cpp/src/arrow/compute/kernels/cast.cc      | 19 +++++++++++++------
 python/pyarrow/tests/test_parquet.py       | 16 ++++++++++++++++
 3 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc
index 271b233..8cf71d1 100644
--- a/cpp/src/arrow/compute/kernels/cast-test.cc
+++ b/cpp/src/arrow/compute/kernels/cast-test.cc
@@ -33,6 +33,7 @@
 #include "arrow/table.h"
 #include "arrow/testing/gtest_common.h"
 #include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
 #include "arrow/type.h"
 #include "arrow/type_fwd.h"
 #include "arrow/type_traits.h"
@@ -1116,6 +1117,22 @@ TYPED_TEST(TestDictionaryCast, Basic) {
   this->CheckPass(*MakeArray(out.array()), *plain_array, plain_array->type(), options);
 }
 
+TEST_F(TestCast, DictToNumericNoNulls) {
+  // ARROW-3208
+  CastOptions options;
+
+  // Convoluted way to create an array with nullptr bitmap buffer
+  auto array_ = _MakeArray<Int32Type, int32_t>(int32(), {1, 2, 3, 4, 5, 6}, {});
+  auto data = array_->data();
+  data->buffers[0] = nullptr;
+  auto array = MakeArray(data);
+
+  Datum encoded;
+  ASSERT_OK(DictionaryEncode(&this->ctx_, array->data(), &encoded));
+
+  this->CheckPass(*MakeArray(encoded.array()), *array, array->type(), options);
+}
+
 TEST_F(TestCast, DictToNonDictNoNulls) {
   vector<std::string> dict_values = {"foo", "bar", "baz"};
   auto ex_dict = _MakeArray<StringType, std::string>(utf8(), dict_values, {});
diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc
index ee21803..2d3e1a8 100644
--- a/cpp/src/arrow/compute/kernels/cast.cc
+++ b/cpp/src/arrow/compute/kernels/cast.cc
@@ -842,15 +842,22 @@ struct CastFunctor<T, DictionaryType,
 template <typename IndexType, typename c_type>
 void UnpackPrimitiveDictionary(const Array& indices, const c_type* dictionary,
                                c_type* out) {
-  internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(),
-                                           indices.length());
+  const auto& in = indices.data()->GetValues<typename IndexType::c_type>(1);
+  int64_t length = indices.length();
 
-  auto in = indices.data()->GetValues<typename IndexType::c_type>(1);
-  for (int64_t i = 0; i < indices.length(); ++i) {
-    if (valid_bits_reader.IsSet()) {
+  if (indices.null_count() == 0) {
+    for (int64_t i = 0; i < length; ++i) {
       out[i] = dictionary[in[i]];
     }
-    valid_bits_reader.Next();
+  } else {
+    auto null_bitmap = indices.null_bitmap_data();
+    internal::BitmapReader valid_bits_reader(null_bitmap, indices.offset(), length);
+    for (int64_t i = 0; i < length; ++i) {
+      if (valid_bits_reader.IsSet()) {
+        out[i] = dictionary[in[i]];
+      }
+      valid_bits_reader.Next();
+    }
   }
 }
 
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 8406f36..34d0956 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2479,3 +2479,19 @@ def test_write_nested_zero_length_array_chunk_failure():
                   for batch in my_arrays]
     tbl = pa.Table.from_batches(my_batches, pa.schema(cols))
     _check_roundtrip(tbl)
+
+
+def test_partitioned_dataset(tempdir):
+    # ARROW-3208: Segmentation fault when reading a partitioned Parquet
+    # dataset and writing it back to a single Parquet file
+    path = tempdir / "ARROW-3208"
+    df = pd.DataFrame({
+        'one': [-1, 10, 2.5, 100, 1000, 1, 29.2],
+        'two': [-1, 10, 2, 100, 1000, 1, 11],
+        'three': [0, 0, 0, 0, 0, 0, 0]
+    })
+    table = pa.Table.from_pandas(df)
+    pq.write_to_dataset(table, root_path=str(path),
+                        partition_cols=['one', 'two'])
+    table = pq.ParquetDataset(path).read()
+    pq.write_table(table, path / "output.parquet")