You are viewing a plain text version of this content; the hyperlink to the canonical version was not preserved in this text extraction.
Posted to commits@arrow.apache.org by uw...@apache.org on 2019/03/20 08:48:47 UTC
[arrow] branch master updated: ARROW-3208: [C++] Fix Cast
dictionary to numeric segfault
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 37f898f ARROW-3208: [C++] Fix Cast dictionary to numeric segfault
37f898f is described below
commit 37f898f9ca7d76448223407d436aeee6a81a8f7d
Author: François Saint-Jacques <fs...@gmail.com>
AuthorDate: Wed Mar 20 09:48:37 2019 +0100
ARROW-3208: [C++] Fix Cast dictionary to numeric segfault
Author: François Saint-Jacques <fs...@gmail.com>
Closes #3978 from fsaintjacques/ARROW-3208-python-parquet-segfault-partition and squashes the following commits:
8a4c2613 <François Saint-Jacques> ARROW-3208: Fix Cast dictionary to numeric segfault with nullptr Buffer
---
cpp/src/arrow/compute/kernels/cast-test.cc | 17 +++++++++++++++++
cpp/src/arrow/compute/kernels/cast.cc | 19 +++++++++++++------
python/pyarrow/tests/test_parquet.py | 16 ++++++++++++++++
3 files changed, 46 insertions(+), 6 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc
index 271b233..8cf71d1 100644
--- a/cpp/src/arrow/compute/kernels/cast-test.cc
+++ b/cpp/src/arrow/compute/kernels/cast-test.cc
@@ -33,6 +33,7 @@
#include "arrow/table.h"
#include "arrow/testing/gtest_common.h"
#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "arrow/type_traits.h"
@@ -1116,6 +1117,22 @@ TYPED_TEST(TestDictionaryCast, Basic) {
this->CheckPass(*MakeArray(out.array()), *plain_array, plain_array->type(), options);
}
+TEST_F(TestCast, DictToNumericNoNulls) {
+ // ARROW-3208
+ CastOptions options;
+
+ // Convoluted way to create an array with nullptr bitmap buffer
+ auto array_ = _MakeArray<Int32Type, int32_t>(int32(), {1, 2, 3, 4, 5, 6}, {});
+ auto data = array_->data();
+ data->buffers[0] = nullptr;
+ auto array = MakeArray(data);
+
+ Datum encoded;
+ ASSERT_OK(DictionaryEncode(&this->ctx_, array->data(), &encoded));
+
+ this->CheckPass(*MakeArray(encoded.array()), *array, array->type(), options);
+}
+
TEST_F(TestCast, DictToNonDictNoNulls) {
vector<std::string> dict_values = {"foo", "bar", "baz"};
auto ex_dict = _MakeArray<StringType, std::string>(utf8(), dict_values, {});
diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc
index ee21803..2d3e1a8 100644
--- a/cpp/src/arrow/compute/kernels/cast.cc
+++ b/cpp/src/arrow/compute/kernels/cast.cc
@@ -842,15 +842,22 @@ struct CastFunctor<T, DictionaryType,
template <typename IndexType, typename c_type>
void UnpackPrimitiveDictionary(const Array& indices, const c_type* dictionary,
c_type* out) {
- internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(),
- indices.length());
+ const auto& in = indices.data()->GetValues<typename IndexType::c_type>(1);
+ int64_t length = indices.length();
- auto in = indices.data()->GetValues<typename IndexType::c_type>(1);
- for (int64_t i = 0; i < indices.length(); ++i) {
- if (valid_bits_reader.IsSet()) {
+ if (indices.null_count() == 0) {
+ for (int64_t i = 0; i < length; ++i) {
out[i] = dictionary[in[i]];
}
- valid_bits_reader.Next();
+ } else {
+ auto null_bitmap = indices.null_bitmap_data();
+ internal::BitmapReader valid_bits_reader(null_bitmap, indices.offset(), length);
+ for (int64_t i = 0; i < length; ++i) {
+ if (valid_bits_reader.IsSet()) {
+ out[i] = dictionary[in[i]];
+ }
+ valid_bits_reader.Next();
+ }
}
}
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 8406f36..34d0956 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2479,3 +2479,19 @@ def test_write_nested_zero_length_array_chunk_failure():
for batch in my_arrays]
tbl = pa.Table.from_batches(my_batches, pa.schema(cols))
_check_roundtrip(tbl)
+
+
+def test_partitioned_dataset(tempdir):
+ # ARROW-3208: Segmentation fault when reading a Parquet partitioned dataset
+ # to a Parquet file
+ path = tempdir / "ARROW-3208"
+ df = pd.DataFrame({
+ 'one': [-1, 10, 2.5, 100, 1000, 1, 29.2],
+ 'two': [-1, 10, 2, 100, 1000, 1, 11],
+ 'three': [0, 0, 0, 0, 0, 0, 0]
+ })
+ table = pa.Table.from_pandas(df)
+ pq.write_to_dataset(table, root_path=str(path),
+ partition_cols=['one', 'two'])
+ table = pq.ParquetDataset(path).read()
+ pq.write_table(table, path / "output.parquet")