You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/01/28 16:33:58 UTC

[arrow] branch master updated: ARROW-1992: [C++/Python] Fix segfault when string to categorical empty string array

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new d322634  ARROW-1992: [C++/Python] Fix segfault when string to categorical empty string array
d322634 is described below

commit d3226349fc61a0ffbb2139f259053ae787e500c8
Author: Licht-T <li...@outlook.jp>
AuthorDate: Sun Jan 28 17:33:20 2018 +0100

    ARROW-1992: [C++/Python] Fix segfault when converting an array of empty strings to categorical
    
    This closes [ARROW-1992](https://issues.apache.org/jira/browse/ARROW-1992).
    
    Author: Licht-T <li...@outlook.jp>
    
    Closes #1508 from Licht-T/fix-segfault-when-string_to_categorical-empty-string-array and squashes the following commits:
    
    afea4be [Licht-T] BUG: Fix segfault in to_pandas on an array of empty strings with strings_to_categorical=True
    f90e7b8 [Licht-T] TST: Add test for to_pandas on an array of empty strings with strings_to_categorical=True
---
 cpp/src/arrow/compute/kernels/hash.cc       |  8 +++++++-
 python/pyarrow/tests/test_convert_pandas.py | 15 +++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/compute/kernels/hash.cc b/cpp/src/arrow/compute/kernels/hash.cc
index 8fac796..acbf403 100644
--- a/cpp/src/arrow/compute/kernels/hash.cc
+++ b/cpp/src/arrow/compute/kernels/hash.cc
@@ -407,12 +407,18 @@ class HashTableKernel<Type, Action, enable_if_binary<Type>> : public HashTable {
   }
 
   Status Append(const ArrayData& arr) override {
+    constexpr uint8_t empty_value = 0;
     if (!initialized_) {
       RETURN_NOT_OK(Init());
     }
 
     const int32_t* offsets = GetValues<int32_t>(arr, 1);
-    const uint8_t* data = GetValues<uint8_t>(arr, 2);
+    const uint8_t* data;
+    if (arr.buffers[2].get() == nullptr) {
+      data = &empty_value;
+    } else {
+      data = GetValues<uint8_t>(arr, 2);
+    }
 
     auto action = static_cast<Action*>(this);
     RETURN_NOT_OK(action->Reserve(arr.length));
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 5acb9c3..fa265e5 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1237,6 +1237,21 @@ class TestPandasConversion(object):
         assert data_column['numpy_type'] == 'object'
         assert data_column['metadata'] == {'precision': 26, 'scale': 11}
 
+    def test_table_empty_str(self):
+        values = ['', '', '', '', '']
+        df = pd.DataFrame({'strings': values})
+        field = pa.field('strings', pa.string())
+        schema = pa.schema([field])
+        table = pa.Table.from_pandas(df, schema=schema)
+
+        result1 = table.to_pandas(strings_to_categorical=False)
+        expected1 = pd.DataFrame({'strings': values})
+        tm.assert_frame_equal(result1, expected1, check_dtype=True)
+
+        result2 = table.to_pandas(strings_to_categorical=True)
+        expected2 = pd.DataFrame({'strings': pd.Categorical(values)})
+        tm.assert_frame_equal(result2, expected2, check_dtype=True)
+
     def test_table_str_to_categorical_without_na(self):
         values = ['a', 'a', 'b', 'b', 'c']
         df = pd.DataFrame({'strings': values})

-- 
To stop receiving notification emails like this one, please contact
uwe@apache.org.