You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/07/18 13:37:11 UTC

arrow git commit: ARROW-1188: [Python] Handle Feather case where category values are null type

Repository: arrow
Updated Branches:
  refs/heads/master 6035d9bd4 -> 8152433e7


ARROW-1188: [Python] Handle Feather case where category values are null type

We had already handled a dense null array in ARROW-1187, but the dictionary write path was circumventing that code.

Author: Wes McKinney <we...@twosigma.com>

Closes #863 from wesm/ARROW-1188 and squashes the following commits:

43e970d1 [Wes McKinney] Intercept NullArray in Dictionary Feather write path so that it is written as an all-null StringArray, as with non-dictionary NullArray
a31e3358 [Wes McKinney] Add failing test case


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/8152433e
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/8152433e
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/8152433e

Branch: refs/heads/master
Commit: 8152433e7815630bf225aade1ffe53b20b393553
Parents: 6035d9b
Author: Wes McKinney <we...@twosigma.com>
Authored: Tue Jul 18 09:37:06 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Jul 18 09:37:06 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/ipc/feather.cc         | 26 ++++++++++++++++++++------
 python/pyarrow/tests/test_feather.py |  6 ++++++
 2 files changed, 26 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/8152433e/cpp/src/arrow/ipc/feather.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc
index 1bcd505..d5b938b 100644
--- a/cpp/src/arrow/ipc/feather.cc
+++ b/cpp/src/arrow/ipc/feather.cc
@@ -497,6 +497,19 @@ fbs::Type ToFlatbufferType(Type::type type) {
   return fbs::Type_MIN;
 }
 
+static Status SanitizeUnsupportedTypes(
+    const Array& values, std::shared_ptr<Array>* out) {
+  if (values.type_id() == Type::NA) {
+    // As long as R doesn't support NA, we write this as a StringColumn
+    // to ensure stable roundtrips.
+    *out = std::make_shared<StringArray>(
+        values.length(), nullptr, nullptr, values.null_bitmap(), values.null_count());
+    return Status::OK();
+  } else {
+    return MakeArray(values.data(), out);
+  }
+}
+
 class TableWriter::TableWriterImpl : public ArrayVisitor {
  public:
   TableWriterImpl() : initialized_stream_(false), metadata_(0) {}
@@ -622,11 +635,9 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
   }
 
   Status Visit(const NullArray& values) override {
-    // As long as R doesn't support NA, we write this as a StringColumn
-    // to ensure stable roundtrips.
-    StringArray str_values(
-        values.length(), nullptr, nullptr, values.null_bitmap(), values.null_count());
-    return WritePrimitiveValues(str_values);
+    std::shared_ptr<Array> sanitized_nulls;
+    RETURN_NOT_OK(SanitizeUnsupportedTypes(values, &sanitized_nulls));
+    return WritePrimitiveValues(*sanitized_nulls);
   }
 
 #define VISIT_PRIMITIVE(TYPE) \
@@ -658,7 +669,10 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
     RETURN_NOT_OK(WritePrimitiveValues(*values.indices()));
 
     ArrayMetadata levels_meta;
-    RETURN_NOT_OK(WriteArray(*dict_type.dictionary(), &levels_meta));
+    std::shared_ptr<Array> sanitized_dictionary;
+    RETURN_NOT_OK(
+        SanitizeUnsupportedTypes(*dict_type.dictionary(), &sanitized_dictionary));
+    RETURN_NOT_OK(WriteArray(*sanitized_dictionary, &levels_meta));
     current_column_->SetCategory(levels_meta, dict_type.ordered());
     return Status::OK();
   }

http://git-wip-us.apache.org/repos/asf/arrow/blob/8152433e/python/pyarrow/tests/test_feather.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 7978ace..93d6736 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -298,6 +298,12 @@ class TestFeatherReader(unittest.TestCase):
         df = pd.DataFrame({'all_none': [None] * 10})
         self._check_pandas_roundtrip(df, null_counts=[10])
 
+    def test_all_null_category(self):
+        # ARROW-1188
+        df = pd.DataFrame({"A": (1, 2, 3), "B": (None, None, None)})
+        df = df.assign(B=df.B.astype("category"))
+        self._check_pandas_roundtrip(df, null_counts=[0, 3])
+
     def test_multithreaded_read(self):
         data = {'c{0}'.format(i): [''] * 10
                 for i in range(100)}