You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/07/18 13:37:11 UTC
arrow git commit: ARROW-1188: [Python] Handle Feather case where
category values are null type
Repository: arrow
Updated Branches:
refs/heads/master 6035d9bd4 -> 8152433e7
ARROW-1188: [Python] Handle Feather case where category values are null type
We had already handled a dense null array in ARROW-1187, but the dictionary write path was circumventing that code.
Author: Wes McKinney <we...@twosigma.com>
Closes #863 from wesm/ARROW-1188 and squashes the following commits:
43e970d1 [Wes McKinney] Intercept NullArray in the Dictionary Feather write path so that it is written as an all-null StringArray, as with a non-dictionary NullArray
a31e3358 [Wes McKinney] Add failing test case
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/8152433e
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/8152433e
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/8152433e
Branch: refs/heads/master
Commit: 8152433e7815630bf225aade1ffe53b20b393553
Parents: 6035d9b
Author: Wes McKinney <we...@twosigma.com>
Authored: Tue Jul 18 09:37:06 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Jul 18 09:37:06 2017 -0400
----------------------------------------------------------------------
cpp/src/arrow/ipc/feather.cc | 26 ++++++++++++++++++++------
python/pyarrow/tests/test_feather.py | 6 ++++++
2 files changed, 26 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/8152433e/cpp/src/arrow/ipc/feather.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc
index 1bcd505..d5b938b 100644
--- a/cpp/src/arrow/ipc/feather.cc
+++ b/cpp/src/arrow/ipc/feather.cc
@@ -497,6 +497,19 @@ fbs::Type ToFlatbufferType(Type::type type) {
return fbs::Type_MIN;
}
+static Status SanitizeUnsupportedTypes(
+ const Array& values, std::shared_ptr<Array>* out) {
+ if (values.type_id() == Type::NA) {
+ // As long as R doesn't support NA, we write this as a StringColumn
+ // to ensure stable roundtrips.
+ *out = std::make_shared<StringArray>(
+ values.length(), nullptr, nullptr, values.null_bitmap(), values.null_count());
+ return Status::OK();
+ } else {
+ return MakeArray(values.data(), out);
+ }
+}
+
class TableWriter::TableWriterImpl : public ArrayVisitor {
public:
TableWriterImpl() : initialized_stream_(false), metadata_(0) {}
@@ -622,11 +635,9 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
}
Status Visit(const NullArray& values) override {
- // As long as R doesn't support NA, we write this as a StringColumn
- // to ensure stable roundtrips.
- StringArray str_values(
- values.length(), nullptr, nullptr, values.null_bitmap(), values.null_count());
- return WritePrimitiveValues(str_values);
+ std::shared_ptr<Array> sanitized_nulls;
+ RETURN_NOT_OK(SanitizeUnsupportedTypes(values, &sanitized_nulls));
+ return WritePrimitiveValues(*sanitized_nulls);
}
#define VISIT_PRIMITIVE(TYPE) \
@@ -658,7 +669,10 @@ class TableWriter::TableWriterImpl : public ArrayVisitor {
RETURN_NOT_OK(WritePrimitiveValues(*values.indices()));
ArrayMetadata levels_meta;
- RETURN_NOT_OK(WriteArray(*dict_type.dictionary(), &levels_meta));
+ std::shared_ptr<Array> sanitized_dictionary;
+ RETURN_NOT_OK(
+ SanitizeUnsupportedTypes(*dict_type.dictionary(), &sanitized_dictionary));
+ RETURN_NOT_OK(WriteArray(*sanitized_dictionary, &levels_meta));
current_column_->SetCategory(levels_meta, dict_type.ordered());
return Status::OK();
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/8152433e/python/pyarrow/tests/test_feather.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py
index 7978ace..93d6736 100644
--- a/python/pyarrow/tests/test_feather.py
+++ b/python/pyarrow/tests/test_feather.py
@@ -298,6 +298,12 @@ class TestFeatherReader(unittest.TestCase):
df = pd.DataFrame({'all_none': [None] * 10})
self._check_pandas_roundtrip(df, null_counts=[10])
+ def test_all_null_category(self):
+ # ARROW-1188
+ df = pd.DataFrame({"A": (1, 2, 3), "B": (None, None, None)})
+ df = df.assign(B=df.B.astype("category"))
+ self._check_pandas_roundtrip(df, null_counts=[0, 3])
+
def test_multithreaded_read(self):
data = {'c{0}'.format(i): [''] * 10
for i in range(100)}