You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@arrow.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2018/04/21 14:37:00 UTC

[jira] [Commented] (ARROW-2314) [Python] Union array slicing is defective

    [ https://issues.apache.org/jira/browse/ARROW-2314?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16446833#comment-16446833 ] 

ASF GitHub Bot commented on ARROW-2314:
---------------------------------------

xhochy closed pull request #1757: ARROW-2314: [C++/Python] Fix union array slicing
URL: https://github.com/apache/arrow/pull/1757
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 80d64c8712..861b8dfacc 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -465,7 +465,16 @@ Status UnionArray::MakeSparse(const Array& type_ids,
 
 std::shared_ptr<Array> UnionArray::child(int i) const {
   if (!boxed_fields_[i]) {
-    boxed_fields_[i] = MakeArray(data_->child_data[i]);
+    std::shared_ptr<ArrayData> child_data = data_->child_data[i];
+    if (mode() == UnionMode::SPARSE) {
+      // Sparse union: need to adjust child if union is sliced
+      // (for dense unions, the need to lookup through the offsets
+      //  makes this unnecessary)
+      if (data_->offset != 0 || child_data->length > data_->length) {
+        child_data = SliceData(*child_data.get(), data_->offset, data_->length);
+      }
+    }
+    boxed_fields_[i] = MakeArray(child_data);
   }
   DCHECK(boxed_fields_[i]);
   return boxed_fields_[i];
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 07e7a13252..d66e849be3 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -674,6 +674,10 @@ class ARROW_EXPORT UnionArray : public Array {
 
   UnionMode::type mode() const { return static_cast<const UnionType&>(*type()).mode(); }
 
+  // Return the given field as an individual array.
+  // For sparse unions, the returned array has its offset, length and null
+  // count adjusted.
+  // For dense unions, the returned array is unchanged.
   std::shared_ptr<Array> child(int pos) const;
 
   /// Only use this while the UnionArray is in scope
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 002d4b8520..b7b93130c4 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -189,15 +189,10 @@ class RangeEqualsVisitor {
       id = left_ids[i];
       child_num = type_id_to_child_num[id];
 
-      const int64_t left_abs_index = i + left.offset();
-      const int64_t right_abs_index = o_i + right.offset();
-
       // TODO(wesm): really we should be comparing stretches of non-null data
       // rather than looking at one value at a time.
       if (union_mode == UnionMode::SPARSE) {
-        if (!left.child(child_num)->RangeEquals(left_abs_index, left_abs_index + 1,
-                                                right_abs_index,
-                                                right.child(child_num))) {
+        if (!left.child(child_num)->RangeEquals(i, i + 1, o_i, right.child(child_num))) {
           return false;
         }
       } else {
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index 86f2ed1d68..cdb3399954 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -409,7 +409,7 @@ class RecordBatchSerializer : public ArrayVisitor {
 
       // Allocate an array of child offsets. Set all to -1 to indicate that we
       // haven't observed a first occurrence of a particular child yet
-      std::vector<int32_t> child_offsets(max_code + 1);
+      std::vector<int32_t> child_offsets(max_code + 1, -1);
       std::vector<int32_t> child_lengths(max_code + 1, 0);
 
       if (offset != 0) {
@@ -427,16 +427,23 @@ class RecordBatchSerializer : public ArrayVisitor {
         int32_t* shifted_offsets =
             reinterpret_cast<int32_t*>(shifted_offsets_buffer->mutable_data());
 
+        // Offsets may not be ascending, so we need to find out the start offset
+        // for each child
         for (int64_t i = 0; i < length; ++i) {
           const uint8_t code = type_ids[i];
-          int32_t shift = child_offsets[code];
-          if (shift == -1) {
-            child_offsets[code] = shift = unshifted_offsets[i];
+          if (child_offsets[code] == -1) {
+            child_offsets[code] = unshifted_offsets[i];
+          } else {
+            child_offsets[code] = std::min(child_offsets[code], unshifted_offsets[i]);
           }
-          shifted_offsets[i] = unshifted_offsets[i] - shift;
+        }
 
+        // Now compute shifted offsets by subtracting child offset
+        for (int64_t i = 0; i < length; ++i) {
+          const uint8_t code = type_ids[i];
+          shifted_offsets[i] = unshifted_offsets[i] - child_offsets[code];
           // Update the child length to account for observed value
-          ++child_lengths[code];
+          child_lengths[code] = std::max(child_lengths[code], shifted_offsets[i] + 1);
         }
 
         value_offsets = shifted_offsets_buffer;
@@ -449,24 +456,25 @@ class RecordBatchSerializer : public ArrayVisitor {
 
         // TODO: ARROW-809, for sliced unions, tricky to know how much to
         // truncate the children. For now, we are truncating the children to be
-        // no longer than the parent union
-        const uint8_t code = type.type_codes()[i];
-        const int64_t child_length = child_lengths[code];
-        if (offset != 0 || length < child_length) {
-          child = child->Slice(child_offsets[code], std::min(length, child_length));
+        // no longer than the parent union.
+        if (offset != 0) {
+          const uint8_t code = type.type_codes()[i];
+          const int64_t child_offset = child_offsets[code];
+          const int64_t child_length = child_lengths[code];
+
+          if (child_offset > 0) {
+            child = child->Slice(child_offset, child_length);
+          } else if (child_length < child->length()) {
+            // This case includes when child is not encountered at all
+            child = child->Slice(0, child_length);
+          }
         }
         RETURN_NOT_OK(VisitArray(*child));
       }
     } else {
       for (int i = 0; i < array.num_fields(); ++i) {
-        std::shared_ptr<Array> child = array.child(i);
-
-        // Sparse union, slicing is simpler
-        if (offset != 0 || length < child->length()) {
-          // If offset is non-zero, slice the child array
-          child = child->Slice(offset, length);
-        }
-        RETURN_NOT_OK(VisitArray(*child));
+        // Sparse union, slicing is done for us by child()
+        RETURN_NOT_OK(VisitArray(*array.child(i)));
       }
     }
     ++max_recursion_depth_;
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 4a441fb97f..cf33aaa530 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -333,6 +333,25 @@ def test_union_from_sparse():
     assert result.to_pylist() == [b'a', 1, b'b', b'c', 2, 3, b'd']
 
 
+def test_union_array_slice():
+    # ARROW-2314
+    arr = pa.UnionArray.from_sparse(pa.array([0, 0, 1, 1], type=pa.int8()),
+                                    [pa.array(["a", "b", "c", "d"]),
+                                     pa.array([1, 2, 3, 4])])
+    assert arr[1:].to_pylist() == ["b", 3, 4]
+
+    binary = pa.array([b'a', b'b', b'c', b'd'], type='binary')
+    int64 = pa.array([1, 2, 3], type='int64')
+    types = pa.array([0, 1, 0, 0, 1, 1, 0], type='int8')
+    value_offsets = pa.array([0, 0, 2, 1, 1, 2, 3], type='int32')
+
+    arr = pa.UnionArray.from_dense(types, value_offsets, [binary, int64])
+    lst = arr.to_pylist()
+    for i in range(len(arr)):
+        for j in range(i, len(arr)):
+            assert arr[i:j].to_pylist() == lst[i:j]
+
+
 def test_string_from_buffers():
     array = pa.array(["a", None, "b", "c"])
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> [Python] Union array slicing is defective
> -----------------------------------------
>
>                 Key: ARROW-2314
>                 URL: https://issues.apache.org/jira/browse/ARROW-2314
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C++, Python
>    Affects Versions: 0.8.0
>            Reporter: Antoine Pitrou
>            Assignee: Antoine Pitrou
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.10.0
>
>
> {code:python}
> >>> a = pa.UnionArray.from_sparse(pa.array([0,1,1], type=pa.int8()), [pa.array(["a", "b", "c"]), pa.array([2,3,4])])
> >>> a
> <pyarrow.lib.UnionArray object at 0x7fe9381304a8>
> [
>   'a',
>   3,
>   4
> ]
> >>> a[1:]
> <pyarrow.lib.UnionArray object at 0x7fe939409598>
> [
>   2,
>   3
> ]
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)