You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/02/10 18:15:53 UTC

arrow git commit: ARROW-544: [C++] Test writing zero-length record batches, zero-length BinaryArray fixes

Repository: arrow
Updated Branches:
  refs/heads/master 0ab425245 -> 42b55d98c


ARROW-544: [C++] Test writing zero-length record batches, zero-length BinaryArray fixes

I believe this should fix the failure reported in the Spark integration work. We'll need to upgrade the conda test packages to verify. cc @BryanCutler

Author: Wes McKinney <we...@twosigma.com>

Closes #333 from wesm/ARROW-544 and squashes the following commits:

f80d58f [Wes McKinney] Protect zero-length record batches from incomplete buffer metadata
f876dce [Wes McKinney] Test with null value_offsets too
1dc7733 [Wes McKinney] Test writing zero-length record batches, misc zero-length fixes


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/42b55d98
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/42b55d98
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/42b55d98

Branch: refs/heads/master
Commit: 42b55d98c151d87e5a7c6442800f3e5b9316499b
Parents: 0ab4252
Author: Wes McKinney <we...@twosigma.com>
Authored: Fri Feb 10 13:15:39 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Feb 10 13:15:39 2017 -0500

----------------------------------------------------------------------
 cpp/src/arrow/array-string-test.cc    |  4 ++
 cpp/src/arrow/array.cc                | 11 +++-
 cpp/src/arrow/ipc/adapter.cc          | 27 ++++++---
 cpp/src/arrow/ipc/ipc-adapter-test.cc | 89 +++++++++++++++++++++---------
 4 files changed, 94 insertions(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/42b55d98/cpp/src/arrow/array-string-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-string-test.cc b/cpp/src/arrow/array-string-test.cc
index c4d9bf4..d8a3585 100644
--- a/cpp/src/arrow/array-string-test.cc
+++ b/cpp/src/arrow/array-string-test.cc
@@ -470,4 +470,8 @@ TEST_F(TestStringArray, TestSliceEquality) {
   CheckSliceEquality<BinaryType>();
 }
 
+TEST_F(TestBinaryArray, LengthZeroCtor) {
+  BinaryArray array(0, nullptr, nullptr);
+}
+
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/42b55d98/cpp/src/arrow/array.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index bf368d9..81678e3 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -270,9 +270,12 @@ BinaryArray::BinaryArray(const std::shared_ptr<DataType>& type, int32_t length,
     const std::shared_ptr<Buffer>& null_bitmap, int32_t null_count, int32_t offset)
     : Array(type, length, null_bitmap, null_count, offset),
       value_offsets_(value_offsets),
-      raw_value_offsets_(reinterpret_cast<const int32_t*>(value_offsets_->data())),
+      raw_value_offsets_(nullptr),
       data_(data),
       raw_data_(nullptr) {
+  if (value_offsets_ != nullptr) {
+    raw_value_offsets_ = reinterpret_cast<const int32_t*>(value_offsets_->data());
+  }
   if (data_ != nullptr) { raw_data_ = data_->data(); }
 }
 
@@ -384,8 +387,10 @@ UnionArray::UnionArray(const std::shared_ptr<DataType>& type, int32_t length,
     : Array(type, length, null_bitmap, null_count, offset),
       children_(children),
       type_ids_(type_ids),
-      value_offsets_(value_offsets) {
-  raw_type_ids_ = reinterpret_cast<const uint8_t*>(type_ids->data());
+      raw_type_ids_(nullptr),
+      value_offsets_(value_offsets),
+      raw_value_offsets_(nullptr) {
+  if (type_ids) { raw_type_ids_ = reinterpret_cast<const uint8_t*>(type_ids->data()); }
   if (value_offsets) {
     raw_value_offsets_ = reinterpret_cast<const int32_t*>(value_offsets->data());
   }

http://git-wip-us.apache.org/repos/asf/arrow/blob/42b55d98/cpp/src/arrow/ipc/adapter.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc
index 3613ccb..f36ff37 100644
--- a/cpp/src/arrow/ipc/adapter.cc
+++ b/cpp/src/arrow/ipc/adapter.cc
@@ -602,8 +602,13 @@ class ArrayLoader : public TypeVisitor {
 
     std::shared_ptr<Buffer> offsets;
     std::shared_ptr<Buffer> values;
-    RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &offsets));
-    RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &values));
+    if (field_meta.length > 0) {
+      RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &offsets));
+      RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &values));
+    } else {
+      context_->buffer_index += 2;
+      offsets = values = nullptr;
+    }
 
     result_ = std::make_shared<CONTAINER>(
         field_meta.length, offsets, values, null_bitmap, field_meta.null_count);
@@ -661,7 +666,12 @@ class ArrayLoader : public TypeVisitor {
     RETURN_NOT_OK(LoadCommon(&field_meta, &null_bitmap));
 
     std::shared_ptr<Buffer> offsets;
-    RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &offsets));
+    if (field_meta.length > 0) {
+      RETURN_NOT_OK(GetBuffer(context_->buffer_index, &offsets));
+    } else {
+      offsets = nullptr;
+    }
+    ++context_->buffer_index;
 
     const int num_children = type.num_children();
     if (num_children != 1) {
@@ -708,13 +718,16 @@ class ArrayLoader : public TypeVisitor {
     std::shared_ptr<Buffer> null_bitmap;
     RETURN_NOT_OK(LoadCommon(&field_meta, &null_bitmap));
 
-    std::shared_ptr<Buffer> type_ids;
+    std::shared_ptr<Buffer> type_ids = nullptr;
     std::shared_ptr<Buffer> offsets = nullptr;
-    RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &type_ids));
 
-    if (type.mode == UnionMode::DENSE) {
-      RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &offsets));
+    if (field_meta.length > 0) {
+      RETURN_NOT_OK(GetBuffer(context_->buffer_index, &type_ids));
+      if (type.mode == UnionMode::DENSE) {
+        RETURN_NOT_OK(GetBuffer(context_->buffer_index + 1, &offsets));
+      }
     }
+    context_->buffer_index += type.mode == UnionMode::DENSE? 2 : 1;
 
     std::vector<std::shared_ptr<Array>> fields;
     RETURN_NOT_OK(LoadChildren(type.children(), &fields));

http://git-wip-us.apache.org/repos/asf/arrow/blob/42b55d98/cpp/src/arrow/ipc/ipc-adapter-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc
index bae6578..d11b95b 100644
--- a/cpp/src/arrow/ipc/ipc-adapter-test.cc
+++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc
@@ -71,6 +71,42 @@ class TestWriteRecordBatch : public ::testing::TestWithParam<MakeRecordBatch*>,
     return ReadRecordBatch(metadata, batch.schema(), &buffer_reader, batch_result);
   }
 
+  void CheckRoundtrip(const RecordBatch& batch, int64_t buffer_size) {
+    std::shared_ptr<RecordBatch> batch_result;
+
+    ASSERT_OK(RoundTripHelper(batch, 1 << 16, &batch_result));
+    EXPECT_EQ(batch.num_rows(), batch_result->num_rows());
+
+    ASSERT_TRUE(batch.schema()->Equals(batch_result->schema()));
+    ASSERT_EQ(batch.num_columns(), batch_result->num_columns())
+        << batch.schema()->ToString()
+        << " result: " << batch_result->schema()->ToString();
+
+    for (int i = 0; i < batch.num_columns(); ++i) {
+      const auto& left = *batch.column(i);
+      const auto& right = *batch_result->column(i);
+      if (!left.Equals(right)) {
+        std::stringstream pp_result;
+        std::stringstream pp_expected;
+
+        ASSERT_OK(PrettyPrint(left, 0, &pp_expected));
+        ASSERT_OK(PrettyPrint(right, 0, &pp_result));
+
+        FAIL() << "Index: " << i << " Expected: " << pp_expected.str()
+               << "\nGot: " << pp_result.str();
+      }
+    }
+  }
+
+  void CheckRoundtrip(const std::shared_ptr<Array>& array, int64_t buffer_size) {
+    auto f0 = arrow::field("f0", array->type());
+    std::vector<std::shared_ptr<Field>> fields = {f0};
+    auto schema = std::make_shared<Schema>(fields);
+
+    RecordBatch batch(schema, 0, {array});
+    CheckRoundtrip(batch, buffer_size);
+  }
+
  protected:
   std::shared_ptr<io::MemoryMappedFile> mmap_;
   MemoryPool* pool_;
@@ -79,48 +115,47 @@ class TestWriteRecordBatch : public ::testing::TestWithParam<MakeRecordBatch*>,
 TEST_P(TestWriteRecordBatch, RoundTrip) {
   std::shared_ptr<RecordBatch> batch;
   ASSERT_OK((*GetParam())(&batch));  // NOLINT clang-tidy gtest issue
-  std::shared_ptr<RecordBatch> batch_result;
-  ASSERT_OK(RoundTripHelper(*batch, 1 << 16, &batch_result));
-
-  // do checks
-  ASSERT_TRUE(batch->schema()->Equals(batch_result->schema()));
-  ASSERT_EQ(batch->num_columns(), batch_result->num_columns())
-      << batch->schema()->ToString() << " result: " << batch_result->schema()->ToString();
-  EXPECT_EQ(batch->num_rows(), batch_result->num_rows());
-  for (int i = 0; i < batch->num_columns(); ++i) {
-    EXPECT_TRUE(batch->column(i)->Equals(batch_result->column(i)))
-        << "Idx: " << i << " Name: " << batch->column_name(i);
-  }
+
+  CheckRoundtrip(*batch, 1 << 20);
 }
 
 TEST_P(TestWriteRecordBatch, SliceRoundTrip) {
   std::shared_ptr<RecordBatch> batch;
   ASSERT_OK((*GetParam())(&batch));  // NOLINT clang-tidy gtest issue
-  std::shared_ptr<RecordBatch> batch_result;
 
   // Skip the zero-length case
   if (batch->num_rows() < 2) { return; }
 
   auto sliced_batch = batch->Slice(2, 10);
+  CheckRoundtrip(*sliced_batch, 1 << 20);
+}
+
+TEST_P(TestWriteRecordBatch, ZeroLengthArrays) {
+  std::shared_ptr<RecordBatch> batch;
+  ASSERT_OK((*GetParam())(&batch));  // NOLINT clang-tidy gtest issue
 
-  ASSERT_OK(RoundTripHelper(*sliced_batch, 1 << 16, &batch_result));
+  std::shared_ptr<RecordBatch> zero_length_batch;
+  if (batch->num_rows() > 2) {
+    zero_length_batch = batch->Slice(2, 0);
+  } else {
+    zero_length_batch = batch->Slice(0, 0);
+  }
 
-  EXPECT_EQ(sliced_batch->num_rows(), batch_result->num_rows());
+  CheckRoundtrip(*zero_length_batch, 1 << 20);
 
-  for (int i = 0; i < sliced_batch->num_columns(); ++i) {
-    const auto& left = *sliced_batch->column(i);
-    const auto& right = *batch_result->column(i);
-    if (!left.Equals(right)) {
-      std::stringstream pp_result;
-      std::stringstream pp_expected;
+  // ARROW-544: check binary array
+  std::shared_ptr<MutableBuffer> value_offsets;
+  ASSERT_OK(AllocateBuffer(pool_, sizeof(int32_t), &value_offsets));
+  *reinterpret_cast<int32_t*>(value_offsets->mutable_data()) = 0;
 
-      ASSERT_OK(PrettyPrint(left, 0, &pp_expected));
-      ASSERT_OK(PrettyPrint(right, 0, &pp_result));
+  std::shared_ptr<Array> bin_array = std::make_shared<BinaryArray>(0, value_offsets,
+      std::make_shared<Buffer>(nullptr, 0), std::make_shared<Buffer>(nullptr, 0));
 
-      FAIL() << "Index: " << i << " Expected: " << pp_expected.str()
-             << "\nGot: " << pp_result.str();
-    }
-  }
+  // null value_offsets
+  std::shared_ptr<Array> bin_array2 = std::make_shared<BinaryArray>(0, nullptr, nullptr);
+
+  CheckRoundtrip(bin_array, 1 << 20);
+  CheckRoundtrip(bin_array2, 1 << 20);
 }
 
 INSTANTIATE_TEST_CASE_P(