You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/05/16 01:37:11 UTC

arrow git commit: ARROW-182: [C++] Factor out Array::Validate into a separate function

Repository: arrow
Updated Branches:
  refs/heads/master 222cbfeea -> 86a905562


ARROW-182: [C++] Factor out Array::Validate into a separate function

This makes it easier to use templates for writing more validation logic, and also we don't need the additional array methods

Author: Wes McKinney <we...@twosigma.com>

Closes #692 from wesm/ARROW-182 and squashes the following commits:

9753a59 [Wes McKinney] Factor out Array::Validate into a separate function


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/86a90556
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/86a90556
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/86a90556

Branch: refs/heads/master
Commit: 86a905562873c7d93c75c906dae0f17bd3c58aa1
Parents: 222cbfe
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon May 15 21:37:03 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon May 15 21:37:03 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/array-test.cc     |  34 ++---
 cpp/src/arrow/array.cc          | 255 +++++++++++++++++------------------
 cpp/src/arrow/array.h           |  27 ++--
 cpp/src/arrow/ipc/test-common.h |   2 +-
 4 files changed, 153 insertions(+), 165 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/86a90556/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 99279f3..636d97f 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -685,7 +685,7 @@ class TestStringArray : public ::testing::Test {
 TEST_F(TestStringArray, TestArrayBasics) {
   ASSERT_EQ(length_, strings_->length());
   ASSERT_EQ(1, strings_->null_count());
-  ASSERT_OK(strings_->Validate());
+  ASSERT_OK(ValidateArray(*strings_));
 }
 
 TEST_F(TestStringArray, TestType) {
@@ -801,7 +801,7 @@ class TestStringBuilder : public TestBuilder {
     EXPECT_OK(builder_->Finish(&out));
 
     result_ = std::dynamic_pointer_cast<StringArray>(out);
-    result_->Validate();
+    ASSERT_OK(ValidateArray(*result_));
   }
 
  protected:
@@ -899,7 +899,7 @@ class TestBinaryArray : public ::testing::Test {
 TEST_F(TestBinaryArray, TestArrayBasics) {
   ASSERT_EQ(length_, strings_->length());
   ASSERT_EQ(1, strings_->null_count());
-  ASSERT_OK(strings_->Validate());
+  ASSERT_OK(ValidateArray(*strings_));
 }
 
 TEST_F(TestBinaryArray, TestType) {
@@ -969,7 +969,7 @@ class TestBinaryBuilder : public TestBuilder {
     EXPECT_OK(builder_->Finish(&out));
 
     result_ = std::dynamic_pointer_cast<BinaryArray>(out);
-    result_->Validate();
+    ASSERT_OK(ValidateArray(*result_));
   }
 
  protected:
@@ -994,7 +994,7 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) {
     }
   }
   Done();
-  ASSERT_OK(result_->Validate());
+  ASSERT_OK(ValidateArray(*result_));
   ASSERT_EQ(reps * N, result_->length());
   ASSERT_EQ(reps, result_->null_count());
   ASSERT_EQ(reps * 6, result_->data()->size());
@@ -1354,7 +1354,7 @@ TEST_F(TestListBuilder, TestAppendNull) {
 
   Done();
 
-  ASSERT_OK(result_->Validate());
+  ASSERT_OK(ValidateArray(*result_));
   ASSERT_TRUE(result_->IsNull(0));
   ASSERT_TRUE(result_->IsNull(1));
 
@@ -1368,7 +1368,7 @@ TEST_F(TestListBuilder, TestAppendNull) {
 
 void ValidateBasicListArray(const ListArray* result, const vector<int32_t>& values,
     const vector<uint8_t>& is_valid) {
-  ASSERT_OK(result->Validate());
+  ASSERT_OK(ValidateArray(*result));
   ASSERT_EQ(1, result->null_count());
   ASSERT_EQ(0, result->values()->null_count());
 
@@ -1446,13 +1446,13 @@ TEST_F(TestListBuilder, BulkAppendInvalid) {
   }
 
   Done();
-  ASSERT_RAISES(Invalid, result_->Validate());
+  ASSERT_RAISES(Invalid, ValidateArray(*result_));
 }
 
 TEST_F(TestListBuilder, TestZeroLength) {
   // All buffers are null
   Done();
-  ASSERT_OK(result_->Validate());
+  ASSERT_OK(ValidateArray(*result_));
 }
 
 // ----------------------------------------------------------------------
@@ -1569,9 +1569,9 @@ TEST(TestDictionary, Validate) {
   std::shared_ptr<Array> arr3 = std::make_shared<DictionaryArray>(dict_type, indices3);
 
   // Only checking index type for now
-  ASSERT_OK(arr->Validate());
-  ASSERT_RAISES(Invalid, arr2->Validate());
-  ASSERT_OK(arr3->Validate());
+  ASSERT_OK(ValidateArray(*arr));
+  ASSERT_RAISES(Invalid, ValidateArray(*arr2));
+  ASSERT_OK(ValidateArray(*arr3));
 }
 
 // ----------------------------------------------------------------------
@@ -1582,7 +1582,7 @@ void ValidateBasicStructArray(const StructArray* result,
     const vector<uint8_t>& list_is_valid, const vector<int>& list_lengths,
     const vector<int>& list_offsets, const vector<int32_t>& int_values) {
   ASSERT_EQ(4, result->length());
-  ASSERT_OK(result->Validate());
+  ASSERT_OK(ValidateArray(*result));
 
   auto list_char_arr = static_cast<ListArray*>(result->field(0).get());
   auto char_arr = static_cast<Int8Array*>(list_char_arr->values().get());
@@ -1666,7 +1666,7 @@ TEST_F(TestStructBuilder, TestAppendNull) {
 
   Done();
 
-  ASSERT_OK(result_->Validate());
+  ASSERT_OK(ValidateArray(*result_));
 
   ASSERT_EQ(2, static_cast<int>(result_->fields().size()));
   ASSERT_EQ(2, result_->length());
@@ -1777,8 +1777,8 @@ TEST_F(TestStructBuilder, BulkAppendInvalid) {
   }
 
   Done();
-  // Even null bitmap of the parent Struct is not valid, Validate() will ignore it.
-  ASSERT_OK(result_->Validate());
+  // Even null bitmap of the parent Struct is not valid, validate will ignore it.
+  ASSERT_OK(ValidateArray(*result_));
 }
 
 TEST_F(TestStructBuilder, TestEquality) {
@@ -1924,7 +1924,7 @@ TEST_F(TestStructBuilder, TestEquality) {
 TEST_F(TestStructBuilder, TestZeroLength) {
   // All buffers are null
   Done();
-  ASSERT_OK(result_->Validate());
+  ASSERT_OK(ValidateArray(*result_));
 }
 
 // ----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/arrow/blob/86a90556/cpp/src/arrow/array.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 76dda2c..c5acf3e 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -105,10 +105,6 @@ bool Array::RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
   return are_equal;
 }
 
-Status Array::Validate() const {
-  return Status::OK();
-}
-
 // Last two parameters are in-out parameters
 static inline void ConformSliceParams(
     int64_t array_offset, int64_t array_length, int64_t* offset, int64_t* length) {
@@ -166,58 +162,6 @@ std::shared_ptr<Array> BooleanArray::Slice(int64_t offset, int64_t length) const
 // ----------------------------------------------------------------------
 // ListArray
 
-Status ListArray::Validate() const {
-  if (length_ < 0) { return Status::Invalid("Length was negative"); }
-  if (length_ && !value_offsets_) { return Status::Invalid("value_offsets_ was null"); }
-  if (value_offsets_->size() / static_cast<int>(sizeof(int32_t)) < length_) {
-    std::stringstream ss;
-    ss << "offset buffer size (bytes): " << value_offsets_->size()
-       << " isn't large enough for length: " << length_;
-    return Status::Invalid(ss.str());
-  }
-  const int32_t last_offset = this->value_offset(length_);
-  if (last_offset > 0) {
-    if (!values_) {
-      return Status::Invalid("last offset was non-zero and values was null");
-    }
-    if (values_->length() != last_offset) {
-      std::stringstream ss;
-      ss << "Final offset invariant not equal to values length: " << last_offset
-         << "!=" << values_->length();
-      return Status::Invalid(ss.str());
-    }
-
-    const Status child_valid = values_->Validate();
-    if (!child_valid.ok()) {
-      std::stringstream ss;
-      ss << "Child array invalid: " << child_valid.ToString();
-      return Status::Invalid(ss.str());
-    }
-  }
-
-  int32_t prev_offset = this->value_offset(0);
-  if (prev_offset != 0) { return Status::Invalid("The first offset wasn't zero"); }
-  for (int64_t i = 1; i <= length_; ++i) {
-    int32_t current_offset = this->value_offset(i);
-    if (IsNull(i - 1) && current_offset != prev_offset) {
-      std::stringstream ss;
-      ss << "Offset invariant failure at: " << i
-         << " inconsistent value_offsets for null slot" << current_offset
-         << "!=" << prev_offset;
-      return Status::Invalid(ss.str());
-    }
-    if (current_offset < prev_offset) {
-      std::stringstream ss;
-      ss << "Offset invariant failure: " << i
-         << " inconsistent offset for non-null slot: " << current_offset << "<"
-         << prev_offset;
-      return Status::Invalid(ss.str());
-    }
-    prev_offset = current_offset;
-  }
-  return Status::OK();
-}
-
 std::shared_ptr<Array> ListArray::Slice(int64_t offset, int64_t length) const {
   ConformSliceParams(offset_, length_, &offset, &length);
   return std::make_shared<ListArray>(
@@ -250,11 +194,6 @@ BinaryArray::BinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
   if (data_ != nullptr) { raw_data_ = data_->data(); }
 }
 
-Status BinaryArray::Validate() const {
-  // TODO(wesm): what to do here?
-  return Status::OK();
-}
-
 std::shared_ptr<Array> BinaryArray::Slice(int64_t offset, int64_t length) const {
   ConformSliceParams(offset_, length_, &offset, &length);
   return std::make_shared<BinaryArray>(
@@ -267,11 +206,6 @@ StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_of
     : BinaryArray(kString, length, value_offsets, data, null_bitmap, null_count, offset) {
 }
 
-Status StringArray::Validate() const {
-  // TODO(emkornfield) Validate proper UTF8 code points?
-  return BinaryArray::Validate();
-}
-
 std::shared_ptr<Array> StringArray::Slice(int64_t offset, int64_t length) const {
   ConformSliceParams(offset_, length_, &offset, &length);
   return std::make_shared<StringArray>(
@@ -361,42 +295,6 @@ std::shared_ptr<Array> StructArray::field(int pos) const {
   return children_[pos];
 }
 
-Status StructArray::Validate() const {
-  if (length_ < 0) { return Status::Invalid("Length was negative"); }
-
-  if (null_count() > length_) {
-    return Status::Invalid("Null count exceeds the length of this struct");
-  }
-
-  if (children_.size() > 0) {
-    // Validate fields
-    int64_t array_length = children_[0]->length();
-    size_t idx = 0;
-    for (auto it : children_) {
-      if (it->length() != array_length) {
-        std::stringstream ss;
-        ss << "Length is not equal from field " << it->type()->ToString()
-           << " at position {" << idx << "}";
-        return Status::Invalid(ss.str());
-      }
-
-      const Status child_valid = it->Validate();
-      if (!child_valid.ok()) {
-        std::stringstream ss;
-        ss << "Child array invalid: " << child_valid.ToString() << " at position {" << idx
-           << "}";
-        return Status::Invalid(ss.str());
-      }
-      ++idx;
-    }
-
-    if (array_length > 0 && array_length != length_) {
-      return Status::Invalid("Struct's length is not equal to its child arrays");
-    }
-  }
-  return Status::OK();
-}
-
 std::shared_ptr<Array> StructArray::Slice(int64_t offset, int64_t length) const {
   ConformSliceParams(offset_, length_, &offset, &length);
   return std::make_shared<StructArray>(
@@ -427,17 +325,6 @@ std::shared_ptr<Array> UnionArray::child(int pos) const {
   return children_[pos];
 }
 
-Status UnionArray::Validate() const {
-  if (length_ < 0) { return Status::Invalid("Length was negative"); }
-
-  if (null_count() > length_) {
-    return Status::Invalid("Null count exceeds the length of this struct");
-  }
-
-  DCHECK(false) << "Validate not yet implemented";
-  return Status::OK();
-}
-
 std::shared_ptr<Array> UnionArray::Slice(int64_t offset, int64_t length) const {
   ConformSliceParams(offset_, length_, &offset, &length);
   return std::make_shared<UnionArray>(type_, length, children_, type_ids_, value_offsets_,
@@ -456,14 +343,6 @@ DictionaryArray::DictionaryArray(
   DCHECK_EQ(type->id(), Type::DICTIONARY);
 }
 
-Status DictionaryArray::Validate() const {
-  Type::type index_type_id = indices_->type()->id();
-  if (!is_integer(index_type_id)) {
-    return Status::Invalid("Dictionary indices must be integer type");
-  }
-  return Status::OK();
-}
-
 std::shared_ptr<Array> DictionaryArray::dictionary() const {
   return dict_type_->dictionary();
 }
@@ -476,20 +355,136 @@ std::shared_ptr<Array> DictionaryArray::Slice(int64_t offset, int64_t length) co
 // ----------------------------------------------------------------------
 // Implement Array::Accept as inline visitor
 
-struct AcceptVirtualVisitor {
-  explicit AcceptVirtualVisitor(ArrayVisitor* visitor) : visitor(visitor) {}
+Status Array::Accept(ArrayVisitor* visitor) const {
+  return VisitArrayInline(*this, visitor);
+}
+
+// ----------------------------------------------------------------------
+// Implement Array::Validate as inline visitor
+
+struct ValidateVisitor {
+  Status Visit(const NullArray& array) { return Status::OK(); }
+
+  Status Visit(const PrimitiveArray& array) { return Status::OK(); }
+
+  Status Visit(const BinaryArray& array) {
+    // TODO(wesm): what to do here?
+    return Status::OK();
+  }
+
+  Status Visit(const ListArray& array) {
+    if (array.length() < 0) { return Status::Invalid("Length was negative"); }
+
+    auto value_offsets = array.value_offsets();
+    if (array.length() && !value_offsets) {
+      return Status::Invalid("value_offsets_ was null");
+    }
+    if (value_offsets->size() / static_cast<int>(sizeof(int32_t)) < array.length()) {
+      std::stringstream ss;
+      ss << "offset buffer size (bytes): " << value_offsets->size()
+         << " isn't large enough for length: " << array.length();
+      return Status::Invalid(ss.str());
+    }
+    const int32_t last_offset = array.value_offset(array.length());
+    if (last_offset > 0) {
+      if (!array.values()) {
+        return Status::Invalid("last offset was non-zero and values was null");
+      }
+      if (array.values()->length() != last_offset) {
+        std::stringstream ss;
+        ss << "Final offset invariant not equal to values length: " << last_offset
+           << "!=" << array.values()->length();
+        return Status::Invalid(ss.str());
+      }
+
+      const Status child_valid = ValidateArray(*array.values());
+      if (!child_valid.ok()) {
+        std::stringstream ss;
+        ss << "Child array invalid: " << child_valid.ToString();
+        return Status::Invalid(ss.str());
+      }
+    }
+
+    int32_t prev_offset = array.value_offset(0);
+    if (prev_offset != 0) { return Status::Invalid("The first offset wasn't zero"); }
+    for (int64_t i = 1; i <= array.length(); ++i) {
+      int32_t current_offset = array.value_offset(i);
+      if (array.IsNull(i - 1) && current_offset != prev_offset) {
+        std::stringstream ss;
+        ss << "Offset invariant failure at: " << i
+           << " inconsistent value_offsets for null slot" << current_offset
+           << "!=" << prev_offset;
+        return Status::Invalid(ss.str());
+      }
+      if (current_offset < prev_offset) {
+        std::stringstream ss;
+        ss << "Offset invariant failure: " << i
+           << " inconsistent offset for non-null slot: " << current_offset << "<"
+           << prev_offset;
+        return Status::Invalid(ss.str());
+      }
+      prev_offset = current_offset;
+    }
+    return Status::OK();
+  }
 
-  ArrayVisitor* visitor;
+  Status Visit(const StructArray& array) {
+    if (array.length() < 0) { return Status::Invalid("Length was negative"); }
 
-  template <typename T>
-  Status Visit(const T& array) {
-    return visitor->Visit(array);
+    if (array.null_count() > array.length()) {
+      return Status::Invalid("Null count exceeds the length of this struct");
+    }
+
+    if (array.fields().size() > 0) {
+      // Validate fields
+      int64_t array_length = array.fields()[0]->length();
+      size_t idx = 0;
+      for (auto it : array.fields()) {
+        if (it->length() != array_length) {
+          std::stringstream ss;
+          ss << "Length is not equal from field " << it->type()->ToString()
+             << " at position {" << idx << "}";
+          return Status::Invalid(ss.str());
+        }
+
+        const Status child_valid = ValidateArray(*it);
+        if (!child_valid.ok()) {
+          std::stringstream ss;
+          ss << "Child array invalid: " << child_valid.ToString() << " at position {"
+             << idx << "}";
+          return Status::Invalid(ss.str());
+        }
+        ++idx;
+      }
+
+      if (array_length > 0 && array_length != array.length()) {
+        return Status::Invalid("Struct's length is not equal to its child arrays");
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const UnionArray& array) {
+    if (array.length() < 0) { return Status::Invalid("Length was negative"); }
+
+    if (array.null_count() > array.length()) {
+      return Status::Invalid("Null count exceeds the length of this struct");
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const DictionaryArray& array) {
+    Type::type index_type_id = array.indices()->type()->id();
+    if (!is_integer(index_type_id)) {
+      return Status::Invalid("Dictionary indices must be integer type");
+    }
+    return Status::OK();
   }
 };
 
-Status Array::Accept(ArrayVisitor* visitor) const {
-  AcceptVirtualVisitor inline_visitor(visitor);
-  return VisitArrayInline(*this, visitor);
+Status ValidateArray(const Array& array) {
+  ValidateVisitor validate_visitor;
+  return VisitArrayInline(array, &validate_visitor);
 }
 
 // ----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/arrow/blob/86a90556/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 071d4e3..331c6c3 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -108,11 +108,6 @@ class ARROW_EXPORT Array {
   bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
       int64_t other_start_idx) const;
 
-  /// Determines if the array is internally consistent.
-  ///
-  /// Defaults to always returning Status::OK. This can be an expensive check.
-  virtual Status Validate() const;
-
   Status Accept(ArrayVisitor* visitor) const;
 
   /// Construct a zero-copy slice of the array with the indicated offset and
@@ -238,8 +233,6 @@ class ARROW_EXPORT ListArray : public Array {
     values_ = values;
   }
 
-  Status Validate() const override;
-
   // Return a shared pointer in case the requestor desires to share ownership
   // with this array.
   std::shared_ptr<Array> values() const { return values_; }
@@ -306,8 +299,6 @@ class ARROW_EXPORT BinaryArray : public Array {
     return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
   }
 
-  Status Validate() const override;
-
   std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const override;
 
  protected:
@@ -342,8 +333,6 @@ class ARROW_EXPORT StringArray : public BinaryArray {
     return std::string(reinterpret_cast<const char*>(str), nchars);
   }
 
-  Status Validate() const override;
-
   std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const override;
 };
 
@@ -406,8 +395,6 @@ class ARROW_EXPORT StructArray : public Array {
       std::shared_ptr<Buffer> null_bitmap = nullptr, int64_t null_count = 0,
       int64_t offset = 0);
 
-  Status Validate() const override;
-
   // Return a shared pointer in case the requestor desires to share ownership
   // with this array.
   std::shared_ptr<Array> field(int pos) const;
@@ -436,8 +423,6 @@ class ARROW_EXPORT UnionArray : public Array {
       const std::shared_ptr<Buffer>& null_bitmap = nullptr, int64_t null_count = 0,
       int64_t offset = 0);
 
-  Status Validate() const override;
-
   /// Note that this buffer does not account for any slice offset
   std::shared_ptr<Buffer> type_ids() const { return type_ids_; }
 
@@ -490,8 +475,6 @@ class ARROW_EXPORT DictionaryArray : public Array {
   DictionaryArray(
       const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices);
 
-  Status Validate() const override;
-
   std::shared_ptr<Array> indices() const { return indices_; }
   std::shared_ptr<Array> dictionary() const;
 
@@ -525,6 +508,16 @@ ARROW_EXTERN_TEMPLATE NumericArray<Time32Type>;
 ARROW_EXTERN_TEMPLATE NumericArray<Time64Type>;
 ARROW_EXTERN_TEMPLATE NumericArray<TimestampType>;
 
+
+/// \brief Perform any validation checks to determine obvious inconsistencies
+/// with the array's internal data
+///
+/// This can be an expensive check.
+///
+/// \param array an Array instance
+/// \return Status
+Status ValidateArray(const Array& array);
+
 }  // namespace arrow
 
 #endif

http://git-wip-us.apache.org/repos/asf/arrow/blob/86a90556/cpp/src/arrow/ipc/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
index c8ca21c..5caa3a9 100644
--- a/cpp/src/arrow/ipc/test-common.h
+++ b/cpp/src/arrow/ipc/test-common.h
@@ -133,7 +133,7 @@ Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_li
   ListBuilder builder(pool, child_array);
   RETURN_NOT_OK(builder.Append(offsets.data(), num_lists, valid_lists.data()));
   RETURN_NOT_OK(builder.Finish(out));
-  return (*out)->Validate();
+  return ValidateArray(**out);
 }
 
 typedef Status MakeRecordBatch(std::shared_ptr<RecordBatch>* out);