You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/05/16 01:37:11 UTC
arrow git commit: ARROW-182: [C++] Factor out Array::Validate into a
separate function
Repository: arrow
Updated Branches:
refs/heads/master 222cbfeea -> 86a905562
ARROW-182: [C++] Factor out Array::Validate into a separate function
This makes it easier to use templates for writing more validation logic, and also we don't need the additional array methods
Author: Wes McKinney <we...@twosigma.com>
Closes #692 from wesm/ARROW-182 and squashes the following commits:
9753a59 [Wes McKinney] Factor out Array::Validate into a separate function
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/86a90556
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/86a90556
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/86a90556
Branch: refs/heads/master
Commit: 86a905562873c7d93c75c906dae0f17bd3c58aa1
Parents: 222cbfe
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon May 15 21:37:03 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon May 15 21:37:03 2017 -0400
----------------------------------------------------------------------
cpp/src/arrow/array-test.cc | 34 ++---
cpp/src/arrow/array.cc | 255 +++++++++++++++++------------------
cpp/src/arrow/array.h | 27 ++--
cpp/src/arrow/ipc/test-common.h | 2 +-
4 files changed, 153 insertions(+), 165 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/86a90556/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 99279f3..636d97f 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -685,7 +685,7 @@ class TestStringArray : public ::testing::Test {
TEST_F(TestStringArray, TestArrayBasics) {
ASSERT_EQ(length_, strings_->length());
ASSERT_EQ(1, strings_->null_count());
- ASSERT_OK(strings_->Validate());
+ ASSERT_OK(ValidateArray(*strings_));
}
TEST_F(TestStringArray, TestType) {
@@ -801,7 +801,7 @@ class TestStringBuilder : public TestBuilder {
EXPECT_OK(builder_->Finish(&out));
result_ = std::dynamic_pointer_cast<StringArray>(out);
- result_->Validate();
+ ASSERT_OK(ValidateArray(*result_));
}
protected:
@@ -899,7 +899,7 @@ class TestBinaryArray : public ::testing::Test {
TEST_F(TestBinaryArray, TestArrayBasics) {
ASSERT_EQ(length_, strings_->length());
ASSERT_EQ(1, strings_->null_count());
- ASSERT_OK(strings_->Validate());
+ ASSERT_OK(ValidateArray(*strings_));
}
TEST_F(TestBinaryArray, TestType) {
@@ -969,7 +969,7 @@ class TestBinaryBuilder : public TestBuilder {
EXPECT_OK(builder_->Finish(&out));
result_ = std::dynamic_pointer_cast<BinaryArray>(out);
- result_->Validate();
+ ASSERT_OK(ValidateArray(*result_));
}
protected:
@@ -994,7 +994,7 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) {
}
}
Done();
- ASSERT_OK(result_->Validate());
+ ASSERT_OK(ValidateArray(*result_));
ASSERT_EQ(reps * N, result_->length());
ASSERT_EQ(reps, result_->null_count());
ASSERT_EQ(reps * 6, result_->data()->size());
@@ -1354,7 +1354,7 @@ TEST_F(TestListBuilder, TestAppendNull) {
Done();
- ASSERT_OK(result_->Validate());
+ ASSERT_OK(ValidateArray(*result_));
ASSERT_TRUE(result_->IsNull(0));
ASSERT_TRUE(result_->IsNull(1));
@@ -1368,7 +1368,7 @@ TEST_F(TestListBuilder, TestAppendNull) {
void ValidateBasicListArray(const ListArray* result, const vector<int32_t>& values,
const vector<uint8_t>& is_valid) {
- ASSERT_OK(result->Validate());
+ ASSERT_OK(ValidateArray(*result));
ASSERT_EQ(1, result->null_count());
ASSERT_EQ(0, result->values()->null_count());
@@ -1446,13 +1446,13 @@ TEST_F(TestListBuilder, BulkAppendInvalid) {
}
Done();
- ASSERT_RAISES(Invalid, result_->Validate());
+ ASSERT_RAISES(Invalid, ValidateArray(*result_));
}
TEST_F(TestListBuilder, TestZeroLength) {
// All buffers are null
Done();
- ASSERT_OK(result_->Validate());
+ ASSERT_OK(ValidateArray(*result_));
}
// ----------------------------------------------------------------------
@@ -1569,9 +1569,9 @@ TEST(TestDictionary, Validate) {
std::shared_ptr<Array> arr3 = std::make_shared<DictionaryArray>(dict_type, indices3);
// Only checking index type for now
- ASSERT_OK(arr->Validate());
- ASSERT_RAISES(Invalid, arr2->Validate());
- ASSERT_OK(arr3->Validate());
+ ASSERT_OK(ValidateArray(*arr));
+ ASSERT_RAISES(Invalid, ValidateArray(*arr2));
+ ASSERT_OK(ValidateArray(*arr3));
}
// ----------------------------------------------------------------------
@@ -1582,7 +1582,7 @@ void ValidateBasicStructArray(const StructArray* result,
const vector<uint8_t>& list_is_valid, const vector<int>& list_lengths,
const vector<int>& list_offsets, const vector<int32_t>& int_values) {
ASSERT_EQ(4, result->length());
- ASSERT_OK(result->Validate());
+ ASSERT_OK(ValidateArray(*result));
auto list_char_arr = static_cast<ListArray*>(result->field(0).get());
auto char_arr = static_cast<Int8Array*>(list_char_arr->values().get());
@@ -1666,7 +1666,7 @@ TEST_F(TestStructBuilder, TestAppendNull) {
Done();
- ASSERT_OK(result_->Validate());
+ ASSERT_OK(ValidateArray(*result_));
ASSERT_EQ(2, static_cast<int>(result_->fields().size()));
ASSERT_EQ(2, result_->length());
@@ -1777,8 +1777,8 @@ TEST_F(TestStructBuilder, BulkAppendInvalid) {
}
Done();
- // Even null bitmap of the parent Struct is not valid, Validate() will ignore it.
- ASSERT_OK(result_->Validate());
+ // Even null bitmap of the parent Struct is not valid, validate will ignore it.
+ ASSERT_OK(ValidateArray(*result_));
}
TEST_F(TestStructBuilder, TestEquality) {
@@ -1924,7 +1924,7 @@ TEST_F(TestStructBuilder, TestEquality) {
TEST_F(TestStructBuilder, TestZeroLength) {
// All buffers are null
Done();
- ASSERT_OK(result_->Validate());
+ ASSERT_OK(ValidateArray(*result_));
}
// ----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/86a90556/cpp/src/arrow/array.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 76dda2c..c5acf3e 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -105,10 +105,6 @@ bool Array::RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
return are_equal;
}
-Status Array::Validate() const {
- return Status::OK();
-}
-
// Last two parameters are in-out parameters
static inline void ConformSliceParams(
int64_t array_offset, int64_t array_length, int64_t* offset, int64_t* length) {
@@ -166,58 +162,6 @@ std::shared_ptr<Array> BooleanArray::Slice(int64_t offset, int64_t length) const
// ----------------------------------------------------------------------
// ListArray
-Status ListArray::Validate() const {
- if (length_ < 0) { return Status::Invalid("Length was negative"); }
- if (length_ && !value_offsets_) { return Status::Invalid("value_offsets_ was null"); }
- if (value_offsets_->size() / static_cast<int>(sizeof(int32_t)) < length_) {
- std::stringstream ss;
- ss << "offset buffer size (bytes): " << value_offsets_->size()
- << " isn't large enough for length: " << length_;
- return Status::Invalid(ss.str());
- }
- const int32_t last_offset = this->value_offset(length_);
- if (last_offset > 0) {
- if (!values_) {
- return Status::Invalid("last offset was non-zero and values was null");
- }
- if (values_->length() != last_offset) {
- std::stringstream ss;
- ss << "Final offset invariant not equal to values length: " << last_offset
- << "!=" << values_->length();
- return Status::Invalid(ss.str());
- }
-
- const Status child_valid = values_->Validate();
- if (!child_valid.ok()) {
- std::stringstream ss;
- ss << "Child array invalid: " << child_valid.ToString();
- return Status::Invalid(ss.str());
- }
- }
-
- int32_t prev_offset = this->value_offset(0);
- if (prev_offset != 0) { return Status::Invalid("The first offset wasn't zero"); }
- for (int64_t i = 1; i <= length_; ++i) {
- int32_t current_offset = this->value_offset(i);
- if (IsNull(i - 1) && current_offset != prev_offset) {
- std::stringstream ss;
- ss << "Offset invariant failure at: " << i
- << " inconsistent value_offsets for null slot" << current_offset
- << "!=" << prev_offset;
- return Status::Invalid(ss.str());
- }
- if (current_offset < prev_offset) {
- std::stringstream ss;
- ss << "Offset invariant failure: " << i
- << " inconsistent offset for non-null slot: " << current_offset << "<"
- << prev_offset;
- return Status::Invalid(ss.str());
- }
- prev_offset = current_offset;
- }
- return Status::OK();
-}
-
std::shared_ptr<Array> ListArray::Slice(int64_t offset, int64_t length) const {
ConformSliceParams(offset_, length_, &offset, &length);
return std::make_shared<ListArray>(
@@ -250,11 +194,6 @@ BinaryArray::BinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
if (data_ != nullptr) { raw_data_ = data_->data(); }
}
-Status BinaryArray::Validate() const {
- // TODO(wesm): what to do here?
- return Status::OK();
-}
-
std::shared_ptr<Array> BinaryArray::Slice(int64_t offset, int64_t length) const {
ConformSliceParams(offset_, length_, &offset, &length);
return std::make_shared<BinaryArray>(
@@ -267,11 +206,6 @@ StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_of
: BinaryArray(kString, length, value_offsets, data, null_bitmap, null_count, offset) {
}
-Status StringArray::Validate() const {
- // TODO(emkornfield) Validate proper UTF8 code points?
- return BinaryArray::Validate();
-}
-
std::shared_ptr<Array> StringArray::Slice(int64_t offset, int64_t length) const {
ConformSliceParams(offset_, length_, &offset, &length);
return std::make_shared<StringArray>(
@@ -361,42 +295,6 @@ std::shared_ptr<Array> StructArray::field(int pos) const {
return children_[pos];
}
-Status StructArray::Validate() const {
- if (length_ < 0) { return Status::Invalid("Length was negative"); }
-
- if (null_count() > length_) {
- return Status::Invalid("Null count exceeds the length of this struct");
- }
-
- if (children_.size() > 0) {
- // Validate fields
- int64_t array_length = children_[0]->length();
- size_t idx = 0;
- for (auto it : children_) {
- if (it->length() != array_length) {
- std::stringstream ss;
- ss << "Length is not equal from field " << it->type()->ToString()
- << " at position {" << idx << "}";
- return Status::Invalid(ss.str());
- }
-
- const Status child_valid = it->Validate();
- if (!child_valid.ok()) {
- std::stringstream ss;
- ss << "Child array invalid: " << child_valid.ToString() << " at position {" << idx
- << "}";
- return Status::Invalid(ss.str());
- }
- ++idx;
- }
-
- if (array_length > 0 && array_length != length_) {
- return Status::Invalid("Struct's length is not equal to its child arrays");
- }
- }
- return Status::OK();
-}
-
std::shared_ptr<Array> StructArray::Slice(int64_t offset, int64_t length) const {
ConformSliceParams(offset_, length_, &offset, &length);
return std::make_shared<StructArray>(
@@ -427,17 +325,6 @@ std::shared_ptr<Array> UnionArray::child(int pos) const {
return children_[pos];
}
-Status UnionArray::Validate() const {
- if (length_ < 0) { return Status::Invalid("Length was negative"); }
-
- if (null_count() > length_) {
- return Status::Invalid("Null count exceeds the length of this struct");
- }
-
- DCHECK(false) << "Validate not yet implemented";
- return Status::OK();
-}
-
std::shared_ptr<Array> UnionArray::Slice(int64_t offset, int64_t length) const {
ConformSliceParams(offset_, length_, &offset, &length);
return std::make_shared<UnionArray>(type_, length, children_, type_ids_, value_offsets_,
@@ -456,14 +343,6 @@ DictionaryArray::DictionaryArray(
DCHECK_EQ(type->id(), Type::DICTIONARY);
}
-Status DictionaryArray::Validate() const {
- Type::type index_type_id = indices_->type()->id();
- if (!is_integer(index_type_id)) {
- return Status::Invalid("Dictionary indices must be integer type");
- }
- return Status::OK();
-}
-
std::shared_ptr<Array> DictionaryArray::dictionary() const {
return dict_type_->dictionary();
}
@@ -476,20 +355,136 @@ std::shared_ptr<Array> DictionaryArray::Slice(int64_t offset, int64_t length) co
// ----------------------------------------------------------------------
// Implement Array::Accept as inline visitor
-struct AcceptVirtualVisitor {
- explicit AcceptVirtualVisitor(ArrayVisitor* visitor) : visitor(visitor) {}
+Status Array::Accept(ArrayVisitor* visitor) const {
+ return VisitArrayInline(*this, visitor);
+}
+
+// ----------------------------------------------------------------------
+// Implement Array::Validate as inline visitor
+
+struct ValidateVisitor {
+ Status Visit(const NullArray& array) { return Status::OK(); }
+
+ Status Visit(const PrimitiveArray& array) { return Status::OK(); }
+
+ Status Visit(const BinaryArray& array) {
+ // TODO(wesm): what to do here?
+ return Status::OK();
+ }
+
+ Status Visit(const ListArray& array) {
+ if (array.length() < 0) { return Status::Invalid("Length was negative"); }
+
+ auto value_offsets = array.value_offsets();
+ if (array.length() && !value_offsets) {
+ return Status::Invalid("value_offsets_ was null");
+ }
+ if (value_offsets->size() / static_cast<int>(sizeof(int32_t)) < array.length()) {
+ std::stringstream ss;
+ ss << "offset buffer size (bytes): " << value_offsets->size()
+ << " isn't large enough for length: " << array.length();
+ return Status::Invalid(ss.str());
+ }
+ const int32_t last_offset = array.value_offset(array.length());
+ if (last_offset > 0) {
+ if (!array.values()) {
+ return Status::Invalid("last offset was non-zero and values was null");
+ }
+ if (array.values()->length() != last_offset) {
+ std::stringstream ss;
+ ss << "Final offset invariant not equal to values length: " << last_offset
+ << "!=" << array.values()->length();
+ return Status::Invalid(ss.str());
+ }
+
+ const Status child_valid = ValidateArray(*array.values());
+ if (!child_valid.ok()) {
+ std::stringstream ss;
+ ss << "Child array invalid: " << child_valid.ToString();
+ return Status::Invalid(ss.str());
+ }
+ }
+
+ int32_t prev_offset = array.value_offset(0);
+ if (prev_offset != 0) { return Status::Invalid("The first offset wasn't zero"); }
+ for (int64_t i = 1; i <= array.length(); ++i) {
+ int32_t current_offset = array.value_offset(i);
+ if (array.IsNull(i - 1) && current_offset != prev_offset) {
+ std::stringstream ss;
+ ss << "Offset invariant failure at: " << i
+ << " inconsistent value_offsets for null slot" << current_offset
+ << "!=" << prev_offset;
+ return Status::Invalid(ss.str());
+ }
+ if (current_offset < prev_offset) {
+ std::stringstream ss;
+ ss << "Offset invariant failure: " << i
+ << " inconsistent offset for non-null slot: " << current_offset << "<"
+ << prev_offset;
+ return Status::Invalid(ss.str());
+ }
+ prev_offset = current_offset;
+ }
+ return Status::OK();
+ }
- ArrayVisitor* visitor;
+ Status Visit(const StructArray& array) {
+ if (array.length() < 0) { return Status::Invalid("Length was negative"); }
- template <typename T>
- Status Visit(const T& array) {
- return visitor->Visit(array);
+ if (array.null_count() > array.length()) {
+ return Status::Invalid("Null count exceeds the length of this struct");
+ }
+
+ if (array.fields().size() > 0) {
+ // Validate fields
+ int64_t array_length = array.fields()[0]->length();
+ size_t idx = 0;
+ for (auto it : array.fields()) {
+ if (it->length() != array_length) {
+ std::stringstream ss;
+ ss << "Length is not equal from field " << it->type()->ToString()
+ << " at position {" << idx << "}";
+ return Status::Invalid(ss.str());
+ }
+
+ const Status child_valid = ValidateArray(*it);
+ if (!child_valid.ok()) {
+ std::stringstream ss;
+ ss << "Child array invalid: " << child_valid.ToString() << " at position {"
+ << idx << "}";
+ return Status::Invalid(ss.str());
+ }
+ ++idx;
+ }
+
+ if (array_length > 0 && array_length != array.length()) {
+ return Status::Invalid("Struct's length is not equal to its child arrays");
+ }
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const UnionArray& array) {
+ if (array.length() < 0) { return Status::Invalid("Length was negative"); }
+
+ if (array.null_count() > array.length()) {
+ return Status::Invalid("Null count exceeds the length of this struct");
+ }
+ return Status::OK();
+ }
+
+ Status Visit(const DictionaryArray& array) {
+ Type::type index_type_id = array.indices()->type()->id();
+ if (!is_integer(index_type_id)) {
+ return Status::Invalid("Dictionary indices must be integer type");
+ }
+ return Status::OK();
}
};
-Status Array::Accept(ArrayVisitor* visitor) const {
- AcceptVirtualVisitor inline_visitor(visitor);
- return VisitArrayInline(*this, visitor);
+Status ValidateArray(const Array& array) {
+ ValidateVisitor validate_visitor;
+ return VisitArrayInline(array, &validate_visitor);
}
// ----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/86a90556/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 071d4e3..331c6c3 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -108,11 +108,6 @@ class ARROW_EXPORT Array {
bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
int64_t other_start_idx) const;
- /// Determines if the array is internally consistent.
- ///
- /// Defaults to always returning Status::OK. This can be an expensive check.
- virtual Status Validate() const;
-
Status Accept(ArrayVisitor* visitor) const;
/// Construct a zero-copy slice of the array with the indicated offset and
@@ -238,8 +233,6 @@ class ARROW_EXPORT ListArray : public Array {
values_ = values;
}
- Status Validate() const override;
-
// Return a shared pointer in case the requestor desires to share ownership
// with this array.
std::shared_ptr<Array> values() const { return values_; }
@@ -306,8 +299,6 @@ class ARROW_EXPORT BinaryArray : public Array {
return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
}
- Status Validate() const override;
-
std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const override;
protected:
@@ -342,8 +333,6 @@ class ARROW_EXPORT StringArray : public BinaryArray {
return std::string(reinterpret_cast<const char*>(str), nchars);
}
- Status Validate() const override;
-
std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const override;
};
@@ -406,8 +395,6 @@ class ARROW_EXPORT StructArray : public Array {
std::shared_ptr<Buffer> null_bitmap = nullptr, int64_t null_count = 0,
int64_t offset = 0);
- Status Validate() const override;
-
// Return a shared pointer in case the requestor desires to share ownership
// with this array.
std::shared_ptr<Array> field(int pos) const;
@@ -436,8 +423,6 @@ class ARROW_EXPORT UnionArray : public Array {
const std::shared_ptr<Buffer>& null_bitmap = nullptr, int64_t null_count = 0,
int64_t offset = 0);
- Status Validate() const override;
-
/// Note that this buffer does not account for any slice offset
std::shared_ptr<Buffer> type_ids() const { return type_ids_; }
@@ -490,8 +475,6 @@ class ARROW_EXPORT DictionaryArray : public Array {
DictionaryArray(
const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices);
- Status Validate() const override;
-
std::shared_ptr<Array> indices() const { return indices_; }
std::shared_ptr<Array> dictionary() const;
@@ -525,6 +508,16 @@ ARROW_EXTERN_TEMPLATE NumericArray<Time32Type>;
ARROW_EXTERN_TEMPLATE NumericArray<Time64Type>;
ARROW_EXTERN_TEMPLATE NumericArray<TimestampType>;
+
+/// \brief Perform any validation checks to determine obvious inconsistencies
+/// with the array's internal data
+///
+/// This can be an expensive check.
+///
+/// \param array an Array instance
+/// \return Status
+Status ValidateArray(const Array& array);
+
} // namespace arrow
#endif
http://git-wip-us.apache.org/repos/asf/arrow/blob/86a90556/cpp/src/arrow/ipc/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
index c8ca21c..5caa3a9 100644
--- a/cpp/src/arrow/ipc/test-common.h
+++ b/cpp/src/arrow/ipc/test-common.h
@@ -133,7 +133,7 @@ Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_li
ListBuilder builder(pool, child_array);
RETURN_NOT_OK(builder.Append(offsets.data(), num_lists, valid_lists.data()));
RETURN_NOT_OK(builder.Finish(out));
- return (*out)->Validate();
+ return ValidateArray(**out);
}
typedef Status MakeRecordBatch(std::shared_ptr<RecordBatch>* out);