You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/03/31 14:20:27 UTC

arrow git commit: ARROW-603: [C++] Add RecordBatch::Validate method, call in RecordBatch ctor in debug builds

Repository: arrow
Updated Branches:
  refs/heads/master 4915ecf1e -> f5967ed68


ARROW-603: [C++] Add RecordBatch::Validate method, call in RecordBatch ctor in debug builds

This function will help catch malformed RecordBatch objects during development

Author: Wes McKinney <we...@twosigma.com>

Closes #466 from wesm/ARROW-603 and squashes the following commits:

dfdb048 [Wes McKinney] Fix incorrect clang-tidy name
5a51f69 [Wes McKinney] Add RecordBatch::Validate method, call in RecordBatch ctor in debug builds


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/f5967ed6
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/f5967ed6
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/f5967ed6

Branch: refs/heads/master
Commit: f5967ed682e63dd752d0120573bb33f42dd56e27
Parents: 4915ecf
Author: Wes McKinney <we...@twosigma.com>
Authored: Fri Mar 31 10:20:20 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Mar 31 10:20:20 2017 -0400

----------------------------------------------------------------------
 cpp/cmake_modules/FindClangTools.cmake | 29 +++++++----
 cpp/src/arrow/io/io-hdfs-test.cc       |  7 ++-
 cpp/src/arrow/table-test.cc            | 76 ++++++++++++++++++++---------
 cpp/src/arrow/table.cc                 | 25 ++++++++++
 cpp/src/arrow/table.h                  |  4 ++
 5 files changed, 104 insertions(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/f5967ed6/cpp/cmake_modules/FindClangTools.cmake
----------------------------------------------------------------------
diff --git a/cpp/cmake_modules/FindClangTools.cmake b/cpp/cmake_modules/FindClangTools.cmake
index c07c7d2..0e9430b 100644
--- a/cpp/cmake_modules/FindClangTools.cmake
+++ b/cpp/cmake_modules/FindClangTools.cmake
@@ -27,16 +27,21 @@
 # This module defines
 #  CLANG_TIDY_BIN, The  path to the clang tidy binary
 #  CLANG_TIDY_FOUND, Whether clang tidy was found
-#  CLANG_FORMAT_BIN, The path to the clang format binary 
+#  CLANG_FORMAT_BIN, The path to the clang format binary
 #  CLANG_TIDY_FOUND, Whether clang format was found
 
-find_program(CLANG_TIDY_BIN 
-  NAMES clang-tidy-3.8 clang-tidy-3.7 clang-tidy-3.6  clang-tidy
-  PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin 
+find_program(CLANG_TIDY_BIN
+  NAMES clang-tidy-4.0
+  clang-tidy-3.9
+  clang-tidy-3.8
+  clang-tidy-3.7
+  clang-tidy-3.6
+  clang-tidy
+  PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin
         NO_DEFAULT_PATH
 )
 
-if ( "${CLANG_TIDY_BIN}" STREQUAL "CLANG_TIDY_BIN-NOTFOUND" ) 
+if ( "${CLANG_TIDY_BIN}" STREQUAL "CLANG_TIDY_BIN-NOTFOUND" )
   set(CLANG_TIDY_FOUND 0)
   message("clang-tidy not found")
 else()
@@ -44,17 +49,21 @@ else()
   message("clang-tidy found at ${CLANG_TIDY_BIN}")
 endif()
 
-find_program(CLANG_FORMAT_BIN 
-  NAMES clang-format-3.8 clang-format-3.7 clang-format-3.6  clang-format
-  PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin 
+find_program(CLANG_FORMAT_BIN
+  NAMES clang-format-4.0
+  clang-format-3.9
+  clang-format-3.8
+  clang-format-3.7
+  clang-format-3.6
+  clang-format
+  PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin
         NO_DEFAULT_PATH
 )
 
-if ( "${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND" ) 
+if ( "${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND" )
   set(CLANG_FORMAT_FOUND 0)
   message("clang-format not found")
 else()
   set(CLANG_FORMAT_FOUND 1)
   message("clang-format found at ${CLANG_FORMAT_BIN}")
 endif()
-

http://git-wip-us.apache.org/repos/asf/arrow/blob/f5967ed6/cpp/src/arrow/io/io-hdfs-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/io-hdfs-test.cc b/cpp/src/arrow/io/io-hdfs-test.cc
index af59e96..f3140be 100644
--- a/cpp/src/arrow/io/io-hdfs-test.cc
+++ b/cpp/src/arrow/io/io-hdfs-test.cc
@@ -78,10 +78,9 @@ class TestHdfsClient : public ::testing::Test {
     LibHdfsShim* driver_shim;
 
     client_ = nullptr;
-    scratch_dir_ =
-        boost::filesystem::unique_path(
-            boost::filesystem::temp_directory_path() / "arrow-hdfs/scratch-%%%%")
-            .string();
+    scratch_dir_ = boost::filesystem::unique_path(
+        boost::filesystem::temp_directory_path() / "arrow-hdfs/scratch-%%%%")
+                       .string();
 
     loaded_driver_ = false;
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/f5967ed6/cpp/src/arrow/table-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc
index 3853306..cd32f4a 100644
--- a/cpp/src/arrow/table-test.cc
+++ b/cpp/src/arrow/table-test.cc
@@ -127,8 +127,8 @@ TEST_F(TestColumn, BasicAPI) {
   arrays.push_back(MakePrimitive<Int32Array>(100, 10));
   arrays.push_back(MakePrimitive<Int32Array>(100, 20));
 
-  auto field = std::make_shared<Field>("c0", int32());
-  column_.reset(new Column(field, arrays));
+  auto f0 = field("c0", int32());
+  column_.reset(new Column(f0, arrays));
 
   ASSERT_EQ("c0", column_->name());
   ASSERT_TRUE(column_->type()->Equals(int32()));
@@ -137,7 +137,7 @@ TEST_F(TestColumn, BasicAPI) {
   ASSERT_EQ(3, column_->data()->num_chunks());
 
   // nullptr array should not break
-  column_.reset(new Column(field, std::shared_ptr<Array>(nullptr)));
+  column_.reset(new Column(f0, std::shared_ptr<Array>(nullptr)));
   ASSERT_NE(column_.get(), nullptr);
 }
 
@@ -146,13 +146,13 @@ TEST_F(TestColumn, ChunksInhomogeneous) {
   arrays.push_back(MakePrimitive<Int32Array>(100));
   arrays.push_back(MakePrimitive<Int32Array>(100, 10));
 
-  auto field = std::make_shared<Field>("c0", int32());
-  column_.reset(new Column(field, arrays));
+  auto f0 = field("c0", int32());
+  column_.reset(new Column(f0, arrays));
 
   ASSERT_OK(column_->ValidateData());
 
   arrays.push_back(MakePrimitive<Int16Array>(100, 10));
-  column_.reset(new Column(field, arrays));
+  column_.reset(new Column(f0, arrays));
   ASSERT_RAISES(Invalid, column_->ValidateData());
 }
 
@@ -164,8 +164,8 @@ TEST_F(TestColumn, Equals) {
   arrays_one_.push_back(array);
   arrays_another_.push_back(array);
 
-  one_field_ = std::make_shared<Field>("column", int32());
-  another_field_ = std::make_shared<Field>("column", int32());
+  one_field_ = field("column", int32());
+  another_field_ = field("column", int32());
 
   Construct();
   ASSERT_TRUE(one_col_->Equals(one_col_));
@@ -174,13 +174,13 @@ TEST_F(TestColumn, Equals) {
   ASSERT_TRUE(one_col_->Equals(*another_col_.get()));
 
   // Field is different
-  another_field_ = std::make_shared<Field>("two", int32());
+  another_field_ = field("two", int32());
   Construct();
   ASSERT_FALSE(one_col_->Equals(another_col_));
   ASSERT_FALSE(one_col_->Equals(*another_col_.get()));
 
   // ChunkedArray is different
-  another_field_ = std::make_shared<Field>("column", int32());
+  another_field_ = field("column", int32());
   arrays_another_.push_back(array);
   Construct();
   ASSERT_FALSE(one_col_->Equals(another_col_));
@@ -190,9 +190,9 @@ TEST_F(TestColumn, Equals) {
 class TestTable : public TestBase {
  public:
   void MakeExample1(int length) {
-    auto f0 = std::make_shared<Field>("f0", int32());
-    auto f1 = std::make_shared<Field>("f1", uint8());
-    auto f2 = std::make_shared<Field>("f2", int16());
+    auto f0 = field("f0", int32());
+    auto f1 = field("f1", uint8());
+    auto f2 = field("f2", int16());
 
     vector<shared_ptr<Field>> fields = {f0, f1, f2};
     schema_ = std::make_shared<Schema>(fields);
@@ -279,9 +279,9 @@ TEST_F(TestTable, Equals) {
 
   ASSERT_TRUE(table_->Equals(*table_));
   // Differing schema
-  auto f0 = std::make_shared<Field>("f3", int32());
-  auto f1 = std::make_shared<Field>("f4", uint8());
-  auto f2 = std::make_shared<Field>("f5", int16());
+  auto f0 = field("f3", int32());
+  auto f1 = field("f4", uint8());
+  auto f2 = field("f5", int16());
   vector<shared_ptr<Field>> fields = {f0, f1, f2};
   auto other_schema = std::make_shared<Schema>(fields);
   ASSERT_FALSE(table_->Equals(Table(other_schema, columns_)));
@@ -389,9 +389,9 @@ class TestRecordBatch : public TestBase {};
 TEST_F(TestRecordBatch, Equals) {
   const int length = 10;
 
-  auto f0 = std::make_shared<Field>("f0", int32());
-  auto f1 = std::make_shared<Field>("f1", uint8());
-  auto f2 = std::make_shared<Field>("f2", int16());
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", uint8());
+  auto f2 = field("f2", int16());
 
   vector<shared_ptr<Field>> fields = {f0, f1, f2};
   auto schema = std::make_shared<Schema>(fields);
@@ -401,21 +401,51 @@ TEST_F(TestRecordBatch, Equals) {
   auto a2 = MakePrimitive<Int16Array>(length);
 
   RecordBatch b1(schema, length, {a0, a1, a2});
-  RecordBatch b2(schema, 5, {a0, a1, a2});
   RecordBatch b3(schema, length, {a0, a1});
   RecordBatch b4(schema, length, {a0, a1, a1});
 
   ASSERT_TRUE(b1.Equals(b1));
-  ASSERT_FALSE(b1.Equals(b2));
   ASSERT_FALSE(b1.Equals(b3));
   ASSERT_FALSE(b1.Equals(b4));
 }
 
+#ifdef NDEBUG
+// In debug builds, RecordBatch ctor aborts if you construct an invalid one
+
+TEST_F(TestRecordBatch, Validate) {
+  const int length = 10;
+
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", uint8());
+  auto f2 = field("f2", int16());
+
+  auto schema = std::shared_ptr<Schema>(new Schema({f0, f1, f2}));
+
+  auto a0 = MakePrimitive<Int32Array>(length);
+  auto a1 = MakePrimitive<UInt8Array>(length);
+  auto a2 = MakePrimitive<Int16Array>(length);
+  auto a3 = MakePrimitive<Int16Array>(5);
+
+  RecordBatch b1(schema, length, {a0, a1, a2});
+
+  ASSERT_OK(b1.Validate());
+
+  // Length mismatch
+  RecordBatch b2(schema, length, {a0, a1, a3});
+  ASSERT_RAISES(Invalid, b2.Validate());
+
+  // Type mismatch
+  RecordBatch b3(schema, length, {a0, a1, a0});
+  ASSERT_RAISES(Invalid, b3.Validate());
+}
+
+#endif
+
 TEST_F(TestRecordBatch, Slice) {
   const int length = 10;
 
-  auto f0 = std::make_shared<Field>("f0", int32());
-  auto f1 = std::make_shared<Field>("f1", uint8());
+  auto f0 = field("f0", int32());
+  auto f1 = field("f1", uint8());
 
   vector<shared_ptr<Field>> fields = {f0, f1};
   auto schema = std::make_shared<Schema>(fields);

http://git-wip-us.apache.org/repos/asf/arrow/blob/f5967ed6/cpp/src/arrow/table.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
index 8e283f4..da61fbb 100644
--- a/cpp/src/arrow/table.cc
+++ b/cpp/src/arrow/table.cc
@@ -139,6 +139,11 @@ Status Column::ValidateData() {
 // ----------------------------------------------------------------------
 // RecordBatch methods
 
+void AssertBatchValid(const RecordBatch& batch) {
+  Status s = batch.Validate();
+  if (!s.ok()) { DCHECK(false) << s.ToString(); }
+}
+
 RecordBatch::RecordBatch(const std::shared_ptr<Schema>& schema, int64_t num_rows,
     const std::vector<std::shared_ptr<Array>>& columns)
     : schema_(schema), num_rows_(num_rows), columns_(columns) {}
@@ -190,6 +195,26 @@ std::shared_ptr<RecordBatch> RecordBatch::Slice(int64_t offset, int64_t length)
   return std::make_shared<RecordBatch>(schema_, num_rows, arrays);
 }
 
+Status RecordBatch::Validate() const {
+  for (int i = 0; i < num_columns(); ++i) {
+    const Array& arr = *columns_[i];
+    if (arr.length() != num_rows_) {
+      std::stringstream ss;
+      ss << "Number of rows in column " << i << " did not match batch: " << arr.length()
+         << " vs " << num_rows_;
+      return Status::Invalid(ss.str());
+    }
+    const auto& schema_type = *schema_->field(i)->type;
+    if (!arr.type()->Equals(schema_type)) {
+      std::stringstream ss;
+      ss << "Column " << i << " type not match schema: " << arr.type()->ToString()
+         << " vs " << schema_type.ToString();
+      return Status::Invalid(ss.str());
+    }
+  }
+  return Status::OK();
+}
+
 // ----------------------------------------------------------------------
 // Table methods
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/f5967ed6/cpp/src/arrow/table.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h
index 7b739c9..0f35dd8 100644
--- a/cpp/src/arrow/table.h
+++ b/cpp/src/arrow/table.h
@@ -140,6 +140,10 @@ class ARROW_EXPORT RecordBatch {
   std::shared_ptr<RecordBatch> Slice(int64_t offset);
   std::shared_ptr<RecordBatch> Slice(int64_t offset, int64_t length);
 
+  /// Returns error status is there is something wrong with the record batch
+  /// contents, like a schema/array mismatch or inconsistent lengths
+  Status Validate() const;
+
  private:
   std::shared_ptr<Schema> schema_;
   int64_t num_rows_;