You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/01/03 12:23:25 UTC

arrow git commit: ARROW-417: Add Equals implementation to compare ChunkedArrays

Repository: arrow
Updated Branches:
  refs/heads/master 26140dca8 -> fdbc57941


ARROW-417: Add Equals implementation to compare ChunkedArrays

Author: Uwe L. Korn <uw...@xhochy.com>

Closes #259 from xhochy/ARROW-417 and squashes the following commits:

ffc076a [Uwe L. Korn] Add interface for non-shared_ptr-based Equals
3686d6c [Uwe L. Korn] ARROW-415: C++: Add Equals implementation to compare Tables
54cbf54 [Uwe L. Korn] ARROW-416: C++: Add Equals implementation to compare Columns
21e73a0 [Uwe L. Korn] Make signed comparison explicit
8563cb2 [Uwe L. Korn] ARROW-417: Add Equals implementation to compare ChunkedArrays


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/fdbc5794
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/fdbc5794
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/fdbc5794

Branch: refs/heads/master
Commit: fdbc57941fd3615c71b3a61b409b63eb6a48a817
Parents: 26140dc
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Tue Jan 3 07:23:17 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Jan 3 07:23:17 2017 -0500

----------------------------------------------------------------------
 cpp/src/arrow/column-test.cc | 121 ++++++++++++++++++++++++++++++++++++--
 cpp/src/arrow/column.cc      |  51 ++++++++++++++++
 cpp/src/arrow/column.h       |   7 +++
 cpp/src/arrow/table-test.cc  |  44 ++++++++++----
 cpp/src/arrow/table.cc       |  17 ++++++
 cpp/src/arrow/table.h        |   3 +
 cpp/src/arrow/test-util.h    |   2 +-
 7 files changed, 228 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/fdbc5794/cpp/src/arrow/column-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/column-test.cc b/cpp/src/arrow/column-test.cc
index 9005245..1e722ed 100644
--- a/cpp/src/arrow/column-test.cc
+++ b/cpp/src/arrow/column-test.cc
@@ -33,12 +33,92 @@ using std::vector;
 
 namespace arrow {
 
-const auto INT32 = std::make_shared<Int32Type>();
+class TestChunkedArray : public TestBase {
+ protected:
+  virtual void Construct() {
+    one_ = std::make_shared<ChunkedArray>(arrays_one_);
+    another_ = std::make_shared<ChunkedArray>(arrays_another_);
+  }
+
+  ArrayVector arrays_one_;
+  ArrayVector arrays_another_;
+
+  std::shared_ptr<ChunkedArray> one_;
+  std::shared_ptr<ChunkedArray> another_;
+};
+
+TEST_F(TestChunkedArray, BasicEquals) {
+  std::vector<bool> null_bitmap(100, true);
+  std::vector<int32_t> data(100, 1);
+  std::shared_ptr<Array> array;
+  ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap, data, &array);
+  arrays_one_.push_back(array);
+  arrays_another_.push_back(array);
+
+  Construct();
+  ASSERT_TRUE(one_->Equals(one_));
+  ASSERT_FALSE(one_->Equals(nullptr));
+  ASSERT_TRUE(one_->Equals(another_));
+  ASSERT_TRUE(one_->Equals(*another_.get()));
+}
+
+TEST_F(TestChunkedArray, EqualsDifferingTypes) {
+  std::vector<bool> null_bitmap(100, true);
+  std::vector<int32_t> data32(100, 1);
+  std::vector<int64_t> data64(100, 1);
+  std::shared_ptr<Array> array;
+  ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap, data32, &array);
+  arrays_one_.push_back(array);
+  ArrayFromVector<Int64Type, int64_t>(int64(), null_bitmap, data64, &array);
+  arrays_another_.push_back(array);
+
+  Construct();
+  ASSERT_FALSE(one_->Equals(another_));
+  ASSERT_FALSE(one_->Equals(*another_.get()));
+}
+
+TEST_F(TestChunkedArray, EqualsDifferingLengths) {
+  std::vector<bool> null_bitmap100(100, true);
+  std::vector<bool> null_bitmap101(101, true);
+  std::vector<int32_t> data100(100, 1);
+  std::vector<int32_t> data101(101, 1);
+  std::shared_ptr<Array> array;
+  ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap100, data100, &array);
+  arrays_one_.push_back(array);
+  ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap101, data101, &array);
+  arrays_another_.push_back(array);
+
+  Construct();
+  ASSERT_FALSE(one_->Equals(another_));
+  ASSERT_FALSE(one_->Equals(*another_.get()));
+
+  std::vector<bool> null_bitmap1(1, true);
+  std::vector<int32_t> data1(1, 1);
+  ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap1, data1, &array);
+  arrays_one_.push_back(array);
 
-class TestColumn : public TestBase {
+  Construct();
+  ASSERT_TRUE(one_->Equals(another_));
+  ASSERT_TRUE(one_->Equals(*another_.get()));
+}
+
+class TestColumn : public TestChunkedArray {
  protected:
+  void Construct() override {
+    TestChunkedArray::Construct();
+
+    one_col_ = std::make_shared<Column>(one_field_, one_);
+    another_col_ = std::make_shared<Column>(another_field_, another_);
+  }
+
   std::shared_ptr<ChunkedArray> data_;
   std::unique_ptr<Column> column_;
+
+  std::shared_ptr<Field> one_field_;
+  std::shared_ptr<Field> another_field_;
+
+  std::shared_ptr<Column> one_col_;
+  std::shared_ptr<Column> another_col_;
 };
 
 TEST_F(TestColumn, BasicAPI) {
@@ -47,11 +127,11 @@ TEST_F(TestColumn, BasicAPI) {
   arrays.push_back(MakePrimitive<Int32Array>(100, 10));
   arrays.push_back(MakePrimitive<Int32Array>(100, 20));
 
-  auto field = std::make_shared<Field>("c0", INT32);
+  auto field = std::make_shared<Field>("c0", int32());
   column_.reset(new Column(field, arrays));
 
   ASSERT_EQ("c0", column_->name());
-  ASSERT_TRUE(column_->type()->Equals(INT32));
+  ASSERT_TRUE(column_->type()->Equals(int32()));
   ASSERT_EQ(300, column_->length());
   ASSERT_EQ(30, column_->null_count());
   ASSERT_EQ(3, column_->data()->num_chunks());
@@ -62,7 +142,7 @@ TEST_F(TestColumn, ChunksInhomogeneous) {
   arrays.push_back(MakePrimitive<Int32Array>(100));
   arrays.push_back(MakePrimitive<Int32Array>(100, 10));
 
-  auto field = std::make_shared<Field>("c0", INT32);
+  auto field = std::make_shared<Field>("c0", int32());
   column_.reset(new Column(field, arrays));
 
   ASSERT_OK(column_->ValidateData());
@@ -72,4 +152,35 @@ TEST_F(TestColumn, ChunksInhomogeneous) {
   ASSERT_RAISES(Invalid, column_->ValidateData());
 }
 
+TEST_F(TestColumn, Equals) {
+  std::vector<bool> null_bitmap(100, true);
+  std::vector<int32_t> data(100, 1);
+  std::shared_ptr<Array> array;
+  ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap, data, &array);
+  arrays_one_.push_back(array);
+  arrays_another_.push_back(array);
+
+  one_field_ = std::make_shared<Field>("column", int32());
+  another_field_ = std::make_shared<Field>("column", int32());
+
+  Construct();
+  ASSERT_TRUE(one_col_->Equals(one_col_));
+  ASSERT_FALSE(one_col_->Equals(nullptr));
+  ASSERT_TRUE(one_col_->Equals(another_col_));
+  ASSERT_TRUE(one_col_->Equals(*another_col_.get()));
+
+  // Field is different
+  another_field_ = std::make_shared<Field>("two", int32());
+  Construct();
+  ASSERT_FALSE(one_col_->Equals(another_col_));
+  ASSERT_FALSE(one_col_->Equals(*another_col_.get()));
+
+  // ChunkedArray is different
+  another_field_ = std::make_shared<Field>("column", int32());
+  arrays_another_.push_back(array);
+  Construct();
+  ASSERT_FALSE(one_col_->Equals(another_col_));
+  ASSERT_FALSE(one_col_->Equals(*another_col_.get()));
+}
+
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/fdbc5794/cpp/src/arrow/column.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/column.cc b/cpp/src/arrow/column.cc
index 1d136e7..3e89956 100644
--- a/cpp/src/arrow/column.cc
+++ b/cpp/src/arrow/column.cc
@@ -35,6 +35,58 @@ ChunkedArray::ChunkedArray(const ArrayVector& chunks) : chunks_(chunks) {
   }
 }
 
+// Compare the logical contents of two chunked arrays.
+//
+// Two chunked arrays are equal when they have the same total length, the same
+// null count and the same sequence of values; how the values are split into
+// chunks does not matter.
+bool ChunkedArray::Equals(const ChunkedArray& other) const {
+  if (length_ != other.length()) { return false; }
+  if (null_count_ != other.null_count()) { return false; }
+
+  // Walk both chunk lists in lock-step. Each iteration compares the longest
+  // run that is available in the current chunk of *both* sides.
+  int this_chunk_idx = 0;
+  int32_t this_start_idx = 0;
+  int other_chunk_idx = 0;
+  int32_t other_start_idx = 0;
+  // The total lengths match, so as soon as one side runs out of chunks the
+  // other side can only have empty chunks left. Stopping then also keeps us
+  // from indexing past the last chunk of `other`.
+  while (this_chunk_idx < static_cast<int32_t>(chunks_.size()) &&
+         other_chunk_idx < other.num_chunks()) {
+    const std::shared_ptr<Array> this_array = chunks_[this_chunk_idx];
+    const std::shared_ptr<Array> other_array = other.chunk(other_chunk_idx);
+    int32_t common_length = std::min(
+        this_array->length() - this_start_idx, other_array->length() - other_start_idx);
+    if (!this_array->RangeEquals(this_start_idx, this_start_idx + common_length,
+            other_start_idx, other_array)) {
+      return false;
+    }
+
+    // Consume the compared run on both sides. A chunk that is now exhausted is
+    // replaced by the next one (offset 0); otherwise the offset has to move
+    // past the compared run so that the next iteration does not re-compare it.
+    this_start_idx += common_length;
+    if (this_start_idx == this_array->length()) {
+      this_chunk_idx++;
+      this_start_idx = 0;
+    }
+    other_start_idx += common_length;
+    if (other_start_idx == other_array->length()) {
+      other_chunk_idx++;
+      other_start_idx = 0;
+    }
+  }
+  return true;
+}
+
+bool ChunkedArray::Equals(const std::shared_ptr<ChunkedArray>& other) const {
+  if (this == other.get()) { return true; }
+  if (!other) { return false; }
+  return Equals(*other.get());
+}
+
 Column::Column(const std::shared_ptr<Field>& field, const ArrayVector& chunks)
     : field_(field) {
   data_ = std::make_shared<ChunkedArray>(chunks);
@@ -49,6 +101,18 @@ Column::Column(
     const std::shared_ptr<Field>& field, const std::shared_ptr<ChunkedArray>& data)
     : field_(field), data_(data) {}
 
+bool Column::Equals(const Column& other) const {
+  if (!field_->Equals(other.field())) { return false; }
+  return data_->Equals(other.data());
+}
+
+bool Column::Equals(const std::shared_ptr<Column>& other) const {
+  if (this == other.get()) { return true; }
+  if (!other) { return false; }
+
+  return Equals(*other.get());
+}
+
 Status Column::ValidateData() {
   for (int i = 0; i < data_->num_chunks(); ++i) {
     std::shared_ptr<DataType> type = data_->chunk(i)->type();

http://git-wip-us.apache.org/repos/asf/arrow/blob/fdbc5794/cpp/src/arrow/column.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/column.h b/cpp/src/arrow/column.h
index 1caafec..f716473 100644
--- a/cpp/src/arrow/column.h
+++ b/cpp/src/arrow/column.h
@@ -48,6 +48,9 @@ class ARROW_EXPORT ChunkedArray {
 
   std::shared_ptr<Array> chunk(int i) const { return chunks_[i]; }
 
+  bool Equals(const ChunkedArray& other) const;
+  bool Equals(const std::shared_ptr<ChunkedArray>& other) const;
+
  protected:
   ArrayVector chunks_;
   int64_t length_;
@@ -78,6 +81,10 @@ class ARROW_EXPORT Column {
 
   // @returns: the column's data as a chunked logical array
   std::shared_ptr<ChunkedArray> data() const { return data_; }
+
+  bool Equals(const Column& other) const;
+  bool Equals(const std::shared_ptr<Column>& other) const;
+
   // Verify that the column's array data is consistent with the passed field's
   // metadata
   Status ValidateData();

http://git-wip-us.apache.org/repos/asf/arrow/blob/fdbc5794/cpp/src/arrow/table-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc
index f62336d..734b941 100644
--- a/cpp/src/arrow/table-test.cc
+++ b/cpp/src/arrow/table-test.cc
@@ -34,16 +34,12 @@ using std::vector;
 
 namespace arrow {
 
-const auto INT16 = std::make_shared<Int16Type>();
-const auto UINT8 = std::make_shared<UInt8Type>();
-const auto INT32 = std::make_shared<Int32Type>();
-
 class TestTable : public TestBase {
  public:
   void MakeExample1(int length) {
-    auto f0 = std::make_shared<Field>("f0", INT32);
-    auto f1 = std::make_shared<Field>("f1", UINT8);
-    auto f2 = std::make_shared<Field>("f2", INT16);
+    auto f0 = std::make_shared<Field>("f0", int32());
+    auto f1 = std::make_shared<Field>("f1", uint8());
+    auto f2 = std::make_shared<Field>("f2", int16());
 
     vector<shared_ptr<Field>> fields = {f0, f1, f2};
     schema_ = std::make_shared<Schema>(fields);
@@ -55,7 +51,7 @@ class TestTable : public TestBase {
   }
 
  protected:
-  std::unique_ptr<Table> table_;
+  std::shared_ptr<Table> table_;
   shared_ptr<Schema> schema_;
   vector<std::shared_ptr<Column>> columns_;
 };
@@ -123,14 +119,40 @@ TEST_F(TestTable, InvalidColumns) {
   ASSERT_RAISES(Invalid, table_->ValidateColumns());
 }
 
+TEST_F(TestTable, Equals) {
+  int length = 100;
+  MakeExample1(length);
+
+  std::string name = "data";
+  table_.reset(new Table(name, schema_, columns_));
+
+  ASSERT_TRUE(table_->Equals(table_));
+  ASSERT_FALSE(table_->Equals(nullptr));
+  // Differing name
+  ASSERT_FALSE(table_->Equals(std::make_shared<Table>("other_name", schema_, columns_)));
+  // Differing schema
+  auto f0 = std::make_shared<Field>("f3", int32());
+  auto f1 = std::make_shared<Field>("f4", uint8());
+  auto f2 = std::make_shared<Field>("f5", int16());
+  vector<shared_ptr<Field>> fields = {f0, f1, f2};
+  auto other_schema = std::make_shared<Schema>(fields);
+  ASSERT_FALSE(table_->Equals(std::make_shared<Table>(name, other_schema, columns_)));
+  // Differing columns
+  std::vector<std::shared_ptr<Column>> other_columns = {
+      std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length, 10)),
+      std::make_shared<Column>(schema_->field(1), MakePrimitive<UInt8Array>(length, 10)),
+      std::make_shared<Column>(schema_->field(2), MakePrimitive<Int16Array>(length, 10))};
+  ASSERT_FALSE(table_->Equals(std::make_shared<Table>(name, schema_, other_columns)));
+}
+
 class TestRecordBatch : public TestBase {};
 
 TEST_F(TestRecordBatch, Equals) {
   const int length = 10;
 
-  auto f0 = std::make_shared<Field>("f0", INT32);
-  auto f1 = std::make_shared<Field>("f1", UINT8);
-  auto f2 = std::make_shared<Field>("f2", INT16);
+  auto f0 = std::make_shared<Field>("f0", int32());
+  auto f1 = std::make_shared<Field>("f1", uint8());
+  auto f2 = std::make_shared<Field>("f2", int16());
 
   vector<shared_ptr<Field>> fields = {f0, f1, f2};
   auto schema = std::make_shared<Schema>(fields);

http://git-wip-us.apache.org/repos/asf/arrow/blob/fdbc5794/cpp/src/arrow/table.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
index 855d4ec..45f672e 100644
--- a/cpp/src/arrow/table.cc
+++ b/cpp/src/arrow/table.cc
@@ -77,6 +77,23 @@ Table::Table(const std::string& name, const std::shared_ptr<Schema>& schema,
     const std::vector<std::shared_ptr<Column>>& columns, int64_t num_rows)
     : name_(name), schema_(schema), columns_(columns), num_rows_(num_rows) {}
 
+bool Table::Equals(const Table& other) const {
+  if (name_ != other.name()) { return false; }
+  if (!schema_->Equals(other.schema())) { return false; }
+  if (static_cast<int64_t>(columns_.size()) != other.num_columns()) { return false; }
+
+  for (size_t i = 0; i < columns_.size(); i++) {
+    if (!columns_[i]->Equals(other.column(i))) { return false; }
+  }
+  return true;
+}
+
+bool Table::Equals(const std::shared_ptr<Table>& other) const {
+  if (this == other.get()) { return true; }
+  if (!other) { return false; }
+  return Equals(*other.get());
+}
+
 Status Table::ValidateColumns() const {
   if (num_columns() != schema_->num_fields()) {
     return Status::Invalid("Number of columns did not match schema");

http://git-wip-us.apache.org/repos/asf/arrow/blob/fdbc5794/cpp/src/arrow/table.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h
index bf5c39f..0f2418d 100644
--- a/cpp/src/arrow/table.h
+++ b/cpp/src/arrow/table.h
@@ -100,6 +100,9 @@ class ARROW_EXPORT Table {
   // @returns: the number of rows (the corresponding length of each column)
   int64_t num_rows() const { return num_rows_; }
 
+  bool Equals(const Table& other) const;
+  bool Equals(const std::shared_ptr<Table>& other) const;
+
   // After construction, perform any checks to validate the input arguments
   Status ValidateColumns() const;
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/fdbc5794/cpp/src/arrow/test-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
index ce9327d..70e9333 100644
--- a/cpp/src/arrow/test-util.h
+++ b/cpp/src/arrow/test-util.h
@@ -81,7 +81,7 @@ class TestBase : public ::testing::Test {
     auto null_bitmap = std::make_shared<PoolBuffer>(pool_);
     EXPECT_OK(data->Resize(length * sizeof(typename ArrayType::value_type)));
     EXPECT_OK(null_bitmap->Resize(BitUtil::BytesForBits(length)));
-    return std::make_shared<ArrayType>(length, data, 10, null_bitmap);
+    return std::make_shared<ArrayType>(length, data, null_count, null_bitmap);
   }
 
  protected: