You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2017/02/08 08:17:14 UTC

arrow git commit: ARROW-537: [C++] Do not compare String/Binary data in null slots when comparing arrays

Repository: arrow
Updated Branches:
  refs/heads/master c322cbf22 -> 1407abfc9


ARROW-537: [C++] Do not compare String/Binary data in null slots when comparing arrays

Author: Wes McKinney <we...@twosigma.com>

Closes #327 from wesm/ARROW-537 and squashes the following commits:

66b1961 [Wes McKinney] Do not compare String/Binary data in null slots when comparing arrays


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/1407abfc
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/1407abfc
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/1407abfc

Branch: refs/heads/master
Commit: 1407abfc90c03e133f198b59fed48469d171c0a9
Parents: c322cbf
Author: Wes McKinney <we...@twosigma.com>
Authored: Wed Feb 8 09:16:57 2017 +0100
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Wed Feb 8 09:16:57 2017 +0100

----------------------------------------------------------------------
 cpp/src/arrow/array-string-test.cc    | 41 ++++++++++++++++++++++
 cpp/src/arrow/array.cc                | 11 ++++--
 cpp/src/arrow/array.h                 |  9 +++--
 cpp/src/arrow/compare.cc              | 55 +++++++++++++++++++-----------
 python/src/pyarrow/adapters/pandas.cc | 12 +++----
 5 files changed, 95 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/1407abfc/cpp/src/arrow/array-string-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-string-test.cc b/cpp/src/arrow/array-string-test.cc
index 8b7eb41..c4d9bf4 100644
--- a/cpp/src/arrow/array-string-test.cc
+++ b/cpp/src/arrow/array-string-test.cc
@@ -140,6 +140,47 @@ TEST_F(TestStringArray, TestEmptyStringComparison) {
   ASSERT_TRUE(strings_a->Equals(strings_b));
 }
 
+TEST_F(TestStringArray, CompareNullByteSlots) {
+  StringBuilder builder(default_memory_pool());
+  StringBuilder builder2(default_memory_pool());
+  StringBuilder builder3(default_memory_pool());
+
+  builder.Append("foo");
+  builder2.Append("foo");
+  builder3.Append("foo");
+
+  builder.Append("bar");
+  builder2.AppendNull();
+
+  // same length, but different
+  builder3.Append("xyz");
+
+  builder.Append("baz");
+  builder2.Append("baz");
+  builder3.Append("baz");
+
+  std::shared_ptr<Array> array, array2, array3;
+  ASSERT_OK(builder.Finish(&array));
+  ASSERT_OK(builder2.Finish(&array2));
+  ASSERT_OK(builder3.Finish(&array3));
+
+  const auto& a1 = static_cast<const StringArray&>(*array);
+  const auto& a2 = static_cast<const StringArray&>(*array2);
+  const auto& a3 = static_cast<const StringArray&>(*array3);
+
+  // The validity bitmaps are the same, the data is different, but the unequal
+  // portion is masked out
+  StringArray equal_array(3, a1.value_offsets(), a1.data(), a2.null_bitmap(), 1);
+  StringArray equal_array2(3, a3.value_offsets(), a3.data(), a2.null_bitmap(), 1);
+
+  ASSERT_TRUE(equal_array.Equals(equal_array2));
+  ASSERT_TRUE(a2.RangeEquals(equal_array2, 0, 3, 0));
+
+  ASSERT_TRUE(equal_array.Array::Slice(1)->Equals(equal_array2.Array::Slice(1)));
+  ASSERT_TRUE(
+      equal_array.Array::Slice(1)->RangeEquals(0, 2, 0, equal_array2.Array::Slice(1)));
+}
+
 // ----------------------------------------------------------------------
 // String builder tests
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/1407abfc/cpp/src/arrow/array.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index f84023e..39459a0 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -87,11 +87,16 @@ bool Array::ApproxEquals(const std::shared_ptr<Array>& arr) const {
 }
 
 bool Array::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
-    const std::shared_ptr<Array>& arr) const {
-  if (!arr) { return false; }
+    const std::shared_ptr<Array>& other) const {
+  if (!other) { return false; }
+  return RangeEquals(*other, start_idx, end_idx, other_start_idx);
+}
+
+bool Array::RangeEquals(const Array& other, int32_t start_idx, int32_t end_idx,
+    int32_t other_start_idx) const {
   bool are_equal = false;
   Status error =
-      ArrayRangeEquals(*this, *arr, start_idx, end_idx, other_start_idx, &are_equal);
+      ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx, &are_equal);
   if (!error.ok()) { DCHECK(false) << "Arrays not comparable: " << error.ToString(); }
   return are_equal;
 }

http://git-wip-us.apache.org/repos/asf/arrow/blob/1407abfc/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index f3e8f9a..32d156b 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -127,7 +127,10 @@ class ARROW_EXPORT Array {
   /// Compare if the range of slots specified are equal for the given array and
   /// this array.  end_idx exclusive.  This methods does not bounds check.
   bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
-      const std::shared_ptr<Array>& arr) const;
+      const std::shared_ptr<Array>& other) const;
+
+  bool RangeEquals(const Array& other, int32_t start_idx, int32_t end_idx,
+      int32_t other_start_idx) const;
 
   /// Determines if the array is internally consistent.
   ///
@@ -315,8 +318,8 @@ class ARROW_EXPORT BinaryArray : public Array {
     // Account for base offset
     i += offset_;
 
-    const int32_t pos = raw_value_offsets_[i];
-    *out_length = raw_value_offsets_[i + 1] - pos;
+    const int32_t pos = raw_value_offsets_[i + offset_];
+    *out_length = raw_value_offsets_[i + offset_ + 1] - pos;
     return raw_data_ + pos;
   }
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/1407abfc/cpp/src/arrow/compare.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 27fad71..21fdb66 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -335,15 +335,8 @@ class EqualsVisitor : public RangeEqualsVisitor {
     const int value_byte_size = size_meta.bit_width() / 8;
     DCHECK_GT(value_byte_size, 0);
 
-    const uint8_t* left_data = nullptr;
-    if (left.length() > 0) {
-      left_data = left.data()->data() + left.offset() * value_byte_size;
-    }
-
-    const uint8_t* right_data = nullptr;
-    if (right.length() > 0) {
-      right_data = right.data()->data() + right.offset() * value_byte_size;
-    }
+    const uint8_t* left_data = left.data()->data() + left.offset() * value_byte_size;
+    const uint8_t* right_data = right.data()->data() + right.offset() * value_byte_size;
 
     if (left.null_count() > 0) {
       for (int i = 0; i < left.length(); ++i) {
@@ -355,7 +348,6 @@ class EqualsVisitor : public RangeEqualsVisitor {
       }
       return true;
     } else {
-      if (left.length() == 0) { return true; }
       return memcmp(left_data, right_data, value_byte_size * left.length()) == 0;
     }
   }
@@ -424,14 +416,35 @@ class EqualsVisitor : public RangeEqualsVisitor {
     bool equal_offsets = ValueOffsetsEqual<BinaryArray>(left);
     if (!equal_offsets) { return false; }
 
-    if (left.offset() == 0 && right.offset() == 0) {
-      if (!left.data() && !(right.data())) { return true; }
-      return left.data()->Equals(*right.data(), left.raw_value_offsets()[left.length()]);
+    if (!left.data() && !(right.data())) { return true; }
+    if (left.value_offset(left.length()) == 0) { return true; }
+
+    const uint8_t* left_data = left.data()->data();
+    const uint8_t* right_data = right.data()->data();
+
+    if (left.null_count() == 0) {
+      // Fast path for null count 0, single memcmp
+      if (left.offset() == 0 && right.offset() == 0) {
+        return std::memcmp(
+                   left_data, right_data, left.raw_value_offsets()[left.length()]) == 0;
+      } else {
+        const int64_t total_bytes =
+            left.value_offset(left.length()) - left.value_offset(0);
+        return std::memcmp(left_data + left.value_offset(0),
+                   right_data + right.value_offset(0), total_bytes) == 0;
+      }
     } else {
-      // Compare the corresponding data range
-      const int64_t total_bytes = left.value_offset(left.length()) - left.value_offset(0);
-      return std::memcmp(left.data()->data() + left.value_offset(0),
-                 right.data()->data() + right.value_offset(0), total_bytes) == 0;
+      // ARROW-537: Only compare data in non-null slots
+      const int32_t* left_offsets = left.raw_value_offsets();
+      const int32_t* right_offsets = right.raw_value_offsets();
+      for (int32_t i = 0; i < left.length(); ++i) {
+        if (left.IsNull(i)) { continue; }
+        if (std::memcmp(left_data + left_offsets[i], right_data + right_offsets[i],
+                left.value_length(i))) {
+          return false;
+        }
+      }
+      return true;
     }
   }
 
@@ -485,8 +498,6 @@ inline bool FloatingApproxEquals(
 
   static constexpr T EPSILON = 1E-5;
 
-  if (left.length() == 0 && right.length() == 0) { return true; }
-
   if (left.null_count() > 0) {
     for (int32_t i = 0; i < left.length(); ++i) {
       if (left.IsNull(i)) continue;
@@ -535,6 +546,8 @@ Status ArrayEquals(const Array& left, const Array& right, bool* are_equal) {
     *are_equal = true;
   } else if (!BaseDataEquals(left, right)) {
     *are_equal = false;
+  } else if (left.length() == 0) {
+    *are_equal = true;
   } else {
     EqualsVisitor visitor(right);
     RETURN_NOT_OK(left.Accept(&visitor));
@@ -549,6 +562,8 @@ Status ArrayRangeEquals(const Array& left, const Array& right, int32_t left_star
     *are_equal = true;
   } else if (left.type_enum() != right.type_enum()) {
     *are_equal = false;
+  } else if (left.length() == 0) {
+    *are_equal = true;
   } else {
     RangeEqualsVisitor visitor(right, left_start_idx, left_end_idx, right_start_idx);
     RETURN_NOT_OK(left.Accept(&visitor));
@@ -563,6 +578,8 @@ Status ArrayApproxEquals(const Array& left, const Array& right, bool* are_equal)
     *are_equal = true;
   } else if (!BaseDataEquals(left, right)) {
     *are_equal = false;
+  } else if (left.length() == 0) {
+    *are_equal = true;
   } else {
     ApproxEqualsVisitor visitor(right);
     RETURN_NOT_OK(left.Accept(&visitor));

http://git-wip-us.apache.org/repos/asf/arrow/blob/1407abfc/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
index b4e0d2f..bdc2cb7 100644
--- a/python/src/pyarrow/adapters/pandas.cc
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -1717,12 +1717,8 @@ Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
 #if (NPY_INT64 == NPY_LONGLONG) && (NPY_SIZEOF_LONGLONG == 8)
   // Both LONGLONG and INT64 can be observed in the wild, which is buggy. We set
   // U/LONGLONG to U/INT64 so things work properly.
-  if (type_num == NPY_LONGLONG) {
-    type_num = NPY_INT64;
-  }
-  if (type_num == NPY_ULONGLONG) {
-    type_num = NPY_UINT64;
-  }
+  if (type_num == NPY_LONGLONG) { type_num = NPY_INT64; }
+  if (type_num == NPY_ULONGLONG) { type_num = NPY_UINT64; }
 #endif
 
   switch (type_num) {
@@ -1732,14 +1728,14 @@ Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
     TO_ARROW_CASE(INT32);
     TO_ARROW_CASE(INT64);
 #if (NPY_INT64 != NPY_LONGLONG)
-	TO_ARROW_CASE(LONGLONG);
+    TO_ARROW_CASE(LONGLONG);
 #endif
     TO_ARROW_CASE(UINT8);
     TO_ARROW_CASE(UINT16);
     TO_ARROW_CASE(UINT32);
     TO_ARROW_CASE(UINT64);
 #if (NPY_UINT64 != NPY_ULONGLONG)
-	TO_ARROW_CASE(ULONGLONG);
+    TO_ARROW_CASE(ULONGLONG);
 #endif
     TO_ARROW_CASE(FLOAT32);
     TO_ARROW_CASE(FLOAT64);