You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2017/02/08 08:17:14 UTC
arrow git commit: ARROW-537: [C++] Do not compare String/Binary data
in null slots when comparing arrays
Repository: arrow
Updated Branches:
refs/heads/master c322cbf22 -> 1407abfc9
ARROW-537: [C++] Do not compare String/Binary data in null slots when comparing arrays
Author: Wes McKinney <we...@twosigma.com>
Closes #327 from wesm/ARROW-537 and squashes the following commits:
66b1961 [Wes McKinney] Do not compare String/Binary data in null slots when comparing arrays
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/1407abfc
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/1407abfc
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/1407abfc
Branch: refs/heads/master
Commit: 1407abfc90c03e133f198b59fed48469d171c0a9
Parents: c322cbf
Author: Wes McKinney <we...@twosigma.com>
Authored: Wed Feb 8 09:16:57 2017 +0100
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Wed Feb 8 09:16:57 2017 +0100
----------------------------------------------------------------------
cpp/src/arrow/array-string-test.cc | 41 ++++++++++++++++++++++
cpp/src/arrow/array.cc | 11 ++++--
cpp/src/arrow/array.h | 9 +++--
cpp/src/arrow/compare.cc | 55 +++++++++++++++++++-----------
python/src/pyarrow/adapters/pandas.cc | 12 +++----
5 files changed, 95 insertions(+), 33 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/1407abfc/cpp/src/arrow/array-string-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-string-test.cc b/cpp/src/arrow/array-string-test.cc
index 8b7eb41..c4d9bf4 100644
--- a/cpp/src/arrow/array-string-test.cc
+++ b/cpp/src/arrow/array-string-test.cc
@@ -140,6 +140,47 @@ TEST_F(TestStringArray, TestEmptyStringComparison) {
ASSERT_TRUE(strings_a->Equals(strings_b));
}
+TEST_F(TestStringArray, CompareNullByteSlots) {
+ StringBuilder builder(default_memory_pool());
+ StringBuilder builder2(default_memory_pool());
+ StringBuilder builder3(default_memory_pool());
+
+ builder.Append("foo");
+ builder2.Append("foo");
+ builder3.Append("foo");
+
+ builder.Append("bar");
+ builder2.AppendNull();
+
+ // same length, but different
+ builder3.Append("xyz");
+
+ builder.Append("baz");
+ builder2.Append("baz");
+ builder3.Append("baz");
+
+ std::shared_ptr<Array> array, array2, array3;
+ ASSERT_OK(builder.Finish(&array));
+ ASSERT_OK(builder2.Finish(&array2));
+ ASSERT_OK(builder3.Finish(&array3));
+
+ const auto& a1 = static_cast<const StringArray&>(*array);
+ const auto& a2 = static_cast<const StringArray&>(*array2);
+ const auto& a3 = static_cast<const StringArray&>(*array3);
+
+ // The validity bitmaps are the same, the data is different, but the unequal
+ // portion is masked out
+ StringArray equal_array(3, a1.value_offsets(), a1.data(), a2.null_bitmap(), 1);
+ StringArray equal_array2(3, a3.value_offsets(), a3.data(), a2.null_bitmap(), 1);
+
+ ASSERT_TRUE(equal_array.Equals(equal_array2));
+ ASSERT_TRUE(a2.RangeEquals(equal_array2, 0, 3, 0));
+
+ ASSERT_TRUE(equal_array.Array::Slice(1)->Equals(equal_array2.Array::Slice(1)));
+ ASSERT_TRUE(
+ equal_array.Array::Slice(1)->RangeEquals(0, 2, 0, equal_array2.Array::Slice(1)));
+}
+
// ----------------------------------------------------------------------
// String builder tests
http://git-wip-us.apache.org/repos/asf/arrow/blob/1407abfc/cpp/src/arrow/array.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index f84023e..39459a0 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -87,11 +87,16 @@ bool Array::ApproxEquals(const std::shared_ptr<Array>& arr) const {
}
bool Array::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
- const std::shared_ptr<Array>& arr) const {
- if (!arr) { return false; }
+ const std::shared_ptr<Array>& other) const {
+ if (!other) { return false; }
+ return RangeEquals(*other, start_idx, end_idx, other_start_idx);
+}
+
+bool Array::RangeEquals(const Array& other, int32_t start_idx, int32_t end_idx,
+ int32_t other_start_idx) const {
bool are_equal = false;
Status error =
- ArrayRangeEquals(*this, *arr, start_idx, end_idx, other_start_idx, &are_equal);
+ ArrayRangeEquals(*this, other, start_idx, end_idx, other_start_idx, &are_equal);
if (!error.ok()) { DCHECK(false) << "Arrays not comparable: " << error.ToString(); }
return are_equal;
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/1407abfc/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index f3e8f9a..32d156b 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -127,7 +127,10 @@ class ARROW_EXPORT Array {
/// Compare if the range of slots specified are equal for the given array and
/// this array. end_idx exclusive. This methods does not bounds check.
bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
- const std::shared_ptr<Array>& arr) const;
+ const std::shared_ptr<Array>& other) const;
+
+ bool RangeEquals(const Array& other, int32_t start_idx, int32_t end_idx,
+ int32_t other_start_idx) const;
/// Determines if the array is internally consistent.
///
@@ -315,8 +318,8 @@ class ARROW_EXPORT BinaryArray : public Array {
// Account for base offset
i += offset_;
- const int32_t pos = raw_value_offsets_[i];
- *out_length = raw_value_offsets_[i + 1] - pos;
+ const int32_t pos = raw_value_offsets_[i + offset_];
+ *out_length = raw_value_offsets_[i + offset_ + 1] - pos;
return raw_data_ + pos;
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/1407abfc/cpp/src/arrow/compare.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 27fad71..21fdb66 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -335,15 +335,8 @@ class EqualsVisitor : public RangeEqualsVisitor {
const int value_byte_size = size_meta.bit_width() / 8;
DCHECK_GT(value_byte_size, 0);
- const uint8_t* left_data = nullptr;
- if (left.length() > 0) {
- left_data = left.data()->data() + left.offset() * value_byte_size;
- }
-
- const uint8_t* right_data = nullptr;
- if (right.length() > 0) {
- right_data = right.data()->data() + right.offset() * value_byte_size;
- }
+ const uint8_t* left_data = left.data()->data() + left.offset() * value_byte_size;
+ const uint8_t* right_data = right.data()->data() + right.offset() * value_byte_size;
if (left.null_count() > 0) {
for (int i = 0; i < left.length(); ++i) {
@@ -355,7 +348,6 @@ class EqualsVisitor : public RangeEqualsVisitor {
}
return true;
} else {
- if (left.length() == 0) { return true; }
return memcmp(left_data, right_data, value_byte_size * left.length()) == 0;
}
}
@@ -424,14 +416,35 @@ class EqualsVisitor : public RangeEqualsVisitor {
bool equal_offsets = ValueOffsetsEqual<BinaryArray>(left);
if (!equal_offsets) { return false; }
- if (left.offset() == 0 && right.offset() == 0) {
- if (!left.data() && !(right.data())) { return true; }
- return left.data()->Equals(*right.data(), left.raw_value_offsets()[left.length()]);
+ if (!left.data() && !(right.data())) { return true; }
+ if (left.value_offset(left.length()) == 0) { return true; }
+
+ const uint8_t* left_data = left.data()->data();
+ const uint8_t* right_data = right.data()->data();
+
+ if (left.null_count() == 0) {
+ // Fast path for null count 0, single memcmp
+ if (left.offset() == 0 && right.offset() == 0) {
+ return std::memcmp(
+ left_data, right_data, left.raw_value_offsets()[left.length()]) == 0;
+ } else {
+ const int64_t total_bytes =
+ left.value_offset(left.length()) - left.value_offset(0);
+ return std::memcmp(left_data + left.value_offset(0),
+ right_data + right.value_offset(0), total_bytes) == 0;
+ }
} else {
- // Compare the corresponding data range
- const int64_t total_bytes = left.value_offset(left.length()) - left.value_offset(0);
- return std::memcmp(left.data()->data() + left.value_offset(0),
- right.data()->data() + right.value_offset(0), total_bytes) == 0;
+ // ARROW-537: Only compare data in non-null slots
+ const int32_t* left_offsets = left.raw_value_offsets();
+ const int32_t* right_offsets = right.raw_value_offsets();
+ for (int32_t i = 0; i < left.length(); ++i) {
+ if (left.IsNull(i)) { continue; }
+ if (std::memcmp(left_data + left_offsets[i], right_data + right_offsets[i],
+ left.value_length(i))) {
+ return false;
+ }
+ }
+ return true;
}
}
@@ -485,8 +498,6 @@ inline bool FloatingApproxEquals(
static constexpr T EPSILON = 1E-5;
- if (left.length() == 0 && right.length() == 0) { return true; }
-
if (left.null_count() > 0) {
for (int32_t i = 0; i < left.length(); ++i) {
if (left.IsNull(i)) continue;
@@ -535,6 +546,8 @@ Status ArrayEquals(const Array& left, const Array& right, bool* are_equal) {
*are_equal = true;
} else if (!BaseDataEquals(left, right)) {
*are_equal = false;
+ } else if (left.length() == 0) {
+ *are_equal = true;
} else {
EqualsVisitor visitor(right);
RETURN_NOT_OK(left.Accept(&visitor));
@@ -549,6 +562,8 @@ Status ArrayRangeEquals(const Array& left, const Array& right, int32_t left_star
*are_equal = true;
} else if (left.type_enum() != right.type_enum()) {
*are_equal = false;
+ } else if (left.length() == 0) {
+ *are_equal = true;
} else {
RangeEqualsVisitor visitor(right, left_start_idx, left_end_idx, right_start_idx);
RETURN_NOT_OK(left.Accept(&visitor));
@@ -563,6 +578,8 @@ Status ArrayApproxEquals(const Array& left, const Array& right, bool* are_equal)
*are_equal = true;
} else if (!BaseDataEquals(left, right)) {
*are_equal = false;
+ } else if (left.length() == 0) {
+ *are_equal = true;
} else {
ApproxEqualsVisitor visitor(right);
RETURN_NOT_OK(left.Accept(&visitor));
http://git-wip-us.apache.org/repos/asf/arrow/blob/1407abfc/python/src/pyarrow/adapters/pandas.cc
----------------------------------------------------------------------
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
index b4e0d2f..bdc2cb7 100644
--- a/python/src/pyarrow/adapters/pandas.cc
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -1717,12 +1717,8 @@ Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
#if (NPY_INT64 == NPY_LONGLONG) && (NPY_SIZEOF_LONGLONG == 8)
// Both LONGLONG and INT64 can be observed in the wild, which is buggy. We set
// U/LONGLONG to U/INT64 so things work properly.
- if (type_num == NPY_LONGLONG) {
- type_num = NPY_INT64;
- }
- if (type_num == NPY_ULONGLONG) {
- type_num = NPY_UINT64;
- }
+ if (type_num == NPY_LONGLONG) { type_num = NPY_INT64; }
+ if (type_num == NPY_ULONGLONG) { type_num = NPY_UINT64; }
#endif
switch (type_num) {
@@ -1732,14 +1728,14 @@ Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
TO_ARROW_CASE(INT32);
TO_ARROW_CASE(INT64);
#if (NPY_INT64 != NPY_LONGLONG)
- TO_ARROW_CASE(LONGLONG);
+ TO_ARROW_CASE(LONGLONG);
#endif
TO_ARROW_CASE(UINT8);
TO_ARROW_CASE(UINT16);
TO_ARROW_CASE(UINT32);
TO_ARROW_CASE(UINT64);
#if (NPY_UINT64 != NPY_ULONGLONG)
- TO_ARROW_CASE(ULONGLONG);
+ TO_ARROW_CASE(ULONGLONG);
#endif
TO_ARROW_CASE(FLOAT32);
TO_ARROW_CASE(FLOAT64);