You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/07/19 12:01:48 UTC
[arrow] branch master updated: ARROW-16911: [C++] Add Equals method to Partitioning (#13567)
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new d07dc75e27 ARROW-16911: [C++] Add Equals method to Partitioning (#13567)
d07dc75e27 is described below
commit d07dc75e27f016ca05c4ca22bc2d19c79fa2cd4a
Author: Vibhatha Lakmal Abeykoon <vi...@users.noreply.github.com>
AuthorDate: Tue Jul 19 17:31:43 2022 +0530
ARROW-16911: [C++] Add Equals method to Partitioning (#13567)
Add an `Equals` method to the `Partitioning` class and its subclasses, and include a few test cases covering it.
Lead-authored-by: Vibhatha Abeykoon <vi...@gmail.com>
Co-authored-by: Antoine Pitrou <pi...@free.fr>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/arrow/dataset/partition.cc | 50 ++++++++++++++++++++++
cpp/src/arrow/dataset/partition.h | 18 +++++++-
cpp/src/arrow/dataset/partition_test.cc | 73 +++++++++++++++++++++++++++++++++
3 files changed, 140 insertions(+), 1 deletion(-)
diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index ca65288721..a210c947a3 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -82,6 +82,10 @@ std::shared_ptr<Partitioning> Partitioning::Default() {
std::string type_name() const override { return "default"; }
+  // Equality for this partitioning is decided solely by comparing the two
+  // type names; nothing else about `other` is inspected.
+  bool Equals(const Partitioning& other) const override {
+    return other.type_name() == type_name();
+  }
+
Result<compute::Expression> Parse(const std::string& path) const override {
return compute::literal(true);
}
@@ -115,6 +119,28 @@ static Result<RecordBatchVector> ApplyGroupings(
return out;
}
+// Return whether `other` is a key/value partitioning equivalent to this one.
+//
+// Two key/value partitionings are equal when they use the same segment
+// encoding, pass the base-class Partitioning::Equals comparison, and hold
+// pairwise-equal dictionaries (a null dictionary only matches another null).
+bool KeyValuePartitioning::Equals(const Partitioning& other) const {
+  if (this == &other) {
+    return true;
+  }
+  // `other` may be any Partitioning, so verify its dynamic type before touching
+  // KeyValuePartitioning members instead of assuming the downcast is valid.
+  const auto* kv_partitioning = dynamic_cast<const KeyValuePartitioning*>(&other);
+  if (kv_partitioning == nullptr) {
+    return false;
+  }
+  // Run the cheap comparisons first so that the element-wise array
+  // comparisons below are only reached when everything else already matches.
+  if (options_.segment_encoding != kv_partitioning->options_.segment_encoding ||
+      !Partitioning::Equals(other)) {
+    return false;
+  }
+  const auto& other_dictionaries = kv_partitioning->dictionaries();
+  if (dictionaries_.size() != other_dictionaries.size()) {
+    return false;
+  }
+  for (size_t i = 0; i < dictionaries_.size(); ++i) {
+    const auto& array = dictionaries_[i];
+    const auto& other_array = other_dictionaries[i];
+    // A slot matches when both entries are null or both hold equal arrays.
+    const bool match = (array == nullptr && other_array == nullptr) ||
+                       (array && other_array && array->Equals(other_array));
+    if (!match) {
+      return false;
+    }
+  }
+  return true;
+}
+
Result<Partitioning::PartitionedBatches> KeyValuePartitioning::Partition(
const std::shared_ptr<RecordBatch>& batch) const {
std::vector<int> key_indices;
@@ -381,6 +407,10 @@ Result<std::vector<KeyValuePartitioning::Key>> DirectoryPartitioning::ParseKeys(
return ParsePartitionSegments(segments);
}
+// Return whether `other` equals this directory partitioning: it must report
+// the same type name, after which KeyValuePartitioning::Equals decides.
+bool DirectoryPartitioning::Equals(const Partitioning& other) const {
+  if (other.type_name() != type_name()) {
+    return false;
+  }
+  return KeyValuePartitioning::Equals(other);
+}
+
FilenamePartitioning::FilenamePartitioning(std::shared_ptr<Schema> schema,
ArrayVector dictionaries,
KeyValuePartitioningOptions options)
@@ -678,6 +708,13 @@ std::shared_ptr<PartitioningFactory> FilenamePartitioning::MakeFactory(
new FilenamePartitioningFactory(std::move(field_names), options));
}
+// Return whether `other` equals this filename partitioning. The key/value
+// comparison is only attempted once the type names are known to agree.
+bool FilenamePartitioning::Equals(const Partitioning& other) const {
+  const bool same_kind = other.type_name() == type_name();
+  return same_kind && KeyValuePartitioning::Equals(other);
+}
+
Result<util::optional<KeyValuePartitioning::Key>> HivePartitioning::ParseKey(
const std::string& segment, const HivePartitioningOptions& options) {
auto name_end = string_view(segment).find_first_of('=');
@@ -754,6 +791,19 @@ Result<PartitionPathFormat> HivePartitioning::FormatValues(
return PartitionPathFormat{fs::internal::JoinAbstractPath(std::move(segments)), ""};
}
+// Return whether `other` is a hive partitioning equivalent to this one:
+// same type name, same null fallback, and equal as key/value partitionings.
+bool HivePartitioning::Equals(const Partitioning& other) const {
+  // An object always equals itself.
+  if (&other == this) {
+    return true;
+  }
+  // The downcast below is only attempted when the type names agree.
+  if (other.type_name() != type_name()) {
+    return false;
+  }
+  const auto& that = ::arrow::internal::checked_cast<const HivePartitioning&>(other);
+  // NOTE(review): null_fallback() and options().null_fallback look like two
+  // views of the same setting, so one of these checks may be redundant --
+  // confirm against the class definition before removing either.
+  if (null_fallback() != that.null_fallback()) {
+    return false;
+  }
+  if (options().null_fallback != that.options().null_fallback) {
+    return false;
+  }
+  return KeyValuePartitioning::Equals(other);
+}
+
class HivePartitioningFactory : public KeyValuePartitioningFactory {
public:
explicit HivePartitioningFactory(HivePartitioningFactoryOptions options)
diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h
index bffa2f979f..2d8c8bb274 100644
--- a/cpp/src/arrow/dataset/partition.h
+++ b/cpp/src/arrow/dataset/partition.h
@@ -30,6 +30,7 @@
#include "arrow/compute/exec/expression.h"
#include "arrow/dataset/type_fwd.h"
#include "arrow/dataset/visibility.h"
+#include "arrow/util/compare.h"
#include "arrow/util/optional.h"
namespace arrow {
@@ -63,13 +64,18 @@ struct ARROW_DS_EXPORT PartitionPathFormat {
/// Paths are consumed from left to right. Paths must be relative to
/// the root of a partition; path prefixes must be removed before passing
/// the path to a partitioning for parsing.
-class ARROW_DS_EXPORT Partitioning {
+class ARROW_DS_EXPORT Partitioning : public util::EqualityComparable<Partitioning> {
public:
virtual ~Partitioning() = default;
/// \brief The name identifying the kind of partitioning
virtual std::string type_name() const = 0;
+ /// \brief Return whether the partitionings are equal
+ ///
+ /// This base implementation compares only the two schemas, with schema
+ /// metadata excluded from the comparison (check_metadata is false).
+ /// Subclasses override it to compare their own additional state.
+ virtual bool Equals(const Partitioning& other) const {
+ return schema_->Equals(other.schema_, /*check_metadata=*/false);
+ }
+
/// \brief If the input batch shares any fields with this partitioning,
/// produce sub-batches which satisfy mutually exclusive Expressions.
struct PartitionedBatches {
@@ -180,6 +186,8 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning {
const ArrayVector& dictionaries() const { return dictionaries_; }
+ bool Equals(const Partitioning& other) const override;
+
protected:
KeyValuePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries,
KeyValuePartitioningOptions options)
@@ -223,6 +231,8 @@ class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning {
std::string type_name() const override { return "directory"; }
+ bool Equals(const Partitioning& other) const override;
+
/// \brief Create a factory for a directory partitioning.
///
/// \param[in] field_names The names for the partition fields. Types will be
@@ -282,6 +292,8 @@ class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning {
static Result<util::optional<Key>> ParseKey(const std::string& segment,
const HivePartitioningOptions& options);
+ bool Equals(const Partitioning& other) const override;
+
/// \brief Create a factory for a hive partitioning.
static std::shared_ptr<PartitioningFactory> MakeFactory(
HivePartitioningFactoryOptions = {});
@@ -310,6 +322,8 @@ class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning {
std::string type_name() const override { return name_; }
+  /// \brief Function partitionings compare equal only to themselves
+  ///
+  /// No member of this class is compared; object identity is the only case
+  /// reported as equal, which keeps Equals reflexive (p.Equals(p) is true).
+  bool Equals(const Partitioning& other) const override { return this == &other; }
+
Result<compute::Expression> Parse(const std::string& path) const override {
return parse_impl_(path);
}
@@ -352,6 +366,8 @@ class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning {
static std::shared_ptr<PartitioningFactory> MakeFactory(
std::vector<std::string> field_names, PartitioningFactoryOptions = {});
+ bool Equals(const Partitioning& other) const override;
+
private:
Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc
index 86b8c4f0b9..66a22a2db3 100644
--- a/cpp/src/arrow/dataset/partition_test.cc
+++ b/cpp/src/arrow/dataset/partition_test.cc
@@ -206,6 +206,21 @@ TEST_F(TestPartitioning, DirectoryPartitioning) {
equal(field_ref("beta"), literal("foo"))));
}
+// Directory partitionings compare equal exactly when their schemas match.
+TEST_F(TestPartitioning, DirectoryPartitioningEquals) {
+  // Reference partitioning over (alpha: int32, beta: utf8).
+  auto reference = std::make_shared<DirectoryPartitioning>(
+      schema({field("alpha", int32()), field("beta", utf8())}));
+  // Differs from the reference only in the name of the second field.
+  auto renamed_field = std::make_shared<DirectoryPartitioning>(
+      schema({field("alpha", int32()), field("gamma", utf8())}));
+  // Two further instances built from the same field list as the reference.
+  auto twin = std::make_shared<DirectoryPartitioning>(
+      schema({field("alpha", int32()), field("beta", utf8())}));
+  auto second_twin = std::make_shared<DirectoryPartitioning>(
+      schema({field("alpha", int32()), field("beta", utf8())}));
+
+  // Same object.
+  EXPECT_TRUE(reference->Equals(*reference));
+  // Different field name.
+  EXPECT_FALSE(reference->Equals(*renamed_field));
+  // Distinct objects, identical schemas.
+  EXPECT_TRUE(reference->Equals(*twin));
+  EXPECT_TRUE(twin->Equals(*second_twin));
+}
+
TEST_F(TestPartitioning, FilenamePartitioning) {
partitioning_ = std::make_shared<FilenamePartitioning>(
schema({field("alpha", int32()), field("beta", utf8())}));
@@ -222,6 +237,21 @@ TEST_F(TestPartitioning, FilenamePartitioning) {
equal(field_ref("beta"), literal("foo"))));
}
+// Filename partitionings compare equal exactly when their schemas match,
+// including both field names and field types.
+TEST_F(TestPartitioning, FilenamePartitioningEquals) {
+  // Reference partitioning over (alpha: int32, beta: utf8).
+  auto reference = std::make_shared<FilenamePartitioning>(
+      schema({field("alpha", int32()), field("beta", utf8())}));
+  // First field renamed to "sigma", type unchanged.
+  auto renamed_int32 = std::make_shared<FilenamePartitioning>(
+      schema({field("sigma", int32()), field("beta", utf8())}));
+  // Same names as `renamed_int32`, but the first field is int64.
+  auto renamed_int64 = std::make_shared<FilenamePartitioning>(
+      schema({field("sigma", int64()), field("beta", utf8())}));
+  // Built from the same field list as `renamed_int64`.
+  auto renamed_int64_twin = std::make_shared<FilenamePartitioning>(
+      schema({field("sigma", int64()), field("beta", utf8())}));
+
+  // Same object.
+  EXPECT_TRUE(reference->Equals(*reference));
+  // Different field name.
+  EXPECT_FALSE(reference->Equals(*renamed_int32));
+  // Same field names, different field type.
+  EXPECT_FALSE(renamed_int32->Equals(*renamed_int64));
+  // Distinct objects, identical schemas.
+  EXPECT_TRUE(renamed_int64->Equals(*renamed_int64_twin));
+}
+
TEST_F(TestPartitioning, DirectoryPartitioningFormat) {
partitioning_ = std::make_shared<DirectoryPartitioning>(
schema({field("alpha", int32()), field("beta", utf8())}));
@@ -426,6 +456,41 @@ TEST_F(TestPartitioning, HivePartitioning) {
AssertParseError("/alpha=0.0/beta=3.25/"); // conversion of "0.0" to int32 fails
}
+// Hive partitionings must agree on schema, dictionaries and null fallback
+// to compare equal.
+TEST_F(TestPartitioning, HivePartitioningEquals) {
+  // No dictionaries at all.
+  const ArrayVector no_dictionaries;
+  // Two string dictionaries, used to make one instance differ from the rest.
+  ArrayVector two_dictionaries = {ArrayFromJSON(utf8(), R"(["foo", "bar", "baz"])"),
+                                  ArrayFromJSON(utf8(), R"(["bar", "foo", "baz"])")};
+
+  // Reference: (alpha: int32, beta: float32), no dictionaries, fallback "xyz".
+  auto reference = std::make_shared<HivePartitioning>(
+      schema({field("alpha", int32()), field("beta", float32())}), no_dictionaries,
+      "xyz");
+  // First field renamed.
+  auto renamed_field = std::make_shared<HivePartitioning>(
+      schema({field("sigma", int32()), field("beta", float32())}), no_dictionaries,
+      "xyz");
+  // Same schema and fallback, but with dictionaries.
+  auto with_dictionaries = std::make_shared<HivePartitioning>(
+      schema({field("alpha", int32()), field("beta", float32())}), two_dictionaries,
+      "xyz");
+  // Same schema and dictionaries, different null fallback.
+  auto other_fallback = std::make_shared<HivePartitioning>(
+      schema({field("alpha", int32()), field("beta", float32())}), no_dictionaries,
+      "abc");
+  // Built from exactly the same arguments as the reference.
+  auto twin = std::make_shared<HivePartitioning>(
+      schema({field("alpha", int32()), field("beta", float32())}), no_dictionaries,
+      "xyz");
+
+  EXPECT_TRUE(reference->Equals(*reference));
+  EXPECT_FALSE(reference->Equals(*renamed_field));
+  EXPECT_FALSE(reference->Equals(*with_dictionaries));
+  EXPECT_FALSE(reference->Equals(*other_fallback));
+  EXPECT_TRUE(reference->Equals(*twin));
+}
+
+// Partitionings of different kinds must never compare equal. All three
+// instances are built from the same field list so that the kind of
+// partitioning is the only difference each assertion can be detecting.
+TEST_F(TestPartitioning, CrossCheckPartitioningEquals) {
+  auto file_part = std::make_shared<FilenamePartitioning>(
+      schema({field("alpha", int32()), field("beta", utf8())}));
+  auto dir_part = std::make_shared<DirectoryPartitioning>(
+      schema({field("alpha", int32()), field("beta", utf8())}));
+  auto hive_part = std::make_shared<HivePartitioning>(
+      schema({field("alpha", int32()), field("beta", utf8())}), ArrayVector(), "xyz");
+
+  // Filename vs. directory.
+  EXPECT_FALSE(file_part->Equals(*dir_part));
+  EXPECT_FALSE(dir_part->Equals(*file_part));
+  // Directory vs. hive.
+  EXPECT_FALSE(dir_part->Equals(*hive_part));
+  EXPECT_FALSE(hive_part->Equals(*dir_part));
+  // Filename vs. hive.
+  EXPECT_FALSE(file_part->Equals(*hive_part));
+  EXPECT_FALSE(hive_part->Equals(*file_part));
+}
+
TEST_F(TestPartitioning, HivePartitioningFormat) {
partitioning_ = std::make_shared<HivePartitioning>(
schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz");
@@ -891,6 +956,14 @@ class RangePartitioning : public Partitioning {
std::string type_name() const override { return "range"; }
+  /// Range partitionings are equal when `other` reports the same type name
+  /// and the base-class Partitioning::Equals comparison succeeds.
+  bool Equals(const Partitioning& other) const override {
+    if (this == &other) {
+      return true;
+    }
+    // type_name() is virtual on Partitioning, so it can be queried without a
+    // downcast; casting `other` before its kind is known would be invalid
+    // whenever it is some other Partitioning subclass.
+    return type_name() == other.type_name() && Partitioning::Equals(other);
+  }
+
Result<compute::Expression> Parse(const std::string& path) const override {
std::vector<compute::Expression> ranges;