You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/07/19 12:01:48 UTC

[arrow] branch master updated: ARROW-16911: [C++] Add Equals method to Partitioning (#13567)

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new d07dc75e27 ARROW-16911: [C++] Add Equals method to Partitioning (#13567)
d07dc75e27 is described below

commit d07dc75e27f016ca05c4ca22bc2d19c79fa2cd4a
Author: Vibhatha Lakmal Abeykoon <vi...@users.noreply.github.com>
AuthorDate: Tue Jul 19 17:31:43 2022 +0530

    ARROW-16911: [C++] Add Equals method to Partitioning (#13567)
    
    Adding `Equals` method to `Partitioning` class and extended classes. Also include a few test cases.
    
    Lead-authored-by: Vibhatha Abeykoon <vi...@gmail.com>
    Co-authored-by: Antoine Pitrou <pi...@free.fr>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/arrow/dataset/partition.cc      | 50 ++++++++++++++++++++++
 cpp/src/arrow/dataset/partition.h       | 18 +++++++-
 cpp/src/arrow/dataset/partition_test.cc | 73 +++++++++++++++++++++++++++++++++
 3 files changed, 140 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/dataset/partition.cc b/cpp/src/arrow/dataset/partition.cc
index ca65288721..a210c947a3 100644
--- a/cpp/src/arrow/dataset/partition.cc
+++ b/cpp/src/arrow/dataset/partition.cc
@@ -82,6 +82,10 @@ std::shared_ptr<Partitioning> Partitioning::Default() {
 
     std::string type_name() const override { return "default"; }
 
+    bool Equals(const Partitioning& other) const override {
+      return type_name() == other.type_name();  // only the type name is compared
+    }
+
     Result<compute::Expression> Parse(const std::string& path) const override {
       return compute::literal(true);
     }
@@ -115,6 +119,28 @@ static Result<RecordBatchVector> ApplyGroupings(
   return out;
 }
 
+// Equal when dictionaries, segment encoding and schema all match. `other` is
+// downcast with checked_cast, so callers must pass a KeyValuePartitioning.
+bool KeyValuePartitioning::Equals(const Partitioning& other) const {
+  if (this == &other) return true;
+  const auto& rhs = checked_cast<const KeyValuePartitioning&>(other);
+  const auto& rhs_dictionaries = rhs.dictionaries();
+  if (dictionaries_.size() != rhs_dictionaries.size()) return false;
+  for (size_t i = 0; i < dictionaries_.size(); ++i) {
+    const auto& lhs_dict = dictionaries_[i];
+    const auto& rhs_dict = rhs_dictionaries[i];
+    // Slots match when both are null or both hold equal arrays.
+    if (lhs_dict == nullptr || rhs_dict == nullptr) {
+      if (lhs_dict != rhs_dict) return false;
+    } else if (!lhs_dict->Equals(rhs_dict)) {
+      return false;
+    }
+  }
+  // Dictionaries agree: compare the segment encoding, then the schema.
+  return options_.segment_encoding == rhs.options_.segment_encoding &&
+         Partitioning::Equals(other);
+}
+
 Result<Partitioning::PartitionedBatches> KeyValuePartitioning::Partition(
     const std::shared_ptr<RecordBatch>& batch) const {
   std::vector<int> key_indices;
@@ -381,6 +407,10 @@ Result<std::vector<KeyValuePartitioning::Key>> DirectoryPartitioning::ParseKeys(
   return ParsePartitionSegments(segments);
 }
 
+bool DirectoryPartitioning::Equals(const Partitioning& other) const {  // kind first
+  return type_name() == other.type_name() && KeyValuePartitioning::Equals(other);
+}
+
 FilenamePartitioning::FilenamePartitioning(std::shared_ptr<Schema> schema,
                                            ArrayVector dictionaries,
                                            KeyValuePartitioningOptions options)
@@ -678,6 +708,13 @@ std::shared_ptr<PartitioningFactory> FilenamePartitioning::MakeFactory(
       new FilenamePartitioningFactory(std::move(field_names), options));
 }
 
+// Filename partitionings are equal when `other` reports the same type name and
+// the key-value state compared by the base class matches as well.
+bool FilenamePartitioning::Equals(const Partitioning& other) const {
+  const bool same_kind = other.type_name() == type_name();
+  return same_kind && KeyValuePartitioning::Equals(other);
+}
+
 Result<util::optional<KeyValuePartitioning::Key>> HivePartitioning::ParseKey(
     const std::string& segment, const HivePartitioningOptions& options) {
   auto name_end = string_view(segment).find_first_of('=');
@@ -754,6 +791,19 @@ Result<PartitionPathFormat> HivePartitioning::FormatValues(
   return PartitionPathFormat{fs::internal::JoinAbstractPath(std::move(segments)), ""};
 }
 
+// Hive partitionings are equal when they share the type name, the null fallback
+// and the key-value state compared by the base class.
+bool HivePartitioning::Equals(const Partitioning& other) const {
+  if (this == &other) return true;
+  if (other.type_name() != type_name()) return false;
+  const auto& rhs = ::arrow::internal::checked_cast<const HivePartitioning&>(other);
+  // null_fallback() and options().null_fallback are both compared.
+  const bool same_null_fallback =
+      null_fallback() == rhs.null_fallback() &&
+      options().null_fallback == rhs.options().null_fallback;
+  return same_null_fallback && KeyValuePartitioning::Equals(other);
+}
+
 class HivePartitioningFactory : public KeyValuePartitioningFactory {
  public:
   explicit HivePartitioningFactory(HivePartitioningFactoryOptions options)
diff --git a/cpp/src/arrow/dataset/partition.h b/cpp/src/arrow/dataset/partition.h
index bffa2f979f..2d8c8bb274 100644
--- a/cpp/src/arrow/dataset/partition.h
+++ b/cpp/src/arrow/dataset/partition.h
@@ -30,6 +30,7 @@
 #include "arrow/compute/exec/expression.h"
 #include "arrow/dataset/type_fwd.h"
 #include "arrow/dataset/visibility.h"
+#include "arrow/util/compare.h"
 #include "arrow/util/optional.h"
 
 namespace arrow {
@@ -63,13 +64,18 @@ struct ARROW_DS_EXPORT PartitionPathFormat {
 /// Paths are consumed from left to right. Paths must be relative to
 /// the root of a partition; path prefixes must be removed before passing
 /// the path to a partitioning for parsing.
-class ARROW_DS_EXPORT Partitioning {
+class ARROW_DS_EXPORT Partitioning : public util::EqualityComparable<Partitioning> {
  public:
   virtual ~Partitioning() = default;
 
   /// \brief The name identifying the kind of partitioning
   virtual std::string type_name() const = 0;
 
+  /// \brief Return whether the partitionings are equal
+  virtual bool Equals(const Partitioning& other) const {
+    return schema_->Equals(other.schema_, /*check_metadata=*/false);  // schema only
+  }
+
   /// \brief If the input batch shares any fields with this partitioning,
   /// produce sub-batches which satisfy mutually exclusive Expressions.
   struct PartitionedBatches {
@@ -180,6 +186,8 @@ class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning {
 
   const ArrayVector& dictionaries() const { return dictionaries_; }
 
+  bool Equals(const Partitioning& other) const override;
+
  protected:
   KeyValuePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries,
                        KeyValuePartitioningOptions options)
@@ -223,6 +231,8 @@ class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning {
 
   std::string type_name() const override { return "directory"; }
 
+  bool Equals(const Partitioning& other) const override;
+
   /// \brief Create a factory for a directory partitioning.
   ///
   /// \param[in] field_names The names for the partition fields. Types will be
@@ -282,6 +292,8 @@ class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning {
   static Result<util::optional<Key>> ParseKey(const std::string& segment,
                                               const HivePartitioningOptions& options);
 
+  bool Equals(const Partitioning& other) const override;
+
   /// \brief Create a factory for a hive partitioning.
   static std::shared_ptr<PartitioningFactory> MakeFactory(
       HivePartitioningFactoryOptions = {});
@@ -310,6 +322,8 @@ class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning {
 
   std::string type_name() const override { return name_; }
 
+  bool Equals(const Partitioning& other) const override { return false; }  // never equal
+
   Result<compute::Expression> Parse(const std::string& path) const override {
     return parse_impl_(path);
   }
@@ -352,6 +366,8 @@ class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning {
   static std::shared_ptr<PartitioningFactory> MakeFactory(
       std::vector<std::string> field_names, PartitioningFactoryOptions = {});
 
+  bool Equals(const Partitioning& other) const override;
+
  private:
   Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
 
diff --git a/cpp/src/arrow/dataset/partition_test.cc b/cpp/src/arrow/dataset/partition_test.cc
index 86b8c4f0b9..66a22a2db3 100644
--- a/cpp/src/arrow/dataset/partition_test.cc
+++ b/cpp/src/arrow/dataset/partition_test.cc
@@ -206,6 +206,21 @@ TEST_F(TestPartitioning, DirectoryPartitioning) {
                                           equal(field_ref("beta"), literal("foo"))));
 }
 
+TEST_F(TestPartitioning, DirectoryPartitioningEquals) {
+  auto part = std::make_shared<DirectoryPartitioning>(
+      schema({field("alpha", int32()), field("beta", utf8())}));
+  auto other = std::make_shared<DirectoryPartitioning>(
+      schema({field("alpha", int32()), field("gamma", utf8())}));
+  auto another = std::make_shared<DirectoryPartitioning>(
+      schema({field("alpha", int32()), field("beta", utf8())}));
+  auto some_other = std::make_shared<DirectoryPartitioning>(
+      schema({field("alpha", int32()), field("beta", utf8())}));
+  EXPECT_TRUE(part->Equals(*part));           // same object
+  EXPECT_FALSE(part->Equals(*other));         // field name "gamma" vs "beta"
+  EXPECT_TRUE(part->Equals(*another));        // identical schema, distinct object
+  EXPECT_TRUE(another->Equals(*some_other));  // identical schema, distinct object
+}
+
 TEST_F(TestPartitioning, FilenamePartitioning) {
   partitioning_ = std::make_shared<FilenamePartitioning>(
       schema({field("alpha", int32()), field("beta", utf8())}));
@@ -222,6 +237,21 @@ TEST_F(TestPartitioning, FilenamePartitioning) {
                                          equal(field_ref("beta"), literal("foo"))));
 }
 
+TEST_F(TestPartitioning, FilenamePartitioningEquals) {
+  auto part = std::make_shared<FilenamePartitioning>(
+      schema({field("alpha", int32()), field("beta", utf8())}));
+  auto other_part = std::make_shared<FilenamePartitioning>(
+      schema({field("sigma", int32()), field("beta", utf8())}));
+  auto another_part = std::make_shared<FilenamePartitioning>(
+      schema({field("sigma", int64()), field("beta", utf8())}));
+  auto some_other_part = std::make_shared<FilenamePartitioning>(
+      schema({field("sigma", int64()), field("beta", utf8())}));
+  EXPECT_TRUE(part->Equals(*part));                      // same object
+  EXPECT_FALSE(part->Equals(*other_part));               // field name "sigma" vs "alpha"
+  EXPECT_FALSE(other_part->Equals(*another_part));       // "sigma" is int32 vs int64
+  EXPECT_TRUE(another_part->Equals(*some_other_part));   // identical schema
+}
+
 TEST_F(TestPartitioning, DirectoryPartitioningFormat) {
   partitioning_ = std::make_shared<DirectoryPartitioning>(
       schema({field("alpha", int32()), field("beta", utf8())}));
@@ -426,6 +456,41 @@ TEST_F(TestPartitioning, HivePartitioning) {
   AssertParseError("/alpha=0.0/beta=3.25/");  // conversion of "0.0" to int32 fails
 }
 
+TEST_F(TestPartitioning, HivePartitioningEquals) {
+  const auto& array_vector = ArrayVector();
+  ArrayVector other_vector(2);
+  other_vector[0] = ArrayFromJSON(utf8(), R"(["foo", "bar", "baz"])");
+  other_vector[1] = ArrayFromJSON(utf8(), R"(["bar", "foo", "baz"])");
+  auto part = std::make_shared<HivePartitioning>(
+      schema({field("alpha", int32()), field("beta", float32())}), array_vector, "xyz");
+  auto other_part = std::make_shared<HivePartitioning>(
+      schema({field("sigma", int32()), field("beta", float32())}), array_vector, "xyz");
+  auto another_part = std::make_shared<HivePartitioning>(
+      schema({field("alpha", int32()), field("beta", float32())}), other_vector, "xyz");
+  auto some_part = std::make_shared<HivePartitioning>(
+      schema({field("alpha", int32()), field("beta", float32())}), array_vector, "abc");
+  auto match_part = std::make_shared<HivePartitioning>(
+      schema({field("alpha", int32()), field("beta", float32())}), array_vector, "xyz");
+  EXPECT_TRUE(part->Equals(*part));           // same object
+  EXPECT_FALSE(part->Equals(*other_part));    // field name "sigma" vs "alpha"
+  EXPECT_FALSE(part->Equals(*another_part));  // two dictionaries vs none
+  EXPECT_FALSE(part->Equals(*some_part));     // third ctor argument "abc" vs "xyz"
+  EXPECT_TRUE(part->Equals(*match_part));     // same arguments, distinct object
+}
+
+TEST_F(TestPartitioning, CrossCheckPartitioningEquals) {
+  auto file_part = std::make_shared<FilenamePartitioning>(
+      schema({field("alpha", int32()), field("beta", utf8())}));
+  auto dir_part = std::make_shared<DirectoryPartitioning>(
+      schema({field("alpha", int32()), field("beta", utf8())}));
+  auto hive_part = std::make_shared<HivePartitioning>(
+      schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz");
+  EXPECT_FALSE(file_part->Equals(*dir_part));  // same schema, different class
+  EXPECT_FALSE(dir_part->Equals(*file_part));  // and the reverse direction
+  EXPECT_FALSE(dir_part->Equals(*hive_part));  // class and "beta" type both differ
+  EXPECT_FALSE(hive_part->Equals(*dir_part));  // and the reverse direction
+}
+
 TEST_F(TestPartitioning, HivePartitioningFormat) {
   partitioning_ = std::make_shared<HivePartitioning>(
       schema({field("alpha", int32()), field("beta", float32())}), ArrayVector(), "xyz");
@@ -891,6 +956,14 @@ class RangePartitioning : public Partitioning {
 
   std::string type_name() const override { return "range"; }
 
+  /// Equal when `other` is this object, or has the same type name and schema.
+  bool Equals(const Partitioning& other) const override {
+    if (this == &other) return true;
+    // type_name() is virtual: compare it through the base reference rather than
+    // downcasting `other` before its dynamic type has been checked.
+    return other.type_name() == type_name() && Partitioning::Equals(other);
+  }
+
   Result<compute::Expression> Parse(const std::string& path) const override {
     std::vector<compute::Expression> ranges;