You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/05/12 07:34:08 UTC

[GitHub] [arrow] westonpace commented on a diff in pull request #13078: ARROW-15590: [C++] Add support for joins to the Substrait consumer

westonpace commented on code in PR #13078:
URL: https://github.com/apache/arrow/pull/13078#discussion_r871038360


##########
cpp/src/arrow/engine/substrait/serde_test.cc:
##########
@@ -750,5 +750,309 @@ TEST(Substrait, ExtensionSetFromPlanMissingFunc) {
           &ext_set));
 }
 
+TEST(Substrait, JoinPlanBasic) {

Review Comment:
   These plans are very verbose.  I can't think of any good trick to shorten it but I worry a little about how much raw substrait json we are going to have in all our unit tests and that becoming a maintenance burden.  Any ideas?



##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -188,6 +188,79 @@ Result<compute::Declaration> FromProto(const substrait::Rel& rel,
       });
     }
 
+    case substrait::Rel::RelTypeCase::kJoin: {
+      const auto& join = rel.join();
+      RETURN_NOT_OK(CheckRelCommon(join));
+
+      if (!join.has_left()) {
+        return Status::Invalid("substrait::JoinRel with no left relation");
+      }
+
+      if (!join.has_right()) {
+        return Status::Invalid("substrait::JoinRel with no right relation");
+      }
+
+      compute::JoinType join_type;
+      switch (join.type()) {
+        case 0:
+          return Status::NotImplemented("Unspecified join type is not supported");
+        case 1:
+          join_type = compute::JoinType::INNER;
+          break;
+        case 2:
+          return Status::NotImplemented("Outer join type is not supported");
+        case 3:
+          return Status::NotImplemented("Left join type is not supported");
+        case 4:
+          return Status::NotImplemented("Right join type is not supported");
+        case 5:
+          return Status::NotImplemented("Semi join type is not supported");
+        case 6:
+          return Status::NotImplemented("Anti join type is not supported");
+        default:
+          return Status::Invalid("Unsupported join type");
+      }
+
+      ARROW_ASSIGN_OR_RAISE(auto left, FromProto(join.left(), ext_set));
+      ARROW_ASSIGN_OR_RAISE(auto right, FromProto(join.right(), ext_set));
+
+      if (!join.has_expression()) {
+        return Status::Invalid("substrait::JoinRel with no expression");
+      }
+
+      ARROW_ASSIGN_OR_RAISE(auto expression, FromProto(join.expression(), ext_set));
+
+      const auto& callptr = expression.call();
+      if (!callptr) {
+        return Status::Invalid(
+            "Only support call expressions as the join key comparison.");
+      }
+
+      compute::JoinKeyCmp join_key_cmp;
+      if (callptr->function_name == "equal") {
+        join_key_cmp = compute::JoinKeyCmp::EQ;
+      } else if (callptr->function_name == "is_not_distinct_from") {
+        join_key_cmp = compute::JoinKeyCmp::IS;
+      } else {
+        return Status::Invalid(
+            "Only Support `equal` or `is_not_distinct_from` for join key comparison");
+      }
+
+      // TODO: Add Suffix support for Substrait
+      compute::HashJoinNodeOptions join_options{
+          join_type,
+          {std::move(*callptr->arguments[0].field_ref())},
+          {std::move(*callptr->arguments[1].field_ref())},
+          {join_key_cmp},
+          arrow::compute::literal(true),
+          "_l",
+          "_r"};

Review Comment:
   These defaults should ideally be in the constructor and not specified here.



##########
cpp/src/arrow/compute/exec/options.h:
##########
@@ -294,6 +294,20 @@ class ARROW_EXPORT HashJoinNodeOptions : public ExecNodeOptions {
       this->key_cmp[i] = JoinKeyCmp::EQ;
     }
   }
+  HashJoinNodeOptions(

Review Comment:
   Rather than keep adding constructors for `HashJoinNodeOptions` can we make a constructor that takes no arguments (or the minimum number of required arguments?  Maybe just `left_keys` and `right_keys` and assigns reasonable defaults).  Then we can just override the settings after construction.  For example:
   
   ```
   HashJoinNodeOptions opts{left_keys, right_keys};
   opts.join_type = join_type_from_substrait;
   ...
   ```



##########
cpp/src/arrow/engine/substrait/relation_internal.cc:
##########
@@ -188,6 +188,79 @@ Result<compute::Declaration> FromProto(const substrait::Rel& rel,
       });
     }
 
+    case substrait::Rel::RelTypeCase::kJoin: {
+      const auto& join = rel.join();
+      RETURN_NOT_OK(CheckRelCommon(join));
+
+      if (!join.has_left()) {
+        return Status::Invalid("substrait::JoinRel with no left relation");
+      }
+
+      if (!join.has_right()) {
+        return Status::Invalid("substrait::JoinRel with no right relation");
+      }
+
+      compute::JoinType join_type;
+      switch (join.type()) {
+        case 0:
+          return Status::NotImplemented("Unspecified join type is not supported");
+        case 1:
+          join_type = compute::JoinType::INNER;
+          break;
+        case 2:
+          return Status::NotImplemented("Outer join type is not supported");
+        case 3:
+          return Status::NotImplemented("Left join type is not supported");
+        case 4:
+          return Status::NotImplemented("Right join type is not supported");
+        case 5:
+          return Status::NotImplemented("Semi join type is not supported");
+        case 6:
+          return Status::NotImplemented("Anti join type is not supported");
+        default:
+          return Status::Invalid("Unsupported join type");
+      }
+
+      ARROW_ASSIGN_OR_RAISE(auto left, FromProto(join.left(), ext_set));
+      ARROW_ASSIGN_OR_RAISE(auto right, FromProto(join.right(), ext_set));
+
+      if (!join.has_expression()) {
+        return Status::Invalid("substrait::JoinRel with no expression");
+      }
+
+      ARROW_ASSIGN_OR_RAISE(auto expression, FromProto(join.expression(), ext_set));
+
+      const auto& callptr = expression.call();

Review Comment:
   Is this a pointer?  If so I think we can use `const auto*` (but I may be wrong on this)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org