You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/12/23 14:00:23 UTC

[GitHub] [arrow] lidavidm commented on a change in pull request #12031: ARROW-15138: [C++] Make ExecPlan::ToString give some additional information

lidavidm commented on a change in pull request #12031:
URL: https://github.com/apache/arrow/pull/12031#discussion_r774585813



##########
File path: cpp/src/arrow/compute/exec/exec_plan.cc
##########
@@ -157,11 +157,46 @@ struct ExecPlanImpl : public ExecPlan {
     return std::move(Impl{nodes_}.sorted);
   }
 
+  std::pair<NodeVector, std::vector<int>> OrderedNodes() const {
+    struct Impl {
+      const std::vector<std::unique_ptr<ExecNode>>& nodes;
+      std::unordered_set<ExecNode*> visited;
+      NodeVector sorted;
+      std::vector<int> indents;
+
+      explicit Impl(const std::vector<std::unique_ptr<ExecNode>>& nodes) : nodes(nodes) {
+        visited.reserve(nodes.size());
+
+        for (auto it = nodes.rbegin(); it != nodes.rend(); ++it) {
+          if (visited.count(it->get()) != 0) return;

Review comment:
       Hmm, should this be `continue;` not `return;`?

##########
File path: cpp/src/arrow/compute/exec/exec_plan.cc
##########
@@ -157,11 +157,46 @@ struct ExecPlanImpl : public ExecPlan {
     return std::move(Impl{nodes_}.sorted);
   }
 
+  std::pair<NodeVector, std::vector<int>> OrderedNodes() const {
+    struct Impl {
+      const std::vector<std::unique_ptr<ExecNode>>& nodes;
+      std::unordered_set<ExecNode*> visited;
+      NodeVector sorted;
+      std::vector<int> indents;
+
+      explicit Impl(const std::vector<std::unique_ptr<ExecNode>>& nodes) : nodes(nodes) {
+        visited.reserve(nodes.size());
+
+        for (auto it = nodes.rbegin(); it != nodes.rend(); ++it) {
+          if (visited.count(it->get()) != 0) return;
+          Visit(it->get());
+        }
+
+        DCHECK_EQ(visited.size(), nodes.size());
+      }
+
+      void Visit(ExecNode* node, int indent = 0) {
+        for (auto input : node->inputs()) {
+          Visit(input, indent + 1);
+        }
+
+        indents.push_back(indent);
+        sorted.push_back(node);
+        visited.insert(node);
+      }
+    };
+
+    auto result = Impl{nodes_};

Review comment:
       Should this be `TopoSort()`? It seems `nodes_` reflects the order that nodes were added to the plan, which probably, but not necessarily, corresponds with direction of data flow.

##########
File path: cpp/src/arrow/compute/exec/exec_plan.cc
##########
@@ -157,11 +157,46 @@ struct ExecPlanImpl : public ExecPlan {
     return std::move(Impl{nodes_}.sorted);
   }
 
+  std::pair<NodeVector, std::vector<int>> OrderedNodes() const {
+    struct Impl {
+      const std::vector<std::unique_ptr<ExecNode>>& nodes;
+      std::unordered_set<ExecNode*> visited;
+      NodeVector sorted;
+      std::vector<int> indents;
+
+      explicit Impl(const std::vector<std::unique_ptr<ExecNode>>& nodes) : nodes(nodes) {
+        visited.reserve(nodes.size());
+
+        for (auto it = nodes.rbegin(); it != nodes.rend(); ++it) {
+          if (visited.count(it->get()) != 0) return;

Review comment:
       I suppose it'd only matter if we have something like two sink nodes (which may be an odd case).

##########
File path: cpp/src/arrow/compute/exec/plan_test.cc
##########
@@ -374,13 +374,13 @@ custom_sink_label:OrderBySinkNode{inputs=[collected=:FilterNode], by={sort_keys=
           })
           .AddToPlan(plan.get()));
   EXPECT_EQ(plan->ToString(), R"a(ExecPlan with 5 nodes:
-lhs:SourceNode{outputs=[:UnionNode]}
-rhs:SourceNode{outputs=[:UnionNode]}
-:UnionNode{inputs=[input_0_label=lhs:SourceNode, input_1_label=rhs:SourceNode], outputs=[:ScalarAggregateNode]}
-:ScalarAggregateNode{inputs=[target=:UnionNode], outputs=[:SinkNode], aggregates=[
+:SinkNode{inputs=[collected=:ScalarAggregateNode]}
+  :ScalarAggregateNode{inputs=[target=:UnionNode], outputs=[:SinkNode], aggregates=[
 	count(i32, {mode=NON_NULL}),
 ]}
-:SinkNode{inputs=[collected=:ScalarAggregateNode]}
+    :UnionNode{inputs=[input_0_label=lhs:SourceNode, input_1_label=rhs:SourceNode], outputs=[:ScalarAggregateNode]}

Review comment:
       Do we still want to print `inputs` and `outputs` for every node?

##########
File path: cpp/src/arrow/compute/exec/exec_plan.cc
##########
@@ -157,11 +157,46 @@ struct ExecPlanImpl : public ExecPlan {
     return std::move(Impl{nodes_}.sorted);
   }
 
+  std::pair<NodeVector, std::vector<int>> OrderedNodes() const {
+    struct Impl {
+      const std::vector<std::unique_ptr<ExecNode>>& nodes;
+      std::unordered_set<ExecNode*> visited;
+      NodeVector sorted;
+      std::vector<int> indents;
+
+      explicit Impl(const std::vector<std::unique_ptr<ExecNode>>& nodes) : nodes(nodes) {
+        visited.reserve(nodes.size());
+
+        for (auto it = nodes.rbegin(); it != nodes.rend(); ++it) {
+          if (visited.count(it->get()) != 0) return;
+          Visit(it->get());
+        }
+
+        DCHECK_EQ(visited.size(), nodes.size());
+      }
+
+      void Visit(ExecNode* node, int indent = 0) {
+        for (auto input : node->inputs()) {
+          Visit(input, indent + 1);

Review comment:
       Should we also check that `input` is not in `visited` just to ensure we don't get trapped if there is somehow a cycle?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org