You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by GitBox <gi...@apache.org> on 2022/04/11 09:39:42 UTC

[GitHub] [tvm] NicolaLancellotti opened a new pull request, #10959: [microNPU] Add a pass to reorder copy and compute nodes

NicolaLancellotti opened a new pull request, #10959:
URL: https://github.com/apache/tvm/pull/10959

   This pr adds a pass to reorder Arm(R) Ethos(TM)-U copy and compute nodes in such a way that independent DMA copies, and computes happen in parallel.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] manupa-arm commented on a diff in pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
manupa-arm commented on code in PR #10959:
URL: https://github.com/apache/tvm/pull/10959#discussion_r861971113


##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {
+        auto i{index - offset};
+        if (i > 0 && !stmt_is_copy(new_seq[i - 1]) && stmt_is_global_copy(new_seq[i])) {
+          std::swap(new_seq[i], new_seq[i - 1]);
+        } else {
+          break;
+        }
+      }
+    }
+
+    auto n{CopyOnWrite(op)};
+    n->seq = std::move(new_seq);
+    return Stmt{n};
+  }
+
+  tvm::runtime::Array<tvm::PrimExpr> get_stmt_args(Stmt stmt) {
+    auto eval_node{stmt.as<EvaluateNode>()};
+    ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
+                      << stmt->GetTypeKey();
+    auto call_node{eval_node->value.as<CallNode>()};
+    ICHECK(call_node) << "Expected expression to be a call node, but was "
+                      << eval_node->value->GetTypeKey();
+    return call_node->args;
+  }
+
+  bool stmt_is_copy(Stmt stmt) {
+    auto args{get_stmt_args(stmt)};
+    return args[0].as<StringImmNode>()->value == "ethosu_copy";
+  }
+
+  bool stmt_is_global_copy(Stmt stmt) {
+    auto args{get_stmt_args(stmt)};
+    return args[0].as<StringImmNode>()->value == "ethosu_copy" &&
+           args[3].as<BufferLoadNode>()->buffer.scope() == "global";
+  }
+
+  int _max_copy_movements;
+};
+
+/*!
+ * \brief A pass to reorder copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ *
+ * \param max_copy_movements: The maximum number of movements allowed for a copy.
+ * \return tvm::transform::Pass
+ */
+tvm::transform::Pass CopyComputeReordering(int max_copy_movements) {
+  auto pass_func = [=](PrimFunc f, IRModule mod, tvm::transform::PassContext ctx) {
+    ICHECK(mod->GetGlobalVars().size() == 1 && mod->ContainGlobalVar("main"))

Review Comment:
   Let's add a PassContext option so it can be passed from above.
   
   I think int max_copy_movement (if provided) should take priority --> PassContext value --> then the default.
   E.g. : 
   https://github.com/apache/tvm/blob/6b45f8dc4ad0cfecf07dbd031b1e55fe4c9b02c5/src/tir/usmp/unified_static_memory_planner.cc#L96



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {
+        auto i{index - offset};

Review Comment:
   nit: let's use the explicit type here.



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {
+        auto i{index - offset};
+        if (i > 0 && !stmt_is_copy(new_seq[i - 1]) && stmt_is_global_copy(new_seq[i])) {
+          std::swap(new_seq[i], new_seq[i - 1]);
+        } else {
+          break;
+        }
+      }
+    }
+
+    auto n{CopyOnWrite(op)};

Review Comment:
   nit: let's use an explicit type and a better variable name.



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {
+        auto i{index - offset};
+        if (i > 0 && !stmt_is_copy(new_seq[i - 1]) && stmt_is_global_copy(new_seq[i])) {
+          std::swap(new_seq[i], new_seq[i - 1]);
+        } else {
+          break;
+        }
+      }
+    }
+
+    auto n{CopyOnWrite(op)};
+    n->seq = std::move(new_seq);
+    return Stmt{n};
+  }
+
+  tvm::runtime::Array<tvm::PrimExpr> get_stmt_args(Stmt stmt) {
+    auto eval_node{stmt.as<EvaluateNode>()};
+    ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
+                      << stmt->GetTypeKey();
+    auto call_node{eval_node->value.as<CallNode>()};
+    ICHECK(call_node) << "Expected expression to be a call node, but was "
+                      << eval_node->value->GetTypeKey();
+    return call_node->args;
+  }
+
+  bool stmt_is_copy(Stmt stmt) {

Review Comment:
   let's use `const Stmt&` to pass const references here.



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {
+        auto i{index - offset};
+        if (i > 0 && !stmt_is_copy(new_seq[i - 1]) && stmt_is_global_copy(new_seq[i])) {
+          std::swap(new_seq[i], new_seq[i - 1]);
+        } else {
+          break;
+        }
+      }
+    }
+
+    auto n{CopyOnWrite(op)};
+    n->seq = std::move(new_seq);
+    return Stmt{n};
+  }
+
+  tvm::runtime::Array<tvm::PrimExpr> get_stmt_args(Stmt stmt) {
+    auto eval_node{stmt.as<EvaluateNode>()};
+    ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
+                      << stmt->GetTypeKey();
+    auto call_node{eval_node->value.as<CallNode>()};
+    ICHECK(call_node) << "Expected expression to be a call node, but was "
+                      << eval_node->value->GetTypeKey();
+    return call_node->args;
+  }
+
+  bool stmt_is_copy(Stmt stmt) {
+    auto args{get_stmt_args(stmt)};
+    return args[0].as<StringImmNode>()->value == "ethosu_copy";
+  }
+
+  bool stmt_is_global_copy(Stmt stmt) {
+    auto args{get_stmt_args(stmt)};
+    return args[0].as<StringImmNode>()->value == "ethosu_copy" &&
+           args[3].as<BufferLoadNode>()->buffer.scope() == "global";
+  }
+
+  int _max_copy_movements;

Review Comment:
   Please add docs for these variables.



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {
+        auto i{index - offset};
+        if (i > 0 && !stmt_is_copy(new_seq[i - 1]) && stmt_is_global_copy(new_seq[i])) {
+          std::swap(new_seq[i], new_seq[i - 1]);
+        } else {
+          break;
+        }
+      }
+    }
+
+    auto n{CopyOnWrite(op)};
+    n->seq = std::move(new_seq);
+    return Stmt{n};
+  }
+
+  tvm::runtime::Array<tvm::PrimExpr> get_stmt_args(Stmt stmt) {

Review Comment:
   let's use `const Stmt&` to pass const references here.



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {

Review Comment:
   I think it's clearer; let's stick to explicit initialization.



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {
+        auto i{index - offset};
+        if (i > 0 && !stmt_is_copy(new_seq[i - 1]) && stmt_is_global_copy(new_seq[i])) {
+          std::swap(new_seq[i], new_seq[i - 1]);
+        } else {
+          break;
+        }
+      }
+    }
+
+    auto n{CopyOnWrite(op)};
+    n->seq = std::move(new_seq);
+    return Stmt{n};
+  }
+
+  tvm::runtime::Array<tvm::PrimExpr> get_stmt_args(Stmt stmt) {
+    auto eval_node{stmt.as<EvaluateNode>()};
+    ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
+                      << stmt->GetTypeKey();
+    auto call_node{eval_node->value.as<CallNode>()};
+    ICHECK(call_node) << "Expected expression to be a call node, but was "
+                      << eval_node->value->GetTypeKey();
+    return call_node->args;
+  }
+
+  bool stmt_is_copy(Stmt stmt) {
+    auto args{get_stmt_args(stmt)};
+    return args[0].as<StringImmNode>()->value == "ethosu_copy";
+  }
+
+  bool stmt_is_global_copy(Stmt stmt) {

Review Comment:
   let's use `const Stmt&` to pass const references here.



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {
+        auto i{index - offset};
+        if (i > 0 && !stmt_is_copy(new_seq[i - 1]) && stmt_is_global_copy(new_seq[i])) {

Review Comment:
   docs: please add a comment explaining what is being done in the loop.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] NicolaLancellotti commented on a diff in pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
NicolaLancellotti commented on code in PR #10959:
URL: https://github.com/apache/tvm/pull/10959#discussion_r863943982


##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {
+        auto i{index - offset};
+        if (i > 0 && !stmt_is_copy(new_seq[i - 1]) && stmt_is_global_copy(new_seq[i])) {
+          std::swap(new_seq[i], new_seq[i - 1]);
+        } else {
+          break;
+        }
+      }
+    }
+
+    auto n{CopyOnWrite(op)};
+    n->seq = std::move(new_seq);
+    return Stmt{n};
+  }
+
+  tvm::runtime::Array<tvm::PrimExpr> get_stmt_args(Stmt stmt) {
+    auto eval_node{stmt.as<EvaluateNode>()};
+    ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
+                      << stmt->GetTypeKey();
+    auto call_node{eval_node->value.as<CallNode>()};
+    ICHECK(call_node) << "Expected expression to be a call node, but was "
+                      << eval_node->value->GetTypeKey();
+    return call_node->args;
+  }
+
+  bool stmt_is_copy(Stmt stmt) {
+    auto args{get_stmt_args(stmt)};
+    return args[0].as<StringImmNode>()->value == "ethosu_copy";
+  }
+
+  bool stmt_is_global_copy(Stmt stmt) {
+    auto args{get_stmt_args(stmt)};
+    return args[0].as<StringImmNode>()->value == "ethosu_copy" &&
+           args[3].as<BufferLoadNode>()->buffer.scope() == "global";
+  }
+
+  int _max_copy_movements;

Review Comment:
   Done.



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {
+        auto i{index - offset};
+        if (i > 0 && !stmt_is_copy(new_seq[i - 1]) && stmt_is_global_copy(new_seq[i])) {
+          std::swap(new_seq[i], new_seq[i - 1]);
+        } else {
+          break;
+        }
+      }
+    }
+
+    auto n{CopyOnWrite(op)};
+    n->seq = std::move(new_seq);
+    return Stmt{n};
+  }
+
+  tvm::runtime::Array<tvm::PrimExpr> get_stmt_args(Stmt stmt) {
+    auto eval_node{stmt.as<EvaluateNode>()};
+    ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
+                      << stmt->GetTypeKey();
+    auto call_node{eval_node->value.as<CallNode>()};
+    ICHECK(call_node) << "Expected expression to be a call node, but was "
+                      << eval_node->value->GetTypeKey();
+    return call_node->args;
+  }
+
+  bool stmt_is_copy(Stmt stmt) {
+    auto args{get_stmt_args(stmt)};
+    return args[0].as<StringImmNode>()->value == "ethosu_copy";
+  }
+
+  bool stmt_is_global_copy(Stmt stmt) {
+    auto args{get_stmt_args(stmt)};
+    return args[0].as<StringImmNode>()->value == "ethosu_copy" &&
+           args[3].as<BufferLoadNode>()->buffer.scope() == "global";
+  }
+
+  int _max_copy_movements;
+};
+
+/*!
+ * \brief A pass to reorder copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ *
+ * \param max_copy_movements: The maximum number of movements allowed for a copy.
+ * \return tvm::transform::Pass
+ */
+tvm::transform::Pass CopyComputeReordering(int max_copy_movements) {
+  auto pass_func = [=](PrimFunc f, IRModule mod, tvm::transform::PassContext ctx) {
+    ICHECK(mod->GetGlobalVars().size() == 1 && mod->ContainGlobalVar("main"))

Review Comment:
   Done.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] NicolaLancellotti commented on a diff in pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
NicolaLancellotti commented on code in PR #10959:
URL: https://github.com/apache/tvm/pull/10959#discussion_r848550508


##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,71 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator() {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    auto n{main_func.CopyOnWrite()};
+    n->body = this->VisitStmt(main_func->body);
+    return GetRef<PrimFunc>(n);
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+    bool previous_stmt_is_copy{true};  // Do not move the first stmt if it is a copy
+
+    for (size_t i{}; i < seq_stmt->size(); ++i) {
+      auto stmt{seq_stmt[i]};
+      auto args{stmt.as<EvaluateNode>()->value.as<CallNode>()->args};
+      bool stmt_is_copy{args[0].as<StringImmNode>()->value == "ethosu_copy"};
+      bool stmt_is_global_copy{stmt_is_copy &&
+                               args[3].as<BufferLoadNode>()->buffer.scope() == "global"};

Review Comment:
   I have added it in the pass documentation.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] NicolaLancellotti commented on a diff in pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
NicolaLancellotti commented on code in PR #10959:
URL: https://github.com/apache/tvm/pull/10959#discussion_r863946022


##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {
+        auto i{index - offset};
+        if (i > 0 && !stmt_is_copy(new_seq[i - 1]) && stmt_is_global_copy(new_seq[i])) {
+          std::swap(new_seq[i], new_seq[i - 1]);
+        } else {
+          break;
+        }
+      }
+    }
+
+    auto n{CopyOnWrite(op)};
+    n->seq = std::move(new_seq);
+    return Stmt{n};
+  }
+
+  tvm::runtime::Array<tvm::PrimExpr> get_stmt_args(Stmt stmt) {
+    auto eval_node{stmt.as<EvaluateNode>()};
+    ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
+                      << stmt->GetTypeKey();
+    auto call_node{eval_node->value.as<CallNode>()};
+    ICHECK(call_node) << "Expected expression to be a call node, but was "
+                      << eval_node->value->GetTypeKey();
+    return call_node->args;
+  }
+
+  bool stmt_is_copy(Stmt stmt) {
+    auto args{get_stmt_args(stmt)};
+    return args[0].as<StringImmNode>()->value == "ethosu_copy";
+  }
+
+  bool stmt_is_global_copy(Stmt stmt) {

Review Comment:
   Done.



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {
+        auto i{index - offset};
+        if (i > 0 && !stmt_is_copy(new_seq[i - 1]) && stmt_is_global_copy(new_seq[i])) {

Review Comment:
   Done.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] NicolaLancellotti commented on a diff in pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
NicolaLancellotti commented on code in PR #10959:
URL: https://github.com/apache/tvm/pull/10959#discussion_r863948957


##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {
+        auto i{index - offset};

Review Comment:
   I refactored the pass and I added the type for this variable.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] lhutton1 commented on a diff in pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
lhutton1 commented on code in PR #10959:
URL: https://github.com/apache/tvm/pull/10959#discussion_r848191393


##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,71 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,

Review Comment:
   Might be worth mentioning the reordering only happens for global copy operations



##########
tests/python/contrib/test_ethosu/test_copy_compute_reordering.py:
##########
@@ -0,0 +1,179 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pytest
+
+pytest.importorskip("ethosu.vela")
+
+import tvm
+from tvm.script import tir as T
+from tvm.relay.backend.contrib.ethosu.tir.passes import CopyComputeReordering
+
+
+def test_four_convolutions():
+    # fmt: off
+    @tvm.script.ir_module
+    class InputModule:
+        @T.prim_func
+        def main() -> None:
+            # function attr dict
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            buffer1 = T.buffer_decl([8192], "int8")
+            buffer2 = T.buffer_decl([128], "uint8")
+            buffer3 = T.buffer_decl([32], "uint8")
+            buffer4 = T.buffer_decl([112], "uint8")
+            buffer5 = T.buffer_decl([32], "uint8")
+            buffer6 = T.buffer_decl([112], "uint8")
+            buffer7 = T.buffer_decl([32], "uint8")
+            buffer8 = T.buffer_decl([112], "uint8")
+            buffer9 = T.buffer_decl([32], "uint8")
+            buffer10 = T.buffer_decl([2048], "int8")
+            # body
+            p1 = T.allocate([128], "uint8", "global")
+            p2 = T.allocate([112], "uint8", "global")
+            p3 = T.allocate([112], "uint8", "global")
+            p4 = T.allocate([32], "uint8", "global")
+            p5 = T.allocate([32], "uint8", "global")
+            p6 = T.allocate([32], "uint8", "global")
+            p7 = T.allocate([112], "uint8", "global")
+            p8 = T.allocate([32], "uint8", "global")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    
+    @tvm.script.ir_module
+    class ReferenceModule:
+        @T.prim_func
+        def main() -> None:
+            # function attr dict
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            buffer1 = T.buffer_decl([8192], "int8")
+            buffer2 = T.buffer_decl([128], "uint8")
+            buffer3 = T.buffer_decl([32], "uint8")
+            buffer4 = T.buffer_decl([112], "uint8")
+            buffer5 = T.buffer_decl([32], "uint8")
+            buffer6 = T.buffer_decl([112], "uint8")
+            buffer7 = T.buffer_decl([32], "uint8")
+            buffer8 = T.buffer_decl([112], "uint8")
+            buffer9 = T.buffer_decl([32], "uint8")
+            buffer10 = T.buffer_decl([2048], "int8")
+            # body
+            p1 = T.allocate([128], "uint8", "global")
+            p2 = T.allocate([112], "uint8", "global")
+            p3 = T.allocate([112], "uint8", "global")
+            p4 = T.allocate([32], "uint8", "global")
+            p5 = T.allocate([32], "uint8", "global")
+            p6 = T.allocate([32], "uint8", "global")
+            p7 = T.allocate([112], "uint8", "global")
+            p8 = T.allocate([32], "uint8", "global")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    # fmt: on
+    
+    test_mod = CopyComputeReordering()(InputModule)
+    reference_mod = ReferenceModule
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+
+
+def test_copy_to_buffer_with_local_scope():
+    # fmt: off
+    @tvm.script.ir_module
+    class InputModule:
+        @T.prim_func
+        def main() -> None:
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})  
+            buffer1 = T.buffer_decl([64], "uint8")
+            buffer2 = T.buffer_decl([48], "uint8")
+            buffer3 = T.buffer_decl([48], "uint8")
+            buffer4 = T.buffer_decl([256], "uint8")
+            buffer5 = T.buffer_decl([16], "uint8")
+            buffer6 = T.buffer_decl([48], "uint8")
+            buffer7 = T.buffer_decl([256], "uint8")
+            buffer8 = T.buffer_decl([64], "uint8")
+            # body
+            p1 = T.allocate([48], "uint8", "global")
+            p2 = T.allocate([48], "uint8", "global")
+            p3 = T.allocate([256], "int8", "local")
+            p4 = T.allocate([256], "int8", "global")
+            p5 = T.allocate([16], "uint8", "global")
+            p6 = T.allocate([48], "uint8", "global")
+            p7 = T.allocate([256], "int8", "local")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p2[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 16, p5[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 48, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p5[0], 16, 0, p6[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    
+    @tvm.script.ir_module
+    class ReferenceModule:
+        @T.prim_func
+        def main() -> None:
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            buffer1 = T.buffer_decl([64], "uint8")
+            buffer2 = T.buffer_decl([48], "uint8")
+            buffer3 = T.buffer_decl([48], "uint8")
+            buffer4 = T.buffer_decl([256], "uint8")
+            buffer5 = T.buffer_decl([16], "uint8")
+            buffer6 = T.buffer_decl([48], "uint8")
+            buffer7 = T.buffer_decl([256], "uint8")
+            buffer8 = T.buffer_decl([64], "uint8")
+            # body
+            p1 = T.allocate([48], "uint8", "global")
+            p2 = T.allocate([48], "uint8", "global")
+            p3 = T.allocate([256], "int8", "local")
+            p4 = T.allocate([256], "int8", "global")
+            p5 = T.allocate([16], "uint8", "global")
+            p6 = T.allocate([48], "uint8", "global")
+            p7 = T.allocate([256], "int8", "local")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 16, p5[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 48, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p2[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p5[0], 16, 0, p6[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    # fmt: on
+
+    test_mod = CopyComputeReordering()(InputModule)
+    reference_mod = ReferenceModule
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+

Review Comment:
   Might also be worth adding a test when there are no copy ops. Could we also test the `Expected a single primitive function called 'main'. Please run the...` error when multiple PrimFunc's are provided and when there is no main function?



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,71 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator() {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    auto n{main_func.CopyOnWrite()};
+    n->body = this->VisitStmt(main_func->body);
+    return GetRef<PrimFunc>(n);
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+    bool previous_stmt_is_copy{true};  // Do not move the first stmt if it is a copy
+
+    for (size_t i{}; i < seq_stmt->size(); ++i) {
+      auto stmt{seq_stmt[i]};
+      auto args{stmt.as<EvaluateNode>()->value.as<CallNode>()->args};

Review Comment:
   If the IR input to this pass is not what's expected (e.g. during development) we could get a segfault when performing these casts, which doesn't give much information. I think it would be better to separate them out and check a pointer was returned using an `ICHECK` e.g.
   ```
   auto ev_node  = stmt.as<EvaluateNode>();
   ICHECK(ev_node) << "Expected statement to be an evaluate node, but was " << stmt->GetTypeKey();
   ...
   ```



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,71 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator() {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    auto n{main_func.CopyOnWrite()};
+    n->body = this->VisitStmt(main_func->body);
+    return GetRef<PrimFunc>(n);
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+    bool previous_stmt_is_copy{true};  // Do not move the first stmt if it is a copy
+
+    for (size_t i{}; i < seq_stmt->size(); ++i) {
+      auto stmt{seq_stmt[i]};

Review Comment:
   Nit: it might make the code a bit more readable to use types rather than `auto` here



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] ekalda commented on a diff in pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
ekalda commented on code in PR #10959:
URL: https://github.com/apache/tvm/pull/10959#discussion_r848275763


##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,71 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator() {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    auto n{main_func.CopyOnWrite()};
+    n->body = this->VisitStmt(main_func->body);
+    return GetRef<PrimFunc>(n);
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+    bool previous_stmt_is_copy{true};  // Do not move the first stmt if it is a copy
+
+    for (size_t i{}; i < seq_stmt->size(); ++i) {
+      auto stmt{seq_stmt[i]};
+      auto args{stmt.as<EvaluateNode>()->value.as<CallNode>()->args};
+      bool stmt_is_copy{args[0].as<StringImmNode>()->value == "ethosu_copy"};
+      bool stmt_is_global_copy{stmt_is_copy &&
+                               args[3].as<BufferLoadNode>()->buffer.scope() == "global"};

Review Comment:
   Maybe add a brief comment there why we don't touch copies with scope "local", something along the lines "Copies tagged as "local" copy LUT into the SHRAM which already happens in parallel with copying weights into the weights encoder"



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] lhutton1 commented on pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
lhutton1 commented on PR #10959:
URL: https://github.com/apache/tvm/pull/10959#issuecomment-1106343504

   Looks like it now conflicts with #10344 :/


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] lhutton1 commented on pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
lhutton1 commented on PR #10959:
URL: https://github.com/apache/tvm/pull/10959#issuecomment-1129939305

   Thanks @NicolaLancellotti, @manupa-arm, @ekalda!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] lhutton1 commented on pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
lhutton1 commented on PR #10959:
URL: https://github.com/apache/tvm/pull/10959#issuecomment-1106343503

   Looks like it now conflicts with #10344 :/


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] NicolaLancellotti commented on a diff in pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
NicolaLancellotti commented on code in PR #10959:
URL: https://github.com/apache/tvm/pull/10959#discussion_r863945721


##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {
+        auto i{index - offset};
+        if (i > 0 && !stmt_is_copy(new_seq[i - 1]) && stmt_is_global_copy(new_seq[i])) {
+          std::swap(new_seq[i], new_seq[i - 1]);
+        } else {
+          break;
+        }
+      }
+    }
+
+    auto n{CopyOnWrite(op)};

Review Comment:
   Done.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] NicolaLancellotti commented on a diff in pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
NicolaLancellotti commented on code in PR #10959:
URL: https://github.com/apache/tvm/pull/10959#discussion_r863945160


##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {
+        auto i{index - offset};
+        if (i > 0 && !stmt_is_copy(new_seq[i - 1]) && stmt_is_global_copy(new_seq[i])) {
+          std::swap(new_seq[i], new_seq[i - 1]);
+        } else {
+          break;
+        }
+      }
+    }
+
+    auto n{CopyOnWrite(op)};
+    n->seq = std::move(new_seq);
+    return Stmt{n};
+  }
+
+  tvm::runtime::Array<tvm::PrimExpr> get_stmt_args(Stmt stmt) {

Review Comment:
   Done.



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {
+        auto i{index - offset};
+        if (i > 0 && !stmt_is_copy(new_seq[i - 1]) && stmt_is_global_copy(new_seq[i])) {
+          std::swap(new_seq[i], new_seq[i - 1]);
+        } else {
+          break;
+        }
+      }
+    }
+
+    auto n{CopyOnWrite(op)};
+    n->seq = std::move(new_seq);
+    return Stmt{n};
+  }
+
+  tvm::runtime::Array<tvm::PrimExpr> get_stmt_args(Stmt stmt) {
+    auto eval_node{stmt.as<EvaluateNode>()};
+    ICHECK(eval_node) << "Expected statement to be an evaluate node, but was "
+                      << stmt->GetTypeKey();
+    auto call_node{eval_node->value.as<CallNode>()};
+    ICHECK(call_node) << "Expected expression to be a call node, but was "
+                      << eval_node->value->GetTypeKey();
+    return call_node->args;
+  }
+
+  bool stmt_is_copy(Stmt stmt) {

Review Comment:
   Done.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] manupa-arm commented on a diff in pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
manupa-arm commented on code in PR #10959:
URL: https://github.com/apache/tvm/pull/10959#discussion_r866582888


##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -27,7 +27,17 @@
 #include <tvm/tir/stmt_functor.h>
 #include <tvm/tir/transform.h>
 
+#include <algorithm>
+
 namespace tvm {
+
+/*!
+ * \brief The maximum number of movements allowed for a copy in the CopyComputeReordering pass.
+ */
+constexpr const char* kCopyComputeReorderingMaxCopyMovements =
+    "tir.copy_compute_reordering_max_copy_movements";

Review Comment:
   Since this is not a generic tir pass, let's use tir.ethosu.* naming



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] lhutton1 commented on pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
lhutton1 commented on PR #10959:
URL: https://github.com/apache/tvm/pull/10959#issuecomment-1102402406

   Nice thanks @NicolaLancellotti! It looks like we need a re-trigger due to the linting issues CI had last week :)


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] lhutton1 merged pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
lhutton1 merged PR #10959:
URL: https://github.com/apache/tvm/pull/10959


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] NicolaLancellotti commented on a diff in pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
NicolaLancellotti commented on code in PR #10959:
URL: https://github.com/apache/tvm/pull/10959#discussion_r848549306


##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,71 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator() {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    auto n{main_func.CopyOnWrite()};
+    n->body = this->VisitStmt(main_func->body);
+    return GetRef<PrimFunc>(n);
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+    bool previous_stmt_is_copy{true};  // Do not move the first stmt if it is a copy
+
+    for (size_t i{}; i < seq_stmt->size(); ++i) {
+      auto stmt{seq_stmt[i]};

Review Comment:
   Done.



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,71 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator() {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    auto n{main_func.CopyOnWrite()};
+    n->body = this->VisitStmt(main_func->body);
+    return GetRef<PrimFunc>(n);
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+    bool previous_stmt_is_copy{true};  // Do not move the first stmt if it is a copy
+
+    for (size_t i{}; i < seq_stmt->size(); ++i) {
+      auto stmt{seq_stmt[i]};
+      auto args{stmt.as<EvaluateNode>()->value.as<CallNode>()->args};

Review Comment:
   Done.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] NicolaLancellotti commented on a diff in pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
NicolaLancellotti commented on code in PR #10959:
URL: https://github.com/apache/tvm/pull/10959#discussion_r848549652


##########
tests/python/contrib/test_ethosu/test_copy_compute_reordering.py:
##########
@@ -0,0 +1,179 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pytest
+
+pytest.importorskip("ethosu.vela")
+
+import tvm
+from tvm.script import tir as T
+from tvm.relay.backend.contrib.ethosu.tir.passes import CopyComputeReordering
+
+
+def test_four_convolutions():
+    # fmt: off
+    @tvm.script.ir_module
+    class InputModule:
+        @T.prim_func
+        def main() -> None:
+            # function attr dict
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            buffer1 = T.buffer_decl([8192], "int8")
+            buffer2 = T.buffer_decl([128], "uint8")
+            buffer3 = T.buffer_decl([32], "uint8")
+            buffer4 = T.buffer_decl([112], "uint8")
+            buffer5 = T.buffer_decl([32], "uint8")
+            buffer6 = T.buffer_decl([112], "uint8")
+            buffer7 = T.buffer_decl([32], "uint8")
+            buffer8 = T.buffer_decl([112], "uint8")
+            buffer9 = T.buffer_decl([32], "uint8")
+            buffer10 = T.buffer_decl([2048], "int8")
+            # body
+            p1 = T.allocate([128], "uint8", "global")
+            p2 = T.allocate([112], "uint8", "global")
+            p3 = T.allocate([112], "uint8", "global")
+            p4 = T.allocate([32], "uint8", "global")
+            p5 = T.allocate([32], "uint8", "global")
+            p6 = T.allocate([32], "uint8", "global")
+            p7 = T.allocate([112], "uint8", "global")
+            p8 = T.allocate([32], "uint8", "global")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    
+    @tvm.script.ir_module
+    class ReferenceModule:
+        @T.prim_func
+        def main() -> None:
+            # function attr dict
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            buffer1 = T.buffer_decl([8192], "int8")
+            buffer2 = T.buffer_decl([128], "uint8")
+            buffer3 = T.buffer_decl([32], "uint8")
+            buffer4 = T.buffer_decl([112], "uint8")
+            buffer5 = T.buffer_decl([32], "uint8")
+            buffer6 = T.buffer_decl([112], "uint8")
+            buffer7 = T.buffer_decl([32], "uint8")
+            buffer8 = T.buffer_decl([112], "uint8")
+            buffer9 = T.buffer_decl([32], "uint8")
+            buffer10 = T.buffer_decl([2048], "int8")
+            # body
+            p1 = T.allocate([128], "uint8", "global")
+            p2 = T.allocate([112], "uint8", "global")
+            p3 = T.allocate([112], "uint8", "global")
+            p4 = T.allocate([32], "uint8", "global")
+            p5 = T.allocate([32], "uint8", "global")
+            p6 = T.allocate([32], "uint8", "global")
+            p7 = T.allocate([112], "uint8", "global")
+            p8 = T.allocate([32], "uint8", "global")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    # fmt: on
+    
+    test_mod = CopyComputeReordering()(InputModule)
+    reference_mod = ReferenceModule
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+
+
+def test_copy_to_buffer_with_local_scope():
+    # fmt: off
+    @tvm.script.ir_module
+    class InputModule:
+        @T.prim_func
+        def main() -> None:
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})  
+            buffer1 = T.buffer_decl([64], "uint8")
+            buffer2 = T.buffer_decl([48], "uint8")
+            buffer3 = T.buffer_decl([48], "uint8")
+            buffer4 = T.buffer_decl([256], "uint8")
+            buffer5 = T.buffer_decl([16], "uint8")
+            buffer6 = T.buffer_decl([48], "uint8")
+            buffer7 = T.buffer_decl([256], "uint8")
+            buffer8 = T.buffer_decl([64], "uint8")
+            # body
+            p1 = T.allocate([48], "uint8", "global")
+            p2 = T.allocate([48], "uint8", "global")
+            p3 = T.allocate([256], "int8", "local")
+            p4 = T.allocate([256], "int8", "global")
+            p5 = T.allocate([16], "uint8", "global")
+            p6 = T.allocate([48], "uint8", "global")
+            p7 = T.allocate([256], "int8", "local")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p2[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 16, p5[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 48, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p5[0], 16, 0, p6[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    
+    @tvm.script.ir_module
+    class ReferenceModule:
+        @T.prim_func
+        def main() -> None:
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            buffer1 = T.buffer_decl([64], "uint8")
+            buffer2 = T.buffer_decl([48], "uint8")
+            buffer3 = T.buffer_decl([48], "uint8")
+            buffer4 = T.buffer_decl([256], "uint8")
+            buffer5 = T.buffer_decl([16], "uint8")
+            buffer6 = T.buffer_decl([48], "uint8")
+            buffer7 = T.buffer_decl([256], "uint8")
+            buffer8 = T.buffer_decl([64], "uint8")
+            # body
+            p1 = T.allocate([48], "uint8", "global")
+            p2 = T.allocate([48], "uint8", "global")
+            p3 = T.allocate([256], "int8", "local")
+            p4 = T.allocate([256], "int8", "global")
+            p5 = T.allocate([16], "uint8", "global")
+            p6 = T.allocate([48], "uint8", "global")
+            p7 = T.allocate([256], "int8", "local")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 16, p5[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 48, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p2[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p5[0], 16, 0, p6[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    # fmt: on
+
+    test_mod = CopyComputeReordering()(InputModule)
+    reference_mod = ReferenceModule
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+

Review Comment:
   Done.



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,71 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,

Review Comment:
   Done.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] NicolaLancellotti commented on a diff in pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
NicolaLancellotti commented on code in PR #10959:
URL: https://github.com/apache/tvm/pull/10959#discussion_r863947886


##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,98 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ * Copies to buffers with local scope are not reordered, indeed they copy LUT
+ * into the SHRAM which already happens in parallel with copying weights into
+ * the weights encoder.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator(int max_copy_movements) : _max_copy_movements{max_copy_movements} {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    if (_max_copy_movements > 0) {
+      auto n{main_func.CopyOnWrite()};
+      n->body = this->VisitStmt(main_func->body);
+      return GetRef<PrimFunc>(n);
+    }
+    return main_func;
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+
+    for (size_t index{}; index < new_seq.size(); ++index) {
+      for (int offset{}; offset < _max_copy_movements; ++offset) {

Review Comment:
   Done.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] NicolaLancellotti commented on pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
NicolaLancellotti commented on PR #10959:
URL: https://github.com/apache/tvm/pull/10959#issuecomment-1113508217

   I have added a parameter to specify the maximum number of movements allowed for a copy.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] NicolaLancellotti commented on pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
NicolaLancellotti commented on PR #10959:
URL: https://github.com/apache/tvm/pull/10959#issuecomment-1096855691

   > I'd be interested in seeing how that pass interacts with graphs that have mixture of operators with and without weights, e.g. it seems to me that when we have a graph that looks like
   > `pooling -> copy -> copy -> conv2d -> copy -> copy -> depthwise2d`
   > it will end up after this pass as
   > `copy -> copy -> pooling -> copy -> copy -> conv2d ... `
   > I suppose that is intentional, that we start copying the conv2d weights in while the MAC engine is crunching the pooling? Maybe it's worth adding a test that exercises that kind of mixture of ops?
   
   Yes, it is intentional, and the reordering is just what you said. I have added a test too.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] NicolaLancellotti commented on a diff in pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

Posted by GitBox <gi...@apache.org>.
NicolaLancellotti commented on code in PR #10959:
URL: https://github.com/apache/tvm/pull/10959#discussion_r866646140


##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -27,7 +27,17 @@
 #include <tvm/tir/stmt_functor.h>
 #include <tvm/tir/transform.h>
 
+#include <algorithm>
+
 namespace tvm {
+
+/*!
+ * \brief The maximum number of movements allowed for a copy in the CopyComputeReordering pass.
+ */
+constexpr const char* kCopyComputeReorderingMaxCopyMovements =
+    "tir.copy_compute_reordering_max_copy_movements";

Review Comment:
   I've just done it in this PR.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org