You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by GitBox <gi...@apache.org> on 2022/04/12 09:18:35 UTC

[GitHub] [tvm] lhutton1 commented on a diff in pull request #10959: [microNPU] Add a pass to reorder copy and compute nodes

lhutton1 commented on code in PR #10959:
URL: https://github.com/apache/tvm/pull/10959#discussion_r848191393


##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,71 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,

Review Comment:
   Might be worth mentioning the reordering only happens for global copy operations



##########
tests/python/contrib/test_ethosu/test_copy_compute_reordering.py:
##########
@@ -0,0 +1,179 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pytest
+
+pytest.importorskip("ethosu.vela")
+
+import tvm
+from tvm.script import tir as T
+from tvm.relay.backend.contrib.ethosu.tir.passes import CopyComputeReordering
+
+
+def test_four_convolutions():
+    # fmt: off
+    @tvm.script.ir_module
+    class InputModule:
+        @T.prim_func
+        def main() -> None:
+            # function attr dict
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            buffer1 = T.buffer_decl([8192], "int8")
+            buffer2 = T.buffer_decl([128], "uint8")
+            buffer3 = T.buffer_decl([32], "uint8")
+            buffer4 = T.buffer_decl([112], "uint8")
+            buffer5 = T.buffer_decl([32], "uint8")
+            buffer6 = T.buffer_decl([112], "uint8")
+            buffer7 = T.buffer_decl([32], "uint8")
+            buffer8 = T.buffer_decl([112], "uint8")
+            buffer9 = T.buffer_decl([32], "uint8")
+            buffer10 = T.buffer_decl([2048], "int8")
+            # body
+            p1 = T.allocate([128], "uint8", "global")
+            p2 = T.allocate([112], "uint8", "global")
+            p3 = T.allocate([112], "uint8", "global")
+            p4 = T.allocate([32], "uint8", "global")
+            p5 = T.allocate([32], "uint8", "global")
+            p6 = T.allocate([32], "uint8", "global")
+            p7 = T.allocate([112], "uint8", "global")
+            p8 = T.allocate([32], "uint8", "global")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    
+    @tvm.script.ir_module
+    class ReferenceModule:
+        @T.prim_func
+        def main() -> None:
+            # function attr dict
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            buffer1 = T.buffer_decl([8192], "int8")
+            buffer2 = T.buffer_decl([128], "uint8")
+            buffer3 = T.buffer_decl([32], "uint8")
+            buffer4 = T.buffer_decl([112], "uint8")
+            buffer5 = T.buffer_decl([32], "uint8")
+            buffer6 = T.buffer_decl([112], "uint8")
+            buffer7 = T.buffer_decl([32], "uint8")
+            buffer8 = T.buffer_decl([112], "uint8")
+            buffer9 = T.buffer_decl([32], "uint8")
+            buffer10 = T.buffer_decl([2048], "int8")
+            # body
+            p1 = T.allocate([128], "uint8", "global")
+            p2 = T.allocate([112], "uint8", "global")
+            p3 = T.allocate([112], "uint8", "global")
+            p4 = T.allocate([32], "uint8", "global")
+            p5 = T.allocate([32], "uint8", "global")
+            p6 = T.allocate([32], "uint8", "global")
+            p7 = T.allocate([112], "uint8", "global")
+            p8 = T.allocate([32], "uint8", "global")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 128, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 32, p4[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 112, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 32, p5[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 128, 12, p4[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 112, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 32, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p2[0], 112, 12, p5[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer8[0], 112, p7[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer9[0], 32, p8[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p3[0], 112, 12, p6[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, buffer1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, buffer10[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p7[0], 112, 12, p8[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    # fmt: on
+    
+    test_mod = CopyComputeReordering()(InputModule)
+    reference_mod = ReferenceModule
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+
+
+def test_copy_to_buffer_with_local_scope():
+    # fmt: off
+    @tvm.script.ir_module
+    class InputModule:
+        @T.prim_func
+        def main() -> None:
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})  
+            buffer1 = T.buffer_decl([64], "uint8")
+            buffer2 = T.buffer_decl([48], "uint8")
+            buffer3 = T.buffer_decl([48], "uint8")
+            buffer4 = T.buffer_decl([256], "uint8")
+            buffer5 = T.buffer_decl([16], "uint8")
+            buffer6 = T.buffer_decl([48], "uint8")
+            buffer7 = T.buffer_decl([256], "uint8")
+            buffer8 = T.buffer_decl([64], "uint8")
+            # body
+            p1 = T.allocate([48], "uint8", "global")
+            p2 = T.allocate([48], "uint8", "global")
+            p3 = T.allocate([256], "int8", "local")
+            p4 = T.allocate([256], "int8", "global")
+            p5 = T.allocate([16], "uint8", "global")
+            p6 = T.allocate([48], "uint8", "global")
+            p7 = T.allocate([256], "int8", "local")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p2[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 16, p5[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 48, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p5[0], 16, 0, p6[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    
+    @tvm.script.ir_module
+    class ReferenceModule:
+        @T.prim_func
+        def main() -> None:
+            T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
+            buffer1 = T.buffer_decl([64], "uint8")
+            buffer2 = T.buffer_decl([48], "uint8")
+            buffer3 = T.buffer_decl([48], "uint8")
+            buffer4 = T.buffer_decl([256], "uint8")
+            buffer5 = T.buffer_decl([16], "uint8")
+            buffer6 = T.buffer_decl([48], "uint8")
+            buffer7 = T.buffer_decl([256], "uint8")
+            buffer8 = T.buffer_decl([64], "uint8")
+            # body
+            p1 = T.allocate([48], "uint8", "global")
+            p2 = T.allocate([48], "uint8", "global")
+            p3 = T.allocate([256], "int8", "local")
+            p4 = T.allocate([256], "int8", "global")
+            p5 = T.allocate([16], "uint8", "global")
+            p6 = T.allocate([48], "uint8", "global")
+            p7 = T.allocate([256], "int8", "local")
+            T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 48, p1[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer3[0], 48, p2[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer4[0], 256, p3[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer5[0], 16, p5[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer6[0], 48, p6[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 4, 4, 4, 0, 4, buffer1[0], 0, 0, 0, T.float32(0.00392081), -128, "NHWC", 16, 4, 1, "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.00839574), -128, "NHCWB16", 64, 16, 1, 1, 1, 1, 1, 1, 1, p1[0], 48, 0, p2[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_copy", buffer7[0], 256, p7[0], dtype="handle"))
+            T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 4, 4, 4, 4, 0, 4, p4[0], 0, 0, 0, T.float32(0.0078125), 0, "NHCWB16", 64, 16, 1, "int8", 4, 4, 4, 4, 0, 4, buffer8[0], 0, 0, 0, T.float32(0.00372155), -128, "NHWC", 16, 4, 1, 1, 1, 1, 1, 1, 1, p5[0], 16, 0, p6[0], 48, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
+    # fmt: on
+
+    test_mod = CopyComputeReordering()(InputModule)
+    reference_mod = ReferenceModule
+    tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+

Review Comment:
   Might also be worth adding a test when there are no copy ops. Could we also test the `Expected a single primitive function called 'main'. Please run the...` error when multiple PrimFunc's are provided and when there is no main function?



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,71 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator() {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    auto n{main_func.CopyOnWrite()};
+    n->body = this->VisitStmt(main_func->body);
+    return GetRef<PrimFunc>(n);
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+    bool previous_stmt_is_copy{true};  // Do not move the first stmt if it is a copy
+
+    for (size_t i{}; i < seq_stmt->size(); ++i) {
+      auto stmt{seq_stmt[i]};
+      auto args{stmt.as<EvaluateNode>()->value.as<CallNode>()->args};

Review Comment:
   If the IR input to this pass is not what's expected (e.g. during development) we could get a segfault when performing these casts, which doesn't give much information. I think it would be better to separate them out and check a pointer was returned using an `ICHECK` e.g.
   ```
   auto ev_node  = stmt.as<EvaluateNode>();
   ICHECK(ev_node) << "Expected statement to be an evaluate node, but was " << stmt->GetTypeKey();
   ...
   ```



##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -110,6 +110,71 @@ tvm::transform::Pass HoistAllocates() {
 
 TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
 
+/*!
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * and computes happen in parallel.
+ */
+class CopyComputeReorderingMutator : public StmtExprMutator {
+ public:
+  CopyComputeReorderingMutator() {}
+
+  PrimFunc operator()(PrimFunc main_func) {
+    auto n{main_func.CopyOnWrite()};
+    n->body = this->VisitStmt(main_func->body);
+    return GetRef<PrimFunc>(n);
+  }
+
+ private:
+  Stmt VisitStmt_(const SeqStmtNode* op) override {
+    if (op->size() <= 1) {
+      return StmtExprMutator::VisitStmt_(op);
+    }
+
+    auto seq_stmt{GetRef<SeqStmt>(op)};
+    std::vector<Stmt> new_seq(seq_stmt->size());
+    std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
+    bool previous_stmt_is_copy{true};  // Do not move the first stmt if it is a copy
+
+    for (size_t i{}; i < seq_stmt->size(); ++i) {
+      auto stmt{seq_stmt[i]};

Review Comment:
   Nit: it might make the code a bit more readable to use types rather than `auto` here



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org