You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by mo...@apache.org on 2022/08/12 15:20:14 UTC
[tvm] branch main updated: [microNPU] Reorder copies and computes based on the cycle count (#11591)
This is an automated email from the ASF dual-hosted git repository.
mousius pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new d874a8ed52 [microNPU] Reorder copies and computes based on the cycle count (#11591)
d874a8ed52 is described below
commit d874a8ed5241d5c90207ee54c9bb3003b8244d63
Author: Elen Kalda <el...@arm.com>
AuthorDate: Fri Aug 12 16:20:08 2022 +0100
[microNPU] Reorder copies and computes based on the cycle count (#11591)
If the cascader is enabled and the ops in TIR have the cycle
count annotation, enabling the reorder_by_cycles option will
reorder the copies and computes based on the cycle count.
If reorder_by_cycles is enabled, max_copy_movements is ignored.
This pass is currently not part of the TIR pipeline since it
assumes that weights and bias of a compute op are merged into
one constant (which is WIP).
---
.../relay/backend/contrib/ethosu/tir/compiler.py | 1 +
.../tvm/relay/backend/contrib/ethosu/tir/passes.py | 35 +++-
src/tir/contrib/ethosu/passes.cc | 158 ++++++++++++++--
.../test_ethosu/test_copy_compute_reordering.py | 210 +++++++++++++++++++++
4 files changed, 381 insertions(+), 23 deletions(-)
diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
index 85c6df4c7d..aaac59ad4a 100644
--- a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
+++ b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py
@@ -90,6 +90,7 @@ def lower_ethosu(sch, args, const_dict, name="main"):
mod = tvm.tir.transform.RemoveNoOp()(mod)
mod, const_dict = ethosu_passes.EncodeConstants(const_dict)(mod)
mod = ethosu_passes.HoistAllocates()(mod)
+ mod = tvm.tir.transform.RemoveNoOp()(mod)
# MergeConstant pass currently does not support striped schedules.
# It requires further investigation.
if not util.is_striping_enabled():
diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py
index c0b017e703..cc94c6e816 100644
--- a/python/tvm/relay/backend/contrib/ethosu/tir/passes.py
+++ b/python/tvm/relay/backend/contrib/ethosu/tir/passes.py
@@ -916,14 +916,33 @@ def HoistAllocates() -> tvm.IRModule:
return _ffi_api.HoistAllocates()
-def CopyComputeReordering(max_copy_movements: Optional[int] = None) -> tvm.IRModule:
+def CopyComputeReordering(
+ max_copy_movements: Optional[int] = None, reorder_by_cycles: Optional[bool] = None
+) -> tvm.IRModule:
"""
- Reorders copy and compute nodes in such a way that independent DMA copies,
+ Reorders copy and compute nodes in such a way that independent DMA copies
and computes happen in parallel.
- Copies to buffers with local scope are not reordered, indeed they copy LUT
- into the SHRAM which already happens in parallel with copying weights into
+ Copies to buffers with local scope are not reordered since they copy LUT
+ into the SHRAM and that already happens in parallel with copying weights into
the weights encoder.
+ If reorder_by_cycles is set, we use the compute_cycles_hint to decide the reordering. If it is
+ not set, we move the copies up by a fixed number of movements, either by max_copy_movements if
+ it is specified, or by default value of 1.
+
+ If reordering based on the cycle count is enabled, we try to achieve further copy latency
+ hiding with a two step algorithm:
+ (1) Move all the global copies (i.e. copies that copy a constant into SRAM for conv2d or
+ depthwise_conv2d) above a preceding compute op. If in general the computes take longer than
+ copies, this should be enough to hide the copy latencies.
+ (2) If there are some global copies that take longer than the computes, we might be able to
+ hide them further by moving them further up in a graph since in general there are more compute
+ ops than copy ops in a graph (as only conv2d and depthwise_conv2d have constants associated
+ with them). The algorithm checks whether a copy is hidden and if it is not, it checks if a
+ preceding compute op has a preceding copy and if it doesn't it moves the copy that we try to
+ hide further up. It keeps moving the copy until it can't move it any further or until the
+ latency is hidden.
+
Parameters
----------
max_copy_movements: Optional[int]
@@ -932,12 +951,18 @@ def CopyComputeReordering(max_copy_movements: Optional[int] = None) -> tvm.IRMod
tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements
is used if provided, otherwise the default value will be 1.
+ reorder_by_cycles: Optional[bool]
+ Whether to reorder the computes and copies based on the cycle hint.
+ If None, the pass context option
+ tir.contrib.ethos-u.copy_compute_reordering_reorder_by_cycles
+ is used if provided, otherwise the default value will be False.
+
Returns
-------
tvm.IRModule
The new module with copy and compute nodes reordered.
"""
- return _ffi_api.CopyComputeReordering(max_copy_movements)
+ return _ffi_api.CopyComputeReordering(max_copy_movements, reorder_by_cycles)
def MergeConstants(const_dict):
diff --git a/src/tir/contrib/ethosu/passes.cc b/src/tir/contrib/ethosu/passes.cc
index b662e9dfd0..2f6fa8f3ea 100644
--- a/src/tir/contrib/ethosu/passes.cc
+++ b/src/tir/contrib/ethosu/passes.cc
@@ -41,6 +41,13 @@ constexpr const char* kCopyComputeReorderingMaxCopyMovements =
"tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements";
TVM_REGISTER_PASS_CONFIG_OPTION(kCopyComputeReorderingMaxCopyMovements, Integer);
+/*!
+ * \brief Whether to reorder copies and computes based on cycle count.
+ */
+constexpr const char* kCopyComputeReorderingReorderByCycles =
+ "tir.contrib.ethos-u.copy_compute_reordering_reorder_by_cycles";
+TVM_REGISTER_PASS_CONFIG_OPTION(kCopyComputeReorderingReorderByCycles, Bool);
+
namespace tir {
namespace contrib {
namespace ethosu {
@@ -180,16 +187,16 @@ tvm::transform::Pass HoistAllocates() {
TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
/*!
- * \brief Reorders copy and compute nodes in such a way that independent DMA copies,
+ * \brief Reorders copy and compute nodes in such a way that independent DMA copies
* and computes happen in parallel.
- * Copies to buffers with local scope are not reordered, indeed they copy LUT
- * into the SHRAM which already happens in parallel with copying weights into
+ * Copies to buffers with local scope are not reordered since they copy LUT
+ * into the SHRAM and that already happens in parallel with copying weights into
* the weights encoder.
*/
class CopyComputeReorderingMutator : public StmtExprMutator {
public:
- explicit CopyComputeReorderingMutator(int max_copy_movements)
- : _max_copy_movements{max_copy_movements} {}
+ explicit CopyComputeReorderingMutator(int max_copy_movements, bool reorder_by_cycles)
+ : _max_copy_movements{max_copy_movements}, _reorder_by_cycles{reorder_by_cycles} {}
PrimFunc operator()(PrimFunc main_func) {
if (_max_copy_movements > 0) {
@@ -201,6 +208,13 @@ class CopyComputeReorderingMutator : public StmtExprMutator {
}
private:
+ // A structure to hold a compute op with the corresponding weights/bias copy and LUT copy
+ struct OpWithCopies {
+ Stmt compute_op{};
+ Stmt global_copy{};
+ Stmt local_copy{};
+ };
+
Stmt VisitStmt_(const SeqStmtNode* op) override {
if (op->size() <= 1) {
return StmtExprMutator::VisitStmt_(op);
@@ -210,13 +224,103 @@ class CopyComputeReorderingMutator : public StmtExprMutator {
std::vector<Stmt> new_seq(seq_stmt->size());
std::copy(seq_stmt->seq.begin(), seq_stmt->seq.end(), new_seq.begin());
- // Each copy statement to a buffer with global scope is moved up
- // at most `_max_copy_movements` times.
- for (size_t index = 0; index < new_seq.size(); ++index) {
- if (GetStmtType(new_seq[index]) == StmtType::global_copy) {
- int lower = std::max(0, static_cast<int>(index) - _max_copy_movements);
- for (int i = index; i > lower && (GetStmtType(new_seq[i - 1]) == StmtType::compute); --i) {
- std::swap(new_seq[i - 1], new_seq[i]);
+ // Reorder the copies and computes based on the cycle count
+ if (_reorder_by_cycles) {
+ // We can't hide the first copy, so ignore it for the purpose of hiding copies
+ Stmt first_copy{};
+ if (stmt_is_global_copy(new_seq[0]) ||
+ (stmt_is_local_copy(new_seq[0]) && stmt_is_global_copy(new_seq[1]))) {
+ auto copy_position = stmt_is_global_copy(new_seq[0]) ? 0 : 1;
+ first_copy = new_seq[copy_position];
+ new_seq.erase(new_seq.begin() + copy_position);
+ }
+
+ // Build up a list of cells with the compute op and the copy ops that directly precede it
+ std::vector<OpWithCopies> ops{};
+ for (size_t idx = 0; idx < new_seq.size(); ++idx) {
+ if (stmt_is_compute_op(new_seq[idx])) {
+ OpWithCopies new_op;
+ new_op.compute_op = new_seq[idx];
+ if (idx > 0) {
+ auto prev_op = new_seq[idx - 1];
+ if (!stmt_is_compute_op(prev_op)) {
+ if (stmt_is_local_copy(prev_op)) {
+ new_op.local_copy = prev_op;
+ } else {
+ new_op.global_copy = prev_op;
+ }
+ if (idx > 1) {
+ auto prev_prev_op = new_seq[idx - 2];
+ if (!stmt_is_compute_op(prev_prev_op)) {
+ if (stmt_is_local_copy(prev_prev_op)) {
+ new_op.local_copy = prev_prev_op;
+ } else {
+ new_op.global_copy = prev_prev_op;
+ }
+ }
+ }
+ }
+ }
+ ops.push_back(new_op);
+ }
+ }
+
+ // Move the global copies up by one. If in general the computes take longer than the copies,
+ // that should be good enough
+ for (size_t idx = 1; idx < ops.size(); ++idx) {
+ if (ops[idx].global_copy.as<AttrStmtNode>()) {
+ ops[idx - 1].global_copy = ops[idx].global_copy;
+ ops[idx].global_copy = {};
+ }
+ }
+
+ // If there are long copies, try to hide them further
+ for (size_t idx = ops.size() - 1; idx > 0; --idx) {
+ if (ops[idx].global_copy.as<AttrStmtNode>()) {
+ // Check whether the copy is hidden
+ int64_t copy_cycles{GetStmtCycles(ops[idx].global_copy)};
+ int64_t compute_cycles{GetStmtCycles(ops[idx].compute_op)};
+ bool is_hidden = compute_cycles >= copy_cycles;
+
+ // If the previous compute op is not already hiding another copy, move the copy back, so
+ // that it would be hidden by multiple computes
+ while (!is_hidden && !ops[idx - 1].global_copy.as<AttrStmtNode>() && (idx > 0)) {
+ int64_t new_compute_cycles{GetStmtCycles(ops[idx - 1].compute_op)};
+ ops[idx - 1].global_copy = ops[idx].global_copy;
+ ops[idx].global_copy = {};
+ compute_cycles += new_compute_cycles;
+ is_hidden = compute_cycles >= copy_cycles;
+ --idx;
+ }
+ }
+ }
+
+ // Reconstruct the op sequence from the vector of OpWithCopies
+ new_seq.clear();
+ if (first_copy.as<AttrStmtNode>()) {
+ new_seq.push_back(first_copy);
+ }
+ for (auto& op : ops) {
+ if (op.global_copy.as<AttrStmtNode>()) {
+ new_seq.push_back(op.global_copy);
+ }
+ if (op.local_copy.as<EvaluateNode>()) {
+ new_seq.push_back(op.local_copy);
+ }
+ if (op.compute_op.as<AttrStmtNode>()) {
+ new_seq.push_back(op.compute_op);
+ }
+ }
+ } else {
+ // Each copy statement to a buffer with global scope is moved up
+ // at most `_max_copy_movements` times.
+ for (size_t index = 0; index < new_seq.size(); ++index) {
+ if (GetStmtType(new_seq[index]) == StmtType::global_copy) {
+ int lower = std::max(0, static_cast<int>(index) - _max_copy_movements);
+ for (int i = index; i > lower && (GetStmtType(new_seq[i - 1]) == StmtType::compute);
+ --i) {
+ std::swap(new_seq[i - 1], new_seq[i]);
+ }
}
}
}
@@ -226,28 +330,46 @@ class CopyComputeReorderingMutator : public StmtExprMutator {
return Stmt{seq_stmt_node};
}
+ bool stmt_is_global_copy(const Stmt& stmt) { return GetStmtType(stmt) == StmtType::global_copy; }
+
+ bool stmt_is_local_copy(const Stmt& stmt) { return GetStmtType(stmt) == StmtType::local_copy; }
+
+ bool stmt_is_compute_op(const Stmt& stmt) { return GetStmtType(stmt) == StmtType::compute; }
+
/*! The maximum number of movements allowed for a copy. */
int _max_copy_movements;
+ /*! Whether we use the cycle hint to determine the reordering. */
+ bool _reorder_by_cycles;
};
/*!
- * \brief A pass to reorder copy and compute nodes in such a way that independent DMA copies,
- * and computes happen in parallel.
+ * \brief A pass to reorder copy and compute nodes in such a way that independent DMA copies
+ * and computes happen in parallel. If reorder_by_cycles is set, we will ignore the
+ * max_copy_movements value.
*
- * \param max_copy_movements: The maximum number of movements allowed for a copy.
+ * \param max_copy_movements: The maximum number of movements allowed for a copy.
* If None, the pass context option tir.contrib.ethos-u.copy_compute_reordering_max_copy_movements
* is used if provided, otherwise the default value will be 1.
+ *
+ * \param reorder_by_cycles: Whether to reorder copies and computes by cycles.
+ * If None, the pass context option tir.contrib.ethos-u.copy_compute_reordering_reorder_by_cycles
+ * is used if provided, otherwise the default value will be False. If the value is True,
+ * max_copy_movements will be ignored.
* \return tvm::transform::Pass
*/
-tvm::transform::Pass CopyComputeReordering(Optional<Integer> max_copy_movements) {
+tvm::transform::Pass CopyComputeReordering(Optional<Integer> max_copy_movements,
+ Optional<Bool> reorder_by_cycles) {
auto pass_func = [=](PrimFunc f, IRModule mod, tvm::transform::PassContext ctx) {
ICHECK(mod->GetGlobalVars().size() == 1 && mod->ContainGlobalVar("main"))
<< "Expected a single primitive function called 'main'. Please run the "
"CopyComputeReordering "
"pass in conjunction with the LowerToTIR() pass.";
- auto value = max_copy_movements.value_or(
+
+ auto copy_movements = max_copy_movements.value_or(
ctx->GetConfig(kCopyComputeReorderingMaxCopyMovements, Integer(1)).value());
- return CopyComputeReorderingMutator(value.IntValue())(f);
+ auto reorder = reorder_by_cycles.value_or(
+ ctx->GetConfig(kCopyComputeReorderingReorderByCycles, Bool(false)).value());
+ return CopyComputeReorderingMutator(copy_movements.IntValue(), reorder)(f);
};
return tvm::tir::transform::CreatePrimFuncPass(pass_func, 0,
"tir.contrib.ethos-u.CopyComputeReordering", {});
diff --git a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py
index eebaa3b816..f348fd7f5a 100644
--- a/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py
+++ b/tests/python/contrib/test_ethosu/test_copy_compute_reordering.py
@@ -468,5 +468,215 @@ def test_pass_context_option_max_copy_movements():
tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+def test_reordering_based_on_cycles():
+ # fmt: off
+ @tvm.script.ir_module
+ class ModuleBefore:
+ @T.prim_func
+ def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208, "uint8"], placeholder_encoded_1: T.Buffer[112, "uint8"], placeholder_encoded_2: T.Buffer[96, "uint8"], placeholder_encoded_3: T.Buffer[112, "uint8"], ethosu_write: T.Buffer[43672, "int8"]) -> None:
+ # function attr dict
+ T.func_attr({"tir.noalias": True, "global_symbol": "main", "from_legacy_te_schedule": True})
+ ax0_ax1_fused_ax2_fused_ax3_fused = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_1 = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_2 = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_3 = T.var("int32")
+ nn = T.var("int32")
+ nn_1 = T.var("int32")
+ nn_2 = T.var("int32")
+ nn_3 = T.var("int32")
+ nn_4 = T.var("int32")
+ nn_5 = T.var("int32")
+ # body
+ placeholder_d_global = T.allocate([208], "uint8", "global")
+ placeholder_d_global_1 = T.allocate([112], "uint8", "global")
+ placeholder_d_global_2 = T.allocate([96], "uint8", "global")
+ placeholder_d_global_3 = T.allocate([112], "uint8", "global")
+ ethosu_write_1 = T.allocate([195168], "int8", "global")
+ ethosu_write_2 = T.allocate([184800], "int8", "global")
+ ethosu_write_3 = T.allocate([174688], "int8", "global")
+ ethosu_write_4 = T.allocate([174688], "int8", "global")
+ ethosu_write_5 = T.allocate([174688], "int8", "global")
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1792):
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 208, placeholder_d_global[0], dtype="handle"))
+ with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 250):
+ T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 227, 2, 214, 0, 227, placeholder[0], 0, 0, 0, T.float32(0.0039215679280459881), -128, "NHWC", 454, 2, 1, "int8", 107, 114, 4, 107, 0, 114, ethosu_write_1[0], 0, 0, 0, T.float32(0.009109782986342907), -128, "NHCWB16", 1824, 16, 1, 3, 3, 2, 2, 1, 1, placeholder_d_global[0], 160, T.int8(-1), T.int8(-1), 0, placeholder_d_global[160], 48, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 10, 16, dtype="h [...]
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 1024):
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 112, placeholder_d_global_1[0], dtype="handle"))
+ with T.attr(T.iter_var(nn_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 467):
+ T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 107, 114, 4, 107, 0, 114, ethosu_write_1[0], 0, 0, 0, T.float32(0.009109782986342907), -128, "NHCWB16", 1824, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_2[0], 0, 0, 0, T.float32(0.0066184266470372677), -128, "NHCWB16", 1760, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global_1[0], 64, 0, placeholder_d_global_1[64], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 8, 16, 16, dtype="handle"))
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 1024):
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 96, placeholder_d_global_2[0], dtype="handle"))
+ with T.attr(T.iter_var(nn_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 441):
+ T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_2[0], 0, 0, 0, T.float32(0.0066184266470372677), -128, "NHCWB16", 1760, 16, 1, "int8", 103, 106, 4, 103, 0, 106, ethosu_write_3[0], 0, 0, 0, T.float32(0.0057637207210063934), -128, "NHCWB16", 1696, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global_2[0], 48, 0, placeholder_d_global_2[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 8, 16, 16, dtype="handle"))
+ with T.attr(T.iter_var(nn_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 439):
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 103, 106, 4, 103, 0, 106, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1696, 16, 1, "int8", 103, 106, 4, 103, 0, 106, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1696, 16, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 64, 16, dtype="handle"))
+ with T.attr(T.iter_var(nn_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 439):
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 103, 106, 4, 103, 0, 106, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1696, 16, 1, "int8", 103, 106, 4, 103, 0, 106, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1696, 16, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 64, 16, dtype="handle"))
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 1024):
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_3[0], 112, placeholder_d_global_3[0], dtype="handle"))
+ T.attr(T.iter_var(nn_5, None, "DataPar", ""), "pragma_compute_cycles_hint", 22340)
+ T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 103, 106, 4, 103, 0, 106, ethosu_write_5[0], 0, 0, 0, T.float32(0.0057637207210063934), -128, "NHCWB16", 1696, 16, 1, "int8", 103, 106, 4, 103, 0, 106, ethosu_write[0], 0, 0, 0, T.float32(0.0057619437575340271), -128, "NHWC", 424, 4, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global_3[0], 64, 0, placeholder_d_global_3[64], 48, 1, 2, 1, 2, "NONE", 0, 0, "TFL", "NONE", 14, 18, 8, dtype="handle"))
+
+
+ @tvm.script.ir_module
+ class ModuleAfter:
+ @T.prim_func
+ def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208, "uint8"], placeholder_encoded_1: T.Buffer[112, "uint8"], placeholder_encoded_2: T.Buffer[96, "uint8"], placeholder_encoded_3: T.Buffer[112, "uint8"], ethosu_write: T.Buffer[43672, "int8"]) -> None:
+ # function attr dict
+ T.func_attr({"tir.noalias": True, "global_symbol": "main", "from_legacy_te_schedule": True})
+ ax0_ax1_fused_ax2_fused_ax3_fused = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_1 = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_2 = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_3 = T.var("int32")
+ nn = T.var("int32")
+ nn_1 = T.var("int32")
+ nn_2 = T.var("int32")
+ nn_3 = T.var("int32")
+ nn_4 = T.var("int32")
+ nn_5 = T.var("int32")
+ # body
+ placeholder_d_global = T.allocate([208], "uint8", "global")
+ placeholder_d_global_1 = T.allocate([112], "uint8", "global")
+ placeholder_d_global_2 = T.allocate([96], "uint8", "global")
+ placeholder_d_global_3 = T.allocate([112], "uint8", "global")
+ ethosu_write_1 = T.allocate([195168], "int8", "global")
+ ethosu_write_2 = T.allocate([184800], "int8", "global")
+ ethosu_write_3 = T.allocate([174688], "int8", "global")
+ ethosu_write_4 = T.allocate([174688], "int8", "global")
+ ethosu_write_5 = T.allocate([174688], "int8", "global")
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1792):
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 208, placeholder_d_global[0], dtype="handle"))
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 1024):
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 112, placeholder_d_global_1[0], dtype="handle"))
+ with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 250):
+ T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 227, 2, 214, 0, 227, placeholder[0], 0, 0, 0, T.float32(0.0039215679280459881), -128, "NHWC", 454, 2, 1, "int8", 107, 114, 4, 107, 0, 114, ethosu_write_1[0], 0, 0, 0, T.float32(0.009109782986342907), -128, "NHCWB16", 1824, 16, 1, 3, 3, 2, 2, 1, 1, placeholder_d_global[0], 160, T.int8(-1), T.int8(-1), 0, placeholder_d_global[160], 48, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 10, 16, dtype="h [...]
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 1024):
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 96, placeholder_d_global_2[0], dtype="handle"))
+ with T.attr(T.iter_var(nn_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 467):
+ T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 107, 114, 4, 107, 0, 114, ethosu_write_1[0], 0, 0, 0, T.float32(0.009109782986342907), -128, "NHCWB16", 1824, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_2[0], 0, 0, 0, T.float32(0.0066184266470372677), -128, "NHCWB16", 1760, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global_1[0], 64, 0, placeholder_d_global_1[64], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 8, 16, 16, dtype="handle"))
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 1024):
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_3[0], 112, placeholder_d_global_3[0], dtype="handle"))
+ with T.attr(T.iter_var(nn_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 441):
+ T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_2[0], 0, 0, 0, T.float32(0.0066184266470372677), -128, "NHCWB16", 1760, 16, 1, "int8", 103, 106, 4, 103, 0, 106, ethosu_write_3[0], 0, 0, 0, T.float32(0.0057637207210063934), -128, "NHCWB16", 1696, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global_2[0], 48, 0, placeholder_d_global_2[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 8, 16, 16, dtype="handle"))
+ with T.attr(T.iter_var(nn_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 439):
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 103, 106, 4, 103, 0, 106, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1696, 16, 1, "int8", 103, 106, 4, 103, 0, 106, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1696, 16, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 64, 16, dtype="handle"))
+ with T.attr(T.iter_var(nn_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 439):
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 103, 106, 4, 103, 0, 106, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1696, 16, 1, "int8", 103, 106, 4, 103, 0, 106, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1696, 16, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 64, 16, dtype="handle"))
+ T.attr(T.iter_var(nn_5, None, "DataPar", ""), "pragma_compute_cycles_hint", 22340)
+ T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 103, 106, 4, 103, 0, 106, ethosu_write_5[0], 0, 0, 0, T.float32(0.0057637207210063934), -128, "NHCWB16", 1696, 16, 1, "int8", 103, 106, 4, 103, 0, 106, ethosu_write[0], 0, 0, 0, T.float32(0.0057619437575340271), -128, "NHWC", 424, 4, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global_3[0], 64, 0, placeholder_d_global_3[64], 48, 1, 2, 1, 2, "NONE", 0, 0, "TFL", "NONE", 14, 18, 8, dtype="handle"))
+ # fmt: on
+
+ test_mod = CopyComputeReordering(reorder_by_cycles=True)(ModuleBefore)
+ reference_mod = ModuleAfter
+ tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+
+
+def test_reordering_based_on_cycles_luts_present():
+ # fmt: off
+ @tvm.script.ir_module
+ class ModuleBefore:
+ @T.prim_func
+ def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208, "uint8"], placeholder_encoded_1: T.Buffer[112, "uint8"], placeholder_1: T.Buffer[256, "int8"], placeholder_encoded_2: T.Buffer[96, "uint8"], placeholder_2: T.Buffer[256, "int8"], placeholder_3: T.Buffer[256, "int8"], ethosu_write: T.Buffer[46200, "int8"]) -> None:
+ # function attr dict
+ T.func_attr({"tir.noalias": True, "global_symbol": "main", "from_legacy_te_schedule": True})
+ ax0_ax1_fused_ax2_fused_ax3_fused = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_1 = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_2 = T.var("int32")
+ nn = T.var("int32")
+ nn_1 = T.var("int32")
+ nn_2 = T.var("int32")
+ nn_3 = T.var("int32")
+ nn_4 = T.var("int32")
+ nn_5 = T.var("int32")
+ # body
+ placeholder_d_d_global = T.allocate([208], "uint8", "global")
+ placeholder_d_d_global_1 = T.allocate([112], "uint8", "global")
+ placeholder_d_global = T.allocate([96], "uint8", "global")
+ ethosu_write_1 = T.allocate([195168], "int8", "global")
+ placeholder_local = T.allocate([256], "int8", "local")
+ ethosu_write_2 = T.allocate([184800], "int8", "global")
+ ethosu_write_3 = T.allocate([184800], "int8", "global")
+ ethosu_write_4 = T.allocate([184800], "int8", "global")
+ placeholder_d_local = T.allocate([256], "int8", "local")
+ ethosu_write_5 = T.allocate([184800], "int8", "global")
+ placeholder_d_d_local = T.allocate([256], "int8", "local")
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1792):
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 208, placeholder_d_d_global[0], dtype="handle"))
+ with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 73668):
+ T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 227, 2, 214, 0, 227, placeholder[0], 0, 0, 0, T.float32(0.0039215679280459881), -128, "NHWC", 454, 2, 1, "int8", 107, 114, 4, 107, 0, 114, ethosu_write_1[0], 0, 0, 0, T.float32(0.009109782986342907), -128, "NHCWB16", 1824, 16, 1, 3, 3, 2, 2, 1, 1, placeholder_d_d_global[0], 160, T.int8(-1), T.int8(-1), 0, placeholder_d_d_global[160], 48, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 10, 16, dtyp [...]
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_1[0], 256, placeholder_local[0], dtype="handle"))
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 384):
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 112, placeholder_d_d_global_1[0], dtype="handle"))
+ with T.attr(T.iter_var(nn_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 330):
+ T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 107, 114, 4, 107, 0, 114, ethosu_write_1[0], 0, 0, 0, T.float32(0.009109782986342907), -128, "NHCWB16", 1824, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_2[0], 0, 0, 0, T.float32(0.0066184266470372677), -128, "NHCWB16", 1760, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_d_global_1[0], 64, 0, placeholder_d_d_global_1[64], 48, 0, 0, 0, 0, "SIGMOID", 0, 0, "TFL", "NONE", 8, 16, 16, dtype="handle"))
+ with T.attr(T.iter_var(nn_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 411):
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_2[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 64, 16, dtype="handle"))
+ with T.attr(T.iter_var(nn_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 458):
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 64, 16, dtype="handle"))
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_2[0], 256, placeholder_d_local[0], dtype="handle"))
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 1500):
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 96, placeholder_d_global[0], dtype="handle"))
+ with T.attr(T.iter_var(nn_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 10464):
+ T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_4[0], 0, 0, 0, T.float32(0.00390625), -128, "NHCWB16", 1760, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_5[0], 0, 0, 0, T.float32(0.00381289585493505), -128, "NHCWB16", 1760, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global[0], 48, 0, placeholder_d_global[48], 48, 1, 2, 1, 2, "TANH", 0, 0, "TFL", "NONE", 8, 16, 16, dtype="handle"))
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_3[0], 256, placeholder_d_d_local[0], dtype="handle"))
+ T.attr(T.iter_var(nn_5, None, "DataPar", ""), "pragma_compute_cycles_hint", 5253)
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 440, 4, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 4, 64, 8, dtype="handle"))
+
+
+ # ModuleAfter: machine-generated expected TIR — the reference output that
+ # CopyComputeReordering(reorder_by_cycles=True) should produce from ModuleBefore.
+ # Compared with assert_structural_equal, so every token below is load-bearing;
+ # do not hand-edit. Relative to ModuleBefore, the weight-copy calls
+ # ("ethosu_copy") have been hoisted ahead of compute ops based on the
+ # "pragma_compute_cycles_hint" annotations, so DMA transfers overlap with
+ # NPU compute (see the pass description in the commit message).
+ @tvm.script.ir_module
+ class ModuleAfter:
+ @T.prim_func
+ def main(placeholder: T.Buffer[97156, "int8"], placeholder_encoded: T.Buffer[208, "uint8"], placeholder_encoded_1: T.Buffer[112, "uint8"], placeholder_1: T.Buffer[256, "int8"], placeholder_encoded_2: T.Buffer[96, "uint8"], placeholder_2: T.Buffer[256, "int8"], placeholder_3: T.Buffer[256, "int8"], ethosu_write: T.Buffer[46200, "int8"]) -> None:
+ # function attr dict
+ T.func_attr({"tir.noalias": True, "global_symbol": "main", "from_legacy_te_schedule": True})
+ # Loop iteration variables for the copy (ax*) and compute (nn*) ops.
+ ax0_ax1_fused_ax2_fused_ax3_fused = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_1 = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_2 = T.var("int32")
+ nn = T.var("int32")
+ nn_1 = T.var("int32")
+ nn_2 = T.var("int32")
+ nn_3 = T.var("int32")
+ nn_4 = T.var("int32")
+ nn_5 = T.var("int32")
+ # body
+ # Intermediate buffers: "global" presumably maps to NPU SRAM/DRAM scope and
+ # "local" to on-chip storage — scope semantics come from the Ethos-U lowering,
+ # not visible in this file (NOTE(review): confirm against the backend docs).
+ placeholder_d_d_global = T.allocate([208], "uint8", "global")
+ placeholder_d_d_global_1 = T.allocate([112], "uint8", "global")
+ placeholder_d_global = T.allocate([96], "uint8", "global")
+ ethosu_write_1 = T.allocate([195168], "int8", "global")
+ placeholder_local = T.allocate([256], "int8", "local")
+ ethosu_write_2 = T.allocate([184800], "int8", "global")
+ ethosu_write_3 = T.allocate([184800], "int8", "global")
+ ethosu_write_4 = T.allocate([184800], "int8", "global")
+ placeholder_d_local = T.allocate([256], "int8", "local")
+ ethosu_write_5 = T.allocate([184800], "int8", "global")
+ placeholder_d_d_local = T.allocate([256], "int8", "local")
+ # Each "pragma_compute_cycles_hint" attr carries the cycle-count estimate the
+ # reordering pass uses to decide how far a copy can be moved up.
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None, "DataPar", ""), "pragma_compute_cycles_hint", 1792):
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded[0], 208, placeholder_d_d_global[0], dtype="handle"))
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 384):
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_1[0], 112, placeholder_d_d_global_1[0], dtype="handle"))
+ with T.attr(T.iter_var(nn, None, "DataPar", ""), "pragma_compute_cycles_hint", 73668):
+ T.evaluate(T.call_extern("ethosu_conv2d", "int8", 214, 227, 2, 214, 0, 227, placeholder[0], 0, 0, 0, T.float32(0.0039215679280459881), -128, "NHWC", 454, 2, 1, "int8", 107, 114, 4, 107, 0, 114, ethosu_write_1[0], 0, 0, 0, T.float32(0.009109782986342907), -128, "NHCWB16", 1824, 16, 1, 3, 3, 2, 2, 1, 1, placeholder_d_d_global[0], 160, T.int8(-1), T.int8(-1), 0, placeholder_d_d_global[160], 48, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 10, 16, dtyp [...]
+ # The copy below was moved earlier than in ModuleBefore (where it sat just
+ # before the depthwise_conv2d that consumes it) so it overlaps with compute.
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 1500):
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_encoded_2[0], 96, placeholder_d_global[0], dtype="handle"))
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_1[0], 256, placeholder_local[0], dtype="handle"))
+ with T.attr(T.iter_var(nn_1, None, "DataPar", ""), "pragma_compute_cycles_hint", 330):
+ T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 107, 114, 4, 107, 0, 114, ethosu_write_1[0], 0, 0, 0, T.float32(0.009109782986342907), -128, "NHCWB16", 1824, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_2[0], 0, 0, 0, T.float32(0.0066184266470372677), -128, "NHCWB16", 1760, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_d_global_1[0], 64, 0, placeholder_d_d_global_1[64], 48, 0, 0, 0, 0, "SIGMOID", 0, 0, "TFL", "NONE", 8, 16, 16, dtype="handle"))
+ with T.attr(T.iter_var(nn_2, None, "DataPar", ""), "pragma_compute_cycles_hint", 411):
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_2[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 64, 16, dtype="handle"))
+ with T.attr(T.iter_var(nn_3, None, "DataPar", ""), "pragma_compute_cycles_hint", 458):
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_3[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "AVG", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 2, 64, 16, dtype="handle"))
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_2[0], 256, placeholder_d_local[0], dtype="handle"))
+ with T.attr(T.iter_var(nn_4, None, "DataPar", ""), "pragma_compute_cycles_hint", 10464):
+ T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_4[0], 0, 0, 0, T.float32(0.00390625), -128, "NHCWB16", 1760, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write_5[0], 0, 0, 0, T.float32(0.00381289585493505), -128, "NHCWB16", 1760, 16, 1, 3, 2, 1, 1, 2, 2, placeholder_d_global[0], 48, 0, placeholder_d_global[48], 48, 1, 2, 1, 2, "TANH", 0, 0, "TFL", "NONE", 8, 16, 16, dtype="handle"))
+ T.evaluate(T.call_extern("ethosu_copy", placeholder_3[0], 256, placeholder_d_d_local[0], dtype="handle"))
+ T.attr(T.iter_var(nn_5, None, "DataPar", ""), "pragma_compute_cycles_hint", 5253)
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 105, 110, 4, 105, 0, 110, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHCWB16", 1760, 16, 1, "int8", 105, 110, 4, 105, 0, 110, ethosu_write[0], 0, 0, 0, T.float32(1), 0, "NHWC", 440, 4, 1, "MAX", 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, "TANH", 0, 0, "TFL", "NONE", 4, 64, 8, dtype="handle"))
+ # fmt: on
+
+ test_mod = CopyComputeReordering(reorder_by_cycles=True)(ModuleBefore)
+ reference_mod = ModuleAfter
+ tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+
+
# Standard TVM test entry point: run this file's tests directly under pytest.
if __name__ == "__main__":
pytest.main([__file__])