You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by ma...@apache.org on 2022/07/06 15:46:35 UTC

[tvm] branch main updated: [microNPU] Calculate memory pressure for microNPU external functions (#11209)

This is an automated email from the ASF dual-hosted git repository.

manupa pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new cfe8318990 [microNPU] Calculate memory pressure for microNPU external functions (#11209)
cfe8318990 is described below

commit cfe8318990e799215c9baac4ef2c4ecb20a91d9f
Author: Luke Hutton <lu...@arm.com>
AuthorDate: Wed Jul 6 16:46:29 2022 +0100

    [microNPU] Calculate memory pressure for microNPU external functions (#11209)
    
    * [microNPU] Calculate memory pressure for microNPU external functions
    
    During the microNPU compilation stage, the "used_memory" annotations on
    external microNPU functions are read to determine a memory pressure
    value. This value is passed to the cascader to better approximate the
    memory available for the optimization.
    
    Change-Id: I11a311b0005e785637014cb451f4aed96edcda26
    
    * fix get size from memory region
    
    Change-Id: I41acfc83f05b2204075edb99f86a0eecaba00f71
    
    * add test case for full offload
    
    Change-Id: If3e672d402ab237fa82e34761bb972d2e9483ba9
---
 python/tvm/contrib/ethosu/cascader/scheduler.py    |   8 +-
 python/tvm/relay/backend/contrib/ethosu/codegen.py |  59 +++++--
 python/tvm/tir/usmp/utils.py                       |   9 +
 .../cascader/test_calculate_memory_pressure.py     | 186 +++++++++++++++++++++
 .../test_ethosu/cascader/test_memory_reduction.py  | 163 +++++++++++++++++-
 5 files changed, 407 insertions(+), 18 deletions(-)

diff --git a/python/tvm/contrib/ethosu/cascader/scheduler.py b/python/tvm/contrib/ethosu/cascader/scheduler.py
index fd247e660a..2c804a3b3b 100644
--- a/python/tvm/contrib/ethosu/cascader/scheduler.py
+++ b/python/tvm/contrib/ethosu/cascader/scheduler.py
@@ -225,21 +225,21 @@ def choose_proposal(
     return proposal_choice
 
 
-def extract_memory_info(memory_pool: PoolInfo) -> MemoryRegion:
+def extract_memory_info(memory_pool: PoolInfo, memory_pressure: int) -> MemoryRegion:
     "Create a MemoryRegion based on the info in the memory pool"
-    size = int(memory_pool.size_hint_bytes)
+    size = int(memory_pool.size_hint_bytes - memory_pressure)
     read_bandwidth = int(memory_pool.read_bandwidth_bytes_per_cycle)
     write_bandwidth = int(memory_pool.write_bandwidth_bytes_per_cycle)
 
     for param in (size, read_bandwidth, write_bandwidth):
         assert param != -1, f"{param} needs to be specified for the cascader."
 
-    name_to_burst_lenght = {
+    name_to_burst_length = {
         target.kind.name: burst for target, burst in memory_pool.target_burst_bytes.items()
     }
 
     try:
-        burst_length = int(name_to_burst_lenght["ethos-u"])
+        burst_length = int(name_to_burst_length["ethos-u"])
     except KeyError:
         burst_length = 1
 
diff --git a/python/tvm/relay/backend/contrib/ethosu/codegen.py b/python/tvm/relay/backend/contrib/ethosu/codegen.py
index 423834daa8..5119c04edb 100644
--- a/python/tvm/relay/backend/contrib/ethosu/codegen.py
+++ b/python/tvm/relay/backend/contrib/ethosu/codegen.py
@@ -381,6 +381,46 @@ def _ethos_u55_cascader(sram, enable_striping) -> Callable:
     )
 
 
+def _calculate_memory_pressure(mod: tvm.ir.IRModule) -> int:
+    """
+    Calculates a worst-case estimate of the memory consumed at the callsite of
+    each microNPU function. This value can be used as a hint to guide the cascader,
+    indicating how aggressively it will need to optimize the input module to fit
+    into the memory that remains in the memory workspace.
+
+    Parameters
+    ----------
+    mod : tvm.ir.IRModule
+        The input module
+
+    Returns
+    -------
+    int
+        Memory pressure value for the module.
+    """
+    memory_pressure = 0
+
+    @util.create_npu_function_pass(opt_level=1)
+    class CalculateMemoryPressure:
+        """
+        Traverse the module and get total memory used by external NPU functions.
+        """
+
+        def transform_npu_function(self, _, func: relay.Function) -> relay.Function:
+            nonlocal memory_pressure
+            max_val = max(func.attrs["used_memory"])
+            memory_pressure += max_val
+            return func
+
+    CalculateMemoryPressure()(mod)  # pylint: disable=not-callable
+
+    io_used_memory = 0
+    if not tvm.tir.usmp.utils.use_workspace_io_is_enabled():
+        io_used_memory = int(mod["main"].attrs["io_used_memory"])
+
+    return memory_pressure - io_used_memory
+
+
 @tvm._ffi.register_func("relay.ext.ethos-u.relay_to_tir")
 def relay_to_tir(mod: tvm.ir.IRModule) -> tvm.ir.IRModule:
     """
@@ -413,21 +453,18 @@ def relay_to_tir(mod: tvm.ir.IRModule) -> tvm.ir.IRModule:
     # Use the cascader if it is enabled for the U55 accelerator, otherwise use copy_constants
     # scheduler
     if util.is_cascader_enabled():
-        assert (
-            util.get_accelerator_config() != "ethos-u65-256"
-        ), "Cascading is not supported for the U65 accelerator"
+        if util.get_accelerator_config() == "ethos-u65-256":
+            raise ValueError("Cascading is not supported for the U65 accelerator")
 
         workspace_memory_pools = mod.attrs["workspace_memory_pools"]
 
-        assert (
-            workspace_memory_pools
-        ), "Workspace memory pool needs to be provided for the U55 cascader"
-
-        assert (
-            len(workspace_memory_pools.pools) == 1
-        ), "Exactly one workspace pool needs to be provided for the U55 cascader"
+        if not workspace_memory_pools:
+            raise ValueError("Workspace memory pool needs to be provided for the U55 cascader")
+        if len(workspace_memory_pools.pools) != 1:
+            raise ValueError("Exactly one workspace pool needs to be provided for the U55 cascader")
 
-        sram = extract_memory_info(workspace_memory_pools.pools[0])
+        memory_pressure = _calculate_memory_pressure(mod)
+        sram = extract_memory_info(workspace_memory_pools.pools[0], memory_pressure)
         tir_mod = LowerToTIR(_ethos_u55_cascader(sram, util.is_striping_enabled()))(mod)
     else:
         tir_mod = LowerToTIR(copy_constants())(mod)
diff --git a/python/tvm/tir/usmp/utils.py b/python/tvm/tir/usmp/utils.py
index a7221cfe6f..024922e85b 100644
--- a/python/tvm/tir/usmp/utils.py
+++ b/python/tvm/tir/usmp/utils.py
@@ -19,6 +19,7 @@
 
 from typing import Optional, List
 
+import tvm
 from tvm._ffi import register_object
 from tvm.runtime import Object
 from . import _ffi_api
@@ -31,6 +32,14 @@ from ...ir.memory_pools import PoolInfo
 CANDIDATE_MEMORY_POOL_ATTR = "candidate_memory_pools"
 
 
+def use_workspace_io_is_enabled() -> bool:
+    """
+    Check whether placing I/O tensors in the workspace is enabled.
+    """
+    ctx = tvm.transform.PassContext.current()
+    return bool(ctx.config.get("tir.usmp.use_workspace_io", False))
+
+
 @register_object("tir.usmp.BufferInfo")
 class BufferInfo(Object):
     """BufferInfo object holds information related to buffers
diff --git a/tests/python/contrib/test_ethosu/cascader/test_calculate_memory_pressure.py b/tests/python/contrib/test_ethosu/cascader/test_calculate_memory_pressure.py
new file mode 100644
index 0000000000..255ec4bba8
--- /dev/null
+++ b/tests/python/contrib/test_ethosu/cascader/test_calculate_memory_pressure.py
@@ -0,0 +1,186 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=wrong-import-position
+
+"""
+Test memory pressure is calculated correctly from used memory annotations.
+"""
+
+import pytest
+
+pytest.importorskip("ethosu.vela")
+
+import tvm
+from tvm import relay
+from tvm.relay.backend.contrib.ethosu.codegen import _calculate_memory_pressure
+from tvm.contrib.ethosu.cascader.scheduler import extract_memory_info
+from tvm import WorkspacePoolInfo, PoolInfoProperties
+
+
+def _npu_and_non_npu_functions():
+    mod = tvm.IRModule({})
+
+    # NPU function 1
+    x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
+    max_pool = relay.nn.max_pool2d(x)
+    composite_func = relay.Function([x], max_pool)
+    composite_func = composite_func.with_attr("Composite", "ethos-u.pooling")
+    inp = relay.var("input", shape=(1, 2, 2, 4), dtype="int8")
+    compiler_func = relay.Function([inp], composite_func)
+    compiler_func = compiler_func.with_attr("used_memory", [32])
+    npu_compiler_func1 = compiler_func.with_attr("Compiler", "ethos-u")
+    g1 = relay.GlobalVar("g1")
+    mod[g1] = npu_compiler_func1
+
+    # Non-NPU function
+    x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
+    max_pool = relay.abs(x)
+    composite_func = relay.Function([x], max_pool)
+    composite_func = composite_func.with_attr("Composite", "foo.unary_elementwise")
+    inp = relay.var("input", shape=(1, 2, 2, 4), dtype="int8")
+    compiler_func = relay.Function([inp], composite_func)
+    compiler_func = compiler_func.with_attr("used_memory", [32])
+    non_npu_compiler_func = compiler_func.with_attr("Compiler", "foo")
+    g2 = relay.GlobalVar("g2")
+    mod[g2] = non_npu_compiler_func
+
+    # NPU function 2
+    x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
+    max_pool = relay.abs(x)
+    composite_func = relay.Function([x], max_pool)
+    composite_func = composite_func.with_attr("Composite", "ethos-u.unary_elementwise")
+    inp = relay.var("input", shape=(1, 2, 2, 4), dtype="int8")
+    compiler_func = relay.Function([inp], composite_func)
+    compiler_func = compiler_func.with_attr("used_memory", [32])
+    npu_compiler_func2 = compiler_func.with_attr("Compiler", "ethos-u")
+    g3 = relay.GlobalVar("g3")
+    mod[g3] = npu_compiler_func2
+
+    # Main
+    inp = relay.var("main_input", shape=(1, 2, 2, 4), dtype="int8")
+    call1 = relay.Call(g1, [inp])
+    call2 = relay.Call(g2, [call1])
+    call3 = relay.Call(g3, [call2])
+    main_func = relay.Function([inp], call3)
+    main_func = main_func.with_attr("io_used_memory", 32)
+    mod["main"] = main_func
+    return mod
+
+
+def _parallel_npu_functions():
+    mod = tvm.IRModule({})
+
+    # NPU function 1
+    x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
+    max_pool = relay.nn.max_pool2d(x)
+    composite_func = relay.Function([x], max_pool)
+    composite_func = composite_func.with_attr("Composite", "ethos-u.pooling")
+    inp = relay.var("input", shape=(1, 2, 2, 4), dtype="int8")
+    compiler_func = relay.Function([inp], composite_func)
+    compiler_func = compiler_func.with_attr("used_memory", [32])
+    npu_compiler_func1 = compiler_func.with_attr("Compiler", "ethos-u")
+    g1 = relay.GlobalVar("g1")
+    mod[g1] = npu_compiler_func1
+
+    # NPU function 2
+    x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
+    abs_op = relay.abs(x)
+    composite_func = relay.Function([x], abs_op)
+    composite_func = composite_func.with_attr("Composite", "ethos-u.unary_elementwise")
+    inp = relay.var("input", shape=(1, 2, 2, 4), dtype="int8")
+    compiler_func = relay.Function([inp], composite_func)
+    compiler_func = compiler_func.with_attr("used_memory", [32 + 16])
+    npu_compiler_func2 = compiler_func.with_attr("Compiler", "ethos-u")
+    g2 = relay.GlobalVar("g2")
+    mod[g2] = npu_compiler_func2
+
+    # Main
+    inp = relay.var("main_input", shape=(1, 2, 2, 4), dtype="int8")
+    call1 = relay.Call(g1, [inp])
+    call2 = relay.Call(g2, [inp])
+    concat = relay.concatenate([call1, call2], axis=3)
+    main_func = relay.Function([inp], concat)
+    main_func = main_func.with_attr("io_used_memory", 32)
+    mod["main"] = main_func
+    return mod
+
+
+def _full_offload():
+    mod = tvm.IRModule({})
+
+    # NPU function
+    x = relay.var("x", shape=(1, 4, 4, 16), dtype="int8")
+    max_pool = relay.nn.max_pool2d(x)
+    composite_func = relay.Function([x], max_pool)
+    composite_func = composite_func.with_attr("Composite", "ethos-u.pooling")
+    inp = relay.var("input", shape=(1, 4, 4, 16), dtype="int8")
+    compiler_func = relay.Function([inp], composite_func)
+    compiler_func = compiler_func.with_attr("used_memory", [256 + 256])
+    npu_compiler_func = compiler_func.with_attr("Compiler", "ethos-u")
+    g1 = relay.GlobalVar("g1")
+    mod[g1] = npu_compiler_func
+
+    # Main
+    inp = relay.var("main_input", shape=(1, 4, 4, 16), dtype="int8")
+    call = relay.Call(g1, [inp])
+    main_func = relay.Function([inp], call)
+    main_func = main_func.with_attr("io_used_memory", 256 + 256)
+    mod["main"] = main_func
+    return mod
+
+
+@pytest.mark.parametrize(
+    "model_func,use_workspace_io,expected_memory_pressure",
+    [
+        (_npu_and_non_npu_functions, True, (16 + 16) + (16 + 16)),
+        (_npu_and_non_npu_functions, False, (16 + 16) + (16 + 16) - (16 + 16)),
+        (_parallel_npu_functions, True, (16 + 16) + (16 + 16 + 16)),
+        (_parallel_npu_functions, False, (16 + 16) + (16 + 16 + 16) - (16 + 16)),
+        (_full_offload, True, (256 + 256)),
+        (_full_offload, False, (256 + 256) - (256 + 256)),
+    ],
+)
+def test_calculate_memory_pressure_pass(model_func, use_workspace_io, expected_memory_pressure):
+    """
+    Test that memory pressure is correctly calculated for NPU external functions.
+    """
+
+    mod = model_func()
+    with tvm.transform.PassContext(config={"tir.usmp.use_workspace_io": use_workspace_io}):
+        memory_pressure = _calculate_memory_pressure(mod)
+    assert memory_pressure == expected_memory_pressure
+
+
+def test_extract_memory_info():
+    """
+    Test memory pressure value correctly reduces the workspace size.
+    """
+    initial_pool_size = 2000
+    memory_pressure = 500
+    memory_pool = WorkspacePoolInfo(
+        "SRAM",
+        [tvm.target.Target("c"), tvm.target.Target("ethos-u")],
+        PoolInfoProperties(
+            size_hint_bytes=initial_pool_size,
+            read_bandwidth_bytes_per_cycle=16,
+            write_bandwidth_bytes_per_cycle=16,
+            target_burst_bytes={tvm.target.Target("ethos-u"): 1},
+        ),
+    )
+
+    sram = extract_memory_info(memory_pool, memory_pressure)
+    assert sram.size == initial_pool_size - memory_pressure
diff --git a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
index 5c3b745cb4..e882822405 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
@@ -28,13 +28,12 @@ from tvm.micro import model_library_format as mlf
 from tvm.relay.op.contrib.ethosu import partition_for_ethosu
 import tvm
 from tvm import WorkspaceMemoryPools, WorkspacePoolInfo, PoolInfoProperties
+from tvm.relay.backend.contrib.ethosu.codegen import extract_memory_info
 
 from .. import infra
 
 
-def _get_ethosu_workspace_size(
-    mod, params, accel_type, pool_size, enable_cascader, enable_striping
-):
+def _get_compilation_config(accel_type, enable_cascader, enable_striping):
     enable_usmp = True
 
     target = tvm.target.Target("c")
@@ -61,6 +60,17 @@ def _get_ethosu_workspace_size(
         "tir.disable_storage_rewrite": enable_usmp,
     }
 
+    return target, ethosu_target, runtime, executor, pass_config
+
+
+def _get_ethosu_workspace_size(
+    mod, params, accel_type, pool_size, enable_cascader, enable_striping
+):
+
+    target, ethosu_target, runtime, executor, pass_config = _get_compilation_config(
+        accel_type, enable_cascader, enable_striping
+    )
+
     workspace_memory_pools = WorkspaceMemoryPools(
         [
             WorkspacePoolInfo(
@@ -234,3 +244,150 @@ def test_depthwise2d_conv2d_pooling(
 
     assert workspace_size_cascader_disabled == expected_ws_size_without_striping
     assert workspace_size_cascader_enabled_striping_enabled == expected_ws_size_with_striping
+
+
+def test_multiple_memory_pools():
+    """
+    The cascader does not support multiple workspace memory
+    pools. Check the correct error is thrown.
+    """
+    np.random.seed(2)
+    ifm_shape = (1, 80, 75, 3)
+
+    target, ethosu_target, runtime, executor, pass_config = _get_compilation_config(
+        "ethos-u55-256", True, True
+    )
+    workspace_memory_pools = WorkspaceMemoryPools(
+        [
+            WorkspacePoolInfo(
+                "SRAM",
+                [target, ethosu_target],
+                PoolInfoProperties(
+                    size_hint_bytes=1,
+                    read_bandwidth_bytes_per_cycle=16,
+                    write_bandwidth_bytes_per_cycle=16,
+                    target_burst_bytes={ethosu_target: 1},
+                ),
+            ),
+            WorkspacePoolInfo(
+                "SRAM",
+                [target, ethosu_target],
+                PoolInfoProperties(
+                    size_hint_bytes=1,
+                    read_bandwidth_bytes_per_cycle=16,
+                    write_bandwidth_bytes_per_cycle=16,
+                    target_burst_bytes={ethosu_target: 1},
+                ),
+            ),
+        ]
+    )
+
+    @tf.function
+    def tf_graph(x):
+        return tf.nn.max_pool(x, (3, 3), (1, 1), "SAME")
+
+    _, tflite_graph = infra.get_tflite_graph(tf_graph, [ifm_shape])
+    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_graph, 0)
+    relay_module, params = relay.frontend.from_tflite(tflite_model)
+    mod = partition_for_ethosu(relay_module, params)
+
+    with pytest.raises(ValueError) as e:
+        with tvm.transform.PassContext(opt_level=3, config=pass_config):
+            tvm.relay.build(
+                mod,
+                target,
+                executor=executor,
+                runtime=runtime,
+                workspace_memory_pools=workspace_memory_pools,
+                params=params,
+            )
+
+    expected_reason = "Exactly one workspace pool needs to be provided for the U55 cascader"
+    on_error = "A ValueError was caught but its reason is not the expected one."
+    assert expected_reason in str(e.value), on_error
+
+
+def test_missing_memory_pools():
+    """
+    The cascader requires memory pools to be present, check the correct error
+    is thrown when there aren't any.
+    """
+    np.random.seed(2)
+    ifm_shape = (1, 80, 75, 3)
+
+    target, _, runtime, executor, pass_config = _get_compilation_config("ethos-u55-256", True, True)
+
+    @tf.function
+    def tf_graph(x):
+        return tf.nn.max_pool(x, (3, 3), (1, 1), "SAME")
+
+    _, tflite_graph = infra.get_tflite_graph(tf_graph, [ifm_shape])
+    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_graph, 0)
+    relay_module, params = relay.frontend.from_tflite(tflite_model)
+    mod = partition_for_ethosu(relay_module, params)
+
+    with pytest.raises(ValueError) as e:
+        with tvm.transform.PassContext(opt_level=3, config=pass_config):
+            tvm.relay.build(
+                mod,
+                target,
+                executor=executor,
+                runtime=runtime,
+                workspace_memory_pools=None,
+                params=params,
+            )
+
+    expected_reason = "Workspace memory pool needs to be provided for the U55 cascader"
+    on_error = "A ValueError was caught but its reason is not the expected one."
+    assert expected_reason in str(e.value), on_error
+
+
+def test_invalid_accelerator():
+    """
+    Check an error is thrown when an unsupported accelerator configuration
+    is used.
+    """
+    np.random.seed(2)
+    ifm_shape = (1, 80, 75, 3)
+
+    target, ethosu_target, runtime, executor, pass_config = _get_compilation_config(
+        "ethos-u65-256", True, True
+    )
+    workspace_memory_pools = WorkspaceMemoryPools(
+        [
+            WorkspacePoolInfo(
+                "SRAM",
+                [target, ethosu_target],
+                PoolInfoProperties(
+                    size_hint_bytes=1,
+                    read_bandwidth_bytes_per_cycle=16,
+                    write_bandwidth_bytes_per_cycle=16,
+                    target_burst_bytes={ethosu_target: 1},
+                ),
+            ),
+        ]
+    )
+
+    @tf.function
+    def tf_graph(x):
+        return tf.nn.max_pool(x, (3, 3), (1, 1), "SAME")
+
+    _, tflite_graph = infra.get_tflite_graph(tf_graph, [ifm_shape])
+    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_graph, 0)
+    relay_module, params = relay.frontend.from_tflite(tflite_model)
+    mod = partition_for_ethosu(relay_module, params)
+
+    with pytest.raises(ValueError) as e:
+        with tvm.transform.PassContext(opt_level=3, config=pass_config):
+            tvm.relay.build(
+                mod,
+                target,
+                executor=executor,
+                runtime=runtime,
+                workspace_memory_pools=workspace_memory_pools,
+                params=params,
+            )
+
+    expected_reason = "Cascading is not supported for the U65 accelerator"
+    on_error = "A ValueError was caught but its reason is not the expected one."
+    assert expected_reason in str(e.value), on_error