You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by lu...@apache.org on 2022/05/30 15:31:31 UTC
[tvm] branch main updated: [microNPU] add E2E tests with cascader wo striping (#11410)

This is an automated email from the ASF dual-hosted git repository.

lukhut pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 119afda634 [microNPU] add E2E tests with cascader wo striping (#11410)
119afda634 is described below

commit 119afda6344785aee5cf1729eec30624ac068f33
Author: Manupa Karunaratne <ma...@arm.com>
AuthorDate: Mon May 30 16:31:23 2022 +0100

    [microNPU] add E2E tests with cascader wo striping (#11410)
    
    This commit adds end-to-end tests using the cascader
    w/o striping. It needed a few adjustments to the order
    in which the arguments are provided to the entry point
    function in AoT when both memory pools and devices
    are present.
    
    Change-Id: I37e04afd635add895e317586f628a62cae75f3fa
---
 .../tvm/contrib/ethosu/cascader/device_config.py   |  80 +++++++++-------
 .../tvm/relay/backend/contrib/ethosu/te/common.py  |   5 +-
 src/contrib/ethosu/cascader/parts/ethosu.cc        |  37 +++++---
 src/contrib/ethosu/cascader/parts/ethosu.h         |   8 ++
 src/target/source/interface_c.cc                   |  12 +--
 tests/cpp/target/source/interface_c_test.cc        |  27 ++++++
 .../cascader/test_ethosu_block_config.py           |  40 ++++----
 .../test_ethosu/cascader/test_memory_reduction.py  |   2 +-
 tests/python/contrib/test_ethosu/infra.py          | 102 ++++++++++++++++-----
 tests/python/contrib/test_ethosu/test_codegen.py   |  93 ++++++++++++++-----
 .../contrib/test_ethosu/test_identity_optimizer.py |   6 +-
 .../contrib/test_ethosu/test_layout_optimizer.py   |   5 +-
 .../contrib/test_ethosu/test_lookup_table.py       |  10 +-
 tests/python/contrib/test_ethosu/test_networks.py  |  65 +++++++++++--
 14 files changed, 354 insertions(+), 138 deletions(-)

diff --git a/python/tvm/contrib/ethosu/cascader/device_config.py b/python/tvm/contrib/ethosu/cascader/device_config.py
index 5f5a937628..27aa8b8c78 100644
--- a/python/tvm/contrib/ethosu/cascader/device_config.py
+++ b/python/tvm/contrib/ethosu/cascader/device_config.py
@@ -84,7 +84,7 @@ class EthosuDeviceConfig:
 
             self._total_banks = 48
             self._reserved_banks = 4
-            self._input_granularity = 8
+            self._input_granularity = {1: 8, 2: 8, 4: 16}
             self._accumulator_granularity = {4: 16, 5: 20}
             self._lut_reserved = True
         elif self._device == "ethos-u55-128":
@@ -96,7 +96,7 @@ class EthosuDeviceConfig:
 
             self._total_banks = 24
             self._reserved_banks = 4
-            self._input_granularity = 4
+            self._input_granularity = {1: 4, 2: 4, 4: 8}
             self._accumulator_granularity = {4: 8, 5: 12}
             self._lut_reserved = True
         elif self._device == "ethos-u55-64":
@@ -108,7 +108,7 @@ class EthosuDeviceConfig:
 
             self._total_banks = 16
             self._reserved_banks = 2
-            self._input_granularity = 2
+            self._input_granularity = {1: 2, 2: 2, 4: 4}
             self._accumulator_granularity = {4: 4, 5: 8}
             self._lut_reserved = False
         elif self._device == "ethos-u55-32":
@@ -120,8 +120,8 @@ class EthosuDeviceConfig:
 
             self._total_banks = 16
             self._reserved_banks = 2
-            self._input_granularity = 2
-            self._accumulator_granularity = {4: 4, 5: 8}
+            self._input_granularity = {1: 2, 2: 2, 4: 4}
+            self._accumulator_granularity = {4: 4, 5: 4}
             self._lut_reserved = False
 
     def _get_output_cycles(
@@ -448,18 +448,32 @@ class EthosuDeviceConfig:
             input_block_shape.depth * input_bytewidth, 8
         )
         input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
-        input_banks = _round_up(input_banks, self._input_granularity)
+        input_banks = _round_up(input_banks, self._input_granularity[input_bytewidth])
 
         return input_banks
 
-    def _get_accumulator_banks(self, output_block_shape, acc_bytewidth, depth):
-        acc_depth = _round_up(min(output_block_shape.depth, depth), 8)
+    def _get_accumulator_banks(self, output_block_shape, acc_bytewidth):
+        acc_depth = _round_up(output_block_shape.depth, 8)
         acc_bytes = output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
         acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
         acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])
 
         return acc_banks
 
+    @staticmethod
+    def _create_layout_block(nhwc_block_config, layout):
+        """A helper function to convert to brick layout"""
+        if layout == "NHCWB16":
+            return [
+                nhwc_block_config[0],
+                nhwc_block_config[1],
+                1 + ((nhwc_block_config[3] - 1) // 16),
+                nhwc_block_config[2],
+                16,
+            ]
+        # else it could only be NHWC
+        return nhwc_block_config
+
     def get_elementwise_block_config(
         self,
         ifm_propagator: Propagator,
@@ -537,22 +551,22 @@ class EthosuDeviceConfig:
         # Split the block in half until it fits into SHRAM
         max_height, max_width, max_depth = self._max_block_shape.as_list()[1:]
         if output_layout == "NHCWB16":
-            split_order = (a for a in [1, 3, 2])
-            output_block = [
-                output_shape[0],
-                _round_up(min(output_shape[1], max_height), self._micro_block.height),
-                min(output_shape[2] * output_shape[4], max_depth),
-                _round_up(min(output_shape[3], max_width), self._micro_block.width),
-                16,
-            ]
+            output_height = output_shape[1]
+            output_width = output_shape[3]
+            output_channels = output_shape[2] * 16
         else:
-            split_order = (a for a in [1, 2, 3])
-            output_block = [
-                output_shape[0],
-                _round_up(min(output_shape[1], max_height), self._micro_block.height),
-                _round_up(min(output_shape[2], max_width), self._micro_block.width),
-                _round_up(min(output_shape[3], max_depth), self._micro_block.depth),
-            ]
+            output_height = output_shape[1]
+            output_width = output_shape[2]
+            output_channels = output_shape[3]
+
+        output_nhwc_block = [
+            1,
+            _round_up(min(output_height, max_height), self._micro_block.height),
+            _round_up(min(output_width, max_width), self._micro_block.width),
+            _round_up(min(output_channels, max_depth), self._micro_block.depth),
+        ]
+        output_block = self._create_layout_block(output_nhwc_block, output_layout)
+        split_order = (a for a in [1, 2, 3])
         split_axis = next(split_order)
 
         offset = [0] * len(output_block)
@@ -572,7 +586,7 @@ class EthosuDeviceConfig:
                 )
             else:
                 # Unary elementwise
-                input2_block = _Shape([0, 0, 0, 0])
+                input2_block = input_block
 
             input_block.round_up(self._input_micro_block)
             input2_block.round_up(self._input_micro_block)
@@ -589,15 +603,19 @@ class EthosuDeviceConfig:
                 )
                 output_cycles *= reduce(lambda a, b: a * b, output_block, 1)
                 output_cycles = int(math.ceil(output_cycles))
-                block_config.append(BlockConfig(output_block, output_block, 0, output_cycles))
+                block_config.append(
+                    BlockConfig(input_block.as_list(), output_block, 0, output_cycles)
+                )
                 break
 
-            if output_block[split_axis] == self._micro_block.as_list()[split_axis]:
+            if output_nhwc_block[split_axis] == self._micro_block.as_list()[split_axis]:
                 split_axis = next(split_order)
 
-            output_block[split_axis] = _round_up(
-                _round_up_div(output_block[split_axis], 2), self._micro_block.as_list()[split_axis]
+            output_nhwc_block[split_axis] = _round_up(
+                _round_up_div(output_nhwc_block[split_axis], 2),
+                self._micro_block.as_list()[split_axis],
             )
+            output_block = self._create_layout_block(output_nhwc_block, output_layout)
 
         return block_config
 
@@ -739,7 +757,7 @@ class EthosuDeviceConfig:
                             height,
                             1 + ((depth - 1) // 16),
                             width,
-                            min(16, _round_up(ofm_channels, self._micro_block.depth)),
+                            16,
                         )
                         order = [1, 2, 4, 3, 0]
                     else:
@@ -771,9 +789,7 @@ class EthosuDeviceConfig:
                     # Banks required for input block
                     input_banks = self._get_input_banks(input_block_shape, input_bytewidth)
                     # Banks required for accumulation
-                    acc_banks = self._get_accumulator_banks(
-                        output_block_shape, acc_bytewidth, depth
-                    )
+                    acc_banks = self._get_accumulator_banks(output_block_shape, acc_bytewidth)
 
                     if (input_banks + acc_banks) <= banks_available:
                         output_cycles = self._get_output_cycles(
diff --git a/python/tvm/relay/backend/contrib/ethosu/te/common.py b/python/tvm/relay/backend/contrib/ethosu/te/common.py
index aac060308e..edbece4e13 100644
--- a/python/tvm/relay/backend/contrib/ethosu/te/common.py
+++ b/python/tvm/relay/backend/contrib/ethosu/te/common.py
@@ -53,7 +53,10 @@ def get_layout_transform_matrices(ofm_channels: int) -> Tuple[List[List[float]],
         [1, 0, 0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0],
-        [0, 0, 0, 0, 0, ofm_channels],
+        # We need to offset only if number of ofm_channels is not divisible by 16
+        # Moreover, we can't use just the "ofm_channels" as last element because
+        # the propagation matrices are used to propagate block configs as well.
+        [0, 0, 16, 0, 0, -(int(ofm_channels % 16 != 0)) * (16 - ofm_channels % 16)],
         [0, 0, 0, 0, 0, 1],
     ]
 
diff --git a/src/contrib/ethosu/cascader/parts/ethosu.cc b/src/contrib/ethosu/cascader/parts/ethosu.cc
index f9c5a8409f..33d9b3b452 100644
--- a/src/contrib/ethosu/cascader/parts/ethosu.cc
+++ b/src/contrib/ethosu/cascader/parts/ethosu.cc
@@ -70,34 +70,41 @@ const std::vector<int64_t> EthosuPartNode::GetBytesRead(const std::vector<int>&
   return bytes_per_input;
 }
 
-const BlockConfig EthosuPartNode::GetBlockConfig(const StripeConfig& output_stripe_config) {
-  BlockConfig best_block_config;
-  float best_cost = std::numeric_limits<float>::infinity();
+float EthosuPartNode::CalculateCost(const BlockConfig& block_config,
+                                    const StripeConfig& output_stripe_config) {
+  std::vector<int> output_block = block_config->GetOutputBlockShape();
   std::vector<int> output_stripe_shape = output_stripe_config->GetShape();
   auto input_stripe_configs = CalculateInputStripeConfigs(output_stripe_config);
   std::vector<int> input_stripe_shape = input_stripe_configs[0]->GetShape();
 
-  for (const auto& block_config : valid_block_configs_) {
-    std::vector<int> output_block = block_config->GetOutputBlockShape();
+  std::vector<int64_t> bytes_per_input = GetBytesRead(output_block, output_stripe_shape);
+  bytes_per_input[0] *= subkernels_;
 
-    std::vector<int64_t> bytes_per_input = GetBytesRead(output_block, output_stripe_shape);
-    bytes_per_input[0] *= subkernels_;
+  // Calculate bytes read per output element
+  float cost =
+      static_cast<float>(bytes_per_input[0] + bytes_per_input[1]) / mul_reduce(output_stripe_shape);
 
-    // Calculate bytes read per output element
-    float relative_cost = static_cast<float>(bytes_per_input[0] + bytes_per_input[1]) /
-                          mul_reduce(output_stripe_shape);
+  // Single buffering hardware optimization
+  if (mul_reduce(input_stripe_shape) <= 2 * mul_reduce(block_config->GetInputBlockShape())) {
+    cost /= 2;
+  }
+  return cost;
+}
 
-    // Single buffering hardware optimization
-    if (mul_reduce(input_stripe_shape) <= 2 * mul_reduce(block_config->GetInputBlockShape())) {
-      relative_cost /= 2;
-    }
+const BlockConfig EthosuPartNode::GetBlockConfig(const StripeConfig& output_stripe_config) {
+  BlockConfig best_block_config = valid_block_configs_[0];
+  float best_cost = CalculateCost(best_block_config, output_stripe_config);
+  std::vector<int> output_stripe_shape = output_stripe_config->GetShape();
+  auto input_stripe_configs = CalculateInputStripeConfigs(output_stripe_config);
+  std::vector<int> input_stripe_shape = input_stripe_configs[0]->GetShape();
 
+  for (const auto& block_config : valid_block_configs_) {
+    float relative_cost = CalculateCost(block_config, output_stripe_config);
     if (relative_cost < best_cost) {
       best_block_config = block_config;
       best_cost = relative_cost;
     }
   }
-
   return best_block_config;
 }
 
diff --git a/src/contrib/ethosu/cascader/parts/ethosu.h b/src/contrib/ethosu/cascader/parts/ethosu.h
index cd8fa84eca..4738f673e7 100644
--- a/src/contrib/ethosu/cascader/parts/ethosu.h
+++ b/src/contrib/ethosu/cascader/parts/ethosu.h
@@ -75,6 +75,14 @@ class EthosuPartNode : public PartNode {
   const std::vector<int64_t> GetBytesRead(const std::vector<int>& block_shape,
                                           const std::vector<int>& full_shape);
 
+  /*!
+   * \brief Get cost heuristic of using a given block config with the associated stripe config
+   * \param block_config The block config that is being checked for the cost
+   * \param output_stripe_config The striping configuration associated with the operator
+   * \return A cost heuristic representative of the choice
+   */
+  float CalculateCost(const BlockConfig& block_config, const StripeConfig& output_stripe_config);
+
   /*! \brief List of block configs that are valid for this part */
   std::vector<BlockConfig> valid_block_configs_;
   /*! \brief The output volume that is atomically computed */
diff --git a/src/target/source/interface_c.cc b/src/target/source/interface_c.cc
index 1bb567d148..19b37fe21c 100644
--- a/src/target/source/interface_c.cc
+++ b/src/target/source/interface_c.cc
@@ -167,12 +167,12 @@ class InterfaceCNode : public runtime::ModuleNode {
       code_stream << " * \\param outputs Output tensors for the module \n";
     }
 
-    if (!devices_.empty()) {
-      code_stream << " * \\param devices Device context pointers for the module \n";
-    }
     if (!pools_.empty()) {
       code_stream << " * \\param workspace_pools Workspace memory pool pointers for the module \n";
     }
+    if (!devices_.empty()) {
+      code_stream << " * \\param devices Device context pointers for the module \n";
+    }
 
     code_stream << " */\n"
                 << "int32_t " << run_function << "(\n";
@@ -182,12 +182,12 @@ class InterfaceCNode : public runtime::ModuleNode {
       call_args_ss << "  struct " << inputs_struct << "* inputs,\n";
       call_args_ss << "  struct " << outputs_struct << "* outputs,\n";
     }
-    if (!devices_.empty()) {
-      call_args_ss << "  struct " << devices_struct << "* devices,\n";
-    }
     if (!pools_.empty()) {
       call_args_ss << "  struct " << pools_struct << "* workspace_pools,\n";
     }
+    if (!devices_.empty()) {
+      call_args_ss << "  struct " << devices_struct << "* devices,\n";
+    }
     std::string call_args_str = call_args_ss.str();
     call_args_str.pop_back();
     call_args_str.pop_back();
diff --git a/tests/cpp/target/source/interface_c_test.cc b/tests/cpp/target/source/interface_c_test.cc
index bc81d48b27..d578c79255 100644
--- a/tests/cpp/target/source/interface_c_test.cc
+++ b/tests/cpp/target/source/interface_c_test.cc
@@ -126,6 +126,33 @@ TEST(InterfaceAPI, ContainsRunFunctionWithWorkspacePools) {
   ASSERT_THAT(header_source, HasSubstr(run_function.str()));
 }
 
+TEST(InterfaceAPI, ContainsRunFunctionWithWorkspacePoolsAndDevices) {
+  std::stringstream run_function;
+
+  run_function << "/*!\n"
+               << " * \\brief entrypoint function for TVM module \"ultimate_cat_spotter\"\n"
+               << " * \\param inputs Input tensors for the module \n"
+               << " * \\param outputs Output tensors for the module \n"
+               << " * \\param workspace_pools Workspace memory pool pointers for the module \n"
+               << " * \\param devices Device context pointers for the module \n"
+               << " */\n"
+               << "int32_t tvmgen_ultimate_cat_spotter_run(\n"
+               << "  struct tvmgen_ultimate_cat_spotter_inputs* inputs,\n"
+               << "  struct tvmgen_ultimate_cat_spotter_outputs* outputs,\n"
+               << "  struct tvmgen_ultimate_cat_spotter_workspace_pools* workspace_pools,\n"
+               << "  struct tvmgen_ultimate_cat_spotter_devices* devices\n"
+               << ");\n";
+
+  PoolInfo pool_info = PoolInfo("my_memory_pool", {});
+  tir::usmp::AllocatedPoolInfo allocated_pool_info =
+      tir::usmp::AllocatedPoolInfo(pool_info, 100000);
+  runtime::Module test_module = InterfaceCCreate("ultimate_cat_spotter", {"input"}, {"output"},
+                                                 {allocated_pool_info}, {}, {"device"}, 0);
+  std::string header_source = test_module->GetSource();
+
+  ASSERT_THAT(header_source, HasSubstr(run_function.str()));
+}
+
 TEST(InterfaceAPI, ContainsRunFunctionWithWorkspaceIO) {
   std::stringstream run_function_with_map_functions;
 
diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py
index 26a8080e1a..66d9b4647c 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py
@@ -166,14 +166,14 @@ from .infra import make_matrices
                 ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
                 ((1, 4, 4, 96), (1, 4, 6, 4, 16)),
                 ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
-                ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 10, 1, 6, 4)),
+                ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 8, 1, 4, 16)),
                 ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
                 # Depthwise Conv2D
-                ((1, 6, 10, 16), (1, 6, 1, 10, 16)),
-                ((1, 8, 5, 16), (1, 8, 1, 5, 16)),
+                ((1, 6, 10, 16), (1, 4, 1, 12, 16)),
+                ((1, 8, 5, 16), (1, 6, 1, 5, 16)),
                 # Pooling
-                ((1, 1, 1, 128), (1, 1, 8, 1, 16)),
-                ((1, 9, 6, 16), (1, 9, 1, 6, 16)),
+                ((1, 1, 1, 128), (1, 1, 4, 1, 16)),
+                ((1, 9, 6, 16), (1, 8, 1, 4, 16)),
             ],
         ),
         (
@@ -184,14 +184,14 @@ from .infra import make_matrices
                 ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
                 ((1, 4, 4, 96), (1, 4, 6, 4, 16)),
                 ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
-                ((1, 10, 6, 8), (1, 10, 1, 6, 8)),
+                ((1, 10, 6, 8), (1, 8, 1, 4, 16)),
                 ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
                 # Depthwise Conv2D
-                ((1, 6, 10, 16), (1, 6, 1, 10, 16)),
-                ((1, 8, 5, 16), (1, 8, 1, 5, 16)),
+                ((1, 6, 10, 16), (1, 4, 1, 12, 16)),
+                ((1, 8, 5, 16), (1, 6, 1, 5, 16)),
                 # Pooling
-                ((1, 1, 1, 128), (1, 1, 8, 1, 16)),
-                ((1, 9, 6, 16), (1, 9, 1, 6, 16)),
+                ((1, 1, 1, 128), (1, 1, 4, 1, 16)),
+                ((1, 9, 6, 16), (1, 8, 1, 4, 16)),
             ],
         ),
         (
@@ -202,15 +202,15 @@ from .infra import make_matrices
                 ((1, 5, 8, 16), (1, 5, 1, 8, 16)),
                 ((1, 4, 4, 128), (1, 4, 8, 4, 16)),
                 ((1, 16, 4, 16), (1, 16, 1, 4, 16)),
-                ((1, 8, 12, 8), (1, 8, 1, 12, 8)),
-                ((1, 10, 6, 16), (1, 10, 1, 6, 16)),
+                ((1, 8, 12, 8), (1, 10, 1, 6, 16)),
+                ((1, 10, 6, 16), (1, 10, 1, 6, 16), (1, 6, 1, 6, 16)),
                 # Depthwise Conv2D
-                ((1, 7, 10, 16), (1, 7, 1, 10, 16), (1, 7, 2, 10, 16)),
-                ((1, 10, 6, 16), (1, 10, 1, 6, 16)),
+                ((1, 7, 10, 16), (1, 7, 1, 10, 16), (1, 6, 1, 10, 16)),
+                ((1, 10, 6, 16), (1, 10, 1, 6, 16), (1, 6, 1, 6, 16)),
                 # Pooling
                 # ((1, 1, 2, 16), (1, 1, 1, 2, 16)),
-                ((1, 1, 2, 128), (1, 1, 8, 2, 16)),
-                ((1, 10, 6, 16), (1, 10, 1, 6, 16)),
+                ((1, 1, 2, 128), (1, 1, 4, 2, 16)),
+                ((1, 10, 6, 16), (1, 9, 1, 6, 16)),
             ],
         ),
         (
@@ -221,14 +221,14 @@ from .infra import make_matrices
                 ((1, 16, 8, 16), (1, 16, 1, 8, 16)),
                 ((1, 4, 4, 128), (1, 4, 8, 4, 16)),
                 ((1, 32, 4, 16), (1, 10, 12, 16), (1, 32, 1, 4, 16), (1, 10, 1, 12, 16)),
-                ((1, 20, 12, 8), (1, 20, 1, 12, 8)),
+                ((1, 20, 12, 8), (1, 10, 1, 12, 16)),
                 ((1, 12, 10, 16), (1, 12, 1, 10, 16)),
                 # Depthwise Conv2D
-                ((1, 8, 20, 16), (1, 8, 1, 20, 16), (1, 8, 2, 20, 16)),
-                ((1, 14, 6, 16), (1, 14, 1, 6, 16)),
+                ((1, 8, 20, 16), (1, 6, 1, 20, 16), (1, 6, 2, 20, 16)),
+                ((1, 14, 6, 16), (1, 12, 1, 6, 16)),
                 # Pooling
                 # ((1, 2, 2, 16), (1, 2, 1, 2, 16)),
-                ((1, 2, 2, 128), (1, 2, 8, 2, 16)),
+                ((1, 2, 2, 128), (1, 2, 6, 2, 16)),
                 ((1, 10, 12, 16), (1, 10, 1, 12, 16)),
             ],
         ),
diff --git a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
index 01545217be..8a0d51d2ae 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
@@ -162,7 +162,7 @@ def test_double_conv2d(
     "accel_type, expected_ws_size_without_striping, expected_ws_size_with_striping",
     [
         ("ethos-u55-256", 180288, 15200),
-        ("ethos-u55-128", 180288, 14432),
+        ("ethos-u55-128", 180288, 15200),
         ("ethos-u55-64", 180288, 14432),
         ("ethos-u55-32", 180272, 14416),
     ],
diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py
index 20bd12945f..a1bdcb47e6 100644
--- a/tests/python/contrib/test_ethosu/infra.py
+++ b/tests/python/contrib/test_ethosu/infra.py
@@ -45,6 +45,7 @@ from tvm.relay.expr_functor import ExprMutator
 from tvm.relay.op.annotation import compiler_begin, compiler_end
 from tvm.relay.backend.contrib.ethosu import preprocess
 import tvm.relay.testing.tf as tf_testing
+from tvm import WorkspaceMemoryPools, PoolInfo
 
 from tvm.relay.op.contrib.ethosu import partition_for_ethosu
 from tvm.testing.aot import (
@@ -109,19 +110,51 @@ def deserialize_command_stream(blob):
     return cmms
 
 
-def create_test_runner(accel="ethos-u55-256", enable_usmp=True, enable_cascader=False):
+def _get_workspace_size_define_macro(pool_name: str, model_name="default") -> str:
+    """This function converts pool names to compiler generated
+    workspace pool size macros"""
+
+    prefix = "TVMGEN_" + model_name.upper() + "_"
+    postfix = "_WORKSPACE_POOL_SIZE"
+    return prefix + pool_name.upper() + postfix
+
+
+def create_test_runner(
+    accel="ethos-u55-256",
+    enable_usmp=True,
+    enable_cascader=False,
+    enable_striping=False,
+    workspace_pools=None,
+):
+
     file_dir = os.path.dirname(os.path.abspath(__file__))
     test_root = os.path.join(file_dir, "reference_system")
     _, ethosu_variant, ethosu_macs = accel.split("-")
     ethosu_variant = ethosu_variant.upper()
+
+    prologue = """
+    uart_init();
+    EthosuInit();
+
+    struct ethosu_driver* ethos_u = ethosu_reserve_driver();
+    """
+
+    if workspace_pools:
+        for pool in workspace_pools.pools:
+            prologue = (
+                prologue
+                + f"""
+    #ifdef {_get_workspace_size_define_macro(pool.pool_name)}
+    __attribute__((section(".bss.noinit.tvm"), aligned(16)))
+    static uint8_t {pool.pool_name}[{_get_workspace_size_define_macro(pool.pool_name)}];
+    #endif
+    
+            """
+            )
+
     return AOTTestRunner(
         makefile="corstone300",
-        prologue="""
-        uart_init();
-        EthosuInit();
-
-        struct ethosu_driver* ethos_u = ethosu_reserve_driver();
-        """,
+        prologue=prologue,
         epilogue="""
         ethosu_release_driver(ethos_u);
         """,
@@ -135,6 +168,7 @@ def create_test_runner(accel="ethos-u55-256", enable_usmp=True, enable_cascader=
             "relay.ext.ethos-u.options": {
                 "accelerator_config": accel,
                 "enable_cascader": enable_cascader,
+                "enable_striping": enable_striping,
             },
             "tir.usmp.enable": enable_usmp,
             "tir.usmp.algorithm": "hill_climb",
@@ -147,12 +181,10 @@ def build_source(
     module,
     inputs,
     outputs,
-    accel="ethos-u55-256",
+    test_runner,
     output_tolerance=0,
-    enable_usmp=True,
-    enable_cascader=False,
+    workspace_pools=None,
 ):
-    test_runner = create_test_runner(accel, enable_usmp, enable_cascader)
     return compile_models(
         models=AOTTestModel(
             module=module,
@@ -163,22 +195,17 @@ def build_source(
         ),
         interface_api="c",
         use_unpacked_api=True,
+        workspace_memory_pools=workspace_pools,
         workspace_byte_alignment=16,
         pass_config=test_runner.pass_config,
     )
 
 
-def verify_source(
-    models: List[AOTCompiledTestModel],
-    accel="ethos-u55-256",
-    enable_usmp=True,
-    enable_cascader=False,
-):
+def verify_source(models: List[AOTCompiledTestModel], test_runner):
     """
     This method verifies the generated source from an NPU module by building it and running on an FVP.
     """
     interface_api = "c"
-    test_runner = create_test_runner(accel, enable_usmp, enable_cascader)
     run_and_check(
         models,
         test_runner,
@@ -295,18 +322,45 @@ def compare_ethosu_with_reference(
     mod,
     input_data,
     output_data,
-    accel_type,
+    accel_type: str,
     output_tolerance=0,
     print_cmm=False,
-    enable_cascader=False,
+    enable_cascader=None,
 ):
+    if enable_cascader is None:
+        enable_cascader = "u65" not in accel_type
+    pool_name = "my_memory_pool"
+    host_target = tvm.target.Target("c")
+    ethosu_target = tvm.target.Target("ethos-u")
+    workspace_pools = WorkspaceMemoryPools(
+        [
+            PoolInfo(
+                pool_name,
+                {
+                    host_target: PoolInfo.READ_WRITE_ACCESS,
+                    ethosu_target: PoolInfo.READ_WRITE_ACCESS,
+                },
+                size_hint_bytes=2400000,
+                read_bandwidth_bytes_per_cycle=16,
+                write_bandwidth_bytes_per_cycle=16,
+                target_burst_bytes={ethosu_target: 1},
+            )
+        ]
+    )
+    test_runner = create_test_runner(
+        accel_type,
+        enable_usmp=True,
+        enable_cascader=enable_cascader,
+        enable_striping=False,
+        workspace_pools=workspace_pools,
+    )
     compiled_models = build_source(
         mod,
         input_data,
         output_data,
-        accel_type,
+        test_runner,
+        workspace_pools=workspace_pools,
         output_tolerance=output_tolerance,
-        enable_cascader=enable_cascader,
     )
 
     # Assumes only two runtime.Modules are created -- i.e. single offload module
@@ -319,7 +373,7 @@ def compare_ethosu_with_reference(
         cmms = bytes.fromhex(compilation_artifacts[0].command_stream)
         print_payload(cmms)
 
-    verify_source(compiled_models, accel_type, enable_cascader=enable_cascader)
+    verify_source(compiled_models, test_runner)
 
 
 def compare_tvm_with_tflite(
@@ -329,7 +383,7 @@ def compare_tvm_with_tflite(
     ranges=None,
     output_tolerance=0,
     print_cmm=False,
-    enable_cascader=False,
+    enable_cascader=None,
 ):
     mod, tflite_graph = get_tflite_graph(tf_func, shapes, ranges)
 
diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py
index 1e8d307b33..ce617d14fa 100644
--- a/tests/python/contrib/test_ethosu/test_codegen.py
+++ b/tests/python/contrib/test_ethosu/test_codegen.py
@@ -270,6 +270,9 @@ def test_ethosu_binary_elementwise(
         shapes=[ifm_shape, ifm2_shape],
         ranges=[(0, 1), (0, 2)],
         accel_type=accel_type,
+        # non 4D ops legalize into identity op that is not currently supported in the cascader
+        enable_cascader=(len(ifm_shape) == 4 and len(ifm2_shape) == 4)
+        and ("u65" not in accel_type),
     )
 
 
@@ -298,6 +301,8 @@ def test_binary_add_with_non_4d_shapes(
         shapes=[ifm_shape, ifm2_shape],
         ranges=[(0, 1), (0, 2)],
         accel_type=accel_type,
+        # non 4D ops legalize into identity op that is not currently supported in the cascader
+        enable_cascader=False,
     )
 
 
@@ -386,7 +391,8 @@ def test_mean(accel_type, ifm_shape, axis, keep_dims, use_same_quantization):
     )
     mod = partition_for_ethosu(mod)
 
-    compiled_models = infra.build_source(mod, input_data, output_data, accel_type)
+    test_runner = infra.create_test_runner(accel_type)
+    compiled_models = infra.build_source(mod, input_data, output_data, test_runner)
 
     # Assumes only two runtime.Modules are created -- i.e. single offload module
     ethosu_module = compiled_models[0].executor_factory.lib.imported_modules[0].imported_modules[0]
@@ -396,7 +402,7 @@ def test_mean(accel_type, ifm_shape, axis, keep_dims, use_same_quantization):
     compilation_artifacts = get_artifacts(ethosu_module)
     cmms = bytes.fromhex(compilation_artifacts[0].command_stream)
     infra.print_payload(cmms)
-    infra.verify_source(compiled_models, accel_type)
+    infra.verify_source(compiled_models, test_runner)
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -432,7 +438,10 @@ def test_elementwise_add_from_constant_scalar(accel_type, dtype, constant):
     }
     output_data = generate_ref_data(cpu_mod, input_data)
 
-    infra.compare_ethosu_with_reference(ethosu_mod, input_data, output_data, accel_type)
+    # Scalar constants are not supported by the cascader
+    infra.compare_ethosu_with_reference(
+        ethosu_mod, input_data, output_data, accel_type, enable_cascader=False
+    )
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -558,7 +567,13 @@ def test_ethosu_identity_codegen(ifm_shape, ifm_scale, ifm_zp, ofm_scale, ofm_zp
     ethosu_mod = infra.create_ethosu_partition(cpu_mod)
 
     infra.compare_ethosu_with_reference(
-        ethosu_mod, input_data, output_data, accel_type, output_tolerance=1
+        # identity op is not supported in cascader
+        ethosu_mod,
+        input_data,
+        output_data,
+        accel_type,
+        output_tolerance=1,
+        enable_cascader=False,
     )
 
 
@@ -588,7 +603,10 @@ def test_relay_reshape_codegen(ifm_shape, new_shape, accel_type):
     output_data = generate_ref_data(cpu_mod, input_data)
     ethosu_mod = infra.create_ethosu_partition(cpu_mod)
 
-    infra.compare_ethosu_with_reference(ethosu_mod, input_data, output_data, accel_type)
+    # reshape ops legalize into the identity op, which is not currently supported by the cascader
+    infra.compare_ethosu_with_reference(
+        ethosu_mod, input_data, output_data, accel_type, enable_cascader=False
+    )
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -608,7 +626,8 @@ def test_tflite_slice(accel_type, ifm_shape, begin, size):
     def slice_func(x):
         return tf.slice(x, begin, size)
 
-    infra.compare_tvm_with_tflite(slice_func, [ifm_shape], accel_type)
+    # Ops that get legalized to identity are currently not supported by the cascader
+    infra.compare_tvm_with_tflite(slice_func, [ifm_shape], accel_type, enable_cascader=False)
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -623,7 +642,10 @@ def test_tflite_strided_slice(accel_type, ifm_shape, begin, end):
     def strided_slice_func(x):
         return tf.strided_slice(x, begin, end)
 
-    infra.compare_tvm_with_tflite(strided_slice_func, [ifm_shape], accel_type)
+    # Ops that get legalized to identity are currently not supported by the cascader
+    infra.compare_tvm_with_tflite(
+        strided_slice_func, [ifm_shape], accel_type, enable_cascader=False
+    )
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -645,7 +667,13 @@ def test_ethosu_unary_elementwise(
             op = tf.math.abs(x)
         return op
 
-    infra.compare_tvm_with_tflite(abs_func, [ifm_shape], accel_type)
+    # non-4D tensors are legalized to identity, which is not supported by the cascader
+    infra.compare_tvm_with_tflite(
+        abs_func,
+        [ifm_shape],
+        accel_type,
+        enable_cascader=(len(ifm_shape) == 4) and ("u65" not in accel_type),
+    )
 
 
 def test_ethosu_section_name():
@@ -664,7 +692,8 @@ def test_ethosu_section_name():
     # Generate reference data
     input_data, output_data = infra.generate_ref_data_tflite(tflite_graph)
 
-    compiled_models = infra.build_source(mod, input_data, output_data)
+    test_runner = infra.create_test_runner()
+    compiled_models = infra.build_source(mod, input_data, output_data, test_runner)
 
     # Assumes only two runtime.Modules are created -- i.e. single offload module
     ethosu_module = compiled_models[0].executor_factory.lib.imported_modules[0].imported_modules[0]
@@ -723,7 +752,8 @@ def test_tflite_tanh(accel_type):
         op = tf.nn.tanh(x)
         return op
 
-    infra.compare_tvm_with_tflite(tanh_func, [ifm_shape], accel_type)
+    # Ops that get legalized to identity are currently not supported by the cascader
+    infra.compare_tvm_with_tflite(tanh_func, [ifm_shape], accel_type, enable_cascader=False)
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -744,7 +774,8 @@ def test_tflite_concat(shapes, axis, accel_type):
         op = tf.concat(list(inputs), axis)
         return op
 
-    infra.compare_tvm_with_tflite(concat_func, shapes, accel_type)
+    # Ops that get legalized to identity are currently not supported by the cascader
+    infra.compare_tvm_with_tflite(concat_func, shapes, accel_type, enable_cascader=False)
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -757,7 +788,8 @@ def test_tflite_sigmoid(accel_type):
         op = tf.nn.sigmoid(x)
         return op
 
-    infra.compare_tvm_with_tflite(sigmoid_function, [ifm_shape], accel_type)
+    # Ops that get legalized to identity are currently not supported by the cascader
+    infra.compare_tvm_with_tflite(sigmoid_function, [ifm_shape], accel_type, enable_cascader=False)
 
 
 # This codegen test checks both, split and split_v
@@ -781,7 +813,8 @@ def test_tflite_split(accel_type, ifm_shape, num_or_size_splits, axis):
         op = tf.split(x, num_or_size_splits, axis=axis)
         return op
 
-    infra.compare_tvm_with_tflite(split_func, [ifm_shape], accel_type)
+    # Ops that get legalized to identity are currently not supported by the cascader
+    infra.compare_tvm_with_tflite(split_func, [ifm_shape], accel_type, enable_cascader=False)
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -812,7 +845,10 @@ def test_ethosu_requantize(accel_type, ifm_shape, ifm_scale, ifm_zp, ofm_scale,
     output_data = generate_ref_data(cpu_mod, input_data)
     ethosu_mod = partition_for_ethosu(cpu_mod)
 
-    infra.compare_ethosu_with_reference(ethosu_mod, input_data, output_data, accel_type)
+    # Ops that get legalized to identity are currently not supported by the cascader
+    infra.compare_ethosu_with_reference(
+        ethosu_mod, input_data, output_data, accel_type, enable_cascader=False
+    )
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -824,7 +860,8 @@ def test_tflite_expand_dims(accel_type, ifm_shape, axis):
     def expand_dims_func(x):
         return tf.expand_dims(x, axis=axis)
 
-    infra.compare_tvm_with_tflite(expand_dims_func, [ifm_shape], accel_type)
+    # Ops that get legalized to identity are currently not supported by the cascader
+    infra.compare_tvm_with_tflite(expand_dims_func, [ifm_shape], accel_type, enable_cascader=False)
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -838,7 +875,8 @@ def test_tflite_squeeze(accel_type, ifm_shape, axis):
     def squeeze_func(x):
         return tf.squeeze(x, axis=axis)
 
-    infra.compare_tvm_with_tflite(squeeze_func, [ifm_shape], accel_type)
+    # Ops that get legalized to identity are currently not supported by the cascader
+    infra.compare_tvm_with_tflite(squeeze_func, [ifm_shape], accel_type, enable_cascader=False)
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -856,7 +894,8 @@ def test_tflite_resize2d_nearest_neighbor(accel_type, ifm_shape, size):
             x, size, align_corners=align_corners, half_pixel_centers=False
         )
 
-    infra.compare_tvm_with_tflite(resize_model, [ifm_shape], accel_type)
+    # Ops that get legalized to identity are currently not supported by the cascader
+    infra.compare_tvm_with_tflite(resize_model, [ifm_shape], accel_type, enable_cascader=False)
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -879,7 +918,8 @@ def test_tflite_resize2d_bilinear(accel_type, ifm_shape, size, align_corners):
             x, size, align_corners=align_corners, half_pixel_centers=False
         )
 
-    infra.compare_tvm_with_tflite(resize_model, [ifm_shape], accel_type)
+    # Ops that get legalized to identity are currently not supported by the cascader
+    infra.compare_tvm_with_tflite(resize_model, [ifm_shape], accel_type, enable_cascader=False)
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -919,7 +959,10 @@ def test_tflite_transpose_convolution(
             op = tf.nn.bias_add(op, bias)
         return op
 
-    infra.compare_tvm_with_tflite(conv2d_transpose, [ifm_shape], accel_type=accel_type)
+    # Ops that get legalized to identity are currently not supported by the cascader
+    infra.compare_tvm_with_tflite(
+        conv2d_transpose, [ifm_shape], accel_type=accel_type, enable_cascader=False
+    )
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -939,7 +982,8 @@ def test_tflite_pack(accel_type, ifm_shapes, axis):
     def pack_func(*inputs):
         return tf.stack(inputs, axis=axis)
 
-    infra.compare_tvm_with_tflite(pack_func, ifm_shapes, accel_type)
+    # Ops that get legalized to identity are currently not supported by the cascader
+    infra.compare_tvm_with_tflite(pack_func, ifm_shapes, accel_type, enable_cascader=False)
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -954,7 +998,8 @@ def test_tflite_unpack(accel_type, ifm_shape, axis):
     def unpack_func(x):
         return tf.unstack(x, axis=axis)
 
-    infra.compare_tvm_with_tflite(unpack_func, [ifm_shape], accel_type)
+    # Ops that get legalized to identity are currently not supported by the cascader
+    infra.compare_tvm_with_tflite(unpack_func, [ifm_shape], accel_type, enable_cascader=False)
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -967,7 +1012,8 @@ def test_tflite_leaky_relu(accel_type, ifm_shape, alpha):
     def leaky_relu_func(x):
         return tf.nn.leaky_relu(x, alpha=alpha)
 
-    infra.compare_tvm_with_tflite(leaky_relu_func, [ifm_shape], accel_type)
+    # Ops that get legalized to identity are currently not supported by the cascader
+    infra.compare_tvm_with_tflite(leaky_relu_func, [ifm_shape], accel_type, enable_cascader=False)
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -999,7 +1045,8 @@ def test_tflite_fully_connected(
             x = tf.nn.relu(x)
         return x
 
-    infra.compare_tvm_with_tflite(fully_connected, [ifm_shape], accel_type)
+    # Ops that get legalized to identity are currently not supported by the cascader
+    infra.compare_tvm_with_tflite(fully_connected, [ifm_shape], accel_type, enable_cascader=False)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/contrib/test_ethosu/test_identity_optimizer.py b/tests/python/contrib/test_ethosu/test_identity_optimizer.py
index f37509e1cd..f90f0f2e62 100644
--- a/tests/python/contrib/test_ethosu/test_identity_optimizer.py
+++ b/tests/python/contrib/test_ethosu/test_identity_optimizer.py
@@ -322,7 +322,7 @@ def test_same_output():
         z = tf.reshape(z, (1, 1, 25, 8))
         return z
 
-    infra.compare_tvm_with_tflite(model, ifm_shapes, "ethos-u55-256")
+    infra.compare_tvm_with_tflite(model, ifm_shapes, "ethos-u55-256", enable_cascader=False)
 
 
 def test_multi_output_identity_has_same_output():
@@ -340,7 +340,7 @@ def test_multi_output_identity_has_same_output():
         y = tf.concat(outputs, axis=0)
         return y
 
-    infra.compare_tvm_with_tflite(model, [ifm_shape], "ethos-u55-256")
+    infra.compare_tvm_with_tflite(model, [ifm_shape], "ethos-u55-256", enable_cascader=False)
 
 
 def test_multiple_transform_ops_same_output():
@@ -355,4 +355,4 @@ def test_multiple_transform_ops_same_output():
         x = tf.reshape(x, (12,))
         return x
 
-    infra.compare_tvm_with_tflite(model, [ifm_shape], "ethos-u55-256")
+    infra.compare_tvm_with_tflite(model, [ifm_shape], "ethos-u55-256", enable_cascader=False)
diff --git a/tests/python/contrib/test_ethosu/test_layout_optimizer.py b/tests/python/contrib/test_ethosu/test_layout_optimizer.py
index a2161c775b..eec963af7f 100644
--- a/tests/python/contrib/test_ethosu/test_layout_optimizer.py
+++ b/tests/python/contrib/test_ethosu/test_layout_optimizer.py
@@ -76,11 +76,12 @@ def _compile_and_compare_model(tflite_graph, ifm_shape, dtype):
     # Generate reference data
     input_data, output_data = infra.generate_ref_data_tflite(tflite_graph)
 
+    test_runner = infra.create_test_runner("ethos-u55-256")
     compiled_models = infra.build_source(
         mod,
         input_data,
         output_data,
-        "ethos-u55-256",
+        test_runner,
         output_tolerance=0,
     )
 
@@ -92,7 +93,7 @@ def _compile_and_compare_model(tflite_graph, ifm_shape, dtype):
     compilation_artifacts = get_artifacts(ethosu_module)
     cmms = bytes.fromhex(compilation_artifacts[0].command_stream)
     infra.print_payload(cmms)
-    infra.verify_source(compiled_models, "ethos-u55-256")
+    infra.verify_source(compiled_models, test_runner)
 
 
 def test_single_convolution():
diff --git a/tests/python/contrib/test_ethosu/test_lookup_table.py b/tests/python/contrib/test_ethosu/test_lookup_table.py
index ae9d4ee27c..8e044b5b99 100644
--- a/tests/python/contrib/test_ethosu/test_lookup_table.py
+++ b/tests/python/contrib/test_ethosu/test_lookup_table.py
@@ -95,11 +95,12 @@ def test_tflite_lut_activations(accel_type):
     # Generate reference data
     input_data, output_data = infra.generate_ref_data_tflite(tflite_graph)
 
+    test_runner = infra.create_test_runner(accel_type)
     compiled_models = infra.build_source(
         mod,
         input_data,
         output_data,
-        accel_type,
+        test_runner,
     )
 
     # Assumes only two runtime.Modules are created -- i.e. single offload module
@@ -110,7 +111,7 @@ def test_tflite_lut_activations(accel_type):
     compilation_artifacts = get_artifacts(ethosu_module)
     cmms = bytes.fromhex(compilation_artifacts[0].command_stream)
     infra.print_payload(cmms)
-    infra.verify_source(compiled_models, accel_type)
+    infra.verify_source(compiled_models, test_runner)
 
 
 @pytest.mark.parametrize("accel_type", ACCEL_TYPES)
@@ -151,11 +152,12 @@ def test_random_lut(accel_type):
     mod["main"] = relay.Function([ifm], call)
     mod = relay.transform.InferType()(mod)
 
+    test_runner = infra.create_test_runner(accel_type)
     compiled_models = infra.build_source(
         mod,
         {"ifm": in_data},
         {"output": out_data},
-        accel_type,
+        test_runner,
     )
 
     # Assumes only two runtime.Modules are created -- i.e. single offload module
@@ -166,7 +168,7 @@ def test_random_lut(accel_type):
     compilation_artifacts = get_artifacts(ethosu_module)
     cmms = bytes.fromhex(compilation_artifacts[0].command_stream)
     infra.print_payload(cmms)
-    infra.verify_source(compiled_models, accel_type)
+    infra.verify_source(compiled_models, test_runner)
 
 
 if __name__ == "__main__":
diff --git a/tests/python/contrib/test_ethosu/test_networks.py b/tests/python/contrib/test_ethosu/test_networks.py
index b91168b7bb..ca7a213be5 100644
--- a/tests/python/contrib/test_ethosu/test_networks.py
+++ b/tests/python/contrib/test_ethosu/test_networks.py
@@ -23,7 +23,8 @@ import numpy as np
 
 from tvm.relay.op.contrib.ethosu import partition_for_ethosu
 from tvm.micro import model_library_format as mlf
-
+from tvm import WorkspaceMemoryPools, PoolInfo
+import tvm
 from tvm.testing.aot import convert_to_relay
 
 from . import infra
@@ -58,14 +59,13 @@ def test_networks_without_usmp(accel_type, model_url, workspace_size):
     input_data, output_data = infra.generate_ref_data_tflite(tflite_model_buf)
     mod, params = convert_to_relay(tflite_model_buf)
     mod = partition_for_ethosu(mod, params)
-    compiled_models = infra.build_source(
-        mod, input_data, output_data, accel_type, enable_usmp=False
-    )
+    test_runner = infra.create_test_runner(accel_type, enable_usmp=False)
+    compiled_models = infra.build_source(mod, input_data, output_data, test_runner)
     mlf_memory_map = mlf._build_function_memory_map(
         compiled_models[0].executor_factory.function_metadata
     )
     assert mlf_memory_map["main"][0]["workspace_size_bytes"] == workspace_size
-    infra.verify_source(compiled_models, accel_type, enable_usmp=False)
+    infra.verify_source(compiled_models, test_runner)
 
 
 @pytest.mark.parametrize(
@@ -81,12 +81,63 @@ def test_networks_with_usmp(accel_type, model_url, workspace_size):
     input_data, output_data = infra.generate_ref_data_tflite(tflite_model_buf)
     mod, params = convert_to_relay(tflite_model_buf)
     mod = partition_for_ethosu(mod, params)
-    compiled_models = infra.build_source(mod, input_data, output_data, accel_type, enable_usmp=True)
+    test_runner = infra.create_test_runner(accel_type, enable_usmp=True)
+    compiled_models = infra.build_source(mod, input_data, output_data, test_runner)
+    allocated_pool_info = list(
+        dict(compiled_models[0].executor_factory.executor_codegen_metadata.pool_inputs).values()
+    )[0]
+    assert allocated_pool_info.allocated_size == workspace_size
+    infra.verify_source(compiled_models, test_runner)
+
+
+@pytest.mark.parametrize(
+    "accel_type, model_url, workspace_size",
+    [
+        ("ethos-u55-256", MOBILENET_V1_URL, 1205872),
+        ("ethos-u55-256", MOBILENET_V2_URL, 1509408),
+    ],
+)
+def test_networks_with_usmp_and_cascader_wo_striping(accel_type, model_url, workspace_size):
+    np.random.seed(23)
+
+    pool_name = "my_memory_pool"
+    host_target = tvm.target.Target("c")
+    ethosu_target = tvm.target.Target("ethos-u")
+    workspace_pools = WorkspaceMemoryPools(
+        [
+            PoolInfo(
+                pool_name,
+                {
+                    host_target: PoolInfo.READ_WRITE_ACCESS,
+                    ethosu_target: PoolInfo.READ_WRITE_ACCESS,
+                },
+                size_hint_bytes=2400000,
+                read_bandwidth_bytes_per_cycle=16,
+                write_bandwidth_bytes_per_cycle=16,
+                target_burst_bytes={ethosu_target: 1},
+            )
+        ]
+    )
+    tflite_model_buf = infra.get_tflite_model(model_url)
+    input_data, output_data = infra.generate_ref_data_tflite(tflite_model_buf)
+    mod, params = convert_to_relay(tflite_model_buf)
+    mod = partition_for_ethosu(mod, params)
+    test_runner = infra.create_test_runner(
+        accel_type,
+        enable_usmp=True,
+        enable_cascader=True,
+        enable_striping=False,
+        workspace_pools=workspace_pools,
+    )
+    compiled_models = infra.build_source(
+        mod, input_data, output_data, test_runner, workspace_pools=workspace_pools
+    )
+    infra.verify_source(compiled_models, test_runner)
+
     allocated_pool_info = list(
         dict(compiled_models[0].executor_factory.executor_codegen_metadata.pool_inputs).values()
     )[0]
     assert allocated_pool_info.allocated_size == workspace_size
-    infra.verify_source(compiled_models, accel_type, enable_usmp=True)
 
 
 if __name__ == "__main__":