You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by ma...@apache.org on 2022/05/13 14:02:29 UTC

[tvm] branch main updated: [microNPU] Add various options to the cascader (#10509)

This is an automated email from the ASF dual-hosted git repository.

manupa pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 7c75b77df8 [microNPU] Add various options to the cascader  (#10509)
7c75b77df8 is described below

commit 7c75b77df8391b9e6b52cbf9ab0959f937c58c0a
Author: Jacob Bohlin <ja...@arm.com>
AuthorDate: Fri May 13 15:02:23 2022 +0100

    [microNPU] Add various options to the cascader  (#10509)
    
    * [microNPU] Added options to Cascader
    
    * Added option to toggle multi-dimensional striping, it is disabled by
      default because it has a very high computational cost. Single
      dimension striping shares most of the benefit with greatly reduced
      cost.
    * Added multiple developer/debugging options prefixed with 'dev_'
      Also added these options to tvmc.
    * Added cascader logging, if enabled it will dump information about the
      cascader proposals to a 'cascader_log.json' file.
    
    Co-authored-by: Matthew Barrett <ma...@arm.com>
    Change-Id: I2ec59ae0bd84b73b2cc4bc56d39e3831b0aeec27
    
    * Updated memory_reduction testcases
    
    Also added enable_striping to plan_generator.h
    
    Change-Id: I496b30ed6af6f0730087329cd81a69c5040a5e4d
    
    Co-authored-by: Matthew Barrett <ma...@arm.com>
---
 python/tvm/contrib/ethosu/cascader/__init__.py     |   1 +
 python/tvm/contrib/ethosu/cascader/block_config.py |  12 ++
 .../contrib/ethosu/cascader/cascader_options.py    |  24 ++++
 .../tvm/contrib/ethosu/cascader/device_config.py   |  83 ++++++++++--
 python/tvm/contrib/ethosu/cascader/logging.py      |  70 ++++++++++
 python/tvm/contrib/ethosu/cascader/pareto.py       |   6 +-
 .../tvm/contrib/ethosu/cascader/plan_generator.py  |   8 +-
 python/tvm/contrib/ethosu/cascader/scheduler.py    |  45 ++++++-
 python/tvm/relay/backend/contrib/ethosu/codegen.py |   2 +
 .../tvm/relay/backend/contrib/ethosu/vela_api.py   |   4 +
 src/contrib/ethosu/cascader/cascader_options.cc    |  28 +++-
 src/contrib/ethosu/cascader/cascader_options.h     |  17 ++-
 src/contrib/ethosu/cascader/pareto.cc              |  20 ++-
 src/contrib/ethosu/cascader/pareto.h               |   7 +-
 src/contrib/ethosu/cascader/plan_generator.cc      |  45 +++++--
 src/contrib/ethosu/cascader/plan_generator.h       |   5 +-
 src/contrib/ethosu/cascader/proposal_generator.cc  |   3 +-
 src/relay/backend/contrib/ethosu/compiler_attrs.cc |  40 ++++++
 tests/python/contrib/test_ethosu/cascader/infra.py |   8 ++
 .../cascader/test_ethosu_block_config.py           | 143 +++++++++++++++++++--
 .../test_ethosu/cascader/test_memory_reduction.py  |  14 +-
 .../contrib/test_ethosu/cascader/test_pareto.py    |   2 +-
 .../test_ethosu/cascader/test_plan_generator.py    | 115 +++++++++++++++--
 .../contrib/test_ethosu/cascader/test_scheduler.py |   5 +-
 tests/python/contrib/test_ethosu/test_vela_api.py  |  13 ++
 25 files changed, 640 insertions(+), 80 deletions(-)

diff --git a/python/tvm/contrib/ethosu/cascader/__init__.py b/python/tvm/contrib/ethosu/cascader/__init__.py
index 51f5e58a47..1d608c04ff 100644
--- a/python/tvm/contrib/ethosu/cascader/__init__.py
+++ b/python/tvm/contrib/ethosu/cascader/__init__.py
@@ -37,4 +37,5 @@ from .device_config import EthosuDeviceConfig
 from .tensor_config import TensorConfigState, MemoryRegion, TensorConfig
 from .plan import Plan
 from .scheduler import apply_proposal, cascade, extract_memory_info
+from .logging import Logging
 from .cascader_options import CascaderOptions
diff --git a/python/tvm/contrib/ethosu/cascader/block_config.py b/python/tvm/contrib/ethosu/cascader/block_config.py
index f246918cf4..b90de753f6 100644
--- a/python/tvm/contrib/ethosu/cascader/block_config.py
+++ b/python/tvm/contrib/ethosu/cascader/block_config.py
@@ -55,5 +55,17 @@ class BlockConfig(Object):
     def output_cycles(self) -> int:
         return int(self._output_cycles)
 
+    def __ge__(self, other: "BlockConfig"):
+        if len(self.output_shape) != len(other.output_shape):
+            return False
+
+        return all(a >= b for a, b in zip(self.output_shape, other.output_shape))
+
+    def __lt__(self, other: "BlockConfig"):
+        if len(self.output_shape) != len(other.output_shape):
+            return False
+
+        return other >= self
+
     def __repr__(self) -> str:
         return f"BlockConfig(output_shape={self.output_shape})"
diff --git a/python/tvm/contrib/ethosu/cascader/cascader_options.py b/python/tvm/contrib/ethosu/cascader/cascader_options.py
index ade04bdde9..aeca7fcdcb 100644
--- a/python/tvm/contrib/ethosu/cascader/cascader_options.py
+++ b/python/tvm/contrib/ethosu/cascader/cascader_options.py
@@ -38,8 +38,20 @@ class CascaderOptions(Object):
         How many striping factors to try per axis.
     max_plan_size : int
         The maximum number of Parts in a Plan.
+    max_open_plans : int
+        The maximum number of open Plans to keep after culling.
+    max_closed_plans : int
+        The maximum number of closed Plans to keep after culling.
     always_copy_size : int
         The maximum size of a Tensor that will always be copied into the cascade region.
+    disable_pareto_plans : bool
+        Disable pareto culling for Plans.
+    disable_pareto_proposals : bool
+        Disable pareto culling for Proposals.
+    enable_multi_dimensional_striping : bool
+        Enable striping in multiple dimensions simultaneously.
+    disable_block_culling : bool
+        Disable culling of block configs.
     enable_striping : bool
         A boolean option to enable striping
 
@@ -51,7 +63,13 @@ class CascaderOptions(Object):
         max_proposals: int,
         stripe_factors: int,
         max_plan_size: int,
+        max_open_plans: int,
+        max_closed_plans: int,
         always_copy_size: int,
+        disable_pareto_plans: bool = False,
+        disable_pareto_proposals: bool = False,
+        enable_multi_dimensional_striping: bool = False,
+        disable_block_culling: bool = True,
         enable_striping: bool = False,
     ):
         self.__init_handle_by_constructor__(
@@ -60,6 +78,12 @@ class CascaderOptions(Object):
             max_proposals,
             stripe_factors,
             max_plan_size,
+            max_open_plans,
+            max_closed_plans,
             always_copy_size,
+            disable_pareto_plans,
+            disable_pareto_proposals,
+            enable_multi_dimensional_striping,
+            disable_block_culling,
             enable_striping,
         )
diff --git a/python/tvm/contrib/ethosu/cascader/device_config.py b/python/tvm/contrib/ethosu/cascader/device_config.py
index bf6ac48cf9..5f5a937628 100644
--- a/python/tvm/contrib/ethosu/cascader/device_config.py
+++ b/python/tvm/contrib/ethosu/cascader/device_config.py
@@ -15,12 +15,15 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint: disable=invalid-name
+# pylint: disable=too-many-nested-blocks
 """Device config class to hold information about the target hardware"""
 from typing import Tuple, List, Dict, Optional
 from functools import reduce
 
 import math
+import numpy as np
 
+import tvm
 from . import BlockConfig
 from . import StripeConfig
 from . import Propagator
@@ -64,13 +67,14 @@ class _Shape:
 class EthosuDeviceConfig:
     """Arm(R) Ethos(TM)-U NPU config class"""
 
-    def __init__(self, device: str):
+    def __init__(self, device: str, disable_block_bulling: bool = False):
         self._device = device
         self._subkernel_limits = (8, 8)
         self._output_cycles = (1, 2, 3, 4, 6)
         self._split_depth = 16
         self._max_block_shape = _Shape([1, 32, 64, 128])
         self._bank_size_bytes = 1024
+        self._disable_block_culling = disable_block_bulling
         if self._device == "ethos-u55-256":
             self._micro_block = _Shape([1, 2, 2, 8])
             self._input_micro_block = _Shape([1, 2, 2, 8])
@@ -508,6 +512,28 @@ class EthosuDeviceConfig:
         if activation == "LUT" and not self._lut_reserved:
             banks_available -= 2
 
+        # Handle user-forced block config
+        options = tvm.transform.PassContext.current().config.get("relay.ext.ethos-u.options", None)
+        if options and options.dev_force_block_config:
+            block_config = [int(v) for v in options.dev_force_block_config.split("x")]
+            assert len(block_config) == 3
+            if output_layout == "NHWC":
+                block_shape = [output_shape[0], block_config[0], block_config[1], block_config[2]]
+            else:
+                block_shape = [
+                    output_shape[0],
+                    block_config[0],
+                    1 + ((block_config[2] - 1) // 16),
+                    block_config[1],
+                    16,
+                ]
+            output_cycles = self._get_output_cycles(
+                op_type, op_str, ifm_dtype, ofm_dtype, activation
+            )
+            output_cycles *= reduce(lambda a, b: a * b, block_shape, 1)
+            output_cycles = int(math.ceil(output_cycles))
+            return [BlockConfig(block_shape, block_shape, 0, output_cycles)]
+
         # Split the block in half until it fits into SHRAM
         max_height, max_width, max_depth = self._max_block_shape.as_list()[1:]
         if output_layout == "NHCWB16":
@@ -666,6 +692,21 @@ class EthosuDeviceConfig:
         max_depth = min(ofm_channels, self._max_block_shape.depth)
         min_depth = max(self._micro_block.depth, upscaling_factor)
 
+        heights = range(min_height, max_height + min_height, min_height)
+        widths = range(min_width, max_width + min_width, min_width)
+        depths = range(min_depth, max_depth + min_depth, min_depth)
+
+        # Handle user-forced block config
+        options = tvm.transform.PassContext.current().config.get("relay.ext.ethos-u.options", None)
+        forced = False
+        if options and options.dev_force_block_config:
+            block_config = [int(v) for v in options.dev_force_block_config.split("x")]
+            assert len(block_config) == 3
+            heights = [block_config[0]]
+            widths = [block_config[1]]
+            depths = [block_config[2]]
+            forced = True
+
         input_bytewidth = 1 if ifm_dtype == "int8" else 2
         acc_bytewidth = self._get_accumulator_width(op_type, ifm_dtype)
         banks_available = self._total_banks - self._reserved_banks
@@ -681,8 +722,8 @@ class EthosuDeviceConfig:
             else:
                 input_block_depth = min(ifm_channels, 32)
 
-        for depth in range(min_depth, max_depth + min_depth, min_depth):
-            if (depth < output_shape.depth) and (depth % self._split_depth != 0):
+        for depth in reversed(depths):
+            if (depth < output_shape.depth) and (depth % self._split_depth != 0) and not forced:
                 # Block depth has to be less than full depth or a multiple of the split depth
                 continue
 
@@ -690,17 +731,15 @@ class EthosuDeviceConfig:
                 op_attrs, ifm_propagator, input_layout, output_layout, depth
             )
 
-            for width in range(min_width, max_width + min_width, min_width):
-                for height in range(min_height, max_height + min_height, min_height):
+            for width in reversed(widths):
+                for height in reversed(heights):
                     if output_layout == "NHCWB16":
                         output_block = (
                             1,
                             height,
                             1 + ((depth - 1) // 16),
                             width,
-                            _round_up(
-                                min(16, max(ofm_channels, min_depth)), self._micro_block.depth
-                            ),
+                            min(16, _round_up(ofm_channels, self._micro_block.depth)),
                         )
                         order = [1, 2, 4, 3, 0]
                     else:
@@ -740,7 +779,7 @@ class EthosuDeviceConfig:
                         output_cycles = self._get_output_cycles(
                             op_type, op_str, ifm_dtype, ofm_dtype, activation
                         )
-                        output_cycles *= reduce(lambda a, b: a * b, output_block, 1)
+                        output_cycles *= np.prod(output_block).tolist()
                         output_cycles = int(math.ceil(output_cycles))
                         compute_cycles = self._estimate_compute_cycles_per_block(
                             op_type,
@@ -755,11 +794,27 @@ class EthosuDeviceConfig:
                         block_config = BlockConfig(
                             input_block_shape.as_list(), output_block, compute_cycles, output_cycles
                         )
-                        valid_block_configs.append(block_config)
-                    else:
-                        # Block config does not fit into SHRAM
-                        # Any Block config that is strictly larger than this one will also fail
-                        break
+
+                        if self._disable_block_culling:
+                            # Block culling disabled - add all block configs that fit
+                            valid_block_configs.append(block_config)
+                        else:
+                            # Add block config only if it's not dominated by an existing block.
+                            # A block config is dominated by another if its output_shape is greater
+                            # or equal in every dimension and strictly greater in at least one
+                            # dimension.
+                            dominated = False
+                            for valid_block in valid_block_configs:
+                                if block_config < valid_block:
+                                    dominated = True
+                                    break
+
+                            if not dominated:
+                                valid_block_configs.append(block_config)
+
+                            # Every consecutive block in the innermost loop will be dominated by
+                            # this one so break
+                            break
 
         return valid_block_configs
 
diff --git a/python/tvm/contrib/ethosu/cascader/logging.py b/python/tvm/contrib/ethosu/cascader/logging.py
new file mode 100644
index 0000000000..0b163eb147
--- /dev/null
+++ b/python/tvm/contrib/ethosu/cascader/logging.py
@@ -0,0 +1,70 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""A class to hold logging information about the cascader"""
+from typing import Tuple
+import datetime
+import json
+import os
+import math
+
+
+class Logging:
+    """Cascader logging class"""
+
+    def __init__(self):
+        self.min_memory_usage = 0
+        self.max_memory_usage = 0
+        self.min_cycles = 0
+        self.max_cycles = 0
+
+        self.selected_proposal_idx = -1
+        self.proposals = {}
+        self.cascader_runtime = 0
+
+    def add_proposal(self, idx: int, memory_usage: int, cycles: int):
+        self.proposals[idx] = {"memory_usage": memory_usage, "cycles": cycles}
+
+    def get_extreme_points(self) -> Tuple[int, int, int, int]:
+        min_cycles, min_mem_usage = math.inf, math.inf
+        max_cycles, max_mem_usage = 0, 0
+        for proposal in self.proposals.values():
+            min_mem_usage = min(proposal["memory_usage"], min_mem_usage)
+            max_mem_usage = max(proposal["memory_usage"], max_mem_usage)
+            min_cycles = min(proposal["cycles"], min_cycles)
+            max_cycles = max(proposal["cycles"], max_cycles)
+
+        return min_mem_usage, max_mem_usage, min_cycles, max_cycles
+
+    def dump_json(self):
+        min_mem_usage, max_mem_usage, min_cycles, max_cycles = self.get_extreme_points()
+        with open(os.getcwd() + "/cascader_log.json", "w") as json_file:
+            print(
+                json.dumps(
+                    {
+                        "date": f"{datetime.datetime.now()}",
+                        "cascader_runtime": self.cascader_runtime,
+                        "min_cycles": min_cycles,
+                        "max_cycles": max_cycles,
+                        "min_memory_usage": min_mem_usage,
+                        "max_memory_usage": max_mem_usage,
+                        "selected_proposal": self.selected_proposal_idx,
+                        "proposals": self.proposals,
+                    },
+                    indent=2,
+                ),
+                file=json_file,
+            )
diff --git a/python/tvm/contrib/ethosu/cascader/pareto.py b/python/tvm/contrib/ethosu/cascader/pareto.py
index 3c4dcbc88a..545778934c 100644
--- a/python/tvm/contrib/ethosu/cascader/pareto.py
+++ b/python/tvm/contrib/ethosu/cascader/pareto.py
@@ -35,5 +35,7 @@ def _thin_vector(vec: List[Object], max_size: int) -> List[Object]:
     return list(_ffi_api.ThinVector(vec, max_size))
 
 
-def _pareto_cull_plans(plans: List[Plan], max_plans: int) -> List[Plan]:
-    return list(_ffi_api.ParetoCullPlans(plans, max_plans))
+def _pareto_cull_plans(
+    plans: List[Plan], max_plans: int, disable_pareto_metric: bool
+) -> List[Plan]:
+    return list(_ffi_api.ParetoCullPlans(plans, max_plans, disable_pareto_metric))
diff --git a/python/tvm/contrib/ethosu/cascader/plan_generator.py b/python/tvm/contrib/ethosu/cascader/plan_generator.py
index 9235a285d8..155e01431c 100644
--- a/python/tvm/contrib/ethosu/cascader/plan_generator.py
+++ b/python/tvm/contrib/ethosu/cascader/plan_generator.py
@@ -27,9 +27,13 @@ from .graph import CascaderGraph, Part, Tensor
 
 
 def _generate_output_stripe_configs(
-    part: Part, stripe_factors: int, enable_striping: bool
+    part: Part, stripe_factors: int, enable_striping: bool, multi_dimensional: bool
 ) -> List[StripeConfig]:
-    return list(_ffi_api.GenerateOutputStripeConfigs(part, stripe_factors, enable_striping))
+    return list(
+        _ffi_api.GenerateOutputStripeConfigs(
+            part, stripe_factors, enable_striping, multi_dimensional
+        )
+    )
 
 
 def _generate_single_plans(
diff --git a/python/tvm/contrib/ethosu/cascader/scheduler.py b/python/tvm/contrib/ethosu/cascader/scheduler.py
index 63d48a19af..d33abaf2b7 100644
--- a/python/tvm/contrib/ethosu/cascader/scheduler.py
+++ b/python/tvm/contrib/ethosu/cascader/scheduler.py
@@ -18,8 +18,10 @@
 """Scheduler for cascader which converts Proposals into Schedules."""
 from typing import Tuple, List, Dict, DefaultDict
 from collections import defaultdict
+import time
 import numpy as np
 
+import tvm
 from tvm import te
 from tvm import tir
 from tvm import PoolInfo
@@ -31,6 +33,7 @@ from .proposal import Proposal
 from .proposal_generator import generate_proposals
 from .graph import create_cascader_graph
 from .device_config import EthosuDeviceConfig
+from .logging import Logging
 
 
 def tile_nd(
@@ -188,13 +191,20 @@ def create_home_map(
     return home_map
 
 
-def choose_proposal(proposals: List[Proposal], cascade_region: MemoryRegion):
+def choose_proposal(
+    proposals: List[Proposal], cascade_region: MemoryRegion, select_proposal_idx: int
+):
     """Choose the best performing Proposal that doesn't overflow the cascade region."""
-    proposal_choice = proposals[0]
-    for proposal in reversed(proposals):
-        if proposal.memory_usage < cascade_region.size:
-            proposal_choice = proposal
-            break
+    if select_proposal_idx != -1:
+        # Manually select proposal based on index, take modulus the total number of proposals to
+        # ensure that some proposal is always selected.
+        proposal_choice = proposals[select_proposal_idx % len(proposals)]
+    else:
+        proposal_choice = proposals[0]
+        for proposal in reversed(proposals):
+            if proposal.memory_usage < cascade_region.size:
+                proposal_choice = proposal
+                break
 
     return proposal_choice
 
@@ -271,6 +281,17 @@ def cascade(
         Target device configuration.
 
     """
+    tvmc_options = tvm.transform.PassContext.current().config.get("relay.ext.ethos-u.options", None)
+    log = Logging() if tvmc_options and tvmc_options.dev_cascader_logging else None
+    select_proposal_idx = (
+        int(tvmc_options.dev_select_proposal_idx)
+        if tvmc_options and tvmc_options.dev_select_proposal_idx
+        else -1
+    )
+
+    if log:
+        start = time.time()
+
     assert options.cascade_region in working_regions
     # First convert the Tensor Expression graph into a CascaderGraph
     casc_graph = create_cascader_graph(te_graph, const_dict, device_config)
@@ -279,6 +300,16 @@ def cascade(
     # Generate Proposals for Pareto-optimal ways to cascade the CascaderGraph
     proposals = generate_proposals(casc_graph, home_map, options)
     # Select the best Proposal subject to the memory constraints
-    proposal_choice = choose_proposal(proposals, options.cascade_region)
+    proposal_choice = choose_proposal(proposals, options.cascade_region, select_proposal_idx)
+
+    if log:
+        for idx, proposal in enumerate(proposals):
+            log.add_proposal(idx, proposal.memory_usage, proposal.cycles)
+            if proposal == proposal_choice:
+                log.selected_proposal_idx = idx
+
+        log.cascader_runtime = time.time() - start
+        log.dump_json()
+
     # Apply the selected Proposal to the Tensor Expression Schedule
     apply_proposal(proposal_choice, sch)
diff --git a/python/tvm/relay/backend/contrib/ethosu/codegen.py b/python/tvm/relay/backend/contrib/ethosu/codegen.py
index 2552d891c9..423834daa8 100644
--- a/python/tvm/relay/backend/contrib/ethosu/codegen.py
+++ b/python/tvm/relay/backend/contrib/ethosu/codegen.py
@@ -368,6 +368,8 @@ def _ethos_u55_cascader(sram, enable_striping) -> Callable:
         stripe_factors=5,
         max_plan_size=10,
         always_copy_size=1024,
+        max_open_plans=8,
+        max_closed_plans=32,
         enable_striping=enable_striping,
     )
     return _create_cascader(
diff --git a/python/tvm/relay/backend/contrib/ethosu/vela_api.py b/python/tvm/relay/backend/contrib/ethosu/vela_api.py
index 6d01e8de57..f241652e73 100644
--- a/python/tvm/relay/backend/contrib/ethosu/vela_api.py
+++ b/python/tvm/relay/backend/contrib/ethosu/vela_api.py
@@ -67,6 +67,10 @@ def get_optimal_block_config(
     ethosu.vela.api.NpuShape3D :
         The optimal block config for the operator
     """
+    options = tvm.transform.PassContext.current().config.get("relay.ext.ethos-u.options", None)
+    if options and options.dev_force_block_config:
+        block_config = [int(v) for v in options.dev_force_block_config.split("x")]
+        return vapi.NpuShape3D(height=block_config[0], width=block_config[1], depth=block_config[2])
     all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_config)
     return _get_optimal_block_config(all_valid_block_configs)
 
diff --git a/src/contrib/ethosu/cascader/cascader_options.cc b/src/contrib/ethosu/cascader/cascader_options.cc
index be4bfee6d7..0daf3fed24 100644
--- a/src/contrib/ethosu/cascader/cascader_options.cc
+++ b/src/contrib/ethosu/cascader/cascader_options.cc
@@ -30,28 +30,48 @@ void CascaderOptionsNode::VisitAttrs(AttrVisitor* v) {
   v->Visit("max_proposals", &max_proposals);
   v->Visit("stripe_factors", &stripe_factors);
   v->Visit("max_plan_size", &max_plan_size);
+  v->Visit("max_open_plans", &max_open_plans);
+  v->Visit("max_closed_plans", &max_closed_plans);
   v->Visit("always_copy_size", &always_copy_size);
+  v->Visit("disable_pareto_plans", &disable_pareto_plans);
+  v->Visit("disable_pareto_proposals", &disable_pareto_proposals);
+  v->Visit("enable_multi_dimensional_striping", &enable_multi_dimensional_striping);
+  v->Visit("disable_block_culling", &disable_block_culling);
   v->Visit("enable_striping", &enable_striping);
 }
 
 CascaderOptions::CascaderOptions(const MemoryRegion& cascade_region, int max_proposals,
-                                 int stripe_factors, int max_plan_size, int always_copy_size,
+                                 int stripe_factors, int max_plan_size, int max_open_plans,
+                                 int max_closed_plans, int always_copy_size,
+                                 bool disable_pareto_plans, bool disable_pareto_proposals,
+                                 bool enable_multi_dimensional_striping, bool disable_block_culling,
                                  bool enable_striping) {
   auto n = make_object<CascaderOptionsNode>();
   n->cascade_region = std::move(cascade_region);
   n->max_proposals = max_proposals;
   n->stripe_factors = stripe_factors;
   n->max_plan_size = max_plan_size;
+  n->max_open_plans = max_open_plans;
+  n->max_closed_plans = max_closed_plans;
   n->always_copy_size = always_copy_size;
+  n->disable_pareto_plans = disable_pareto_plans;
+  n->disable_pareto_proposals = disable_pareto_proposals;
+  n->enable_multi_dimensional_striping = enable_multi_dimensional_striping;
+  n->disable_block_culling = disable_block_culling;
   n->enable_striping = enable_striping;
   data_ = std::move(n);
 }
 
 TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.CascaderOptions")
     .set_body_typed([](MemoryRegion cascade_region, int max_proposals, int stripe_factors,
-                       int max_plan_size, int always_copy_size, bool enable_striping) {
-      return CascaderOptions(cascade_region, max_proposals, stripe_factors, max_plan_size,
-                             always_copy_size, enable_striping);
+                       int max_plan_size, int max_open_plans, int max_closed_plans,
+                       int always_copy_size, bool disable_pareto_plans,
+                       bool disable_pareto_proposals, bool enable_multi_dimensional_striping,
+                       bool disable_block_culling, bool enable_striping) {
+      return CascaderOptions(
+          cascade_region, max_proposals, stripe_factors, max_plan_size, max_open_plans,
+          max_closed_plans, always_copy_size, disable_pareto_plans, disable_pareto_proposals,
+          enable_multi_dimensional_striping, disable_block_culling, enable_striping);
     });
 
 TVM_REGISTER_NODE_TYPE(CascaderOptionsNode);
diff --git a/src/contrib/ethosu/cascader/cascader_options.h b/src/contrib/ethosu/cascader/cascader_options.h
index ba00451766..3545e5cc3a 100644
--- a/src/contrib/ethosu/cascader/cascader_options.h
+++ b/src/contrib/ethosu/cascader/cascader_options.h
@@ -47,8 +47,20 @@ class CascaderOptionsNode : public Object {
   int stripe_factors;
   /*! \brief The maximum number of Parts in a Plan. */
   int max_plan_size;
+  /*! \brief The maximum number of open Plans saved for a Part Group */
+  int max_open_plans;
+  /*! \brief The maximum number of closed Plans saved for a Part Group */
+  int max_closed_plans;
   /*! \brief The maximum size of Tensor that will always be copied into the cascade region. */
   int always_copy_size;
+  /*! \brief Flag to disable pareto culling for plans to allow non pareto-optimal plans */
+  bool disable_pareto_plans;
+  /*! \brief Flag to disable pareto culling for proposals to allow non pareto-optimal proposals */
+  bool disable_pareto_proposals;
+  /*! \brief Whether to consider multi-dimensional striping */
+  bool enable_multi_dimensional_striping;
+  /*! \brief Flag to disable culling for block configs to allow non-dominant blocks */
+  bool disable_block_culling;
   /*! \brief A boolean option to enable striping. */
   bool enable_striping;
 
@@ -60,7 +72,10 @@ class CascaderOptionsNode : public Object {
 class CascaderOptions : public ObjectRef {
  public:
   CascaderOptions(const MemoryRegion& cascade_region, int max_proposals, int stripe_factors,
-                  int max_plan_size, int always_copy_size, bool enable_striping = true);
+                  int max_plan_size, int max_open_plans, int max_closed_plans, int always_copy_size,
+                  bool disable_pareto_plans, bool disable_pareto_proposals,
+                  bool enable_multi_dimensional_striping, bool disable_block_culling,
+                  bool multi_dimensional_striping);
 
   TVM_DEFINE_OBJECT_REF_METHODS(CascaderOptions, ObjectRef, CascaderOptionsNode);
 };
diff --git a/src/contrib/ethosu/cascader/pareto.cc b/src/contrib/ethosu/cascader/pareto.cc
index 52ea729bff..e40a6602fa 100644
--- a/src/contrib/ethosu/cascader/pareto.cc
+++ b/src/contrib/ethosu/cascader/pareto.cc
@@ -80,10 +80,16 @@ std::vector<T> ThinVector(const std::vector<T>& vec, size_t max_size) {
   return thin_vec;
 }
 
-std::vector<Plan> ParetoCullPlans(std::vector<Plan> plans, size_t max_plans) {
+std::vector<Plan> ParetoCullPlans(std::vector<Plan> plans, size_t max_plans,
+                                  bool disable_pareto_metric) {
   if (plans.size() <= max_plans) {
     return plans;
   }
+  if (disable_pareto_metric) {
+    // Sample from all plans
+    return ThinVector(plans, max_plans);
+  }
+
   std::sort(plans.begin(), plans.end(), [](const Plan& a, const Plan& b) -> bool {
     return a->GetMemoryUsage() < b->GetMemoryUsage();
   });
@@ -108,7 +114,13 @@ std::vector<Plan> ParetoCullPlans(std::vector<Plan> plans, size_t max_plans) {
   return ThinVector(optimal_plans, max_plans);
 }
 
-std::vector<Proposal> ParetoCullProposals(std::vector<Proposal> proposals, size_t max_proposals) {
+std::vector<Proposal> ParetoCullProposals(std::vector<Proposal> proposals, size_t max_proposals,
+                                          bool disable_pareto_metric) {
+  if (disable_pareto_metric) {
+    // Sample from all Proposals
+    return ThinVector(proposals, max_proposals);
+  }
+
   std::sort(proposals.begin(), proposals.end(), [](const Proposal& a, const Proposal& b) -> bool {
     return a->GetMemoryUsage() < b->GetMemoryUsage();
   });
@@ -156,9 +168,9 @@ TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.ThinVector")
     });
 
 TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.ParetoCullPlans")
-    .set_body_typed([](Array<Plan> plans, int max_size) {
+    .set_body_typed([](Array<Plan> plans, int max_size, bool disable_pareto_metric) {
       std::vector<Plan> vplans(plans.begin(), plans.end());
-      return Array<Plan>(ParetoCullPlans(vplans, max_size));
+      return Array<Plan>(ParetoCullPlans(vplans, max_size, disable_pareto_metric));
     });
 
 }  // namespace cascader
diff --git a/src/contrib/ethosu/cascader/pareto.h b/src/contrib/ethosu/cascader/pareto.h
index 511da6c271..abb6ca516c 100644
--- a/src/contrib/ethosu/cascader/pareto.h
+++ b/src/contrib/ethosu/cascader/pareto.h
@@ -61,13 +61,16 @@ std::vector<T> ThinVector(const std::vector<T>& vec, size_t max_size);
  * \brief Cull plans which are not Pareto optimal then thin them down.
  * \param plans The plans to apply the Pareto culling to.
  * \param max_plans The maximum number of plans after the culling.
+ * \param disable_pareto_metric Whether to only select from Pareto frontier or not.
  * \return The culled plans.
  * \note Plan Pareto-optimality is determined based upon a Plan's memory_usage
  * and cycles.
  */
-std::vector<Plan> ParetoCullPlans(std::vector<Plan> plans, size_t max_plans);
+std::vector<Plan> ParetoCullPlans(std::vector<Plan> plans, size_t max_plans,
+                                  bool disable_pareto_metric);
 
-std::vector<Proposal> ParetoCullProposals(std::vector<Proposal> proposals, size_t max_proposals);
+std::vector<Proposal> ParetoCullProposals(std::vector<Proposal> proposals, size_t max_proposals,
+                                          bool disable_pareto_metric);
 
 }  // namespace cascader
 }  // namespace ethosu
diff --git a/src/contrib/ethosu/cascader/plan_generator.cc b/src/contrib/ethosu/cascader/plan_generator.cc
index 75e711ea0f..780f9adc2c 100644
--- a/src/contrib/ethosu/cascader/plan_generator.cc
+++ b/src/contrib/ethosu/cascader/plan_generator.cc
@@ -106,7 +106,8 @@ std::vector<bool> GetCascadableAxes(const Part& part) {
 }
 
 std::vector<StripeConfig> GenerateOutputStripeConfigs(const Part& part, int stripe_factors,
-                                                      bool enable_striping) {
+                                                      bool enable_striping,
+                                                      bool multi_dimensional) {
   // If stripe_factors is <= 0, then we won't produce any StripeConfigs
   if (stripe_factors <= 0) {
     return std::vector<StripeConfig>();
@@ -147,11 +148,29 @@ std::vector<StripeConfig> GenerateOutputStripeConfigs(const Part& part, int stri
     }
     splits.push_back(std::vector<int>(axis_splits.begin(), axis_splits.end()));
   }
-  // Now calculate all the possible combinations of splits for each dimension
-  // to give us all the possible stripe shapes. For example, if we had two axes
-  // both with possible splits in {128, 64, 32, 1}, the stripe shapes would be:
-  // (128, 128), (128, 64), (128, 32) ... (1, 64), (1, 32), (1, 1)
-  auto stripe_shapes = EnumerateCombinations<int>(splits);
+
+  std::vector<std::vector<int>> stripe_shapes;
+  if (multi_dimensional) {
+    // Now calculate all the possible combinations of splits for each dimension
+    // to give us all the possible stripe shapes. For example, if we had two axes
+    // both with possible splits in {128, 64, 32, 1}, the stripe shapes would be:
+    // (128, 128), (128, 64), (128, 32) ... (1, 64), (1, 32), (1, 1)
+    stripe_shapes = EnumerateCombinations<int>(splits);
+  } else {
+    // Only consider splitting a single axis
+    int axis = 0;
+    for (const auto& split : splits) {
+      for (const auto& axis_split : split) {
+        std::vector<int> stripe_shape = output_shape;
+        if (stripe_shape[axis] != axis_split) {
+          stripe_shape[axis] = axis_split;
+          stripe_shapes.push_back(stripe_shape);
+        }
+      }
+      axis++;
+    }
+    stripe_shapes.push_back(output_shape);
+  }
   auto offset = std::vector<int>(output_dims);
   std::vector<StripeConfig> stripe_configs;
   // Calculate the possible axis orderings such that each axis has the opportunity
@@ -437,7 +456,8 @@ std::unordered_map<std::vector<Part>, std::vector<Plan>> GenerateGraphPlans(
     // output of a Plan. The number generated is a function of stripe_factors and the number of
     // cascadable dimensions in the Part.
     std::vector<StripeConfig> stripe_configs =
-        GenerateOutputStripeConfigs(part, options->stripe_factors, options->enable_striping);
+        GenerateOutputStripeConfigs(part, options->stripe_factors, options->enable_striping,
+                                    options->enable_multi_dimensional_striping);
     // Check to see if the output Tensor is part of any existing open Plans
     if (stripe_configs_by_tensor.find(part->GetOutputTensor()) != stripe_configs_by_tensor.end()) {
       // If there are other open Plans which have this Part's output Tensor as an input, then
@@ -491,10 +511,12 @@ std::unordered_map<std::vector<Part>, std::vector<Plan>> GenerateGraphPlans(
     // and plans_by_config maps.
     for (const auto& part_group : new_part_groups) {
       if (closed_plans.find(part_group) != closed_plans.end()) {
-        closed_plans[part_group] = ParetoCullPlans(closed_plans.at(part_group), 32);
+        closed_plans[part_group] = ParetoCullPlans(
+            closed_plans.at(part_group), options->max_closed_plans, options->disable_pareto_plans);
       }
       for (const auto& it : open_plans[part_group]) {
-        auto pareto_plans = ParetoCullPlans(it.second, 8);
+        auto pareto_plans =
+            ParetoCullPlans(it.second, options->max_open_plans, options->disable_pareto_plans);
         for (const auto& plan : pareto_plans) {
           for (const auto& open_config : plan->GetOpenConfigs()) {
             if (open_config != plan->GetOutputConfig()) {
@@ -515,12 +537,13 @@ std::unordered_map<std::vector<Part>, std::vector<Plan>> GenerateGraphPlans(
 }
 
 TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateOutputStripeConfigs")
-    .set_body_typed([](Part part, int stripe_factors, bool enable_striping) {
+    .set_body_typed([](Part part, int stripe_factors, bool enable_striping,
+                       bool multi_dimensional) {
       if (stripe_factors < 0) {
         return Array<StripeConfig>();
       }
       return Array<StripeConfig>(
-          GenerateOutputStripeConfigs(part, stripe_factors, enable_striping));
+          GenerateOutputStripeConfigs(part, stripe_factors, enable_striping, multi_dimensional));
     });
 
 TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateSinglePlans")
diff --git a/src/contrib/ethosu/cascader/plan_generator.h b/src/contrib/ethosu/cascader/plan_generator.h
index 947728addf..71bdef82d2 100644
--- a/src/contrib/ethosu/cascader/plan_generator.h
+++ b/src/contrib/ethosu/cascader/plan_generator.h
@@ -51,9 +51,12 @@ using HomeMap =
  * \brief Generate possible output StripeConfigs that could be applied to a Part's output.
  * \param part The Part to generate StripeConfigs for.
  * \param stripe_factors How many striping factors to try per axis.
+ * \param enable_striping Whether striping is enabled
+ * \param multi_dimensional Whether to stripe in more than one dimension.
  * \return The generated StripeConfigs for the Part's output.
  */
-std::vector<StripeConfig> GenerateOutputStripeConfigs(const Part& part, int stripe_factors);
+std::vector<StripeConfig> GenerateOutputStripeConfigs(const Part& part, int stripe_factors,
+                                                      bool enable_striping, bool multi_dimensional);
 
 /*!
  * \brief Generate single-Part Plans for a Part for a given list of output StripeConfigs.
diff --git a/src/contrib/ethosu/cascader/proposal_generator.cc b/src/contrib/ethosu/cascader/proposal_generator.cc
index ce709cbaa6..f886aad424 100644
--- a/src/contrib/ethosu/cascader/proposal_generator.cc
+++ b/src/contrib/ethosu/cascader/proposal_generator.cc
@@ -177,7 +177,8 @@ std::vector<Proposal> GeneratePartialProposals(
       }
     }
     (*proposals_by_group)[partial_proposal_group] =
-        ParetoCullProposals(proposals_by_group->at(partial_proposal_group), options->max_proposals);
+        ParetoCullProposals(proposals_by_group->at(partial_proposal_group), options->max_proposals,
+                            options->disable_pareto_proposals);
   }
   return proposals_by_group->at(partial_proposal_group);
 }
diff --git a/src/relay/backend/contrib/ethosu/compiler_attrs.cc b/src/relay/backend/contrib/ethosu/compiler_attrs.cc
index 5124e273d9..42add45b01 100644
--- a/src/relay/backend/contrib/ethosu/compiler_attrs.cc
+++ b/src/relay/backend/contrib/ethosu/compiler_attrs.cc
@@ -41,6 +41,14 @@ struct EthosUCompilerConfigNode : public tvm::AttrsNode<EthosUCompilerConfigNode
   String accelerator_config;
   bool enable_cascader;
   bool enable_striping;
+  String dev_force_block_config;
+  String dev_max_open_plans;
+  String dev_max_closed_plans;
+  String dev_select_proposal_idx;
+  bool dev_disable_pareto_plans;
+  bool dev_disable_pareto_proposals;
+  bool dev_disable_block_culling;
+  bool dev_cascader_logging;
 
   TVM_DECLARE_ATTRS(EthosUCompilerConfigNode, "ext.attrs.EthosUCompilerConfigNode") {
     TVM_ATTR_FIELD(accelerator_config)
@@ -54,6 +62,38 @@ struct EthosUCompilerConfigNode : public tvm::AttrsNode<EthosUCompilerConfigNode
     TVM_ATTR_FIELD(enable_striping)
         .describe("Whether the cascader should be striping")
         .set_default(false);
+    String dev_warning = "Option is intended for development and debugging purposes only. ";
+    TVM_ATTR_FIELD(dev_force_block_config)
+        .describe((dev_warning + String("Force the block config to a given value; format = "
+                                        "\"[BLK_HEIGHT]x[BLK_WIDTH]x[BLK_DEPTH]\""))
+                      .data())
+        .set_default("");
+    TVM_ATTR_FIELD(dev_max_open_plans)
+        .describe(
+            (dev_warning + String("Specify the number of open plans kept for each part group"))
+                .data())
+        .set_default("8");
+    TVM_ATTR_FIELD(dev_max_closed_plans)
+        .describe(
+            (dev_warning + String("Specify the number of closed plans kept for each part group"))
+                .data())
+        .set_default("32");
+    TVM_ATTR_FIELD(dev_select_proposal_idx)
+        .describe((dev_warning + String("Select proposal by index")).data())
+        .set_default("-1");
+    TVM_ATTR_FIELD(dev_disable_pareto_plans)
+        .describe((dev_warning + String("Disable pareto culling for plans")).data())
+        .set_default(false);
+    TVM_ATTR_FIELD(dev_disable_pareto_proposals)
+        .describe((dev_warning + String("Disable pareto culling for proposals")).data())
+        .set_default(false);
+    TVM_ATTR_FIELD(dev_disable_block_culling)
+        .describe((dev_warning + String("Disable culling for block configs")).data())
+        .set_default(false);
+    TVM_ATTR_FIELD(dev_cascader_logging)
+        .describe(
+            (dev_warning + String("Enable cascader logging, log is dumped to .json file")).data())
+        .set_default(false);
   }
 };
 
diff --git a/tests/python/contrib/test_ethosu/cascader/infra.py b/tests/python/contrib/test_ethosu/cascader/infra.py
index e629e19a69..cfda1df721 100644
--- a/tests/python/contrib/test_ethosu/cascader/infra.py
+++ b/tests/python/contrib/test_ethosu/cascader/infra.py
@@ -31,7 +31,11 @@ def make_options(
     max_proposals: int = 1,
     stripe_factors: int = 1,
     max_plan_size: int = 1,
+    max_open_plans: int = 8,
+    max_closed_plans: int = 32,
     always_copy_size: int = 1024,
+    disable_pareto_plans: bool = False,
+    disable_pareto_proposals: bool = False,
     enable_striping: bool = True,
 ):
     return cs.CascaderOptions(
@@ -39,7 +43,11 @@ def make_options(
         max_proposals=max_proposals,
         stripe_factors=stripe_factors,
         max_plan_size=max_plan_size,
+        max_open_plans=max_open_plans,
+        max_closed_plans=max_closed_plans,
         always_copy_size=always_copy_size,
+        disable_pareto_plans=disable_pareto_plans,
+        disable_pareto_proposals=disable_pareto_proposals,
         enable_striping=enable_striping,
     )
 
diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py
index ee416a12e1..26a8080e1a 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py
@@ -21,6 +21,7 @@ pytest.importorskip("ethosu.vela")
 import numpy as np
 import math
 
+import tvm
 import tvm.contrib.ethosu.cascader as cs
 from tvm.relay.backend.contrib.ethosu.te.common import get_layout_transform_matrices
 
@@ -163,15 +164,15 @@ from .infra import make_matrices
                 # Conv2D
                 ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
                 ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
-                ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
+                ((1, 4, 4, 96), (1, 4, 6, 4, 16)),
                 ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
                 ((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 10, 1, 6, 4)),
                 ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
                 # Depthwise Conv2D
                 ((1, 6, 10, 16), (1, 6, 1, 10, 16)),
-                ((1, 7, 5, 16), (1, 7, 1, 5, 16)),
+                ((1, 8, 5, 16), (1, 8, 1, 5, 16)),
                 # Pooling
-                ((1, 1, 1, 16), (1, 1, 1, 1, 16)),
+                ((1, 1, 1, 128), (1, 1, 8, 1, 16)),
                 ((1, 9, 6, 16), (1, 9, 1, 6, 16)),
             ],
         ),
@@ -181,15 +182,15 @@ from .infra import make_matrices
                 # Conv2D
                 ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
                 ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
-                ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
+                ((1, 4, 4, 96), (1, 4, 6, 4, 16)),
                 ((1, 8, 4, 16), (1, 8, 1, 4, 16)),
                 ((1, 10, 6, 8), (1, 10, 1, 6, 8)),
                 ((1, 6, 5, 16), (1, 6, 1, 5, 16)),
                 # Depthwise Conv2D
                 ((1, 6, 10, 16), (1, 6, 1, 10, 16)),
-                ((1, 7, 5, 16), (1, 7, 1, 5, 16)),
+                ((1, 8, 5, 16), (1, 8, 1, 5, 16)),
                 # Pooling
-                ((1, 1, 1, 16), (1, 1, 1, 1, 16)),
+                ((1, 1, 1, 128), (1, 1, 8, 1, 16)),
                 ((1, 9, 6, 16), (1, 9, 1, 6, 16)),
             ],
         ),
@@ -199,15 +200,16 @@ from .infra import make_matrices
                 # Conv2D
                 ((1, 7, 6, 16), (1, 7, 1, 6, 16)),
                 ((1, 5, 8, 16), (1, 5, 1, 8, 16)),
-                ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
+                ((1, 4, 4, 128), (1, 4, 8, 4, 16)),
                 ((1, 16, 4, 16), (1, 16, 1, 4, 16)),
                 ((1, 8, 12, 8), (1, 8, 1, 12, 8)),
                 ((1, 10, 6, 16), (1, 10, 1, 6, 16)),
                 # Depthwise Conv2D
-                ((1, 7, 10, 16), (1, 7, 1, 10, 16)),
-                ((1, 7, 6, 16), (1, 7, 1, 6, 16)),
+                ((1, 7, 10, 16), (1, 7, 1, 10, 16), (1, 7, 2, 10, 16)),
+                ((1, 10, 6, 16), (1, 10, 1, 6, 16)),
                 # Pooling
-                ((1, 1, 2, 16), (1, 1, 1, 2, 16)),
+                # ((1, 1, 2, 16), (1, 1, 1, 2, 16)),
+                ((1, 1, 2, 128), (1, 1, 8, 2, 16)),
                 ((1, 10, 6, 16), (1, 10, 1, 6, 16)),
             ],
         ),
@@ -217,15 +219,16 @@ from .infra import make_matrices
                 # Conv2D
                 ((1, 14, 8, 16), (1, 14, 1, 8, 16)),
                 ((1, 16, 8, 16), (1, 16, 1, 8, 16)),
-                ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
+                ((1, 4, 4, 128), (1, 4, 8, 4, 16)),
                 ((1, 32, 4, 16), (1, 10, 12, 16), (1, 32, 1, 4, 16), (1, 10, 1, 12, 16)),
                 ((1, 20, 12, 8), (1, 20, 1, 12, 8)),
                 ((1, 12, 10, 16), (1, 12, 1, 10, 16)),
                 # Depthwise Conv2D
-                ((1, 8, 20, 16), (1, 8, 1, 20, 16)),
+                ((1, 8, 20, 16), (1, 8, 1, 20, 16), (1, 8, 2, 20, 16)),
                 ((1, 14, 6, 16), (1, 14, 1, 6, 16)),
                 # Pooling
-                ((1, 2, 2, 16), (1, 2, 1, 2, 16)),
+                # ((1, 2, 2, 16), (1, 2, 1, 2, 16)),
+                ((1, 2, 2, 128), (1, 2, 8, 2, 16)),
                 ((1, 10, 12, 16), (1, 10, 1, 12, 16)),
             ],
         ),
@@ -339,5 +342,119 @@ def test_best_block_config(
     assert block_shape in expected_block_configs[test_id]
 
 
+@pytest.mark.parametrize(
+    "ofm_layout, block_config_str, expected_block_shape",
+    [
+        ("NHWC", "4x4x8", [1, 4, 4, 8]),
+        ("NHCWB16", "4x4x8", [1, 4, 1, 4, 16]),
+        ("NHCWB16", "4x4x24", [1, 4, 2, 4, 16]),
+    ],
+)
+def test_force_block_config_kernelwise(ofm_layout, block_config_str, expected_block_shape):
+    op_type = "ethosu_pooling"
+    activation = "NONE"
+    kernel = (2, 2)
+    stride = (2, 2)
+    padding = (0, 0)
+    dilation = (1, 1)
+    ifm_channels = 32
+    out_shape = (1, 8, 10, 16)
+
+    ifm_matrix, ifm_offset, _, _, _, _ = make_matrices(
+        op_type, kernel, stride, padding, "NHWC", ofm_layout, dilation, ifm_channels
+    )
+
+    ofm_channels = out_shape[3]
+
+    propagator = cs.Propagator(ifm_matrix, ifm_offset)
+
+    op_attrs = {
+        "op": op_type,
+        "activation": activation,
+        "stride_h": stride[0],
+        "stride_w": stride[1],
+        "dilation_h": dilation[0],
+        "dilation_w": dilation[1],
+    }
+
+    config = {
+        "enable_cascader": True,
+        "dev_force_block_config": block_config_str,
+    }
+    with tvm.transform.PassContext(config={"relay.ext.ethos-u.options": config}):
+        device_config = cs.EthosuDeviceConfig("ethos-u55-128")
+        block_configs = device_config.get_valid_block_configs(
+            propagator,
+            op_attrs,
+            out_shape,
+            ofm_channels,
+            ifm_channels,
+            ofm_layout,
+            "NHWC",
+            "int8",
+            "int8",
+            kernel[0],
+            kernel[1],
+        )
+
+    assert len(block_configs) == 1
+    assert block_configs[0].output_shape == expected_block_shape
+
+
+@pytest.mark.parametrize(
+    "ofm_layout, block_config_str, expected_block_shape",
+    [
+        ("NHWC", "4x4x8", [1, 4, 4, 8]),
+        ("NHCWB16", "4x4x8", [1, 4, 1, 4, 16]),
+        ("NHCWB16", "4x4x24", [1, 4, 2, 4, 16]),
+    ],
+)
+def test_force_block_config_elementwise(ofm_layout, block_config_str, expected_block_shape):
+    op_type = "ethosu_elementwise_unary"
+    op_str = "ABS"
+    activation = "NONE"
+    ofm_shape = (1, 8, 10, 16)
+    ifm_matrix = [
+        [1, 0, 0, 0, 0],
+        [0, 1, 0, 0, 0],
+        [0, 0, 1, 0, 0],
+        [0, 0, 0, 1, 0],
+        [0, 0, 0, 0, 1],
+    ]
+    ifm_offset = [0, 0, 0, 0]
+
+    propagator = cs.Propagator(ifm_matrix, ifm_offset)
+
+    op_attrs = {
+        "op": op_type,
+        "operator_type": op_str,
+        "activation": activation,
+        "clip_min": 0,
+        "clip_max": 0,
+        "rounding_mode": "TFL",
+    }
+
+    config = {
+        "enable_cascader": True,
+        "dev_force_block_config": block_config_str,
+    }
+    with tvm.transform.PassContext(config={"relay.ext.ethos-u.options": config}):
+        device_config = cs.EthosuDeviceConfig("ethos-u55-128")
+        block_configs = device_config.get_elementwise_block_config(
+            propagator,
+            None,
+            op_attrs,
+            ofm_shape,
+            ofm_layout,
+            "NHWC",
+            None,
+            "int8",
+            "int8",
+        )
+
+    assert len(block_configs) == 1
+    assert block_configs[0].output_shape == expected_block_shape
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
index fb19af4abc..5e4117e50f 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
@@ -93,8 +93,8 @@ def _get_ethosu_workspace_size(
     [
         ("ethos-u55-256", 1067408, 14096),
         ("ethos-u55-128", 1067408, 3968),
-        ("ethos-u55-64", 1067408, 2272),
-        ("ethos-u55-32", 1067392, 2256),
+        ("ethos-u55-64", 1067408, 3968),
+        ("ethos-u55-32", 1067392, 3952),
     ],
 )
 def test_double_conv2d(
@@ -161,10 +161,10 @@ def test_double_conv2d(
 @pytest.mark.parametrize(
     "accel_type, expected_ws_size_without_striping, expected_ws_size_with_striping",
     [
-        ("ethos-u55-256", 180096, 5024),
-        ("ethos-u55-128", 180096, 4832),
-        ("ethos-u55-64", 180096, 6464),
-        ("ethos-u55-32", 180096, 6464),
+        ("ethos-u55-256", 180096, 15008),
+        ("ethos-u55-128", 180096, 14240),
+        ("ethos-u55-64", 180096, 14240),
+        ("ethos-u55-32", 180096, 14240),
     ],
 )
 def test_depthwise2d_conv2d_pooling(
@@ -227,7 +227,7 @@ def test_depthwise2d_conv2d_pooling(
     assert workspace_size_cascader_disabled == workspace_size_cascader_enabled_striping_disabled
 
     # Run the same graph with the cascader, giving it less memory to persuade the cascader to cascade
-    pool_size = 40000
+    pool_size = 50000
     workspace_size_cascader_enabled_striping_enabled = _get_ethosu_workspace_size(
         mod, params, accel_type, pool_size, enable_cascader=True, enable_striping=True
     )
diff --git a/tests/python/contrib/test_ethosu/cascader/test_pareto.py b/tests/python/contrib/test_ethosu/cascader/test_pareto.py
index 2d897a7931..baf8739c08 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_pareto.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_pareto.py
@@ -141,7 +141,7 @@ def test_pareto_cull_plans(num_plans, max_plans, SRAM):
 
     plans = _make_plans(num_plans)
     reference = list(_ref_pareto_cull_plans(plans, max_plans))
-    result = _pareto_cull_plans(plans, max_plans)
+    result = _pareto_cull_plans(plans, max_plans, False)
     assert result == reference
 
 
diff --git a/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py b/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py
index ac767fa00e..c35ad15e23 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py
@@ -26,9 +26,8 @@ from tvm.contrib.ethosu.cascader.plan_generator import (
 )
 
 
-def test_generate_output_stripe_configs():
-    stripe_factors = 3
-    expected_configs = 13
+@pytest.mark.parametrize("stripe_factors", [3, 4, 8, 16, 10])
+def test_generate_output_stripe_configs_disable_striping(stripe_factors):
     subgraph = cs.TESubgraph([], None)
     part_1 = cs.InlinePart(
         subgraph,
@@ -48,13 +47,95 @@ def test_generate_output_stripe_configs():
     tensor_2.add_producer(part_1)
 
     assert (
-        len(_generate_output_stripe_configs(part_1, stripe_factors, enable_striping=True))
-        == expected_configs
+        len(
+            _generate_output_stripe_configs(
+                part_1, stripe_factors, enable_striping=False, multi_dimensional=False
+            )
+        )
+        == 1
     )
 
 
-@pytest.mark.parametrize("stripe_factors", [3, 4, 8, 16, 10])
-def test_generate_output_stripe_configs_disable_striping(stripe_factors):
+def test_generate_output_stripe_configs_multi_dimensional():
+    stripe_factors = 3
+    subgraph = cs.TESubgraph([], None)
+    part_1 = cs.InlinePart(
+        subgraph,
+        [
+            cs.Propagator(
+                [[2, 0, 0], [0, 2, 0], [0, 0, 1]],
+                [0, 0],
+            ),
+        ],
+    )
+    tensor_1 = cs.Tensor([800, 800], "uint8")
+    tensor_2 = cs.Tensor([400, 400], "uint8")
+
+    part_1.set_input(0, tensor_1)
+    part_1.set_output(tensor_2)
+    tensor_1.add_consumer(part_1)
+    tensor_2.add_producer(part_1)
+
+    expected_stripe_configs = {
+        cs.StripeConfig([1, 1], [400, 400], [1, 1], [1, 2], [400, 400], [0, 0]),
+        cs.StripeConfig([1, 1], [400, 400], [1, 1], [2, 1], [400, 400], [0, 0]),
+        cs.StripeConfig([200, 1], [400, 400], [200, 1], [1, 2], [2, 400], [0, 0]),
+        cs.StripeConfig([200, 1], [400, 400], [200, 1], [2, 1], [2, 400], [0, 0]),
+        cs.StripeConfig([400, 1], [400, 400], [400, 1], [2, 1], [1, 400], [0, 0]),
+        cs.StripeConfig([1, 200], [400, 400], [1, 200], [1, 2], [400, 2], [0, 0]),
+        cs.StripeConfig([1, 200], [400, 400], [1, 200], [2, 1], [400, 2], [0, 0]),
+        cs.StripeConfig([200, 200], [400, 400], [200, 200], [2, 1], [2, 2], [0, 0]),
+        cs.StripeConfig([200, 200], [400, 400], [200, 200], [1, 2], [2, 2], [0, 0]),
+        cs.StripeConfig([400, 200], [400, 400], [400, 200], [2, 1], [1, 2], [0, 0]),
+        cs.StripeConfig([1, 400], [400, 400], [1, 400], [1, 2], [400, 1], [0, 0]),
+        cs.StripeConfig([200, 400], [400, 400], [200, 400], [1, 2], [2, 1], [0, 0]),
+        cs.StripeConfig([400, 400], [400, 400], [400, 400], [1, 2], [1, 1], [0, 0]),
+    }
+
+    output_stripe_configs = _generate_output_stripe_configs(
+        part=part_1, stripe_factors=stripe_factors, enable_striping=True, multi_dimensional=True
+    )
+
+    assert len(output_stripe_configs) == len(expected_stripe_configs)
+    assert set(output_stripe_configs) == expected_stripe_configs
+
+
+def test_generate_output_stripe_configs_uncascadable_axis():
+    stripe_factors = 3
+    subgraph = cs.TESubgraph([], None)
+    part_1 = cs.InlinePart(
+        subgraph,
+        [
+            cs.Propagator(
+                [[2, 0, 0], [0, 0, 200], [0, 0, 1]],
+                [0, 0],
+            ),
+        ],
+    )
+    tensor_1 = cs.Tensor([800, 200], "uint8")
+    tensor_2 = cs.Tensor([400, 400], "uint8")
+
+    part_1.set_input(0, tensor_1)
+    part_1.set_output(tensor_2)
+    tensor_1.add_consumer(part_1)
+    tensor_2.add_producer(part_1)
+
+    expected_stripe_configs = {
+        cs.StripeConfig([1, 400], [400, 400], [1, 400], [1, 2], [400, 1], [0, 0]),
+        cs.StripeConfig([200, 400], [400, 400], [200, 400], [1, 2], [2, 1], [0, 0]),
+        cs.StripeConfig([400, 400], [400, 400], [400, 400], [1, 2], [1, 1], [0, 0]),
+    }
+
+    output_stripe_configs = _generate_output_stripe_configs(
+        part=part_1, stripe_factors=stripe_factors, enable_striping=True, multi_dimensional=True
+    )
+
+    assert len(output_stripe_configs) == len(expected_stripe_configs)
+    assert set(output_stripe_configs) == expected_stripe_configs
+
+
+def test_generate_output_stripe_configs_single_dimension():
+    stripe_factors = 3
     subgraph = cs.TESubgraph([], None)
     part_1 = cs.InlinePart(
         subgraph,
@@ -73,7 +154,20 @@ def test_generate_output_stripe_configs_disable_striping(stripe_factors):
     tensor_1.add_consumer(part_1)
     tensor_2.add_producer(part_1)
 
-    assert len(_generate_output_stripe_configs(part_1, stripe_factors, enable_striping=False)) == 1
+    expected_stripe_configs = {
+        cs.StripeConfig([400, 1], [400, 400], [400, 1], [2, 1], [1, 400], [0, 0]),
+        cs.StripeConfig([400, 200], [400, 400], [400, 200], [2, 1], [1, 2], [0, 0]),
+        cs.StripeConfig([1, 400], [400, 400], [1, 400], [1, 2], [400, 1], [0, 0]),
+        cs.StripeConfig([200, 400], [400, 400], [200, 400], [1, 2], [2, 1], [0, 0]),
+        cs.StripeConfig([400, 400], [400, 400], [400, 400], [1, 2], [1, 1], [0, 0]),
+    }
+
+    output_stripe_configs = _generate_output_stripe_configs(
+        part=part_1, stripe_factors=stripe_factors, enable_striping=True, multi_dimensional=False
+    )
+
+    assert len(output_stripe_configs) == len(expected_stripe_configs)
+    assert set(output_stripe_configs) == expected_stripe_configs
 
 
 def test_generate_single_plans(SRAM, DRAM):
@@ -101,7 +195,10 @@ def test_generate_single_plans(SRAM, DRAM):
     }
     options = make_options(cascade_region=SRAM, stripe_factors=1)
     output_stripe_configs = _generate_output_stripe_configs(
-        part_1, options.stripe_factors, enable_striping=True
+        part_1,
+        options.stripe_factors,
+        enable_striping=True,
+        multi_dimensional=True,
     )
     plans = _generate_single_plans(part_1, output_stripe_configs, home_map, options)
     for plan in plans:
diff --git a/tests/python/contrib/test_ethosu/cascader/test_scheduler.py b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
index 89b4b41b33..6ac188187e 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
@@ -20,7 +20,6 @@ import tvm.contrib.ethosu.cascader as cs
 
 from .infra import ethosu_enabled
 
-
 if ethosu_enabled:
 
     def test_cascade(
@@ -39,7 +38,11 @@ if ethosu_enabled:
                 max_proposals=64,
                 stripe_factors=4,
                 max_plan_size=10,
+                max_open_plans=8,
+                max_closed_plans=32,
                 always_copy_size=1024,
+                disable_pareto_plans=False,
+                disable_pareto_proposals=False,
             )
             cs.cascade(sch, te_graph, const_dict, options, SRAM, FLASH, [SRAM], device_config)
 
diff --git a/tests/python/contrib/test_ethosu/test_vela_api.py b/tests/python/contrib/test_ethosu/test_vela_api.py
index 662b35822c..e2e4b2cb3a 100644
--- a/tests/python/contrib/test_ethosu/test_vela_api.py
+++ b/tests/python/contrib/test_ethosu/test_vela_api.py
@@ -254,6 +254,19 @@ def test_get_optimal_block_config():
         assert vela_api._get_optimal_block_config(test_case["test"]) == test_case["ref"]
 
 
+@pytest.mark.parametrize(
+    "block_config_str, expected_block_config",
+    [("4x4x8", vapi.NpuShape3D(4, 4, 8)), ("3x7x16", vapi.NpuShape3D(3, 7, 16))],
+)
+def test_force_block_config(block_config_str, expected_block_config):
+    config = {
+        "dev_force_block_config": block_config_str,
+    }
+    with tvm.transform.PassContext(config={"relay.ext.ethos-u.options": config}):
+        block_config = vela_api.get_optimal_block_config(None, vapi.NpuAccelerator.Ethos_U55_128)
+        assert block_config == expected_block_config
+
+
 def test_compress_weights():
     test_vecs = [
         {