You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by ma...@apache.org on 2022/05/13 14:02:29 UTC
[tvm] branch main updated: [microNPU] Add various options to the cascader (#10509)
This is an automated email from the ASF dual-hosted git repository.
manupa pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 7c75b77df8 [microNPU] Add various options to the cascader (#10509)
7c75b77df8 is described below
commit 7c75b77df8391b9e6b52cbf9ab0959f937c58c0a
Author: Jacob Bohlin <ja...@arm.com>
AuthorDate: Fri May 13 15:02:23 2022 +0100
[microNPU] Add various options to the cascader (#10509)
* [microNPU] Added options to Cascader
* Added option to toggle multi-dimensional striping, it is disabled by
default because it has a very high computational cost. Single
dimension striping shares most of the benefit with greatly reduced
cost.
* Added multiple developer/debugging options prefixed with 'dev_'
Also added these options to tvmc.
* Added cascader logging, if enabled it will dump information about the
cascader proposals to a 'cascader_log.json' file.
Co-authored-by: Matthew Barrett <ma...@arm.com>
Change-Id: I2ec59ae0bd84b73b2cc4bc56d39e3831b0aeec27
* Updated memory_reduction testcases
Also added enable_striping to plan_generator.h
Change-Id: I496b30ed6af6f0730087329cd81a69c5040a5e4d
Co-authored-by: Matthew Barrett <ma...@arm.com>
---
python/tvm/contrib/ethosu/cascader/__init__.py | 1 +
python/tvm/contrib/ethosu/cascader/block_config.py | 12 ++
.../contrib/ethosu/cascader/cascader_options.py | 24 ++++
.../tvm/contrib/ethosu/cascader/device_config.py | 83 ++++++++++--
python/tvm/contrib/ethosu/cascader/logging.py | 70 ++++++++++
python/tvm/contrib/ethosu/cascader/pareto.py | 6 +-
.../tvm/contrib/ethosu/cascader/plan_generator.py | 8 +-
python/tvm/contrib/ethosu/cascader/scheduler.py | 45 ++++++-
python/tvm/relay/backend/contrib/ethosu/codegen.py | 2 +
.../tvm/relay/backend/contrib/ethosu/vela_api.py | 4 +
src/contrib/ethosu/cascader/cascader_options.cc | 28 +++-
src/contrib/ethosu/cascader/cascader_options.h | 17 ++-
src/contrib/ethosu/cascader/pareto.cc | 20 ++-
src/contrib/ethosu/cascader/pareto.h | 7 +-
src/contrib/ethosu/cascader/plan_generator.cc | 45 +++++--
src/contrib/ethosu/cascader/plan_generator.h | 5 +-
src/contrib/ethosu/cascader/proposal_generator.cc | 3 +-
src/relay/backend/contrib/ethosu/compiler_attrs.cc | 40 ++++++
tests/python/contrib/test_ethosu/cascader/infra.py | 8 ++
.../cascader/test_ethosu_block_config.py | 143 +++++++++++++++++++--
.../test_ethosu/cascader/test_memory_reduction.py | 14 +-
.../contrib/test_ethosu/cascader/test_pareto.py | 2 +-
.../test_ethosu/cascader/test_plan_generator.py | 115 +++++++++++++++--
.../contrib/test_ethosu/cascader/test_scheduler.py | 5 +-
tests/python/contrib/test_ethosu/test_vela_api.py | 13 ++
25 files changed, 640 insertions(+), 80 deletions(-)
diff --git a/python/tvm/contrib/ethosu/cascader/__init__.py b/python/tvm/contrib/ethosu/cascader/__init__.py
index 51f5e58a47..1d608c04ff 100644
--- a/python/tvm/contrib/ethosu/cascader/__init__.py
+++ b/python/tvm/contrib/ethosu/cascader/__init__.py
@@ -37,4 +37,5 @@ from .device_config import EthosuDeviceConfig
from .tensor_config import TensorConfigState, MemoryRegion, TensorConfig
from .plan import Plan
from .scheduler import apply_proposal, cascade, extract_memory_info
+from .logging import Logging
from .cascader_options import CascaderOptions
diff --git a/python/tvm/contrib/ethosu/cascader/block_config.py b/python/tvm/contrib/ethosu/cascader/block_config.py
index f246918cf4..b90de753f6 100644
--- a/python/tvm/contrib/ethosu/cascader/block_config.py
+++ b/python/tvm/contrib/ethosu/cascader/block_config.py
@@ -55,5 +55,17 @@ class BlockConfig(Object):
def output_cycles(self) -> int:
return int(self._output_cycles)
+ def __ge__(self, other: "BlockConfig"):
+ if len(self.output_shape) != len(other.output_shape):
+ return False
+
+ return all(a >= b for a, b in zip(self.output_shape, other.output_shape))
+
+ def __lt__(self, other: "BlockConfig"):
+ if len(self.output_shape) != len(other.output_shape):
+ return False
+
+ return other >= self
+
def __repr__(self) -> str:
return f"BlockConfig(output_shape={self.output_shape})"
diff --git a/python/tvm/contrib/ethosu/cascader/cascader_options.py b/python/tvm/contrib/ethosu/cascader/cascader_options.py
index ade04bdde9..aeca7fcdcb 100644
--- a/python/tvm/contrib/ethosu/cascader/cascader_options.py
+++ b/python/tvm/contrib/ethosu/cascader/cascader_options.py
@@ -38,8 +38,20 @@ class CascaderOptions(Object):
How many striping factors to try per axis.
max_plan_size : int
The maximum number of Parts in a Plan.
+ max_open_plans : int
+ The maximum number of open Plans to keep after culling.
+ max_closed_plans : int
+ The maximum number of closed Plans to keep after culling.
always_copy_size : int
The maximum size of a Tensor that will always be copied into the cascade region.
+ disable_pareto_plans : bool
+ Disable pareto culling for Plans.
+ disable_pareto_proposals : bool
+ Disable pareto culling for Proposals.
+ enable_multi_dimensional_striping : bool
+ Enable striping in multiple dimensions simultaneously.
+ disable_block_culling : bool
+ Disable culling of block configs.
enable_striping : bool
A boolean option to enable striping
@@ -51,7 +63,13 @@ class CascaderOptions(Object):
max_proposals: int,
stripe_factors: int,
max_plan_size: int,
+ max_open_plans: int,
+ max_closed_plans: int,
always_copy_size: int,
+ disable_pareto_plans: bool = False,
+ disable_pareto_proposals: bool = False,
+ enable_multi_dimensional_striping: bool = False,
+ disable_block_culling: bool = True,
enable_striping: bool = False,
):
self.__init_handle_by_constructor__(
@@ -60,6 +78,12 @@ class CascaderOptions(Object):
max_proposals,
stripe_factors,
max_plan_size,
+ max_open_plans,
+ max_closed_plans,
always_copy_size,
+ disable_pareto_plans,
+ disable_pareto_proposals,
+ enable_multi_dimensional_striping,
+ disable_block_culling,
enable_striping,
)
diff --git a/python/tvm/contrib/ethosu/cascader/device_config.py b/python/tvm/contrib/ethosu/cascader/device_config.py
index bf6ac48cf9..5f5a937628 100644
--- a/python/tvm/contrib/ethosu/cascader/device_config.py
+++ b/python/tvm/contrib/ethosu/cascader/device_config.py
@@ -15,12 +15,15 @@
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name
+# pylint: disable=too-many-nested-blocks
"""Device config class to hold information about the target hardware"""
from typing import Tuple, List, Dict, Optional
from functools import reduce
import math
+import numpy as np
+import tvm
from . import BlockConfig
from . import StripeConfig
from . import Propagator
@@ -64,13 +67,14 @@ class _Shape:
class EthosuDeviceConfig:
"""Arm(R) Ethos(TM)-U NPU config class"""
- def __init__(self, device: str):
+ def __init__(self, device: str, disable_block_culling: bool = False):
self._device = device
self._subkernel_limits = (8, 8)
self._output_cycles = (1, 2, 3, 4, 6)
self._split_depth = 16
self._max_block_shape = _Shape([1, 32, 64, 128])
self._bank_size_bytes = 1024
+ self._disable_block_culling = disable_block_culling
if self._device == "ethos-u55-256":
self._micro_block = _Shape([1, 2, 2, 8])
self._input_micro_block = _Shape([1, 2, 2, 8])
@@ -508,6 +512,28 @@ class EthosuDeviceConfig:
if activation == "LUT" and not self._lut_reserved:
banks_available -= 2
+ # Handle user-forced block config
+ options = tvm.transform.PassContext.current().config.get("relay.ext.ethos-u.options", None)
+ if options and options.dev_force_block_config:
+ block_config = [int(v) for v in options.dev_force_block_config.split("x")]
+ assert len(block_config) == 3
+ if output_layout == "NHWC":
+ block_shape = [output_shape[0], block_config[0], block_config[1], block_config[2]]
+ else:
+ block_shape = [
+ output_shape[0],
+ block_config[0],
+ 1 + ((block_config[2] - 1) // 16),
+ block_config[1],
+ 16,
+ ]
+ output_cycles = self._get_output_cycles(
+ op_type, op_str, ifm_dtype, ofm_dtype, activation
+ )
+ output_cycles *= reduce(lambda a, b: a * b, block_shape, 1)
+ output_cycles = int(math.ceil(output_cycles))
+ return [BlockConfig(block_shape, block_shape, 0, output_cycles)]
+
# Split the block in half until it fits into SHRAM
max_height, max_width, max_depth = self._max_block_shape.as_list()[1:]
if output_layout == "NHCWB16":
@@ -666,6 +692,21 @@ class EthosuDeviceConfig:
max_depth = min(ofm_channels, self._max_block_shape.depth)
min_depth = max(self._micro_block.depth, upscaling_factor)
+ heights = range(min_height, max_height + min_height, min_height)
+ widths = range(min_width, max_width + min_width, min_width)
+ depths = range(min_depth, max_depth + min_depth, min_depth)
+
+ # Handle user-forced block config
+ options = tvm.transform.PassContext.current().config.get("relay.ext.ethos-u.options", None)
+ forced = False
+ if options and options.dev_force_block_config:
+ block_config = [int(v) for v in options.dev_force_block_config.split("x")]
+ assert len(block_config) == 3
+ heights = [block_config[0]]
+ widths = [block_config[1]]
+ depths = [block_config[2]]
+ forced = True
+
input_bytewidth = 1 if ifm_dtype == "int8" else 2
acc_bytewidth = self._get_accumulator_width(op_type, ifm_dtype)
banks_available = self._total_banks - self._reserved_banks
@@ -681,8 +722,8 @@ class EthosuDeviceConfig:
else:
input_block_depth = min(ifm_channels, 32)
- for depth in range(min_depth, max_depth + min_depth, min_depth):
- if (depth < output_shape.depth) and (depth % self._split_depth != 0):
+ for depth in reversed(depths):
+ if (depth < output_shape.depth) and (depth % self._split_depth != 0) and not forced:
# Block depth has to be less than full depth or a multiple of the split depth
continue
@@ -690,17 +731,15 @@ class EthosuDeviceConfig:
op_attrs, ifm_propagator, input_layout, output_layout, depth
)
- for width in range(min_width, max_width + min_width, min_width):
- for height in range(min_height, max_height + min_height, min_height):
+ for width in reversed(widths):
+ for height in reversed(heights):
if output_layout == "NHCWB16":
output_block = (
1,
height,
1 + ((depth - 1) // 16),
width,
- _round_up(
- min(16, max(ofm_channels, min_depth)), self._micro_block.depth
- ),
+ min(16, _round_up(ofm_channels, self._micro_block.depth)),
)
order = [1, 2, 4, 3, 0]
else:
@@ -740,7 +779,7 @@ class EthosuDeviceConfig:
output_cycles = self._get_output_cycles(
op_type, op_str, ifm_dtype, ofm_dtype, activation
)
- output_cycles *= reduce(lambda a, b: a * b, output_block, 1)
+ output_cycles *= np.prod(output_block).tolist()
output_cycles = int(math.ceil(output_cycles))
compute_cycles = self._estimate_compute_cycles_per_block(
op_type,
@@ -755,11 +794,27 @@ class EthosuDeviceConfig:
block_config = BlockConfig(
input_block_shape.as_list(), output_block, compute_cycles, output_cycles
)
- valid_block_configs.append(block_config)
- else:
- # Block config does not fit into SHRAM
- # Any Block config that is strictly larger than this one will also fail
- break
+
+ if self._disable_block_culling:
+ # Block culling disabled - add all block configs that fit
+ valid_block_configs.append(block_config)
+ else:
+ # Add block config only if it's not dominated by an existing block.
+ # A block config is dominated by another if its output_shape is greater
+ # or equal in every dimension and strictly greater in at least one
+ # dimension.
+ dominated = False
+ for valid_block in valid_block_configs:
+ if block_config < valid_block:
+ dominated = True
+ break
+
+ if not dominated:
+ valid_block_configs.append(block_config)
+
+ # Every consecutive block in the innermost loop will be dominated by
+ # this one so break
+ break
return valid_block_configs
diff --git a/python/tvm/contrib/ethosu/cascader/logging.py b/python/tvm/contrib/ethosu/cascader/logging.py
new file mode 100644
index 0000000000..0b163eb147
--- /dev/null
+++ b/python/tvm/contrib/ethosu/cascader/logging.py
@@ -0,0 +1,70 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""A class to hold logging information about the cascader"""
+from typing import Tuple
+import datetime
+import json
+import os
+import math
+
+
+class Logging:
+ """Cascader logging class"""
+
+ def __init__(self):
+ self.min_memory_usage = 0
+ self.max_memory_usage = 0
+ self.min_cycles = 0
+ self.max_cycles = 0
+
+ self.selected_proposal_idx = -1
+ self.proposals = {}
+ self.cascader_runtime = 0
+
+ def add_proposal(self, idx: int, memory_usage: int, cycles: int):
+ self.proposals[idx] = {"memory_usage": memory_usage, "cycles": cycles}
+
+ def get_extreme_points(self) -> Tuple[int, int, int, int]:
+ min_cycles, min_mem_usage = math.inf, math.inf
+ max_cycles, max_mem_usage = 0, 0
+ for proposal in self.proposals.values():
+ min_mem_usage = min(proposal["memory_usage"], min_mem_usage)
+ max_mem_usage = max(proposal["memory_usage"], max_mem_usage)
+ min_cycles = min(proposal["cycles"], min_cycles)
+ max_cycles = max(proposal["cycles"], max_cycles)
+
+ return min_mem_usage, max_mem_usage, min_cycles, max_cycles
+
+ def dump_json(self):
+ min_mem_usage, max_mem_usage, min_cycles, max_cycles = self.get_extreme_points()
+ with open(os.getcwd() + "/cascader_log.json", "w") as json_file:
+ print(
+ json.dumps(
+ {
+ "date": f"{datetime.datetime.now()}",
+ "cascader_runtime": self.cascader_runtime,
+ "min_cycles": min_cycles,
+ "max_cycles": max_cycles,
+ "min_memory_usage": min_mem_usage,
+ "max_memory_usage": max_mem_usage,
+ "selected_proposal": self.selected_proposal_idx,
+ "proposals": self.proposals,
+ },
+ indent=2,
+ ),
+ file=json_file,
+ )
diff --git a/python/tvm/contrib/ethosu/cascader/pareto.py b/python/tvm/contrib/ethosu/cascader/pareto.py
index 3c4dcbc88a..545778934c 100644
--- a/python/tvm/contrib/ethosu/cascader/pareto.py
+++ b/python/tvm/contrib/ethosu/cascader/pareto.py
@@ -35,5 +35,7 @@ def _thin_vector(vec: List[Object], max_size: int) -> List[Object]:
return list(_ffi_api.ThinVector(vec, max_size))
-def _pareto_cull_plans(plans: List[Plan], max_plans: int) -> List[Plan]:
- return list(_ffi_api.ParetoCullPlans(plans, max_plans))
+def _pareto_cull_plans(
+ plans: List[Plan], max_plans: int, disable_pareto_metric: bool
+) -> List[Plan]:
+ return list(_ffi_api.ParetoCullPlans(plans, max_plans, disable_pareto_metric))
diff --git a/python/tvm/contrib/ethosu/cascader/plan_generator.py b/python/tvm/contrib/ethosu/cascader/plan_generator.py
index 9235a285d8..155e01431c 100644
--- a/python/tvm/contrib/ethosu/cascader/plan_generator.py
+++ b/python/tvm/contrib/ethosu/cascader/plan_generator.py
@@ -27,9 +27,13 @@ from .graph import CascaderGraph, Part, Tensor
def _generate_output_stripe_configs(
- part: Part, stripe_factors: int, enable_striping: bool
+ part: Part, stripe_factors: int, enable_striping: bool, multi_dimensional: bool
) -> List[StripeConfig]:
- return list(_ffi_api.GenerateOutputStripeConfigs(part, stripe_factors, enable_striping))
+ return list(
+ _ffi_api.GenerateOutputStripeConfigs(
+ part, stripe_factors, enable_striping, multi_dimensional
+ )
+ )
def _generate_single_plans(
diff --git a/python/tvm/contrib/ethosu/cascader/scheduler.py b/python/tvm/contrib/ethosu/cascader/scheduler.py
index 63d48a19af..d33abaf2b7 100644
--- a/python/tvm/contrib/ethosu/cascader/scheduler.py
+++ b/python/tvm/contrib/ethosu/cascader/scheduler.py
@@ -18,8 +18,10 @@
"""Scheduler for cascader which converts Proposals into Schedules."""
from typing import Tuple, List, Dict, DefaultDict
from collections import defaultdict
+import time
import numpy as np
+import tvm
from tvm import te
from tvm import tir
from tvm import PoolInfo
@@ -31,6 +33,7 @@ from .proposal import Proposal
from .proposal_generator import generate_proposals
from .graph import create_cascader_graph
from .device_config import EthosuDeviceConfig
+from .logging import Logging
def tile_nd(
@@ -188,13 +191,20 @@ def create_home_map(
return home_map
-def choose_proposal(proposals: List[Proposal], cascade_region: MemoryRegion):
+def choose_proposal(
+ proposals: List[Proposal], cascade_region: MemoryRegion, select_proposal_idx: int
+):
"""Choose the best performing Proposal that doesn't overflow the cascade region."""
- proposal_choice = proposals[0]
- for proposal in reversed(proposals):
- if proposal.memory_usage < cascade_region.size:
- proposal_choice = proposal
- break
+ if select_proposal_idx != -1:
+ # Manually select a proposal based on its index, taken modulo the total number of
+ # proposals to ensure that some proposal is always selected.
+ proposal_choice = proposals[select_proposal_idx % len(proposals)]
+ else:
+ proposal_choice = proposals[0]
+ for proposal in reversed(proposals):
+ if proposal.memory_usage < cascade_region.size:
+ proposal_choice = proposal
+ break
return proposal_choice
@@ -271,6 +281,17 @@ def cascade(
Target device configuration.
"""
+ tvmc_options = tvm.transform.PassContext.current().config.get("relay.ext.ethos-u.options", None)
+ log = Logging() if tvmc_options and tvmc_options.dev_cascader_logging else None
+ select_proposal_idx = (
+ int(tvmc_options.dev_select_proposal_idx)
+ if tvmc_options and tvmc_options.dev_select_proposal_idx
+ else -1
+ )
+
+ if log:
+ start = time.time()
+
assert options.cascade_region in working_regions
# First convert the Tensor Expression graph into a CascaderGraph
casc_graph = create_cascader_graph(te_graph, const_dict, device_config)
@@ -279,6 +300,16 @@ def cascade(
# Generate Proposals for Pareto-optimal ways to cascade the CascaderGraph
proposals = generate_proposals(casc_graph, home_map, options)
# Select the best Proposal subject to the memory constraints
- proposal_choice = choose_proposal(proposals, options.cascade_region)
+ proposal_choice = choose_proposal(proposals, options.cascade_region, select_proposal_idx)
+
+ if log:
+ for idx, proposal in enumerate(proposals):
+ log.add_proposal(idx, proposal.memory_usage, proposal.cycles)
+ if proposal == proposal_choice:
+ log.selected_proposal_idx = idx
+
+ log.cascader_runtime = time.time() - start
+ log.dump_json()
+
# Apply the selected Proposal to the Tensor Expression Schedule
apply_proposal(proposal_choice, sch)
diff --git a/python/tvm/relay/backend/contrib/ethosu/codegen.py b/python/tvm/relay/backend/contrib/ethosu/codegen.py
index 2552d891c9..423834daa8 100644
--- a/python/tvm/relay/backend/contrib/ethosu/codegen.py
+++ b/python/tvm/relay/backend/contrib/ethosu/codegen.py
@@ -368,6 +368,8 @@ def _ethos_u55_cascader(sram, enable_striping) -> Callable:
stripe_factors=5,
max_plan_size=10,
always_copy_size=1024,
+ max_open_plans=8,
+ max_closed_plans=32,
enable_striping=enable_striping,
)
return _create_cascader(
diff --git a/python/tvm/relay/backend/contrib/ethosu/vela_api.py b/python/tvm/relay/backend/contrib/ethosu/vela_api.py
index 6d01e8de57..f241652e73 100644
--- a/python/tvm/relay/backend/contrib/ethosu/vela_api.py
+++ b/python/tvm/relay/backend/contrib/ethosu/vela_api.py
@@ -67,6 +67,10 @@ def get_optimal_block_config(
ethosu.vela.api.NpuShape3D :
The optimal block config for the operator
"""
+ options = tvm.transform.PassContext.current().config.get("relay.ext.ethos-u.options", None)
+ if options and options.dev_force_block_config:
+ block_config = [int(v) for v in options.dev_force_block_config.split("x")]
+ return vapi.NpuShape3D(height=block_config[0], width=block_config[1], depth=block_config[2])
all_valid_block_configs = vapi.npu_find_block_configs(npu_op, accel_config)
return _get_optimal_block_config(all_valid_block_configs)
diff --git a/src/contrib/ethosu/cascader/cascader_options.cc b/src/contrib/ethosu/cascader/cascader_options.cc
index be4bfee6d7..0daf3fed24 100644
--- a/src/contrib/ethosu/cascader/cascader_options.cc
+++ b/src/contrib/ethosu/cascader/cascader_options.cc
@@ -30,28 +30,48 @@ void CascaderOptionsNode::VisitAttrs(AttrVisitor* v) {
v->Visit("max_proposals", &max_proposals);
v->Visit("stripe_factors", &stripe_factors);
v->Visit("max_plan_size", &max_plan_size);
+ v->Visit("max_open_plans", &max_open_plans);
+ v->Visit("max_closed_plans", &max_closed_plans);
v->Visit("always_copy_size", &always_copy_size);
+ v->Visit("disable_pareto_plans", &disable_pareto_plans);
+ v->Visit("disable_pareto_proposals", &disable_pareto_proposals);
+ v->Visit("enable_multi_dimensional_striping", &enable_multi_dimensional_striping);
+ v->Visit("disable_block_culling", &disable_block_culling);
v->Visit("enable_striping", &enable_striping);
}
CascaderOptions::CascaderOptions(const MemoryRegion& cascade_region, int max_proposals,
- int stripe_factors, int max_plan_size, int always_copy_size,
+ int stripe_factors, int max_plan_size, int max_open_plans,
+ int max_closed_plans, int always_copy_size,
+ bool disable_pareto_plans, bool disable_pareto_proposals,
+ bool enable_multi_dimensional_striping, bool disable_block_culling,
bool enable_striping) {
auto n = make_object<CascaderOptionsNode>();
n->cascade_region = std::move(cascade_region);
n->max_proposals = max_proposals;
n->stripe_factors = stripe_factors;
n->max_plan_size = max_plan_size;
+ n->max_open_plans = max_open_plans;
+ n->max_closed_plans = max_closed_plans;
n->always_copy_size = always_copy_size;
+ n->disable_pareto_plans = disable_pareto_plans;
+ n->disable_pareto_proposals = disable_pareto_proposals;
+ n->enable_multi_dimensional_striping = enable_multi_dimensional_striping;
+ n->disable_block_culling = disable_block_culling;
n->enable_striping = enable_striping;
data_ = std::move(n);
}
TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.CascaderOptions")
.set_body_typed([](MemoryRegion cascade_region, int max_proposals, int stripe_factors,
- int max_plan_size, int always_copy_size, bool enable_striping) {
- return CascaderOptions(cascade_region, max_proposals, stripe_factors, max_plan_size,
- always_copy_size, enable_striping);
+ int max_plan_size, int max_open_plans, int max_closed_plans,
+ int always_copy_size, bool disable_pareto_plans,
+ bool disable_pareto_proposals, bool enable_multi_dimensional_striping,
+ bool disable_block_culling, bool enable_striping) {
+ return CascaderOptions(
+ cascade_region, max_proposals, stripe_factors, max_plan_size, max_open_plans,
+ max_closed_plans, always_copy_size, disable_pareto_plans, disable_pareto_proposals,
+ enable_multi_dimensional_striping, disable_block_culling, enable_striping);
});
TVM_REGISTER_NODE_TYPE(CascaderOptionsNode);
diff --git a/src/contrib/ethosu/cascader/cascader_options.h b/src/contrib/ethosu/cascader/cascader_options.h
index ba00451766..3545e5cc3a 100644
--- a/src/contrib/ethosu/cascader/cascader_options.h
+++ b/src/contrib/ethosu/cascader/cascader_options.h
@@ -47,8 +47,20 @@ class CascaderOptionsNode : public Object {
int stripe_factors;
/*! \brief The maximum number of Parts in a Plan. */
int max_plan_size;
+ /*! \brief The maximum number of open Plans saved for a Part Group */
+ int max_open_plans;
+ /*! \brief The maximum number of closed Plans saved for a Part Group */
+ int max_closed_plans;
/*! \brief The maximum size of Tensor that will always be copied into the cascade region. */
int always_copy_size;
+ /*! \brief Flag to disable pareto culling for plans to allow non pareto-optimal plans */
+ bool disable_pareto_plans;
+ /*! \brief Flag to disable pareto culling for proposals to allow non pareto-optimal proposals */
+ bool disable_pareto_proposals;
+ /*! \brief Whether to consider multi-dimensional striping */
+ bool enable_multi_dimensional_striping;
+ /*! \brief Flag to disable culling for block configs to allow non-dominant blocks */
+ bool disable_block_culling;
/*! \brief A boolean option to enable striping. */
bool enable_striping;
@@ -60,7 +72,10 @@ class CascaderOptionsNode : public Object {
class CascaderOptions : public ObjectRef {
public:
CascaderOptions(const MemoryRegion& cascade_region, int max_proposals, int stripe_factors,
- int max_plan_size, int always_copy_size, bool enable_striping = true);
+ int max_plan_size, int max_open_plans, int max_closed_plans, int always_copy_size,
+ bool disable_pareto_plans, bool disable_pareto_proposals,
+ bool enable_multi_dimensional_striping, bool disable_block_culling,
+ bool enable_striping);
TVM_DEFINE_OBJECT_REF_METHODS(CascaderOptions, ObjectRef, CascaderOptionsNode);
};
diff --git a/src/contrib/ethosu/cascader/pareto.cc b/src/contrib/ethosu/cascader/pareto.cc
index 52ea729bff..e40a6602fa 100644
--- a/src/contrib/ethosu/cascader/pareto.cc
+++ b/src/contrib/ethosu/cascader/pareto.cc
@@ -80,10 +80,16 @@ std::vector<T> ThinVector(const std::vector<T>& vec, size_t max_size) {
return thin_vec;
}
-std::vector<Plan> ParetoCullPlans(std::vector<Plan> plans, size_t max_plans) {
+std::vector<Plan> ParetoCullPlans(std::vector<Plan> plans, size_t max_plans,
+ bool disable_pareto_metric) {
if (plans.size() <= max_plans) {
return plans;
}
+ if (disable_pareto_metric) {
+ // Sample from all plans
+ return ThinVector(plans, max_plans);
+ }
+
std::sort(plans.begin(), plans.end(), [](const Plan& a, const Plan& b) -> bool {
return a->GetMemoryUsage() < b->GetMemoryUsage();
});
@@ -108,7 +114,13 @@ std::vector<Plan> ParetoCullPlans(std::vector<Plan> plans, size_t max_plans) {
return ThinVector(optimal_plans, max_plans);
}
-std::vector<Proposal> ParetoCullProposals(std::vector<Proposal> proposals, size_t max_proposals) {
+std::vector<Proposal> ParetoCullProposals(std::vector<Proposal> proposals, size_t max_proposals,
+ bool disable_pareto_metric) {
+ if (disable_pareto_metric) {
+ // Sample from all Proposals
+ return ThinVector(proposals, max_proposals);
+ }
+
std::sort(proposals.begin(), proposals.end(), [](const Proposal& a, const Proposal& b) -> bool {
return a->GetMemoryUsage() < b->GetMemoryUsage();
});
@@ -156,9 +168,9 @@ TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.ThinVector")
});
TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.ParetoCullPlans")
- .set_body_typed([](Array<Plan> plans, int max_size) {
+ .set_body_typed([](Array<Plan> plans, int max_size, bool disable_pareto_metric) {
std::vector<Plan> vplans(plans.begin(), plans.end());
- return Array<Plan>(ParetoCullPlans(vplans, max_size));
+ return Array<Plan>(ParetoCullPlans(vplans, max_size, disable_pareto_metric));
});
} // namespace cascader
diff --git a/src/contrib/ethosu/cascader/pareto.h b/src/contrib/ethosu/cascader/pareto.h
index 511da6c271..abb6ca516c 100644
--- a/src/contrib/ethosu/cascader/pareto.h
+++ b/src/contrib/ethosu/cascader/pareto.h
@@ -61,13 +61,16 @@ std::vector<T> ThinVector(const std::vector<T>& vec, size_t max_size);
* \brief Cull plans which are not Pareto optimal then thin them down.
* \param plans The plans to apply the Pareto culling to.
* \param max_plans The maximum number of plans after the culling.
+ * \param disable_pareto_metric When set, skip Pareto-frontier filtering and thin from all plans.
* \return The culled plans.
* \note Plan Pareto-optimality is determined based upon a Plan's memory_usage
* and cycles.
*/
-std::vector<Plan> ParetoCullPlans(std::vector<Plan> plans, size_t max_plans);
+std::vector<Plan> ParetoCullPlans(std::vector<Plan> plans, size_t max_plans,
+ bool disable_pareto_metric);
-std::vector<Proposal> ParetoCullProposals(std::vector<Proposal> proposals, size_t max_proposals);
+std::vector<Proposal> ParetoCullProposals(std::vector<Proposal> proposals, size_t max_proposals,
+ bool disable_pareto_metric);
} // namespace cascader
} // namespace ethosu
diff --git a/src/contrib/ethosu/cascader/plan_generator.cc b/src/contrib/ethosu/cascader/plan_generator.cc
index 75e711ea0f..780f9adc2c 100644
--- a/src/contrib/ethosu/cascader/plan_generator.cc
+++ b/src/contrib/ethosu/cascader/plan_generator.cc
@@ -106,7 +106,8 @@ std::vector<bool> GetCascadableAxes(const Part& part) {
}
std::vector<StripeConfig> GenerateOutputStripeConfigs(const Part& part, int stripe_factors,
- bool enable_striping) {
+ bool enable_striping,
+ bool multi_dimensional) {
// If stripe_factors is <= 0, then we won't produce any StripeConfigs
if (stripe_factors <= 0) {
return std::vector<StripeConfig>();
@@ -147,11 +148,29 @@ std::vector<StripeConfig> GenerateOutputStripeConfigs(const Part& part, int stri
}
splits.push_back(std::vector<int>(axis_splits.begin(), axis_splits.end()));
}
- // Now calculate all the possible combinations of splits for each dimension
- // to give us all the possible stripe shapes. For example, if we had two axes
- // both with possible splits in {128, 64, 32, 1}, the stripe shapes would be:
- // (128, 128), (128, 64), (128, 32) ... (1, 64), (1, 32), (1, 1)
- auto stripe_shapes = EnumerateCombinations<int>(splits);
+
+ std::vector<std::vector<int>> stripe_shapes;
+ if (multi_dimensional) {
+ // Now calculate all the possible combinations of splits for each dimension
+ // to give us all the possible stripe shapes. For example, if we had two axes
+ // both with possible splits in {128, 64, 32, 1}, the stripe shapes would be:
+ // (128, 128), (128, 64), (128, 32) ... (1, 64), (1, 32), (1, 1)
+ stripe_shapes = EnumerateCombinations<int>(splits);
+ } else {
+ // Only consider splitting a single axis
+ int axis = 0;
+ for (const auto& split : splits) {
+ for (const auto& axis_split : split) {
+ std::vector<int> stripe_shape = output_shape;
+ if (stripe_shape[axis] != axis_split) {
+ stripe_shape[axis] = axis_split;
+ stripe_shapes.push_back(stripe_shape);
+ }
+ }
+ axis++;
+ }
+ stripe_shapes.push_back(output_shape);
+ }
auto offset = std::vector<int>(output_dims);
std::vector<StripeConfig> stripe_configs;
// Calculate the possible axis orderings such that each axis has the opportunity
@@ -437,7 +456,8 @@ std::unordered_map<std::vector<Part>, std::vector<Plan>> GenerateGraphPlans(
// output of a Plan. The number generated is a function of stripe_factors and the number of
// cascadable dimensions in the Part.
std::vector<StripeConfig> stripe_configs =
- GenerateOutputStripeConfigs(part, options->stripe_factors, options->enable_striping);
+ GenerateOutputStripeConfigs(part, options->stripe_factors, options->enable_striping,
+ options->enable_multi_dimensional_striping);
// Check to see if the output Tensor is part of any existing open Plans
if (stripe_configs_by_tensor.find(part->GetOutputTensor()) != stripe_configs_by_tensor.end()) {
// If there are other open Plans which have this Part's output Tensor as an input, then
@@ -491,10 +511,12 @@ std::unordered_map<std::vector<Part>, std::vector<Plan>> GenerateGraphPlans(
// and plans_by_config maps.
for (const auto& part_group : new_part_groups) {
if (closed_plans.find(part_group) != closed_plans.end()) {
- closed_plans[part_group] = ParetoCullPlans(closed_plans.at(part_group), 32);
+ closed_plans[part_group] = ParetoCullPlans(
+ closed_plans.at(part_group), options->max_closed_plans, options->disable_pareto_plans);
}
for (const auto& it : open_plans[part_group]) {
- auto pareto_plans = ParetoCullPlans(it.second, 8);
+ auto pareto_plans =
+ ParetoCullPlans(it.second, options->max_open_plans, options->disable_pareto_plans);
for (const auto& plan : pareto_plans) {
for (const auto& open_config : plan->GetOpenConfigs()) {
if (open_config != plan->GetOutputConfig()) {
@@ -515,12 +537,13 @@ std::unordered_map<std::vector<Part>, std::vector<Plan>> GenerateGraphPlans(
}
TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateOutputStripeConfigs")
- .set_body_typed([](Part part, int stripe_factors, bool enable_striping) {
+ .set_body_typed([](Part part, int stripe_factors, bool enable_striping,
+ bool multi_dimensional) {
if (stripe_factors < 0) {
return Array<StripeConfig>();
}
return Array<StripeConfig>(
- GenerateOutputStripeConfigs(part, stripe_factors, enable_striping));
+ GenerateOutputStripeConfigs(part, stripe_factors, enable_striping, multi_dimensional));
});
TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateSinglePlans")
diff --git a/src/contrib/ethosu/cascader/plan_generator.h b/src/contrib/ethosu/cascader/plan_generator.h
index 947728addf..71bdef82d2 100644
--- a/src/contrib/ethosu/cascader/plan_generator.h
+++ b/src/contrib/ethosu/cascader/plan_generator.h
@@ -51,9 +51,12 @@ using HomeMap =
* \brief Generate possible output StripeConfigs that could be applied to a Part's output.
* \param part The Part to generate StripeConfigs for.
* \param stripe_factors How many striping factors to try per axis.
+ * \param enable_striping Whether striping is enabled.
+ * \param multi_dimensional Whether to stripe in more than one dimension.
* \return The generated StripeConfigs for the Part's output.
*/
-std::vector<StripeConfig> GenerateOutputStripeConfigs(const Part& part, int stripe_factors);
+std::vector<StripeConfig> GenerateOutputStripeConfigs(const Part& part, int stripe_factors,
+ bool enable_striping, bool multi_dimensional);
/*!
* \brief Generate single-Part Plans for a Part for a given list of output StripeConfigs.
diff --git a/src/contrib/ethosu/cascader/proposal_generator.cc b/src/contrib/ethosu/cascader/proposal_generator.cc
index ce709cbaa6..f886aad424 100644
--- a/src/contrib/ethosu/cascader/proposal_generator.cc
+++ b/src/contrib/ethosu/cascader/proposal_generator.cc
@@ -177,7 +177,8 @@ std::vector<Proposal> GeneratePartialProposals(
}
}
(*proposals_by_group)[partial_proposal_group] =
- ParetoCullProposals(proposals_by_group->at(partial_proposal_group), options->max_proposals);
+ ParetoCullProposals(proposals_by_group->at(partial_proposal_group), options->max_proposals,
+ options->disable_pareto_proposals);
}
return proposals_by_group->at(partial_proposal_group);
}
diff --git a/src/relay/backend/contrib/ethosu/compiler_attrs.cc b/src/relay/backend/contrib/ethosu/compiler_attrs.cc
index 5124e273d9..42add45b01 100644
--- a/src/relay/backend/contrib/ethosu/compiler_attrs.cc
+++ b/src/relay/backend/contrib/ethosu/compiler_attrs.cc
@@ -41,6 +41,14 @@ struct EthosUCompilerConfigNode : public tvm::AttrsNode<EthosUCompilerConfigNode
String accelerator_config;
bool enable_cascader;
bool enable_striping;
+ String dev_force_block_config;
+ String dev_max_open_plans;
+ String dev_max_closed_plans;
+ String dev_select_proposal_idx;
+ bool dev_disable_pareto_plans;
+ bool dev_disable_pareto_proposals;
+ bool dev_disable_block_culling;
+ bool dev_cascader_logging;
TVM_DECLARE_ATTRS(EthosUCompilerConfigNode, "ext.attrs.EthosUCompilerConfigNode") {
TVM_ATTR_FIELD(accelerator_config)
@@ -54,6 +62,38 @@ struct EthosUCompilerConfigNode : public tvm::AttrsNode<EthosUCompilerConfigNode
TVM_ATTR_FIELD(enable_striping)
.describe("Whether the cascader should be striping")
.set_default(false);
+ String dev_warning = "Option is intended for development and debugging purposes only. ";
+ TVM_ATTR_FIELD(dev_force_block_config)
+ .describe((dev_warning + String("Force the block config to a given value; format = "
+ "\"[BLK_HEIGHT]x[BLK_WIDTH]x[BLK_DEPTH]\""))
+ .data())
+ .set_default("");
+ TVM_ATTR_FIELD(dev_max_open_plans)
+ .describe(
+ (dev_warning + String("Specify the number of open plans kept for each part group"))
+ .data())
+ .set_default("8");
+ TVM_ATTR_FIELD(dev_max_closed_plans)
+ .describe(
+ (dev_warning + String("Specify the number of closed plans kept for each part group"))
+ .data())
+ .set_default("32");
+ TVM_ATTR_FIELD(dev_select_proposal_idx)
+ .describe((dev_warning + String("Select proposal by index")).data())
+ .set_default("-1");
+ TVM_ATTR_FIELD(dev_disable_pareto_plans)
+ .describe((dev_warning + String("Disable pareto culling for plans")).data())
+ .set_default(false);
+ TVM_ATTR_FIELD(dev_disable_pareto_proposals)
+ .describe((dev_warning + String("Disable pareto culling for proposals")).data())
+ .set_default(false);
+ TVM_ATTR_FIELD(dev_disable_block_culling)
+ .describe((dev_warning + String("Disable culling for block configs")).data())
+ .set_default(false);
+ TVM_ATTR_FIELD(dev_cascader_logging)
+ .describe(
+ (dev_warning + String("Enable cascader logging, log is dumped to .json file")).data())
+ .set_default(false);
}
};
diff --git a/tests/python/contrib/test_ethosu/cascader/infra.py b/tests/python/contrib/test_ethosu/cascader/infra.py
index e629e19a69..cfda1df721 100644
--- a/tests/python/contrib/test_ethosu/cascader/infra.py
+++ b/tests/python/contrib/test_ethosu/cascader/infra.py
@@ -31,7 +31,11 @@ def make_options(
max_proposals: int = 1,
stripe_factors: int = 1,
max_plan_size: int = 1,
+ max_open_plans: int = 8,
+ max_closed_plans: int = 32,
always_copy_size: int = 1024,
+ disable_pareto_plans: bool = False,
+ disable_pareto_proposals: bool = False,
enable_striping: bool = True,
):
return cs.CascaderOptions(
@@ -39,7 +43,11 @@ def make_options(
max_proposals=max_proposals,
stripe_factors=stripe_factors,
max_plan_size=max_plan_size,
+ max_open_plans=max_open_plans,
+ max_closed_plans=max_closed_plans,
always_copy_size=always_copy_size,
+ disable_pareto_plans=disable_pareto_plans,
+ disable_pareto_proposals=disable_pareto_proposals,
enable_striping=enable_striping,
)
diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py
index ee416a12e1..26a8080e1a 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py
@@ -21,6 +21,7 @@ pytest.importorskip("ethosu.vela")
import numpy as np
import math
+import tvm
import tvm.contrib.ethosu.cascader as cs
from tvm.relay.backend.contrib.ethosu.te.common import get_layout_transform_matrices
@@ -163,15 +164,15 @@ from .infra import make_matrices
# Conv2D
((1, 8, 4, 16), (1, 8, 1, 4, 16)),
((1, 6, 5, 16), (1, 6, 1, 5, 16)),
- ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
+ ((1, 4, 4, 96), (1, 4, 6, 4, 16)),
((1, 8, 4, 16), (1, 8, 1, 4, 16)),
((1, 10, 6, 4), (1, 5, 1, 12, 4), (1, 10, 1, 6, 4)),
((1, 6, 5, 16), (1, 6, 1, 5, 16)),
# Depthwise Conv2D
((1, 6, 10, 16), (1, 6, 1, 10, 16)),
- ((1, 7, 5, 16), (1, 7, 1, 5, 16)),
+ ((1, 8, 5, 16), (1, 8, 1, 5, 16)),
# Pooling
- ((1, 1, 1, 16), (1, 1, 1, 1, 16)),
+ ((1, 1, 1, 128), (1, 1, 8, 1, 16)),
((1, 9, 6, 16), (1, 9, 1, 6, 16)),
],
),
@@ -181,15 +182,15 @@ from .infra import make_matrices
# Conv2D
((1, 8, 4, 16), (1, 8, 1, 4, 16)),
((1, 6, 5, 16), (1, 6, 1, 5, 16)),
- ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
+ ((1, 4, 4, 96), (1, 4, 6, 4, 16)),
((1, 8, 4, 16), (1, 8, 1, 4, 16)),
((1, 10, 6, 8), (1, 10, 1, 6, 8)),
((1, 6, 5, 16), (1, 6, 1, 5, 16)),
# Depthwise Conv2D
((1, 6, 10, 16), (1, 6, 1, 10, 16)),
- ((1, 7, 5, 16), (1, 7, 1, 5, 16)),
+ ((1, 8, 5, 16), (1, 8, 1, 5, 16)),
# Pooling
- ((1, 1, 1, 16), (1, 1, 1, 1, 16)),
+ ((1, 1, 1, 128), (1, 1, 8, 1, 16)),
((1, 9, 6, 16), (1, 9, 1, 6, 16)),
],
),
@@ -199,15 +200,16 @@ from .infra import make_matrices
# Conv2D
((1, 7, 6, 16), (1, 7, 1, 6, 16)),
((1, 5, 8, 16), (1, 5, 1, 8, 16)),
- ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
+ ((1, 4, 4, 128), (1, 4, 8, 4, 16)),
((1, 16, 4, 16), (1, 16, 1, 4, 16)),
((1, 8, 12, 8), (1, 8, 1, 12, 8)),
((1, 10, 6, 16), (1, 10, 1, 6, 16)),
# Depthwise Conv2D
- ((1, 7, 10, 16), (1, 7, 1, 10, 16)),
- ((1, 7, 6, 16), (1, 7, 1, 6, 16)),
+ ((1, 7, 10, 16), (1, 7, 1, 10, 16), (1, 7, 2, 10, 16)),
+ ((1, 10, 6, 16), (1, 10, 1, 6, 16)),
# Pooling
- ((1, 1, 2, 16), (1, 1, 1, 2, 16)),
+ # ((1, 1, 2, 16), (1, 1, 1, 2, 16)),
+ ((1, 1, 2, 128), (1, 1, 8, 2, 16)),
((1, 10, 6, 16), (1, 10, 1, 6, 16)),
],
),
@@ -217,15 +219,16 @@ from .infra import make_matrices
# Conv2D
((1, 14, 8, 16), (1, 14, 1, 8, 16)),
((1, 16, 8, 16), (1, 16, 1, 8, 16)),
- ((1, 4, 4, 16), (1, 4, 1, 4, 16)),
+ ((1, 4, 4, 128), (1, 4, 8, 4, 16)),
((1, 32, 4, 16), (1, 10, 12, 16), (1, 32, 1, 4, 16), (1, 10, 1, 12, 16)),
((1, 20, 12, 8), (1, 20, 1, 12, 8)),
((1, 12, 10, 16), (1, 12, 1, 10, 16)),
# Depthwise Conv2D
- ((1, 8, 20, 16), (1, 8, 1, 20, 16)),
+ ((1, 8, 20, 16), (1, 8, 1, 20, 16), (1, 8, 2, 20, 16)),
((1, 14, 6, 16), (1, 14, 1, 6, 16)),
# Pooling
- ((1, 2, 2, 16), (1, 2, 1, 2, 16)),
+ # ((1, 2, 2, 16), (1, 2, 1, 2, 16)),
+ ((1, 2, 2, 128), (1, 2, 8, 2, 16)),
((1, 10, 12, 16), (1, 10, 1, 12, 16)),
],
),
@@ -339,5 +342,119 @@ def test_best_block_config(
assert block_shape in expected_block_configs[test_id]
+@pytest.mark.parametrize(
+ "ofm_layout, block_config_str, expected_block_shape",
+ [
+ ("NHWC", "4x4x8", [1, 4, 4, 8]),
+ ("NHCWB16", "4x4x8", [1, 4, 1, 4, 16]),
+ ("NHCWB16", "4x4x24", [1, 4, 2, 4, 16]),
+ ],
+)
+def test_force_block_config_kernelwise(ofm_layout, block_config_str, expected_block_shape):
+ op_type = "ethosu_pooling"
+ activation = "NONE"
+ kernel = (2, 2)
+ stride = (2, 2)
+ padding = (0, 0)
+ dilation = (1, 1)
+ ifm_channels = 32
+ out_shape = (1, 8, 10, 16)
+
+ ifm_matrix, ifm_offset, _, _, _, _ = make_matrices(
+ op_type, kernel, stride, padding, "NHWC", ofm_layout, dilation, ifm_channels
+ )
+
+ ofm_channels = out_shape[3]
+
+ propagator = cs.Propagator(ifm_matrix, ifm_offset)
+
+ op_attrs = {
+ "op": op_type,
+ "activation": activation,
+ "stride_h": stride[0],
+ "stride_w": stride[1],
+ "dilation_h": dilation[0],
+ "dilation_w": dilation[1],
+ }
+
+ config = {
+ "enable_cascader": True,
+ "dev_force_block_config": block_config_str,
+ }
+ with tvm.transform.PassContext(config={"relay.ext.ethos-u.options": config}):
+ device_config = cs.EthosuDeviceConfig("ethos-u55-128")
+ block_configs = device_config.get_valid_block_configs(
+ propagator,
+ op_attrs,
+ out_shape,
+ ofm_channels,
+ ifm_channels,
+ ofm_layout,
+ "NHWC",
+ "int8",
+ "int8",
+ kernel[0],
+ kernel[1],
+ )
+
+ assert len(block_configs) == 1
+ assert block_configs[0].output_shape == expected_block_shape
+
+
+@pytest.mark.parametrize(
+ "ofm_layout, block_config_str, expected_block_shape",
+ [
+ ("NHWC", "4x4x8", [1, 4, 4, 8]),
+ ("NHCWB16", "4x4x8", [1, 4, 1, 4, 16]),
+ ("NHCWB16", "4x4x24", [1, 4, 2, 4, 16]),
+ ],
+)
+def test_force_block_config_elementwise(ofm_layout, block_config_str, expected_block_shape):
+ op_type = "ethosu_elementwise_unary"
+ op_str = "ABS"
+ activation = "NONE"
+ ofm_shape = (1, 8, 10, 16)
+ ifm_matrix = [
+ [1, 0, 0, 0, 0],
+ [0, 1, 0, 0, 0],
+ [0, 0, 1, 0, 0],
+ [0, 0, 0, 1, 0],
+ [0, 0, 0, 0, 1],
+ ]
+ ifm_offset = [0, 0, 0, 0]
+
+ propagator = cs.Propagator(ifm_matrix, ifm_offset)
+
+ op_attrs = {
+ "op": op_type,
+ "operator_type": op_str,
+ "activation": activation,
+ "clip_min": 0,
+ "clip_max": 0,
+ "rounding_mode": "TFL",
+ }
+
+ config = {
+ "enable_cascader": True,
+ "dev_force_block_config": block_config_str,
+ }
+ with tvm.transform.PassContext(config={"relay.ext.ethos-u.options": config}):
+ device_config = cs.EthosuDeviceConfig("ethos-u55-128")
+ block_configs = device_config.get_elementwise_block_config(
+ propagator,
+ None,
+ op_attrs,
+ ofm_shape,
+ ofm_layout,
+ "NWHC",
+ None,
+ "int8",
+ "int8",
+ )
+
+ assert len(block_configs) == 1
+ assert block_configs[0].output_shape == expected_block_shape
+
+
if __name__ == "__main__":
pytest.main([__file__])
diff --git a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
index fb19af4abc..5e4117e50f 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_memory_reduction.py
@@ -93,8 +93,8 @@ def _get_ethosu_workspace_size(
[
("ethos-u55-256", 1067408, 14096),
("ethos-u55-128", 1067408, 3968),
- ("ethos-u55-64", 1067408, 2272),
- ("ethos-u55-32", 1067392, 2256),
+ ("ethos-u55-64", 1067408, 3968),
+ ("ethos-u55-32", 1067392, 3952),
],
)
def test_double_conv2d(
@@ -161,10 +161,10 @@ def test_double_conv2d(
@pytest.mark.parametrize(
"accel_type, expected_ws_size_without_striping, expected_ws_size_with_striping",
[
- ("ethos-u55-256", 180096, 5024),
- ("ethos-u55-128", 180096, 4832),
- ("ethos-u55-64", 180096, 6464),
- ("ethos-u55-32", 180096, 6464),
+ ("ethos-u55-256", 180096, 15008),
+ ("ethos-u55-128", 180096, 14240),
+ ("ethos-u55-64", 180096, 14240),
+ ("ethos-u55-32", 180096, 14240),
],
)
def test_depthwise2d_conv2d_pooling(
@@ -227,7 +227,7 @@ def test_depthwise2d_conv2d_pooling(
assert workspace_size_cascader_disabled == workspace_size_cascader_enabled_striping_disabled
# Run the same graph with the cascader, giving it less memory to persuade the cascader to cascade
- pool_size = 40000
+ pool_size = 50000
workspace_size_cascader_enabled_striping_enabled = _get_ethosu_workspace_size(
mod, params, accel_type, pool_size, enable_cascader=True, enable_striping=True
)
diff --git a/tests/python/contrib/test_ethosu/cascader/test_pareto.py b/tests/python/contrib/test_ethosu/cascader/test_pareto.py
index 2d897a7931..baf8739c08 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_pareto.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_pareto.py
@@ -141,7 +141,7 @@ def test_pareto_cull_plans(num_plans, max_plans, SRAM):
plans = _make_plans(num_plans)
reference = list(_ref_pareto_cull_plans(plans, max_plans))
- result = _pareto_cull_plans(plans, max_plans)
+ result = _pareto_cull_plans(plans, max_plans, False)
assert result == reference
diff --git a/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py b/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py
index ac767fa00e..c35ad15e23 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_plan_generator.py
@@ -26,9 +26,8 @@ from tvm.contrib.ethosu.cascader.plan_generator import (
)
-def test_generate_output_stripe_configs():
- stripe_factors = 3
- expected_configs = 13
+@pytest.mark.parametrize("stripe_factors", [3, 4, 8, 16, 10])
+def test_generate_output_stripe_configs_disable_striping(stripe_factors):
subgraph = cs.TESubgraph([], None)
part_1 = cs.InlinePart(
subgraph,
@@ -48,13 +47,95 @@ def test_generate_output_stripe_configs():
tensor_2.add_producer(part_1)
assert (
- len(_generate_output_stripe_configs(part_1, stripe_factors, enable_striping=True))
- == expected_configs
+ len(
+ _generate_output_stripe_configs(
+ part_1, stripe_factors, enable_striping=False, multi_dimensional=False
+ )
+ )
+ == 1
)
-@pytest.mark.parametrize("stripe_factors", [3, 4, 8, 16, 10])
-def test_generate_output_stripe_configs_disable_striping(stripe_factors):
+def test_generate_output_stripe_configs_multi_dimensional():
+ stripe_factors = 3
+ subgraph = cs.TESubgraph([], None)
+ part_1 = cs.InlinePart(
+ subgraph,
+ [
+ cs.Propagator(
+ [[2, 0, 0], [0, 2, 0], [0, 0, 1]],
+ [0, 0],
+ ),
+ ],
+ )
+ tensor_1 = cs.Tensor([800, 800], "uint8")
+ tensor_2 = cs.Tensor([400, 400], "uint8")
+
+ part_1.set_input(0, tensor_1)
+ part_1.set_output(tensor_2)
+ tensor_1.add_consumer(part_1)
+ tensor_2.add_producer(part_1)
+
+ expected_stripe_configs = {
+ cs.StripeConfig([1, 1], [400, 400], [1, 1], [1, 2], [400, 400], [0, 0]),
+ cs.StripeConfig([1, 1], [400, 400], [1, 1], [2, 1], [400, 400], [0, 0]),
+ cs.StripeConfig([200, 1], [400, 400], [200, 1], [1, 2], [2, 400], [0, 0]),
+ cs.StripeConfig([200, 1], [400, 400], [200, 1], [2, 1], [2, 400], [0, 0]),
+ cs.StripeConfig([400, 1], [400, 400], [400, 1], [2, 1], [1, 400], [0, 0]),
+ cs.StripeConfig([1, 200], [400, 400], [1, 200], [1, 2], [400, 2], [0, 0]),
+ cs.StripeConfig([1, 200], [400, 400], [1, 200], [2, 1], [400, 2], [0, 0]),
+ cs.StripeConfig([200, 200], [400, 400], [200, 200], [2, 1], [2, 2], [0, 0]),
+ cs.StripeConfig([200, 200], [400, 400], [200, 200], [1, 2], [2, 2], [0, 0]),
+ cs.StripeConfig([400, 200], [400, 400], [400, 200], [2, 1], [1, 2], [0, 0]),
+ cs.StripeConfig([1, 400], [400, 400], [1, 400], [1, 2], [400, 1], [0, 0]),
+ cs.StripeConfig([200, 400], [400, 400], [200, 400], [1, 2], [2, 1], [0, 0]),
+ cs.StripeConfig([400, 400], [400, 400], [400, 400], [1, 2], [1, 1], [0, 0]),
+ }
+
+ output_stripe_configs = _generate_output_stripe_configs(
+ part=part_1, stripe_factors=stripe_factors, enable_striping=True, multi_dimensional=True
+ )
+
+ assert len(output_stripe_configs) == len(expected_stripe_configs)
+ assert set(output_stripe_configs) == expected_stripe_configs
+
+
+def test_generate_output_stripe_configs_uncascadable_axis():
+ stripe_factors = 3
+ subgraph = cs.TESubgraph([], None)
+ part_1 = cs.InlinePart(
+ subgraph,
+ [
+ cs.Propagator(
+ [[2, 0, 0], [0, 0, 200], [0, 0, 1]],
+ [0, 0],
+ ),
+ ],
+ )
+ tensor_1 = cs.Tensor([800, 200], "uint8")
+ tensor_2 = cs.Tensor([400, 400], "uint8")
+
+ part_1.set_input(0, tensor_1)
+ part_1.set_output(tensor_2)
+ tensor_1.add_consumer(part_1)
+ tensor_2.add_producer(part_1)
+
+ expected_stripe_configs = {
+ cs.StripeConfig([1, 400], [400, 400], [1, 400], [1, 2], [400, 1], [0, 0]),
+ cs.StripeConfig([200, 400], [400, 400], [200, 400], [1, 2], [2, 1], [0, 0]),
+ cs.StripeConfig([400, 400], [400, 400], [400, 400], [1, 2], [1, 1], [0, 0]),
+ }
+
+ output_stripe_configs = _generate_output_stripe_configs(
+ part=part_1, stripe_factors=stripe_factors, enable_striping=True, multi_dimensional=True
+ )
+
+ assert len(output_stripe_configs) == len(expected_stripe_configs)
+ assert set(output_stripe_configs) == expected_stripe_configs
+
+
+def test_generate_output_stripe_configs_single_dimension():
+ stripe_factors = 3
subgraph = cs.TESubgraph([], None)
part_1 = cs.InlinePart(
subgraph,
@@ -73,7 +154,20 @@ def test_generate_output_stripe_configs_disable_striping(stripe_factors):
tensor_1.add_consumer(part_1)
tensor_2.add_producer(part_1)
- assert len(_generate_output_stripe_configs(part_1, stripe_factors, enable_striping=False)) == 1
+ expected_stripe_configs = {
+ cs.StripeConfig([400, 1], [400, 400], [400, 1], [2, 1], [1, 400], [0, 0]),
+ cs.StripeConfig([400, 200], [400, 400], [400, 200], [2, 1], [1, 2], [0, 0]),
+ cs.StripeConfig([1, 400], [400, 400], [1, 400], [1, 2], [400, 1], [0, 0]),
+ cs.StripeConfig([200, 400], [400, 400], [200, 400], [1, 2], [2, 1], [0, 0]),
+ cs.StripeConfig([400, 400], [400, 400], [400, 400], [1, 2], [1, 1], [0, 0]),
+ }
+
+ output_stripe_configs = _generate_output_stripe_configs(
+ part=part_1, stripe_factors=stripe_factors, enable_striping=True, multi_dimensional=False
+ )
+
+ assert len(output_stripe_configs) == len(expected_stripe_configs)
+ assert set(output_stripe_configs) == expected_stripe_configs
def test_generate_single_plans(SRAM, DRAM):
@@ -101,7 +195,10 @@ def test_generate_single_plans(SRAM, DRAM):
}
options = make_options(cascade_region=SRAM, stripe_factors=1)
output_stripe_configs = _generate_output_stripe_configs(
- part_1, options.stripe_factors, enable_striping=True
+ part_1,
+ options.stripe_factors,
+ enable_striping=True,
+ multi_dimensional=True,
)
plans = _generate_single_plans(part_1, output_stripe_configs, home_map, options)
for plan in plans:
diff --git a/tests/python/contrib/test_ethosu/cascader/test_scheduler.py b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
index 89b4b41b33..6ac188187e 100644
--- a/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
+++ b/tests/python/contrib/test_ethosu/cascader/test_scheduler.py
@@ -20,7 +20,6 @@ import tvm.contrib.ethosu.cascader as cs
from .infra import ethosu_enabled
-
if ethosu_enabled:
def test_cascade(
@@ -39,7 +38,11 @@ if ethosu_enabled:
max_proposals=64,
stripe_factors=4,
max_plan_size=10,
+ max_open_plans=8,
+ max_closed_plans=32,
always_copy_size=1024,
+ disable_pareto_plans=False,
+ disable_pareto_proposals=False,
)
cs.cascade(sch, te_graph, const_dict, options, SRAM, FLASH, [SRAM], device_config)
diff --git a/tests/python/contrib/test_ethosu/test_vela_api.py b/tests/python/contrib/test_ethosu/test_vela_api.py
index 662b35822c..e2e4b2cb3a 100644
--- a/tests/python/contrib/test_ethosu/test_vela_api.py
+++ b/tests/python/contrib/test_ethosu/test_vela_api.py
@@ -254,6 +254,19 @@ def test_get_optimal_block_config():
assert vela_api._get_optimal_block_config(test_case["test"]) == test_case["ref"]
+@pytest.mark.parametrize(
+ "block_config_str, expected_block_config",
+ [("4x4x8", vapi.NpuShape3D(4, 4, 8)), ("3x7x16", vapi.NpuShape3D(3, 7, 16))],
+)
+def test_force_block_config(block_config_str, expected_block_config):
+ config = {
+ "dev_force_block_config": block_config_str,
+ }
+ with tvm.transform.PassContext(config={"relay.ext.ethos-u.options": config}):
+ block_config = vela_api.get_optimal_block_config(None, vapi.NpuAccelerator.Ethos_U55_128)
+ assert block_config == expected_block_config
+
+
def test_compress_weights():
test_vecs = [
{