You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by cs...@apache.org on 2022/06/17 19:09:26 UTC
[tvm] branch main updated: [hexagon][testing] add max_pool2d benchmark (#11720)
This is an automated email from the ASF dual-hosted git repository.
csullivan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new c5465d8037 [hexagon][testing] add max_pool2d benchmark (#11720)
c5465d8037 is described below
commit c5465d8037196c7abbf3f599a120dd28e1d029bd
Author: Christian Convey <cc...@octoml.ai>
AuthorDate: Fri Jun 17 15:09:18 2022 -0400
[hexagon][testing] add max_pool2d benchmark (#11720)
- Add benchmarking framework for Hexagon maxpool-2d kernels,
and one (simple) kernel.
---
.../python/contrib/test_hexagon/benchmark_util.py | 76 +++++
.../test_hexagon/test_benchmark_maxpool2d.py | 351 +++++++++++++++++++++
2 files changed, 427 insertions(+)
diff --git a/tests/python/contrib/test_hexagon/benchmark_util.py b/tests/python/contrib/test_hexagon/benchmark_util.py
index 35fe6bad64..e581c3d55d 100644
--- a/tests/python/contrib/test_hexagon/benchmark_util.py
+++ b/tests/python/contrib/test_hexagon/benchmark_util.py
@@ -17,6 +17,9 @@
import csv
import os
+import pytest
+import tempfile
+import collections
def skip_bencharks_flag_and_reason():
@@ -52,6 +55,20 @@ class NumericalAccuracyException(Exception):
"""
+class UnsupportedException(Exception):
+ """
+ Indicates that the specified benchmarking configuration is known to
+ currently be unsupported. The Exception message may provide more detail.
+ """
+
+
+class NumericalAccuracyException(Exception):
+ """
+ Indicates that the benchmarking configuration appeared to run successfully,
+ but the output data didn't have the expected accuracy.
+ """
+
+
class BenchmarksTable:
"""
Stores/reports the result of benchmark runs.
@@ -207,3 +224,62 @@ def get_benchmark_decription(keys_dict):
other characters that make it unsuitable for use as a filename.
"""
return " ".join([f"{k}={v}" for k, v in keys_dict.items()])
+
+
+# This fixture provides some initialization / finalization logic for groups of related
+# benchmark runs.
+# See the fixture implementation below for details.
+#
+# The fixture's mechanics are described here: https://stackoverflow.com/a/63047695
+#
+# TODO: There may be cleaner ways to let each class that uses this fixture provide its
+# own value for `csv_column_order`.
+#
+# TODO: In the future we may wish to break this fixture up in to several smaller ones.
+#
+# The overall contract for a class (e.g. `MyTest`) using this fixture is as follows:
+#
+# https://stackoverflow.com/a/63047695
+#
+# @pytest.mark.usefixtures("benchmark_group")
+# class MyTest:
+#
+# # The fixture requires that this class variable is defined before
+# # the fixture's finalizer-logic executes.
+# #
+# # This is used as an argument to BenchmarkTable.print_csv(...) after
+# # all of MyTest's unit tests have executed.
+# csv_column_order = [
+# ...
+# ]
+#
+# # Before the MyTest's first unit test executes, the fixture will populate the
+# # following class variables:
+# MyTest.working_dir : str
+# MyTest.benchmark_table : BenchmarkTable
+@pytest.fixture(scope="class")
+def benchmark_group(request):
+ working_dir = tempfile.mkdtemp()
+ bt = BenchmarksTable()
+
+ request.cls.working_dir = working_dir
+ request.cls.benchmark_table = bt
+
+ yield
+
+ tabular_output_filename = os.path.join(working_dir, "benchmark-results.csv")
+
+ if not hasattr(request.cls, "csv_column_order"):
+ raise Exception('Classes using this fixture must have a member named "csv_column_order"')
+
+ with open(tabular_output_filename, "w") as csv_file:
+ bt.print_csv(csv_file, request.cls.csv_column_order)
+
+ print()
+ print("*" * 80)
+ print(f"BENCHMARK RESULTS FILE: {tabular_output_filename}")
+ print("*" * 80)
+ print()
+
+ if bt.has_fail() > 0:
+ pytest.fail("At least one benchmark configuration failed", pytrace=False)
diff --git a/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
new file mode 100644
index 0000000000..4116949441
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_benchmark_maxpool2d.py
@@ -0,0 +1,351 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+This module serves two purposes:
+ (1) Demonstrates how to write Python code that exercises various
+ Hexagon-related algorithms / features.
+
+ (2) Benchmark the resulting primfuncs.
+
+Current limitations:
+ - Input shapes are limited to NHWC --> NHWC_8h8w32c.
+
+ - Testing parameters (input shapes, dtypes, etc.) currently
+ support only one value for each parameter.
+
+ - H, W, C must be integer multiples of 8, 8, and 32,
+ respectively. I.e., partial blocks aren't currently
+ supported by this script.
+
+ - Requires that I/O tensors reside in "global.VTCM" memory,
+ rather than "global" memory.
+ This prevents benchmarking with I/O tensors that are too
+ large to fit into availble VTCM.
+
+ - The script only develops one primfunc.
+ Future revisions to this script are expected to add more
+ primfuncs and demonstrate more coding strategies.
+"""
+
+import sys
+import pytest
+import numpy as np
+import copy
+import os
+
+import tvm.testing
+from tvm import te, topi, tir
+from tvm.topi import testing
+from tvm.script import tir as T
+from tvm.tir import IndexMap
+from tvm.relay.backend import Executor, Runtime
+from tvm.contrib.hexagon.session import Session
+from typing import List
+
+from .infrastructure import allocate_hexagon_array
+from . import benchmark_util as bu
+from .benchmark_util import benchmark_group
+
+_SHOULD_SKIP_BENCHMARKS, _SKIP_BENCHMARKS_REASON = bu.skip_bencharks_flag_and_reason()
+
+
+def _ceil_div(numerator, denominator):
+ return (numerator + (denominator - 1)) // denominator
+
+
+def _int8_nhwc_8h8w32c_map(n, h, w, c):
+ return [
+ n,
+ h // 8,
+ w // 8,
+ c // 32,
+ te.AXIS_SEPARATOR,
+ h % 8,
+ w % 8,
+ c % 32,
+ ]
+
+
+def _int8_nhwc_8h8w32c_shape(n, h, w, c) -> List[int]:
+ return [
+ n,
+ _ceil_div(h, 8),
+ _ceil_div(w, 8),
+ _ceil_div(c, 32),
+ 8,
+ 8,
+ 32,
+ ]
+
+
+def _int8_nhwc_8h8w32c_xform_immediate(arr_in: np.ndarray) -> np.ndarray:
+ """
+ Return a deep copy of 'arr_in', transformed from a NWHC to
+ NHWC-8h8wc32 shape. Any newly created array elements have value 0.
+ """
+ stage1 = copy.copy(arr_in)
+
+ (
+ n,
+ h,
+ w,
+ c,
+ ) = stage1.shape
+
+ (
+ h_minor,
+ w_minor,
+ c_minor,
+ ) = [8, 8, 32]
+
+ h_major = _ceil_div(h, h_minor)
+ w_major = _ceil_div(w, w_minor)
+ c_major = _ceil_div(c, c_minor)
+
+ # This handles cases where the dimensions of arr_in are not cleanly divided
+ # by the minor block size, i.e. [8, 8, 32].
+ #
+ # Any additional array elements that this creates will ahve value 0.
+ # We shouldn't actually care what value is used for those elements, because they
+ # shouldn't be treated as meaningful by any of our algorithms.
+ if (h % h_minor) or (w % w_minor) or (c % c_minor):
+ stage1.resize((n, h_major * h_minor, w_major * w_minor, c_major * c_minor), refcheck=False)
+
+ stage2 = stage1.reshape(n, h_major, h_minor, w_major, w_minor, c_major, c_minor)
+ stage3 = stage2.transpose(0, 1, 3, 5, 2, 4, 6)
+ return stage3
+
+
+def _create_test_input(shape, dtype: str) -> np.ndarray:
+ np_dtype = np.dtype(dtype)
+ min_value = np.iinfo(np_dtype).min
+ max_value = np.iinfo(np_dtype).max
+ return np.random.randint(low=min_value, high=max_value, size=tuple(shape), dtype=np.int8)
+
+
+@pytest.mark.usefixtures("benchmark_group")
+class TestMaxPool2D:
+ csv_column_order = [
+ # Identifies which TE-compute / TIRScript is used as the basis for the
+ # benchmarked primfunc. Only needs to be meaningful to humans.
+ "basic_kernel",
+ # When applicable, indicates the particular variation of schedules
+ # apply by the Python code. Decoding this may require looking at this
+ # script's source code.
+ "sched_type",
+ # Values directly based on test parameters...
+ "input_shape_4d",
+ "block_shape",
+ "DTYPE",
+ "KERNEL",
+ "STRIDE",
+ "DILATION",
+ "PADDING",
+ "IO_TENSOR_MEM_SCOPE",
+ # Reserved columns defined by the BenchmarksTable class.
+ "row_status",
+ "timings_min_usecs",
+ "timings_max_usecs",
+ "timings_median_usecs",
+ "timings_mean_usecs",
+ "timings_stddev_usecs",
+ # For benchmarks that produce files on the host file system, this indicates
+ # their location. Useful for post-mortem investigation of benchmark results.
+ "host_files_dir_path",
+ # Miscellaneous comments about the benchmark.
+ "comments",
+ ]
+
+ DTYPE = tvm.testing.parameter("int8")
+
+ # FIXME(cconvey): The script currently fails when H, W, or C is not an
+ # integer multiple of 8, 8, or 32, respectively.
+ N = tvm.testing.parameter(1)
+ H = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]])
+ W = tvm.testing.parameter(*[x * 8 for x in [1, 4, 16]])
+ C = tvm.testing.parameter(*[x * 32 for x in [1, 2]])
+
+ KERNEL = tvm.testing.parameter((1, 1), (3, 3))
+ STRIDE = tvm.testing.parameter((1, 1))
+ DILATION = tvm.testing.parameter((1, 1))
+ PADDING = tvm.testing.parameter((0, 0, 0, 0))
+ IO_TENSOR_MEM_SCOPE = tvm.testing.parameter("global.vtcm")
+
+ @pytest.mark.skipif(_SHOULD_SKIP_BENCHMARKS, reason=_SKIP_BENCHMARKS_REASON)
+ @tvm.testing.requires_hexagon
+ def test_maxpool2d_nhwc(
+ self,
+ N,
+ H,
+ W,
+ C,
+ DTYPE,
+ KERNEL,
+ STRIDE,
+ DILATION,
+ PADDING,
+ IO_TENSOR_MEM_SCOPE,
+ hexagon_session: Session,
+ ):
+ keys_dict = {
+ "basic_kernel": "max_pool2d",
+ "sched_type": 1,
+ "input_shape_4d": [N, H, W, C],
+ "block_shape": [8, 8, 32],
+ "DTYPE": DTYPE,
+ "KERNEL": KERNEL,
+ "STRIDE": STRIDE,
+ "DILATION": DILATION,
+ "PADDING": PADDING,
+ "IO_TENSOR_MEM_SCOPE": IO_TENSOR_MEM_SCOPE,
+ }
+
+ desc = bu.get_benchmark_decription(keys_dict)
+
+ # Create the host-side directory for this benchmark run's files / logs...
+ host_files_dir_name = bu.get_benchmark_id(keys_dict)
+ host_files_dir_path = os.path.join(self.working_dir, host_files_dir_name)
+ os.mkdir(host_files_dir_path)
+
+ keys_dict["host_files_dir_path"] = host_files_dir_path
+
+ log_file_path = os.path.join(host_files_dir_path, "out.txt")
+ with open(log_file_path, "w") as log_file:
+ print(f"CONFIGURATION: {desc}")
+ log_file.write(f"CONFIGURATION: {desc}\n")
+
+ try:
+ input_tensor_shape_4d = [N, H, W, C]
+ input_tensor_shape_7d = _int8_nhwc_8h8w32c_shape(N, H, W, C)
+
+ data = te.placeholder(tuple(input_tensor_shape_4d), dtype=DTYPE)
+
+ output = topi.nn.pool2d(
+ data, KERNEL, STRIDE, DILATION, PADDING, "max", layout="NHWC"
+ )
+ primfunc = te.create_prim_func([data, output])
+
+ sch = tir.Schedule(primfunc, debug_mask="all")
+
+ sch.transform_layout(
+ block="tensor", buffer="placeholder", index_map=_int8_nhwc_8h8w32c_map
+ )
+
+ target_hexagon = tvm.target.hexagon("v69", link_params=True)
+ # func = tvm.build(sch.mod, target=tvm.target.Target(target_hexagon, host=target_hexagon))
+ built_module = tvm.build(
+ sch.mod, target=tvm.target.Target(target_hexagon, host=target_hexagon)
+ )
+
+ # Save a local copy of the Hexagon object code (in the form of a .so file)
+ # to allow post-mortem inspection.
+ host_dso_binary_path = os.path.join(host_files_dir_path, "test_binary.so")
+ built_module.save(host_dso_binary_path)
+ print(f"SAVED BINARY TO HOST PATH: {host_dso_binary_path}")
+
+ hexagon_mod = hexagon_session.load_module(built_module)
+
+ # Generate the input tensor's data.
+ # Note that we'll eventually need it in two different layouts:
+ # (1) NHWC as an argument to testing.poolnd_python.
+ # (2) NHWC_8h8w32c for as an argument to our Hexagon primfunc.
+ # a_numpy_4d = np.random.randint(low=-128, high=127, size=input_tensor_shape_4d, dtype=np.int8)
+ a_numpy_4d = _create_test_input(input_tensor_shape_4d, DTYPE)
+
+ ref_output_4d = testing.poolnd_python(
+ a_numpy_4d.astype("int32"),
+ KERNEL,
+ STRIDE,
+ DILATION,
+ PADDING[0:2],
+ PADDING[2:],
+ pool_type="max",
+ dtype="int32",
+ layout="NHWC",
+ ).astype(DTYPE)
+
+ output_tensor_shape_4d = ref_output_4d.shape
+
+ a_numpy_7d = _int8_nhwc_8h8w32c_xform_immediate(a_numpy_4d)
+
+ a_hexagon_7d = allocate_hexagon_array(
+ hexagon_session.device,
+ tensor_shape=input_tensor_shape_7d,
+ axis_separators=[4],
+ dtype=DTYPE,
+ mem_scope=IO_TENSOR_MEM_SCOPE,
+ )
+
+ c_hexagon_4d = allocate_hexagon_array(
+ hexagon_session.device,
+ tensor_shape=output_tensor_shape_4d,
+ axis_separators=[],
+ dtype=DTYPE,
+ mem_scope=IO_TENSOR_MEM_SCOPE,
+ )
+
+ a_hexagon_7d.copyfrom(a_numpy_7d)
+
+ if DTYPE == "int8":
+ rel_tolerance = 0
+ abs_tolerance = 0
+ else:
+ assert False, f"TODO: decide acceptable tolerances for DTYPE {DTYPE}"
+
+ # hexagon_mod(a_hexagon_7d, c_hexagon_4d)
+ # tvm.testing.assert_allclose(ref_output_4d, c_hexagon_4d.numpy(), rtol=rel_tolerance, atol=abs_tolerance)
+
+ timer = hexagon_mod.time_evaluator(
+ "main", hexagon_session.device, number=10, repeat=1
+ )
+ timing_result = timer(a_hexagon_7d, c_hexagon_4d)
+
+ try:
+ tvm.testing.assert_allclose(
+ ref_output_4d, c_hexagon_4d.numpy(), rtol=rel_tolerance, atol=abs_tolerance
+ )
+ except AssertionError as e:
+ raise bu.NumericalAccuracyException(str(e))
+
+ except bu.NumericalAccuracyException as e:
+ print()
+ print(f"FAIL: Numerical accuracy error. See log file.")
+
+ log_file.write("\n")
+ log_file.write(f"FAIL: {e}\n")
+
+ self.benchmark_table.record_fail(
+ **keys_dict, comments=f"Numerical accuracy error. See log file."
+ )
+
+ except bu.UnsupportedException as e:
+ print()
+ print(f"SKIP: {e}")
+
+ log_file.write("\n")
+ log_file.write(f"SKIP: {e}\n")
+
+ self.benchmark_table.record_skip(
+ **keys_dict, comments=f"Unsupported configuration: {e}"
+ )
+
+ self.benchmark_table.record_success(timing_result, **keys_dict)
+
+
+if __name__ == "__main__":
+ tvm.testing.main()