You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by ma...@apache.org on 2022/04/28 01:03:36 UTC
[tvm] branch main updated: [Graph Debugger] Expose way to benchmark individual nodes. (#11000)
This is an automated email from the ASF dual-hosted git repository.
masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 9fd279b40a [Graph Debugger] Expose way to benchmark individual nodes. (#11000)
9fd279b40a is described below
commit 9fd279b40aa77677e9ba698ca34c8062b55cb3e8
Author: AndrewZhaoLuo <an...@gmail.com>
AuthorDate: Wed Apr 27 18:03:29 2022 -0700
[Graph Debugger] Expose way to benchmark individual nodes. (#11000)
* initial
* secondary commit
* docs
* match tests
* fix test
* use std::fixed, max precision, typed pack func, fix isnan
* comments on docs
* address tristan comments
* add test
* tristan comments
* use skipif
* empty commit
* empty commit
* jostle again
* remove assert statement
---
python/tvm/contrib/debugger/debug_executor.py | 55 +++++++++-
.../graph_executor/debug/graph_executor_debug.cc | 113 ++++++++++++--------
tests/python/unittest/test_runtime_graph_debug.py | 114 +++++++++++++++++----
3 files changed, 214 insertions(+), 68 deletions(-)
diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py
index 12152e9de1..f144b3cb4a 100644
--- a/python/tvm/contrib/debugger/debug_executor.py
+++ b/python/tvm/contrib/debugger/debug_executor.py
@@ -16,16 +16,18 @@
# under the License.
"""Graph debug runtime executes TVM debug packed functions."""
+import logging
import os
-import tempfile
import shutil
-import logging
-import tvm._ffi
+import tempfile
+import tvm._ffi
from tvm._ffi.base import string_types
from tvm.contrib import graph_executor
-from . import debug_result
+from tvm.runtime.module import BenchmarkResult
+
from ...runtime.profiling import Report
+from . import debug_result
_DUMP_ROOT_PREFIX = "tvmdbg_"
_DUMP_PATH_PREFIX = "_tvmdbg_"
@@ -111,6 +113,7 @@ class GraphModuleDebug(graph_executor.GraphModule):
self._dump_root = dump_root
self._dump_path = None
self._run_individual = module["run_individual"]
+ self._run_individual_node = module["run_individual_node"]
self._debug_get_output = module["debug_get_output"]
self._execute_node = module["execute_node"]
self._get_node_output = module["get_node_output"]
@@ -223,7 +226,6 @@ class GraphModuleDebug(graph_executor.GraphModule):
"""Execute the node specified with index will be executed.
Each debug output will be copied to the buffer
Time consumed for each execution will be set as debug output.
-
"""
# Get timing.
self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)]
@@ -281,6 +283,49 @@ class GraphModuleDebug(graph_executor.GraphModule):
ret = self._run_individual(number, repeat, min_repeat_ms)
return ret.strip(",").split(",") if ret else []
+ def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0):
+ """Benchmark a single node in the serialized graph.
+
+ This does not do any data transfers and uses arrays already on the device.
+
+ Parameters
+ ----------
+ index : int
+ The index of the node, see `self.debug_datum.get_graph_nodes`
+
+ number: int
+ The number of times to run this function to take an average.
+ We call these runs one `repeat` of measurement.
+
+ repeat: int, optional
+ The number of times to repeat the measurement.
+ In total, the function will be invoked (1 + number x repeat) times,
+ where the first one is warm up and will be discarded.
+ The returned result contains `repeat` costs,
+ each of which is an average of `number` costs.
+
+ min_repeat_ms: int, optional
+ The minimum duration of one `repeat` in milliseconds.
+ By default, one `repeat` contains `number` runs. If this parameter is set,
+ the parameter `number` will be dynamically adjusted to meet the
+ minimum duration requirement of one `repeat`.
+ i.e., When the run time of one `repeat` falls below this time, the `number` parameter
+ will be automatically increased.
+
+ Returns
+ -------
+ A module BenchmarkResult
+ """
+ # Results are returned as serialized strings which we deserialize
+ ret = self._run_individual_node(index, number, repeat, min_repeat_ms)
+ answer = []
+ for value in ret.split(","):
+ if value.strip() == "":
+ continue
+ answer.append(float(value))
+
+ return BenchmarkResult(answer)
+
def profile(self, collectors=None, **input_dict):
"""Run forward execution of the graph and collect overall and per-op
performance metrics.
diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc
index 12a739722a..cf7a4cd049 100644
--- a/src/runtime/graph_executor/debug/graph_executor_debug.cc
+++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc
@@ -27,8 +27,10 @@
#include <tvm/runtime/registry.h>
#include <chrono>
+#include <cmath>
#include <sstream>
+#include "../../rpc/rpc_session.h"
#include "../graph_executor.h"
namespace tvm {
@@ -67,44 +69,14 @@ class GraphExecutorDebug : public GraphExecutor {
time_sec_per_op[index] += RunOpRPC(index, number, repeat, min_repeat_ms);
}
} else {
- for (int i = 0; i < repeat; ++i) {
- std::chrono::time_point<std::chrono::high_resolution_clock, std::chrono::nanoseconds>
- tbegin, tend;
- double duration_ms = 0.0;
- do {
- std::fill(time_sec_per_op.begin(), time_sec_per_op.end(), 0);
- if (duration_ms > 0.0) {
- number = static_cast<int>(std::max((min_repeat_ms / (duration_ms / number) + 1),
- number * 1.618)); // 1.618 is chosen by random
- }
- tbegin = std::chrono::high_resolution_clock::now();
- std::vector<std::vector<Timer>> op_timers;
- for (size_t index = 0; index < op_execs_.size(); index++) {
- op_timers.push_back({});
- }
- for (int k = 0; k < number; k++) {
- for (size_t index = 0; index < op_execs_.size(); ++index) {
- if (op_execs_[index]) {
- op_timers[index].push_back(RunOpHost(index));
- }
- }
- }
- for (size_t index = 0; index < op_execs_.size(); ++index) {
- for (auto t : op_timers[index]) {
- time_sec_per_op[index] += t->SyncAndGetElapsedNanos() / 1e9;
- }
- }
- tend = std::chrono::high_resolution_clock::now();
- duration_ms =
- std::chrono::duration_cast<std::chrono::duration<double>>(tend - tbegin).count() *
- 1000;
- } while (duration_ms < min_repeat_ms);
-
- LOG(INFO) << "Iteration: " << i;
- int op = 0;
- for (size_t index = 0; index < time_sec_per_op.size(); index++) {
+ for (size_t index = 0; index < op_execs_.size(); ++index) {
+ std::vector<double> results = RunIndividualNode(index, number, repeat, min_repeat_ms);
+ for (size_t cur_repeat = 0; cur_repeat < results.size(); cur_repeat++) {
+ time_sec_per_op[index] = results[cur_repeat];
+
+ LOG(INFO) << "Iteration: " << cur_repeat;
+ int op = 0;
if (op_execs_[index]) {
- time_sec_per_op[index] /= number;
LOG(INFO) << "Op #" << op++ << " " << GetNodeName(index) << ": "
<< time_sec_per_op[index] * 1e6 << " us/iter";
}
@@ -114,15 +86,50 @@ class GraphExecutorDebug : public GraphExecutor {
std::ostringstream os;
for (size_t index = 0; index < time_sec_per_op.size(); index++) {
- os << time_sec_per_op[index] << ",";
+ double time = time_sec_per_op[index];
+ // To have good behavior when calculating total time, etc.
+ if (std::isnan(time)) {
+ time = 0;
+ }
+ os << time << ",";
}
return os.str();
}
+ std::vector<double> RunIndividualNode(int node_index, int number, int repeat, int min_repeat_ms) {
+ std::string tkey = module_->type_key();
+
+ // results_in_seconds[i] is the measured time in seconds for the ith repeat
+ std::vector<double> results_in_seconds(repeat, 0);
+
+ if (tkey == "rpc") {
+ LOG(FATAL) << "RPC measurements should not use RunIndividualNode!";
+ }
+
+ if (!op_execs_[node_index]) {
+ // don't return anything...
+ return results_in_seconds;
+ }
+
+ // assume the host (the first device) runs the ops
+ Device& d = devices_[0];
+ PackedFunc time_evaluator = WrapTimeEvaluator(
+ TypedPackedFunc<void()>([this, node_index]() { this->RunOpHost(node_index); }), d, number,
+ repeat, min_repeat_ms);
+ std::string result = time_evaluator();
+ const double* results_arr = reinterpret_cast<const double*>(result.data());
+ size_t double_bytes = sizeof(double);
+ for (size_t i = 0; i < result.size() / double_bytes; i++) {
+ results_in_seconds[i] = results_arr[i];
+ }
+ return results_in_seconds;
+ }
+
double RunOpRPC(int index, int number, int repeat, int min_repeat_ms) {
- // Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes which
- // represent inputs/parameters to the graph. Other types may be supported in the future, but
- // consideration would be needed as to how to do that over RPC before we support it here.
+ // Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes
+ // which represent inputs/parameters to the graph. Other types may be supported in the
+ // future, but consideration would be needed as to how to do that over RPC before we support
+ // it here.
if (nodes_[index].op_type != "tvm_op") {
CHECK_EQ(nodes_[index].op_type, "null")
<< "Don't know how to run op type " << nodes_[index].op_type
@@ -362,6 +369,30 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name,
ICHECK_GE(min_repeat_ms, 0);
*rv = this->RunIndividual(number, repeat, min_repeat_ms);
});
+ } else if (name == "run_individual_node") {
+ return TypedPackedFunc<std::string(int, int, int, int)>(
+ [sptr_to_self, this](int node_index, int number, int repeat, int min_repeat_ms) {
+ ICHECK_GE(node_index, 0);
+ ICHECK_LT(node_index, nodes_.size());
+ ICHECK_GT(number, 0);
+ ICHECK_GT(repeat, 0);
+ ICHECK_GE(min_repeat_ms, 0);
+ std::vector<double> results =
+ this->RunIndividualNode(node_index, number, repeat, min_repeat_ms);
+
+ // Have problems returning FloatImm so serialize to string results as hack.
+ std::stringstream s;
+
+ // use maximum precision available and use fixed representation
+ s << std::fixed;
+ s.precision(std::numeric_limits<double>::max_digits10);
+
+ for (double cur : results) {
+ s << cur << ", ";
+ }
+
+ return s.str();
+ });
} else if (name == "profile") {
return TypedPackedFunc<profiling::Report(Array<profiling::MetricCollector>)>(
[sptr_to_self, this](Array<profiling::MetricCollector> collectors) {
diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py
index cadc8ae6a4..9d7bedecab 100644
--- a/tests/python/unittest/test_runtime_graph_debug.py
+++ b/tests/python/unittest/test_runtime_graph_debug.py
@@ -19,26 +19,56 @@ import os
import re
import sys
import time
+from distutils.log import debug
+import numpy as np
import pytest
-
import tvm
import tvm.testing
-from tvm import te
-import numpy as np
-from tvm import rpc
+from tvm import rpc, te
+from tvm._ffi.base import TVMError
from tvm.contrib import utils
from tvm.contrib.debugger import debug_executor
-@tvm.testing.requires_llvm
-@tvm.testing.requires_rpc
-def test_graph_simple():
- n = 4
- A = te.placeholder((n,), name="A")
- B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
- s = te.create_schedule(B.op)
+# Constants for creating simple graphs, fixtures to avoid free globals
+@pytest.fixture
+def n():
+ return 4
+
+
+@pytest.fixture
+def A(n):
+ return te.placeholder((n,), name="A")
+
+
+@pytest.fixture
+def B(A):
+ return te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
+
+@pytest.fixture
+def s(B):
+ return te.create_schedule(B.op)
+
+
+@pytest.fixture
+def mlib(s, A, B):
+ return tvm.build(s, [A, B], "llvm", name="myadd")
+
+
+@pytest.fixture
+def myadd(mlib):
+ def _myadd(*args):
+ to_return = mlib["myadd"](*args)
+ time.sleep(0.25)
+ return to_return
+
+ return _myadd
+
+
+@pytest.fixture
+def graph():
node0 = {"op": "null", "name": "x", "inputs": []}
node1 = {
"op": "tvm_op",
@@ -64,21 +94,19 @@ def test_graph_simple():
"attrs": attrs,
}
graph = json.dumps(graph)
+ return graph
- def check_verify():
- mlib = tvm.build(s, [A, B], "llvm", name="myadd")
-
- def myadd(*args):
- to_return = mlib["myadd"](*args)
- time.sleep(0.25)
- return to_return
+@tvm.testing.requires_llvm
+@tvm.testing.requires_rpc
+@pytest.mark.skipif(
+ tvm.support.libinfo()["USE_PROFILER"] != "ON", reason="TVM was not built with profiler support"
+)
+def test_end_to_end_graph_simple(graph, n, A, B, s, myadd):
+ def check_verify():
mlib_proxy = tvm.support.FrontendTestModule()
mlib_proxy["myadd"] = myadd
- try:
- mod = debug_executor.create(graph, mlib_proxy, tvm.cpu(0))
- except ValueError:
- return
+ mod = debug_executor.create(graph, mlib_proxy, tvm.cpu(0))
a = np.random.uniform(size=(n,)).astype(A.dtype)
mod.set_input(x=a)
@@ -185,5 +213,47 @@ def test_graph_simple():
check_remote(rpc.Server("127.0.0.1"))
+@tvm.testing.requires_llvm
+@pytest.mark.skipif(
+ tvm.support.libinfo()["USE_PROFILER"] != "ON", reason="TVM was not built with profiler support"
+)
+def test_run_single_node(graph, n, A, myadd):
+ mlib_proxy = tvm.support.FrontendTestModule()
+ mlib_proxy["myadd"] = myadd
+ mod: debug_executor.GraphModuleDebug = debug_executor.create(graph, mlib_proxy, tvm.cpu(0))
+
+ a = np.random.uniform(size=(n,)).astype(A.dtype)
+ mod.set_input(x=a)
+
+ assert len(mod.debug_datum.get_graph_nodes()) == 2
+ assert mod.debug_datum.get_graph_nodes()[0]["op"] == "param"
+ assert mod.debug_datum.get_graph_nodes()[1]["op"] == "myadd"
+
+ # Running a node with no associated function should return instantly and have 0 runtime
+ assert mod.run_individual_node(0, number=1).mean == 0
+
+ # Meanwhile the actual function should take some time, more time if you run it more times
+ repeat_1_result = mod.run_individual_node(1, repeat=1)
+ assert repeat_1_result.mean > 0
+
+ # Running multiple times (10) should take longer than 1 time
+ repeat_3_results = mod.run_individual_node(1, repeat=3)
+ assert sum(repeat_3_results.results) > sum(repeat_1_result.results)
+
+ # Increasing the number of repeats should give you the number of results asked for
+ assert len(mod.run_individual_node(1, repeat=10).results) == 10
+
+ # Doing repeat_ms should have the run time greater than the asked amount
+ start = time.time()
+ mod.run_individual_node(1, min_repeat_ms=500)
+ end = time.time()
+ elapsed_time_in_seconds = end - start
+ assert elapsed_time_in_seconds >= 0.5
+
+ # Going out of bounds of node index throws a tvm error
+ with pytest.raises(TVMError):
+ mod.run_individual_node(2)
+
+
if __name__ == "__main__":
sys.exit(pytest.main([__file__] + sys.argv[1:]))