Posted to commits@tvm.apache.org by an...@apache.org on 2022/06/06 15:49:26 UTC

[tvm] branch main updated: [PROFILER] Add configuration information to profiler (#11530)

This is an automated email from the ASF dual-hosted git repository.

andrewzhaoluo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 9d6599c928 [PROFILER] Add configuration information to profiler (#11530)
9d6599c928 is described below

commit 9d6599c928ec4de1aede59927fcc5f651096e358
Author: Tristan Konolige <tk...@octoml.ai>
AuthorDate: Mon Jun 6 08:49:22 2022 -0700

    [PROFILER] Add configuration information to profiler (#11530)
    
    Configuration is a place to store extra information related to a
    specific profiling run. Right now it contains just the executor used
    and the number of threads. The roofline analysis also adds estimated
    peak FLOP/s and peak bandwidth.
---
 include/tvm/runtime/profiling.h                    |  17 +++-
 python/tvm/runtime/profiling/__init__.py           |  11 +-
 python/tvm/utils/roofline.py                       |   5 +-
 src/node/structural_hash.cc                        |   1 +
 .../graph_executor/debug/graph_executor_debug.cc   |   2 +-
 src/runtime/profiling.cc                           | 111 ++++++++++++++-------
 src/runtime/vm/profiler/vm.cc                      |   6 +-
 tests/python/unittest/test_runtime_profiling.py    |   3 +
 8 files changed, 109 insertions(+), 47 deletions(-)
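
For illustration, here is a minimal sketch (not part of the commit itself) of how
the new configuration field can be read from Python. It assumes `exe` is a compiled
Relay VM executable, `dev` is the target device, and `data` is a suitable input;
only the report accesses mirror what the tests in this commit exercise:

    import json
    from tvm.runtime import profiler_vm

    # Profile one run of "main" with the profiling VM.
    vm = profiler_vm.VirtualMachineProfiler(exe, dev)
    report = vm.profile(data, func_name="main")

    # New with this change: configuration metadata travels with the report.
    assert report.configuration["Executor"] == "VM"
    print(report.configuration["Number of threads"])

    # The configuration also round-trips through the JSON serialization.
    parsed = json.loads(report.json())
    assert "configuration" in parsed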

diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h
index 0163f0c2e4..83c26933be 100644
--- a/include/tvm/runtime/profiling.h
+++ b/include/tvm/runtime/profiling.h
@@ -25,6 +25,7 @@
 #define TVM_RUNTIME_PROFILING_H_
 
 #include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/container/map.h>
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/object.h>
 #include <tvm/runtime/packed_func.h>
@@ -192,6 +193,11 @@ class ReportNode : public Object {
    * because these metrics include the overhead of the executor.
    */
   Map<String, Map<String, ObjectRef>> device_metrics;
+  /*! Configuration used for this profiling run. Includes number of threads, executor.
+   *
+   * Values must be an object type that can be used with device_metrics.
+   */
+  Map<String, ObjectRef> configuration;
   /*! \brief Output `calls` in CSV format.
    *
    * Note that this does not include `device_metrics`, it only includes per-call metrics.
@@ -255,9 +261,11 @@ class Report : public ObjectRef {
   /*! Construct a Report from a set of calls (with associated metrics) and per-device metrics.
    * \param calls Function calls and associated metrics.
    * \param device_metrics Per-device metrics for overall execution.
+   * \param configuration Configuration data specific to this profiling run.
    */
   explicit Report(Array<Map<String, ObjectRef>> calls,
-                  Map<String, Map<String, ObjectRef>> device_metrics);
+                  Map<String, Map<String, ObjectRef>> device_metrics,
+                  Map<String, ObjectRef> configuration);
 
   /*! Deserialize a Report from a JSON object. Needed for sending the report over RPC.
    * \param json Serialized json report from `ReportNode::AsJSON`.
@@ -366,8 +374,10 @@ class Profiler {
    * \param devs The list of devices the profiler will be running on. Should
    *             include all devices used by profiled operators.
    * \param metric_collectors Additional `MetricCollector`s to use with this profiler.
+   * \param configuration Additional configuration data to add to the outputted profiling report.
    */
-  explicit Profiler(std::vector<Device> devs, std::vector<MetricCollector> metric_collectors);
+  explicit Profiler(std::vector<Device> devs, std::vector<MetricCollector> metric_collectors,
+                    std::unordered_map<String, ObjectRef> configuration = {});
   /*! \brief Start the profiler.
    *
    * This function should only be called once per object.
@@ -400,7 +410,7 @@ class Profiler {
    *  \returns A `Report` that can either be formatted as CSV (with `.AsCSV`)
    *  or as a human readable table (with `.AsTable`).
    */
-  profiling::Report Report(bool aggregate = true, bool sort = true);
+  profiling::Report Report();
   /*! \brief Check if the profiler is currently running.
    * \returns Whether or not the profiler is running.
    */
@@ -412,6 +422,7 @@ class Profiler {
   std::vector<CallFrame> calls_;
   std::stack<CallFrame> in_flight_;
   std::vector<MetricCollector> collectors_;
+  std::unordered_map<String, ObjectRef> configuration_;
 };
 
 /* \brief A duration in time. */
diff --git a/python/tvm/runtime/profiling/__init__.py b/python/tvm/runtime/profiling/__init__.py
index 5737790378..347d8b9f94 100644
--- a/python/tvm/runtime/profiling/__init__.py
+++ b/python/tvm/runtime/profiling/__init__.py
@@ -36,7 +36,10 @@ class Report(Object):
     """
 
     def __init__(
-        self, calls: Sequence[Dict[str, Object]], device_metrics: Dict[str, Dict[str, Object]]
+        self,
+        calls: Sequence[Dict[str, Object]],
+        device_metrics: Dict[str, Dict[str, Object]],
+        configuration: Dict[str, Object],
     ):
         """Construct a profiling report from a list of metrics and per-device metrics.
 
@@ -47,8 +50,12 @@ class Report(Object):
 
         device_metrics : Dict[str, Dict[str, Object]]
             Per device metrics.
+
+        configuration : Dict[str, Object]
+            Configuration of TVM for this profiling run. Includes number of
+            threads, executor.
         """
-        self.__init_handle_by_constructor__(_ffi_api.Report, calls, device_metrics)
+        self.__init_handle_by_constructor__(_ffi_api.Report, calls, device_metrics, configuration)
 
     def csv(self):
         """Convert this profiling report into CSV format.
diff --git a/python/tvm/utils/roofline.py b/python/tvm/utils/roofline.py
index 8a17b9f003..6cfca81c5c 100644
--- a/python/tvm/utils/roofline.py
+++ b/python/tvm/utils/roofline.py
@@ -400,7 +400,10 @@ def roofline_from_existing(
             new_calls.append(call)
         else:
             new_calls.append(call)
-    return profiling.Report(new_calls, report.device_metrics)
+    new_configuration = dict(report.configuration.items())
+    new_configuration["Estimated Peak FMA FLOP/s"] = profiling.Ratio(peak_flops)
+    new_configuration["Estimated Peak Bandwidth (byte/second)"] = profiling.Ratio(peak_bandwidth)
+    return profiling.Report(new_calls, report.device_metrics, new_configuration)
 
 
 def roofline_analysis(
diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc
index e97e5f41bf..23811e2190 100644
--- a/src/node/structural_hash.cc
+++ b/src/node/structural_hash.cc
@@ -521,6 +521,7 @@ struct ReportNodeTrait {
   static void VisitAttrs(runtime::profiling::ReportNode* report, AttrVisitor* attrs) {
     attrs->Visit("calls", &report->calls);
     attrs->Visit("device_metrics", &report->device_metrics);
+    attrs->Visit("configuration", &report->configuration);
   }
   static constexpr std::nullptr_t SEqualReduce = nullptr;
   static constexpr std::nullptr_t SHashReduce = nullptr;
diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc
index bd3b0db040..4a95015395 100644
--- a/src/runtime/graph_executor/debug/graph_executor_debug.cc
+++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc
@@ -294,7 +294,7 @@ class GraphExecutorDebug : public GraphExecutor {
    */
   profiling::Report Profile(Array<profiling::MetricCollector> collectors) {
     std::vector<profiling::MetricCollector> cs(collectors.begin(), collectors.end());
-    profiling::Profiler prof(devices_, cs);
+    profiling::Profiler prof(devices_, cs, {{String("Executor"), String("Graph")}});
 
     // warm up. 1 iteration does not seem enough.
     for (int i = 0; i < 3; i++) {
diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc
index 9499a6e7a5..9f95bf18f7 100644
--- a/src/runtime/profiling.cc
+++ b/src/runtime/profiling.cc
@@ -105,8 +105,9 @@ TVM_REGISTER_GLOBAL("profiling.start_timer").set_body_typed(Timer::Start);
 
 namespace profiling {
 
-Profiler::Profiler(std::vector<Device> devs, std::vector<MetricCollector> metric_collectors)
-    : devs_(devs), collectors_(metric_collectors) {
+Profiler::Profiler(std::vector<Device> devs, std::vector<MetricCollector> metric_collectors,
+                   std::unordered_map<String, ObjectRef> configuration)
+    : devs_(devs), collectors_(metric_collectors), configuration_(configuration) {
   is_running_ = false;
   std::vector<DeviceWrapper> wrapped_devs;
   for (auto dev : devs) {
@@ -117,6 +118,9 @@ Profiler::Profiler(std::vector<Device> devs, std::vector<MetricCollector> metric
   }
   // reset the thread pool so that PAPI eventset hooks are set in all threads.
   threading::ResetThreadPool();
+
+  configuration_[String("Number of threads")] =
+      ObjectRef(make_object<CountNode>(threading::NumThreads()));
 }
 
 void Profiler::Start() {
@@ -279,7 +283,7 @@ String ReportNode::AsCSV() const {
 }
 
 namespace {
-void print_metric(std::ostream& os, ObjectRef o) {
+void metric_as_json(std::ostream& os, ObjectRef o) {
   if (o.as<StringObj>()) {
     os << "{\"string\":"
        << "\"" << Downcast<String>(o) << "\""
@@ -309,13 +313,14 @@ String ReportNode::AsJSON() const {
   // value we want to print. Instead we construct the json by hand because it
   // is easier.
   s << "{";
+
   s << "\"calls\":[";
   for (size_t i = 0; i < calls.size(); i++) {
     size_t j = 0;
     s << "{";
     for (const auto& kv : calls[i]) {
       s << "\"" << kv.first << "\":";
-      print_metric(s, kv.second);
+      metric_as_json(s, kv.second);
       if (j < calls[i].size() - 1) {
         s << ",";
       }
@@ -326,7 +331,8 @@ String ReportNode::AsJSON() const {
       s << ",";
     }
   }
-  s << "],";
+  s << "],";  // end calls
+
   s << "\"device_metrics\":{";
   size_t i = 0;
   for (const auto& dev_kv : device_metrics) {
@@ -334,7 +340,7 @@ String ReportNode::AsJSON() const {
     s << "\"" << dev_kv.first << "\":{";
     for (const auto& metric_kv : dev_kv.second) {
       s << "\"" << metric_kv.first << "\":";
-      print_metric(s, metric_kv.second);
+      metric_as_json(s, metric_kv.second);
       if (j < dev_kv.second.size() - 1) {
         s << ",";
       }
@@ -346,7 +352,20 @@ String ReportNode::AsJSON() const {
     }
     i++;
   }
-  s << "}}";
+  s << "},";  // end device metrics
+
+  s << "\"configuration\":{";
+  size_t k = 0;
+  for (const auto& kv : configuration) {
+    s << "\"" << kv.first << "\":";
+    metric_as_json(s, kv.second);
+    if (k < configuration.size() - 1) {
+      s << ",";
+    }
+    k++;
+  }
+  s << "}";  // end configuration
+  s << "}";
   return s.str();
 }
 
@@ -395,6 +414,35 @@ ObjectRef AggregateMetric(const std::vector<ObjectRef>& metrics) {
   }
 }
 
+static String print_metric(ObjectRef metric) {
+  std::string val;
+  if (metric.as<CountNode>()) {
+    std::stringstream s;
+    s.imbue(std::locale(""));  // for 1000s seperators
+    s << std::fixed << metric.as<CountNode>()->value;
+    val = s.str();
+  } else if (metric.as<DurationNode>()) {
+    std::stringstream s;
+    s.imbue(std::locale(""));  // for 1000s seperators
+    s << std::fixed << std::setprecision(2) << metric.as<DurationNode>()->microseconds;
+    val = s.str();
+  } else if (metric.as<PercentNode>()) {
+    std::stringstream s;
+    s << std::fixed << std::setprecision(2) << metric.as<PercentNode>()->percent;
+    val = s.str();
+  } else if (metric.as<RatioNode>()) {
+    std::stringstream s;
+    s.imbue(std::locale(""));  // for 1000s seperators
+    s << std::setprecision(2) << metric.as<RatioNode>()->ratio;
+    val = s.str();
+  } else if (metric.as<StringObj>()) {
+    val = Downcast<String>(metric);
+  } else {
+    LOG(FATAL) << "Cannot print metric of type " << metric->GetTypeKey();
+  }
+  return val;
+}
+
 String ReportNode::AsTable(bool sort, bool aggregate, bool compute_col_sums) const {
   // aggregate calls by op hash (or op name if hash is not set) + argument shapes
   std::vector<Map<String, ObjectRef>> aggregated_calls;
@@ -533,30 +581,7 @@ String ReportNode::AsTable(bool sort, bool aggregate, bool compute_col_sums) con
         // fill empty data with empty strings
         cols[i].push_back("");
       } else {
-        std::string val;
-        if ((*it).second.as<CountNode>()) {
-          std::stringstream s;
-          s.imbue(std::locale(""));  // for 1000s seperators
-          s << std::fixed << (*it).second.as<CountNode>()->value;
-          val = s.str();
-        } else if ((*it).second.as<DurationNode>()) {
-          std::stringstream s;
-          s.imbue(std::locale(""));  // for 1000s seperators
-          s << std::fixed << std::setprecision(2) << (*it).second.as<DurationNode>()->microseconds;
-          val = s.str();
-        } else if ((*it).second.as<PercentNode>()) {
-          std::stringstream s;
-          s << std::fixed << std::setprecision(2) << (*it).second.as<PercentNode>()->percent;
-          val = s.str();
-        } else if ((*it).second.as<RatioNode>()) {
-          std::stringstream s;
-          s.imbue(std::locale(""));  // for 1000s seperators
-          s << std::setprecision(2) << (*it).second.as<RatioNode>()->ratio;
-          val = s.str();
-        } else if ((*it).second.as<StringObj>()) {
-          val = Downcast<String>((*it).second);
-        }
-        cols[i].push_back(val);
+        cols[i].push_back(print_metric((*it).second));
       }
     }
   }
@@ -592,6 +617,12 @@ String ReportNode::AsTable(bool sort, bool aggregate, bool compute_col_sums) con
     }
     s << std::endl;
   }
+
+  // Add configuration information. It will not be aligned with the columns.
+  s << std::endl << "Configuration" << std::endl << "-------------" << std::endl;
+  for (auto kv : configuration) {
+    s << kv.first << ": " << print_metric(kv.second) << std::endl;
+  }
   return s.str();
 }
 
@@ -599,7 +630,7 @@ std::string DeviceString(Device dev) {
   return DeviceName(dev.device_type) + std::to_string(dev.device_id);
 }
 
-Report Profiler::Report(bool aggregate, bool sort) {
+Report Profiler::Report() {
   // sync all timers and normalize rows
   std::vector<std::unordered_map<String, ObjectRef>> rows;
   for (auto& cf : calls_) {
@@ -638,14 +669,16 @@ Report Profiler::Report(bool aggregate, bool sort) {
     converted_rows.push_back(row);
   }
 
-  return profiling::Report(converted_rows, device_metrics);
+  return profiling::Report(converted_rows, device_metrics, configuration_);
 }
 
 Report::Report(Array<Map<String, ObjectRef>> calls,
-               Map<String, Map<String, ObjectRef>> device_metrics) {
+               Map<String, Map<String, ObjectRef>> device_metrics,
+               Map<String, ObjectRef> configuration) {
   auto node = make_object<ReportNode>();
   node->calls = std::move(calls);
   node->device_metrics = std::move(device_metrics);
+  node->configuration = std::move(configuration);
   data_ = std::move(node);
 }
 
@@ -697,6 +730,7 @@ Report Report::FromJSON(String json) {
   std::string key;
   Array<Map<String, ObjectRef>> calls;
   Map<String, Map<String, ObjectRef>> device_metrics;
+  Map<String, ObjectRef> configuration;
 
   reader.BeginObject();
   while (reader.NextObjectItem(&key)) {
@@ -713,10 +747,12 @@ Report Report::FromJSON(String json) {
         device_metrics.Set(device_name, parse_metrics(&reader));
       }
       // reader.EndObject();
+    } else if (key == "configuration") {
+      configuration = parse_metrics(&reader);
     }
   }
 
-  return Report(calls, device_metrics);
+  return Report(calls, device_metrics, configuration);
 }
 
 TVM_REGISTER_OBJECT_TYPE(DurationNode);
@@ -855,8 +891,9 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat,
 
 TVM_REGISTER_GLOBAL("runtime.profiling.Report")
     .set_body_typed([](Array<Map<String, ObjectRef>> calls,
-                       Map<String, Map<String, ObjectRef>> device_metrics) {
-      return Report(calls, device_metrics);
+                       Map<String, Map<String, ObjectRef>> device_metrics,
+                       Map<String, ObjectRef> configuration) {
+      return Report(calls, device_metrics, configuration);
     });
 
 TVM_REGISTER_GLOBAL("runtime.profiling.Count").set_body_typed([](int64_t count) {
diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc
index 393d1b3998..0ace910b5c 100644
--- a/src/runtime/vm/profiler/vm.cc
+++ b/src/runtime/vm/profiler/vm.cc
@@ -58,9 +58,9 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name,
           // on remotes, we accept a nullptr for collectors.
           if (collectors.defined()) {
             std::vector<profiling::MetricCollector> cs(collectors.begin(), collectors.end());
-            prof_ = profiling::Profiler(devices, cs);
+            prof_ = profiling::Profiler(devices, cs, {{String("Executor"), String("VM")}});
           } else {
-            prof_ = profiling::Profiler(devices, {});
+            prof_ = profiling::Profiler(devices, {}, {{String("Executor"), String("VM")}});
           }
 
           auto invoke = VirtualMachine::GetFunction("invoke", sptr_to_self);
@@ -77,7 +77,7 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name,
           return report;
         });
   } else if (name == "profile_rpc") {
-    // We cannot return a Report over RPC because TMV RPC mechanism only
+    // We cannot return a Report over RPC because TVM RPC mechanism only
     // supports a subset of Object classes. Instead we serialize it on the
     // remote (here) and deserialize it on the other end.
     return TypedPackedFunc<std::string(std::string)>([sptr_to_self, this](std::string arg_name) {
diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py
index 29a8414337..ab22bd2b9c 100644
--- a/tests/python/unittest/test_runtime_profiling.py
+++ b/tests/python/unittest/test_runtime_profiling.py
@@ -69,6 +69,7 @@ def test_vm(target, dev):
     assert "Total" in str(report)
     assert "AllocTensorReg" in str(report)
     assert "AllocStorage" in str(report)
+    assert report.configuration["Executor"] == "VM"
 
     csv = read_csv(report)
     assert "Hash" in csv.keys()
@@ -102,6 +103,7 @@ def test_graph_executor(target, dev):
     assert "fused_nn_softmax" in str(report)
     assert "Total" in str(report)
     assert "Hash" in str(report)
+    assert "Graph" in str(report)
 
 
 @tvm.testing.parametrize_targets("cuda", "llvm")
@@ -147,6 +149,7 @@ def test_json():
     parsed = json.loads(report.json())
     assert "device_metrics" in parsed
     assert "calls" in parsed
+    assert "configuration" in parsed
     assert "Duration (us)" in parsed["calls"][0]
     assert "microseconds" in parsed["calls"][0]["Duration (us)"]
     assert len(parsed["calls"]) > 0