Posted to commits@tvm.apache.org by GitBox <gi...@apache.org> on 2022/06/10 17:09:13 UTC

[GitHub] [tvm] tkonolige commented on a diff in pull request #11465: Add cooldown_interval_ms and internal measurements for profiling

tkonolige commented on code in PR #11465:
URL: https://github.com/apache/tvm/pull/11465#discussion_r894716908


##########
python/tvm/contrib/debugger/debug_result.py:
##########
@@ -157,7 +155,12 @@ def s_to_us(t):
             return t * 10**6
 
         starting_times = np.zeros(len(self._time_list) + 1)
-        starting_times[1:] = np.cumsum([times[0] for times in self._time_list])
+        starting_times[1:] = np.cumsum(
+            [
+                np.mean([np.mean(repeat_data) for repeat_data in node_data])
+                for node_data in self._time_list
+            ]
+        )

Review Comment:
   This doesn't seem right. The variable is called `starting_times`, but its values are computed as the mean of mean times.



##########
python/tvm/contrib/debugger/debug_result.py:
##########
@@ -205,12 +210,18 @@ def _dump_graph_json(self, graph):
 
     def get_debug_result(self, sort_by_time=True):
         """Return the debugger result"""
-        header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Shape", "Inputs", "Outputs"]
-        lines = ["---------", "---", "--------", "-------", "-----", "------", "-------"]
+        header = ["Node Name", "Ops", "Time(us)", "Time(%)", "Shape", "Inputs", "Outputs", "Times"]

Review Comment:
   We have `Time(us)` and `Times` here. What is the difference?



##########
src/runtime/graph_executor/debug/graph_executor_debug.cc:
##########
@@ -55,77 +56,96 @@ class GraphExecutorDebug : public GraphExecutor {
    *        By default, one `repeat` contains `number` runs. If this parameter is set,
    *        the parameters `number` will be dynamically adjusted to meet the
    *        minimum duration requirement of one `repeat`.
+   * \param cooldown_interval_ms The cool down interval between two measurements in milliseconds.
    * \return Comma seperated string containing the elapsed time per op for the last
    *         iteration only, because returning a long string over rpc can be expensive.
    */
-  std::string RunIndividual(int number, int repeat, int min_repeat_ms) {
+  std::string RunIndividual(int number, int repeat, int min_repeat_ms, int cooldown_interval_ms) {
     // warmup run
     GraphExecutor::Run();
     std::string tkey = module_->type_key();
-    std::vector<double> time_sec_per_op(op_execs_.size(), 0);
+    std::vector<std::vector<std::vector<double>>> time_sec_per_op(op_execs_.size());
     if (tkey == "rpc") {
       // RPC modules rely on remote timing which implements the logic from the else branch.
       for (size_t index = 0; index < op_execs_.size(); ++index) {
-        time_sec_per_op[index] += RunOpRPC(index, number, repeat, min_repeat_ms);
+        time_sec_per_op[index] =
+            RunOpRPC(index, number, repeat, min_repeat_ms, cooldown_interval_ms);
       }
     } else {
+      int op = 0;
       for (size_t index = 0; index < op_execs_.size(); ++index) {
-        std::vector<double> results = RunIndividualNode(index, number, repeat, min_repeat_ms);
-        for (size_t cur_repeat = 0; cur_repeat < results.size(); cur_repeat++) {
-          time_sec_per_op[index] = results[cur_repeat];
-
-          LOG(INFO) << "Iteration: " << cur_repeat;
-          int op = 0;
-          if (op_execs_[index]) {
-            LOG(INFO) << "Op #" << op++ << " " << GetNodeName(index) << ": "
-                      << time_sec_per_op[index] * 1e6 << " us/iter";
+        time_sec_per_op[index] =
+            RunIndividualNode(index, number, repeat, min_repeat_ms, cooldown_interval_ms);
+        if (op_execs_[index]) {
+          LOG(INFO) << "Op #" << op << " " << GetNodeName(index) << ":";
+          for (size_t cur_repeat = 0; cur_repeat < time_sec_per_op[index].size(); cur_repeat++) {
+            const auto& data = time_sec_per_op[index][cur_repeat];
+            std::string delimiter = ", ";
+            std::string log;

Review Comment:
   Use `std::stringstream` instead of appending to a string.
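   e.g. something like this (just a sketch based on the loop in this hunk; needs `<sstream>`):

   ```cpp
   // Build the per-repeat listing with an ostringstream instead of repeated
   // std::string appends (also avoids trimming a trailing delimiter).
   std::ostringstream log;
   for (size_t i = 0; i < data.size(); ++i) {
     if (i != 0) log << ", ";
     log << data[i] * 1e6;  // seconds -> microseconds, as in the diff
   }
   const double mean = std::accumulate(data.begin(), data.end(), 0.0) / data.size();
   LOG(INFO) << "Iteration: " << cur_repeat << ": " << (mean * 1e6) << " us/iter ["
             << log.str() << "]";
   ```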



##########
src/runtime/graph_executor/debug/graph_executor_debug.cc:
##########
@@ -55,77 +56,96 @@ class GraphExecutorDebug : public GraphExecutor {
    *        By default, one `repeat` contains `number` runs. If this parameter is set,
    *        the parameters `number` will be dynamically adjusted to meet the
    *        minimum duration requirement of one `repeat`.
+   * \param cooldown_interval_ms The cool down interval between two measurements in milliseconds.
    * \return Comma seperated string containing the elapsed time per op for the last
    *         iteration only, because returning a long string over rpc can be expensive.
    */
-  std::string RunIndividual(int number, int repeat, int min_repeat_ms) {
+  std::string RunIndividual(int number, int repeat, int min_repeat_ms, int cooldown_interval_ms) {
     // warmup run
     GraphExecutor::Run();
     std::string tkey = module_->type_key();
-    std::vector<double> time_sec_per_op(op_execs_.size(), 0);
+    std::vector<std::vector<std::vector<double>>> time_sec_per_op(op_execs_.size());
     if (tkey == "rpc") {
       // RPC modules rely on remote timing which implements the logic from the else branch.
       for (size_t index = 0; index < op_execs_.size(); ++index) {
-        time_sec_per_op[index] += RunOpRPC(index, number, repeat, min_repeat_ms);
+        time_sec_per_op[index] =
+            RunOpRPC(index, number, repeat, min_repeat_ms, cooldown_interval_ms);
       }
     } else {
+      int op = 0;
       for (size_t index = 0; index < op_execs_.size(); ++index) {
-        std::vector<double> results = RunIndividualNode(index, number, repeat, min_repeat_ms);
-        for (size_t cur_repeat = 0; cur_repeat < results.size(); cur_repeat++) {
-          time_sec_per_op[index] = results[cur_repeat];
-
-          LOG(INFO) << "Iteration: " << cur_repeat;
-          int op = 0;
-          if (op_execs_[index]) {
-            LOG(INFO) << "Op #" << op++ << " " << GetNodeName(index) << ": "
-                      << time_sec_per_op[index] * 1e6 << " us/iter";
+        time_sec_per_op[index] =
+            RunIndividualNode(index, number, repeat, min_repeat_ms, cooldown_interval_ms);
+        if (op_execs_[index]) {
+          LOG(INFO) << "Op #" << op << " " << GetNodeName(index) << ":";
+          for (size_t cur_repeat = 0; cur_repeat < time_sec_per_op[index].size(); cur_repeat++) {
+            const auto& data = time_sec_per_op[index][cur_repeat];
+            std::string delimiter = ", ";
+            std::string log;
+            for (double r : data) {
+              log += std::to_string(r * 1e6) + delimiter;
+            }
+            if (data.size()) log.resize(log.size() - delimiter.size());
+            const double mean = std::accumulate(data.begin(), data.end(), 0.0) / data.size();
+            LOG(INFO) << "Iteration: " << cur_repeat << ": " << (mean * 1e6) << " us/iter [" << log
+                      << "]";
           }
+          ++op;
         }
       }
     }
 
     std::ostringstream os;
     for (size_t index = 0; index < time_sec_per_op.size(); index++) {
-      double time = time_sec_per_op[index];
-      // To have good behavior when calculating total time, etc.
-      if (std::isnan(time)) {
-        time = 0;
+      for (const auto& repeat_data : time_sec_per_op[index]) {
+        for (const auto& number_data : repeat_data) {
+          // To have good behavior when calculating total time, etc.
+          os << (std::isnan(number_data) ? std::to_string(0) : std::to_string(number_data)) << ",";
+        }
+        os << ";";
       }
-      os << time << ",";
+      os << ":";
     }
     return os.str();
   }
 
-  std::vector<double> RunIndividualNode(int node_index, int number, int repeat, int min_repeat_ms) {
+  std::vector<std::vector<double>> RunIndividualNode(int node_index, int number, int repeat,
+                                                     int min_repeat_ms, int cooldown_interval_ms) {
     std::string tkey = module_->type_key();
 
-    // results_in_seconds[a][b] is the bth index run of the ath index repeat

Review Comment:
   Can you add a comment like this to `results_in_seconds`?
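   e.g. (illustrative only; I haven't checked how `results_in_seconds` is declared in the new code):

   ```cpp
   // results_in_seconds[a][b] is the b-th run of the a-th repeat
   std::vector<std::vector<double>> results_in_seconds(repeat);
   ```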



##########
src/runtime/crt/common/crt_runtime_api.c:
##########
@@ -1,593 +1,622 @@
-/*

Review Comment:
   What happened to this file? It seems like it got reformatted or otherwise changed wholesale. Could you please undo this change?



##########
src/runtime/profiling.cc:
##########
@@ -822,25 +823,37 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat,
         f_preproc.CallPacked(args, &temp);
       }
       double duration_ms = 0.0;
-
+      std::vector<double> results;
+      int64_t final_number = number;
       do {
+        results.clear();
         if (duration_ms > 0.0) {
-          number = static_cast<int>(std::max((min_repeat_ms / (duration_ms / number) + 1),
-                                             number * 1.618));  // 1.618 is chosen by random
+          final_number =
+              static_cast<int>(std::max((min_repeat_ms / (duration_ms / final_number) + 1),
+                                        final_number * 1.618));  // 1.618 is chosen by random
         }
+        results.resize(final_number);
 
-        Timer t = Timer::Start(dev);
-        // start timing
-        for (int i = 0; i < number; ++i) {
+        for (int i = 0; i < final_number; ++i) {
+          // start timing
+          Timer t = Timer::Start(dev);
           pf.CallPacked(args, &temp);
+          t->Stop();
+          int64_t t_nanos = t->SyncAndGetElapsedNanos();
+          results[i] = t_nanos / 1e6;
         }
-        t->Stop();
-        int64_t t_nanos = t->SyncAndGetElapsedNanos();
-        duration_ms = t_nanos / 1e6;
+        duration_ms = std::accumulate(results.begin(), results.end(), 0.0);
       } while (duration_ms < min_repeat_ms);
 
-      double speed = duration_ms / 1e3 / number;
-      os.write(reinterpret_cast<char*>(&speed), sizeof(speed));
+      // write the size value.
+      os.write(reinterpret_cast<char*>(&final_number), sizeof(int64_t));
+      // write the array
+      for (int i = 0; i < final_number; ++i) {
+        results[i] /= 1e3;
+        os.write(reinterpret_cast<char*>(&results[i]), sizeof(double));
+      }
+
+      std::this_thread::sleep_for(std::chrono::milliseconds(cooldown_interval_ms));

Review Comment:
   Can you guard this with a check that `cooldown_interval_ms` is greater than 0?
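   i.e. something like (sketch only):

   ```cpp
   // Skip the sleep entirely when no positive cooldown was requested.
   if (cooldown_interval_ms > 0) {
     std::this_thread::sleep_for(std::chrono::milliseconds(cooldown_interval_ms));
   }
   ```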



##########
src/runtime/graph_executor/debug/graph_executor_debug.cc:
##########
@@ -176,10 +197,29 @@ class GraphExecutorDebug : public GraphExecutor {
     }
     TVMRetValue rv;
     time_eval.CallPacked(TVMArgs(values.get(), type_codes.get(), num_flat_args), &rv);
-    std::string results = rv.operator std::string();
-    const double* results_arr = reinterpret_cast<const double*>(results.data());
-    LOG(INFO) << "Got op timing: " << results_arr[0];
-    return results_arr[0];
+    std::string results_str = rv.operator std::string();
+
+    size_t offset = 0;
+    auto blob_ptr = results_str.data();
+    for (int i = 0; i < repeat; ++i) {
+      const int64_t* size_ptr = reinterpret_cast<const int64_t*>(blob_ptr + offset);
+      offset += sizeof(int64_t);
+      auto result =
+          std::vector<double>(reinterpret_cast<const double*>(blob_ptr + offset),
+                              reinterpret_cast<const double*>(blob_ptr) + offset + size_ptr[0]);
+      offset += sizeof(double) * size_ptr[0];
+      results[i] = result;
+    }
+
+    std::string log = "";

Review Comment:
   Use `std::stringstream` instead of appending to a string.



##########
src/runtime/profiling.cc:
##########
@@ -822,25 +823,37 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, Device dev, int number, int repeat,
         f_preproc.CallPacked(args, &temp);
       }
       double duration_ms = 0.0;
-
+      std::vector<double> results;
+      int64_t final_number = number;
       do {
+        results.clear();
         if (duration_ms > 0.0) {
-          number = static_cast<int>(std::max((min_repeat_ms / (duration_ms / number) + 1),
-                                             number * 1.618));  // 1.618 is chosen by random
+          final_number =
+              static_cast<int>(std::max((min_repeat_ms / (duration_ms / final_number) + 1),
+                                        final_number * 1.618));  // 1.618 is chosen by random
         }
+        results.resize(final_number);
 
-        Timer t = Timer::Start(dev);
-        // start timing
-        for (int i = 0; i < number; ++i) {
+        for (int i = 0; i < final_number; ++i) {
+          // start timing
+          Timer t = Timer::Start(dev);

Review Comment:
   The reason we have the timer outside the loop is so that we can accurately time functions with very short durations. We amortize the timer's inaccuracy by running the short-duration function many times under a single timer and then dividing the total time by the number of runs.
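   For reference, the removed code amortized the timer roughly like this (sketch of the pattern, not the exact code):

   ```cpp
   // One timer spans `number` back-to-back runs; per-run time is total / number.
   // Starting and stopping the timer around each individual run adds overhead
   // that can dominate the measurement for very short functions.
   Timer t = Timer::Start(dev);
   for (int i = 0; i < number; ++i) {
     pf.CallPacked(args, &temp);
   }
   t->Stop();
   double duration_ms = t->SyncAndGetElapsedNanos() / 1e6;
   double per_run_sec = duration_ms / 1e3 / number;  // average seconds per run
   ```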



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org