Posted to commits@tvm.apache.org by an...@apache.org on 2022/04/18 21:52:36 UTC

[tvm] branch aluo/run-individual-node updated (ab98b150be -> 85751bce55)

This is an automated email from the ASF dual-hosted git repository.

andrewzhaoluo pushed a change to branch aluo/run-individual-node
in repository https://gitbox.apache.org/repos/asf/tvm.git


    omit ab98b150be secondary commit
    omit a1f26fc5b6 initial
     add a2d973d350 change Hexagon docker version (#10981)
     add c7cca3913a Support `qnn.conv2d` in FoldExplicitPadding (#10982)
     add 6d9a1118b1 [COMMUNITY] @guberti -> Reviewer (#10976)
     add 597000c74e [ONNX] Add MatMulInteger importer (#10450)
     add 5cfb4bc3d0 [TVMC] Allow output module name to be passed as a command line argument (#10962)
     add 61a9269d85 [FIX] resolve int64/32 for AttrStmtNode (#10983)
     add b542724873 [Runtime][Vulkan] Add RGP support to TVM for vulkan device (#10953)
     add 01e0e2d0e1 [CI] Update GPU image (#10992)
     add ef163a5791 [Hexagon] Remove HexagonBuffer external constructor and support (#10978)
     add 814e856851 sort axes (#10985)
     add 11b8cd3ca1 [ONNX] Add imports for BERT contrib operators (#10949)
     add dbfab5c10d [Metaschedule] Make custom schedule_rule registration optional (#10975)
     add 1bfb9cac93 [COMMUNITY] @yzh119 -> Reviewer (#10993)
     add ce8f83e3c5 [hexagon] 'add_hvx' test to explore HVX usage. (#10604)
     add e370ed4597 [Hexagon] Less aggressive adb state clean up (#10909)
     add 985fc933f4 [Hexagon] Handle TCP server binding to unknown port (#10945)
     add 52f52c83de [BYOC][ACL] Fix list is not supported as an input node (#10801)
     add 96616b7af8 [Hexagon] Add top-level CMakeLists.txt for apps/hexagon_launcher (#11006)
     add 1cf0c0a5bf [CUDNN] Add partitioning support for fused conv2d+bias+act (#10997)
     add 324bf4cac5 Add driazati to triagers. (#11004)
     add e84b3590dc [Relay] Refactor inline composites transformation (#10995)
     add 529da9bd56 [TensorRT][BYOC] Minor refactoring to handle constants in pattern-based ops for TRT (#10994)
     add 3d63b2d944 [CI] Update CI Images to include `pytest-lazy-fixture` (#10999)
     add a34731b7fc [ROCM] DP4A intrinsic support for TE/TIR (#11009)
     add b94119692e [TIR] Ignore Allocate/AllocateConst in BufferAllocationLocator (#10998)
     add 0e1a2a2ff8 Fix broken CI when git-merge needs to create a commit. (#11007)
     add 29774bddd8 [RELAY][FRONTEND] Initial OneFlow frontend support. (#8790)
     add a9d86e61b6 [Metaschedule] Support tuning on rocm and vulkan target (#11017)
     add 715f24d238 [Metaschedule] Enable continuing tuning after schedule application failure (#10937)
     add b99a096825 [Metaschedule, Refactor] Move MultiLevelTilingNode decl to a header (#11020)
     add f238900e6b Add Havisha to triagers and alphabetize. (#11005)
     add 8bfe3bbb3c [Arith] Updated arith::DetectIterMap to keep extent=1 components (#10980)
     add 37db213a84 [QNNParam] Refactor the implementation of QNNParam (#11011)
     add 365fcc832d Use TVM log instead of hexagon_print (#11024)
     add 8aafe5b109 [OpenCL] Fix type casting error (#11021)
     add 351f31b51c [Runtime][PipelineExecutor] Add forwarding queue logic for set input. (#10990)
     add fafabc96c1 [VirtualMachine] Zero copy in set_input when input is DLTensor (#11003)
     add f92d351e11 [COMMUNITY] @wrongtest -> Committer (#11028)
     add 7d9b7bbd50 [ci] Always assume num executors == 1 (#11014)
     add 69ba7ed640 [BugFix][TIR] Fix narrower dtype of loop vars in CreatePrimFunc (#11030)
     add 8d868f6bf3 [BugFix][TIR] Fix rfactor when RF block becomes spatial (#11031)
     add 9c2df39376 [BugFix][TIR] Error check: Inline Block with Init Stmt (#11033)
     add 9f3da1cbae [Frontend][Paddle] Fix pool2d op (#11029)
     add 0b9bcf0e7a [MetaSchedule][Refactor] Introduce TuneConfig (#10986)
     add 240fac437c Fix typo in tutorial doc (#10974)
     add 28aac0e48f [Hexagon] Deprecate USE_HEXAGON_DEVICE, introduce USE_HEXAGON (#11025)
     add 1e524d0670 [ci] Migrate all test steps to macros (#10968)
     add 1542286826 [ci] Add branch protections to .asf.yaml (#10964)
     new f871866919 initial
     new 85751bce55 secondary commit

This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
branch are not in the new version.  This situation occurs
when a user --force pushes a change and generates a repository
containing something like this:

 * -- * -- B -- O -- O -- O   (ab98b150be)
            \
             N -- N -- N   refs/heads/aluo/run-individual-node (85751bce55)

You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.

Any revisions marked "omit" are not gone; other references still
refer to them.  Any revisions marked "discard" are gone forever.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .asf.yaml                                          |   13 +
 CMakeLists.txt                                     |    4 +-
 CONTRIBUTORS.md                                    |    3 +
 Jenkinsfile                                        |  195 ++-
 apps/hexagon_api/CMakeLists.txt                    |    4 +
 apps/hexagon_launcher/CMakeLists.txt               |   78 +
 apps/hexagon_launcher/cmake/android/CMakeLists.txt |    9 +-
 apps/hexagon_launcher/cmake/hexagon/CMakeLists.txt |    7 +-
 cmake/config.cmake                                 |   13 +-
 cmake/modules/Hexagon.cmake                        |   90 +-
 cmake/modules/LibInfo.cmake                        |    2 +-
 gallery/tutorial/tvmc_python.py                    |   12 +-
 include/tvm/arith/iter_affine_map.h                |    4 +-
 include/tvm/runtime/ndarray.h                      |   18 +
 jenkins/Jenkinsfile.j2                             |  207 +--
 python/tvm/arith/iter_affine_map.py                |   17 +-
 python/tvm/contrib/hexagon/build.py                |  114 +-
 python/tvm/contrib/pipeline_executor.py            |    5 +-
 python/tvm/driver/tvmc/compiler.py                 |   21 +-
 python/tvm/driver/tvmc/model.py                    |    2 +-
 python/tvm/meta_schedule/__init__.py               |    9 +-
 .../search_strategy/evolutionary_search.py         |   14 +-
 .../meta_schedule/task_scheduler/round_robin.py    |    5 +
 .../testing/tune_relay_meta_schedule.py            |   56 +-
 .../meta_schedule/testing/tune_te_meta_schedule.py |    4 +-
 python/tvm/meta_schedule/tune.py                   |  575 +++---
 python/tvm/relay/frontend/__init__.py              |    1 +
 python/tvm/relay/frontend/oneflow.py               | 1821 ++++++++++++++++++++
 python/tvm/relay/frontend/onnx.py                  |  296 +++-
 python/tvm/relay/frontend/paddlepaddle.py          |   12 +-
 python/tvm/relay/frontend/qnn_torch.py             |    9 +-
 python/tvm/relay/op/contrib/arm_compute_lib.py     |   37 +-
 python/tvm/relay/op/contrib/cmsisnn.py             |    6 +-
 python/tvm/relay/op/contrib/cudnn.py               |   79 +-
 python/tvm/relay/op/contrib/ethosu.py              |    9 +-
 python/tvm/relay/op/contrib/tensorrt.py            |   38 +-
 python/tvm/relay/op/strategy/cuda.py               |    8 +-
 python/tvm/relay/op/strategy/rocm.py               |  172 +-
 python/tvm/relay/qnn/op/legalizations.py           |   22 +-
 python/tvm/runtime/vm.py                           |    4 +
 python/tvm/tir/tensor_intrin/__init__.py           |    2 +
 python/tvm/tir/tensor_intrin/dot_product_common.py |   55 +
 python/tvm/tir/tensor_intrin/rocm.py               |   47 +
 python/tvm/topi/cuda/batch_matmul.py               |    7 +-
 python/tvm/topi/cuda/conv2d_alter_op.py            |   12 +-
 python/tvm/topi/cuda/conv2d_int8.py                |    4 +-
 python/tvm/topi/cuda/dense.py                      |    5 +-
 python/tvm/topi/cuda/tensor_intrin.py              |   23 +-
 python/tvm/topi/cuda/tensorcore_alter_op.py        |   22 +-
 python/tvm/topi/rocm/dense.py                      |   79 +-
 python/tvm/topi/utils.py                           |    7 +
 src/arith/iter_affine_map.cc                       |   15 +-
 .../schedule_rule/multi_level_tiling.cc            |  205 +--
 .../schedule_rule/multi_level_tiling.h             |  187 ++
 .../space_generator/post_order_apply.cc            |   22 +-
 src/meta_schedule/task_scheduler/task_scheduler.cc |    1 +
 src/meta_schedule/utils.h                          |   12 +-
 .../backend/contrib/arm_compute_lib/codegen.cc     |   26 +
 src/relay/transforms/fold_explicit_padding.cc      |   33 +-
 src/relay/transforms/inline_composites.cc          |   79 +-
 src/runtime/contrib/arm_compute_lib/acl_runtime.cc |   82 +-
 src/runtime/contrib/arm_compute_lib/acl_utils.cc   |   16 +-
 src/runtime/contrib/arm_compute_lib/acl_utils.h    |    9 +-
 src/runtime/contrib/cudnn/conv_forward.cc          |   62 +
 src/runtime/contrib/cudnn/cudnn_utils.cc           |    4 +
 src/runtime/contrib/cudnn/cudnn_utils.h            |    2 +
 src/runtime/contrib/json/json_runtime.h            |    1 +
 src/runtime/hexagon/README.md                      |   70 +-
 src/runtime/hexagon/android/hexagon_device_api.cc  |    2 +-
 src/runtime/hexagon/hexagon/hexagon_buffer.cc      |   33 +-
 src/runtime/hexagon/hexagon/hexagon_buffer.h       |   12 -
 src/runtime/hexagon/hexagon/hexagon_common.cc      |    9 +-
 .../hexagon/hexagon/hexagon_device_api_v2.cc       |    9 +-
 src/runtime/hexagon/rpc/hexagon/rpc_server.cc      |   48 +-
 src/runtime/ndarray.cc                             |   31 +
 src/runtime/pipeline/pipeline_executor.cc          |   20 +-
 src/runtime/pipeline/pipeline_executor.h           |    9 +-
 src/runtime/pipeline/pipeline_scheduler.cc         |   15 +-
 src/runtime/pipeline/pipeline_scheduler.h          |    9 +-
 src/runtime/pipeline/pipeline_struct.h             |  296 ++--
 src/runtime/vm/vm.cc                               |   22 +-
 src/runtime/vulkan/vulkan_amdrgp.cc                |   53 +
 .../vulkan/{vulkan_module.h => vulkan_amdrgp.h}    |   49 +-
 src/runtime/vulkan/vulkan_device.cc                |   13 +
 src/runtime/vulkan/vulkan_device.h                 |   12 +
 src/runtime/vulkan/vulkan_device_api.cc            |    3 +
 src/runtime/vulkan/vulkan_instance.cc              |    7 +
 src/runtime/vulkan/vulkan_stream.cc                |   13 +
 src/runtime/vulkan/vulkan_stream.h                 |   16 +
 src/runtime/vulkan/vulkan_wrapped_func.cc          |   18 +
 src/support/libinfo.cc                             |    6 +-
 src/target/source/codegen_opencl.cc                |   28 +
 src/target/source/codegen_opencl.h                 |    5 +
 src/target/target_kind.cc                          |    6 +
 src/te/operation/create_primfunc.cc                |    4 +-
 src/tir/ir/index_map.cc                            |    4 +-
 src/tir/schedule/primitive/compute_inline.cc       |   26 +
 src/tir/schedule/primitive/reduction.cc            |   34 +-
 src/tir/transforms/narrow_datatype.cc              |   12 +-
 .../plan_update_buffer_allocation_location.cc      |   33 +-
 tests/cpp/runtime/hexagon_buffer.cc                |   34 -
 .../contrib/test_arm_compute_lib/infrastructure.py |   28 +-
 .../test_arm_compute_lib/test_concatenate.py       |  151 ++
 tests/python/contrib/test_cudnn.py                 |   51 +-
 .../contrib/test_hexagon/benchmark_hexagon.py      |  335 ++++
 tests/python/contrib/test_hexagon/conftest.py      |    6 +-
 tests/python/contrib/test_hexagon/test_launcher.md |    2 +-
 tests/python/driver/tvmc/test_compiler.py          |  148 ++
 tests/python/frontend/oneflow/test_forward.py      |  723 ++++++++
 tests/python/frontend/onnx/test_forward.py         |  224 ++-
 tests/python/relay/test_op_level1.py               |   38 +
 tests/python/relay/test_op_level10.py              |   52 +
 tests/python/relay/test_op_level2.py               |   50 +
 .../relay/test_pass_fold_explicit_padding.py       |   45 +
 .../python/relay/test_pass_legalize_tensorcore.py  |    3 +-
 tests/python/relay/test_pipeline_executor.py       |   21 +-
 tests/python/topi/python/test_topi_conv2d_int8.py  |   13 +-
 tests/python/topi/python/test_topi_dense.py        |    1 -
 .../test_meta_schedule_measure_callback.py         |    4 +-
 .../test_meta_schedule_post_order_apply.py         |    4 +-
 .../unittest/test_meta_schedule_search_strategy.py |    2 +
 .../unittest/test_meta_schedule_task_scheduler.py  |    2 +
 .../unittest/test_meta_schedule_tune_relay.py      |   72 +-
 .../python/unittest/test_meta_schedule_tune_te.py  |    5 +-
 .../python/unittest/test_meta_schedule_tune_tir.py |   14 +-
 .../python/unittest/test_target_codegen_opencl.py  |   37 +
 tests/python/unittest/test_te_create_primfunc.py   |   18 +-
 .../unittest/test_tir_schedule_compute_inline.py   |   44 +
 tests/python/unittest/test_tir_schedule_rfactor.py |   63 +-
 .../python/unittest/test_tir_schedule_tensorize.py |   50 +
 .../test_tir_transform_extract_constants.py        |    2 +
 tests/python/unittest/test_transform_layout.py     |   30 +
 tests/scripts/task_build.py                        |    5 +-
 tests/scripts/task_config_build_hexagon.sh         |    3 +-
 tests/scripts/task_python_frontend.sh              |    3 +
 135 files changed, 6463 insertions(+), 1761 deletions(-)
 create mode 100644 apps/hexagon_launcher/CMakeLists.txt
 create mode 100644 python/tvm/relay/frontend/oneflow.py
 create mode 100644 python/tvm/tir/tensor_intrin/dot_product_common.py
 create mode 100644 python/tvm/tir/tensor_intrin/rocm.py
 create mode 100644 src/meta_schedule/schedule_rule/multi_level_tiling.h
 create mode 100644 src/runtime/vulkan/vulkan_amdrgp.cc
 copy src/runtime/vulkan/{vulkan_module.h => vulkan_amdrgp.h} (54%)
 create mode 100644 tests/python/contrib/test_arm_compute_lib/test_concatenate.py
 create mode 100644 tests/python/contrib/test_hexagon/benchmark_hexagon.py
 create mode 100644 tests/python/frontend/oneflow/test_forward.py


[tvm] 01/02: initial

Posted by an...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

andrewzhaoluo pushed a commit to branch aluo/run-individual-node
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit f8718669194308921dc2d0a4b9137a2f739a45d0
Author: Andrew Zhao Luo <an...@gmail.com>
AuthorDate: Tue Apr 12 15:42:43 2022 -0700

    initial
---
 .../graph_executor/debug/graph_executor_debug.cc   | 114 +++++++++++++--------
 1 file changed, 74 insertions(+), 40 deletions(-)

diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc
index 12a739722a..dd95478e17 100644
--- a/src/runtime/graph_executor/debug/graph_executor_debug.cc
+++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc
@@ -67,48 +67,21 @@ class GraphExecutorDebug : public GraphExecutor {
         time_sec_per_op[index] += RunOpRPC(index, number, repeat, min_repeat_ms);
       }
     } else {
-      for (int i = 0; i < repeat; ++i) {
-        std::chrono::time_point<std::chrono::high_resolution_clock, std::chrono::nanoseconds>
-            tbegin, tend;
-        double duration_ms = 0.0;
-        do {
-          std::fill(time_sec_per_op.begin(), time_sec_per_op.end(), 0);
-          if (duration_ms > 0.0) {
-            number = static_cast<int>(std::max((min_repeat_ms / (duration_ms / number) + 1),
-                                               number * 1.618));  // 1.618 is chosen arbitrarily
-          }
-          tbegin = std::chrono::high_resolution_clock::now();
-          std::vector<std::vector<Timer>> op_timers;
-          for (size_t index = 0; index < op_execs_.size(); index++) {
-            op_timers.push_back({});
-          }
-          for (int k = 0; k < number; k++) {
-            for (size_t index = 0; index < op_execs_.size(); ++index) {
-              if (op_execs_[index]) {
-                op_timers[index].push_back(RunOpHost(index));
-              }
-            }
-          }
-          for (size_t index = 0; index < op_execs_.size(); ++index) {
-            for (auto t : op_timers[index]) {
-              time_sec_per_op[index] += t->SyncAndGetElapsedNanos() / 1e9;
-            }
-          }
-          tend = std::chrono::high_resolution_clock::now();
-          duration_ms =
-              std::chrono::duration_cast<std::chrono::duration<double>>(tend - tbegin).count() *
-              1000;
-        } while (duration_ms < min_repeat_ms);
-
-        LOG(INFO) << "Iteration: " << i;
-        int op = 0;
-        for (size_t index = 0; index < time_sec_per_op.size(); index++) {
-          if (op_execs_[index]) {
-            time_sec_per_op[index] /= number;
-            LOG(INFO) << "Op #" << op++ << " " << GetNodeName(index) << ": "
-                      << time_sec_per_op[index] * 1e6 << " us/iter";
+      for (size_t index = 0; index < op_execs_.size(); ++index) {
+        std::vector<std::vector<double>> results =
+            RunIndividualNode(index, number, repeat, min_repeat_ms);
+
+        double total = 0.0;
+        for (size_t cur_repeat = 0; cur_repeat < results.size(); cur_repeat++) {
+          std::vector<double>& timings = results[cur_repeat];
+          double total_in_trial = 0;
+          for (double t : timings) {
+            total_in_trial += t;
           }
+          total_in_trial /= timings.size();
+          total += total_in_trial;
         }
+        time_sec_per_op[index] = total / results.size();
       }
     }
 
@@ -119,6 +92,54 @@ class GraphExecutorDebug : public GraphExecutor {
     return os.str();
   }
 
+  std::vector<std::vector<double>> RunIndividualNode(int node_index, int number, int repeat,
+                                                     int min_repeat_ms) {
+    // warmup run
+    GraphExecutor::Run();
+    std::string tkey = module_->type_key();
+
+    // results_in_seconds[a][b] is the b-th run of the a-th repeat
+    std::vector<std::vector<double>> results_in_seconds;
+
+    if (tkey == "rpc") {
+      LOG(FATAL) << "RPC measurements should not use RunIndividualNode!";
+    }
+
+    for (int i = 0; i < repeat; ++i) {
+      std::vector<Timer> op_timers;
+      double duration_ms = 0.0;
+
+      // Keep timing the operation, increasing the run count until a trial takes at least min_repeat_ms
+      do {
+        op_timers.clear();
+        if (duration_ms > 0.0) {
+          number = static_cast<int>(std::max((min_repeat_ms / (duration_ms / number) + 1),
+                                             number * 1.618));  // 1.618 is chosen arbitrarily
+        }
+
+        std::chrono::time_point<std::chrono::high_resolution_clock, std::chrono::nanoseconds>
+            tbegin, tend;
+        tbegin = std::chrono::high_resolution_clock::now();
+        for (int k = 0; k < number; k++) {
+          if (op_execs_[node_index]) {
+            op_timers.push_back(RunOpHost(node_index));
+          }
+        }
+        tend = std::chrono::high_resolution_clock::now();
+        duration_ms =
+            std::chrono::duration_cast<std::chrono::duration<double>>(tend - tbegin).count() * 1000;
+      } while (duration_ms < min_repeat_ms);
+
+      std::vector<double> timings_in_seconds;
+      for (Timer t : op_timers) {
+        timings_in_seconds.push_back(t->SyncAndGetElapsedNanos() / 1e9);
+      }
+      results_in_seconds.push_back(timings_in_seconds);
+    }
+
+    return results_in_seconds;
+  }
+
   double RunOpRPC(int index, int number, int repeat, int min_repeat_ms) {
     // Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes which
     // represent inputs/parameters to the graph. Other types may be supported in the future, but
@@ -362,6 +383,19 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name,
       ICHECK_GE(min_repeat_ms, 0);
       *rv = this->RunIndividual(number, repeat, min_repeat_ms);
     });
+  } else if (name == "run_individual_node") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      int node_index = args[0];
+      int number = args[1];
+      int repeat = args[2];
+      int min_repeat_ms = args[3];
+      ICHECK_GE(node_index, 0);
+      ICHECK_LT(node_index, nodes_.size());
+      ICHECK_GT(number, 0);
+      ICHECK_GT(repeat, 0);
+      ICHECK_GE(min_repeat_ms, 0);
+      *rv = this->RunIndividualNode(node_index, number, repeat, min_repeat_ms);
+    });
   } else if (name == "profile") {
     return TypedPackedFunc<profiling::Report(Array<profiling::MetricCollector>)>(
         [sptr_to_self, this](Array<profiling::MetricCollector> collectors) {
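
The rewritten host-side loop above delegates per-node timing to RunIndividualNode
and then averages within each repeat and across repeats. As a minimal sketch (not
part of the commit) of that aggregation in Python, where results[a][b] is the
b-th run of the a-th repeat, in seconds:

    def aggregate_node_time(results):
        # mean of each repeat's runs, then mean across repeats
        per_repeat_means = [sum(runs) / len(runs) for runs in results]
        return sum(per_repeat_means) / len(per_repeat_means)

    # e.g. two repeats of three runs each:
    # aggregate_node_time([[1e-6, 2e-6, 3e-6], [2e-6, 2e-6, 2e-6]])
    # ~= 2e-6 (up to float rounding)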


[tvm] 02/02: secondary commit

Posted by an...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

andrewzhaoluo pushed a commit to branch aluo/run-individual-node
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit 85751bce55737cb4645898e40e247dc830bab60a
Author: Andrew Zhao Luo <an...@gmail.com>
AuthorDate: Tue Apr 12 16:29:44 2022 -0700

    secondary commit
---
 python/tvm/contrib/debugger/debug_executor.py          | 16 ++++++++++++++++
 .../graph_executor/debug/graph_executor_debug.cc       | 18 ++++++++++++++++--
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/python/tvm/contrib/debugger/debug_executor.py b/python/tvm/contrib/debugger/debug_executor.py
index 12152e9de1..f40a2880b1 100644
--- a/python/tvm/contrib/debugger/debug_executor.py
+++ b/python/tvm/contrib/debugger/debug_executor.py
@@ -111,6 +111,7 @@ class GraphModuleDebug(graph_executor.GraphModule):
         self._dump_root = dump_root
         self._dump_path = None
         self._run_individual = module["run_individual"]
+        self._run_individual_node = module["run_individual_node"]
         self._debug_get_output = module["debug_get_output"]
         self._execute_node = module["execute_node"]
         self._get_node_output = module["get_node_output"]
@@ -281,6 +282,21 @@ class GraphModuleDebug(graph_executor.GraphModule):
         ret = self._run_individual(number, repeat, min_repeat_ms)
         return ret.strip(",").split(",") if ret else []
 
+    def run_individual_node(self, index, number, repeat=1, min_repeat_ms=0):
+        """Results are returned as serialized strings which we deserialize."""
+        ret = self._run_individual_node(index, number, repeat, min_repeat_ms)
+        answer = []
+        for line in ret.split("\n"):
+            cur_results = []
+            if line.strip() == "":
+                continue
+            for value in line.split(","):
+                if value.strip() == "":
+                    continue
+                cur_results.append(float(value))
+            answer.append(cur_results)
+        return answer
+
     def profile(self, collectors=None, **input_dict):
         """Run forward execution of the graph and collect overall and per-op
         performance metrics.
diff --git a/src/runtime/graph_executor/debug/graph_executor_debug.cc b/src/runtime/graph_executor/debug/graph_executor_debug.cc
index dd95478e17..e1a0c3c490 100644
--- a/src/runtime/graph_executor/debug/graph_executor_debug.cc
+++ b/src/runtime/graph_executor/debug/graph_executor_debug.cc
@@ -95,7 +95,7 @@ class GraphExecutorDebug : public GraphExecutor {
   std::vector<std::vector<double>> RunIndividualNode(int node_index, int number, int repeat,
                                                      int min_repeat_ms) {
     // warmup run
-    GraphExecutor::Run();
+    // GraphExecutor::Run();
     std::string tkey = module_->type_key();
 
     // results_in_seconds[a][b] is the bth index run of the ath index repeat
@@ -394,7 +394,21 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name,
       ICHECK_GT(number, 0);
       ICHECK_GT(repeat, 0);
       ICHECK_GE(min_repeat_ms, 0);
-      *rv = this->RunIndividualNode(node_index, number, repeat, min_repeat_ms);
+      std::vector<std::vector<double>> results =
+          this->RunIndividualNode(node_index, number, repeat, min_repeat_ms);
+
+      std::stringstream s;
+      s.precision(6);  // down to microseconds
+
+      for (std::vector<double>& row : results) {
+        for (double cur : row) {
+          s << cur << ", ";
+        }
+        s << "\n";
+      }
+
+      // Returning Integer/FloatImm values is problematic, so serialize to a string as a workaround
+      *rv = s.str();
     });
   } else if (name == "profile") {
     return TypedPackedFunc<profiling::Report(Array<profiling::MetricCollector>)>(