You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2020/12/28 15:29:56 UTC

[tvm] branch main updated: [µTVM] Add platform timer and RPCTimeEvaluator to enable AutoTVM (#6964)

This is an automated email from the ASF dual-hosted git repository.

tqchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new b8ac8d9  [µTVM] Add platform timer and RPCTimeEvaluator to enable AutoTVM (#6964)
b8ac8d9 is described below

commit b8ac8d94dec93cdaaec436f9105ea9b2eef752d0
Author: Andrew Reusch <ar...@octoml.ai>
AuthorDate: Mon Dec 28 07:29:37 2020 -0800

    [µTVM] Add platform timer and RPCTimeEvaluator to enable AutoTVM (#6964)
    
    * Add platform timer to microTVM.
    
    * Address liangfu comments
    
    * cppformat
    
    * clang-format
    
    Co-authored-by: Liangfu Chen <li...@apache.org>
---
 apps/bundle_deploy/bundle.c                    |   6 +
 apps/bundle_deploy/bundle_static.c             |   6 +
 include/tvm/runtime/c_runtime_api.h            |   7 ++
 include/tvm/runtime/crt/error_codes.h          |   5 +
 include/tvm/runtime/crt/platform.h             |  19 +++
 python/tvm/micro/session.py                    |   1 +
 src/runtime/c_runtime_api.cc                   |   9 ++
 src/runtime/crt/common/crt_runtime_api.c       | 164 +++++++++++++++++++++++--
 src/runtime/crt/host/crt_config.h              |   2 +-
 src/runtime/crt/host/main.cc                   |  23 ++--
 src/runtime/graph/debug/graph_runtime_debug.cc | 126 +++++++++++++------
 src/runtime/minrpc/minrpc_server.h             |   1 +
 tests/micro/qemu/test_zephyr.py                |  27 ++++
 tests/micro/qemu/zephyr-runtime/src/main.c     |  18 +--
 tests/python/unittest/test_crt.py              |  29 ++++-
 15 files changed, 372 insertions(+), 71 deletions(-)

diff --git a/apps/bundle_deploy/bundle.c b/apps/bundle_deploy/bundle.c
index 29712de..098ac99 100644
--- a/apps/bundle_deploy/bundle.c
+++ b/apps/bundle_deploy/bundle.c
@@ -123,3 +123,9 @@ tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLContext ctx, void*
 tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLContext ctx) {
   return g_memory_manager->Free(g_memory_manager, ptr, ctx);
 }
+
+tvm_crt_error_t TVMPlatformTimerStart() { return kTvmErrorFunctionCallNotImplemented; }
+
+tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) {
+  return kTvmErrorFunctionCallNotImplemented;
+}
diff --git a/apps/bundle_deploy/bundle_static.c b/apps/bundle_deploy/bundle_static.c
index 7ac95fd..c4b637c 100644
--- a/apps/bundle_deploy/bundle_static.c
+++ b/apps/bundle_deploy/bundle_static.c
@@ -124,3 +124,9 @@ tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLContext ctx, void*
 tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLContext ctx) {
   return g_memory_manager->Free(g_memory_manager, ptr, ctx);
 }
+
+tvm_crt_error_t TVMPlatformTimerStart() { return kTvmErrorFunctionCallNotImplemented; }
+
+tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) {
+  return kTvmErrorFunctionCallNotImplemented;
+}
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index aac49c1..467e69a 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -540,6 +540,13 @@ TVM_DLL int TVMObjectRetain(TVMObjectHandle obj);
 TVM_DLL int TVMObjectFree(TVMObjectHandle obj);
 
 /*!
+ * \brief Free a TVMByteArray returned from TVMFuncCall, and associated memory.
+ * \param arr The TVMByteArray instance.
+ * \return 0 on success, -1 on failure.
+ */
+TVM_DLL int TVMByteArrayFree(TVMByteArray* arr);
+
+/*!
  * \brief Allocate a data space on device.
  * \param ctx The device context to perform operation.
  * \param nbytes The number of bytes in memory.
diff --git a/include/tvm/runtime/crt/error_codes.h b/include/tvm/runtime/crt/error_codes.h
index 41d727d..75e49e6 100644
--- a/include/tvm/runtime/crt/error_codes.h
+++ b/include/tvm/runtime/crt/error_codes.h
@@ -44,6 +44,7 @@ typedef enum {
   kTvmErrorCategoryGenerated = 6,
   kTvmErrorCategoryGraphRuntime = 7,
   kTvmErrorCategoryFunctionCall = 8,
+  kTvmErrorCategoryTimeEvaluator = 9,
 } tvm_crt_error_category_t;
 
 typedef enum {
@@ -77,6 +78,7 @@ typedef enum {
   kTvmErrorPlatformMemoryManagerInitialized = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 1),
   kTvmErrorPlatformShutdown = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 2),
   kTvmErrorPlatformNoMemory = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 3),
+  kTvmErrorPlatformTimerBadState = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 4),
 
   // Common error codes returned from generated functions.
   kTvmErrorGeneratedInvalidStorageId = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGenerated, 0),
@@ -91,6 +93,9 @@ typedef enum {
   kTvmErrorFunctionCallWrongArgType = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 1),
   kTvmErrorFunctionCallNotImplemented = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryFunctionCall, 2),
 
+  // Time Evaluator - times functions for use with debug runtime.
+  kTvmErrorTimeEvaluatorBadHandle = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryTimeEvaluator, 0),
+
   // System errors are always negative integers; this mask indicates presence of a system error.
   // Cast tvm_crt_error_t to a signed integer to interpret the negative error code.
   kTvmErrorSystemErrorMask = (1 << (sizeof(int) * 4 - 1)),
diff --git a/include/tvm/runtime/crt/platform.h b/include/tvm/runtime/crt/platform.h
index 12dcdc5..8e03839 100644
--- a/include/tvm/runtime/crt/platform.h
+++ b/include/tvm/runtime/crt/platform.h
@@ -78,6 +78,25 @@ tvm_crt_error_t TVMPlatformMemoryAllocate(size_t num_bytes, DLContext ctx, void*
  * \return kTvmErrorNoError if successful; a descriptive error code otherwise.
  */
 tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLContext ctx);
+
+/*! \brief Start a device timer.
+ *
+ * The device timer used must not be running.
+ *
+ * \return kTvmErrorNoError if successful; a descriptive error code otherwise.
+ */
+tvm_crt_error_t TVMPlatformTimerStart();
+
+/*! \brief Stop the running device timer and get the elapsed time (in microseconds).
+ *
+ * The device timer used must be running.
+ *
+ * \param elapsed_time_seconds Pointer to write elapsed time into.
+ *
+ * \return kTvmErrorNoError if successful; a descriptive error code otherwise.
+ */
+tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/python/tvm/micro/session.py b/python/tvm/micro/session.py
index 0f2f09a..717b6e4 100644
--- a/python/tvm/micro/session.py
+++ b/python/tvm/micro/session.py
@@ -23,6 +23,7 @@ import sys
 from ..error import register_error
 from .._ffi import get_global_func
 from ..contrib import graph_runtime
+from ..contrib.debugger import debug_runtime
 from ..rpc import RPCSession
 from .transport import IoTimeoutError
 from .transport import TransportLogger
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index 299f282..6ecc60a 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -420,6 +420,15 @@ int TVMFuncFree(TVMFunctionHandle func) {
   API_END();
 }
 
+int TVMByteArrayFree(TVMByteArray* arr) {
+  if (arr == &TVMAPIRuntimeStore::Get()->ret_bytes) {
+    return 0;  // Thread-local storage does not need explicit deleting.
+  }
+
+  delete arr;
+  return 0;
+}
+
 int TVMFuncCall(TVMFunctionHandle func, TVMValue* args, int* arg_type_codes, int num_args,
                 TVMValue* ret_val, int* ret_type_code) {
   API_BEGIN();
diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c
index ac2b99a..fcfb51f 100644
--- a/src/runtime/crt/common/crt_runtime_api.c
+++ b/src/runtime/crt/common/crt_runtime_api.c
@@ -110,6 +110,9 @@ static const TVMModule* registered_modules[TVM_CRT_MAX_REGISTERED_MODULES];
 /*! \brief Passed as `module_index` to EncodeFunctionHandle. */
 static const tvm_module_index_t kGlobalFuncModuleIndex = TVM_CRT_MAX_REGISTERED_MODULES;
 
+/*! \brief Special module handle for retur values from RPCTimeEvaluator. */
+static const tvm_module_index_t kTimeEvaluatorModuleIndex = 0x7fff;
+
 static int DecodeModuleHandle(TVMModuleHandle handle, tvm_module_index_t* out_module_index) {
   tvm_module_index_t module_index;
 
@@ -185,13 +188,15 @@ static int DecodeFunctionHandle(TVMFunctionHandle handle, tvm_module_index_t* mo
       (tvm_module_index_t)(((uintptr_t)handle) >> (sizeof(tvm_function_index_t) * 8));
   unvalidated_module_index &= ~0x8000;
 
-  if (unvalidated_module_index > kGlobalFuncModuleIndex) {
-    TVMAPIErrorf("invalid module handle: index=%08x", unvalidated_module_index);
-    return -1;
-  } else if (unvalidated_module_index < kGlobalFuncModuleIndex &&
-             registered_modules[unvalidated_module_index] == NULL) {
-    TVMAPIErrorf("unregistered module: index=%08x", unvalidated_module_index);
-    return -1;
+  if (unvalidated_module_index != kTimeEvaluatorModuleIndex) {
+    if (unvalidated_module_index > kGlobalFuncModuleIndex) {
+      TVMAPIErrorf("invalid module handle: index=%08x", unvalidated_module_index);
+      return -1;
+    } else if (unvalidated_module_index < kGlobalFuncModuleIndex &&
+               registered_modules[unvalidated_module_index] == NULL) {
+      TVMAPIErrorf("unregistered module: index=%08x", unvalidated_module_index);
+      return -1;
+    }
   }
 
   *function_index = ((uint32_t)((uintptr_t)handle)) & ~0x8000;
@@ -199,6 +204,20 @@ static int DecodeFunctionHandle(TVMFunctionHandle handle, tvm_module_index_t* mo
   return 0;
 }
 
+int TVMByteArrayFree(TVMByteArray* arr) {
+  DLContext ctx = {kDLCPU, 0};
+  int to_return = TVMPlatformMemoryFree((void*)arr->data, ctx);
+  if (to_return != 0) {
+    return to_return;
+  }
+
+  return TVMPlatformMemoryFree((void*)arr, ctx);
+}
+
+tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* args,
+                                 int* type_codes, int num_args, TVMValue* ret_val,
+                                 int* ret_type_code);
+
 int TVMFuncCall(TVMFunctionHandle func_handle, TVMValue* arg_values, int* type_codes, int num_args,
                 TVMValue* ret_val, int* ret_type_code) {
   tvm_module_index_t module_index;
@@ -211,7 +230,10 @@ int TVMFuncCall(TVMFunctionHandle func_handle, TVMValue* arg_values, int* type_c
     return -1;
   }
 
-  if (module_index == kGlobalFuncModuleIndex) {
+  if (module_index == kTimeEvaluatorModuleIndex) {
+    return RunTimeEvaluator(function_index, arg_values, type_codes, num_args, ret_val,
+                            ret_type_code);
+  } else if (module_index == kGlobalFuncModuleIndex) {
     resource_handle = NULL;
     registry = &global_func_registry.registry;
   } else {
@@ -315,6 +337,8 @@ int TVMFuncFree(TVMFunctionHandle func) {
   return 0;
 }
 
+int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* ret_val,
+                     int* ret_type_code);
 tvm_crt_error_t TVMInitializeRuntime() {
   int idx = 0;
   tvm_crt_error_t error = kTvmErrorNoError;
@@ -351,6 +375,10 @@ tvm_crt_error_t TVMInitializeRuntime() {
     error = TVMFuncRegisterGlobal("tvm.rpc.server.ModuleGetFunction", &ModuleGetFunction, 0);
   }
 
+  if (error == kTvmErrorNoError) {
+    error = TVMFuncRegisterGlobal("runtime.RPCTimeEvaluator", &RPCTimeEvaluator, 0);
+  }
+
   if (error != kTvmErrorNoError) {
     TVMPlatformMemoryFree(registry_backing_memory, ctx);
     TVMPlatformMemoryFree(func_registry_memory, ctx);
@@ -358,3 +386,123 @@ tvm_crt_error_t TVMInitializeRuntime() {
 
   return error;
 }
+
+typedef struct {
+  uint16_t function_index;
+  TVMFunctionHandle func_to_time;
+  TVMContext ctx;
+  int number;
+  int repeat;
+  int min_repeat_ms;
+} time_evaluator_state_t;
+
+static time_evaluator_state_t g_time_evaluator_state;
+
+int RPCTimeEvaluator(TVMValue* args, int* type_codes, int num_args, TVMValue* ret_val,
+                     int* ret_type_code) {
+  ret_val[0].v_handle = NULL;
+  ret_type_code[0] = kTVMNullptr;
+  if (num_args < 8) {
+    TVMAPIErrorf("not enough args");
+    return kTvmErrorFunctionCallNumArguments;
+  }
+  if (type_codes[0] != kTVMModuleHandle || type_codes[1] != kTVMStr ||
+      type_codes[2] != kTVMArgInt || type_codes[3] != kTVMArgInt || type_codes[4] != kTVMArgInt ||
+      type_codes[5] != kTVMArgInt || type_codes[6] != kTVMArgInt || type_codes[7] != kTVMStr) {
+    TVMAPIErrorf("one or more invalid arg types");
+    return kTvmErrorFunctionCallWrongArgType;
+  }
+
+  TVMModuleHandle mod = (TVMModuleHandle)args[0].v_handle;
+  const char* name = args[1].v_str;
+  g_time_evaluator_state.ctx.device_type = args[2].v_int64;
+  g_time_evaluator_state.ctx.device_id = args[3].v_int64;
+  g_time_evaluator_state.number = args[4].v_int64;
+  g_time_evaluator_state.repeat = args[5].v_int64;
+  g_time_evaluator_state.min_repeat_ms = args[6].v_int64;
+
+  int ret_code =
+      TVMModGetFunction(mod, name, /* query_imports */ 0, &g_time_evaluator_state.func_to_time);
+  if (ret_code != 0) {
+    return ret_code;
+  }
+
+  g_time_evaluator_state.function_index++;
+  ret_val[0].v_handle =
+      EncodeFunctionHandle(kTimeEvaluatorModuleIndex, g_time_evaluator_state.function_index);
+  ret_type_code[0] = kTVMPackedFuncHandle;
+  return kTvmErrorNoError;
+}
+
+tvm_crt_error_t RunTimeEvaluator(tvm_function_index_t function_index, TVMValue* args,
+                                 int* type_codes, int num_args, TVMValue* ret_val,
+                                 int* ret_type_code) {
+  if (function_index != g_time_evaluator_state.function_index) {
+    return kTvmErrorTimeEvaluatorBadHandle;
+  }
+
+  // TODO(areusch): should *really* rethink needing to return doubles
+  DLContext result_byte_ctx = {kDLCPU, 0};
+  TVMByteArray* result_byte_arr = NULL;
+  tvm_crt_error_t err =
+      TVMPlatformMemoryAllocate(sizeof(TVMByteArray), result_byte_ctx, (void*)&result_byte_arr);
+  if (err != kTvmErrorNoError) {
+    goto release_and_return;
+  }
+  result_byte_arr->data = NULL;
+  size_t data_size = sizeof(double) * g_time_evaluator_state.repeat;
+  err = TVMPlatformMemoryAllocate(data_size, result_byte_ctx, (void*)&result_byte_arr->data);
+  if (err != kTvmErrorNoError) {
+    goto release_and_return;
+  }
+  result_byte_arr->size = data_size;
+  double min_repeat_seconds = ((double)g_time_evaluator_state.min_repeat_ms) / 1000;
+  double* iter = (double*)result_byte_arr->data;
+  for (int i = 0; i < g_time_evaluator_state.repeat; i++) {
+    double repeat_res_seconds = 0.0;
+    int exec_count = 0;
+    // do-while structure ensures we run even when `min_repeat_ms` isn't set (i.e., is 0).
+    do {
+      err = TVMPlatformTimerStart();
+      if (err != kTvmErrorNoError) {
+        goto release_and_return;
+      }
+
+      for (int j = 0; j < g_time_evaluator_state.number; j++) {
+        err = TVMFuncCall(g_time_evaluator_state.func_to_time, args, type_codes, num_args, ret_val,
+                          ret_type_code);
+        if (err != kTvmErrorNoError) {
+          goto release_and_return;
+        }
+      }
+      exec_count += g_time_evaluator_state.number;
+
+      double curr_res_seconds;
+      err = TVMPlatformTimerStop(&curr_res_seconds);
+      if (err != kTvmErrorNoError) {
+        goto release_and_return;
+      }
+      repeat_res_seconds += curr_res_seconds;
+    } while (repeat_res_seconds < min_repeat_seconds);
+    double mean_exec_seconds = repeat_res_seconds / exec_count;
+    *iter = mean_exec_seconds;
+    iter++;
+  }
+
+  *ret_type_code = kTVMBytes;
+  ret_val->v_handle = result_byte_arr;
+  return err;
+
+release_and_return : {
+  tvm_crt_error_t release_err =
+      TVMPlatformMemoryFree((void*)&result_byte_arr->data, result_byte_ctx);
+  if (release_err != kTvmErrorNoError) {
+    release_err = TVMPlatformMemoryFree((void*)&result_byte_arr, result_byte_ctx);
+  }
+
+  if (err == kTvmErrorNoError && release_err != kTvmErrorNoError) {
+    err = release_err;
+  }
+}
+  return err;
+}
diff --git a/src/runtime/crt/host/crt_config.h b/src/runtime/crt/host/crt_config.h
index 6891896..109abaf 100644
--- a/src/runtime/crt/host/crt_config.h
+++ b/src/runtime/crt/host/crt_config.h
@@ -43,7 +43,7 @@
 #define TVM_CRT_MAX_REGISTERED_MODULES 2
 
 /*! Size of the global function registry, in bytes. */
-#define TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES 200
+#define TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES 256
 
 /*! Maximum packet size, in bytes, including the length header. */
 #define TVM_CRT_MAX_PACKET_SIZE_BYTES 64000
diff --git a/src/runtime/crt/host/main.cc b/src/runtime/crt/host/main.cc
index ba43e84..7db17f5 100644
--- a/src/runtime/crt/host/main.cc
+++ b/src/runtime/crt/host/main.cc
@@ -68,29 +68,30 @@ tvm_crt_error_t TVMPlatformMemoryFree(void* ptr, DLContext ctx) {
   return memory_manager->Free(memory_manager, ptr, ctx);
 }
 
-high_resolution_clock::time_point g_utvm_start_time;
+steady_clock::time_point g_utvm_start_time;
 int g_utvm_timer_running = 0;
 
-int TVMPlatformTimerStart() {
+tvm_crt_error_t TVMPlatformTimerStart() {
   if (g_utvm_timer_running) {
     std::cerr << "timer already running" << std::endl;
-    return -1;
+    return kTvmErrorPlatformTimerBadState;
   }
-  g_utvm_start_time = high_resolution_clock::now();
+  g_utvm_start_time = std::chrono::steady_clock::now();
   g_utvm_timer_running = 1;
-  return 0;
+  return kTvmErrorNoError;
 }
 
-int TVMPlatformTimerStop(double* res_us) {
+tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) {
   if (!g_utvm_timer_running) {
     std::cerr << "timer not running" << std::endl;
-    return -1;
+    return kTvmErrorPlatformTimerBadState;
   }
-  auto utvm_stop_time = high_resolution_clock::now();
-  duration<double, std::micro> time_span(utvm_stop_time - g_utvm_start_time);
-  *res_us = time_span.count();
+  auto utvm_stop_time = std::chrono::steady_clock::now();
+  std::chrono::microseconds time_span =
+      std::chrono::duration_cast<std::chrono::microseconds>(utvm_stop_time - g_utvm_start_time);
+  *elapsed_time_seconds = static_cast<double>(time_span.count()) / 1e6;
   g_utvm_timer_running = 0;
-  return 0;
+  return kTvmErrorNoError;
 }
 }
 
diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc
index d02a6d9..3353c11 100644
--- a/src/runtime/graph/debug/graph_runtime_debug.cc
+++ b/src/runtime/graph/debug/graph_runtime_debug.cc
@@ -58,56 +58,106 @@ class GraphRuntimeDebug : public GraphRuntime {
   std::string RunIndividual(int number, int repeat, int min_repeat_ms) {
     // warmup run
     GraphRuntime::Run();
-    std::ostringstream os;
+    std::string tkey = module_->type_key();
     std::vector<double> time_per_op(op_execs_.size(), 0);
-    for (int i = 0; i < repeat; ++i) {
-      std::chrono::time_point<std::chrono::high_resolution_clock, std::chrono::nanoseconds> tbegin,
-          tend;
-      double duration_ms = 0.0;
-      do {
-        std::fill(time_per_op.begin(), time_per_op.end(), 0);
-        if (duration_ms > 0.0) {
-          number = static_cast<int>(std::max((min_repeat_ms / (duration_ms / number) + 1),
-                                             number * 1.618));  // 1.618 is chosen by random
-        }
-        tbegin = std::chrono::high_resolution_clock::now();
-        for (int k = 0; k < number; k++) {
-          for (size_t index = 0; index < op_execs_.size(); ++index) {
-            if (op_execs_[index]) {
-              const TVMContext& ctx = data_entry_[entry_id(index, 0)]->ctx;
-              auto op_tbegin = std::chrono::high_resolution_clock::now();
-              op_execs_[index]();
-              TVMSynchronize(ctx.device_type, ctx.device_id, nullptr);
-              auto op_tend = std::chrono::high_resolution_clock::now();
-              double op_duration =
-                  std::chrono::duration_cast<std::chrono::duration<double> >(op_tend - op_tbegin)
-                      .count();
-              time_per_op[index] += op_duration * 1e6;  // us
+    if (tkey == "rpc") {
+      // RPC modules rely on remote timing which implements the logic from the else branch.
+      for (size_t index = 0; index < op_execs_.size(); ++index) {
+        time_per_op[index] += RunOpRPC(index, number, repeat, min_repeat_ms);
+      }
+    } else {
+      for (int i = 0; i < repeat; ++i) {
+        std::chrono::time_point<std::chrono::high_resolution_clock, std::chrono::nanoseconds>
+            tbegin, tend;
+        double duration_ms = 0.0;
+        do {
+          std::fill(time_per_op.begin(), time_per_op.end(), 0);
+          if (duration_ms > 0.0) {
+            number = static_cast<int>(std::max((min_repeat_ms / (duration_ms / number) + 1),
+                                               number * 1.618));  // 1.618 is chosen by random
+          }
+          tbegin = std::chrono::high_resolution_clock::now();
+          for (int k = 0; k < number; k++) {
+            for (size_t index = 0; index < op_execs_.size(); ++index) {
+              if (op_execs_[index]) {
+                time_per_op[index] += RunOpHost(index);
+              }
             }
           }
-        }
-        tend = std::chrono::high_resolution_clock::now();
-        duration_ms =
-            std::chrono::duration_cast<std::chrono::duration<double> >(tend - tbegin).count() *
-            1000;
-      } while (duration_ms < min_repeat_ms);
-
-      LOG(INFO) << "Iteration: " << i;
-      int op = 0;
-      for (size_t index = 0; index < time_per_op.size(); index++) {
-        if (op_execs_[index]) {
-          time_per_op[index] /= number;
-          LOG(INFO) << "Op #" << op++ << " " << GetNodeName(index) << ": " << time_per_op[index]
-                    << " us/iter";
+          tend = std::chrono::high_resolution_clock::now();
+          duration_ms =
+              std::chrono::duration_cast<std::chrono::duration<double> >(tend - tbegin).count() *
+              1000;
+        } while (duration_ms < min_repeat_ms);
+
+        LOG(INFO) << "Iteration: " << i;
+        int op = 0;
+        for (size_t index = 0; index < time_per_op.size(); index++) {
+          if (op_execs_[index]) {
+            time_per_op[index] /= number;
+            LOG(INFO) << "Op #" << op++ << " " << GetNodeName(index) << ": " << time_per_op[index]
+                      << " us/iter";
+          }
         }
       }
     }
+
+    std::ostringstream os;
     for (size_t index = 0; index < time_per_op.size(); index++) {
       os << time_per_op[index] << ",";
     }
     return os.str();
   }
 
+  double RunOpRPC(int index, int number, int repeat, int min_repeat_ms) {
+    const TVMContext& ctx = data_entry_[entry_id(index, 0)]->ctx;
+    TVMOpParam param = nodes_[index].param;
+    std::string name = param.func_name;
+    uint32_t num_inputs = param.num_inputs;
+    uint32_t num_outputs = param.num_outputs;
+
+    PackedFunc time_eval = runtime::Registry::Get("runtime.RPCTimeEvaluator")
+                               ->
+                               operator()(module_, name, static_cast<int>(ctx.device_type),
+                                          ctx.device_id, number, repeat, min_repeat_ms, "");
+
+    int num_flat_args = num_inputs + num_outputs;
+    std::unique_ptr<TVMValue> values(new TVMValue[num_flat_args]);
+    std::unique_ptr<int> type_codes(new int[num_flat_args]);
+    TVMArgsSetter setter(values.get(), type_codes.get());
+    int offs = 0;
+    const auto& inode = nodes_[index];
+    for (const auto& e : inode.inputs) {
+      uint32_t eid = this->entry_id(e);
+      DLTensor* arg = const_cast<DLTensor*>(data_entry_[eid].operator->());
+      setter(offs, arg);
+      offs++;
+    }
+    for (uint32_t i = 0; i < num_outputs; ++i) {
+      uint32_t eid = this->entry_id(index, i);
+      DLTensor* arg = const_cast<DLTensor*>(data_entry_[eid].operator->());
+      setter(offs, arg);
+      offs++;
+    }
+    TVMRetValue rv;
+    time_eval.CallPacked(TVMArgs(values.get(), type_codes.get(), num_flat_args), &rv);
+    std::string results = rv.operator std::string();
+    const double* results_arr = reinterpret_cast<const double*>(results.data());
+    LOG(INFO) << "Got op timing: " << results_arr[0];
+    return results_arr[0];
+  }
+
+  double RunOpHost(int index) {
+    auto op_tbegin = std::chrono::high_resolution_clock::now();
+    op_execs_[index]();
+    const TVMContext& ctx = data_entry_[entry_id(index, 0)]->ctx;
+    TVMSynchronize(ctx.device_type, ctx.device_id, nullptr);
+    auto op_tend = std::chrono::high_resolution_clock::now();
+    double op_duration =
+        std::chrono::duration_cast<std::chrono::duration<double> >(op_tend - op_tbegin).count();
+    return op_duration;
+  }
+
   /*!
    * \brief Run each operation and get the output.
    * \param index The index of op which needs to be returned.
diff --git a/src/runtime/minrpc/minrpc_server.h b/src/runtime/minrpc/minrpc_server.h
index 62f7236..d28e0c3 100644
--- a/src/runtime/minrpc/minrpc_server.h
+++ b/src/runtime/minrpc/minrpc_server.h
@@ -156,6 +156,7 @@ class MinRPCServer {
       } else if (rv_tcode == kTVMBytes) {
         ret_tcode[1] = kTVMBytes;
         this->ReturnPackedSeq(ret_value, ret_tcode, 2);
+        TVMByteArrayFree(reinterpret_cast<TVMByteArray*>(ret_value[1].v_handle));  // NOLINT(*)
       } else if (rv_tcode == kTVMPackedFuncHandle || rv_tcode == kTVMModuleHandle) {
         ret_tcode[1] = kTVMOpaqueHandle;
         this->ReturnPackedSeq(ret_value, ret_tcode, 2);
diff --git a/tests/micro/qemu/test_zephyr.py b/tests/micro/qemu/test_zephyr.py
index 3e73307..1c38c2d 100644
--- a/tests/micro/qemu/test_zephyr.py
+++ b/tests/micro/qemu/test_zephyr.py
@@ -143,6 +143,33 @@ def test_compile_runtime(platform):
         test_basic_add(sess)
 
 
+def test_platform_timer(platform):
+    """Test compiling the on-device runtime."""
+
+    model, zephyr_board = PLATFORMS[platform]
+
+    # NOTE: run test in a nested function so cPython will delete arrays before closing the session.
+    def test_basic_add(sess):
+        A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), ctx=sess.context)
+        assert (A_data.asnumpy() == np.array([2, 3])).all()
+        B_data = tvm.nd.array(np.array([4], dtype="int8"), ctx=sess.context)
+        assert (B_data.asnumpy() == np.array([4])).all()
+        C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), ctx=sess.context)
+        assert (C_data.asnumpy() == np.array([0, 0])).all()
+
+        system_lib = sess.get_system_lib()
+        time_eval_f = system_lib.time_evaluator(
+            "add", sess.context, number=20, repeat=3, min_repeat_ms=40
+        )
+        result = time_eval_f(A_data, B_data, C_data)
+        assert (C_data.asnumpy() == np.array([6, 7])).all()
+        assert result.mean > 0
+        assert len(result.results) == 3
+
+    with _make_add_sess(model, zephyr_board) as sess:
+        test_basic_add(sess)
+
+
 def test_relay(platform):
     """Testing a simple relay graph"""
     model, zephyr_board = PLATFORMS[platform]
diff --git a/tests/micro/qemu/zephyr-runtime/src/main.c b/tests/micro/qemu/zephyr-runtime/src/main.c
index 86b2b27..9d10504 100644
--- a/tests/micro/qemu/zephyr-runtime/src/main.c
+++ b/tests/micro/qemu/zephyr-runtime/src/main.c
@@ -99,10 +99,10 @@ int g_utvm_timer_running = 0;
 static struct device* led_pin;
 #endif  // CONFIG_LED
 
-int TVMPlatformTimerStart() {
+tvm_crt_error_t TVMPlatformTimerStart() {
   if (g_utvm_timer_running) {
     TVMLogf("timer already running");
-    return -1;
+    return kTvmErrorPlatformTimerBadState;
   }
 
 #ifdef CONFIG_LED
@@ -111,13 +111,13 @@ int TVMPlatformTimerStart() {
   k_timer_start(&g_utvm_timer, TIME_TIL_EXPIRY, TIME_TIL_EXPIRY);
   g_utvm_start_time = k_cycle_get_32();
   g_utvm_timer_running = 1;
-  return 0;
+  return kTvmErrorNoError;
 }
 
-int TVMPlatformTimerStop(double* res_us) {
+tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) {
   if (!g_utvm_timer_running) {
     TVMLogf("timer not running");
-    return -1;
+    return kTvmErrorPlatformTimerBadState;
   }
 
   uint32_t stop_time = k_cycle_get_32();
@@ -134,7 +134,7 @@ int TVMPlatformTimerStop(double* res_us) {
   }
 
   uint32_t ns_spent = (uint32_t)k_cyc_to_ns_floor64(cycles_spent);
-  double hw_clock_res_us = ns_spent / 1000.0;
+  double hw_clock_elapsed_seconds = ns_spent / 1e9;
 
   // need to grab time remaining *before* stopping. when stopped, this function
   // always returns 0.
@@ -152,13 +152,13 @@ int TVMPlatformTimerStop(double* res_us) {
   // if we approach the limits of the HW clock datatype (uint32_t), use the
   // coarse-grained timer result instead
   if (approx_num_cycles > (0.5 * (~((uint32_t)0)))) {
-    *res_us = timer_res_ms * 1000.0;
+    *elapsed_time_seconds = timer_res_ms / 1e3;
   } else {
-    *res_us = hw_clock_res_us;
+    *elapsed_time_seconds = hw_clock_elapsed_seconds;
   }
 
   g_utvm_timer_running = 0;
-  return 0;
+  return kTvmErrorNoError;
 }
 
 #define RING_BUF_SIZE 512
diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py
index 1d84d4e..659d190 100644
--- a/tests/python/unittest/test_crt.py
+++ b/tests/python/unittest/test_crt.py
@@ -25,8 +25,10 @@ import subprocess
 import textwrap
 
 import numpy as np
+import pytest
 
 import tvm
+import tvm.testing
 import tvm.relay
 import tvm.testing
 
@@ -172,8 +174,27 @@ def test_std_math_functions():
         np.testing.assert_allclose(B_data.asnumpy(), np.array([7.389056, 20.085537]))
 
 
+@tvm.testing.requires_micro
+def test_platform_timer():
+    """Verify the platform timer can be used to time remote functions."""
+    import tvm.micro
+
+    workspace = tvm.micro.Workspace()
+    A = tvm.te.placeholder((2,), dtype="float32", name="A")
+    B = tvm.te.compute(A.shape, lambda i: tvm.te.exp(A[i]), name="B")
+    s = tvm.te.create_schedule(B.op)
+
+    with _make_sess_from_op(workspace, "myexpf", s, [A, B]) as sess:
+        A_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), ctx=sess.context)
+        B_data = tvm.nd.array(np.array([2.0, 3.0], dtype="float32"), ctx=sess.context)
+        lib = sess.get_system_lib()
+        time_eval_f = lib.time_evaluator(
+            "myexpf", sess.context, number=2000, repeat=3, min_repeat_ms=40
+        )
+        result = time_eval_f(A_data, B_data)
+        assert result.mean > 0
+        assert len(result.results) == 3
+
+
 if __name__ == "__main__":
-    test_compile_runtime()
-    test_reset()
-    test_graph_runtime()
-    test_std_math_functions()
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))