You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by ma...@apache.org on 2022/05/10 20:14:22 UTC
[tvm] branch main updated: [OpenCL] Change of OpenCL profiling logic (#11180)

This is an automated email from the ASF dual-hosted git repository.

masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 0f6abea1ca [OpenCL] Change of OpenCL profiling logic (#11180)
0f6abea1ca is described below

commit 0f6abea1cafc664af0cb97f348906ae5715a5f51
Author: Kirill Snezhko <44...@users.noreply.github.com>
AuthorDate: Tue May 10 23:14:16 2022 +0300

    [OpenCL] Change of OpenCL profiling logic (#11180)
    
    * Enable profiling only when it is used explicitly
    
    * Change logic of clCommandQueue create/destroy
    
    * Update comments
    
    * Linter fix
    
    * Refactor queue create
    
    * Move queue recreation logic to function
    
    * Replace profiling flag by the queue info request
    
    * Enhance readability
    
    * Fix linter errors
---
 src/runtime/opencl/opencl_common.h      | 57 ++++++++++++++++++++++++++++-----
 src/runtime/opencl/opencl_device_api.cc |  5 ---
 src/runtime/opencl/opencl_module.cc     | 19 +++++------
 3 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index 18061a7aee..c2905b4327 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -274,6 +274,16 @@ class OpenCLWorkspace : public DeviceAPI {
         << "Invalid OpenCL device_id=" << dev.device_id;
     return events[dev.device_id];
   }
+  // is current clCommandQueue in profiling mode
+  bool IsProfiling(Device dev) {
+    cl_command_queue queue = GetQueue(dev);
+    cl_command_queue_properties prop;
+
+    OPENCL_CALL(clGetCommandQueueInfo(queue, CL_QUEUE_PROPERTIES,
+                                      sizeof(cl_command_queue_properties), &prop, nullptr));
+
+    return prop & CL_QUEUE_PROFILING_ENABLE;
+  }
 
   // override device API
   void SetDevice(Device dev) final;
@@ -422,23 +432,32 @@ class OpenCLTimerNode : public TimerNode {
   virtual void Start() {
     cl::OpenCLWorkspace::Global()->GetEventQueue(dev_).clear();
     this->duration = 0;
+    // Very first call of Start() leads to the recreation of
+    // OpenCL command queue in profiling mode. This allows to run profile after inference.
+    recreateCommandQueue();
   }
   // Timer stop
   virtual void Stop() {
     std::vector<cl_event> evt_queue = cl::OpenCLWorkspace::Global()->GetEventQueue(dev_);
     cl_ulong start, end;
-    OPENCL_CALL(clWaitForEvents(1, &(cl::OpenCLWorkspace::Global()->GetEventQueue(dev_).back())));
-    for (auto& kevt : evt_queue) {
-      OPENCL_CALL(clGetEventProfilingInfo(kevt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
-                                          &start, nullptr));
-      OPENCL_CALL(
-          clGetEventProfilingInfo(kevt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, nullptr));
-      this->duration += (end - start);
+    if (cl::OpenCLWorkspace::Global()->GetEventQueue(dev_).size() > 0) {
+      OPENCL_CALL(clWaitForEvents(1, &(cl::OpenCLWorkspace::Global()->GetEventQueue(dev_).back())));
+      for (auto& kevt : evt_queue) {
+        OPENCL_CALL(clGetEventProfilingInfo(kevt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong),
+                                            &start, nullptr));
+        OPENCL_CALL(clGetEventProfilingInfo(kevt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end,
+                                            nullptr));
+        this->duration += (end - start);
+      }
     }
   }
   virtual int64_t SyncAndGetElapsedNanos() { return this->duration; }
   // destructor
-  virtual ~OpenCLTimerNode() {}
+  virtual ~OpenCLTimerNode() {
+    // Profiling session ends, recreate clCommandQueue in non-profiling mode
+    // This will disable collection of cl_events in case of executing inference after profile
+    recreateCommandQueue();
+  }
   // constructor
   OpenCLTimerNode() {}
   explicit OpenCLTimerNode(Device dev) : dev_(dev) {}
@@ -449,6 +468,28 @@ class OpenCLTimerNode : public TimerNode {
  private:
   int64_t duration;
   Device dev_;
+
+  void recreateCommandQueue() {
+    cl_command_queue_properties prop;
+    if (!cl::OpenCLWorkspace::Global()->IsProfiling(dev_)) {
+      prop = CL_QUEUE_PROFILING_ENABLE;
+    } else {
+      prop = 0;
+    }
+
+    auto queue = cl::OpenCLWorkspace::Global()->GetQueue(dev_);
+
+    OPENCL_CALL(clFlush(queue));
+    OPENCL_CALL(clFinish(queue));
+    OPENCL_CALL(clReleaseCommandQueue(queue));
+
+    cl_int err_code;
+    cl_device_id did = cl::OpenCLWorkspace::Global()->devices[dev_.device_id];
+    auto profiling_queue =
+        clCreateCommandQueue(cl::OpenCLWorkspace::Global()->context, did, prop, &err_code);
+    OPENCL_CHECK_ERROR(err_code);
+    cl::OpenCLWorkspace::Global()->queues[dev_.device_id] = profiling_queue;
+  }
 };
 }  // namespace runtime
 }  // namespace tvm
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index c352716042..80b95a6ebf 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -426,12 +426,7 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic
   ICHECK_EQ(this->queues.size(), 0U);
   for (size_t i = 0; i < this->devices.size(); ++i) {
     cl_device_id did = this->devices[i];
-#ifdef USE_PROFILER
-    this->queues.push_back(
-        clCreateCommandQueue(this->context, did, CL_QUEUE_PROFILING_ENABLE, &err_code));
-#else
     this->queues.push_back(clCreateCommandQueue(this->context, did, 0, &err_code));
-#endif
     OPENCL_CHECK_ERROR(err_code);
   }
   this->events.resize(this->devices.size());
diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc
index e08c6070bc..9ae80d59d5 100644
--- a/src/runtime/opencl/opencl_module.cc
+++ b/src/runtime/opencl/opencl_module.cc
@@ -79,15 +79,16 @@ class OpenCLWrappedFunc {
       wl.work_size[i] *= wl.work_size[i + 3];
     }
     // launch kernel
-#ifdef USE_PROFILER
-    w_->GetEventQueue(t->device).resize(w_->GetEventQueue(t->device).size() + 1);
-    OPENCL_CALL(clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr, wl.work_size,
-                                       wl.work_size + 3, 0, nullptr,
-                                       &(w_->GetEventQueue(t->device).back())));
-#else
-    OPENCL_CALL(clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr, wl.work_size,
-                                       wl.work_size + 3, 0, nullptr, nullptr));
-#endif
+
+    if (w_->IsProfiling(t->device)) {
+      w_->GetEventQueue(t->device).resize(w_->GetEventQueue(t->device).size() + 1);
+      OPENCL_CALL(clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr, wl.work_size,
+                                         wl.work_size + 3, 0, nullptr,
+                                         &(w_->GetEventQueue(t->device).back())));
+    } else {
+      OPENCL_CALL(clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr, wl.work_size,
+                                         wl.work_size + 3, 0, nullptr, nullptr));
+    }
   }
 
  private: