You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2017/12/06 16:48:58 UTC

[GitHub] cjolivier01 closed pull request #8803: Small first iteration for profiler: use nonblocking queue

cjolivier01 closed pull request #8803: Small first iteration for profiler: use nonblocking queue
URL: https://github.com/apache/incubator-mxnet/pull/8803
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (GitHub does not display the original diff once a fork-based pull request is merged):

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 50f60089cf..4febbe4016 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -38,7 +38,7 @@ mxnet_option(USE_MKL_EXPERIMENTAL "Use experimental MKL (if MKL enabled and foun
 mxnet_option(USE_OPERATOR_TUNING  "Enable auto-tuning of operators" ON AND NOT MSVC)
 mxnet_option(USE_GPERFTOOLS       "Build with GPerfTools support (if found)" ON)
 mxnet_option(USE_JEMALLOC         "Build with Jemalloc support"   ON)
-mxnet_option(USE_PROFILER         "Build with Profiler support"   OFF)
+mxnet_option(USE_PROFILER         "Build with Profiler support"   ON)
 mxnet_option(USE_DIST_KVSTORE     "Build with DIST_KVSTORE support" OFF)
 mxnet_option(USE_PLUGINS_WARPCTC  "Use WARPCTC Plugins" OFF)
 mxnet_option(USE_PLUGIN_CAFFE     "Use Caffe Plugin" OFF)
diff --git a/amalgamation/amalgamation.py b/amalgamation/amalgamation.py
index 2aba8f4bdc..b378817e14 100644
--- a/amalgamation/amalgamation.py
+++ b/amalgamation/amalgamation.py
@@ -17,6 +17,7 @@
 
 import sys
 import os.path, re, StringIO
+import platform
 
 blacklist = [
     'Windows.h', 'cublas_v2.h', 'cuda/tensor_gpu-inl.cuh',
@@ -26,7 +27,8 @@
     'malloc.h', 'mkl.h', 'mkl_cblas.h', 'mkl_vsl.h', 'mkl_vsl_functions.h',
     'nvml.h', 'opencv2/opencv.hpp', 'sys/stat.h', 'sys/types.h', 'cuda.h', 'cuda_fp16.h',
     'omp.h', 'execinfo.h', 'packet/sse-inl.h', 'emmintrin.h', 'thrust/device_vector.h',
-    'cusolverDn.h'
+    'cusolverDn.h', 'internal/concurrentqueue_internal_debug.h', 'relacy/relacy_std.hpp',
+    'relacy_shims.h'
     ]
 
 minimum = int(sys.argv[6]) if len(sys.argv) > 5 else 0
@@ -36,6 +38,12 @@
 if minimum != 0:
     blacklist.append('linalg.h')
 
+if platform.system() != 'Darwin':
+  blacklist.append('TargetConditionals.h')
+
+if platform.system() != 'Windows':
+  blacklist.append('windows.h')
+
 def pprint(lst):
     for item in lst:
         print item
diff --git a/dmlc-core b/dmlc-core
index 87b7ffa59e..ebbda66217 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit 87b7ffa59eb78f753073ac56f5f60e46d930b93c
+Subproject commit ebbda66217efd5f7e16ace68cae0128a7117d081
diff --git a/src/engine/profiler.cc b/src/engine/profiler.cc
index 44ad138277..13f8cca37b 100644
--- a/src/engine/profiler.cc
+++ b/src/engine/profiler.cc
@@ -24,12 +24,8 @@
  */
 #include <dmlc/base.h>
 #include <dmlc/logging.h>
+#include <dmlc/omp.h>
 #include <mxnet/base.h>
-#include <set>
-#include <map>
-#include <mutex>
-#include <chrono>
-#include <iostream>
 #include <fstream>
 #include <thread>
 #include "./profiler.h"
@@ -44,7 +40,6 @@
 
 namespace mxnet {
 namespace engine {
-const int INITIAL_SIZE = 1024;
 
 Profiler::Profiler()
   : state_(kNotRunning), enable_output_(false), filename_("profile.json") {
@@ -59,14 +54,13 @@ Profiler::Profiler()
 #endif
 
   this->profile_stat = new DevStat[cpu_num_ + gpu_num_ + 1];
-  this->profile_stat->opr_exec_stats.reserve(INITIAL_SIZE);
   for (unsigned int i = 0; i < cpu_num_; ++i) {
-    profile_stat[i].dev_name = "cpu/" + std::to_string(i);
+    profile_stat[i].dev_name_ = "cpu/" + std::to_string(i);
   }
   for (unsigned int i = 0; i < gpu_num_; ++i) {
-    profile_stat[cpu_num_ + i].dev_name = "gpu/" + std::to_string(i);
+    profile_stat[cpu_num_ + i].dev_name_ = "gpu/" + std::to_string(i);
   }
-  profile_stat[cpu_num_ + gpu_num_].dev_name = "cpu pinned/";
+  profile_stat[cpu_num_ + gpu_num_].dev_name_ = "cpu pinned/";
 
   mode_ = (ProfilerMode)dmlc::GetEnv("MXNET_PROFILER_MODE", static_cast<int>(kOnlySymbolic));
   if (dmlc::GetEnv("MXNET_PROFILER_AUTOSTART", 0)) {
@@ -99,7 +93,7 @@ void Profiler::SetConfig(ProfilerMode mode, std::string output_filename) {
 }
 
 OprExecStat *Profiler::AddOprStat(int dev_type, uint32_t dev_id) {
-  OprExecStat* opr_stat = new OprExecStat;
+  std::unique_ptr<OprExecStat> opr_stat(new OprExecStat);
   opr_stat->dev_type = dev_type;
   opr_stat->dev_id   = dev_id;
   opr_stat->opr_name[sizeof(opr_stat->opr_name)-1] = '\0';
@@ -116,16 +110,13 @@ OprExecStat *Profiler::AddOprStat(int dev_type, uint32_t dev_id) {
       idx = cpu_num_ + gpu_num_;
       break;
     default:
-      LOG(FATAL) << "Unkown dev_type";
+      LOG(FATAL) << "Unknown dev_type: " << dev_type;
       return NULL;
   }
 
   DevStat& dev_stat = profile_stat[idx];
-  {
-    std::lock_guard<std::mutex> lock{dev_stat.m_};
-    dev_stat.opr_exec_stats.push_back(opr_stat);
-  }
-  return opr_stat;
+  dev_stat.opr_exec_stats_->enqueue(opr_stat.get());
+  return opr_stat.release();
 }
 
 void Profiler::EmitPid(std::ostream *os, const std::string& name, uint32_t pid) {
@@ -167,19 +158,17 @@ void Profiler::DumpProfile() {
 
   for (uint32_t i = 0; i < dev_num; ++i) {
     const DevStat &d = profile_stat[i];
-    this->EmitPid(&file, d.dev_name, i);
+    this->EmitPid(&file, d.dev_name_, i);
     file << ",\n";
   }
 
   bool first_flag = true;
   for (uint32_t i = 0; i < dev_num; ++i) {
     DevStat &d = profile_stat[i];
-    std::lock_guard<std::mutex> lock(d.m_);
-    uint32_t opr_num = d.opr_exec_stats.size();
-
-    for (uint32_t j = 0; j < opr_num; ++j) {
-      const OprExecStat* opr_stat = d.opr_exec_stats[j];
-
+    OprExecStat *_opr_stat;
+    while (d.opr_exec_stats_->try_dequeue(_opr_stat)) {
+      CHECK_NOTNULL(_opr_stat);
+      std::unique_ptr<OprExecStat> opr_stat(_opr_stat);  // manage lifecycle
       uint32_t pid = i;
       uint32_t tid = opr_stat->thread_id;
 
@@ -190,10 +179,10 @@ void Profiler::DumpProfile() {
       }
       file << std::endl;
       this->EmitEvent(&file, opr_stat->opr_name, "category", "B",
-            opr_stat->opr_start_rel_micros, pid, tid);
+                      opr_stat->opr_start_rel_micros, pid, tid);
       file << ",\n";
       this->EmitEvent(&file, opr_stat->opr_name, "category", "E",
-            opr_stat->opr_end_rel_micros, pid, tid);
+                      opr_stat->opr_end_rel_micros, pid, tid);
     }
   }
 
diff --git a/src/engine/profiler.h b/src/engine/profiler.h
index dbbc773351..ebd942036c 100644
--- a/src/engine/profiler.h
+++ b/src/engine/profiler.h
@@ -25,6 +25,7 @@
 #ifndef MXNET_ENGINE_PROFILER_H_
 #define MXNET_ENGINE_PROFILER_H_
 
+#include <dmlc/concurrentqueue.h>
 #include <vector>
 #include <string>
 #include <mutex>
@@ -65,11 +66,24 @@ struct OprExecStat {
  */
 struct DevStat {
   /*! \brief device name */
-  std::string dev_name;
+  std::string dev_name_;
   /*! \brief operation execution statistics on this device */
-  std::vector<OprExecStat*> opr_exec_stats;
-  /*! \brief internal mutex of the execution state */
-  std::mutex m_;
+  std::shared_ptr<dmlc::moodycamel::ConcurrentQueue<OprExecStat *>> opr_exec_stats_ =
+    std::make_shared<dmlc::moodycamel::ConcurrentQueue<OprExecStat *>>();
+
+  /*!
+   * \brief Destructor, clean up allocated objects
+   *        TODO(cjolivier01) Investigate queueing unique_ptr<>'s if it won't hurt performance
+   */
+  ~DevStat() {
+    std::shared_ptr<dmlc::moodycamel::ConcurrentQueue<OprExecStat *>> es = opr_exec_stats_;
+    if (es) {
+      OprExecStat *opr_stat = nullptr;
+      while (es->try_dequeue(opr_stat)) {
+        delete opr_stat;
+      }
+    }
+  }
 };
 
 
diff --git a/tests/python/unittest/test_profiler.py b/tests/python/unittest/test_profiler.py
index 724ed3a387..78baf4a183 100644
--- a/tests/python/unittest/test_profiler.py
+++ b/tests/python/unittest/test_profiler.py
@@ -19,14 +19,12 @@
 import mxnet as mx
 from mxnet import profiler
 import time
-import numpy as np
 
 def test_profiler():
     profile_filename = "test_profile.json"
-    iter_num = 100
-    begin_profiling_iter = 50
-    end_profiling_iter = 50
-
+    iter_num = 5
+    begin_profiling_iter = 2
+    end_profiling_iter = 4
 
     profiler.profiler_set_config(mode='symbolic', filename=profile_filename)
     print('profile file save to {0}'.format(profile_filename))
@@ -43,9 +41,9 @@ def test_profiler():
     a.copyto(executor.arg_dict['A'])
     b.copyto(executor.arg_dict['B'])
 
-    flag = False
     print("execution begin")
     for i in range(iter_num):
+        print("Iteration {}/{}".format(i + 1, iter_num))
         if i == begin_profiling_iter:
             t0 = time.clock()
             profiler.profiler_set_state('run')
@@ -59,6 +57,7 @@ def test_profiler():
     duration = t1 - t0
     print('duration: {0}s'.format(duration))
     print('          {0}ms/operator'.format(duration*1000/iter_num))
+    profiler.dump_profile()
 
 if __name__ == '__main__':
     test_profiler()


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services