Posted to dev@singa.apache.org by GitBox <gi...@apache.org> on 2020/06/06 13:19:52 UTC

[GitHub] [singa] XJDKC commented on a change in pull request #716: SINGA-510 Distributed Training Time Profiling

XJDKC commented on a change in pull request #716:
URL: https://github.com/apache/singa/pull/716#discussion_r436263014



##########
File path: include/singa/core/common.h
##########
@@ -100,13 +100,21 @@ typedef struct _Context {
   std::mt19937 random_generator;
 #ifdef USE_CUDA
   cublasHandle_t cublas_handle;
-  cudaStream_t stream;
-  curandGenerator_t curand_generator;
+  cudaStream_t stream; 
+  curandGenerator_t curand_generator; 
+
 #ifdef USE_CUDNN
   cudnnHandle_t cudnn_handle;
 #endif
 #endif  // USE_CUDA
 
+#ifdef USE_DIST

Review comment:
       I think this part should be wrapped in USE_CUDA. Otherwise we may create these streams even when CUDA is not in use.
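       Roughly what I have in mind (a sketch only: the members added under
       USE_DIST are not visible in this hunk, so the stream fields below are
       illustrative, named after the ctx->s / ctx->c1 / ctx->c2 that
       communicator.cc now uses):

           #ifdef USE_CUDA
             cublasHandle_t cublas_handle;
             cudaStream_t stream;
             curandGenerator_t curand_generator;
           #ifdef USE_CUDNN
             cudnnHandle_t cudnn_handle;
           #endif  // USE_CUDNN
           #ifdef USE_DIST
             // cudaStream_t only exists when building with CUDA, so nest the
             // USE_DIST block inside the USE_CUDA guard.
             cudaStream_t s;   // illustrative
             cudaStream_t c1;  // illustrative
             cudaStream_t c2;  // illustrative
           #endif  // USE_DIST
           #endif  // USE_CUDA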
   

##########
File path: src/io/communicator.cc
##########
@@ -179,14 +175,14 @@ void Communicator::wait() {
   device_->Exec(
       [this](Context *ctx) mutable {
         // synchronizing on all the CUDA streams used by communicator
-        CUDA_CHECK(cudaEventRecord(event, s));
-        CUDA_CHECK(cudaStreamWaitEvent(NULL, event, 0));
-        CUDA_CHECK(cudaEventRecord(event, c1));
-        CUDA_CHECK(cudaStreamWaitEvent(NULL, event, 0));
-        CUDA_CHECK(cudaEventRecord(event, c2));
-        CUDA_CHECK(cudaStreamWaitEvent(NULL, event, 0));
+        CUDA_CHECK(cudaEventRecord(event, ctx->s));
+        CUDA_CHECK(cudaStreamWaitEvent(ctx->stream, event, 0));
+        CUDA_CHECK(cudaEventRecord(event, ctx->c1));
+        CUDA_CHECK(cudaStreamWaitEvent(ctx->stream, event, 0));
+        CUDA_CHECK(cudaEventRecord(event, ctx->c2));
+        CUDA_CHECK(cudaStreamWaitEvent(ctx->stream, event, 0));
       },
-      blocks_, blocks_, "wait");
+      blocks_, blocks_, "Sync");

Review comment:
       The operator name is a bit confusing: "Sync" does not say which streams are being synchronized.
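       For example, a name that says what the op does might read better
       (purely a suggestion, "WaitCommStreams" is hypothetical):

           blocks_, blocks_, "WaitCommStreams");  // ctx->stream waits on s, c1, c2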

##########
File path: src/core/device/cuda_gpu.cc
##########
@@ -120,9 +132,41 @@ void CudaGPU::TimeProfilingDoExec(function<void(Context*)>&& fn, int executor,
   cudaEventCreate(&(node->start_));
   cudaEventCreate(&(node->end_));
 
+#ifdef USE_DIST

Review comment:
       Do we have a better scheme than guarding this with USE_DIST?
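       One possible direction, just as a sketch (dist_enabled_ and the
       dist_start_/dist_end_ fields are hypothetical, not from this PR):

           cudaEventCreate(&(node->start_));
           cudaEventCreate(&(node->end_));
           // Hypothetical: gate the distributed case at runtime instead of with
           // a preprocessor guard, so the scheduler stays free of USE_DIST.
           if (dist_enabled_) {  // e.g. set to true when a Communicator is attached
             cudaEventCreate(&(node->dist_start_));
             cudaEventCreate(&(node->dist_end_));
           }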

##########
File path: include/singa/io/communicator.h
##########
@@ -98,16 +98,16 @@ class Communicator {
   void generateBlocks(Tensor &t);
   void generateBlocks(std::vector<Tensor> &t);
   void allReduce(int size, void *sendbuff, void *recvbuff,
-                 ncclDataType_t ncclType);
+                 ncclDataType_t ncclType, Context *ctx);

Review comment:
       Remember to delete the streams created in the communicator.
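       A sketch of the cleanup (assuming the streams and the event are still
       members of Communicator; if this PR moves them into the Context, the
       same calls belong wherever the Context is torn down):

           Communicator::~Communicator() {
             // Streams and events created via cudaStreamCreate/cudaEventCreate
             // are not released automatically; destroy them explicitly.
             CUDA_CHECK(cudaEventDestroy(event));
             CUDA_CHECK(cudaStreamDestroy(s));
             CUDA_CHECK(cudaStreamDestroy(c1));
             CUDA_CHECK(cudaStreamDestroy(c2));
           }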

##########
File path: src/io/communicator.cc
##########
@@ -134,7 +131,6 @@ void Communicator::sparsInit() {
   CUDA_CHECK(cudaMalloc(&xInd, (int)(sizeof(int) * maxSize)));
   CUDA_CHECK(cudaMalloc(&xVal, (int)(sizeof(float) * maxSize)));
   CUSPARSE_CHECK(cusparseCreate(&cusparse_handle));
-  CUSPARSE_CHECK(cusparseSetStream(cusparse_handle, c2));

Review comment:
       Should this line be deleted?
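       If c2 has only moved into the Context, the cuSPARSE handle probably
       still needs to be bound to it, e.g. (assuming sparsInit can reach a
       ctx the way the other methods now do):

           CUSPARSE_CHECK(cusparseSetStream(cusparse_handle, ctx->c2));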



