You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by zh...@apache.org on 2016/12/22 02:34:39 UTC
incubator-singa git commit: SINGA-287 - Add memory size check for cudnn convolution

Repository: incubator-singa
Updated Branches:
  refs/heads/master 4dfee5208 -> 458b0f6a8


SINGA-287 - Add memory size check for cudnn convolution

Add a warning if the cuDNN convolution workspace size is too large (>512MB);
add a size check when allocating memory for a tensor to make sure the size is >= 0.


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/458b0f6a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/458b0f6a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/458b0f6a

Branch: refs/heads/master
Commit: 458b0f6a8cb7a05081e1908465c552ba10d2b23f
Parents: 4dfee52
Author: Wei Wang <wa...@gmail.com>
Authored: Tue Dec 20 17:55:50 2016 +0800
Committer: Wei Wang <wa...@gmail.com>
Committed: Wed Dec 21 18:14:15 2016 +0800

----------------------------------------------------------------------
 python/singa/data.py                 | 6 ++++--
 src/core/device/device.cc            | 2 ++
 src/core/tensor/tensor.cc            | 2 +-
 src/model/layer/cudnn_convolution.cc | 5 +++++
 4 files changed, 12 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/458b0f6a/python/singa/data.py
----------------------------------------------------------------------
diff --git a/python/singa/data.py b/python/singa/data.py
index 4fffd92..3a99ad3 100644
--- a/python/singa/data.py
+++ b/python/singa/data.py
@@ -112,9 +112,9 @@ class ImageBatchIter:
             img_label = int(item[1])
             img_list.append((img_label, img_path))
         index = 0  # index for the image
+        if self.shuffle:
+            random.shuffle(img_list)
         while not self.stop:
-            if index == 0 and self.shuffle:
-                random.shuffle(img_list)
             if not self.queue.full():
                 x = []
                 y = np.empty(self.batch_size, dtype=np.int32)
@@ -134,6 +134,8 @@ class ImageBatchIter:
                     index += 1
                     if index == self.num_samples:
                         index = 0  # reset to the first image
+                        if self.shuffle:
+                            random.shuffle(img_list)
                 # enqueue one mini-batch
                 self.queue.put((np.asarray(x), y))
             else:

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/458b0f6a/src/core/device/device.cc
----------------------------------------------------------------------
diff --git a/src/core/device/device.cc b/src/core/device/device.cc
index 0220df0..cda1b9f 100644
--- a/src/core/device/device.cc
+++ b/src/core/device/device.cc
@@ -33,6 +33,8 @@ void Device::Exec(function<void(Context*)>&& fn, const vector<Block*> read_block
 
 // TODO(wangwei) get Block from the memory manager
 Block* Device::NewBlock(int size) {
+  CHECK_GE(size, 0) << "size is negative, could be caused by the type cast "
+    << "from size_t to int. In that case, the size is too large.";
   if (size > 0) {
     void* ptr = Malloc(size);
     return new Block(ptr, size);

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/458b0f6a/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 4898594..d40fd88 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -122,7 +122,7 @@ void Tensor::AsType(const DataType type) {
 }
 
 void Tensor::ToDevice(std::shared_ptr<Device> dst) {
-  // TODO(wangwei) the comparison is very strict. May compare against device ID?
+  // TODO(wangwei) the comparison is restricted. May compare against device ID?
   if (device_ != dst) {
     Tensor tmp(shape_, dst, data_type_);
     if (block_ != nullptr && Size() && block_->initialized())

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/458b0f6a/src/model/layer/cudnn_convolution.cc
----------------------------------------------------------------------
diff --git a/src/model/layer/cudnn_convolution.cc b/src/model/layer/cudnn_convolution.cc
index 54bba06..196d137 100644
--- a/src/model/layer/cudnn_convolution.cc
+++ b/src/model/layer/cudnn_convolution.cc
@@ -151,6 +151,11 @@ void CudnnConvolution::InitCudnn(const Tensor &input) {
   workspace_count_ = std::max(std::max(fp_byte, bp_data_byte), bp_filter_byte) /
                          sizeof(float) +
                      1;
+  if (workspace_count_ > workspace_byte_limit_)
+    LOG(WARNING) << "The required memory for workspace ("
+      << workspace_count_ * sizeof(float)
+      << ") is larger than the expected Bytes ("
+      << workspace_byte_limit_ << ")";
   workspace_ = Tensor(Shape{workspace_count_}, dev, dtype);
   has_init_cudnn_ = true;
 }