You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2018/05/31 19:27:46 UTC
[4/4] systemml git commit: [SYSTEMML-445] Refactored GPU Memory Manager

[SYSTEMML-445] Refactored GPU Memory Manager

- Several bugfixes found during recent experiments with ResNet200.
- Added align_memory eviction policy.
- Added GPU usage documentation.
- Refactored the GPU Memory Manager into distinct components.

Closes #774.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4d321667
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4d321667
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4d321667

Branch: refs/heads/master
Commit: 4d3216678f252f731ac7d7db62111dde6ca063f5
Parents: af9cc8a
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu May 31 12:25:06 2018 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu May 31 12:26:44 2018 -0700

----------------------------------------------------------------------
 conf/SystemML-config.xml.template               |   14 +-
 docs/gpu.md                                     |   94 +
 src/main/cpp/kernels/SystemML.cu                |   99 +-
 src/main/cpp/kernels/SystemML.ptx               | 8587 +++++++++---------
 .../java/org/apache/sysml/api/DMLScript.java    |    4 +-
 .../apache/sysml/api/ScriptExecutorUtils.java   |    3 +
 .../java/org/apache/sysml/conf/DMLConfig.java   |   10 +-
 .../controlprogram/caching/CacheableData.java   |    4 +-
 .../instructions/gpu/context/CSRPointer.java    |   21 +-
 .../instructions/gpu/context/GPUContext.java    |   58 +-
 .../context/GPULazyCudaFreeMemoryManager.java   |  171 +
 .../gpu/context/GPUMatrixMemoryManager.java     |  184 +
 .../gpu/context/GPUMemoryManager.java           |  654 +-
 .../instructions/gpu/context/GPUObject.java     |  322 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |  130 +-
 .../runtime/matrix/data/LibMatrixCuDNN.java     |   54 +-
 .../LibMatrixCuDNNConvolutionAlgorithm.java     |    4 +-
 .../data/LibMatrixCuDNNInputRowFetcher.java     |    2 +-
 .../runtime/matrix/data/LibMatrixCuMatMult.java |    4 +-
 .../runtime/matrix/data/LibMatrixNative.java    |    2 +-
 .../SinglePrecisionCudaSupportFunctions.java    |   45 +-
 .../org/apache/sysml/utils/GPUStatistics.java   |   37 +-
 22 files changed, 5646 insertions(+), 4857 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index a9c73c8..05d6a1a 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -85,17 +85,23 @@
     <sysml.gpu.availableGPUs>-1</sysml.gpu.availableGPUs>
     
     <!-- whether to synchronize GPUs after every GPU instruction -->
-    <sysml.gpu.sync.postProcess>true</sysml.gpu.sync.postProcess>
+    <sysml.gpu.sync.postProcess>false</sysml.gpu.sync.postProcess>
     
     <!-- whether to perform eager CUDA free on rmvar instruction -->
     <sysml.gpu.eager.cudaFree>false</sysml.gpu.eager.cudaFree>
+    
+    <!-- Developer flag used to debug GPU memory leaks. This has huge performance overhead and should be only turned on for debugging purposes.  -->
+    <sysml.gpu.print.memoryInfo>false</sysml.gpu.print.memoryInfo>
    
     <!-- the floating point precision. supported values are double, single -->
     <sysml.floating.point.precision>double</sysml.floating.point.precision>
     
-    <!-- the eviction policy for the GPU bufferpool. supported values are lru, mru, lfu, min_evict -->
-    <sysml.gpu.eviction.policy>lru</sysml.gpu.eviction.policy>
+    <!-- the eviction policy for the GPU bufferpool. supported values are lru, mru, lfu, min_evict, align_memory -->
+    <sysml.gpu.eviction.policy>align_memory</sysml.gpu.eviction.policy>
     
    <!-- maximum wrap length for instruction and miscellaneous timer column of statistics -->
    <sysml.stats.maxWrapLength>30</sysml.stats.maxWrapLength>
-</root>
+   
+   <!-- Advanced optimization: fraction of driver memory to use for caching (default: 0.15) -->
+   <sysml.caching.bufferSize>0.15</sysml.caching.bufferSize>
+</root>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/docs/gpu.md
----------------------------------------------------------------------
diff --git a/docs/gpu.md b/docs/gpu.md
new file mode 100644
index 0000000..e9d7bca
--- /dev/null
+++ b/docs/gpu.md
@@ -0,0 +1,94 @@
+---
+layout: global
+title: Using SystemML with GPU
+description: Using SystemML with GPU
+---
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+* This will become a table of contents (this text will be scraped).
+{:toc}
+
+<br/>
+
+# User Guide
+
+To use SystemML on GPUs, please ensure that [CUDA 9](https://developer.nvidia.com/cuda-90-download-archive) and
+[CuDNN 7](https://developer.nvidia.com/cudnn) is installed on your system.
+
+## Python users
+
+Please install SystemML using pip:
+- For released version: `pip install systemml`
+- For bleeding edge version: `pip install https://sparktc.ibmcloud.com/repo/latest/systemml-1.2.0-SNAPSHOT-python.tar.gz`
+
+Then you can use the `setGPU(True)` method of [MLContext](http://apache.github.io/systemml/spark-mlcontext-programming-guide.html) and 
+[MLLearn](http://apache.github.io/systemml/beginners-guide-python.html#invoke-systemmls-algorithms) APIs to enable the GPU usage.
+
+```python
+from systemml.mllearn import Caffe2DML
+lenet = Caffe2DML(spark, solver='lenet_solver.proto', input_shape=(1, 28, 28))
+lenet.setGPU(True)
+```
+To skip memory-checking and force all GPU-enabled operations on the GPU, please use the `setForceGPU(True)` method after `setGPU(True)` method.
+
+```python
+from systemml.mllearn import Caffe2DML
+lenet = Caffe2DML(spark, solver='lenet_solver.proto', input_shape=(1, 28, 28))
+lenet.setGPU(True).setForceGPU(True)
+```
+
+## Command-line users
+
+To enable the GPU backend via command-line, please provide `systemml-1.*-extra.jar` in the classpath and `-gpu` flag.
+
+```
+spark-submit --jars systemml-1.*-extra.jar SystemML.jar -f myDML.dml -gpu
+``` 
+
+To skip memory-checking and force all GPU-enabled operations on the GPU, please provide `force` option to the `-gpu` flag.
+
+```
+spark-submit --jars systemml-1.*-extra.jar SystemML.jar -f myDML.dml -gpu force
+``` 
+
+## Scala users
+
+To enable the GPU backend via command-line, please provide `systemml-1.*-extra.jar` in the classpath and use 
+the `setGPU(True)` method of [MLContext](http://apache.github.io/systemml/spark-mlcontext-programming-guide.html) API to enable the GPU usage.
+
+```
+spark-shell --jars systemml-1.*-extra.jar,SystemML.jar
+``` 
+
+# Troubleshooting guide
+
+- If you have older gcc (< 5.0) and if you get `libstdc++.so.6: version CXXABI_1.3.8 not found` error, please upgrade to gcc v5+. 
+On Centos 5, you may have to compile gcc from the source:
+
+```
+sudo yum install libmpc-devel mpfr-devel gmp-devel zlib-devel*
+curl ftp://ftp.gnu.org/pub/gnu/gcc/gcc-5.3.0/gcc-5.3.0.tar.bz2 -O
+tar xvfj gcc-5.3.0.tar.bz2
+cd gcc-5.3.0
+./configure --with-system-zlib --disable-multilib --enable-languages=c,c++
+num_cores=`grep -c ^processor /proc/cpuinfo`
+make -j $num_cores
+sudo make install
+```
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 29ae820..55ebeaf 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -20,7 +20,7 @@
 /**********************************
 When updating a kernel or adding a new one,
 please compile the ptx file and commit it:
-nvcc -ptx -arch=sm_30 --std c++11 SystemML.cu
+nvcc -w -ptx -arch=sm_30 --std c++11 SystemML.cu
 ***********************************/
 
 #include <cfloat>
@@ -1961,3 +1961,100 @@ extern "C" __global__ void matrix_sigmoid_f(float *A, float *C,
                                          unsigned int size) {
   matrix_sigmoid(A, C, size);
 }
+
+// We can later fold it in our reduce method
+template <typename T>
+__device__ void compute_nnz(
+    T *g_idata,  ///< input data stored in device memory (of size n)
+    T *g_odata,  ///< output/temporary array stored in device memory (of size n)
+    unsigned int n)  ///< size of the input and temporary/output arrays
+{
+  // extern __shared__ T sdata[];
+  extern __shared__ __align__(sizeof(T)) unsigned char my_sdata[];
+  T *sdata = reinterpret_cast<T *>(my_sdata);
+
+  // perform first level of reduction,
+  // reading from global memory, writing to shared memory
+  unsigned int tid = threadIdx.x;
+  unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
+  unsigned int gridSize = blockDim.x * 2 * gridDim.x;
+
+  T v = 0;
+
+  // we reduce multiple elements per thread.  The number is determined by the
+  // number of active thread blocks (via gridDim).  More blocks will result
+  // in a larger gridSize and therefore fewer elements per thread
+  while (i < n) {
+    v += g_idata[i] != 0 ? 1 : 0;
+    // ensure we don't read out of bounds
+    if (i + blockDim.x < n) v += g_idata[i + blockDim.x] != 0 ? 1 : 0;
+    i += gridSize;
+  }
+
+  // each thread puts its local sum into shared memory
+  sdata[tid] = v;
+  __syncthreads();
+
+  // do reduction in shared mem
+  if (blockDim.x >= 1024) {
+    if (tid < 512) {
+      sdata[tid] = v = v + sdata[tid + 512];
+    }
+    __syncthreads();
+  }
+  if (blockDim.x >= 512) {
+    if (tid < 256) {
+      sdata[tid] = v = v + sdata[tid + 256];
+    }
+    __syncthreads();
+  }
+  if (blockDim.x >= 256) {
+    if (tid < 128) {
+      sdata[tid] = v = v + sdata[tid + 128];
+    }
+    __syncthreads();
+  }
+  if (blockDim.x >= 128) {
+    if (tid < 64) {
+      sdata[tid] = v = v + sdata[tid + 64];
+    }
+    __syncthreads();
+  }
+
+  if (tid < 32) {
+    // now that we are using warp-synchronous programming (below)
+    // we need to declare our shared memory volatile so that the compiler
+    // doesn't reorder stores to it and induce incorrect behavior.
+    volatile T *smem = sdata;
+    if (blockDim.x >= 64) {
+      smem[tid] = v = v + smem[tid + 32];
+    }
+    if (blockDim.x >= 32) {
+      smem[tid] = v = v + smem[tid + 16];
+    }
+    if (blockDim.x >= 16) {
+      smem[tid] = v = v + smem[tid + 8];
+    }
+    if (blockDim.x >= 8) {
+      smem[tid] = v = v + smem[tid + 4];
+    }
+    if (blockDim.x >= 4) {
+      smem[tid] = v = v + smem[tid + 2];
+    }
+    if (blockDim.x >= 2) {
+      smem[tid] = v = v + smem[tid + 1];
+    }
+  }
+
+  // write result for this block to global mem
+  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
+}
+
+
+extern "C" __global__ void compute_nnz_d(double *g_idata, double *g_odata, unsigned int n) {
+	compute_nnz(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void compute_nnz_f(float *g_idata, float *g_odata, unsigned int n) {
+	compute_nnz(g_idata, g_odata, n);
+}