You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by ni...@apache.org on 2018/05/31 19:27:46 UTC
[4/4] systemml git commit: [SYSTEMML-445] Refactored GPU Memory
Manager
[SYSTEMML-445] Refactored GPU Memory Manager
- Several bugfixes found during recent experiments with ResNet200.
- Added align_memory eviction policy.
- Added GPU usage documentation.
- Refactored the GPU Memory Manager into distinct components.
Closes #774.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/4d321667
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/4d321667
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/4d321667
Branch: refs/heads/master
Commit: 4d3216678f252f731ac7d7db62111dde6ca063f5
Parents: af9cc8a
Author: Niketan Pansare <np...@us.ibm.com>
Authored: Thu May 31 12:25:06 2018 -0700
Committer: Niketan Pansare <np...@us.ibm.com>
Committed: Thu May 31 12:26:44 2018 -0700
----------------------------------------------------------------------
conf/SystemML-config.xml.template | 14 +-
docs/gpu.md | 94 +
src/main/cpp/kernels/SystemML.cu | 99 +-
src/main/cpp/kernels/SystemML.ptx | 8587 +++++++++---------
.../java/org/apache/sysml/api/DMLScript.java | 4 +-
.../apache/sysml/api/ScriptExecutorUtils.java | 3 +
.../java/org/apache/sysml/conf/DMLConfig.java | 10 +-
.../controlprogram/caching/CacheableData.java | 4 +-
.../instructions/gpu/context/CSRPointer.java | 21 +-
.../instructions/gpu/context/GPUContext.java | 58 +-
.../context/GPULazyCudaFreeMemoryManager.java | 171 +
.../gpu/context/GPUMatrixMemoryManager.java | 184 +
.../gpu/context/GPUMemoryManager.java | 654 +-
.../instructions/gpu/context/GPUObject.java | 322 +-
.../runtime/matrix/data/LibMatrixCUDA.java | 130 +-
.../runtime/matrix/data/LibMatrixCuDNN.java | 54 +-
.../LibMatrixCuDNNConvolutionAlgorithm.java | 4 +-
.../data/LibMatrixCuDNNInputRowFetcher.java | 2 +-
.../runtime/matrix/data/LibMatrixCuMatMult.java | 4 +-
.../runtime/matrix/data/LibMatrixNative.java | 2 +-
.../SinglePrecisionCudaSupportFunctions.java | 45 +-
.../org/apache/sysml/utils/GPUStatistics.java | 37 +-
22 files changed, 5646 insertions(+), 4857 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template
index a9c73c8..05d6a1a 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -85,17 +85,23 @@
<sysml.gpu.availableGPUs>-1</sysml.gpu.availableGPUs>
<!-- whether to synchronize GPUs after every GPU instruction -->
- <sysml.gpu.sync.postProcess>true</sysml.gpu.sync.postProcess>
+ <sysml.gpu.sync.postProcess>false</sysml.gpu.sync.postProcess>
<!-- whether to perform eager CUDA free on rmvar instruction -->
<sysml.gpu.eager.cudaFree>false</sysml.gpu.eager.cudaFree>
+
+ <!-- Developer flag used to debug GPU memory leaks. This has huge performance overhead and should be only turned on for debugging purposes. -->
+ <sysml.gpu.print.memoryInfo>false</sysml.gpu.print.memoryInfo>
<!-- the floating point precision. supported values are double, single -->
<sysml.floating.point.precision>double</sysml.floating.point.precision>
- <!-- the eviction policy for the GPU bufferpool. supported values are lru, mru, lfu, min_evict -->
- <sysml.gpu.eviction.policy>lru</sysml.gpu.eviction.policy>
+ <!-- the eviction policy for the GPU bufferpool. supported values are lru, mru, lfu, min_evict, align_memory -->
+ <sysml.gpu.eviction.policy>align_memory</sysml.gpu.eviction.policy>
<!-- maximum wrap length for instruction and miscellaneous timer column of statistics -->
<sysml.stats.maxWrapLength>30</sysml.stats.maxWrapLength>
-</root>
+
+ <!-- Advanced optimization: fraction of driver memory to use for caching (default: 0.15) -->
+ <sysml.caching.bufferSize>0.15</sysml.caching.bufferSize>
+</root>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/docs/gpu.md
----------------------------------------------------------------------
diff --git a/docs/gpu.md b/docs/gpu.md
new file mode 100644
index 0000000..e9d7bca
--- /dev/null
+++ b/docs/gpu.md
@@ -0,0 +1,94 @@
+---
+layout: global
+title: Using SystemML with GPU
+description: Using SystemML with GPU
+---
+<!--
+{% comment %}
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements. See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+* This will become a table of contents (this text will be scraped).
+{:toc}
+
+<br/>
+
+# User Guide
+
+To use SystemML on GPUs, please ensure that [CUDA 9](https://developer.nvidia.com/cuda-90-download-archive) and
+[CuDNN 7](https://developer.nvidia.com/cudnn) is installed on your system.
+
+## Python users
+
+Please install SystemML using pip:
+- For released version: `pip install systemml`
+- For bleeding edge version: `pip install https://sparktc.ibmcloud.com/repo/latest/systemml-1.2.0-SNAPSHOT-python.tar.gz`
+
+Then you can use the `setGPU(True)` method of [MLContext](http://apache.github.io/systemml/spark-mlcontext-programming-guide.html) and
+[MLLearn](http://apache.github.io/systemml/beginners-guide-python.html#invoke-systemmls-algorithms) APIs to enable the GPU usage.
+
+```python
+from systemml.mllearn import Caffe2DML
+lenet = Caffe2DML(spark, solver='lenet_solver.proto', input_shape=(1, 28, 28))
+lenet.setGPU(True)
+```
+To skip memory-checking and force all GPU-enabled operations on the GPU, please use the `setForceGPU(True)` method after `setGPU(True)` method.
+
+```python
+from systemml.mllearn import Caffe2DML
+lenet = Caffe2DML(spark, solver='lenet_solver.proto', input_shape=(1, 28, 28))
+lenet.setGPU(True).setForceGPU(True)
+```
+
+## Command-line users
+
+To enable the GPU backend via command-line, please provide `systemml-1.*-extra.jar` in the classpath and `-gpu` flag.
+
+```
+spark-submit --jars systemml-1.*-extra.jar SystemML.jar -f myDML.dml -gpu
+```
+
+To skip memory-checking and force all GPU-enabled operations on the GPU, please provide `force` option to the `-gpu` flag.
+
+```
+spark-submit --jars systemml-1.*-extra.jar SystemML.jar -f myDML.dml -gpu force
+```
+
+## Scala users
+
+To enable the GPU backend via command-line, please provide `systemml-1.*-extra.jar` in the classpath and use
+the `setGPU(True)` method of [MLContext](http://apache.github.io/systemml/spark-mlcontext-programming-guide.html) API to enable the GPU usage.
+
+```
+spark-shell --jars systemml-1.*-extra.jar,SystemML.jar
+```
+
+# Troubleshooting guide
+
+- If you have older gcc (< 5.0) and if you get `libstdc++.so.6: version CXXABI_1.3.8 not found` error, please upgrade to gcc v5+.
+On Centos 5, you may have to compile gcc from the source:
+
+```
+sudo yum install libmpc-devel mpfr-devel gmp-devel zlib-devel*
+curl ftp://ftp.gnu.org/pub/gnu/gcc/gcc-5.3.0/gcc-5.3.0.tar.bz2 -O
+tar xvfj gcc-5.3.0.tar.bz2
+cd gcc-5.3.0
+./configure --with-system-zlib --disable-multilib --enable-languages=c,c++
+num_cores=`grep -c ^processor /proc/cpuinfo`
+make -j $num_cores
+sudo make install
+```
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/cpp/kernels/SystemML.cu
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu
index 29ae820..55ebeaf 100644
--- a/src/main/cpp/kernels/SystemML.cu
+++ b/src/main/cpp/kernels/SystemML.cu
@@ -20,7 +20,7 @@
/**********************************
When updating a kernel or adding a new one,
please compile the ptx file and commit it:
-nvcc -ptx -arch=sm_30 --std c++11 SystemML.cu
+nvcc -w -ptx -arch=sm_30 --std c++11 SystemML.cu
***********************************/
#include <cfloat>
@@ -1961,3 +1961,100 @@ extern "C" __global__ void matrix_sigmoid_f(float *A, float *C,
unsigned int size) {
matrix_sigmoid(A, C, size);
}
+
+// We can later fold it in our reduce method
+template <typename T>
+__device__ void compute_nnz(
+ T *g_idata, ///< input data stored in device memory (of size n)
+ T *g_odata, ///< output/temporary array stored in device memory (of size n)
+ unsigned int n) ///< size of the input and temporary/output arrays
+{
+ // extern __shared__ T sdata[];
+ extern __shared__ __align__(sizeof(T)) unsigned char my_sdata[];
+ T *sdata = reinterpret_cast<T *>(my_sdata);
+
+ // perform first level of reduction,
+ // reading from global memory, writing to shared memory
+ unsigned int tid = threadIdx.x;
+ unsigned int i = blockIdx.x * blockDim.x * 2 + threadIdx.x;
+ unsigned int gridSize = blockDim.x * 2 * gridDim.x;
+
+ T v = 0;
+
+ // we reduce multiple elements per thread. The number is determined by the
+ // number of active thread blocks (via gridDim). More blocks will result
+ // in a larger gridSize and therefore fewer elements per thread
+ while (i < n) {
+ v += g_idata[i] != 0 ? 1 : 0;
+ // ensure we don't read out of bounds
+ if (i + blockDim.x < n) v += g_idata[i + blockDim.x] != 0 ? 1 : 0;
+ i += gridSize;
+ }
+
+ // each thread puts its local sum into shared memory
+ sdata[tid] = v;
+ __syncthreads();
+
+ // do reduction in shared mem
+ if (blockDim.x >= 1024) {
+ if (tid < 512) {
+ sdata[tid] = v = v + sdata[tid + 512];
+ }
+ __syncthreads();
+ }
+ if (blockDim.x >= 512) {
+ if (tid < 256) {
+ sdata[tid] = v = v + sdata[tid + 256];
+ }
+ __syncthreads();
+ }
+ if (blockDim.x >= 256) {
+ if (tid < 128) {
+ sdata[tid] = v = v + sdata[tid + 128];
+ }
+ __syncthreads();
+ }
+ if (blockDim.x >= 128) {
+ if (tid < 64) {
+ sdata[tid] = v = v + sdata[tid + 64];
+ }
+ __syncthreads();
+ }
+
+ if (tid < 32) {
+ // now that we are using warp-synchronous programming (below)
+ // we need to declare our shared memory volatile so that the compiler
+ // doesn't reorder stores to it and induce incorrect behavior.
+ volatile T *smem = sdata;
+ if (blockDim.x >= 64) {
+ smem[tid] = v = v + smem[tid + 32];
+ }
+ if (blockDim.x >= 32) {
+ smem[tid] = v = v + smem[tid + 16];
+ }
+ if (blockDim.x >= 16) {
+ smem[tid] = v = v + smem[tid + 8];
+ }
+ if (blockDim.x >= 8) {
+ smem[tid] = v = v + smem[tid + 4];
+ }
+ if (blockDim.x >= 4) {
+ smem[tid] = v = v + smem[tid + 2];
+ }
+ if (blockDim.x >= 2) {
+ smem[tid] = v = v + smem[tid + 1];
+ }
+ }
+
+ // write result for this block to global mem
+ if (tid == 0) g_odata[blockIdx.x] = sdata[0];
+}
+
+
+extern "C" __global__ void compute_nnz_d(double *g_idata, double *g_odata, unsigned int n) {
+ compute_nnz(g_idata, g_odata, n);
+}
+
+extern "C" __global__ void compute_nnz_f(float *g_idata, float *g_odata, unsigned int n) {
+ compute_nnz(g_idata, g_odata, n);
+}