Posted to commits@mxnet.apache.org by jx...@apache.org on 2018/01/15 20:12:26 UTC
[incubator-mxnet] branch master updated: Batching improvements for
GEMM/TRSM operators and full MKL usage docs. (#8846)
This is an automated email from the ASF dual-hosted git repository.
jxie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 3ac5376 Batching improvements for GEMM/TRSM operators and full MKL usage docs. (#8846)
3ac5376 is described below
commit 3ac5376cbe14faa120d382be62d32c9c49a0baa0
Author: Eric R Meissner <me...@gmail.com>
AuthorDate: Mon Jan 15 20:12:21 2018 +0000
Batching improvements for GEMM/TRSM operators and full MKL usage docs. (#8846)
* Batching improvements for GEMM/TRSM operators and full MKL usage docs.
* Changed the GEMM operator to use the cuBLAS gemmStridedBatched implementation when CUDA is version 8 or higher, and to fall back to manual batching otherwise (a standalone sketch of the strided-batched call follows this list).
* Changed the TRSM operator to stop using the cuBLAS batching functionality, since it is slower for large matrices, and to do the batching manually instead.
* Added instructions for using a full MKL installation instead of just MKL2017.
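For reference, here is a minimal standalone sketch of the strided-batched call
the second bullet refers to (illustrative sizes and values, error checking
omitted; the operand swap mirrors the row-major handling in linalg_impl.h):

    // C[i] = A[i] * B[i] for a whole batch in one cuBLAS call (CUDA >= 8.0).
    // Row-major C = A*B is computed by asking column-major cuBLAS for
    // C^T = B^T * A^T, i.e. by swapping the first two operands.
    #include <cublas_v2.h>
    #include <cuda_runtime.h>
    #include <cstdio>
    #include <vector>

    int main() {
      const int batch = 4, m = 2, k = 3, n = 2;  // A: m x k, B: k x n, C: m x n
      std::vector<float> hA(batch * m * k, 1.0f);
      std::vector<float> hB(batch * k * n, 2.0f);
      std::vector<float> hC(batch * m * n, 0.0f);
      float *dA, *dB, *dC;
      cudaMalloc(reinterpret_cast<void**>(&dA), hA.size() * sizeof(float));
      cudaMalloc(reinterpret_cast<void**>(&dB), hB.size() * sizeof(float));
      cudaMalloc(reinterpret_cast<void**>(&dC), hC.size() * sizeof(float));
      cudaMemcpy(dA, hA.data(), hA.size() * sizeof(float), cudaMemcpyHostToDevice);
      cudaMemcpy(dB, hB.data(), hB.size() * sizeof(float), cudaMemcpyHostToDevice);
      cublasHandle_t handle;
      cublasCreate(&handle);
      const float alpha = 1.0f, beta = 0.0f;
      // The strides are the element counts between consecutive matrices,
      // so no per-matrix pointer arrays need to be built on the device.
      cublasSgemmStridedBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                n, m, k,
                                &alpha, dB, n, static_cast<long long>(k) * n,
                                dA, k, static_cast<long long>(m) * k,
                                &beta, dC, n, static_cast<long long>(m) * n,
                                batch);
      cudaMemcpy(hC.data(), dC, hC.size() * sizeof(float), cudaMemcpyDeviceToHost);
      printf("C[0](0,0) = %.1f (expected %.1f)\n", hC[0], 2.0f * k);
      cublasDestroy(handle);
      cudaFree(dA); cudaFree(dB); cudaFree(dC);
      return 0;
    }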
---
MKL_README.md | 19 ++++++
make/config.mk | 8 ---
src/operator/linalg_impl.h | 145 ++++++++++++++++-----------------------------
3 files changed, 70 insertions(+), 102 deletions(-)
diff --git a/MKL_README.md b/MKL_README.md
index 80a31c9..0f97416 100644
--- a/MKL_README.md
+++ b/MKL_README.md
@@ -1,3 +1,22 @@
+# Full MKL Installation
+
+## Build/Install MXNet with a full MKL installation:
+Installing the full version of MKL enables MKL support for all operators under the linalg namespace.
+
+ 1. Download and install the latest full MKL version following the instructions on the [Intel website](https://software.intel.com/en-us/articles/intel-mkl-111-install-guide).
+
+ 2. Set USE_BLAS=mkl in make/config.mk
+
+ 2.1 Set ADD_LDFLAGS=-L<path/to/mkl/lib/folder> (ex. ADD_LDFLAGS=-L/opt/intel/compilers_and_libraries_2018.0.128/linux/mkl/lib)
+
+ 2.2 Set ADD_CFLAGS=-I<path/to/mkl/include/folder> (ex. ADD_CFLAGS=-I/opt/intel/compilers_and_libraries_2018.0.128/linux/mkl/include)
+
+ 3. Run 'make -j $(nproc)'
+
+ 4. Navigate into the python directory
+
+ 5. Run 'sudo python setup.py install'
+
# MKL2017 PLUGIN
MKL2017 is an Intel-released library that accelerates Deep Neural Network (DNN) applications on Intel architectures.
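For the full-MKL build described above, the resulting make/config.mk settings
would look roughly like this (a sketch assuming the example 2018.0.128 install
path from the instructions; substitute your own MKL location):

    USE_BLAS = mkl
    USE_LAPACK = 1
    ADD_LDFLAGS = -L/opt/intel/compilers_and_libraries_2018.0.128/linux/mkl/lib
    ADD_CFLAGS = -I/opt/intel/compilers_and_libraries_2018.0.128/linux/mkl/include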
diff --git a/make/config.mk b/make/config.mk
index 9f7564b..a322fee 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -110,21 +110,13 @@ USE_LAPACK = 1
# path to lapack library in case of a non-standard installation
USE_LAPACK_PATH =
-# by default, disable lapack when using MKL
-# switch on when there is a full installation of MKL available (not just MKL2017/MKL_ML)
-ifeq ($(USE_BLAS), mkl)
-USE_LAPACK = 0
-endif
-
# add path to intel library, you may need it for MKL, if you did not add the path
# to environment variable
USE_INTEL_PATH = NONE
# If using MKL only for BLAS, choose static linking automatically to allow the python wrapper
-ifeq ($(USE_MKL2017), 0)
ifeq ($(USE_BLAS), mkl)
USE_STATIC_MKL = 1
-endif
else
USE_STATIC_MKL = NONE
endif
diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h
index b3e6573..b2a672f 100644
--- a/src/operator/linalg_impl.h
+++ b/src/operator/linalg_impl.h
@@ -69,14 +69,14 @@ void linalg_gemm<cpu, DType>(const Tensor<cpu, 2, DType>& A, const Tensor<cpu, 2
A.dptr_, A.stride_, B.dptr_, B.stride_, beta, C.dptr_, C.stride_); \
}
-#define LINALG_CPU_BATCH_GEMM(DType) \
+#define LINALG_XPU_BATCH_GEMM(xpu, DType) \
template<> inline \
-void linalg_batch_gemm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<cpu, 3, DType>& B, \
- const Tensor<cpu, 3, DType>& C, DType alpha, DType beta, \
- bool tA, bool tB, Stream<cpu> *s) { \
+void linalg_batch_gemm<xpu, DType>(const Tensor<xpu, 3, DType>& A, const Tensor<xpu, 3, DType>& B, \
+ const Tensor<xpu, 3, DType>& C, DType alpha, DType beta, \
+ bool tA, bool tB, Stream<xpu> *s) { \
linalg_check_batch_size(A.size(0), B.size(0), C.size(0)); \
for (index_t i = 0; i < A.size(0); ++i) { \
- linalg_gemm(A[i], B[i], C[i], alpha, beta, tA, tB); \
+ linalg_gemm(A[i], B[i], C[i], alpha, beta, tA, tB, s); \
} \
}
@@ -90,11 +90,11 @@ void linalg_gemm<cpu, DType>(const Tensor<cpu, 2, DType>& A, const Tensor<cpu, 2
LOG(FATAL) << "linalg_gemm (without req arg) not implemented by mxnet for cpu, needs cblas!"; \
}
-#define LINALG_CPU_BATCH_GEMM(DType) \
+#define LINALG_XPU_BATCH_GEMM(xpu, DType) \
template<> inline \
-void linalg_batch_gemm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<cpu, 3, DType>& B, \
- const Tensor<cpu, 3, DType>& C, DType alpha, DType beta, \
- bool tA, bool tB, Stream<cpu> *s) { \
+void linalg_batch_gemm<xpu, DType>(const Tensor<xpu, 3, DType>& A, const Tensor<xpu, 3, DType>& B, \
+ const Tensor<xpu, 3, DType>& C, DType alpha, DType beta, \
+ bool tA, bool tB, Stream<xpu> *s) { \
LOG(FATAL) << "linalg_batch_gemm not implemented by mxnet for cpu, needs cblas!"; \
}
@@ -103,8 +103,8 @@ void linalg_batch_gemm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<
LINALG_CPU_GEMM(sgemm, float)
LINALG_CPU_GEMM(dgemm, double)
-LINALG_CPU_BATCH_GEMM(float)
-LINALG_CPU_BATCH_GEMM(double)
+LINALG_XPU_BATCH_GEMM(cpu, float)
+LINALG_XPU_BATCH_GEMM(cpu, double)
// Specialization of linalg_gemm<cpu, DType> for DType=mshadow::half::half_t.
template<> inline
@@ -119,13 +119,6 @@ void linalg_gemm<cpu, mshadow::half::half_t>(const Tensor<cpu, 2, mshadow::half:
#ifdef __CUDACC__
-template<typename DType>
-__global__ void linalgCollectBatchOffsetsGPU(DType *a[], DType* b, int stride, int N) {
- for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {
- a[i] = b + i * stride;
- }
-}
-
// cublas col-major processing accounted for by switching first two operands
#define LINALG_GPU_GEMM(fname, DType) \
@@ -195,43 +188,36 @@ void linalg_gemm<gpu, mshadow::half::half_t>(const Tensor<gpu, 2, mshadow::half:
#endif // CUDA_VERSION >= 7050
}
-
+// As of CUDA 8, cuBLAS provides a strided version of batched GEMM.
+#if CUDA_VERSION < 8000
+ LINALG_XPU_BATCH_GEMM(gpu, float)
+ LINALG_XPU_BATCH_GEMM(gpu, double)
+#else
#define LINALG_GPU_BATCH_GEMM(fname, DType) \
-template<> inline \
-void linalg_batch_gemm<gpu, DType>(const Tensor<gpu, 3, DType>& A, const Tensor<gpu, 3, DType>& B, \
- const Tensor<gpu, 3, DType>& C, DType alpha, DType beta, \
- bool tA, bool tB, Stream<gpu> *s) { \
- using namespace mxnet; \
- using mshadow::gpu; \
- CHECK_NOTNULL(s); \
- linalg_check_batch_size(A.size(0), B.size(0), C.size(0)); \
- check_gemm(A[0], B[0], C[0], alpha, beta, tA, tB); \
- Storage::Handle offsetsA, offsetsB, offsetsC; \
- offsetsA = Storage::Get()->Alloc(sizeof(DType*)*A.size(0), Context::GPU()); \
- offsetsB = Storage::Get()->Alloc(sizeof(DType*)*B.size(0), Context::GPU()); \
- offsetsC = Storage::Get()->Alloc(sizeof(DType*)*C.size(0), Context::GPU()); \
- using namespace mshadow::cuda; \
- int ngrid = std::min(kMaxGridNum, \
- static_cast<int>((A.size(0) + kBaseThreadNum - 1) / kBaseThreadNum)); \
- linalgCollectBatchOffsetsGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
- (static_cast<DType **>(offsetsA.dptr), A.dptr_, A.size(1)*A.stride_, A.size(0)); \
- linalgCollectBatchOffsetsGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
- (static_cast<DType **>(offsetsB.dptr), B.dptr_, B.size(1)*B.stride_, B.size(0)); \
- linalgCollectBatchOffsetsGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
- (static_cast<DType **>(offsetsC.dptr), C.dptr_, C.size(1)*C.stride_, C.size(0)); \
- CUBLAS_CALL(cublas##fname(Stream<gpu>::GetBlasHandle(s), \
- (tB ? CUBLAS_OP_T : CUBLAS_OP_N), \
- (tA ? CUBLAS_OP_T : CUBLAS_OP_N), \
- C.size(2), C.size(1), (tB ? B.size(2) : B.size(1)), \
- &alpha, static_cast<const DType **>(offsetsB.dptr), B.stride_, \
- static_cast<const DType **>(offsetsA.dptr), A.stride_, \
- &beta, static_cast<DType **>(offsetsC.dptr), C.stride_, A.size(0))) \
- Storage::Get()->Free(offsetsA); \
- Storage::Get()->Free(offsetsB); \
- Storage::Get()->Free(offsetsC); \
-}
-LINALG_GPU_BATCH_GEMM(SgemmBatched, float)
-LINALG_GPU_BATCH_GEMM(DgemmBatched, double)
+ template<> inline \
+ void linalg_batch_gemm<gpu, DType>(const Tensor<gpu, 3, DType>& A, \
+ const Tensor<gpu, 3, DType>& B, \
+ const Tensor<gpu, 3, DType>& C, DType alpha, DType beta, \
+ bool tA, bool tB, Stream<gpu> *s) { \
+ using namespace mxnet; \
+ using mshadow::gpu; \
+ CHECK_NOTNULL(s); \
+ linalg_check_batch_size(A.size(0), B.size(0), C.size(0)); \
+ check_gemm(A[0], B[0], C[0], alpha, beta, tA, tB); \
+ using namespace mshadow::cuda; \
+ CUBLAS_CALL(cublas##fname(Stream<gpu>::GetBlasHandle(s), \
+ (tB ? CUBLAS_OP_T : CUBLAS_OP_N), \
+ (tA ? CUBLAS_OP_T : CUBLAS_OP_N), \
+ C.size(2), C.size(1), (tB ? B.size(2) : B.size(1)), \
+ &alpha, B.dptr_, B.stride_, B.size(1) * B.stride_, \
+ A.dptr_, A.stride_, A.size(1) * A.stride_, \
+ &beta, C.dptr_, C.stride_, C.size(1) * C.stride_, A.size(0))) \
+ }
+
+ LINALG_GPU_BATCH_GEMM(SgemmStridedBatched, float)
+ LINALG_GPU_BATCH_GEMM(DgemmStridedBatched, double)
+
+#endif  // CUDA_VERSION < 8000
#endif // __CUDACC__
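For clarity, on CUDA versions below 8 the LINALG_XPU_BATCH_GEMM(gpu, float)
instantiation above expands to roughly the following (a sketch with the macro
backslashes removed):

    template<> inline
    void linalg_batch_gemm<gpu, float>(const Tensor<gpu, 3, float>& A,
                                       const Tensor<gpu, 3, float>& B,
                                       const Tensor<gpu, 3, float>& C,
                                       float alpha, float beta,
                                       bool tA, bool tB, Stream<gpu> *s) {
      linalg_check_batch_size(A.size(0), B.size(0), C.size(0));
      // Manual batching: one single-matrix gemm per batch element, all
      // queued on the same stream s.
      for (index_t i = 0; i < A.size(0); ++i) {
        linalg_gemm(A[i], B[i], C[i], alpha, beta, tA, tB, s);
      }
    }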
@@ -266,13 +252,13 @@ void linalg_trsm<cpu, DType>(const Tensor<cpu, 2, DType>& A, const Tensor<cpu, 2
A.stride_, B.dptr_, B.stride_); \
}
-#define LINALG_CPU_BATCH_TRSM(DType) \
+#define LINALG_XPU_BATCH_TRSM(xpu, DType) \
template<> inline \
-void linalg_batch_trsm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<cpu, 3, DType>& B, \
- DType alpha, bool rightside, bool lower, bool transpose, Stream<cpu> *s) { \
+void linalg_batch_trsm<xpu, DType>(const Tensor<xpu, 3, DType>& A, const Tensor<xpu, 3, DType>& B, \
+ DType alpha, bool rightside, bool lower, bool transpose, Stream<xpu> *s) { \
linalg_check_batch_size(A.size(0), B.size(0), B.size(0)); \
for (index_t i = 0; i < A.size(0); ++i) { \
- linalg_trsm(A[i], B[i], alpha, rightside, lower, transpose); \
+ linalg_trsm(A[i], B[i], alpha, rightside, lower, transpose, s); \
} \
}
@@ -285,10 +271,10 @@ void linalg_trsm<cpu, DType>(const Tensor<cpu, 2, DType>& A, const Tensor<cpu, 2
LOG(FATAL) << "linalg_trsm not implemented, needs cblas!"; \
}
-#define LINALG_CPU_BATCH_TRSM(DType) \
+#define LINALG_XPU_BATCH_TRSM(xpu, DType) \
template<> inline \
-void linalg_batch_trsm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<cpu, 3, DType>& B, \
- DType alpha, bool rightside, bool lower, bool transpose, Stream<cpu> *s) { \
+void linalg_batch_trsm<xpu, DType>(const Tensor<xpu, 3, DType>& A, const Tensor<xpu, 3, DType>& B, \
+ DType alpha, bool rightside, bool lower, bool transpose, Stream<xpu> *s) { \
LOG(FATAL) << "linalg_batch_trsm not implemented, needs cblas!"; \
}
@@ -297,8 +283,8 @@ void linalg_batch_trsm<cpu, DType>(const Tensor<cpu, 3, DType>& A, const Tensor<
LINALG_CPU_TRSM(strsm, float)
LINALG_CPU_TRSM(dtrsm, double)
-LINALG_CPU_BATCH_TRSM(float)
-LINALG_CPU_BATCH_TRSM(double)
+LINALG_XPU_BATCH_TRSM(cpu, float)
+LINALG_XPU_BATCH_TRSM(cpu, double)
#ifdef __CUDACC__
@@ -322,37 +308,8 @@ void linalg_trsm<gpu, DType>(const Tensor<gpu, 2, DType>& A, const Tensor<gpu, 2
LINALG_GPU_TRSM(Strsm, float)
LINALG_GPU_TRSM(Dtrsm, double)
-#define LINALG_GPU_BATCH_TRSM(fname, DType) \
-template<> inline \
-void linalg_batch_trsm<gpu, DType>(const Tensor<gpu, 3, DType>& A, const Tensor<gpu, 3, DType>& B, \
- DType alpha, bool rightside, bool lower, bool transpose, Stream<gpu> *s) { \
- using namespace mxnet; \
- using mshadow::gpu; \
- CHECK_NOTNULL(s); \
- linalg_check_batch_size(A.size(0), B.size(0), B.size(0)); \
- check_trsm(A[0], B[0], alpha, rightside, lower, transpose); \
- Storage::Handle offsetsA, offsetsB; \
- offsetsA = Storage::Get()->Alloc(sizeof(DType*)*A.size(0), Context::GPU()); \
- offsetsB = Storage::Get()->Alloc(sizeof(DType*)*B.size(0), Context::GPU()); \
- using namespace mshadow::cuda; \
- int ngrid = std::min(kMaxGridNum, \
- static_cast<int>((A.size(0) + kBaseThreadNum - 1) / kBaseThreadNum)); \
- linalgCollectBatchOffsetsGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
- (static_cast<DType **>(offsetsA.dptr), A.dptr_, A.size(1)*A.stride_, A.size(0)); \
- linalgCollectBatchOffsetsGPU<<<ngrid, kBaseThreadNum, 0, mshadow::Stream<gpu>::GetStream(s)>>> \
- (static_cast<DType **>(offsetsB.dptr), B.dptr_, B.size(1)*B.stride_, A.size(0)); \
- CUBLAS_CALL(cublas##fname(Stream<gpu>::GetBlasHandle(s), \
- (rightside ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT), \
- (lower ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER), \
- (transpose ? CUBLAS_OP_T : CUBLAS_OP_N), \
- CUBLAS_DIAG_NON_UNIT, B.size(2), B.size(1), &alpha, \
- static_cast<const DType **>(offsetsA.dptr), A.stride_, \
- static_cast<DType **>(offsetsB.dptr), B.stride_, A.size(0))); \
- Storage::Get()->Free(offsetsA); \
- Storage::Get()->Free(offsetsB); \
-}
-LINALG_GPU_BATCH_TRSM(StrsmBatched, float)
-LINALG_GPU_BATCH_TRSM(DtrsmBatched, double)
+LINALG_XPU_BATCH_TRSM(gpu, float)
+LINALG_XPU_BATCH_TRSM(gpu, double)
#endif // __CUDACC__
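The same manual-batching pattern, as a standalone cuBLAS sketch: one
cublas<t>trsm call per batch element replaces the removed cublas<t>trsmBatched
path (hypothetical 2x2 lower-triangular systems; error checking omitted):

    // Solves A[i] * x[i] = b[i] in place for each batch element.
    #include <cublas_v2.h>
    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
      const int batch = 2, n = 2;
      // A (column-major, lower-triangular): [[2, 0], [1, 1]]; b = (2, 3),
      // so A x = b has the solution x = (1, 2). Same system per element.
      const float hA[4] = {2.f, 1.f, 0.f, 1.f};
      const float hB[2] = {2.f, 3.f};
      float *dA, *dB;
      cudaMalloc(reinterpret_cast<void**>(&dA), batch * 4 * sizeof(float));
      cudaMalloc(reinterpret_cast<void**>(&dB), batch * 2 * sizeof(float));
      for (int i = 0; i < batch; ++i) {
        cudaMemcpy(dA + i * 4, hA, sizeof(hA), cudaMemcpyHostToDevice);
        cudaMemcpy(dB + i * 2, hB, sizeof(hB), cudaMemcpyHostToDevice);
      }
      cublasHandle_t handle;
      cublasCreate(&handle);
      const float alpha = 1.0f;
      for (int i = 0; i < batch; ++i) {
        // One triangular solve per batch element, all queued on the handle's
        // stream; per the commit message, this beats the pointer-array
        // batched variant for large matrices.
        cublasStrsm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                    CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                    n, 1, &alpha, dA + i * 4, n, dB + i * 2, n);
      }
      float hX[2];
      cudaMemcpy(hX, dB, sizeof(hX), cudaMemcpyDeviceToHost);
      printf("x = (%.1f, %.1f), expected (1.0, 2.0)\n", hX[0], hX[1]);
      cublasDestroy(handle);
      cudaFree(dA); cudaFree(dB);
      return 0;
    }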
--
To stop receiving notification emails like this one, please contact
commits@mxnet.apache.org.