You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by ha...@apache.org on 2018/10/08 18:10:18 UTC
[incubator-mxnet] branch master updated: Add option for automatic
downcasting dtype for cudnn to allow using Tensorcore for fp32 (#12722)
This is an automated email from the ASF dual-hosted git repository.
haibin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 5314cf4 Add option for automatic downcasting dtype for cudnn to allow using Tensorcore for fp32 (#12722)
5314cf4 is described below
commit 5314cf4742767319ce356bd5154c6885380e0d5c
Author: Sebastian Bodenstein <se...@gmail.com>
AuthorDate: Mon Oct 8 20:10:00 2018 +0200
Add option for automatic downcasting dtype for cudnn to allow using Tensorcore for fp32 (#12722)
---
docs/faq/env_var.md | 12 +++++++++++-
src/common/cuda_utils.h | 16 ++++++++++++++++
src/operator/cudnn_rnn-inl.h | 5 +++++
src/operator/nn/cudnn/cudnn_convolution-inl.h | 5 +++++
4 files changed, 37 insertions(+), 1 deletion(-)
diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md
index d3ca753..fd4c16b 100644
--- a/docs/faq/env_var.md
+++ b/docs/faq/env_var.md
@@ -167,7 +167,17 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca
- Performance tests are run to pick the convolution algo when value is 1 or 2
- Value of 1 chooses the best algo in a limited workspace
- Value of 2 chooses the fastest algo whose memory requirements may be larger than the default workspace threshold
-
+
+* MXNET_CUDA_ALLOW_TENSOR_CORE
+ - 0(false) or 1(true) ```(default=1)```
+ - If set to '0', disallows Tensor Core use in CUDA ops.
+ - If set to '1', allows Tensor Core use in CUDA ops.
+ - This variable can only be set once in a session.
+
+* MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION
+ - 0(false) or 1(true) ```(default=0)```
+ - If set to '0', disallows implicit type conversions to Float16 to use Tensor Cores.
+ - If set to '1', allows CUDA ops like RNN and Convolution to use TensorCores even with Float32 input data by using implicit type casting to Float16. Only has an effect if `MXNET_CUDA_ALLOW_TENSOR_CORE` is `1`.
* MXNET_GLUON_REPO
- Values: String ```(default='https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/')```
diff --git a/src/common/cuda_utils.h b/src/common/cuda_utils.h
index b4b10c2..0ada350 100644
--- a/src/common/cuda_utils.h
+++ b/src/common/cuda_utils.h
@@ -374,6 +374,22 @@ inline bool GetEnvAllowTensorCore() {
return allow_tensor_core;
}
+// The policy if the user hasn't set the environment variable
+// MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION
+#define MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION_DEFAULT false
+
+/*!
+ * \brief Returns global policy for TensorCore implicit type casting
+ */
+inline bool GetEnvAllowTensorCoreConversion() {
+ // Use of optional<bool> here permits: "0", "1", "true" and "false" to all be
+ // legal.
+ bool default_value = MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION_DEFAULT;
+ return dmlc::GetEnv("MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION",
+ dmlc::optional<bool>(default_value))
+ .value();
+}
+
#if CUDA_VERSION >= 9000
// Sets the cuBLAS math mode that determines the 'allow TensorCore' policy. Returns previous.
inline cublasMath_t SetCublasMathMode(cublasHandle_t blas_handle, cublasMath_t new_math_type) {
diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h
index b33a717..077428f 100644
--- a/src/operator/cudnn_rnn-inl.h
+++ b/src/operator/cudnn_rnn-inl.h
@@ -496,6 +496,11 @@ class CuDNNRNNOp : public Operator{
if (cudnn_tensor_core_ && rnn_algo == CUDNN_RNN_ALGO_STANDARD) {
math_type = CUDNN_TENSOR_OP_MATH;
}
+ #if CUDNN_VERSION >= 7200
+ if (GetEnvAllowTensorCore() && GetEnvAllowTensorCoreConversion() &&
+ (DataType<DType>::kFlag != kFloat16))
+ math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION;
+ #endif
CUDNN_CALL(cudnnSetRNNMatrixMathType(rnn_desc_, math_type));
#endif
// Get temp space sizes
diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h
index acdd649..53bd76c 100644
--- a/src/operator/nn/cudnn/cudnn_convolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h
@@ -543,6 +543,11 @@ class CuDNNConvolutionOp {
#if CUDNN_MAJOR >= 7
cudnnMathType_t math_type = cudnn_tensor_core_ ? CUDNN_TENSOR_OP_MATH
: CUDNN_DEFAULT_MATH;
+ #if CUDNN_VERSION >= 7200
+ if (GetEnvAllowTensorCore() && GetEnvAllowTensorCoreConversion() &&
+ (DataType<DType>::kFlag != kFloat16))
+ math_type = CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION;
+ #endif
CUDNN_CALL(cudnnSetConvolutionMathType(forward_conv_desc_, math_type));
CUDNN_CALL(cudnnSetConvolutionMathType(back_conv_desc_, math_type));
CUDNN_CALL(cudnnSetConvolutionMathType(back_conv_desc_w_, math_type));