Posted to commits@mxnet.apache.org by zh...@apache.org on 2020/07/05 06:32:21 UTC

[incubator-mxnet] branch v1.7.x updated: [v1.7.x] backport mixed type binary ops to v1.7.x (#18649)

This is an automated email from the ASF dual-hosted git repository.

zhasheng pushed a commit to branch v1.7.x
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/v1.7.x by this push:
     new 477affe  [v1.7.x] backport mixed type binary ops to v1.7.x (#18649)
477affe is described below

commit 477affeef45c2630825306138c07304727a5e18c
Author: Yijun Chen <ch...@gmail.com>
AuthorDate: Sun Jul 5 14:30:54 2020 +0800

    [v1.7.x] backport mixed type binary ops to v1.7.x (#18649)
    
    * Fix Windows GPU CI (#17962)
    
    Update Windows CI to use VS 2019 and enable the x64 toolchain. Previously we were using an older 32-bit toolchain, causing OOM errors during linking. Switching to the x64 toolchain on the older VS version previously used by the CI was attempted in #17912 and did not work. Update to CUDA 10.2, as it is required by VS 2019. Switch to ninja-build on Windows to speed up builds, as ninja-build is now preinstalled. Remove the logic to install cmake 3.16 on every PR, as cmake 3.17 is now preinstalled. [...]
    
    Co-authored-by: vexilligera <ve...@gmail.com>
    
    * backport mixed type
    
    Co-authored-by: Leonard Lausen <la...@amazon.com>
    Co-authored-by: vexilligera <ve...@gmail.com>
---
 .gitignore                                         |   4 +
 ci/build_windows.py                                |  68 +++++++----
 include/mxnet/imperative.h                         |  11 ++
 python/mxnet/symbol/numpy/_symbol.py               |   8 +-
 src/common/utils.h                                 |  19 +++
 src/operator/contrib/gradient_multiplier_op.cc     |   6 +-
 src/operator/mshadow_op.h                          |  83 +++++++++++--
 src/operator/mxnet_op.h                            |   9 +-
 .../numpy/np_elemwise_broadcast_logic_op.cc        |  18 +--
 src/operator/numpy/np_elemwise_broadcast_op.cc     | 136 +++++++++++----------
 src/operator/numpy/np_elemwise_broadcast_op.cu     |  41 +++----
 src/operator/numpy/np_elemwise_broadcast_op.h      | 112 +++++++----------
 .../numpy/np_elemwise_broadcast_op_extended.cc     |  68 +++++------
 src/operator/numpy/np_matrix_op-inl.h              |   4 +-
 src/operator/numpy/np_true_divide-inl.h            | 124 ++-----------------
 src/operator/numpy/np_true_divide.cc               |  40 +++---
 src/operator/numpy/np_true_divide.cu               |   4 +
 src/operator/operator_tune.cc                      |   2 +
 src/operator/tensor/elemwise_binary_broadcast_op.h |   4 +-
 src/operator/tensor/elemwise_binary_scalar_op.h    | 123 +++++++++++++++----
 .../tensor/elemwise_binary_scalar_op_basic.cc      |  49 ++++----
 .../tensor/elemwise_binary_scalar_op_extended.cc   |  36 ++----
 .../tensor/elemwise_binary_scalar_op_logic.cc      |   3 +-
 tests/python/unittest/test_numpy_op.py             | 103 +++++++++-------
 tests/python/unittest/test_symbol.py               |   2 -
 25 files changed, 568 insertions(+), 509 deletions(-)

diff --git a/.gitignore b/.gitignore
index c50d1ec..9fafdb1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -121,6 +121,10 @@ cmake_install.cmake
 # Mac OS X
 .DS_Store
 
+# Windows
+windows_package.7z
+windows_package
+
 #Notebook Automated Test
 !tests/nightly/test_tutorial_config.txt
 !tests/nightly/TestNotebook
diff --git a/ci/build_windows.py b/ci/build_windows.py
index 2590d21..b9c17a8 100755
--- a/ci/build_windows.py
+++ b/ci/build_windows.py
@@ -33,13 +33,15 @@ import time
 import zipfile
 from distutils.dir_util import copy_tree
 from enum import Enum
-from subprocess import check_call
+from subprocess import check_call, call
 
 from util import *
 
 KNOWN_VCVARS = {
+    # https://gitlab.kitware.com/cmake/cmake/issues/18920
     'VS 2015': r'C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\x86_amd64\vcvarsx86_amd64.bat',
-    'VS 2017': r'C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsx86_amd64.bat'
+    'VS 2017': r'C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsx86_amd64.bat',
+    'VS 2019': r'C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvars64.bat',
 }
 
 
@@ -54,6 +56,8 @@ class BuildFlavour(Enum):
 
 CMAKE_FLAGS = {
     'WIN_CPU': (
+        '-DCMAKE_C_COMPILER=cl '
+        '-DCMAKE_CXX_COMPILER=cl '
         '-DUSE_CUDA=OFF '
         '-DUSE_CUDNN=OFF '
         '-DENABLE_CUDA_RTC=OFF '
@@ -67,6 +71,8 @@ CMAKE_FLAGS = {
         '-DCMAKE_BUILD_TYPE=Release')
 
     , 'WIN_CPU_MKLDNN': (
+        '-DCMAKE_C_COMPILER=cl '
+        '-DCMAKE_CXX_COMPILER=cl '
         '-DUSE_CUDA=OFF '
         '-DUSE_CUDNN=OFF '
         '-DENABLE_CUDA_RTC=OFF '
@@ -80,6 +86,8 @@ CMAKE_FLAGS = {
         '-DCMAKE_BUILD_TYPE=Release')
 
     , 'WIN_CPU_MKLDNN_MKL': (
+        '-DCMAKE_C_COMPILER=cl '
+        '-DCMAKE_CXX_COMPILER=cl '
         '-DUSE_CUDA=OFF '
         '-DUSE_CUDNN=OFF '
         '-DENABLE_CUDA_RTC=OFF '
@@ -93,6 +101,8 @@ CMAKE_FLAGS = {
         '-DCMAKE_BUILD_TYPE=Release')
 
     , 'WIN_CPU_MKL': (
+        '-DCMAKE_C_COMPILER=cl '
+        '-DCMAKE_CXX_COMPILER=cl '
         '-DUSE_CUDA=OFF '
         '-DUSE_CUDNN=OFF '
         '-DENABLE_CUDA_RTC=OFF '
@@ -106,6 +116,8 @@ CMAKE_FLAGS = {
         '-DCMAKE_BUILD_TYPE=Release')
 
     , 'WIN_GPU': (
+        '-DCMAKE_C_COMPILER=cl '
+        '-DCMAKE_CXX_COMPILER=cl '
         '-DUSE_CUDA=ON '
         '-DUSE_CUDNN=ON '
         '-DENABLE_CUDA_RTC=ON '
@@ -120,6 +132,8 @@ CMAKE_FLAGS = {
         '-DCMAKE_BUILD_TYPE=Release')
 
     , 'WIN_GPU_MKLDNN': (
+        '-DCMAKE_C_COMPILER=cl '
+        '-DCMAKE_CXX_COMPILER=cl '
         '-DUSE_CUDA=ON '
         '-DUSE_CUDNN=ON '
         '-DENABLE_CUDA_RTC=ON '
@@ -140,38 +154,40 @@ def windows_build(args):
     logging.info("Using vcvars environment:\n{}".format(args.vcvars))
 
     path = args.output
-    os.makedirs(path, exist_ok=True)
 
-    mxnet_root = get_mxnet_root()
-    logging.info("Found MXNet root: {}".format(mxnet_root))
+    # cuda thrust + VS 2019 is flaky: try multiple times if fail
+    MAXIMUM_TRY = 5
+    build_try = 0
+
+    while build_try < MAXIMUM_TRY:
+        if os.path.exists(path):
+            shutil.rmtree(path)
+        os.makedirs(path, exist_ok=True)
 
-    url = 'https://github.com/Kitware/CMake/releases/download/v3.16.1/cmake-3.16.1-win64-x64.zip'
-    with tempfile.TemporaryDirectory() as tmpdir:
-        cmake_file_path = download_file(url, tmpdir)
-        with zipfile.ZipFile(cmake_file_path, 'r') as zip_ref:
-            # Create $tmpdir\cmake-3.16.1-win64-x64\bin\cmake.exe
-            zip_ref.extractall(tmpdir)
+        mxnet_root = get_mxnet_root()
+        logging.info("Found MXNet root: {}".format(mxnet_root))
 
         with remember_cwd():
             os.chdir(path)
-            cmd = "\"{}\" && {} -G \"NMake Makefiles JOM\" {} {}".format(
-                args.vcvars,
-                os.path.join(tmpdir, 'cmake-3.16.1-win64-x64', 'bin', 'cmake.exe'),
-                CMAKE_FLAGS[args.flavour], mxnet_root)
+            cmd = "\"{}\" && cmake -GNinja {} {}".format(args.vcvars,
+                                                         CMAKE_FLAGS[args.flavour],
+                                                         mxnet_root)
             logging.info("Generating project with CMake:\n{}".format(cmd))
             check_call(cmd, shell=True)
 
-            cmd = "\"{}\" && jom".format(args.vcvars)
-            logging.info("Building with jom:\n{}".format(cmd))
+            cmd = "\"{}\" && ninja".format(args.vcvars)
+            logging.info("Building:\n{}".format(cmd))
 
             t0 = int(time.time())
-            check_call(cmd, shell=True)
-
-            logging.info(
-                "Build flavour: {} complete in directory: \"{}\"".format(
-                    args.flavour, os.path.abspath(path)))
-            logging.info("Build took {}".format(
-                datetime.timedelta(seconds=int(time.time() - t0))))
+            ret = call(cmd, shell=True)
+
+            if ret != 0:
+                build_try += 1
+                logging.info("{} build(s) have failed".format(build_try))
+            else:
+                logging.info("Build flavour: {} complete in directory: \"{}\"".format(args.flavour, os.path.abspath(path)))
+                logging.info("Build took {}".format(datetime.timedelta(seconds=int(time.time() - t0))))
+                break
     windows_package(args)
 
 
@@ -233,7 +249,7 @@ def main():
 
     parser.add_argument("--vcvars",
         help="vcvars batch file location, typically inside vs studio install dir",
-        default=KNOWN_VCVARS['VS 2015'],
+        default=KNOWN_VCVARS['VS 2019'],
         type=str)
 
     parser.add_argument("--arch",
@@ -258,7 +274,7 @@ def main():
         if 'OpenCV_DIR' not in os.environ:
             os.environ["OpenCV_DIR"] = "C:\\Program Files\\OpenCV-v3.4.1\\build"
         if 'CUDA_PATH' not in os.environ:
-            os.environ["CUDA_PATH"] = "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v9.2"
+            os.environ["CUDA_PATH"] = "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2"
         if 'MKL_ROOT' not in os.environ:
             os.environ["MKL_ROOT"] = "C:\\Program Files (x86)\\IntelSWTools\\compilers_and_libraries\\windows\\mkl"
         windows_build(args)
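
The retry loop added above works around flaky CUDA Thrust compilation under VS 2019 by wiping the build directory and regenerating with CMake on every failed attempt. A minimal standalone sketch of the same pattern (build_with_retries and its cmd argument are illustrative, not the actual CI entry point):

    import logging
    import os
    import shutil
    from subprocess import call

    MAXIMUM_TRY = 5

    def build_with_retries(path, cmd):
        # Wipe and recreate the build tree before every attempt, as the CI
        # change above does, so a failed ninja run cannot poison the next one.
        for attempt in range(1, MAXIMUM_TRY + 1):
            if os.path.exists(path):
                shutil.rmtree(path)
            os.makedirs(path)
            if call(cmd, shell=True, cwd=path) == 0:
                return True
            logging.info("%d build(s) have failed", attempt)
        return False
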
diff --git a/include/mxnet/imperative.h b/include/mxnet/imperative.h
index 6a367b3..783ad26 100644
--- a/include/mxnet/imperative.h
+++ b/include/mxnet/imperative.h
@@ -117,6 +117,16 @@ class Imperative {
     }
     return is_np_shape_thread_local_ ? 1 : 0;
   }
+
+  /*! \brief return current numpy default dtype compatibility status.
+  * */
+  bool is_np_default_dtype() const {
+    if (is_np_default_dtype_global_) {
+      return true;
+    }
+    return false;
+  }
+
   /*! \brief specify numpy compatibility off, thread local on or global on. */
   bool set_is_np_shape(int is_np_shape) {
     NumpyShape flag = static_cast<NumpyShape>(is_np_shape);
@@ -215,6 +225,7 @@ class Imperative {
   static MX_THREAD_LOCAL bool is_np_shape_thread_local_;
 #endif
   bool is_np_shape_global_{false};
+  bool is_np_default_dtype_global_{false};
   /*! \brief node count used for naming */
   std::atomic<uint64_t> node_count_{0};
   /*! \brief variable count used for naming */
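
The new is_np_default_dtype() accessor decides whether operators that must invent an output dtype default to float64 (NumPy's default) or to MXNet's traditional float32; its consumer is GetDefaultDtype in src/common/utils.h below. A rough Python rendering of the combined logic, with a plain boolean standing in for the Imperative singleton:

    def get_default_dtype(dtype=None, np_default_dtype=False):
        # Rough sketch of common::GetDefaultDtype: an explicit dtype wins;
        # otherwise NumPy-default-dtype mode selects float64, and the classic
        # MXNet default stays float32.
        if dtype is not None:
            return dtype
        return 'float64' if np_default_dtype else 'float32'
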
diff --git a/python/mxnet/symbol/numpy/_symbol.py b/python/mxnet/symbol/numpy/_symbol.py
index da8d5d7..5f81a73 100644
--- a/python/mxnet/symbol/numpy/_symbol.py
+++ b/python/mxnet/symbol/numpy/_symbol.py
@@ -1529,13 +1529,15 @@ def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None, ou
         if isinstance(rhs, numeric_types):
             return fn_scalar(lhs, rhs, out=out)
         else:
+            is_int = isinstance(rhs, integer_types)
             if rfn_scalar is None:
                 # commutative function
-                return lfn_scalar(rhs, float(lhs), out=out)
+                return lfn_scalar(rhs, scalar=float(lhs), is_int=is_int, out=out)
             else:
-                return rfn_scalar(rhs, float(lhs), out=out)
+                return rfn_scalar(rhs, scalar=float(lhs), is_int=is_int, out=out)
     elif isinstance(rhs, numeric_types):
-        return lfn_scalar(lhs, float(rhs), out=out)
+        is_int = isinstance(rhs, integer_types)
+        return lfn_scalar(lhs, scalar=float(rhs), is_int=is_int, out=out)
     elif isinstance(rhs, Symbol):
         return fn_array(lhs, rhs, out=out)
     else:
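
_ufunc_helper now forwards an is_int flag alongside the (always float) scalar so the backend can distinguish x + 2 from x + 2.0. A condensed sketch of the rhs-scalar path, with lfn_scalar as the same placeholder used above:

    from numbers import Integral, Number

    def apply_scalar_op(lfn_scalar, tensor, scalar_value):
        # Sketch of the rhs-scalar branch above: the scalar always travels as
        # a float, and is_int records whether it started out integral so the
        # backend can keep an integer result dtype.
        assert isinstance(scalar_value, Number)
        return lfn_scalar(tensor, scalar=float(scalar_value),
                          is_int=isinstance(scalar_value, Integral))
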
diff --git a/src/common/utils.h b/src/common/utils.h
index 44d4fc3..2278307 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -31,6 +31,7 @@
 #include <nnvm/node.h>
 #include <mxnet/engine.h>
 #include <mxnet/ndarray.h>
+#include <mxnet/imperative.h>
 #include <mxnet/op_attr_types.h>
 #include <mxnet/graph_attr_types.h>
 #include <nnvm/graph_attr_types.h>
@@ -874,6 +875,11 @@ inline bool is_float(const int dtype) {
   return dtype == mshadow::kFloat32 || dtype == mshadow::kFloat64 || dtype == mshadow::kFloat16;
 }
 
+inline bool is_int(const int dtype) {
+  return dtype == mshadow::kUint8 || dtype == mshadow::kInt8 ||
+         dtype == mshadow::kInt32 || dtype == mshadow::kInt64;
+}
+
 inline int get_more_precise_type(const int type1, const int type2) {
   if (type1 == type2) return type1;
   if (is_float(type1) && is_float(type2)) {
@@ -910,6 +916,19 @@ inline int np_binary_out_infer_type(const int type1, const int type2) {
   return get_more_precise_type(type1, type2);
 }
 
+inline int GetDefaultDtype() {
+  return Imperative::Get()->is_np_default_dtype() ?
+         mshadow::kFloat64 :
+         mshadow::kFloat32;
+}
+
+inline int GetDefaultDtype(int dtype) {
+  if (dtype != -1) return dtype;
+  return Imperative::Get()->is_np_default_dtype() ?
+         mshadow::kFloat64 :
+         mshadow::kFloat32;
+}
+
 }  // namespace common
 }  // namespace mxnet
 #endif  // MXNET_COMMON_UTILS_H_
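
is_int complements the existing is_float so the mixed-precision paths can classify operand pairs. A minimal Python equivalent, with dtype names standing in for the mshadow enum values:

    FLOATS = ('float16', 'float32', 'float64')
    INTS = ('uint8', 'int8', 'int32', 'int64')

    def is_float(dtype):
        return dtype in FLOATS

    def is_int(dtype):
        # Same four integer dtypes the new common::is_int checks.
        return dtype in INTS

    def needs_mixed_kernel(t1, t2):
        # The mixed-type kernels below use these predicates to tell an
        # int/float pair apart from an int/int pair.
        return is_float(t1) != is_float(t2)
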
diff --git a/src/operator/contrib/gradient_multiplier_op.cc b/src/operator/contrib/gradient_multiplier_op.cc
index 0a49ec1..5221d89 100644
--- a/src/operator/contrib/gradient_multiplier_op.cc
+++ b/src/operator/contrib/gradient_multiplier_op.cc
@@ -77,9 +77,7 @@ In forward pass it acts as an identity transform. During backpropagation it
 multiplies the gradient from the subsequent level by a scalar factor lambda and passes it to
 the preceding layer.
 )code" ADD_FILELINE)
-.set_attr_parser([](NodeAttrs* attrs) {
-    attrs->parsed = dmlc::stod(attrs->dict["scalar"]);
-  })
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<FInferStorageType>("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>)
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
 .set_attr<FComputeEx>("FComputeEx<cpu>", UnaryOp::IdentityComputeEx<cpu>)
@@ -88,7 +86,7 @@ the preceding layer.
   [](const NodeAttrs& attrs){
     return std::vector<bool>{true};
   })
-.add_argument("scalar", "float", "lambda multiplier");
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__());
 
 MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_contrib_backward_gradientmultiplier)
 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index 2d4d492..61d299d 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -148,7 +148,6 @@ struct true_divide : public mxnet_op::tunable  {
     return static_cast<float>(a) / static_cast<float>(b);
   }
 
-#ifndef _WIN32
   template<typename DType,
            typename std::enable_if<std::is_integral<DType>::value, int>::type = 0>
   MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) {
@@ -166,7 +165,6 @@ struct true_divide : public mxnet_op::tunable  {
   MSHADOW_XINLINE static double Map(DType a, double b) {
     return static_cast<double>(a) / b;
   }
-#endif
 };
 
 struct rtrue_divide : public mxnet_op::tunable  {
@@ -182,7 +180,6 @@ struct rtrue_divide : public mxnet_op::tunable  {
     return static_cast<float>(b) / static_cast<float>(a);
   }
 
-#ifndef _WIN32
   template<typename DType,
            typename std::enable_if<std::is_integral<DType>::value, int>::type = 0>
   MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) {
@@ -200,14 +197,12 @@ struct rtrue_divide : public mxnet_op::tunable  {
   MSHADOW_XINLINE static double Map(DType a, double b) {
     return b / static_cast<double>(a);
   }
-#endif
 };
 
 MXNET_BINARY_MATH_OP_NC(left, a);
 
 MXNET_BINARY_MATH_OP_NC(right, b);
 
-#ifndef _WIN32
 struct mixed_plus {
   template<typename DType,
            typename std::enable_if<std::is_integral<DType>::value, int>::type = 0>
@@ -345,8 +340,12 @@ struct mixed_rpower {
     return static_cast<double>(math::pow(b, a));
   }
 };
-#endif
 
+#pragma GCC diagnostic push
+#if __GNUC__ >= 7
+#pragma GCC diagnostic ignored "-Wint-in-bool-context"
+#pragma GCC diagnostic ignored "-Wbool-compare"
+#endif
 MXNET_BINARY_MATH_OP_NC_WITH_BOOL(mul, a * b);
 
 MXNET_BINARY_MATH_OP_NC_WITH_BOOL(div, a / b);
@@ -575,7 +574,6 @@ MXNET_BINARY_MATH_OP(rpower, math::pow(b, a));
 MXNET_BINARY_MATH_OP(rpower_grad, math::id(a) * math::log(b));
 
 MXNET_BINARY_MATH_OP(arctan2, math::atan2(a, b));
-
 MXNET_BINARY_MATH_OP(arctan2_grad, math::id(b) / (math::id(a * a + b * b)));
 
 MXNET_BINARY_MATH_OP(arctan2_rgrad, -math::id(a) / (math::id(a * a + b * b)));
@@ -728,6 +726,10 @@ MXNET_BINARY_MATH_OP_NC(minus_sign, a - b > DType(0) ? DType(1) : -DType(1));
 
 MXNET_BINARY_MATH_OP(rminus, b - a);
 
+MXNET_BINARY_MATH_OP_NC(posone, 1);
+
+MXNET_BINARY_MATH_OP_NC(negone, -1);
+
 MXNET_BINARY_MATH_OP(div_grad, 1.0f / math::id(b));
 
 template<>
@@ -795,6 +797,73 @@ struct mod : public mxnet_op::tunable {
   }
 };
 
+struct mixed_mod {
+  template<typename DType,
+           typename std::enable_if<std::is_integral<DType>::value, int>::type = 0>
+  MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) {
+    return mod::Map(static_cast<mshadow::half::half_t>(a), b);
+  }
+
+  template<typename DType,
+           typename std::enable_if<std::is_same<DType, mshadow::half::half_t>::value ||
+                                   std::is_integral<DType>::value, int>::type = 0>
+  MSHADOW_XINLINE static float Map(DType a, float b) {
+    return mod::Map(static_cast<float>(a), b);
+  }
+
+  template<typename DType,
+           typename std::enable_if<std::is_same<DType, mshadow::half::half_t>::value ||
+                                   std::is_same<DType, float>::value ||
+                                   std::is_integral<DType>::value, int>::type = 0>
+  MSHADOW_XINLINE static double Map(DType a, double b) {
+    return mod::Map(static_cast<double>(a), b);
+  }
+};
+
+struct mixed_rmod {
+  template<typename DType,
+           typename std::enable_if<std::is_integral<DType>::value, int>::type = 0>
+  MSHADOW_XINLINE static mshadow::half::half_t Map(DType a, mshadow::half::half_t b) {
+    return mod::Map(b, static_cast<mshadow::half::half_t>(a));
+  }
+
+  template<typename DType,
+           typename std::enable_if<std::is_same<DType, mshadow::half::half_t>::value ||
+                                   std::is_integral<DType>::value, int>::type = 0>
+  MSHADOW_XINLINE static float Map(DType a, float b) {
+    return mod::Map(b, static_cast<float>(a));
+  }
+
+  template<typename DType,
+           typename std::enable_if<std::is_same<DType, mshadow::half::half_t>::value ||
+                                   std::is_same<DType, float>::value ||
+                                   std::is_integral<DType>::value, int>::type = 0>
+  MSHADOW_XINLINE static double Map(DType a, double b) {
+    return mod::Map(b, static_cast<double>(a));
+  }
+};
+
+struct fmod : public mxnet_op::tunable {
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {
+    if (b == DType(0)) {
+      return DType(0);
+    } else {
+        return DType(::fmod(static_cast<double>(a), static_cast<double>(b)));
+    }
+  }
+};
+
+struct rfmod : public mxnet_op::tunable {
+  template<typename DType>
+  MSHADOW_XINLINE static DType Map(DType a, DType b) {
+    if (a == DType(0)) {
+      return DType(0);
+    } else  {
+      return DType(::fmod(static_cast<double>(b), static_cast<double>(a)));
+    }
+  }
+};
 
 template<>
 MSHADOW_XINLINE mshadow::half::half2_t mod::Map<mshadow::half::half2_t>
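
Two details worth noting in the operators added above: mixed_mod/mixed_rmod promote the integral operand to the other side's floating type before delegating to mod::Map, and fmod/rfmod define a zero divisor to yield zero rather than NaN. The latter convention, sketched in Python:

    import math

    def fmod_op(a, b):
        # Mirrors mshadow_op::fmod above: a zero divisor yields 0, not NaN.
        return 0.0 if b == 0 else math.fmod(a, b)

    def rfmod_op(a, b):
        # Reversed-operand variant: computes fmod(b, a) with the same rule.
        return 0.0 if a == 0 else math.fmod(b, a)
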
diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h
index f7dbce2..0c2092f 100644
--- a/src/operator/mxnet_op.h
+++ b/src/operator/mxnet_op.h
@@ -849,7 +849,13 @@ struct op_with_req {
     KERNEL_ASSIGN(out[i], req, OP::Map(in[i], value));
   }
 
-#ifndef _WIN32
+  /*! \brief input is two tensors with different type and with a boolean output tensor */
+  template<typename LType, typename RType,
+           typename std::enable_if<!std::is_same<LType, RType>::value, int>::type = 0>
+  MSHADOW_XINLINE static void Map(index_t i, bool *out, const LType *lhs, const RType *rhs) {
+    KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], rhs[i]));
+  }
+
   /*! \brief inputs are two tensors with a half_t output tensor */
   template<typename DType,
            typename std::enable_if<std::is_integral<DType>::value, int>::type = 0>
@@ -903,7 +909,6 @@ struct op_with_req {
   MSHADOW_XINLINE static void Map(index_t i, double *out, const DType *lhs, const double value) {
     KERNEL_ASSIGN(out[i], req, OP::Map(lhs[i], value));
   }
-#endif
 
   /*! \brief inputs are two tensors with a float output tensor */
   template<typename DType,
diff --git a/src/operator/numpy/np_elemwise_broadcast_logic_op.cc b/src/operator/numpy/np_elemwise_broadcast_logic_op.cc
index 74db52d..a54c7ca 100644
--- a/src/operator/numpy/np_elemwise_broadcast_logic_op.cc
+++ b/src/operator/numpy/np_elemwise_broadcast_logic_op.cc
@@ -206,7 +206,8 @@ struct TVMBinaryBroadcastScalarCompute {
 
     // scalar param
     type_codes[1] = kDLFloat;
-    values[1].v_float64 = nnvm::get<double>(attrs.parsed);
+    const NumpyBinaryScalarParam& param = nnvm::get<NumpyBinaryScalarParam>(attrs.parsed);
+    values[1].v_float64 = param.scalar;
 
     // output tensor
     type_codes[2] = kTVMDLTensorHandle;
@@ -225,9 +226,7 @@ struct TVMBinaryBroadcastScalarCompute {
   NNVM_REGISTER_OP(_npi_##name##_scalar)                                                    \
   .set_num_inputs(1)                                                                        \
   .set_num_outputs(1)                                                                       \
-  .set_attr_parser([](NodeAttrs* attrs) {                                                   \
-    attrs->parsed = dmlc::stod(attrs->dict["scalar"]);                                       \
-  })                                                                                        \
+  .set_attr_parser(ParamParser<NumpyBinaryScalarParam>)                                     \
   .set_attr<nnvm::FListInputNames>("FListInputNames",                                       \
   [](const NodeAttrs& attrs) {                                                              \
     return std::vector<std::string>{"data"};                                                \
@@ -240,7 +239,7 @@ struct TVMBinaryBroadcastScalarCompute {
   })                                                                                        \
   .set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)                                \
   .add_argument("data", "NDArray-or-Symbol", "First input to the function")                 \
-  .add_argument("scalar", "float", "scalar input")
+  .add_arguments(NumpyBinaryScalarParam::__FIELDS__())
 
 MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR_LOGIC(equal);
 MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR_LOGIC(not_equal);
@@ -285,9 +284,12 @@ MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR_LOGIC_GPU(less_equal);
 
 #else
 
-#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR_LOGIC_CPU(name)                               \
-  NNVM_REGISTER_OP(_npi_##name##_scalar)                                                       \
-  .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::ComputeLogic<cpu, mshadow_op::np_##name>)
+#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR_LOGIC_CPU(name)                                  \
+  NNVM_REGISTER_OP(_npi_##name##_scalar)                                                          \
+  .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::ComputeLogic<cpu, mshadow_op::np_##name>)  \
+  .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {                        \
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};                             \
+  })
 
 #endif  // MXNET_USE_TVM_OP
 
diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cc b/src/operator/numpy/np_elemwise_broadcast_op.cc
index ae285ca..ae6697c 100644
--- a/src/operator/numpy/np_elemwise_broadcast_op.cc
+++ b/src/operator/numpy/np_elemwise_broadcast_op.cc
@@ -23,27 +23,26 @@
  * \brief CPU Implementation of basic functions for elementwise numpy binary broadcast operator.
  */
 
-#include <dmlc/strtonum.h>
 #include "./np_elemwise_broadcast_op.h"
 
 namespace mxnet {
 namespace op {
 
-#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(name)              \
-  NNVM_REGISTER_OP(name)                                            \
-  .set_num_inputs(1)                                                \
-  .set_num_outputs(1)                                               \
-  .set_attr_parser([](NodeAttrs* attrs) {                           \
-      attrs->parsed = dmlc::stod(attrs->dict["scalar"]);             \
-    })                                                              \
-  .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>) \
-  .set_attr<nnvm::FInferType>("FInferType", NumpyBinaryScalarType)  \
-  .set_attr<nnvm::FInplaceOption>("FInplaceOption",                 \
-    [](const NodeAttrs& attrs){                                     \
-      return std::vector<std::pair<int, int> >{{0, 0}};             \
-    })                                                              \
-  .add_argument("data", "NDArray-or-Symbol", "source input")        \
-  .add_argument("scalar", "float", "scalar input")
+DMLC_REGISTER_PARAMETER(NumpyBinaryScalarParam);
+
+#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(name)                    \
+  NNVM_REGISTER_OP(name)                                                  \
+  .set_num_inputs(1)                                                      \
+  .set_num_outputs(1)                                                     \
+  .set_attr_parser(ParamParser<NumpyBinaryScalarParam>)                   \
+  .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)       \
+  .set_attr<nnvm::FInferType>("FInferType", NumpyBinaryScalarType)        \
+  .set_attr<FResourceRequest>("FResourceRequest",                         \
+    [](const NodeAttrs& attrs) {                                          \
+      return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};   \
+    })                                                                    \
+  .add_argument("data", "NDArray-or-Symbol", "source input")              \
+  .add_arguments(NumpyBinaryScalarParam::__FIELDS__())
 
 bool NumpyBinaryMixedPrecisionType(const nnvm::NodeAttrs& attrs,
                                    std::vector<int>* in_attrs,
@@ -61,7 +60,6 @@ bool NumpyBinaryMixedPrecisionType(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
-#ifndef _WIN32
 #define MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(name)                \
   NNVM_REGISTER_OP(name)                                                       \
   .set_num_inputs(2)                                                           \
@@ -76,68 +74,62 @@ bool NumpyBinaryMixedPrecisionType(const nnvm::NodeAttrs& attrs,
     [](const NodeAttrs& attrs){                                                \
       return std::vector<std::pair<int, int> >{{0, 0}, {1, 0}};                \
     })                                                                         \
-  .add_argument("lhs", "NDArray-or-Symbol", "First input to the function")     \
-  .add_argument("rhs", "NDArray-or-Symbol", "Second input to the function")
-#else
-#define MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(name)                \
-  NNVM_REGISTER_OP(name)                                                       \
-  .set_num_inputs(2)                                                           \
-  .set_num_outputs(1)                                                          \
-  .set_attr<nnvm::FListInputNames>("FListInputNames",                          \
+  .set_attr<FResourceRequest>("FResourceRequest",                              \
     [](const NodeAttrs& attrs) {                                               \
-      return std::vector<std::string>{"lhs", "rhs"};                           \
-    })                                                                         \
-  .set_attr<mxnet::FInferShape>("FInferShape", BinaryBroadcastShape)           \
-  .set_attr<nnvm::FInferType>("FInferType", NumpyBinaryMixedPrecisionType)     \
-  .set_attr<nnvm::FInplaceOption>("FInplaceOption",                            \
-    [](const NodeAttrs& attrs){                                                \
-      return std::vector<std::pair<int, int> >{{0, 0}, {1, 0}};                \
+      return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};        \
     })                                                                         \
-  .set_attr<FResourceRequest>("FResourceRequest",                              \
-  [](const NodeAttrs& attrs) {                                                 \
-    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};          \
-  })                                                                           \
   .add_argument("lhs", "NDArray-or-Symbol", "First input to the function")     \
   .add_argument("rhs", "NDArray-or-Symbol", "Second input to the function")
-#endif
 
 MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_add)
-#ifndef _WIN32
 .set_attr<FCompute>(
   "FCompute<cpu>",
   NumpyBinaryBroadcastComputeWithBool<cpu, op::mshadow_op::plus, op::mshadow_op::mixed_plus,
                                       op::mshadow_op::mixed_plus>)
-#else
-.set_attr<FCompute>(
-  "FCompute<cpu>",
-  NumpyBinaryBroadcastComputeWithBool<cpu, op::mshadow_op::plus>)
-#endif
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_broadcast_add"});
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_add"});
+
+NNVM_REGISTER_OP(_backward_npi_broadcast_add)
+.set_num_inputs(3)
+.set_num_outputs(2)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs){
+    return std::vector<std::pair<int, int> >{{0, 0}, {0, 1}};
+  })
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<FCompute>("FCompute<cpu>", NumpyBinaryBackwardUseIn<cpu, mshadow_op::posone,
+                                                                mshadow_op::posone>);
 
 MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_subtract)
-#ifndef _WIN32
 .set_attr<FCompute>(
   "FCompute<cpu>",
   NumpyBinaryBroadcastCompute<cpu, op::mshadow_op::minus, op::mshadow_op::mixed_minus,
                               op::mshadow_op::mixed_rminus>)
-#else
-.set_attr<FCompute>(
-  "FCompute<cpu>",
-  NumpyBinaryBroadcastCompute<cpu, op::mshadow_op::minus>)
-#endif
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_broadcast_sub"});
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_sub"});
+
+NNVM_REGISTER_OP(_backward_npi_broadcast_sub)
+.set_num_inputs(3)
+.set_num_outputs(2)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs){
+    return std::vector<std::pair<int, int> >{{0, 0}, {0, 1}};
+  })
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<FCompute>("FCompute<cpu>", NumpyBinaryBackwardUseIn<cpu, mshadow_op::posone,
+                                                                mshadow_op::negone>);
 
 MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_multiply)
-#ifndef _WIN32
 .set_attr<FCompute>(
   "FCompute<cpu>",
   NumpyBinaryBroadcastComputeWithBool<cpu, op::mshadow_op::mul, op::mshadow_op::mixed_mul,
                                       op::mshadow_op::mixed_mul>)
-#else
-.set_attr<FCompute>(
-  "FCompute<cpu>",
-  NumpyBinaryBroadcastComputeWithBool<cpu, op::mshadow_op::mul>)
-#endif
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_mul"});
 
 NNVM_REGISTER_OP(_backward_npi_broadcast_mul)
@@ -155,21 +147,33 @@ NNVM_REGISTER_OP(_backward_npi_broadcast_mul)
 .set_attr<FCompute>("FCompute<cpu>", NumpyBinaryBackwardUseIn<cpu, mshadow_op::right,
                                                               mshadow_op::left>);
 
-MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_mod)
-.set_attr<FCompute>("FCompute<cpu>", BinaryBroadcastCompute<cpu, mshadow_op::mod>)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_broadcast_mod"});
+MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_mod)
+.set_attr<FCompute>(
+  "FCompute<cpu>",
+  NumpyBinaryBroadcastCompute<cpu, op::mshadow_op::mod, op::mshadow_op::mixed_mod,
+                                      op::mshadow_op::mixed_rmod>)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_mod"});
+
+NNVM_REGISTER_OP(_backward_npi_broadcast_mod)
+.set_num_inputs(3)
+.set_num_outputs(2)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs){
+    return std::vector<std::pair<int, int> >{{0, 1}};
+  })
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<FCompute>("FCompute<cpu>", NumpyBinaryBackwardUseIn<cpu, mshadow_op::mod_grad,
+                                                              mshadow_op::mod_rgrad>);
 
 MXNET_OPERATOR_REGISTER_NP_BINARY_MIXED_PRECISION(_npi_power)
-#ifndef _WIN32
 .set_attr<FCompute>(
   "FCompute<cpu>",
   NumpyBinaryBroadcastComputeWithBool<cpu, op::mshadow_op::power, op::mshadow_op::mixed_power,
                                       op::mshadow_op::mixed_rpower>)
-#else
-.set_attr<FCompute>(
-  "FCompute<cpu>",
-  NumpyBinaryBroadcastComputeWithBool<cpu, op::mshadow_op::power>)
-#endif
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_power"});
 
 NNVM_REGISTER_OP(_backward_npi_broadcast_power)
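
The backward registrations above encode the constant partial derivatives of addition and subtraction: posone/posone for a + b and posone/negone for a - b, with NumpyBinaryBackwardUseIn handling the reduction over broadcast axes. A NumPy sketch of the subtract case, assuming that reduction is a plain sum over the broadcast dimensions:

    import numpy as np

    def reduce_to_shape(grad, shape):
        # Sum a broadcast gradient back down to the input's original shape.
        while grad.ndim > len(shape):
            grad = grad.sum(axis=0)
        for axis, size in enumerate(shape):
            if size == 1 and grad.shape[axis] != 1:
                grad = grad.sum(axis=axis, keepdims=True)
        return grad

    def broadcast_sub_backward(ograd, a, b):
        # posone/negone: d(a - b)/da = +1 and d(a - b)/db = -1.
        return reduce_to_shape(ograd, a.shape), reduce_to_shape(-ograd, b.shape)
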
diff --git a/src/operator/numpy/np_elemwise_broadcast_op.cu b/src/operator/numpy/np_elemwise_broadcast_op.cu
index 1e01304..a2927cd 100644
--- a/src/operator/numpy/np_elemwise_broadcast_op.cu
+++ b/src/operator/numpy/np_elemwise_broadcast_op.cu
@@ -29,59 +29,50 @@ namespace mxnet {
 namespace op {
 
 NNVM_REGISTER_OP(_npi_add)
-#ifndef _WIN32
 .set_attr<FCompute>(
   "FCompute<gpu>",
   NumpyBinaryBroadcastComputeWithBool<gpu, op::mshadow_op::plus, op::mshadow_op::mixed_plus,
                                       op::mshadow_op::mixed_plus>);
-#else
-.set_attr<FCompute>(
-  "FCompute<gpu>",
-  NumpyBinaryBroadcastComputeWithBool<gpu, op::mshadow_op::plus>);
-#endif
+
+NNVM_REGISTER_OP(_backward_npi_broadcast_add)
+.set_attr<FCompute>("FCompute<gpu>", NumpyBinaryBackwardUseIn<gpu, mshadow_op::posone,
+                                                                mshadow_op::posone>);
 
 NNVM_REGISTER_OP(_npi_subtract)
-#ifndef _WIN32
 .set_attr<FCompute>(
   "FCompute<gpu>",
   NumpyBinaryBroadcastCompute<gpu, op::mshadow_op::minus, op::mshadow_op::mixed_minus,
                               op::mshadow_op::mixed_rminus>);
-#else
-.set_attr<FCompute>(
-  "FCompute<gpu>",
-  NumpyBinaryBroadcastCompute<gpu, op::mshadow_op::minus>);
-#endif
+
+NNVM_REGISTER_OP(_backward_npi_broadcast_sub)
+.set_attr<FCompute>("FCompute<gpu>", NumpyBinaryBackwardUseIn<gpu, mshadow_op::posone,
+                                                                mshadow_op::negone>);
 
 NNVM_REGISTER_OP(_npi_multiply)
-#ifndef _WIN32
 .set_attr<FCompute>(
   "FCompute<gpu>",
   NumpyBinaryBroadcastComputeWithBool<gpu, op::mshadow_op::mul, op::mshadow_op::mixed_mul,
                                       op::mshadow_op::mixed_mul>);
-#else
-.set_attr<FCompute>(
-  "FCompute<gpu>",
-  NumpyBinaryBroadcastComputeWithBool<gpu, op::mshadow_op::mul>);
-#endif
 
 NNVM_REGISTER_OP(_backward_npi_broadcast_mul)
 .set_attr<FCompute>("FCompute<gpu>", NumpyBinaryBackwardUseIn<gpu, mshadow_op::right,
                                                               mshadow_op::left>);
 
 NNVM_REGISTER_OP(_npi_mod)
-.set_attr<FCompute>("FCompute<gpu>", BinaryBroadcastCompute<gpu, mshadow_op::mod>);
+.set_attr<FCompute>(
+  "FCompute<gpu>",
+  NumpyBinaryBroadcastCompute<gpu, op::mshadow_op::mod, op::mshadow_op::mixed_mod,
+                                      op::mshadow_op::mixed_rmod>);
+
+NNVM_REGISTER_OP(_backward_npi_broadcast_mod)
+.set_attr<FCompute>("FCompute<gpu>", NumpyBinaryBackwardUseIn<gpu, mshadow_op::mod_grad,
+                                                              mshadow_op::mod_rgrad>);
 
 NNVM_REGISTER_OP(_npi_power)
-#ifndef _WIN32
 .set_attr<FCompute>(
   "FCompute<gpu>",
   NumpyBinaryBroadcastComputeWithBool<gpu, op::mshadow_op::power, op::mshadow_op::mixed_power,
                                       op::mshadow_op::mixed_rpower>);
-#else
-.set_attr<FCompute>(
-  "FCompute<gpu>",
-  NumpyBinaryBroadcastComputeWithBool<gpu, op::mshadow_op::power>);
-#endif
 
 NNVM_REGISTER_OP(_backward_npi_broadcast_power)
 .set_attr<FCompute>("FCompute<gpu>", NumpyBinaryBackwardUseIn<gpu, mshadow_op::power_grad,
diff --git a/src/operator/numpy/np_elemwise_broadcast_op.h b/src/operator/numpy/np_elemwise_broadcast_op.h
index d58245a..ce53eb6 100644
--- a/src/operator/numpy/np_elemwise_broadcast_op.h
+++ b/src/operator/numpy/np_elemwise_broadcast_op.h
@@ -35,23 +35,12 @@
 namespace mxnet {
 namespace op {
 
-inline bool NumpyBinaryScalarType(const nnvm::NodeAttrs& attrs,
-                           std::vector<int>* in_attrs,
-                           std::vector<int>* out_attrs) {
-  CHECK_EQ(in_attrs->size(), 1U);
-  CHECK_EQ(out_attrs->size(), 1U);
-  TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
-  TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
-  return in_attrs->at(0) != -1;
-}
-
 inline void PrintErrorMessage(const std::string& op_name, const int dtype1, const int dtype2) {
   LOG(FATAL) << "Operator " << op_name << " does not support combination of "
              << mshadow::dtype_string(dtype1) << " with " << mshadow::dtype_string(dtype2)
              << " yet...";
 }
 
-#ifndef _WIN32
 template<typename xpu, typename OP>
 void MixedAllRealBinaryElemwiseCompute(const std::string& op_name,
                                        const OpContext& ctx,
@@ -153,7 +142,6 @@ void MixedBinaryElemwiseCompute(const nnvm::NodeAttrs& attrs,
   const TBlob& lhs = inputs[0];
   const TBlob& rhs = inputs[1];
   const TBlob& out = outputs[0];
-
   if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) {
     if (lhs.type_flag_ == out.type_flag_) {
       MixedAllRealBinaryElemwiseCompute<xpu, ROP>(attrs.op->name, ctx, lhs, rhs, out, req[0]);
@@ -227,13 +215,9 @@ void MixedAllRealBinaryBroadcastCompute(const std::string& op_name,
     }
   });
 }
-#endif
 
-#ifndef _WIN32
+
 template<typename xpu, typename OP, typename LOP, typename ROP>
-#else
-template<typename xpu, typename OP>
-#endif
 void MixedBinaryBroadcastCompute(const nnvm::NodeAttrs& attrs,
                                  const OpContext& ctx,
                                  const std::vector<TBlob>& inputs,
@@ -248,11 +232,9 @@ void MixedBinaryBroadcastCompute(const nnvm::NodeAttrs& attrs,
   const TBlob& rhs = inputs[1];
   const TBlob& out = outputs[0];
 
-#ifndef _WIN32
   mxnet::TShape new_lshape, new_rshape, new_oshape;
   int ndim = BinaryBroadcastShapeCompact(lhs.shape_, rhs.shape_, out.shape_,
                                          &new_lshape, &new_rshape, &new_oshape);
-
   if (!ndim) {
     MixedBinaryElemwiseCompute<xpu, LOP, ROP>(attrs, ctx, inputs, req, outputs);
   } else {
@@ -290,47 +272,34 @@ void MixedBinaryBroadcastCompute(const nnvm::NodeAttrs& attrs,
           });
         }
       });
+    } else if (!common::is_float(lhs.type_flag_) && !common::is_float(rhs.type_flag_)) {
+      TBlob temp_tblob;
+      if (lhs.type_flag_ == out.type_flag_) {
+        MXNET_INT_TYPE_SWITCH(lhs.type_flag_, LType, {
+          Tensor<xpu, 1, LType> temp_tensor =
+            ctx.requested[0].get_space_typed<xpu, 1, LType>(Shape1(rhs.Size()), s);
+          temp_tblob = TBlob(temp_tensor);
+        });
+        CastCompute<xpu>(attrs, ctx, {rhs}, {kWriteTo}, {temp_tblob});
+        BinaryBroadcastCompute<xpu, OP>(
+          attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs);
+      } else {
+        MXNET_INT_TYPE_SWITCH(rhs.type_flag_, RType, {
+          Tensor<xpu, 1, RType> temp_tensor =
+            ctx.requested[0].get_space_typed<xpu, 1, RType>(Shape1(lhs.Size()), s);
+          temp_tblob = TBlob(temp_tensor);
+        });
+        CastCompute<xpu>(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob});
+        BinaryBroadcastCompute<xpu, OP>(
+          attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs);
+      }
     } else {
       PrintErrorMessage(attrs.op->name, lhs.type_flag_, rhs.type_flag_);
     }
   }
-#else
-  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) {
-    TBlob temp_tblob;
-    // one is float, the other is bool
-    CHECK((out.type_flag_ == lhs.type_flag_) || (out.type_flag_ == rhs.type_flag_))
-      << "This case out type should be same as the float type";
-    if (lhs.type_flag_ == out.type_flag_) {
-      MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, {
-        Tensor<xpu, 1, LType> temp_tensor =
-          ctx.requested[0].get_space_typed<xpu, 1, LType>(Shape1(rhs.Size()), s);
-        temp_tblob = TBlob(temp_tensor);
-      });
-      CastCompute<xpu>(attrs, ctx, {rhs}, {kWriteTo}, {temp_tblob});
-      BinaryBroadcastCompute<xpu, OP>(
-        attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs);
-    } else {
-      MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, {
-        Tensor<xpu, 1, RType> temp_tensor =
-          ctx.requested[0].get_space_typed<xpu, 1, RType>(Shape1(lhs.Size()), s);
-        temp_tblob = TBlob(temp_tensor);
-      });
-      CastCompute<xpu>(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob});
-      BinaryBroadcastCompute<xpu, OP>(
-        attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs);
-    }
-  } else {
-    PrintErrorMessage(attrs.op->name, lhs.type_flag_, rhs.type_flag_);
-  }
-#endif
 }
 
-#ifndef _WIN32
 template<typename xpu, typename OP, typename LOP, typename ROP>
-#else
-template<typename xpu, typename OP>
-#endif
 void NumpyBinaryBroadcastCompute(const nnvm::NodeAttrs& attrs,
                                  const OpContext& ctx,
                                  const std::vector<TBlob>& inputs,
@@ -352,18 +321,10 @@ void NumpyBinaryBroadcastCompute(const nnvm::NodeAttrs& attrs,
     return;
   }
 
-#ifndef _WIN32
   MixedBinaryBroadcastCompute<xpu, OP, LOP, ROP>(attrs, ctx, inputs, req, outputs);
-#else
-  MixedBinaryBroadcastCompute<xpu, OP>(attrs, ctx, inputs, req, outputs);
-#endif
 }
 
-#ifndef _WIN32
 template<typename xpu, typename OP, typename LOP, typename ROP>
-#else
-template<typename xpu, typename OP>
-#endif
 void NumpyBinaryBroadcastComputeWithBool(const nnvm::NodeAttrs& attrs,
                                          const OpContext& ctx,
                                          const std::vector<TBlob>& inputs,
@@ -384,12 +345,31 @@ void NumpyBinaryBroadcastComputeWithBool(const nnvm::NodeAttrs& attrs,
     BinaryBroadcastComputeWithBool<xpu, OP>(attrs, ctx, inputs, req, outputs);
     return;
   }
-
-#ifndef _WIN32
+  if (!common::is_float(lhs.type_flag_) && !common::is_float(rhs.type_flag_)) {
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    TBlob temp_tblob;
+    if (lhs.type_flag_ == out.type_flag_) {
+      MXNET_INT_TYPE_SWITCH(lhs.type_flag_, LType, {
+        Tensor<xpu, 1, LType> temp_tensor =
+          ctx.requested[0].get_space_typed<xpu, 1, LType>(Shape1(rhs.Size()), s);
+        temp_tblob = TBlob(temp_tensor);
+      });
+      CastCompute<xpu>(attrs, ctx, {rhs}, {kWriteTo}, {temp_tblob});
+      BinaryBroadcastCompute<xpu, OP>(
+        attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs);
+    } else {
+      MXNET_INT_TYPE_SWITCH(rhs.type_flag_, RType, {
+        Tensor<xpu, 1, RType> temp_tensor =
+          ctx.requested[0].get_space_typed<xpu, 1, RType>(Shape1(lhs.Size()), s);
+        temp_tblob = TBlob(temp_tensor);
+      });
+      CastCompute<xpu>(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob});
+      BinaryBroadcastCompute<xpu, OP>(
+        attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs);
+    }
+    return;
+  }
   MixedBinaryBroadcastCompute<xpu, OP, LOP, ROP>(attrs, ctx, inputs, req, outputs);
-#else
-  MixedBinaryBroadcastCompute<xpu, OP>(attrs, ctx, inputs, req, outputs);
-#endif
 }
 
 template<typename xpu, typename LOP, typename ROP>
diff --git a/src/operator/numpy/np_elemwise_broadcast_op_extended.cc b/src/operator/numpy/np_elemwise_broadcast_op_extended.cc
index 52d6818..60b721e 100644
--- a/src/operator/numpy/np_elemwise_broadcast_op_extended.cc
+++ b/src/operator/numpy/np_elemwise_broadcast_op_extended.cc
@@ -30,21 +30,19 @@
 namespace mxnet {
 namespace op {
 
-#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(name)              \
-  NNVM_REGISTER_OP(name)                                            \
-  .set_num_inputs(1)                                                \
-  .set_num_outputs(1)                                               \
-  .set_attr_parser([](NodeAttrs* attrs) {                           \
-      attrs->parsed = dmlc::stod(attrs->dict["scalar"]);             \
-    })                                                              \
-  .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>) \
-  .set_attr<nnvm::FInferType>("FInferType", NumpyBinaryScalarType)  \
-  .set_attr<nnvm::FInplaceOption>("FInplaceOption",                 \
-    [](const NodeAttrs& attrs){                                     \
-      return std::vector<std::pair<int, int> >{{0, 0}};             \
-    })                                                              \
-  .add_argument("data", "NDArray-or-Symbol", "source input")        \
-  .add_argument("scalar", "float", "scalar input")
+#define MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(name)                    \
+  NNVM_REGISTER_OP(name)                                                  \
+  .set_num_inputs(1)                                                      \
+  .set_num_outputs(1)                                                     \
+  .set_attr_parser(ParamParser<NumpyBinaryScalarParam>)                   \
+  .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)       \
+  .set_attr<nnvm::FInferType>("FInferType", NumpyBinaryScalarType)        \
+  .set_attr<FResourceRequest>("FResourceRequest",                         \
+    [](const NodeAttrs& attrs) {                                          \
+      return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};   \
+    })                                                                    \
+  .add_argument("data", "NDArray-or-Symbol", "source input")              \
+  .add_arguments(NumpyBinaryScalarParam::__FIELDS__())
 
 MXNET_OPERATOR_REGISTER_BINARY_BROADCAST(_npi_copysign)
 .describe(R"code()code" ADD_FILELINE)
@@ -87,9 +85,7 @@ NNVM_REGISTER_OP(_npi_lcm)
 NNVM_REGISTER_OP(_npi_lcm_scalar)
 .set_num_inputs(1)
 .set_num_outputs(1)
-.set_attr_parser([](NodeAttrs* attrs) {
-    attrs->parsed = dmlc::stod(attrs->dict["scalar"]);
-  })
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseIntType<1, 1>)
 .set_attr<nnvm::FInplaceOption>("FInplaceOption",
@@ -98,7 +94,7 @@ NNVM_REGISTER_OP(_npi_lcm_scalar)
   })
 .set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
 .add_argument("data", "NDArray-or-Symbol", "source input")
-.add_argument("scalar", "int", "scalar input")
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__())
 .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::Compute<cpu, mshadow_op::lcm>);
 
 NNVM_REGISTER_OP(_npi_bitwise_and)
@@ -122,9 +118,7 @@ NNVM_REGISTER_OP(_npi_bitwise_and)
 NNVM_REGISTER_OP(_npi_bitwise_and_scalar)
 .set_num_inputs(1)
 .set_num_outputs(1)
-.set_attr_parser([](NodeAttrs* attrs) {
-    attrs->parsed = std::stod(attrs->dict["scalar"]);
-  })
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseIntType<1, 1>)
 .set_attr<nnvm::FInplaceOption>("FInplaceOption",
@@ -133,7 +127,7 @@ NNVM_REGISTER_OP(_npi_bitwise_and_scalar)
   })
 .set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
 .add_argument("data", "NDArray-or-Symbol", "source input")
-.add_argument("scalar", "int", "scalar input")
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__())
 .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::ComputeInt<cpu, mshadow_op::bitwise_and>);
 
 NNVM_REGISTER_OP(_npi_bitwise_xor)
@@ -175,9 +169,7 @@ NNVM_REGISTER_OP(_npi_bitwise_or)
 NNVM_REGISTER_OP(_npi_bitwise_xor_scalar)
 .set_num_inputs(1)
 .set_num_outputs(1)
-.set_attr_parser([](NodeAttrs* attrs) {
-    attrs->parsed = dmlc::stod(attrs->dict["scalar"]);
-  })
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseIntType<1, 1>)
 .set_attr<nnvm::FInplaceOption>("FInplaceOption",
@@ -186,15 +178,13 @@ NNVM_REGISTER_OP(_npi_bitwise_xor_scalar)
   })
 .set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
 .add_argument("data", "NDArray-or-Symbol", "source input")
-.add_argument("scalar", "int", "scalar input")
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__())
 .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::ComputeInt<cpu, mshadow_op::bitwise_xor>);
 
 NNVM_REGISTER_OP(_npi_bitwise_or_scalar)
 .set_num_inputs(1)
 .set_num_outputs(1)
-.set_attr_parser([](NodeAttrs* attrs) {
-    attrs->parsed = dmlc::stod(attrs->dict["scalar"]);
-  })
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseIntType<1, 1>)
 .set_attr<nnvm::FInplaceOption>("FInplaceOption",
@@ -203,7 +193,7 @@ NNVM_REGISTER_OP(_npi_bitwise_or_scalar)
   })
 .set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes)
 .add_argument("data", "NDArray-or-Symbol", "source input")
-.add_argument("scalar", "int", "scalar input")
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__())
 .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::ComputeInt<cpu, mshadow_op::bitwise_or>);
 
 MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_copysign_scalar)
@@ -275,14 +265,14 @@ MXNET_OPERATOR_REGISTER_NP_BINARY_SCALAR(_npi_rarctan2_scalar)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_npi_rarctan2_scalar"});
 
 MXNET_OPERATOR_REGISTER_BINARY(_backward_npi_arctan2_scalar)
-.add_argument("scalar", "float", "scalar value")
-.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); })
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__())
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<FCompute>("FCompute<cpu>",
                     BinaryScalarOp::Backward<cpu, mshadow_op::arctan2_grad>);
 
 MXNET_OPERATOR_REGISTER_BINARY(_backward_npi_rarctan2_scalar)
-.add_argument("scalar", "float", "scalar value")
-.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); })
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__())
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<FCompute>("FCompute<cpu>",
                     BinaryScalarOp::Backward<cpu, mshadow_op::arctan2_rgrad>);
 
@@ -363,13 +353,13 @@ NNVM_REGISTER_OP(_backward_npi_ldexp)
                                                                   mshadow_op::ldexp_rgrad>);
 
 MXNET_OPERATOR_REGISTER_BINARY(_backward_npi_ldexp_scalar)
-.add_argument("scalar", "float", "scalar value")
-.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); })
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__())
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::Backward<cpu, mshadow_op::ldexp_grad>);
 
 MXNET_OPERATOR_REGISTER_BINARY(_backward_npi_rldexp_scalar)
-.add_argument("scalar", "float", "scalar value")
-.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); })
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__())
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::Backward<cpu, mshadow_op::rldexp_grad>);
 
 }  // namespace op
diff --git a/src/operator/numpy/np_matrix_op-inl.h b/src/operator/numpy/np_matrix_op-inl.h
index 593a698..a64fb4d 100644
--- a/src/operator/numpy/np_matrix_op-inl.h
+++ b/src/operator/numpy/np_matrix_op-inl.h
@@ -917,7 +917,7 @@ void NumpyConcatenateForward(const nnvm::NodeAttrs& attrs,
   ConcatParam cparam;
   cparam.num_args = param.num_args;
   cparam.dim = param.axis.has_value() ? param.axis.value() : 0;
-  MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+  MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[0].type_flag_, DType, {
     ConcatOp<xpu, DType> op;
     op.Init(cparam);
     op.Forward(ctx, data, req, outputs);
@@ -950,7 +950,7 @@ void NumpyConcatenateBackward(const nnvm::NodeAttrs& attrs,
   ConcatParam cparam;
   cparam.num_args = param.num_args;
   cparam.dim = param.axis.has_value() ? param.axis.value() : 0;
-  MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+  MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[0].type_flag_, DType, {
     ConcatOp<xpu, DType> op;
     op.Init(cparam);
     op.Backward(ctx, inputs[0], req, data);
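
Replacing MSHADOW_TYPE_SWITCH with MSHADOW_TYPE_SWITCH_WITH_BOOL extends concatenate's forward and backward passes to the bool dtype. The expected user-visible behavior, illustrated here with plain NumPy for comparison:

    import numpy as np

    # After this change, mxnet.numpy concatenate is expected to accept
    # boolean tensors the same way NumPy does:
    a = np.array([True, False])
    b = np.array([False])
    print(np.concatenate([a, b]))  # [ True False False]
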
diff --git a/src/operator/numpy/np_true_divide-inl.h b/src/operator/numpy/np_true_divide-inl.h
index be2ce51..30d505a 100644
--- a/src/operator/numpy/np_true_divide-inl.h
+++ b/src/operator/numpy/np_true_divide-inl.h
@@ -29,6 +29,7 @@
 #include <vector>
 #include "../../common/utils.h"
 #include "../tensor/elemwise_binary_broadcast_op.h"
+#include "../numpy/np_elemwise_broadcast_op.h"
 
 namespace mxnet {
 namespace op {
@@ -46,7 +47,8 @@ void TrueDivideScalarCompute(const nnvm::NodeAttrs &attrs,
   using namespace mxnet_op;
   using namespace mshadow::expr;
   Stream<xpu> *s = ctx.get_stream<xpu>();
-  const double alpha = nnvm::get<double>(attrs.parsed);
+  const NumpyBinaryScalarParam& param = nnvm::get<NumpyBinaryScalarParam>(attrs.parsed);
+  const double alpha = param.scalar;
   const TBlob& data = inputs[0];
   const TBlob& out = outputs[0];
   if (out.type_flag_ == data.type_flag_) {
@@ -57,10 +59,10 @@ void TrueDivideScalarCompute(const nnvm::NodeAttrs &attrs,
       });
     });
   } else {
-#ifndef _WIN32
-    CHECK_EQ(outputs[0].type_flag_, kFloat32) << "true_divide only supports float32 output "
-                                                 "when input's dtype is "
-                                              << type_string(inputs[0].type_flag_);
+    CHECK_EQ(outputs[0].type_flag_, mxnet::common::GetDefaultDtype())
+      << "true_divide only supports float32 and float64"
+         " output when input's dtype is "
+      << type_string(inputs[0].type_flag_);
     MXNET_INT_TYPE_SWITCH(inputs[0].type_flag_, DType, {
       MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
         Kernel<op_with_req<OP, Req>, xpu>::Launch(
@@ -68,13 +70,6 @@ void TrueDivideScalarCompute(const nnvm::NodeAttrs &attrs,
           static_cast<float>(alpha));
       });
     });
-#else
-    Tensor<xpu, 1, float> temp_tensor =
-      ctx.requested[0].get_space_typed<xpu, 1, float>(mshadow::Shape1(data.Size()), s);
-    TBlob temp_tblob(temp_tensor);
-    CastCompute<xpu>(attrs, ctx, {data}, {kWriteTo}, {temp_tblob});
-    TrueDivideScalarCompute<xpu, OP>(attrs, ctx, {temp_tblob}, req, outputs);
-#endif
   }
 }
 
@@ -116,8 +111,6 @@ void TrueDivideElemwiseCompute(const nnvm::NodeAttrs &attrs,
       });
     }
   } else {
-#ifndef _WIN32
-    // Non-windows case: no usage of temporary space
     // Case when types of the 2 input tensors are different
     if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) {
       // both lhs and rhs are float types, output type is the more precise one
@@ -152,44 +145,6 @@ void TrueDivideElemwiseCompute(const nnvm::NodeAttrs &attrs,
       // lhs is integer type, rhs is integer type, output type should be float
       LOG(FATAL) << "not implemented yet...";
     }
-#else
-    // Windows case: using temp space for casting the type
-    // Case when types of the 2 input tensors are different
-    if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) {
-      // both lhs and rhs are float types, output type is the more precise one
-      LOG(FATAL) << "not implemented yet...";
-    } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) {
-      // lhs is float type, rhs is integer type, the output type should be the same as lhs
-      CHECK_EQ(out.type_flag_,
-               common::is_float(lhs.type_flag_) ? lhs.type_flag_ : rhs.type_flag_)
-        << "This case out type should be same as the float type";
-      TBlob temp_tblob;
-      if (common::is_float(lhs.type_flag_)) {
-        // lhs is the float one
-        MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, {
-          Tensor<xpu, 1, LType> temp_tensor =
-            ctx.requested[0].get_space_typed<xpu, 1, LType>(mshadow::Shape1(rhs.Size()), s);
-          temp_tblob = TBlob(temp_tensor);
-        });
-        CastCompute<xpu>(attrs, ctx, {rhs}, {kWriteTo}, {temp_tblob});
-        TrueDivideElemwiseCompute<xpu>(
-          attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs);
-      } else {
-        // rhs is the float one
-        MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, {
-          Tensor<xpu, 1, RType> temp_tensor =
-            ctx.requested[0].get_space_typed<xpu, 1, RType>(mshadow::Shape1(lhs.Size()), s);
-          temp_tblob = TBlob(temp_tensor);
-        });
-        CastCompute<xpu>(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob});
-        TrueDivideElemwiseCompute<xpu>(
-          attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs);
-      }
-    } else {
-      // lhs is integer type, rhs is integer type, output type should be float
-      LOG(FATAL) << "not implemented yet...";
-    }
-#endif
   }
 }
 
@@ -213,7 +168,6 @@ void TrueDivideBroadcastCompute(const nnvm::NodeAttrs& attrs,
     const TBlob& lhs = inputs[0];
     const TBlob& rhs = inputs[1];
     const TBlob& out = outputs[0];
-#ifndef _WIN32
     BROADCAST_NDIM_SWITCH(ndim, NDim, {
       mshadow::Shape<NDim> oshape = new_oshape.get<NDim>();
       mshadow::Shape<NDim> lstride = calc_stride(new_lshape.get<NDim>());
@@ -273,70 +227,6 @@ void TrueDivideBroadcastCompute(const nnvm::NodeAttrs& attrs,
         }
       }
     });
-#else
-    if (lhs.type_flag_ == rhs.type_flag_) {
-      BROADCAST_NDIM_SWITCH(ndim, NDim, {
-        mshadow::Shape<NDim> oshape = new_oshape.get<NDim>();
-        mshadow::Shape<NDim> lstride = calc_stride(new_lshape.get<NDim>());
-        mshadow::Shape<NDim> rstride = calc_stride(new_rshape.get<NDim>());
-        // When the both inputs have the same data types
-        if (common::is_float(lhs.type_flag_)) {
-          // If both inputs are the same float types, output is the same float type
-          MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, DType, {
-            Kernel<binary_broadcast_kernel<NDim, mshadow_op::true_divide>, xpu>::
-              template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape,
-                                lhs.dptr<DType>(), rhs.dptr<DType>(), out.dptr<DType>());
-          });
-        } else {
-          CHECK_EQ(out.type_flag_, mshadow::kFloat32)
-            << "true_divide only supports float32 output when input's dtype is "
-            << type_string(lhs.type_flag_);
-          MXNET_INT_TYPE_SWITCH(lhs.type_flag_, DType, {
-            // If both inputs are the same integer types, output is float type
-            Kernel<binary_broadcast_kernel<NDim, mshadow_op::true_divide>, xpu>::
-              template LaunchEx(s, new_oshape.Size(), req[0], lstride, rstride, oshape,
-                                lhs.dptr<DType>(), rhs.dptr<DType>(), out.dptr<float>());
-          });
-        }
-      });
-    } else {
-      if (common::is_float(lhs.type_flag_) && common::is_float(rhs.type_flag_)) {
-        // lhs and rhs have different float types, the output is the more precise one
-        LOG(FATAL) << "not implemented yet...";
-      } else if (common::is_float(lhs.type_flag_) || common::is_float(rhs.type_flag_)) {
-        // one of lhs and rhs is float, the output is the same type as the float one
-        TBlob temp_tblob;
-        if (common::is_float(lhs.type_flag_)) {
-          // lhs is float type, output will be the same float type
-          CHECK_EQ(lhs.type_flag_, out.type_flag_)
-            << "lhs should have the same type as out, infer type broken?";
-          MSHADOW_REAL_TYPE_SWITCH(lhs.type_flag_, LType, {
-            Tensor<xpu, 1, LType> temp_tensor =
-              ctx.requested[0].get_space_typed<xpu, 1, LType>(mshadow::Shape1(rhs.Size()), s);
-            temp_tblob = TBlob(temp_tensor);
-          });
-          CastCompute<xpu>(attrs, ctx, {rhs}, {kWriteTo}, {temp_tblob});
-          TrueDivideBroadcastCompute<xpu>(
-            attrs, ctx, {lhs, temp_tblob.reshape(rhs.shape_)}, req, outputs);
-        } else {
-          // rhs is float type, output will be the same float type
-          CHECK_EQ(rhs.type_flag_, out.type_flag_)
-            << "rhs should have the same type as out, infer type broken?";
-          MSHADOW_REAL_TYPE_SWITCH(rhs.type_flag_, RType, {
-            Tensor<xpu, 1, RType> temp_tensor =
-              ctx.requested[0].get_space_typed<xpu, 1, RType>(mshadow::Shape1(lhs.Size()), s);
-            temp_tblob = TBlob(temp_tensor);
-          });
-          CastCompute<xpu>(attrs, ctx, {lhs}, {kWriteTo}, {temp_tblob});
-          TrueDivideBroadcastCompute<xpu>(
-            attrs, ctx, {temp_tblob.reshape(lhs.shape_), rhs}, req, outputs);
-        }
-      } else {
-        // lhs and rhs have different integer types, the output is float type
-        LOG(FATAL) << "not implemented yet...";
-      }
-    }
-#endif
   }
 }
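
With the Windows-only temp-space branches deleted, the mixed-type
true_divide forward path is the same on every platform: a float/int pair
yields the float operand's dtype, and an int/int pair yields the default
float dtype (float32 unless the numpy default dtype is set to float64).
A rough usage sketch, assuming a build of this branch:

    import mxnet as mx
    from mxnet import np, npx
    npx.set_np()

    x = np.arange(6, dtype=np.int32).reshape((2, 3))
    y = np.array([1.0, 2.0, 4.0], dtype=np.float32)
    print((x / y).dtype)                                # float32
    print((x / np.ones((2, 3), dtype=np.int32)).dtype)  # float32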
 
diff --git a/src/operator/numpy/np_true_divide.cc b/src/operator/numpy/np_true_divide.cc
index 6edfb4d..2f3bf7d 100644
--- a/src/operator/numpy/np_true_divide.cc
+++ b/src/operator/numpy/np_true_divide.cc
@@ -74,46 +74,46 @@ NNVM_REGISTER_OP(_npi_true_divide)
   [](const NodeAttrs& attrs){
     return std::vector<std::pair<int, int> >{{0, 0}, {1, 0}};
   })
-#ifdef _WIN32
+.set_attr<FCompute>("FCompute<cpu>", TrueDivideBroadcastCompute<cpu>)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_npi_broadcast_div"})
+.add_argument("lhs", "NDArray-or-Symbol", "Dividend array")
+.add_argument("rhs", "NDArray-or-Symbol", "Divisor array");
+
+
+NNVM_REGISTER_OP(_backward_npi_broadcast_div)
+.set_num_inputs(3)
+.set_num_outputs(2)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs){
+    return std::vector<std::pair<int, int> >{{0, 1}};
+  })
 .set_attr<FResourceRequest>("FResourceRequest",
   [](const NodeAttrs& attrs) {
     return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
   })
-#endif
-.set_attr<FCompute>("FCompute<cpu>", TrueDivideBroadcastCompute<cpu>)
-.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_broadcast_div"})
-.add_argument("lhs", "NDArray-or-Symbol", "Dividend array")
-.add_argument("rhs", "NDArray-or-Symbol", "Divisor array");
+.set_attr<FCompute>("FCompute<cpu>", NumpyBinaryBackwardUseIn<cpu, mshadow_op::div_grad,
+                                                              mshadow_op::div_rgrad>);
 
 NNVM_REGISTER_OP(_npi_true_divide_scalar)
 .set_num_inputs(1)
 .set_num_outputs(1)
-.set_attr_parser([](NodeAttrs* attrs) {
-    attrs->parsed = dmlc::stod(attrs->dict["scalar"]);
-  })
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
 .set_attr<nnvm::FInferType>("FInferType", TrueDivideType<1>)
 .set_attr<nnvm::FInplaceOption>("FInplaceOption",
   [](const NodeAttrs& attrs) {
     return std::vector<std::pair<int, int> >{{0, 0}};
   })
-#ifdef _WIN32
-.set_attr<FResourceRequest>("FResourceRequest",
-  [](const NodeAttrs& attrs) {
-    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-  })
-#endif
 .set_attr<FCompute>("FCompute<cpu>", TrueDivideScalarCompute<cpu, op::mshadow_op::true_divide>)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_div_scalar"})
 .add_argument("data", "NDArray-or-Symbol", "source input")
-.add_argument("scalar", "float", "scalar input");
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__());
 
 NNVM_REGISTER_OP(_npi_rtrue_divide_scalar)
 .set_num_inputs(1)
 .set_num_outputs(1)
-.set_attr_parser([](NodeAttrs* attrs) {
-  attrs->parsed = dmlc::stod(attrs->dict["scalar"]);
-  })
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
 .set_attr<nnvm::FInferType>("FInferType", TrueDivideType<1>)
 .set_attr<nnvm::FInplaceOption>("FInplaceOption",
@@ -129,7 +129,7 @@ NNVM_REGISTER_OP(_npi_rtrue_divide_scalar)
 .set_attr<FCompute>("FCompute<cpu>", TrueDivideScalarCompute<cpu, mshadow_op::rtrue_divide>)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{"_backward_rdiv_scalar"})
 .add_argument("data", "NDArray-or-Symbol", "source input")
-.add_argument("scalar", "float", "scalar input");
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__());
 
 }  // namespace op
 }  // namespace mxnet
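
_backward_npi_broadcast_div routes the divide backward through
NumpyBinaryBackwardUseIn with div_grad/div_rgrad, i.e. d(l/r)/dl = 1/r
and d(l/r)/dr = -l/r^2. A quick numeric check (sketch, assuming a build
of this branch):

    import mxnet as mx
    from mxnet import np, npx
    npx.set_np()

    l = np.array([2.0, 4.0])
    r = np.array([1.0, 2.0])
    l.attach_grad()
    r.attach_grad()
    with mx.autograd.record():
        out = l / r
    out.backward()
    print(l.grad)  # 1/r      -> [1.  0.5]
    print(r.grad)  # -l/r**2  -> [-2. -1.]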
diff --git a/src/operator/numpy/np_true_divide.cu b/src/operator/numpy/np_true_divide.cu
index 7211f4a..c8eccfe 100644
--- a/src/operator/numpy/np_true_divide.cu
+++ b/src/operator/numpy/np_true_divide.cu
@@ -31,6 +31,10 @@ namespace op {
 NNVM_REGISTER_OP(_npi_true_divide)
 .set_attr<FCompute>("FCompute<gpu>", TrueDivideBroadcastCompute<gpu>);
 
+NNVM_REGISTER_OP(_backward_npi_broadcast_div)
+.set_attr<FCompute>("FCompute<gpu>", NumpyBinaryBackwardUseIn<gpu, mshadow_op::div_grad,
+                                                              mshadow_op::div_rgrad>);
+
 NNVM_REGISTER_OP(_npi_true_divide_scalar)
 .set_attr<FCompute>("FCompute<gpu>", TrueDivideScalarCompute<gpu, mshadow_op::true_divide>);
 
diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc
index 0cc0dc9..de6efbf 100644
--- a/src/operator/operator_tune.cc
+++ b/src/operator/operator_tune.cc
@@ -421,6 +421,8 @@ IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rldexp);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::ldexp_grad);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::ldexp_rgrad);  // NOLINT()
 IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rldexp_grad);  // NOLINT()
+IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::posone);  // NOLINT()
+IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::negone);  // NOLINT()
 /*!
  * \brief Tuner objects, *not* automatically generated
  */
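
posone and negone are the constant +1/-1 kernels that the mixed-type
backward path uses as the add/subtract gradients; registering them here
lets the operator tuner measure the new workloads for OMP dispatch. Their
observable effect, as a sketch (assuming a build of this branch):

    import mxnet as mx
    from mxnet import np, npx
    npx.set_np()

    a = np.array([1.0, 2.0])
    b = np.array([3.0, 4.0])
    a.attach_grad()
    b.attach_grad()
    with mx.autograd.record():
        out = a - b
    out.backward()
    print(a.grad, b.grad)  # [1. 1.] [-1. -1.]  (posone / negone)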
diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h
index ffd0f12..774c87a 100644
--- a/src/operator/tensor/elemwise_binary_broadcast_op.h
+++ b/src/operator/tensor/elemwise_binary_broadcast_op.h
@@ -227,8 +227,8 @@ struct binary_broadcast_kernel {
     }
   }
 
-#ifndef _WIN32
   /*! \brief Map function for binary_broadcast_kernel */
+  /* used for mixed type binary ops */
   template<typename IType, typename DType,
            typename std::enable_if<!std::is_same<IType, DType>::value, int>::type = 0>
   MSHADOW_XINLINE static void Map(index_t base, index_t length, OpReqType req,
@@ -249,6 +249,7 @@ struct binary_broadcast_kernel {
   }
 
   /*! \brief Map function for binary_broadcast_kernel */
+  /* used for mixed type binary ops */
   template<typename IType, typename DType,
            typename std::enable_if<!std::is_same<IType, DType>::value &&
                                    !std::is_pointer<IType>::value, int>::type = 0>
@@ -268,7 +269,6 @@ struct binary_broadcast_kernel {
       KERNEL_ASSIGN(out[base + i], req, OP::Map(lhs, rhs[ridx]));
     }
   }
-#endif
 };
 
 template<int req, typename OP, bool col_vec>
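
Dropping the #ifndef _WIN32 guard makes the mixed-type Map overloads of
binary_broadcast_kernel available on all platforms, so broadcasting two
different dtypes no longer casts through temporary space on Windows.
Sketch (assuming a build of this branch):

    import mxnet as mx
    from mxnet import np, npx
    npx.set_np()

    x = np.ones((2, 1), dtype=np.int32)
    y = np.array([0.5, 1.5], dtype=np.float64)
    z = x + y
    print(z.dtype, z.shape)  # float64 (2, 2)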
diff --git a/src/operator/tensor/elemwise_binary_scalar_op.h b/src/operator/tensor/elemwise_binary_scalar_op.h
index 4eaaff0..7d65c89 100644
--- a/src/operator/tensor/elemwise_binary_scalar_op.h
+++ b/src/operator/tensor/elemwise_binary_scalar_op.h
@@ -29,6 +29,7 @@
 #include <dmlc/strtonum.h>
 #include <vector>
 #include <utility>
+#include <string>
 #include "../mshadow_op.h"
 #include "../elemwise_op_common.h"
 #include "elemwise_unary_op.h"
@@ -36,6 +37,45 @@
 namespace mxnet {
 namespace op {
 
+struct NumpyBinaryScalarParam : public dmlc::Parameter<NumpyBinaryScalarParam> {
+  double scalar;
+  bool is_int;
+  DMLC_DECLARE_PARAMETER(NumpyBinaryScalarParam) {
+    DMLC_DECLARE_FIELD(scalar)
+    .set_default(1)
+    .describe("Scalar input value");
+    DMLC_DECLARE_FIELD(is_int)
+    .set_default(true)
+    .describe("Indicate whether scalar input is int type");
+  }
+
+  void SetAttrDict(std::unordered_map<std::string, std::string>* dict) {
+    std::ostringstream scalar_s, is_int_s;
+    scalar_s << scalar;
+    is_int_s << is_int;
+    (*dict)["scalar"] = scalar_s.str();
+    (*dict)["is_int"] = is_int_s.str();
+  }
+};
+
+inline bool NumpyBinaryScalarType(const nnvm::NodeAttrs& attrs,
+                                  std::vector<int>* in_attrs,
+                                  std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  const NumpyBinaryScalarParam& param = nnvm::get<NumpyBinaryScalarParam>(attrs.parsed);
+  bool scalar_is_int = param.is_int;
+  if (common::is_int(in_attrs->at(0)) && !scalar_is_int) {
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat64);
+  } else if (in_attrs->at(0) == mshadow::kBool) {
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, scalar_is_int ? mshadow::kInt64 : mshadow::kFloat64);
+  } else {
+    TYPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+    TYPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+  }
+  return out_attrs->at(0) != -1;
+}
+
 class BinaryScalarOp : public UnaryOp {
   /*! \brief Tensor operation against a scalar with a dense result */
   template<typename OP, typename DType, typename IType>
@@ -45,7 +85,8 @@ class BinaryScalarOp : public UnaryOp {
                                       const NDArray &input,
                                       const OpReqType req,
                                       const NDArray &output) {
-    const double alpha = nnvm::get<double>(attrs.parsed);
+    const NumpyBinaryScalarParam& param = nnvm::get<NumpyBinaryScalarParam>(attrs.parsed);
+    const double alpha = param.scalar;
     CHECK_EQ(output.shape(), input.shape());
     const int64_t row_count = output.shape()[0];
     const int64_t items_per_row = output.shape().Size() / row_count;
@@ -137,7 +178,8 @@ class BinaryScalarOp : public UnaryOp {
                                       const NDArray &output) {
     CHECK_EQ(output.shape(), input.shape());
 
-    const double alpha = nnvm::get<double>(attrs.parsed);
+    const NumpyBinaryScalarParam& param = nnvm::get<NumpyBinaryScalarParam>(attrs.parsed);
+    const double alpha = param.scalar;
     const DType dense_fill_val = OP::Map(DType(0), DType(alpha));
     const TBlob  column_indexes = input.aux_data(csr::kIdx);
     const size_t item_count = column_indexes.Size();
@@ -236,11 +278,23 @@ class BinaryScalarOp : public UnaryOp {
     using namespace mshadow;
     using namespace mshadow::expr;
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    const double alpha = nnvm::get<double>(attrs.parsed);
+    TBlob temp_tblob;
+    const NumpyBinaryScalarParam& param = nnvm::get<NumpyBinaryScalarParam>(attrs.parsed);
+    bool scalar_is_int = param.is_int;
+    const double alpha = param.scalar;
     MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+      if ((common::is_int(inputs[0].type_flag_) && !scalar_is_int) ||
+          (inputs[0].type_flag_ == kBool)) {
+        Tensor<xpu, 1, DType> temp_tensor =
+            ctx.requested[0].get_space_typed<xpu, 1, DType>(Shape1(inputs[0].Size()), s);
+        temp_tblob = TBlob(temp_tensor);
+        CastCompute<xpu>(attrs, ctx, {inputs[0]}, {kWriteTo}, {temp_tblob});
+      } else {
+        temp_tblob = inputs[0];
+      }
       MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
         mxnet_op::Kernel<mxnet_op::op_with_req<OP, Req>, xpu>::Launch(
-          s, inputs[0].Size(), outputs[0].dptr<DType>(), inputs[0].dptr<DType>(), DType(alpha));
+          s, inputs[0].Size(), outputs[0].dptr<DType>(), temp_tblob.dptr<DType>(), DType(alpha));
       });
     });
   }
@@ -256,7 +310,8 @@ class BinaryScalarOp : public UnaryOp {
     using namespace mshadow;
     using namespace mshadow::expr;
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    const double alpha = nnvm::get<double>(attrs.parsed);
+    const NumpyBinaryScalarParam& param = nnvm::get<NumpyBinaryScalarParam>(attrs.parsed);
+    const double alpha = param.scalar;
     MXNET_INT_TYPE_SWITCH(outputs[0].type_flag_, DType, {
       MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
         mxnet_op::Kernel<mxnet_op::op_with_req<OP, Req>, xpu>::Launch(
@@ -276,12 +331,23 @@ class BinaryScalarOp : public UnaryOp {
     using namespace mshadow;
     using namespace mshadow::expr;
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    const double alpha = nnvm::get<double>(attrs.parsed);
-    MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[0].type_flag_, DType, {
-        MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
-          mxnet_op::Kernel<mxnet_op::op_with_req<OP, Req>, xpu>::Launch(
-              s, inputs[0].Size(), outputs[0].dptr<bool>(), inputs[0].dptr<DType>(), DType(alpha));
-        });
+    const NumpyBinaryScalarParam& param = nnvm::get<NumpyBinaryScalarParam>(attrs.parsed);
+    bool scalar_is_int = param.is_int;
+    const double alpha = param.scalar;
+    TBlob temp_tblob;
+    if (common::is_int(inputs[0].type_flag_) && !scalar_is_int) {
+      Tensor<xpu, 1, double> temp_tensor =
+          ctx.requested[0].get_space_typed<xpu, 1, double>(Shape1(inputs[0].Size()), s);
+      temp_tblob = TBlob(temp_tensor);
+      CastCompute<xpu>(attrs, ctx, {inputs[0]}, {kWriteTo}, {temp_tblob});
+    } else {
+      temp_tblob = inputs[0];
+    }
+    MSHADOW_TYPE_SWITCH_WITH_BOOL(temp_tblob.type_flag_, DType, {
+      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
+        mxnet_op::Kernel<mxnet_op::op_with_req<OP, Req>, xpu>::Launch(
+            s, inputs[0].Size(), outputs[0].dptr<bool>(), temp_tblob.dptr<DType>(), DType(alpha));
+      });
     });
   }
 
@@ -345,7 +411,8 @@ class BinaryScalarOp : public UnaryOp {
     using namespace mshadow;
     using namespace mshadow::expr;
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    const double alpha = nnvm::get<double>(attrs.parsed);
+    const NumpyBinaryScalarParam& param = nnvm::get<NumpyBinaryScalarParam>(attrs.parsed);
+    const double alpha = param.scalar;
     MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
       MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
         mxnet::op::mxnet_op::Kernel<mxnet::op::mxnet_op::op_with_req<
@@ -358,21 +425,23 @@ class BinaryScalarOp : public UnaryOp {
   }
 };
 
-#define MXNET_OPERATOR_REGISTER_BINARY_SCALAR(name)                 \
-  NNVM_REGISTER_OP(name)                                            \
-  .set_num_inputs(1)                                                \
-  .set_num_outputs(1)                                               \
-  .set_attr_parser([](NodeAttrs* attrs) {                           \
-      attrs->parsed = dmlc::stod(attrs->dict["scalar"]);            \
-    })                                                              \
-  .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)  \
-  .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)     \
-  .set_attr<nnvm::FInplaceOption>("FInplaceOption",                 \
-    [](const NodeAttrs& attrs){                                     \
-      return std::vector<std::pair<int, int> >{{0, 0}};             \
-    })                                                              \
-  .add_argument("data", "NDArray-or-Symbol", "source input")        \
-  .add_argument("scalar", "float", "scalar input")
+#define MXNET_OPERATOR_REGISTER_BINARY_SCALAR(name)                       \
+  NNVM_REGISTER_OP(name)                                                  \
+  .set_num_inputs(1)                                                      \
+  .set_num_outputs(1)                                                     \
+  .set_attr_parser(ParamParser<NumpyBinaryScalarParam>)                   \
+  .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)       \
+  .set_attr<nnvm::FInferType>("FInferType", NumpyBinaryScalarType)        \
+  .set_attr<nnvm::FInplaceOption>("FInplaceOption",                       \
+    [](const NodeAttrs& attrs){                                           \
+      return std::vector<std::pair<int, int> >{{0, 0}};                   \
+    })                                                                    \
+  .set_attr<FResourceRequest>("FResourceRequest",                         \
+    [](const NodeAttrs& attrs) {                                          \
+      return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};   \
+    })                                                                    \
+  .add_argument("data", "NDArray-or-Symbol", "source input")              \
+  .add_arguments(NumpyBinaryScalarParam::__FIELDS__())
 
 }  // namespace op
 }  // namespace mxnet
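
NumpyBinaryScalarParam replaces the bare double previously stored in
attrs.parsed, so the front end can record whether the Python scalar was an
int. NumpyBinaryScalarType then keeps integer tensors integral for int
scalars, promotes them to float64 for float scalars, and maps bool inputs
to int64 or float64. A sketch of the resulting promotion (assuming a build
of this branch):

    import mxnet as mx
    from mxnet import np, npx
    npx.set_np()

    x = np.array([1, 2, 3], dtype=np.int32)
    print((x + 2).dtype)    # int32:   int tensor + int scalar keeps the dtype
    print((x + 2.5).dtype)  # float64: float scalar promotes the int tensor
    b = np.array([True, False])
    print((b + 1).dtype)    # int64:   bool tensor + int scalar
    print((b + 1.5).dtype)  # float64: bool tensor + float scalar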
diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc
index 13014b3..dc11593 100644
--- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc
+++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc
@@ -28,22 +28,24 @@
 #include "./elemwise_binary_scalar_op.h"
 
 #define MXNET_OPERATOR_REGISTER_BINARY_WITH_SCALAR_SUPPORT_WITH_DENSE_RESULT(name)    \
-  NNVM_REGISTER_OP(name)                                            \
-  .set_num_inputs(1)                                                \
-  .set_num_outputs(1)                                               \
-  .set_attr_parser([](NodeAttrs* attrs) {                           \
-      attrs->parsed = dmlc::stod(attrs->dict["scalar"]);             \
-    })                                                              \
-  .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)  \
-  .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)     \
-  .set_attr<FInferStorageType>("FInferStorageType",                 \
-    BinaryScalarStorageTypeWithDenseResultStorageType)              \
-  .set_attr<nnvm::FInplaceOption>("FInplaceOption",                 \
-    [](const NodeAttrs& attrs){                                     \
-      return std::vector<std::pair<int, int> >{{0, 0}};             \
-    })                                                              \
-  .add_argument("data", "NDArray-or-Symbol", "source input")        \
-  .add_argument("scalar", "float", "scalar input")
+  NNVM_REGISTER_OP(name)                                                              \
+  .set_num_inputs(1)                                                                  \
+  .set_num_outputs(1)                                                                 \
+  .set_attr_parser(ParamParser<NumpyBinaryScalarParam>)                               \
+  .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)                   \
+  .set_attr<nnvm::FInferType>("FInferType", NumpyBinaryScalarType)                    \
+  .set_attr<FInferStorageType>("FInferStorageType",                                   \
+    BinaryScalarStorageTypeWithDenseResultStorageType)                                \
+  .set_attr<nnvm::FInplaceOption>("FInplaceOption",                                   \
+    [](const NodeAttrs& attrs){                                                       \
+      return std::vector<std::pair<int, int> >{{0, 0}};                               \
+    })                                                                                \
+  .set_attr<FResourceRequest>("FResourceRequest",                                     \
+    [](const NodeAttrs& attrs) {                                                      \
+      return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};               \
+    })                                                                                \
+  .add_argument("data", "NDArray-or-Symbol", "source input")                          \
+  .add_arguments(NumpyBinaryScalarParam::__FIELDS__())
 
 namespace mxnet {
 namespace op {
@@ -65,7 +67,8 @@ static bool BinaryScalarStorageTypeWithDenseResultStorageType(const NodeAttrs& a
   const NDArrayStorageType instype = static_cast<NDArrayStorageType>(in_attrs->at(0));
   const auto dispatch_ex = invalid_ctx ? DispatchMode::kFComputeFallback
                                        : DispatchMode::kFComputeEx;
-  const double alpha = nnvm::get<double>(attrs.parsed);
+  const NumpyBinaryScalarParam& param = nnvm::get<NumpyBinaryScalarParam>(attrs.parsed);
+  const double alpha = param.scalar;
   if (common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) {
     dispatched = storage_type_assign(&out_attrs[0],
       kDefaultStorage, dispatch_mode, DispatchMode::kFCompute);
@@ -189,8 +192,8 @@ MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_rdiv_scalar)
 .add_alias("_RDivScalar");
 
 MXNET_OPERATOR_REGISTER_BINARY(_backward_rdiv_scalar)
-.add_argument("scalar", "float", "scalar value")
-.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); })
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__())
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::Backward<
   cpu, mshadow_op::rdiv_grad>);
 
@@ -200,8 +203,8 @@ MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_mod_scalar)
 .add_alias("_ModScalar");
 
 MXNET_OPERATOR_REGISTER_BINARY(_backward_mod_scalar)
-.add_argument("scalar", "float", "scalar value")
-.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); })
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__())
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::Backward<
   cpu, mshadow_op::mod_grad>);
 
@@ -211,8 +214,8 @@ MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_rmod_scalar)
 .add_alias("_RModScalar");
 
 MXNET_OPERATOR_REGISTER_BINARY(_backward_rmod_scalar)
-.add_argument("scalar", "float", "scalar value")
-.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); })
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__())
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::Backward<
   cpu, mshadow_op::rmod_grad>);
 
diff --git a/src/operator/tensor/elemwise_binary_scalar_op_extended.cc b/src/operator/tensor/elemwise_binary_scalar_op_extended.cc
index 7dd8cf4..c08cdcd 100644
--- a/src/operator/tensor/elemwise_binary_scalar_op_extended.cc
+++ b/src/operator/tensor/elemwise_binary_scalar_op_extended.cc
@@ -36,8 +36,8 @@ MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_maximum_scalar)
 .add_alias("_npi_maximum_scalar");
 
 MXNET_OPERATOR_REGISTER_BINARY(_backward_maximum_scalar)
-.add_argument("scalar", "float", "scalar value")
-.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); })
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__())
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::Backward<cpu, mshadow_op::ge>);
 
 MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_minimum_scalar)
@@ -47,8 +47,8 @@ MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_minimum_scalar)
 .add_alias("_npi_minimum_scalar");
 
 MXNET_OPERATOR_REGISTER_BINARY(_backward_minimum_scalar)
-.add_argument("scalar", "float", "scalar value")
-.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); })
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__())
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::Backward<cpu, mshadow_op::le>);
 
 MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_power_scalar)
@@ -57,8 +57,8 @@ MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_power_scalar)
 .add_alias("_PowerScalar");
 
 MXNET_OPERATOR_REGISTER_BINARY(_backward_power_scalar)
-.add_argument("scalar", "float", "scalar value")
-.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); })
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__())
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::Backward<
   cpu, mshadow_op::power_grad>);
 
@@ -69,8 +69,8 @@ MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_rpower_scalar)
 .add_alias("_RPowerScalar");
 
 MXNET_OPERATOR_REGISTER_BINARY(_backward_rpower_scalar)
-.add_argument("scalar", "float", "scalar value")
-.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); })
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__())
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::Backward<
   cpu, mshadow_op::rpower_grad>);
 
@@ -82,8 +82,8 @@ MXNET_OPERATOR_REGISTER_BINARY_SCALAR(_hypot_scalar)
 .add_alias("_HypotScalar");
 
 MXNET_OPERATOR_REGISTER_BINARY(_backward_hypot_scalar)
-.add_argument("scalar", "float", "scalar value")
-.set_attr_parser([](NodeAttrs *attrs) { attrs->parsed = dmlc::stod(attrs->dict["scalar"]); })
+.add_arguments(NumpyBinaryScalarParam::__FIELDS__())
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<FCompute>("FCompute<cpu>", BinaryScalarOp::Backward<
   cpu, mshadow_op::hypot_grad_left>);
 
@@ -109,13 +109,7 @@ Example::
 )code" ADD_FILELINE)
 .set_num_inputs(1)
 .set_num_outputs(1)
-.set_attr_parser([](NodeAttrs* attrs) {
-    if (attrs->dict.find("scalar") != attrs->dict.end()) {
-      attrs->parsed = dmlc::stod(attrs->dict["scalar"]);
-    } else {
-      attrs->parsed = 1.0;
-    }
-  })
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<mxnet::FInferShape>("FInferShape", ElemwiseShape<1, 1>)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
 .set_attr<nnvm::FInplaceOption>("FInplaceOption",
@@ -128,13 +122,7 @@ Example::
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseIn{ "_backward_smooth_l1" });
 
 MXNET_OPERATOR_REGISTER_BINARY(_backward_smooth_l1)
-  .set_attr_parser([](NodeAttrs *attrs) {
-      if (attrs->dict.find("scalar") != attrs->dict.end()) {
-        attrs->parsed = dmlc::stod(attrs->dict["scalar"]);
-      } else {
-        attrs->parsed = 1.0;
-      }
-})
+.set_attr_parser(ParamParser<NumpyBinaryScalarParam>)
 .set_attr<FCompute>("FCompute<cpu>",
                     BinaryScalarOp::Backward<cpu, mshadow_op::smooth_l1_gradient>);
 
diff --git a/src/operator/tensor/elemwise_binary_scalar_op_logic.cc b/src/operator/tensor/elemwise_binary_scalar_op_logic.cc
index 17e7615..0594997 100644
--- a/src/operator/tensor/elemwise_binary_scalar_op_logic.cc
+++ b/src/operator/tensor/elemwise_binary_scalar_op_logic.cc
@@ -46,7 +46,8 @@ static bool BinaryScalarLogicStorageType(const nnvm::NodeAttrs& attrs,
   const auto in_stype = in_attrs->at(0);
   auto &out_stype = out_attrs->at(0);
   bool dispatched = false;
-  const double alpha = nnvm::get<double>(attrs.parsed);
+  const NumpyBinaryScalarParam& param = nnvm::get<NumpyBinaryScalarParam>(attrs.parsed);
+  const double alpha = param.scalar;
   bool is_sparse = OP::Map(static_cast<double>(0), alpha) == 0;
   if (!dispatched && in_stype == kDefaultStorage) {
     // dns -> dns
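
Storage-type inference for the scalar logic ops keeps reading the scalar
from the parsed parameter: the output can stay sparse only when
OP(0, scalar) == 0, otherwise the op falls back to a dense result. Sketch
with the legacy ndarray API (assuming a build of this branch):

    import mxnet as mx

    csr = mx.nd.array([[0, 1], [0, 0]]).tostype('csr')
    print((csr > 0).stype)   # 'csr':     OP(0, 0) == 0, sparsity preserved
    print((csr >= 0).stype)  # 'default': OP(0, 0) != 0, result densified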
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index 0f35dec..4811b34 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -2488,8 +2488,9 @@ def test_np_mixed_precision_binary_funcs():
             assert y.shape == np_out.shape
             assert_almost_equal(y.asnumpy(), np_out.astype(y.dtype), rtol=rtol, atol=atol,
                                 use_broadcast=False, equal_nan=True)
-
             if lgrad:
+                if (ltype in itypes) and (rtype in itypes):
+                    continue
                 y.backward()
                 if ltype not in itypes:
                     assert_almost_equal(mx_test_x1.grad.asnumpy(),
@@ -2513,10 +2514,13 @@ def test_np_mixed_precision_binary_funcs():
                             use_broadcast=False, equal_nan=True)
 
     funcs = {
-        'add': (-1.0, 1.0, None, None),
-        'subtract': (-1.0, 1.0, None, None),
+        'add': (-1.0, 1.0, lambda y, x1, x2: _np.ones(y.shape),
+                           lambda y, x1, x2: _np.ones(y.shape)),
+        'subtract': (-1.0, 1.0, lambda y, x1, x2: _np.ones(y.shape),
+                                lambda y, x1, x2: _np.ones(y.shape) * -1),
         'multiply': (-1.0, 1.0, lambda y, x1, x2: _np.broadcast_to(x2, y.shape),
                                 lambda y, x1, x2: _np.broadcast_to(x1, y.shape)),
+        'mod': (1.0, 5.0, None, None),
         'power': (1.0, 3.0, lambda y, x1, x2: _np.power(x1, x2 - 1.0) * x2,
                             lambda y, x1, x2: _np.power(x1, x2) * _np.log(x1)),
     }
@@ -2530,8 +2534,6 @@ def test_np_mixed_precision_binary_funcs():
                    ((2, 3), ()),
                    ((), (2, 3))]
 
-    itypes = [np.bool, np.int8, np.int32, np.int64]
-    ftypes = [np.float16, np.float32, np.float64]
     for func, func_data in funcs.items():
         low, high, lgrad, rgrad = func_data
         for lshape, rshape in shape_pairs:
@@ -2544,6 +2546,13 @@ def test_np_mixed_precision_binary_funcs():
                     continue
                 check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, type1, type2)
 
+            if func == 'subtract' or func == 'mod':
+                continue
+            for type1, type2 in itertools.product(itypes, itypes):
+                if type1 == type2:
+                    continue
+                check_mixed_precision_binary_func(func, low, high, lshape, rshape, lgrad, rgrad, type1, type2)
+
 
 @with_seed()
 @use_np
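
The mixed-precision test above now also sweeps pairs of distinct integer
dtypes for the forward pass (int/int backward is skipped, and subtract/mod
are excluded from the integer sweep). Expected promotion, as a sketch
(assuming a build of this branch):

    import mxnet as mx
    from mxnet import np, npx
    npx.set_np()

    x = np.ones((2, 3), dtype=np.int32)
    y = np.ones((2, 3), dtype=np.int64)
    print((x * y).dtype)  # int64: mixed integer inputs widen to int64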
@@ -3085,52 +3094,56 @@ def test_np_concat():
             shape_lst[axis] = random.randint(0, 3)
         return tuple(shape_lst)
 
-    for shape in [(0, 0), (2, 3), (2, 1, 3)]:
-        for hybridize in [True, False]:
-            for axis in [0, 1, None]:
-                for grad_req in ['write', 'add', 'null']:
-                    # test gluon
-                    test_concat = TestConcat(axis=axis)
-                    if hybridize:
-                        test_concat.hybridize()
+    shapes = [(0, 0), (2, 3), (2, 1, 3)]
+    hybridizes = [True, False]
+    axes = [0, 1, None]
+    grad_reqs = ['write', 'add', 'null']
+    dtypes = [np.float32, np.float64, np.bool]
+    combinations = itertools.product(shapes, hybridizes, axes, grad_reqs, dtypes)
 
-                    grad_req_c = grad_req
-                    grad_req_d = grad_req
-                    if grad_req == 'null':
-                        ide = random.randint(0, 2)
-                        grad_req_c = 'write' if ide == 0 else 'add'
-                        grad_req_c = 'write' if ide == 1 else 'add'
+    for shape, hybridize, axis, grad_req, dtype in combinations:
+        # test gluon
+        test_concat = TestConcat(axis=axis)
+        if hybridize:
+            test_concat.hybridize()
 
-                    a = mx.nd.random.uniform(-1.0, 1.0, shape=get_new_shape(shape, axis)).as_np_ndarray()
-                    a.attach_grad(grad_req)
-                    b = mx.nd.random.uniform(-1.0, 1.0, shape=get_new_shape(shape, axis)).as_np_ndarray()
-                    b.attach_grad(grad_req)
-                    c = mx.nd.random.uniform(-1.0, 1.0, shape=get_new_shape(shape, axis)).as_np_ndarray()
-                    c.attach_grad(grad_req_c)
-                    d = mx.nd.random.uniform(-1.0, 1.0, shape=get_new_shape(shape, axis)).as_np_ndarray()
-                    d.attach_grad(grad_req_d)
-                    expected_ret = _np.concatenate([a.asnumpy(), b.asnumpy(), c.asnumpy(), d.asnumpy()], axis=axis)
+        grad_req_c = grad_req
+        grad_req_d = grad_req
+        if grad_req == 'null':
+            ide = random.randint(0, 2)
+            grad_req_c = 'write' if ide == 0 else 'add'
+            grad_req_d = 'write' if ide == 1 else 'add'
 
-                    with mx.autograd.record():
-                        y = test_concat(a, b, c, d)
+        a = np.random.uniform(-1.0, 1.0, size=get_new_shape(shape, axis)).astype(dtype)
+        a.attach_grad(grad_req)
+        b = np.random.uniform(-1.0, 1.0, size=get_new_shape(shape, axis)).astype(dtype)
+        b.attach_grad(grad_req)
+        c = np.random.uniform(-1.0, 1.0, size=get_new_shape(shape, axis)).astype(dtype)
+        c.attach_grad(grad_req_c)
+        d = np.random.uniform(-1.0, 1.0, size=get_new_shape(shape, axis)).astype(dtype)
+        d.attach_grad(grad_req_d)
+        expected_ret = _np.concatenate([a.asnumpy(), b.asnumpy(), c.asnumpy(), d.asnumpy()], axis=axis)
 
-                    assert y.shape == expected_ret.shape
-                    assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5)
+        with mx.autograd.record():
+            y = test_concat(a, b, c, d)
 
-                    y.backward()
-                    if grad_req != 'null':
-                        assert_almost_equal(a.grad.asnumpy(), _np.ones(a.shape), rtol=1e-3, atol=1e-5)
-                    if grad_req != 'null':
-                        assert_almost_equal(b.grad.asnumpy(), _np.ones(b.shape), rtol=1e-3, atol=1e-5)
-                    if grad_req_c != 'null':
-                        assert_almost_equal(c.grad.asnumpy(), _np.ones(c.shape), rtol=1e-3, atol=1e-5)
-                    if grad_req_d != 'null':
-                        assert_almost_equal(d.grad.asnumpy(), _np.ones(d.shape), rtol=1e-3, atol=1e-5)
+        assert y.shape == expected_ret.shape
+        assert_almost_equal(y.asnumpy(), expected_ret, rtol=1e-3, atol=1e-5)
 
-                    # test imperative
-                    mx_out = np.concatenate([a, b, c, d], axis=axis)
-                    np_out = _np.concatenate([a.asnumpy(), b.asnumpy(), c.asnumpy(), d.asnumpy()], axis=axis)
-                    assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
+        y.backward()
+        if grad_req != 'null':
+            assert_almost_equal(a.grad.asnumpy(), _np.ones(a.shape), rtol=1e-3, atol=1e-5)
+        if grad_req != 'null':
+            assert_almost_equal(b.grad.asnumpy(), _np.ones(b.shape), rtol=1e-3, atol=1e-5)
+        if grad_req_c != 'null':
+            assert_almost_equal(c.grad.asnumpy(), _np.ones(c.shape), rtol=1e-3, atol=1e-5)
+        if grad_req_d != 'null':
+            assert_almost_equal(d.grad.asnumpy(), _np.ones(d.shape), rtol=1e-3, atol=1e-5)
+
+        # test imperative
+        mx_out = np.concatenate([a, b, c, d], axis=axis)
+        np_out = _np.concatenate([a.asnumpy(), b.asnumpy(), c.asnumpy(), d.asnumpy()], axis=axis)
+        assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5)
 
 
 @with_seed()
diff --git a/tests/python/unittest/test_symbol.py b/tests/python/unittest/test_symbol.py
index 8e4fe11..f9d6cea 100644
--- a/tests/python/unittest/test_symbol.py
+++ b/tests/python/unittest/test_symbol.py
@@ -484,9 +484,7 @@ def test_eliminate_common_expr():
     arr2 = mx.random.uniform(shape=shape)
     arr3 = mx.random.uniform(shape=shape)
 
-    check_cse_on_symbol((a+5) + (a+5), expected_savings=1, check_data=True, a=arr1, b=arr2)
     check_cse_on_symbol((a+1) + (a+2), expected_savings=0, check_data=True, a=arr1, b=arr2)
-    check_cse_on_symbol((1+a) + (a+1), expected_savings=1, check_data=True, a=arr1, b=arr2)
     check_cse_on_symbol((a+b) + (a+b), expected_savings=1, check_data=True, a=arr1, b=arr2)
     check_cse_on_symbol(((a+b)+c) +((a+b)+c), expected_savings=2, check_data=True,
                                                                   a=arr1, b=arr2, c=arr3)