Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/06/29 15:12:53 UTC

[GitHub] eric-haibin-lin closed pull request #11330: [MXNET-537] add_n(dense, csr, dense) = dense and add_n([dense, csr, rsp]*, dense, [dense, csr, rsp]*) = dense on CPU & GPU

URL: https://github.com/apache/incubator-mxnet/pull/11330
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff of a foreign (fork) pull request once
it is merged, the diff is reproduced below for the sake of provenance:

diff --git a/src/common/utils.h b/src/common/utils.h
index be78bf42547..d7ed4ddf04c 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -319,6 +319,36 @@ inline bool ContainsOnlyStorage(const std::vector<NDArray>& ndarrays,
   return false;
 }
 
+/*! \brief returns true if storage type of any array in `ndarrays`
+ *         is the same as the target `stype`. false is returned for empty inputs.
+ */
+inline bool ContainsStorageType(const std::vector<NDArray>& ndarrays,
+                                const NDArrayStorageType stype) {
+  if (!ndarrays.empty()) {
+    for (const auto& nd : ndarrays) {
+      if (nd.storage_type() == stype) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+/*! \brief returns true if any storage type `ndstype` in `ndstypes`
+ *         is the same as the target `stype`. false is returned for empty inputs.
+ */
+inline bool ContainsStorageType(const std::vector<int>& ndstypes,
+                                const NDArrayStorageType stype) {
+  if (!ndstypes.empty()) {
+    for (const auto& ndstype : ndstypes) {
+      if (ndstype == stype) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 /*! \brief get string representation of dispatch_mode */
 inline std::string dispatch_mode_string(const DispatchMode x) {
   switch (x) {
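
Both ContainsStorageType overloads above implement the same linear scan. For reference, a minimal standalone sketch of the predicate, using plain ints as stand-ins for NDArrayStorageType (the enum values below are assumptions for illustration only):

    #include <vector>

    // Stand-ins for MXNet's storage-type enum values (assumed here).
    enum StorageType { kDefaultStorage = 0, kRowSparseStorage = 1, kCSRStorage = 2 };

    // Mirrors ContainsStorageType: true iff any element equals stype;
    // an empty input yields false, matching the documented contract.
    inline bool ContainsStorageType(const std::vector<int>& stypes, int stype) {
      for (int s : stypes) {
        if (s == stype) return true;
      }
      return false;
    }

    int main() {
      std::vector<int> stypes = {kRowSparseStorage, kDefaultStorage, kCSRStorage};
      return ContainsStorageType(stypes, kDefaultStorage) ? 0 : 1;  // returns 0
    }
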
diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc
index 552555adf84..022302aca40 100644
--- a/src/ndarray/ndarray_function.cc
+++ b/src/ndarray/ndarray_function.cc
@@ -26,6 +26,9 @@
 #include "./ndarray_function.h"
 #include "./ndarray_function-inl.h"
 #include "../common/utils.h"
+#include "../operator/mxnet_op.h"
+#include "../operator/tensor/elemwise_binary_op-inl.h"
+#include "../operator/tensor/elemwise_sum.h"
 
 namespace mxnet {
 namespace ndarray {
@@ -165,6 +168,102 @@ void ElementwiseSumRsp(mshadow::Stream<cpu>* s,
   });
 }
 
+void ElementwiseSumDnsCsrDnsImpl(mshadow::Stream<cpu>* s,
+                                 const Resource& rsc,
+                                 const std::vector<NDArray>& nds,
+                                 NDArray* out) {
+  using namespace mxnet::op;
+  using namespace mxnet::op::mxnet_op;
+  const TBlob& out_data = out->data();
+  MSHADOW_TYPE_SWITCH(out->dtype(), DType, {  // data type
+    Kernel<Sum, cpu>::Launch(
+      s, out_data.Size(), out_data.dptr<DType>(), kWriteTo, nds[0].data().dptr<DType>(),
+      nds[2].data().dptr<DType>());
+    const TBlob& csr_data = nds[1].data();
+    const TBlob& csr_indices = nds[1].aux_data(csr::kIdx);
+    const TBlob& csr_indptr = nds[1].aux_data(csr::kIndPtr);
+    const nnvm::dim_t num_rows = nds[1].shape()[0];
+    const nnvm::dim_t num_cols = nds[1].shape()[1];
+    MSHADOW_IDX_TYPE_SWITCH(csr_indices.type_flag_, IType, {  // indices type
+      MSHADOW_IDX_TYPE_SWITCH(csr_indptr.type_flag_, CType, {  // indptr type
+        if (nds[1].storage_initialized()) {
+          Kernel<ElemwiseDnsCsrDnsKernel<kWriteTo, mshadow_op::plus>, cpu>::Launch(
+            s, num_rows, out_data.dptr<DType>(), out_data.dptr<DType>(),
+            csr_data.dptr<DType>(), csr_indices.dptr<IType>(),
+            csr_indptr.dptr<CType>(), num_rows, num_cols);
+        }
+      });
+    });
+  });
+}
+
+void ElementwiseSumContainsDnsImpl(mshadow::Stream<cpu>* s,
+                                   const Resource& rsc,
+                                   const std::vector<NDArray>& nds,
+                                   NDArray* out) {
+  using namespace mxnet::op;
+  using namespace mxnet::op::mxnet_op;
+  const TBlob& out_data = out->data();
+  MSHADOW_TYPE_SWITCH(out->dtype(), DType, {  // data type
+    Kernel<set_zero, cpu>::Launch(s, out_data.Size(), out_data.dptr<DType>());
+    for (size_t i = 0; i < nds.size(); ++i) {
+      const NDArray& nd = nds[i];
+      const nnvm::dim_t num_rows = nd.shape()[0];
+      const nnvm::dim_t num_cols = nd.shape()[1];
+      const TBlob& nd_data = nd.data();
+
+      if (i == 0) {
+        if (nd.storage_type() == kDefaultStorage) {
+          Kernel<op_with_req<mshadow_op::identity, kWriteTo>, cpu>::Launch(
+            s, out_data.Size(), out_data.dptr<DType>(), nd_data.dptr<DType>());
+          continue;
+        } else {
+          Kernel<set_zero, cpu>::Launch(s, out_data.Size(), out_data.dptr<DType>());
+        }
+      }
+
+      switch (nd.storage_type()) {
+        case kDefaultStorage: {
+          Kernel<op_with_req<mshadow_op::plus, kWriteTo>, cpu>::Launch(
+            s, out_data.Size(), out_data.dptr<DType>(), out_data.dptr<DType>(),
+            nd_data.dptr<DType>());
+          break;
+        }
+        case kCSRStorage: {
+          const TBlob& nd_indices = nd.aux_data(csr::kIdx);
+          const TBlob& nd_indptr = nd.aux_data(csr::kIndPtr);
+          MSHADOW_IDX_TYPE_SWITCH(nd_indices.type_flag_, IType, {  // indices type
+            MSHADOW_IDX_TYPE_SWITCH(nd_indptr.type_flag_, CType, {  // indptr type
+              if (nd.storage_initialized()) {
+                Kernel<ElemwiseDnsCsrDnsKernel<kWriteTo, mshadow_op::plus>, cpu>::Launch(
+                  s, num_rows, out_data.dptr<DType>(), out_data.dptr<DType>(),
+                  nd_data.dptr<DType>(), nd_indices.dptr<IType>(),
+                  nd_indptr.dptr<CType>(), num_rows, num_cols);
+              }
+            });
+          });
+          break;
+        }
+        case kRowSparseStorage: {
+          const TBlob& nd_indices = nd.aux_data(rowsparse::kIdx);
+          MSHADOW_IDX_TYPE_SWITCH(nd_indices.type_flag_, IType, {  // indices type
+            if (nd.storage_initialized()) {
+              const nnvm::dim_t nz_rows = nd_indices.Size();
+              Kernel<ElemwiseDnsRspDnsKernel<kWriteTo, mshadow_op::plus>, cpu>::Launch(
+                s, nz_rows * num_cols, out_data.dptr<DType>(),
+                out_data.dptr<DType>(), nd_data.dptr<DType>(), nd_indices.dptr<IType>(),
+                num_rows, nz_rows, num_cols);
+            }
+          });
+          break;
+        }
+        default:
+          LOG(FATAL) << "unknown storage type " << nd.storage_type() << " encountered";
+      }
+    }
+  });
+}
+
 /*!
  * \brief Parallel cpu impl of elemwise sum for sparse tensors.
  * Currently only support row sparse sum.
@@ -175,8 +274,15 @@ void ElementwiseSum<cpu>(mshadow::Stream<cpu>* s,
                          const std::vector<NDArray>& nds,
                          NDArray* out) {
   if (nds.empty()) return;
-  if (nds[0].storage_type() == kRowSparseStorage) {
+  if (common::ContainsOnlyStorage(nds, kRowSparseStorage)) {
     ElementwiseSumRsp(s, rsc, nds, out);
+  } else if (nds.size() == 3U && nds[0].storage_type() == kDefaultStorage &&
+             nds[1].storage_type() == kCSRStorage && nds[2].storage_type() == kDefaultStorage &&
+             out->storage_type() == kDefaultStorage) {
+    ElementwiseSumDnsCsrDnsImpl(s, rsc, nds, out);
+  } else if (nds.size() > 4U && common::ContainsStorageType(nds, kDefaultStorage) &&
+             out->storage_type() == kDefaultStorage) {
+    ElementwiseSumContainsDnsImpl(s, rsc, nds, out);
   } else {
     LOG(FATAL) << "ElementwiseSum<cpu> has not been implemented for storage_type = << "
                << nds[0].storage_type();
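
The CPU path above leans on ElemwiseDnsCsrDnsKernel, launched with one logical thread per row (Kernel<...>::Launch(s, num_rows, ...)): each row walks its indptr range and scatters the CSR nonzeros into the dense output. A minimal standalone sketch of that row-wise accumulation, with plain vectors as hypothetical stand-ins for the TBlob/DType machinery:

    #include <cstdio>
    #include <vector>

    // out (row-major, rows x cols) += csr; the outer loop index i plays
    // the role of the kernel tid (one logical thread per row).
    void DnsPlusCsr(std::vector<float>* out,
                    const std::vector<float>& csr_data,
                    const std::vector<int>& csr_indices,
                    const std::vector<int>& csr_indptr,
                    int rows, int cols) {
      for (int i = 0; i < rows; ++i) {
        for (int j = csr_indptr[i]; j < csr_indptr[i + 1]; ++j) {
          // mshadow_op::plus with kWriteTo onto the already-initialized output
          (*out)[i * cols + csr_indices[j]] += csr_data[j];
        }
      }
    }

    int main() {
      // 2x3 dense output already holding dns0 + dns2; add a CSR matrix
      // with nonzeros (0,1)=5 and (1,2)=7.
      std::vector<float> out = {1, 1, 1, 2, 2, 2};
      DnsPlusCsr(&out, {5.f, 7.f}, {1, 2}, {0, 1, 2}, 2, 3);
      for (float v : out) std::printf("%g ", v);  // prints: 1 6 1 2 2 9
      std::printf("\n");
    }
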
diff --git a/src/ndarray/ndarray_function.cu b/src/ndarray/ndarray_function.cu
index 06b5ad46a08..0ab40a79419 100644
--- a/src/ndarray/ndarray_function.cu
+++ b/src/ndarray/ndarray_function.cu
@@ -25,7 +25,9 @@
 // this will be invoked by nvcc and compile GPU version
 #include <cub/cub.cuh>
 #include <dmlc/logging.h>
-#include "../operator/mxnet_op.h"
+#include "../operator/tensor/elemwise_binary_op-inl.h"
+#include "../operator/tensor/elemwise_sum.h"
+#include "../operator/tensor/indexing_op.h"
 #include "../operator/tensor/init_op.h"
 #include "../operator/tensor/util/tensor_util-inl.h"
 #include "../operator/tensor/util/tensor_util-inl.cuh"
@@ -185,6 +187,101 @@ void ElementwiseSumRspImpl(mshadow::Stream<gpu>* s,
   });
 }
 
+void ElementwiseSumDnsCsrDnsImpl(mshadow::Stream<gpu>* s,
+                                 const Resource& rsc,
+                                 const std::vector<NDArray>& nds,
+                                 NDArray* out) {
+  using namespace mxnet::op;
+  using namespace mxnet::op::mxnet_op;
+  const TBlob& out_data = out->data();
+  MSHADOW_TYPE_SWITCH(out->dtype(), DType, {  // data type
+    Kernel<Sum, gpu>::Launch(
+      s, out_data.Size(), out_data.dptr<DType>(), kWriteTo, nds[0].data().dptr<DType>(),
+      nds[2].data().dptr<DType>());
+    const TBlob& csr_data = nds[1].data();
+    const TBlob& csr_indices = nds[1].aux_data(csr::kIdx);
+    const TBlob& csr_indptr = nds[1].aux_data(csr::kIndPtr);
+    const nnvm::dim_t num_rows = nds[1].shape()[0];
+    const nnvm::dim_t num_cols = nds[1].shape()[1];
+    MSHADOW_IDX_TYPE_SWITCH(csr_indices.type_flag_, IType, {  // indices type
+      MSHADOW_IDX_TYPE_SWITCH(csr_indptr.type_flag_, CType, {  // indptr type
+        if (nds[1].storage_initialized()) {
+          Kernel<ElemwiseDnsCsrDnsWarpKernel<kWriteTo, mshadow_op::plus>, gpu>::Launch(
+            s, kWarpSize * num_rows, out_data.dptr<DType>(), out_data.dptr<DType>(),
+            csr_data.dptr<DType>(), csr_indices.dptr<IType>(),
+            csr_indptr.dptr<CType>(), num_rows, num_cols);
+        }
+      });
+    });
+  });
+}
+
+void ElementwiseSumContainsDnsImpl(mshadow::Stream<gpu>* s,
+                                 const Resource& rsc,
+                                 const std::vector<NDArray>& nds,
+                                 NDArray* out) {
+  using namespace mxnet::op;
+  using namespace mxnet::op::mxnet_op;
+  const TBlob& out_data = out->data();
+  MSHADOW_TYPE_SWITCH(out->dtype(), DType, {  // data type
+    for (size_t i = 0; i < nds.size(); ++i) {
+      const NDArray& nd = nds[i];
+      const nnvm::dim_t num_rows = nd.shape()[0];
+      const nnvm::dim_t num_cols = nd.shape()[1];
+      const TBlob& nd_data = nd.data();
+
+      if (i == 0) {
+        if (nd.storage_type() == kDefaultStorage) {
+          Kernel<op_with_req<mshadow_op::identity, kWriteTo>, gpu>::Launch(
+            s, out_data.Size(), out_data.dptr<DType>(), nd_data.dptr<DType>());
+          continue;
+        } else {
+          Kernel<set_zero, gpu>::Launch(s, out_data.Size(), out_data.dptr<DType>());
+        }
+      }
+
+      switch (nd.storage_type()) {
+        case kDefaultStorage: {
+          Kernel<op_with_req<mshadow_op::plus, kWriteTo>, gpu>::Launch(
+            s, out_data.Size(), out_data.dptr<DType>(), out_data.dptr<DType>(),
+            nd_data.dptr<DType>());
+          break;
+        }
+        case kCSRStorage: {
+          const TBlob& nd_indices = nd.aux_data(csr::kIdx);
+          const TBlob& nd_indptr = nd.aux_data(csr::kIndPtr);
+          MSHADOW_IDX_TYPE_SWITCH(nd_indices.type_flag_, IType, {  // indices type
+            MSHADOW_IDX_TYPE_SWITCH(nd_indptr.type_flag_, CType, {  // indptr type
+              if (nd.storage_initialized()) {
+                Kernel<ElemwiseDnsCsrDnsWarpKernel<kWriteTo, mshadow_op::plus>, gpu>::Launch(
+                  s, kWarpSize * num_rows, out_data.dptr<DType>(), out_data.dptr<DType>(),
+                  nd_data.dptr<DType>(), nd_indices.dptr<IType>(),
+                  nd_indptr.dptr<CType>(), num_rows, num_cols);
+              }
+            });
+          });
+          break;
+        }
+        case kRowSparseStorage: {
+          const TBlob& nd_indices = nd.aux_data(rowsparse::kIdx);
+          MSHADOW_IDX_TYPE_SWITCH(nd_indices.type_flag_, IType, {  // indices type
+            if (nd.storage_initialized()) {
+              const nnvm::dim_t nz_rows = nd_indices.Size();
+              Kernel<ElemwiseDnsRspDnsKernel<kWriteTo, mshadow_op::plus>, gpu>::Launch(
+                s, nz_rows * num_cols, out_data.dptr<DType>(),
+                out_data.dptr<DType>(), nd_data.dptr<DType>(), nd_indices.dptr<IType>(),
+                num_rows, nz_rows, num_cols);
+            }
+          });
+          break;
+        }
+        default:
+          LOG(FATAL) << "unknown storage type " << nd.storage_type() << " encountered";
+      }
+    }
+  });
+}
+
 /*!
  * \brief Parallel gpu impl of elemwise sum for sparse tensors.
  * Currently only support row sparse sum.
@@ -195,8 +292,15 @@ void ElementwiseSum<gpu>(mshadow::Stream<gpu>* s,
                          const std::vector<NDArray>& nds,
                          NDArray* out) {
   if (nds.empty()) return;
-  if (nds[0].storage_type() == kRowSparseStorage) {
+  if (common::ContainsOnlyStorage(nds, kRowSparseStorage)) {
     ElementwiseSumRspImpl(s, rsc, nds, out);
+  } else if (nds.size() == 3U && nds[0].storage_type() == kDefaultStorage &&
+             nds[1].storage_type() == kCSRStorage && nds[2].storage_type() == kDefaultStorage &&
+             out->storage_type() == kDefaultStorage) {
+    ElementwiseSumDnsCsrDnsImpl(s, rsc, nds, out);
+  } else if (nds.size() > 4U && common::ContainsStorageType(nds, kDefaultStorage) &&
+             out->storage_type() == kDefaultStorage) {
+    ElementwiseSumContainsDnsImpl(s, rsc, nds, out);
   } else {
     LOG(FATAL) << "ElementwiseSum<gpu> has not been implemented for storage_type = << "
         << nds[0].storage_type();
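
On the GPU, ElemwiseDnsCsrDnsWarpKernel assigns one 32-thread warp per CSR row: row_id = tid >> 5 picks the row, and the variable named warp_id is really the lane index within the warp (tid & 31), so the 32 lanes stride through that row's nonzeros together. A CPU simulation of the index math (a sketch only; 32 matches the WARP_SIZE/kWarpSize constants in the diff), checking that every nonzero is visited exactly once:

    #include <cassert>
    #include <vector>

    constexpr int kWarp = 32;  // matches WARP_SIZE / kWarpSize in the diff

    // Simulate launching kWarp * num_rows threads; each nonzero must be
    // covered exactly once across the 32 lanes of its row's warp.
    void WarpKernelSim(const std::vector<int>& indptr, int num_rows,
                       std::vector<int>* visits) {
      for (int tid = 0; tid < kWarp * num_rows; ++tid) {
        const int row  = tid >> 5;           // row_id in the kernel
        const int lane = tid & (kWarp - 1);  // named warp_id in the kernel
        for (int j = indptr[row] + lane; j < indptr[row + 1]; j += kWarp)
          ++(*visits)[j];
      }
    }

    int main() {
      std::vector<int> indptr = {0, 40, 45, 45, 100};  // 4 rows, 100 nonzeros
      std::vector<int> visits(100, 0);
      WarpKernelSim(indptr, 4, &visits);
      for (int v : visits) assert(v == 1);  // each nonzero visited exactly once
    }
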
diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h
index e22e23cea5d..16aa0c388cd 100644
--- a/src/operator/elemwise_op_common.h
+++ b/src/operator/elemwise_op_common.h
@@ -73,6 +73,16 @@ inline bool ElemwiseStorageAttr(const nnvm::NodeAttrs& attrs,
     dispatched = storage_type_assign(out_attrs, kCSRStorage,
                                      dispatch_mode, dispatch_ex);
   }
+  if (!dispatched && in_attrs->size() == 3U && in_attrs->at(0) == kDefaultStorage &&
+      in_attrs->at(1) == kCSRStorage && in_attrs->at(2) == kDefaultStorage) {
+    dispatched = storage_type_assign(out_attrs, kDefaultStorage,
+                                     dispatch_mode, dispatch_ex);
+  }
+  if (!dispatched && in_attrs->size() > 4U && ContainsStorageType(*in_attrs, kDefaultStorage)) {
+    // *, dense, * -> dense
+    dispatched = storage_type_assign(out_attrs, kDefaultStorage,
+                                     dispatch_mode, dispatch_ex);
+  }
   if (!dispatched) {
     dispatch_fallback(out_attrs, dispatch_mode);
   }
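
Taken together with the surrounding branches, the storage inference for add_n now resolves roughly as sketched below. This is a hedged pseudo-summary only: the enum values are stand-ins, the real ElemwiseStorageAttr also threads dispatch modes through storage_type_assign, and the all-row-sparse/all-CSR cases are assumed from the (partially unshown) context around this hunk:

    #include <algorithm>
    #include <vector>

    enum StorageType { kDefaultStorage = 0, kRowSparseStorage = 1, kCSRStorage = 2 };

    // Approximation of the add_n output-storage rule after this PR.
    StorageType InferAddNStorage(const std::vector<StorageType>& in) {
      auto all = [&](StorageType s) {
        return !in.empty() && std::all_of(in.begin(), in.end(),
                                          [s](StorageType t) { return t == s; });
      };
      auto any = [&](StorageType s) {
        return std::any_of(in.begin(), in.end(),
                           [s](StorageType t) { return t == s; });
      };
      if (all(kRowSparseStorage)) return kRowSparseStorage;  // rsp, rsp, .. -> rsp
      if (all(kCSRStorage)) return kCSRStorage;              // csr, csr, .. -> csr
      if (in.size() == 3 && in[0] == kDefaultStorage &&
          in[1] == kCSRStorage && in[2] == kDefaultStorage)
        return kDefaultStorage;                              // dense, csr, dense -> dense
      if (in.size() > 4 && any(kDefaultStorage))
        return kDefaultStorage;                              // *, dense, * -> dense
      return kDefaultStorage;                                // fallback: densify
    }
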
diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h
index e5b77e11283..6779826ca69 100644
--- a/src/operator/tensor/elemwise_binary_broadcast_op.h
+++ b/src/operator/tensor/elemwise_binary_broadcast_op.h
@@ -511,8 +511,7 @@ void BinaryBroadcastComputeDenseEx(const nnvm::NodeAttrs& attrs,
     // If the input is a matrix with the same shape, should be elemwise
     if (!ndim) {
       mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
-      ElemwiseBinaryOp::DnsCsrDnsOp<xpu, OP>(
-        s, attrs, ctx, dns, csr, req[0], outputs[0], !reverse);
+      ElemwiseBinaryOp::DnsCsrDnsOp<OP>(s, attrs, ctx, dns, csr, req[0], outputs[0], !reverse);
     } else {
       // broadcast(CSR, Dense(1D)) = CSR
       BinaryBroadcastCsrDnsDnsImpl<xpu, OP>(ctx, csr, dns, req[0], out,
diff --git a/src/operator/tensor/elemwise_binary_op-inl.h b/src/operator/tensor/elemwise_binary_op-inl.h
index 878dfb218f9..1b1a1d20077 100644
--- a/src/operator/tensor/elemwise_binary_op-inl.h
+++ b/src/operator/tensor/elemwise_binary_op-inl.h
@@ -27,6 +27,9 @@
 #include <vector>
 #include <algorithm>
 #include "./elemwise_binary_op.h"
+#include "../mxnet_op.h"
+#define WARP_SIZE 32
+#define WARP_SIZE_BITS 5
 
 namespace mxnet {
 namespace op {
@@ -426,9 +429,39 @@ struct ElemwiseDnsCsrDnsKernel {
   }
 };
 
+/*!
+ * \brief Kernel for performing elemwise op between dense and csr matrix
+ * \param tid          global thread id
+ * \param req          type of request
+ * \param out          output array
+ * \param dns_data     data array of dense input
+ * \param csr_data     data array of csr input
+ * \param csr_indices  indices array of csr input
+ * \param csr_indptr   indptr array of csr input
+ * \param num_rows     number of rows of both inputs
+ * \param num_cols     number of columns of both inputs
+ */
+template<int req, typename OP>
+struct ElemwiseDnsCsrDnsWarpKernel {
+  template<typename DType, typename IType, typename CType>
+  MSHADOW_XINLINE static void Map(int tid, DType* out, DType* dns_data,
+                                  const DType* csr_data, const IType* csr_indices,
+                                  const CType* csr_indptr, const nnvm::dim_t num_rows,
+                                  const nnvm::dim_t num_cols) {
+    if (tid < WARP_SIZE * num_rows) {
+      const int row_id = tid >> WARP_SIZE_BITS;
+      const int warp_id = tid & (WARP_SIZE - 1);
+      for (int j = csr_indptr[row_id] + warp_id; j < csr_indptr[row_id+1]; j += WARP_SIZE) {
+        KERNEL_ASSIGN(out[row_id * num_cols + csr_indices[j]], req,
+                      OP::Map(dns_data[row_id * num_cols + csr_indices[j]], csr_data[j]));
+      }
+    }
+  }
+};
+
 /*! \brief DNS -op- CSR binary operator for non-canonical NDArray */
-template<typename xpu, typename OP>
-void ElemwiseBinaryOp::DnsCsrDnsOp(mshadow::Stream<xpu> *s,
+template<typename OP>
+void ElemwiseBinaryOp::DnsCsrDnsOp(mshadow::Stream<cpu> *s,
                                    const nnvm::NodeAttrs &attrs,
                                    const OpContext &ctx,
                                    const NDArray &dns,
@@ -455,20 +488,20 @@ void ElemwiseBinaryOp::DnsCsrDnsOp(mshadow::Stream<xpu> *s,
       MSHADOW_IDX_TYPE_SWITCH(csr_indptr.type_flag_, CType, {
         MXNET_ASSIGN_REQ_SWITCH(req, Req, {
           if (reverse && std::is_same<OP, mshadow_op::minus>::value) {
-            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::negation, Req>, xpu>::Launch(
+            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::negation, Req>, cpu>::Launch(
               s, output.data().Size(), output.data().dptr<DType>(), dns.data().dptr<DType>());
             if (!csr.storage_initialized()) { return; }
-            mxnet_op::Kernel<ElemwiseDnsCsrDnsKernel<Req, mshadow_op::plus>, xpu>::Launch(
+            mxnet_op::Kernel<ElemwiseDnsCsrDnsKernel<Req, mshadow_op::plus>, cpu>::Launch(
               s, num_csr_rows, output.data().dptr<DType>(),
               output.data().dptr<DType>(), csr_data.dptr<DType>(), csr_indices.dptr<IType>(),
               csr_indptr.dptr<CType>(), num_csr_rows, num_csr_cols);
           } else {
             if (req == kWriteTo) {
-              mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::identity, Req>, xpu>::Launch(
+              mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::identity, Req>, cpu>::Launch(
                 s, output.data().Size(), output.data().dptr<DType>(), dns.data().dptr<DType>());
             }
             if (!csr.storage_initialized()) { return; }
-            mxnet_op::Kernel<ElemwiseDnsCsrDnsKernel<Req, OP>, xpu>::Launch(
+            mxnet_op::Kernel<ElemwiseDnsCsrDnsKernel<Req, OP>, cpu>::Launch(
               s, num_csr_rows, output.data().dptr<DType>(),
               output.data().dptr<DType>(), csr_data.dptr<DType>(), csr_indices.dptr<IType>(),
               csr_indptr.dptr<CType>(), num_csr_rows, num_csr_cols);
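
The reverse-minus branch above relies on the identity csr - dns = (-dns) + csr: the output is first filled with the negated dense operand (mshadow_op::negation), then the CSR nonzeros are added in with mshadow_op::plus. A small standalone check of the identity (a sketch only, not the MXNet kernels themselves):

    #include <cassert>
    #include <vector>

    int main() {
      // Dense 1x4 operand and a CSR row with nonzeros at columns 1 and 3.
      std::vector<float> dns = {1, 2, 3, 4};
      std::vector<float> csr_data = {10, 20};
      std::vector<int>   csr_idx  = {1, 3};

      // Step 1: out = -dns (the negation kernel in the diff).
      std::vector<float> out(4);
      for (int c = 0; c < 4; ++c) out[c] = -dns[c];
      // Step 2: out += csr (the plus kernel), giving out == csr - dns.
      for (size_t j = 0; j < csr_data.size(); ++j) out[csr_idx[j]] += csr_data[j];

      // csr in dense form is {0, 10, 0, 20}, so csr - dns = {-1, 8, -3, 16}.
      assert(out[0] == -1 && out[1] == 8 && out[2] == -3 && out[3] == 16);
    }
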
diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h
index fbd79bb3dd6..cb1db0ec632 100644
--- a/src/operator/tensor/elemwise_binary_op.h
+++ b/src/operator/tensor/elemwise_binary_op.h
@@ -276,8 +276,19 @@ class ElemwiseBinaryOp : public OpBase {
                        const NDArray &output);
 
   /*! \brief DNS -op- CSR binary operator for non-canonical NDArray */
-  template<typename xpu, typename OP>
-  static void DnsCsrDnsOp(mshadow::Stream<xpu> *s,
+  template<typename OP>
+  static void DnsCsrDnsOp(mshadow::Stream<cpu> *s,
+                          const nnvm::NodeAttrs &attrs,
+                          const OpContext &ctx,
+                          const NDArray &lhs,
+                          const NDArray &rhs,
+                          OpReqType req,
+                          const NDArray &output,
+                          const bool reverse);
+
+  /*! \brief DNS -op- CSR binary operator for non-canonical NDArray */
+  template<typename OP>
+  static void DnsCsrDnsOp(mshadow::Stream<gpu> *s,
                           const nnvm::NodeAttrs &attrs,
                           const OpContext &ctx,
                           const NDArray &lhs,
@@ -537,7 +548,7 @@ class ElemwiseBinaryOp : public OpBase {
       const NDArray& csr = (lhs_stype == kCSRStorage)? inputs[0] : inputs[1];
       const bool reverse = (lhs_stype == kCSRStorage);
 
-      DnsCsrDnsOp<xpu, OP>(s, attrs, ctx, dns, csr, req[0], outputs[0], reverse);
+      DnsCsrDnsOp<OP>(s, attrs, ctx, dns, csr, req[0], outputs[0], reverse);
     } else if (((lhs_stype == kRowSparseStorage && rhs_stype == kDefaultStorage) ||
                 (lhs_stype == kDefaultStorage && rhs_stype == kRowSparseStorage)) &&
                 out_stype == kDefaultStorage) {
diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu
index ea8c1fb9c65..981e7aba9f9 100644
--- a/src/operator/tensor/elemwise_binary_op_basic.cu
+++ b/src/operator/tensor/elemwise_binary_op_basic.cu
@@ -25,6 +25,7 @@
 #include <cub/cub.cuh>
 #include "./elemwise_binary_op.h"
 #include "./elemwise_binary_op-inl.h"
+#include "./indexing_op.h"
 
 namespace mxnet {
 namespace op {
@@ -162,6 +163,59 @@ void ElemwiseBinaryOp::RspRspOp(mshadow::Stream<gpu> *s,
   });
 }
 
+/*! \brief DNS -op- CSR binary operator for non-canonical NDArray */
+template<typename OP>
+void ElemwiseBinaryOp::DnsCsrDnsOp(mshadow::Stream<gpu> *s,
+                                   const nnvm::NodeAttrs &attrs,
+                                   const OpContext &ctx,
+                                   const NDArray &dns,
+                                   const NDArray &csr,
+                                   const OpReqType req,
+                                   const NDArray &output,
+                                   const bool reverse) {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  CHECK_EQ(dns.storage_type(), kDefaultStorage);
+  CHECK_EQ(csr.storage_type(), kCSRStorage);
+  CHECK(req != kAddTo);
+  CHECK(req != kNullOp);
+  const bool supported_op = std::is_same<OP, mshadow_op::minus>::value ||
+                            std::is_same<OP, mshadow_op::plus>::value;
+  CHECK(supported_op == true);
+  const nnvm::dim_t num_csr_rows = csr.shape()[0];
+  const nnvm::dim_t num_csr_cols = csr.shape()[1];
+  TBlob csr_data = csr.data();
+  TBlob csr_indices = csr.aux_data(csr::kIdx);
+  TBlob csr_indptr = csr.aux_data(csr::kIndPtr);
+  MSHADOW_SGL_DBL_TYPE_SWITCH(csr_data.type_flag_, DType, {
+    MSHADOW_IDX_TYPE_SWITCH(csr_indices.type_flag_, IType, {
+      MSHADOW_IDX_TYPE_SWITCH(csr_indptr.type_flag_, CType, {
+        MXNET_ASSIGN_REQ_SWITCH(req, Req, {
+          if (reverse && std::is_same<OP, mshadow_op::minus>::value) {
+            mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::negation, Req>, gpu>::Launch(
+              s, output.data().Size(), output.data().dptr<DType>(), dns.data().dptr<DType>());
+            if (!csr.storage_initialized()) { return; }
+            mxnet_op::Kernel<ElemwiseDnsCsrDnsWarpKernel<Req, mshadow_op::plus>, gpu>::Launch(
+              s, kWarpSize * num_csr_rows, output.data().dptr<DType>(),
+              output.data().dptr<DType>(), csr_data.dptr<DType>(), csr_indices.dptr<IType>(),
+              csr_indptr.dptr<CType>(), num_csr_rows, num_csr_cols);
+          } else {
+            if (req == kWriteTo) {
+              mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::identity, Req>, gpu>::Launch(
+                s, output.data().Size(), output.data().dptr<DType>(), dns.data().dptr<DType>());
+            }
+            if (!csr.storage_initialized()) { return; }
+            mxnet_op::Kernel<ElemwiseDnsCsrDnsWarpKernel<Req, OP>, gpu>::Launch(
+              s, kWarpSize * num_csr_rows, output.data().dptr<DType>(),
+              output.data().dptr<DType>(), csr_data.dptr<DType>(), csr_indices.dptr<IType>(),
+              csr_indptr.dptr<CType>(), num_csr_rows, num_csr_cols);
+          }
+        });
+      });
+    });
+  });
+}
+
 NNVM_REGISTER_OP(elemwise_add)
 .set_attr<FCompute>("FCompute<gpu>", ElemwiseBinaryOp::ComputeWithHalf2<gpu, op::mshadow_op::plus>)
 .set_attr<FComputeEx>("FComputeEx<gpu>", ElemwiseBinaryOp::ComputeEx<gpu, op::mshadow_op::plus>);
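
The guards at the top of the GPU DnsCsrDnsOp restrict it to plus/minus with write semantics; the std::is_same test is resolved at compile time, so unsupported operators are rejected before any kernel launch. A tiny sketch of the pattern, with illustrative stand-in types only:

    #include <type_traits>

    struct plus {};   // stand-ins for mshadow_op::plus / mshadow_op::minus
    struct minus {};
    struct times {};

    template <typename OP>
    constexpr bool SupportedOp() {
      return std::is_same<OP, plus>::value || std::is_same<OP, minus>::value;
    }

    static_assert(SupportedOp<plus>(),   "plus is supported");
    static_assert(SupportedOp<minus>(),  "minus is supported");
    static_assert(!SupportedOp<times>(), "times falls back elsewhere");

    int main() {}
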
diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc
index 8efeb85aa7b..9630988165c 100644
--- a/src/operator/tensor/elemwise_sum.cc
+++ b/src/operator/tensor/elemwise_sum.cc
@@ -114,7 +114,11 @@ void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(outputs.size(), 1U);
   CHECK_EQ(req.size(), 1U);
   if (req[0] == kNullOp) return;
-  if (inputs[0].storage_type() == kRowSparseStorage) {
+  if (common::ContainsOnlyStorage(inputs, kRowSparseStorage) ||
+      (inputs.size() == 3U && inputs[0].storage_type() == kDefaultStorage &&
+       inputs[1].storage_type() == kCSRStorage && inputs[2].storage_type() == kDefaultStorage) ||
+      (inputs.size() > 4U && common::ContainsStorageType(inputs, kDefaultStorage) &&
+       outputs[0].storage_type() == kDefaultStorage)) {
     mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
     Resource rsc = ResourceManager::Get()->Request(ctx.run_ctx.get_ctx(),
         ResourceRequest(ResourceRequest::kTempSpace));
@@ -145,7 +149,9 @@ MXNET_ADD_SPARSE_OP_ALIAS(ElementWiseSum)
 The storage type of ``add_n`` output depends on storage types of inputs
 
 - add_n(row_sparse, row_sparse, ..) = row_sparse
-- otherwise, ``add_n`` generates output with default storage
+- add_n(default, csr, default) = default
+- add_n(more than 4 inputs, at least one of which has default storage) = default
+- otherwise, ``add_n`` falls back to converting all inputs to default storage and generates an output with default storage
 
 )doc" ADD_FILELINE)
 .set_attr_parser(ParamParser<ElementWiseSumParam>)
diff --git a/src/operator/tensor/elemwise_sum.cu b/src/operator/tensor/elemwise_sum.cu
index 820c8d1e928..e2b41e385da 100644
--- a/src/operator/tensor/elemwise_sum.cu
+++ b/src/operator/tensor/elemwise_sum.cu
@@ -38,7 +38,11 @@ void ElementWiseSumComputeExGPU(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(req.size(), 1U);
   if (req[0] == kNullOp) return;
   CHECK_EQ(req[0], kWriteTo) << "ElementWiseSumComputeExGPU only supports req = kWriteTo";
-  if (inputs[0].storage_type() == kRowSparseStorage) {
+  if (common::ContainsOnlyStorage(inputs, kRowSparseStorage) ||
+      (inputs.size() == 3U && inputs[0].storage_type() == kDefaultStorage &&
+       inputs[1].storage_type() == kCSRStorage && inputs[2].storage_type() == kDefaultStorage) ||
+      (inputs.size() > 4U && common::ContainsStorageType(inputs, kDefaultStorage) &&
+       outputs[0].storage_type() == kDefaultStorage)) {
     mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
     NDArray out_nd = outputs[0];
     mxnet::ndarray::ElementwiseSum<gpu>(s, ctx.requested[0], inputs, &out_nd);
diff --git a/src/operator/tensor/indexing_op-inl.cuh b/src/operator/tensor/indexing_op-inl.cuh
index 34cc2630254..67dc2bbc334 100644
--- a/src/operator/tensor/indexing_op-inl.cuh
+++ b/src/operator/tensor/indexing_op-inl.cuh
@@ -27,6 +27,7 @@
 #define MXNET_OPERATOR_TENSOR_INDEXING_OP_CUH_
 #include <cub/device/device_run_length_encode.cuh>
 #include <cub/device/device_scan.cuh>
+#include "../mxnet_op.h"
 
 #if CUDA_VERSION >= 9000
 #define FULLMASK 0xFFFFFFFF
diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py
index 70af2bc8ce5..be85e10aa2f 100644
--- a/tests/python/unittest/test_sparse_operator.py
+++ b/tests/python/unittest/test_sparse_operator.py
@@ -1731,14 +1731,14 @@ def check_operator_with_temp_resource(shape, stype):
 
 @with_seed()
 def test_sparse_elementwise_sum():
-    def check_sparse_elementwise_sum_with_shape(stype, shape, n):
+    def check_sparse_elementwise_sum_with_shape(stypes, shape, n):
         # forward
         inputs = [mx.symbol.Variable('arg%d' % i) for i in range(n)]
         out = mx.symbol.sparse.add_n(*inputs, name='esum')
         arr = []
-        arr_grad = [mx.nd.empty(shape, stype=stype) for _ in range(n)]
+        arr_grad = [mx.nd.empty(shape, stype=stype) for stype in stypes]
         densities = [0, 0.01, 0.5, 1.0]
-        for i in range(n):
+        for stype in stypes:
             arr.append(rand_ndarray(shape, stype, densities[np.random.randint(0, len(densities))]))
 
         exec1 = out.bind(default_context(),
@@ -1747,18 +1747,30 @@ def check_sparse_elementwise_sum_with_shape(stype, shape, n):
         exec1.forward(is_train=True)
         out1 = exec1.outputs[0].asnumpy()
         out = sum(a.asnumpy() for a in arr)
-        assert_almost_equal(out, out1)
+        assert_almost_equal(out, out1, atol=1e-5)
 
         out_grad = mx.nd.empty(shape)
         out_grad[:] = np.random.uniform(-10, 10, shape)
         # backward
         exec1.backward([out_grad])
         for a in arr_grad:
-            assert_almost_equal(a.asnumpy(), out_grad.asnumpy())
+            assert_almost_equal(a.asnumpy(), out_grad.asnumpy(), atol=1e-5)
 
+    all_stypes = ['default', 'csr', 'row_sparse']
     for dim in range(2, 4):
         shape = tuple(np.random.randint(5, 10, size=dim))
-        check_sparse_elementwise_sum_with_shape('row_sparse', shape, np.random.randint(1, 9))
+        rsp_test_cnt = np.random.randint(1, 9)
+        check_sparse_elementwise_sum_with_shape(['row_sparse' for i in range(rsp_test_cnt)], shape, rsp_test_cnt)
+        if dim == 2:
+            check_sparse_elementwise_sum_with_shape(['default', 'csr', 'default'], shape, 3)
+            test_len = np.random.randint(5, 10)
+            # at least one default type
+            stypes = ['default']
+            for i in range(test_len):
+                pick_side = np.random.randint(2)
+                pick_type = np.random.randint(3)
+                stypes = ([all_stypes[pick_type]] if pick_side == 0 else []) + stypes + ([all_stypes[pick_type]] if pick_side == 1 else [])
+            check_sparse_elementwise_sum_with_shape(stypes, shape, test_len+1)
 
 
 @with_seed()


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services