You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by zh...@apache.org on 2021/08/09 03:04:10 UTC
[incubator-mxnet] branch master updated: [Master] Auto-formatter to
keep the same coding style (#20472)
This is an automated email from the ASF dual-hosted git repository.
zhasheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 718a860 [Master] Auto-formatter to keep the same coding style (#20472)
718a860 is described below
commit 718a860f3aa8f24acca2aec867a3b31bc60a6e79
Author: mozga <ma...@intel.com>
AuthorDate: Mon Aug 9 05:02:21 2021 +0200
[Master] Auto-formatter to keep the same coding style (#20472)
* Clang-format: ONEDNN files are adjusted by the formatter.
* Clang-format: ndarray file & duplication are removed
* A line containing only a semicolon was removed
* Adding missing headers to mkldnn_bn_relu_property file
* Removed oneDNN from PR
---
include/mxnet/ndarray.h | 594 +++----
src/ndarray/ndarray.cc | 1708 +++++++++++---------
src/operator/nn/batch_norm-inl.h | 295 ++--
src/operator/nn/batch_norm.cc | 515 +++---
src/operator/nn/mkldnn/mkldnn_act-inl.h | 57 +-
src/operator/nn/mkldnn/mkldnn_act.cc | 186 +--
src/operator/nn/mkldnn/mkldnn_base-inl.h | 401 ++---
src/operator/nn/mkldnn/mkldnn_base.cc | 388 ++---
src/operator/nn/mkldnn/mkldnn_batch_dot-inl.h | 23 +-
src/operator/nn/mkldnn/mkldnn_batch_dot.cc | 78 +-
src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h | 312 ++--
src/operator/nn/mkldnn/mkldnn_concat-inl.h | 23 +-
src/operator/nn/mkldnn/mkldnn_concat.cc | 74 +-
src/operator/nn/mkldnn/mkldnn_convolution-inl.h | 130 +-
src/operator/nn/mkldnn/mkldnn_convolution.cc | 367 +++--
src/operator/nn/mkldnn/mkldnn_copy.cc | 23 +-
src/operator/nn/mkldnn/mkldnn_deconvolution.cc | 434 ++---
.../nn/mkldnn/mkldnn_fully_connected-inl.h | 117 +-
src/operator/nn/mkldnn/mkldnn_fully_connected.cc | 253 ++-
src/operator/nn/mkldnn/mkldnn_log_softmax.cc | 154 +-
src/operator/nn/mkldnn/mkldnn_lrn-inl.h | 207 ++-
src/operator/nn/mkldnn/mkldnn_ops-inl.h | 181 ++-
src/operator/nn/mkldnn/mkldnn_pooling-inl.h | 128 +-
src/operator/nn/mkldnn/mkldnn_pooling.cc | 243 +--
src/operator/nn/mkldnn/mkldnn_reshape-inl.h | 19 +-
src/operator/nn/mkldnn/mkldnn_reshape.cc | 69 +-
src/operator/nn/mkldnn/mkldnn_rnn-inl.h | 338 ++--
src/operator/nn/mkldnn/mkldnn_rnn.cc | 920 ++++++-----
src/operator/nn/mkldnn/mkldnn_slice-inl.h | 26 +-
src/operator/nn/mkldnn/mkldnn_slice.cc | 50 +-
src/operator/nn/mkldnn/mkldnn_softmax.cc | 130 +-
src/operator/nn/mkldnn/mkldnn_sum.cc | 66 +-
src/operator/nn/mkldnn/mkldnn_transpose.cc | 54 +-
.../quantization/mkldnn/mkldnn_dequantize-inl.h | 57 +-
.../quantization/mkldnn/mkldnn_quantize-inl.h | 35 +-
.../quantization/mkldnn/mkldnn_quantize_v2-inl.h | 76 +-
.../quantization/mkldnn/mkldnn_quantized_act.cc | 9 +-
.../mkldnn/mkldnn_quantized_batch_norm.cc | 97 +-
.../quantization/mkldnn/mkldnn_quantized_concat.cc | 38 +-
.../quantization/mkldnn/mkldnn_quantized_conv.cc | 52 +-
.../mkldnn/mkldnn_quantized_elemwise_add.cc | 153 +-
.../mkldnn/mkldnn_quantized_flatten.cc | 24 +-
.../mkldnn/mkldnn_quantized_fully_connected.cc | 62 +-
.../quantization/mkldnn/mkldnn_quantized_ops-inl.h | 11 +-
.../mkldnn/mkldnn_quantized_pooling.cc | 20 +-
.../quantization/mkldnn/mkldnn_requantize-inl.h | 62 +-
.../subgraph/mkldnn/mkldnn_bn_relu_property.h | 46 +-
src/operator/subgraph/mkldnn/mkldnn_common.h | 61 +-
src/operator/subgraph/mkldnn/mkldnn_conv-inl.h | 5 +-
src/operator/subgraph/mkldnn/mkldnn_conv.cc | 532 +++---
.../subgraph/mkldnn/mkldnn_conv_property.h | 107 +-
.../mkldnn_elemwisemul_post_quantize_property.h | 98 +-
src/operator/subgraph/mkldnn/mkldnn_fc-inl.h | 26 +-
src/operator/subgraph/mkldnn/mkldnn_fc.cc | 482 +++---
.../mkldnn/mkldnn_fc_post_quantize_property.h | 96 +-
src/operator/subgraph/mkldnn/mkldnn_fc_property.h | 88 +-
.../mkldnn_post_quantize_align_scale_property.h | 96 +-
.../mkldnn/mkldnn_post_quantize_property.h | 49 +-
.../subgraph/mkldnn/mkldnn_subgraph_property.cc | 25 +-
.../subgraph/mkldnn/mkldnn_transformer-inl.h | 31 +-
src/operator/subgraph/mkldnn/mkldnn_transformer.cc | 604 +++----
.../mkldnn_transformer_post_quantize_property.h | 76 +-
.../mkldnn/mkldnn_transformer_qk_property.h | 63 +-
.../mkldnn/mkldnn_transformer_valatt_property.h | 87 +-
tests/cpp/include/test_mkldnn.h | 236 +--
tests/cpp/operator/mkldnn_operator_test.cc | 827 +++++-----
tests/cpp/operator/mkldnn_test.cc | 148 +-
67 files changed, 6931 insertions(+), 6111 deletions(-)
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index f01c67f..f41f2d3 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -26,23 +26,23 @@
#define MXNET_NDARRAY_H_
#include <dmlc/base.h>
-#include <dmlc/logging.h>
#include <dmlc/io.h>
-#include <dmlc/type_traits.h>
+#include <dmlc/logging.h>
#include <dmlc/registry.h>
+#include <dmlc/type_traits.h>
#include <nnvm/node.h>
-#include <vector>
-#include <map>
-#include <string>
+
#include <algorithm>
+#include <map>
#include <memory>
-#include <algorithm>
+#include <string>
+#include <vector>
#if MXNET_USE_ONEDNN == 1
#include <mkldnn.hpp>
#endif
#include "./base.h"
-#include "./storage.h"
#include "./engine.h"
+#include "./storage.h"
// check c++11
#if DMLC_USE_CXX11 == 0
#error "cxx11 was required for ndarray module"
@@ -51,11 +51,11 @@
namespace mxnet {
// enum for storage types
namespace csr {
-enum CSRAuxType {kIndPtr, kIdx};
+enum CSRAuxType { kIndPtr, kIdx };
}
namespace rowsparse {
-enum RowSparseAuxType {kIdx};
+enum RowSparseAuxType { kIdx };
}
enum NDArrayStorageType {
@@ -82,9 +82,7 @@ class MKLDNNMemory;
class NDArray {
public:
/*! \brief default constructor */
- NDArray()
- : autograd_entry_(nullptr) {
- }
+ NDArray() : autograd_entry_(nullptr) {}
/*!
* \brief constructs a new dynamic NDArray
* \param shape the shape of array
@@ -92,20 +90,25 @@ class NDArray {
* \param delay_alloc whether delay the allocation
* \param dtype data type of this ndarray
*/
- NDArray(const mxnet::TShape &shape, Context ctx,
- bool delay_alloc = false, int dtype = mshadow::default_type_flag)
+ NDArray(const mxnet::TShape& shape,
+ Context ctx,
+ bool delay_alloc = false,
+ int dtype = mshadow::default_type_flag)
: ptr_(std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype)),
shape_(shape),
dtype_(dtype),
storage_type_(kDefaultStorage),
- autograd_entry_(nullptr) {
- }
+ autograd_entry_(nullptr) {}
/*! \brief constructor for NDArray with storage type
*/
- NDArray(const NDArrayStorageType stype, const mxnet::TShape &shape, Context ctx,
- bool delay_alloc = true, int dtype = mshadow::default_type_flag,
- const std::vector<int> &aux_types = {}, const mxnet::ShapeVector &aux_shapes = {},
- const mxnet::TShape &storage_shape = mxnet::TShape(mshadow::Shape1(0))) {
+ NDArray(const NDArrayStorageType stype,
+ const mxnet::TShape& shape,
+ Context ctx,
+ bool delay_alloc = true,
+ int dtype = mshadow::default_type_flag,
+ const std::vector<int>& aux_types = {},
+ const mxnet::ShapeVector& aux_shapes = {},
+ const mxnet::TShape& storage_shape = mxnet::TShape(mshadow::Shape1(0))) {
ReInit(stype, shape, ctx, dtype, delay_alloc, &aux_types, &aux_shapes, &storage_shape);
}
/*!
@@ -119,8 +122,7 @@ class NDArray {
shape_(),
dtype_(dtype),
storage_type_(kDefaultStorage),
- autograd_entry_(nullptr) {
- }
+ autograd_entry_(nullptr) {}
/*!
* \brief constructing a static NDArray that shares data with TBlob
* Use with caution: allocate ONLY ONE NDArray for each TBlob,
@@ -128,13 +130,12 @@ class NDArray {
* \param data the memory content of static data
* \param dev_id the device id this tensor sits at
*/
- NDArray(const TBlob &data, int dev_id)
+ NDArray(const TBlob& data, int dev_id)
: ptr_(std::make_shared<Chunk>(data, dev_id)),
shape_(data.shape_),
dtype_(data.type_flag_),
storage_type_(kDefaultStorage),
- autograd_entry_(nullptr) {
- }
+ autograd_entry_(nullptr) {}
/*!
* \brief constructing a static NDArray that shares data with TBlob which is with deleter
@@ -144,15 +145,16 @@ class NDArray {
* \param dev_id the device id this tensor sits at
* \param deleter the function pointer of custom deleter
*/
- NDArray(const TBlob &data, int dev_id, const std::function<void()>& deleter)
- : ptr_(new Chunk(data, dev_id), [deleter](Chunk *p) {
- deleter(); // call custom deleter
- delete p; // delete Chunk object
- }),
+ NDArray(const TBlob& data, int dev_id, const std::function<void()>& deleter)
+ : ptr_(new Chunk(data, dev_id),
+ [deleter](Chunk* p) {
+ deleter(); // call custom deleter
+ delete p; // delete Chunk object
+ }),
shape_(data.shape_),
- dtype_(data.type_flag_), storage_type_(kDefaultStorage),
- autograd_entry_(nullptr) {
- }
+ dtype_(data.type_flag_),
+ storage_type_(kDefaultStorage),
+ autograd_entry_(nullptr) {}
/*! \brief create ndarray from shared memory */
NDArray(int shared_pid, int shared_id, const mxnet::TShape& shape, int dtype)
@@ -160,8 +162,7 @@ class NDArray {
shape_(shape),
dtype_(dtype),
storage_type_(kDefaultStorage),
- autograd_entry_(nullptr) {
- }
+ autograd_entry_(nullptr) {}
/*!
* \brief constructing a static NDArray of non-default storage that shares data with TBlob
@@ -173,35 +174,41 @@ class NDArray {
* \param aux_data the memory content of static aux data
* \param dev_id the device id this tensor sits at
*/
- NDArray(const NDArrayStorageType stype, const mxnet::TShape &shape,
- const TBlob &data, const std::vector<TBlob> &aux_data, int dev_id)
+ NDArray(const NDArrayStorageType stype,
+ const mxnet::TShape& shape,
+ const TBlob& data,
+ const std::vector<TBlob>& aux_data,
+ int dev_id)
: ptr_(std::make_shared<Chunk>(stype, data, aux_data, dev_id)),
shape_(shape),
dtype_(data.type_flag_),
storage_type_(stype),
- autograd_entry_(nullptr) {
- }
+ autograd_entry_(nullptr) {}
/*!
* \brief initialize the NDArray, assuming it is not assigned a meaningful shape before
* \param shape the shape of the NDArray
*/
- void Init(const mxnet::TShape &shape) {
+ void Init(const mxnet::TShape& shape) {
ptr_->Init(shape, this->dtype_);
this->shape_ = shape;
}
- void InitDetached(const NDArray *src) {
- *this = *src;
+ void InitDetached(const NDArray* src) {
+ *this = *src;
autograd_entry_ = nnvm::NodeEntry(nullptr);
}
inline void ReInit() {
ptr_ = nullptr;
Init(kUndefinedStorage, TShape(), -1);
}
- void ReInit(const NDArrayStorageType stype, const mxnet::TShape &shape, Context ctx, int dtype,
- bool delay_alloc = true, const std::vector<int> *aux_types = nullptr,
- const mxnet::ShapeVector *aux_shapes = nullptr,
- const mxnet::TShape *storage_shape = nullptr);
+ void ReInit(const NDArrayStorageType stype,
+ const mxnet::TShape& shape,
+ Context ctx,
+ int dtype,
+ bool delay_alloc = true,
+ const std::vector<int>* aux_types = nullptr,
+ const mxnet::ShapeVector* aux_shapes = nullptr,
+ const mxnet::TShape* storage_shape = nullptr);
void SelfReorder2Default();
/*!
@@ -227,10 +234,8 @@ class NDArray {
/* \brief Check whether the two arrays are the same array */
inline bool IsSame(const NDArray& other) const {
- return ptr_ == other.ptr_ &&
- shape_ == other.shape_ &&
- byte_offset_ == other.byte_offset_ &&
- dtype_ == other.dtype_;
+ return ptr_ == other.ptr_ && shape_ == other.shape_ && byte_offset_ == other.byte_offset_ &&
+ dtype_ == other.dtype_;
}
/*!
@@ -244,10 +249,10 @@ class NDArray {
* It is only intended for non-default storage. For row-sparse storage, it is the shape of
* the tensor which stores the non-zero values.
*/
- inline const mxnet::TShape &storage_shape() const {
+ inline const mxnet::TShape& storage_shape() const {
CHECK(ptr_ != nullptr);
CHECK_NE(storage_type(), kDefaultStorage)
- << "storage_shape() is not intended for kDefaultStorage.";
+ << "storage_shape() is not intended for kDefaultStorage.";
return ptr_->storage_shape;
}
@@ -257,22 +262,20 @@ class NDArray {
* \return the shape of aux data at given index
*/
inline const mxnet::TShape& aux_shape(size_t index) const {
- CHECK_NE(storage_type(), kDefaultStorage)
- << "aux_shape() is not intended for kDefaultStorage.";
+ CHECK_NE(storage_type(), kDefaultStorage) << "aux_shape() is not intended for kDefaultStorage.";
return ptr_->aux_shapes[index];
}
/* \return the shapes of all aux data */
const mxnet::ShapeVector& aux_shapes() const {
CHECK_NE(storage_type(), kDefaultStorage)
- << "aux_shapes() is not intended for kDefaultStorage.";
+ << "aux_shapes() is not intended for kDefaultStorage.";
return ptr_->aux_shapes;
}
/*! returns the dtypes of all aux data */
const std::vector<int>& aux_types() const {
- CHECK_NE(storage_type(), kDefaultStorage)
- << "aux_types() is not intended for kDefaultStorage.";
+ CHECK_NE(storage_type(), kDefaultStorage) << "aux_types() is not intended for kDefaultStorage.";
return ptr_->aux_types;
}
@@ -285,7 +288,7 @@ class NDArray {
*/
inline void set_aux_shape(size_t index, const mxnet::TShape& shape) const {
CHECK_NE(storage_type(), kDefaultStorage)
- << "set_aux_shape() is not intended for kDefaultStorage.";
+ << "set_aux_shape() is not intended for kDefaultStorage.";
ptr_->set_aux_shape(index, shape);
}
@@ -293,7 +296,8 @@ class NDArray {
* \return the data TBlob
*/
inline const TBlob& data() const {
- if (storage_type() == kDefaultStorage) CheckAndAlloc();
+ if (storage_type() == kDefaultStorage)
+ CheckAndAlloc();
SetTBlob();
return tblob_;
}
@@ -309,11 +313,11 @@ class NDArray {
auto stype = storage_type();
TBlob res;
auto shape = aux_shape(i);
- auto type = aux_type(i);
+ auto type = aux_type(i);
MSHADOW_TYPE_SWITCH(type, DType, {
auto dptr = static_cast<DType*>(ptr_->aux_handles[i].dptr);
CHECK(stype == kRowSparseStorage || stype == kCSRStorage)
- << "Unexpected storage type: " << stype;
+ << "Unexpected storage type: " << stype;
res = TBlob(dptr, shape, ptr_->aux_handles[i].ctx.dev_mask(), type);
});
return res;
@@ -352,19 +356,20 @@ class NDArray {
* Returns false if the indices array is empty(nnz = 0) for csr/row_sparse
*/
inline bool storage_initialized() const {
- if (is_none()) return false;
+ if (is_none())
+ return false;
auto stype = storage_type();
CHECK_NE(stype, kDefaultStorage)
- << "storage_initialized() is not intended for kDefaultStorage.";
+ << "storage_initialized() is not intended for kDefaultStorage.";
if (stype == kRowSparseStorage) {
CHECK_EQ(aux_shape(rowsparse::kIdx)[0], storage_shape()[0])
- << "inconsistent storage shape " << storage_shape()
- << " vs. aux shape " << aux_shape(rowsparse::kIdx);
+ << "inconsistent storage shape " << storage_shape() << " vs. aux shape "
+ << aux_shape(rowsparse::kIdx);
return aux_shape(rowsparse::kIdx).Size() != 0;
} else if (stype == kCSRStorage) {
CHECK_EQ(aux_shape(csr::kIdx)[0], storage_shape()[0])
- << "inconsistent storage shape " << storage_shape()
- << " vs. aux shape " << aux_shape(csr::kIdx);
+ << "inconsistent storage shape " << storage_shape() << " vs. aux shape "
+ << aux_shape(csr::kIdx);
return aux_shape(csr::kIdx).Size() != 0;
} else {
LOG(FATAL) << "Unknown storage type";
@@ -379,8 +384,7 @@ class NDArray {
return ptr_->shandle;
}
/*! \brief assign profiler scope and name to the storage handles */
- void AssignStorageInfo(const std::string& profiler_scope,
- const std::string& name);
+ void AssignStorageInfo(const std::string& profiler_scope, const std::string& name);
/*!
* \brief Block until all the pending write operations with respect
* to current NDArray are finished, and read can be performed.
@@ -413,81 +417,81 @@ class NDArray {
* \brief save the content into binary stream
* \param strm the output stream
*/
- void Save(dmlc::Stream *strm) const;
+ void Save(dmlc::Stream* strm) const;
/*!
* \brief load ndarrays before supporting sparse ndarrays
* \param strm the output stream
* \param magic the magic number used for version control
*/
- bool LegacyLoad(dmlc::Stream *strm, const uint32_t magic);
+ bool LegacyLoad(dmlc::Stream* strm, const uint32_t magic);
/*!
* \brief load the content from binary stream
* \param strm the output stream
* \return whether the load is successful
*/
- bool Load(dmlc::Stream *strm);
+ bool Load(dmlc::Stream* strm);
/*!
* \brief set all the elements in ndarray to be scalar
* \param scalar the scalar to set
* \return reference of self
*/
- NDArray &operator=(real_t scalar);
+ NDArray& operator=(real_t scalar);
/*!
* \brief elementwise add to current space
* this mutate the current NDArray
* \param src the data to add
* \return reference of self
*/
- NDArray &operator+=(const NDArray &src);
+ NDArray& operator+=(const NDArray& src);
/*!
* \brief elementwise add to current space
* this mutate the current NDArray
* \param src the data to add
* \return reference of self
*/
- NDArray &operator+=(const real_t &src);
+ NDArray& operator+=(const real_t& src);
/*!
* \brief elementwise subtract from current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator-=(const NDArray &src);
+ NDArray& operator-=(const NDArray& src);
/*!
* \brief elementwise subtract from current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator-=(const real_t &src);
+ NDArray& operator-=(const real_t& src);
/*!
* \brief elementwise multiplication to current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator*=(const NDArray &src);
+ NDArray& operator*=(const NDArray& src);
/*!
* \brief elementwise multiplication to current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator*=(const real_t &src);
+ NDArray& operator*=(const real_t& src);
/*!
* \brief elementwise division from current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator/=(const NDArray &src);
+ NDArray& operator/=(const NDArray& src);
/*!
* \brief elementwise division from current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator/=(const real_t &src);
+ NDArray& operator/=(const real_t& src);
/*!
* \brief return a new copy this NDArray
* \param ctx the new context of this NDArray
@@ -504,12 +508,12 @@ class NDArray {
* \param data the data source to copy from.
* \param size the size of the source array, in sizeof(DType) not raw btyes.
*/
- void SyncCopyFromCPU(const void *data, size_t size) const;
+ void SyncCopyFromCPU(const void* data, size_t size) const;
/*!
* \brief Copy from src.data()/aux_data(i) to this->data()/aux_data(j)
*/
- void SyncCopyFromNDArray(const NDArray &src, int i = -1, int j = -1);
+ void SyncCopyFromNDArray(const NDArray& src, int i = -1, int j = -1);
/*!
* \brief Do a synchronize copy to a contiguous CPU memory region.
@@ -521,12 +525,12 @@ class NDArray {
* \param data the data source to copyinto.
* \param size the memory size we want to copy into, in sizeof(DType) not raw btyes.
*/
- void SyncCopyToCPU(void *data, size_t size) const;
+ void SyncCopyToCPU(void* data, size_t size) const;
/*!
- * \brief check whether the NDArray format is valid
- * \param full_check if `True`, rigorous check, O(N) operations
- * Otherwise basic check, O(1) operations
- */
+ * \brief check whether the NDArray format is valid
+ * \param full_check if `True`, rigorous check, O(N) operations
+ * Otherwise basic check, O(1) operations
+ */
void SyncCheckFormat(const bool full_check) const;
/*!
* \brief Slice a NDArray
@@ -573,30 +577,27 @@ class NDArray {
* \param dtype The data type.
* \return NDArray in new shape and type.
*/
- inline NDArray AsArray(const mxnet::TShape &shape, int dtype) const {
- CHECK_EQ(storage_type(), kDefaultStorage)
- << "AsArray is intended only for kDefaultStorage.";
- CHECK_GE(ptr_->shandle.size,
- shape.Size() * mshadow::mshadow_sizeof(dtype))
+ inline NDArray AsArray(const mxnet::TShape& shape, int dtype) const {
+ CHECK_EQ(storage_type(), kDefaultStorage) << "AsArray is intended only for kDefaultStorage.";
+ CHECK_GE(ptr_->shandle.size, shape.Size() * mshadow::mshadow_sizeof(dtype))
<< "NDArray.AsArray: target memory size is bigger";
// We can't reuse memory in a view.
CHECK(!IsView());
NDArray ret = *this;
- ret.shape_ = shape;
- ret.dtype_ = dtype;
- ret.reuse_ = true;
+ ret.shape_ = shape;
+ ret.dtype_ = dtype;
+ ret.reuse_ = true;
return ret;
}
- inline void InitAsArray(const NDArray &src, const mxnet::TShape &shape, int dtype) {
+ inline void InitAsArray(const NDArray& src, const mxnet::TShape& shape, int dtype) {
CHECK_EQ(src.storage_type(), kDefaultStorage)
- << "AsArray is intended only for kDefaultStorage.";
- CHECK_GE(src.ptr_->shandle.size,
- shape.Size() * mshadow::mshadow_sizeof(dtype))
- << "NDArray.AsArray: target memory size is bigger than what was allocated.";
+ << "AsArray is intended only for kDefaultStorage.";
+ CHECK_GE(src.ptr_->shandle.size, shape.Size() * mshadow::mshadow_sizeof(dtype))
+ << "NDArray.AsArray: target memory size is bigger than what was allocated.";
// We can't reuse memory in a view.
CHECK(!src.IsView());
- *this = src;
+ *this = src;
shape_ = shape;
dtype_ = dtype;
reuse_ = true;
@@ -629,7 +630,7 @@ class NDArray {
* the temporary ndarray which stores intermediate custom op results.
* Should be used with caution elsewhere. Supports only CSR and RSP formats.
*/
- inline void SparseUpdateChunk(const NDArray &arr) const {
+ inline void SparseUpdateChunk(const NDArray& arr) const {
CHECK(shape_ == arr.shape_) << "ndarray shape is different from the target";
CHECK(dtype_ == arr.dtype_) << "ndarray dtype is different from the target";
auto stype = arr.storage_type();
@@ -637,24 +638,24 @@ class NDArray {
<< "Only to be used with CSR and RSP storage types";
// swap shandles between src and dst
Storage::Handle shandle_dst = arr.ptr_->shandle;
- arr.ptr_->shandle = ptr_->shandle;
- ptr_->shandle = shandle_dst;
+ arr.ptr_->shandle = ptr_->shandle;
+ ptr_->shandle = shandle_dst;
ptr_->storage_shape = arr.ptr_->storage_shape;
- ptr_->storage_type = arr.ptr_->storage_type;
- ptr_->ctx = arr.ptr_->ctx;
+ ptr_->storage_type = arr.ptr_->storage_type;
+ ptr_->ctx = arr.ptr_->ctx;
// swap aux_handles between src and dst
size_t aux_idx = 0;
CHECK(ptr_->aux_handles.size() == arr.ptr_->aux_handles.size())
<< "ndarray number of aux_handles is different from target";
- for (auto &aux_handle : arr.ptr_->aux_handles) {
- Storage::Handle aux_dst = ptr_->aux_handles[aux_idx];
+ for (auto& aux_handle : arr.ptr_->aux_handles) {
+ Storage::Handle aux_dst = ptr_->aux_handles[aux_idx];
ptr_->aux_handles[aux_idx] = aux_handle;
- aux_handle = aux_dst;
+ aux_handle = aux_dst;
aux_idx++;
}
- ptr_->aux_types = arr.ptr_->aux_types;
+ ptr_->aux_types = arr.ptr_->aux_types;
ptr_->aux_shapes = arr.ptr_->aux_shapes;
}
@@ -663,20 +664,20 @@ class NDArray {
* \param shape new shape
* \return NDArray in new shape
*/
- NDArray Reshape(const mxnet::TShape &shape) const;
+ NDArray Reshape(const mxnet::TShape& shape) const;
/*!
* \brief Get an reshaped NDArray. Supports autograd recording
* \param shape new shape
* \return NDArray in new shape
*/
- NDArray ReshapeWithRecord(const mxnet::TShape &shape);
+ NDArray ReshapeWithRecord(const mxnet::TShape& shape);
/*!
* \brief Return a copy of this NDArray without autograd and deferred compute
* history
*/
NDArray Detach() const {
NDArray ret(*this);
- ret.autograd_entry_ = nnvm::NodeEntry(nullptr);
+ ret.autograd_entry_ = nnvm::NodeEntry(nullptr);
ret.deferredcompute_entry_ = nnvm::NodeEntry(nullptr);
return ret;
}
@@ -711,19 +712,19 @@ class NDArray {
* \brief Alloc memory for non-default storage
* aux_shape is only known at run time
*/
- inline void CheckAndAlloc(const mxnet::ShapeVector &aux_shapes) const {
+ inline void CheckAndAlloc(const mxnet::ShapeVector& aux_shapes) const {
CHECK_NE(storage_type(), kDefaultStorage)
- << "CheckAndAlloc(aux_shapes) is not intended for kDefaultStorage";
+ << "CheckAndAlloc(aux_shapes) is not intended for kDefaultStorage";
ptr_->CheckAndAlloc(shape_, aux_shapes, dtype_);
}
- inline void CheckAndAllocData(const mxnet::TShape &storage_shape) const {
+ inline void CheckAndAllocData(const mxnet::TShape& storage_shape) const {
CHECK_NE(storage_type(), kDefaultStorage)
- << "CheckAndAllocData is not intended for kDefaultStorage";
+ << "CheckAndAllocData is not intended for kDefaultStorage";
ptr_->CheckAndAllocData(storage_shape, dtype_);
}
- inline void CheckAndAllocAuxData(size_t i, const mxnet::TShape &aux_shape) const {
+ inline void CheckAndAllocAuxData(size_t i, const mxnet::TShape& aux_shape) const {
CHECK_NE(storage_type(), kDefaultStorage)
- << "CheckAndAllocAuxData is not intended for kDefaultStorage";
+ << "CheckAndAllocAuxData is not intended for kDefaultStorage";
ptr_->CheckAndAllocAuxData(i, aux_shape);
}
@@ -732,12 +733,12 @@ class NDArray {
* Create NDArray from mkldnn memory.
* mkldnn_mem The mkldnn memory to be managed.
*/
- explicit NDArray(const std::shared_ptr<mkldnn::memory> &mkldnn_mem);
+ explicit NDArray(const std::shared_ptr<mkldnn::memory>& mkldnn_mem);
/*
* Create NDArray from mkldnn memory descriptor.
* mem_pd The mkldnn memory descriptor to be created.
*/
- explicit NDArray(const mkldnn::memory::desc &md);
+ explicit NDArray(const mkldnn::memory::desc& md);
/*
* Test if the data is stored in one of special MKLDNN format.
*/
@@ -760,29 +761,28 @@ class NDArray {
/*
* This function returns mkldnn::memory with the default primitive_desc.
*/
- const mkldnn::memory *GetMKLDNNData() const;
+ const mkldnn::memory* GetMKLDNNData() const;
/*
* This function returns mkldnn::memory with the given primitive_desc
* as long as the array size meets the required size in the given primitive_desc.
*/
- const mkldnn::memory *GetMKLDNNData(const mkldnn::memory::desc &md) const;
+ const mkldnn::memory* GetMKLDNNData(const mkldnn::memory::desc& md) const;
/*
* This function returns mkldnn::memory with the given primitive_desc.
* The returned mkldnn::memory will have the same physical layout as
* the given primitive_desc.
*/
- const mkldnn::memory *GetMKLDNNDataReorder(
- const mkldnn::memory::desc &md) const;
+ const mkldnn::memory* GetMKLDNNDataReorder(const mkldnn::memory::desc& md) const;
/*
* This function copies data from mkldnn memory.
*/
- void CopyFrom(const mkldnn::memory &mem);
+ void CopyFrom(const mkldnn::memory& mem);
/*
* This function allocates memory for array and creates mkldnn memory
* with the specified format.
*/
- mkldnn::memory *CreateMKLDNNData(const mkldnn::memory::desc &md);
+ mkldnn::memory* CreateMKLDNNData(const mkldnn::memory::desc& md);
/*
* These are the async version of the methods above.
@@ -790,7 +790,7 @@ class NDArray {
* the array are complete.
*/
void Reorder2DefaultAsync() const;
- void MKLDNNDataReorderAsync(const mkldnn::memory::desc &md) const;
+ void MKLDNNDataReorderAsync(const mkldnn::memory::desc& md) const;
/*
* This creates a new NDArray with the reordered data.
@@ -798,7 +798,7 @@ class NDArray {
*/
NDArray Reorder2Default() const;
- /*
+ /*
* This creates a new NDArray using f32 with the reordered data.
* It doesn't affect the data of the original NDArray.
*/
@@ -816,12 +816,12 @@ class NDArray {
* which can be expensive.
* It's used by FullyConnected right now.
*/
- NDArray MKLDNNDataReshape(const mxnet::TShape &shape) const;
+ NDArray MKLDNNDataReshape(const mxnet::TShape& shape) const;
- /*!
+ /*!
* \ Fix mkldnn memory descriptor mismatch from NDArray.
*/
- void UpdateMKLDNNMemDesc(const mkldnn::memory::desc &desc);
+ void UpdateMKLDNNMemDesc(const mkldnn::memory::desc& desc);
#endif
/*!
@@ -839,9 +839,7 @@ class NDArray {
* \param data the NDArrays to be loaded
* \param keys the name of the NDArray, if saved in the file.
*/
- static void Load(dmlc::Stream* fi,
- std::vector<NDArray>* data,
- std::vector<std::string>* keys);
+ static void Load(dmlc::Stream* fi, std::vector<NDArray>* data, std::vector<std::string>* keys);
private:
friend class Imperative;
@@ -894,30 +892,34 @@ class NDArray {
/*! \brief Reference to the engine to ensure we cleanup without calling a destructed engine */
std::weak_ptr<Engine> engine_ref_;
-
/*! \brief default constructor */
- Chunk() : static_data(true), delay_alloc(false),
- storage_ref_(Storage::_GetSharedRef()),
- engine_ref_(Engine::_GetSharedRef()) {}
+ Chunk()
+ : static_data(true),
+ delay_alloc(false),
+ storage_ref_(Storage::_GetSharedRef()),
+ engine_ref_(Engine::_GetSharedRef()) {}
/*! \brief construct a new chunk */
Chunk(mxnet::TShape shape, Context ctx_, bool delay_alloc_, int dtype)
- : static_data(false), delay_alloc(true), ctx(ctx_),
+ : static_data(false),
+ delay_alloc(true),
+ ctx(ctx_),
storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {
storage_shape = shape;
if (shape_is_known(storage_shape)) {
shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
}
- var = Engine::Get()->NewVariable();
+ var = Engine::Get()->NewVariable();
shandle.ctx = ctx_;
if (!delay_alloc_) {
this->CheckAndAlloc();
}
}
- Chunk(const TBlob &data, int dev_id)
- : static_data(true), delay_alloc(false),
+ Chunk(const TBlob& data, int dev_id)
+ : static_data(true),
+ delay_alloc(false),
storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {
CHECK(storage_type == kDefaultStorage);
@@ -929,35 +931,45 @@ class NDArray {
ctx = Context::GPU(dev_id);
}
// init shandle
- shandle.ctx = ctx;
- shandle.dptr = data.dptr_;
- shandle.size = data.shape_.Size() * mshadow::mshadow_sizeof(data.type_flag_);
+ shandle.ctx = ctx;
+ shandle.dptr = data.dptr_;
+ shandle.size = data.shape_.Size() * mshadow::mshadow_sizeof(data.type_flag_);
storage_shape = data.shape_;
}
Chunk(int shared_pid, int shared_id, const mxnet::TShape& shape, int dtype)
- : static_data(false), delay_alloc(false),
+ : static_data(false),
+ delay_alloc(false),
storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {
- var = Engine::Get()->NewVariable();
- ctx = Context::CPUShared(0);
- shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
- shandle.ctx = ctx;
+ var = Engine::Get()->NewVariable();
+ ctx = Context::CPUShared(0);
+ shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
+ shandle.ctx = ctx;
shandle.shared_pid = shared_pid;
- shandle.shared_id = shared_id;
+ shandle.shared_id = shared_id;
Storage::Get()->Alloc(&shandle);
storage_shape = shape;
}
// Constructor for a non-default storage chunk
- Chunk(NDArrayStorageType storage_type_, const mxnet::TShape &storage_shape_, Context ctx_,
- bool delay_alloc_, int dtype, const std::vector<int> &aux_types_,
- const mxnet::ShapeVector &aux_shapes_)
- : static_data(false), delay_alloc(delay_alloc_), storage_type(storage_type_),
- aux_types(aux_types_), ctx(ctx_), storage_shape(storage_shape_),
- aux_shapes(aux_shapes_), storage_ref_(Storage::_GetSharedRef()),
+ Chunk(NDArrayStorageType storage_type_,
+ const mxnet::TShape& storage_shape_,
+ Context ctx_,
+ bool delay_alloc_,
+ int dtype,
+ const std::vector<int>& aux_types_,
+ const mxnet::ShapeVector& aux_shapes_)
+ : static_data(false),
+ delay_alloc(delay_alloc_),
+ storage_type(storage_type_),
+ aux_types(aux_types_),
+ ctx(ctx_),
+ storage_shape(storage_shape_),
+ aux_shapes(aux_shapes_),
+ storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {
shandle.ctx = ctx;
- var = Engine::Get()->NewVariable();
+ var = Engine::Get()->NewVariable();
// aux_handles always reflect the correct number of aux data
for (size_t i = 0; i < aux_shapes.size(); i++) {
CheckAndAllocAuxData(i, aux_shapes[i]);
@@ -970,10 +982,15 @@ class NDArray {
}
}
- Chunk(const NDArrayStorageType storage_type_, const TBlob &data,
- const std::vector<TBlob> &aux_data, int dev_id)
- : static_data(true), delay_alloc(false), storage_type(storage_type_),
- storage_ref_(Storage::_GetSharedRef()), engine_ref_(Engine::_GetSharedRef()) {
+ Chunk(const NDArrayStorageType storage_type_,
+ const TBlob& data,
+ const std::vector<TBlob>& aux_data,
+ int dev_id)
+ : static_data(true),
+ delay_alloc(false),
+ storage_type(storage_type_),
+ storage_ref_(Storage::_GetSharedRef()),
+ engine_ref_(Engine::_GetSharedRef()) {
using namespace mshadow;
CHECK_NE(storage_type, kDefaultStorage);
// init var
@@ -986,14 +1003,14 @@ class NDArray {
ctx = Context::GPU(dev_id);
}
// init shandle
- shandle.ctx = ctx;
- shandle.dptr = data.dptr_;
- shandle.size = data.shape_.Size() * mshadow_sizeof(data.type_flag_);
+ shandle.ctx = ctx;
+ shandle.dptr = data.dptr_;
+ shandle.size = data.shape_.Size() * mshadow_sizeof(data.type_flag_);
storage_shape = data.shape_;
// init aux handles
- for (const auto &aux : aux_data) {
+ for (const auto& aux : aux_data) {
Storage::Handle aux_handle;
- aux_handle.ctx = ctx;
+ aux_handle.ctx = ctx;
aux_handle.dptr = aux.dptr_;
aux_handle.size = aux.shape_.Size() * mshadow_sizeof(aux.type_flag_);
aux_handles.push_back(aux_handle);
@@ -1050,13 +1067,14 @@ class NDArray {
}
}
/*! \brief initialize the shape and dtype, assuming it is not initialized before. */
- void Init(const mxnet::TShape &shape, int dtype) {
- auto size = shape.Size();
+ void Init(const mxnet::TShape& shape, int dtype) {
+ auto size = shape.Size();
storage_shape = shape;
- shandle.size = size * mshadow::mshadow_sizeof(dtype);
+ shandle.size = size * mshadow::mshadow_sizeof(dtype);
this->CheckAndAlloc();
}
- inline void CheckAndAlloc(const mxnet::TShape &shape, const mxnet::ShapeVector &aux_shapes,
+ inline void CheckAndAlloc(const mxnet::TShape& shape,
+ const mxnet::ShapeVector& aux_shapes,
int dtype) {
// calculate size, perform allocation
if (kRowSparseStorage == storage_type) {
@@ -1078,17 +1096,17 @@ class NDArray {
// storage shape is also updated
// if data is already allocated, try reuse the storage. Otherwise, free the current one
// and allocate new storage
- void CheckAndAllocData(const mxnet::TShape &shape, int dtype);
+ void CheckAndAllocData(const mxnet::TShape& shape, int dtype);
#if MXNET_USE_ONEDNN == 1
// Have MKL memory reference to the data in the default storage
// or create memory for MKLDNN.
- void SetMKLMem(const mxnet::TShape &shape, int dtype);
+ void SetMKLMem(const mxnet::TShape& shape, int dtype);
// If the data is stored in MKLDNN layout, we reorder data in mkl_mem_ and
// save the result in shandle.
void Reorder2Default();
// Reroder data to a specified layout.
- void MKLDNNDataReorder(const mkldnn::memory::desc &md);
+ void MKLDNNDataReorder(const mkldnn::memory::desc& md);
bool IsMKLDNN() const;
bool IsDefault() const;
#endif
@@ -1098,12 +1116,12 @@ class NDArray {
// aux shape is also updated
// if aux data is already allocated, try reuse the storage. Otherwise, free the current one
// and allocate new storage
- inline void CheckAndAllocAuxData(size_t i, const mxnet::TShape &shape) {
+ inline void CheckAndAllocAuxData(size_t i, const mxnet::TShape& shape) {
CHECK_EQ(shape.ndim(), 1) << "shape must be 1D in CheckAndAllocAuxData";
CHECK_NE(storage_type, kUndefinedStorage)
- << "storage type cannot be kUndefinedStorage in CheckAndAllocAuxData";
+ << "storage type cannot be kUndefinedStorage in CheckAndAllocAuxData";
CHECK_NE(storage_type, kDefaultStorage)
- << "storage type cannot be kDefaultStorage in CheckAndAllocAuxData";
+ << "storage type cannot be kDefaultStorage in CheckAndAllocAuxData";
if (aux_handles.size() <= i) {
aux_handles.resize(i + 1);
}
@@ -1123,13 +1141,13 @@ class NDArray {
/*!
* \brief initialize the NDArray
- */
- inline void Init(const NDArrayStorageType stype, const mxnet::TShape &shape, int dtype) {
- shape_ = shape;
- dtype_ = dtype;
- storage_type_ = stype;
- reuse_ = false;
- byte_offset_ = 0;
+ */
+ inline void Init(const NDArrayStorageType stype, const mxnet::TShape& shape, int dtype) {
+ shape_ = shape;
+ dtype_ = dtype;
+ storage_type_ = stype;
+ reuse_ = false;
+ byte_offset_ = 0;
autograd_entry_ = nnvm::NodeEntry(nullptr);
}
@@ -1180,7 +1198,7 @@ size_t num_aux_data(NDArrayStorageType stype);
* \note The function name explicitly marks the order of from and to
* due to different possible convention carried by copy function.
*/
-void CopyFromTo(const NDArray &from, const NDArray *to, int priority = 0);
+void CopyFromTo(const NDArray& from, const NDArray* to, int priority = 0);
/*!
* \brief issue an copy operation from one NDArray to another
@@ -1195,7 +1213,7 @@ void CopyFromTo(const NDArray &from, const NDArray *to, int priority = 0);
* \note The function name explicitly marks the order of from and to
* due to different possible convention carried by copy function.
*/
-void CopyFromTo(const NDArray &from, const NDArray& to, int priority = 0, bool is_opr = false);
+void CopyFromTo(const NDArray& from, const NDArray& to, int priority = 0, bool is_opr = false);
/*!
* \brief Perform elementwise sum over each data from source, store result into out.
@@ -1203,7 +1221,7 @@ void CopyFromTo(const NDArray &from, const NDArray& to, int priority = 0, bool i
* \param out the target ndarray
* \param priority Priority of the action.
*/
-void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priority = 0);
+void ElementwiseSum(const std::vector<NDArray>& source, NDArray* out, int priority = 0);
/*!
* \brief elementwise add
@@ -1211,56 +1229,56 @@ void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priori
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator+(const NDArray &lhs, const NDArray &rhs);
+NDArray operator+(const NDArray& lhs, const NDArray& rhs);
/*!
* \brief elementwise add
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator+(const NDArray &lhs, const real_t &rhs);
+NDArray operator+(const NDArray& lhs, const real_t& rhs);
/*!
* \brief elementwise subtraction
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator-(const NDArray &lhs, const NDArray &rhs);
+NDArray operator-(const NDArray& lhs, const NDArray& rhs);
/*!
* \brief elementwise subtraction
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator-(const NDArray &lhs, const real_t &rhs);
+NDArray operator-(const NDArray& lhs, const real_t& rhs);
/*!
* \brief elementwise multiplication
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator*(const NDArray &lhs, const NDArray &rhs); \
+NDArray operator*(const NDArray& lhs, const NDArray& rhs);
/*!
* \brief elementwise multiplication
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator*(const NDArray &lhs, const real_t &rhs);
+NDArray operator*(const NDArray& lhs, const real_t& rhs);
/*!
* \brief elementwise division
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator/(const NDArray &lhs, const NDArray &rhs);
+NDArray operator/(const NDArray& lhs, const NDArray& rhs);
/*!
* \brief elementwise division
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator/(const NDArray &lhs, const real_t &rhs);
+NDArray operator/(const NDArray& lhs, const real_t& rhs);
/*!
* \brief Seed all random number generator in mxnet.
@@ -1278,60 +1296,60 @@ void RandomSeed(Context ctx, uint32_t seed);
* \param end upper bound of distribution.
* \param out output NDArray.
*/
-void SampleUniform(real_t begin, real_t end, NDArray *out);
+void SampleUniform(real_t begin, real_t end, NDArray* out);
/*!
* \brief Sample gaussian distribution for each elements of out.
* \param mu mean of gaussian distribution.
* \param sigma standard deviation of gaussian distribution.
* \param out output NDArray.
*/
-void SampleGaussian(real_t mu, real_t sigma, NDArray *out);
+void SampleGaussian(real_t mu, real_t sigma, NDArray* out);
/*!
* \brief Sample gamma distribution for each elements of out.
* \param alpha parameter (shape) of the gamma distribution
* \param beta parameter (scale) of the gamma distribution
* \param out output NDArray.
*/
-void SampleGamma(real_t alpha, real_t beta, NDArray *out);
+void SampleGamma(real_t alpha, real_t beta, NDArray* out);
/*!
* \brief Sample exponential distribution for each elements of out.
* \param lambda parameter (rate) of the exponential distribution
* \param out output NDArray.
*/
-void SampleExponential(real_t lambda, NDArray *out);
+void SampleExponential(real_t lambda, NDArray* out);
/*!
* \brief Sample Poisson distribution for each elements of out.
* \param lambda parameter (rate) of the Poisson distribution
* \param out output NDArray.
*/
-void SamplePoisson(real_t lambda, NDArray *out);
+void SamplePoisson(real_t lambda, NDArray* out);
/*!
* \brief Sample negative binomial distribution for each elements of out.
* \param k failure limit
* \param p success probability
* \param out output NDArray.
*/
-void SampleNegBinomial(int32_t k, real_t p, NDArray *out);
+void SampleNegBinomial(int32_t k, real_t p, NDArray* out);
/*!
* \brief Sample generalized negative binomial distribution for each elements of out.
* \param mu parameter (mean) of the distribution
* \param alpha parameter (over dispersion) of the distribution
* \param out output NDArray.
*/
-void SampleGenNegBinomial(real_t mu, real_t alpha, NDArray *out);
-
+void SampleGenNegBinomial(real_t mu, real_t alpha, NDArray* out);
//--------------------------------------------------------------
// The following part are API Registration of NDArray functions.
//--------------------------------------------------------------
/*! \brief definition of NDArray function */
-typedef std::function<void (NDArray **used_vars,
- real_t *scalars,
- NDArray **mutate_vars,
- int num_params,
- char **param_keys,
- char **param_vals)> NDArrayAPIFunction;
+typedef std::function<void(NDArray** used_vars,
+ real_t* scalars,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals)>
+ NDArrayAPIFunction;
/*! \brief mask information on how functions can be exposed */
enum NDArrayFunctionTypeMask {
/*! \brief all the use_vars should go before scalar */
@@ -1350,8 +1368,7 @@ enum NDArrayFunctionTypeMask {
};
/*! \brief Registry entry for NDArrayFunction */
struct NDArrayFunctionReg
- : public dmlc::FunctionRegEntryBase<NDArrayFunctionReg,
- NDArrayAPIFunction> {
+ : public dmlc::FunctionRegEntryBase<NDArrayFunctionReg, NDArrayAPIFunction> {
/*! \brief number of variable used by this function */
unsigned num_use_vars;
/*! \brief number of variable mutated by this function */
@@ -1363,44 +1380,44 @@ struct NDArrayFunctionReg
/*!
* \brief constructor
*/
- NDArrayFunctionReg()
- : num_use_vars(0),
- num_mutate_vars(0),
- num_scalars(0),
- type_mask(0) {}
+ NDArrayFunctionReg() : num_use_vars(0), num_mutate_vars(0), num_scalars(0), type_mask(0) {}
/*!
* \brief set the function body to a NDArray setvalue function
* this will also auto set the parameters correctly
* \param fsetvalue function body to set
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_function(void (*fsetvalue)(const real_t &rhs,
- NDArray *out)) {
- body = [fsetvalue] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
- (*fsetvalue)(s[0], mutate_vars[0]);
- };
- num_mutate_vars = 1; num_scalars = 1;
+ inline NDArrayFunctionReg& set_function(void (*fsetvalue)(const real_t& rhs, NDArray* out)) {
+ body = [fsetvalue](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) { (*fsetvalue)(s[0], mutate_vars[0]); };
+ num_mutate_vars = 1;
+ num_scalars = 1;
this->add_argument("src", "real_t", "Source input to the function.");
return *this;
}
/*!
- * \brief set the function body to a ternary NDArray function
- * this will also auto set the parameters correctly
- * \param fternary function body to set
- * \return ref to the registered entry, used to set properties
- */
- inline NDArrayFunctionReg &set_function(void(*fternary)(const NDArray &lhs,
- const NDArray &mhs,
- const NDArray &rhs,
- NDArray *out)) {
- body = [fternary](NDArray **used_vars,
- real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
+ * \brief set the function body to a ternary NDArray function
+ * this will also auto set the parameters correctly
+ * \param fternary function body to set
+ * \return ref to the registered entry, used to set properties
+ */
+ inline NDArrayFunctionReg& set_function(
+ void (*fternary)(const NDArray& lhs, const NDArray& mhs, const NDArray& rhs, NDArray* out)) {
+ body = [fternary](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) {
(*fternary)(*used_vars[0], *used_vars[1], *used_vars[2], mutate_vars[0]);
};
- num_use_vars = 3; num_mutate_vars = 1;
- type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
+ num_use_vars = 3;
+ num_mutate_vars = 1;
+ type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
this->add_argument("lhs", "NDArray", "Left operand to the function.");
this->add_argument("mhs", "NDArray", "Middle operand to the function.");
this->add_argument("rhs", "NDArray", "Right operand to the function.");
@@ -1412,15 +1429,20 @@ struct NDArrayFunctionReg
* \param fbinary function body to set
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_function(void (*fbinary)(const NDArray &lhs,
- const NDArray &rhs,
- NDArray *out)) {
- body = [fbinary] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
+ inline NDArrayFunctionReg& set_function(void (*fbinary)(const NDArray& lhs,
+ const NDArray& rhs,
+ NDArray* out)) {
+ body = [fbinary](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) {
(*fbinary)(*used_vars[0], *used_vars[1], mutate_vars[0]);
};
- num_use_vars = 2; num_mutate_vars = 1;
- type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
+ num_use_vars = 2;
+ num_mutate_vars = 1;
+ type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
this->add_argument("lhs", "NDArray", "Left operand to the function.");
this->add_argument("rhs", "NDArray", "Right operand to the function.");
return *this;
@@ -1431,15 +1453,19 @@ struct NDArrayFunctionReg
* \param fscalar function body to set
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_function(void (*fscalar)(const NDArray &lhs,
- const real_t &rhs,
- NDArray *out)) {
- body = [fscalar] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
- (*fscalar)(*used_vars[0], s[0], mutate_vars[0]);
- };
- num_use_vars = 1; num_mutate_vars = 1; num_scalars = 1;
- type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
+ inline NDArrayFunctionReg& set_function(void (*fscalar)(const NDArray& lhs,
+ const real_t& rhs,
+ NDArray* out)) {
+ body = [fscalar](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) { (*fscalar)(*used_vars[0], s[0], mutate_vars[0]); };
+ num_use_vars = 1;
+ num_mutate_vars = 1;
+ num_scalars = 1;
+ type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
this->add_argument("lhs", "NDArray", "Left operand to the function.");
this->add_argument("rhs", "real_t", "Right operand to the function.");
return *this;
@@ -1450,14 +1476,16 @@ struct NDArrayFunctionReg
* \param funary function body to set
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_function(void (*funary)(const NDArray &src,
- NDArray *out)) {
- body = [funary] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
- (*funary)(*used_vars[0], mutate_vars[0]);
- };
- num_use_vars = 1; num_mutate_vars = 1;
- type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
+ inline NDArrayFunctionReg& set_function(void (*funary)(const NDArray& src, NDArray* out)) {
+ body = [funary](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) { (*funary)(*used_vars[0], mutate_vars[0]); };
+ num_use_vars = 1;
+ num_mutate_vars = 1;
+ type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
this->add_argument("src", "NDArray", "Source input to the function.");
return *this;
}
@@ -1467,13 +1495,17 @@ struct NDArrayFunctionReg
* \param fgeneric function body to set
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_function(
- void (*fgeneric)(NDArray **used_vars,
- real_t *s,
- NDArray **mutate_vars,
- const std::map<std::string, std::string>& param)) {
- body = [fgeneric] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
+ inline NDArrayFunctionReg& set_function(
+ void (*fgeneric)(NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ const std::map<std::string, std::string>& param)) {
+ body = [fgeneric](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) {
std::map<std::string, std::string> param;
for (int i = 0; i < num_params; ++i) {
param[param_keys[i]] = param_vals[i];
@@ -1487,32 +1519,36 @@ struct NDArrayFunctionReg
* \param n number of mutate variablesx
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_num_use_vars(unsigned n) {
- num_use_vars = n; return *this;
+ inline NDArrayFunctionReg& set_num_use_vars(unsigned n) {
+ num_use_vars = n;
+ return *this;
}
/*!
* \brief set the number of mutate variables
* \param n number of mutate variablesx
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_num_mutate_vars(unsigned n) {
- num_mutate_vars = n; return *this;
+ inline NDArrayFunctionReg& set_num_mutate_vars(unsigned n) {
+ num_mutate_vars = n;
+ return *this;
}
/*!
* \brief set the number of scalar arguments
* \param n number of scalar arguments
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_num_scalars(unsigned n) {
- num_scalars = n; return *this;
+ inline NDArrayFunctionReg& set_num_scalars(unsigned n) {
+ num_scalars = n;
+ return *this;
}
/*!
* \brief set type mask
* \param tmask typemask
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_type_mask(int tmask) {
- type_mask = tmask; return *this;
+ inline NDArrayFunctionReg& set_type_mask(int tmask) {
+ type_mask = tmask;
+ return *this;
}
}; // NDArrayFunctionReg
@@ -1527,7 +1563,7 @@ struct NDArrayFunctionReg
*
* \endcode
*/
-#define MXNET_REGISTER_NDARRAY_FUN(name) \
+#define MXNET_REGISTER_NDARRAY_FUN(name) \
DMLC_REGISTRY_REGISTER(::mxnet::NDArrayFunctionReg, NDArrayFunctionReg, name)
} // namespace mxnet
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index bbce020..95c2f5d 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -23,19 +23,22 @@
* \brief ndarry module of mxnet
*/
#include <dmlc/io.h>
-#include <dmlc/memory_io.h>
#include <dmlc/logging.h>
+#include <dmlc/memory_io.h>
#include <dmlc/registry.h>
#include <mxnet/base.h>
+#include <mxnet/imperative.h>
#include <mxnet/ndarray.h>
#include <mxnet/resource.h>
-#include <mxnet/imperative.h>
+
#include <mshadow/tensor.h>
+
#include "./ndarray_function.h"
+
#include "../common/utils.h"
-#include "../operator/tensor/matrix_op-inl.h"
-#include "../operator/tensor/init_op.h"
#include "../operator/nn/mkldnn/mkldnn_base-inl.h"
+#include "../operator/tensor/init_op.h"
+#include "../operator/tensor/matrix_op-inl.h"
#include "../profiler/storage_profiler.h"
#if MXNET_USE_OPENCV
@@ -48,25 +51,32 @@ DMLC_REGISTRY_ENABLE(::mxnet::NDArrayFunctionReg);
namespace mxnet {
-void NDArray::ReInit(const NDArrayStorageType stype, const mxnet::TShape &shape,
- Context ctx, int dtype, bool delay_alloc, const std::vector<int> *pAux_types,
- const mxnet::ShapeVector *pAux_shapes, const mxnet::TShape *pStorage_shapes) {
+void NDArray::ReInit(const NDArrayStorageType stype,
+ const mxnet::TShape& shape,
+ Context ctx,
+ int dtype,
+ bool delay_alloc,
+ const std::vector<int>* pAux_types,
+ const mxnet::ShapeVector* pAux_shapes,
+ const mxnet::TShape* pStorage_shapes) {
Init(stype, shape, dtype);
if (stype != kDefaultStorage) {
const auto sparseStorage = stype == kRowSparseStorage;
if (!sparseStorage && stype != kCSRStorage)
LOG(FATAL) << "Unknown storage type " << stype;
- const auto &aux_types = (pAux_types && pAux_types->size())? *pAux_types :
- std::vector<int>(sparseStorage? 1 : 2, mshadow::kInt64);
+ const auto& aux_types = (pAux_types && pAux_types->size())
+ ? *pAux_types
+ : std::vector<int>(sparseStorage ? 1 : 2, mshadow::kInt64);
- const auto &aux_shapes = (pAux_shapes && pAux_shapes->size()) ? *pAux_shapes :
- ShapeVector(sparseStorage? 1 : 2, TShape(mshadow::Shape1(0)));
+ const auto& aux_shapes = (pAux_shapes && pAux_shapes->size())
+ ? *pAux_shapes
+ : ShapeVector(sparseStorage ? 1 : 2, TShape(mshadow::Shape1(0)));
mxnet::TShape storage_shape;
if (!pStorage_shapes || !pStorage_shapes->Size()) {
if (sparseStorage) {
- storage_shape = shape;
+ storage_shape = shape;
storage_shape[0] = aux_shapes[rowsparse::kIdx][0];
} else {
storage_shape = aux_shapes[csr::kIdx];
@@ -74,26 +84,25 @@ void NDArray::ReInit(const NDArrayStorageType stype, const mxnet::TShape &shape,
} else {
storage_shape = *pStorage_shapes;
}
- ptr_ = std::make_shared<Chunk>(stype, storage_shape, ctx, delay_alloc,
- dtype, aux_types, aux_shapes);
+ ptr_ = std::make_shared<Chunk>(
+ stype, storage_shape, ctx, delay_alloc, dtype, aux_types, aux_shapes);
} else {
ptr_ = std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype);
}
}
-void NDArray::AssignStorageInfo(const std::string& profiler_scope,
- const std::string& name) {
+void NDArray::AssignStorageInfo(const std::string& profiler_scope, const std::string& name) {
if (is_none()) {
return;
}
ptr_->shandle.profiler_scope = profiler_scope;
- ptr_->shandle.name = name;
+ ptr_->shandle.name = name;
#if MXNET_USE_CUDA
profiler::GpuDeviceStorageProfiler::Get()->UpdateStorageInfo(ptr_->shandle);
#endif // MXNET_USE_CUDA
for (Storage::Handle& aux_handle : ptr_->aux_handles) {
aux_handle.profiler_scope = profiler_scope;
- aux_handle.name = name + "_aux_data";
+ aux_handle.name = name + "_aux_data";
#if MXNET_USE_CUDA
profiler::GpuDeviceStorageProfiler::Get()->UpdateStorageInfo(aux_handle);
#endif // MXNET_USE_CUDA
@@ -118,38 +127,40 @@ struct ChunkMem {
NDArray::Chunk::~Chunk() {
bool skip_free = static_data || delay_alloc;
ChunkMem mem;
- mem.h = this->shandle;
+ mem.h = this->shandle;
mem.aux_h = this->aux_handles;
#if MXNET_USE_ONEDNN == 1
// We want to delete mkldnn memory after deleting the variable.
mem.mem = this->mkl_mem_;
#endif
if (auto engine = engine_ref_.lock()) {
- engine->DeleteVariable([mem, skip_free](RunContext s) {
- if (skip_free == false) {
+ engine->DeleteVariable(
+ [mem, skip_free](RunContext s) {
+ if (skip_free == false) {
#if MXNET_USE_ONEDNN == 1
- if (mem.mem) {
- CHECK_LE(mem.mem->GetSize(), mem.h.size);
- CHECK_EQ(mem.mem->GetDataHandle(), mem.h.dptr);
- }
+ if (mem.mem) {
+ CHECK_LE(mem.mem->GetSize(), mem.h.size);
+ CHECK_EQ(mem.mem->GetDataHandle(), mem.h.dptr);
+ }
#endif
- Storage::Get()->Free(mem.h);
- for (const auto &aux : mem.aux_h) {
- Storage::Get()->Free(aux);
- }
- }
- }, shandle.ctx, var);
+ Storage::Get()->Free(mem.h);
+ for (const auto& aux : mem.aux_h) {
+ Storage::Get()->Free(aux);
+ }
+ }
+ },
+ shandle.ctx,
+ var);
}
}
-void NDArray::Chunk::CheckAndAllocData(const mxnet::TShape &shape, int dtype) {
- CHECK_NE(aux_shapes.size(), 0)
- << "data is expected to be allocated after aux_data";
+void NDArray::Chunk::CheckAndAllocData(const mxnet::TShape& shape, int dtype) {
+ CHECK_NE(aux_shapes.size(), 0) << "data is expected to be allocated after aux_data";
auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype);
if (!features::is_enabled(features::INT64_TENSOR_SIZE)) {
- CHECK_LT(shape.Size(), (int64_t{1} << 31) - 1) <<
- "[CheckAndAllocData] Size of tensor you are trying to allocate is larger than "
- "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
+ CHECK_LT(shape.Size(), (int64_t{1} << 31) - 1)
+ << "[CheckAndAllocData] Size of tensor you are trying to allocate is larger than "
+ "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
}
if (shandle.size < dbytes) {
// free storage
@@ -168,7 +179,8 @@ void NDArray::Chunk::CheckAndAllocData(const mxnet::TShape &shape, int dtype) {
}
NDArray NDArray::grad() const {
- if (Imperative::AGInfo::IsNone(*this)) return NDArray();
+ if (Imperative::AGInfo::IsNone(*this))
+ return NDArray();
Imperative::AGInfo& info = Imperative::AGInfo::Get(autograd_entry_.node);
if (info.out_grads.size()) {
CHECK_EQ(info.out_grads.size(), 1);
@@ -179,7 +191,7 @@ NDArray NDArray::grad() const {
nnvm::Symbol NDArray::get_autograd_symbol() const {
CHECK(!Imperative::AGInfo::IsNone(*this))
- << "NDArray is not part of a computation graph. Did you forget to turn on recording?";
+ << "NDArray is not part of a computation graph. Did you forget to turn on recording?";
nnvm::Symbol ret;
ret.outputs.emplace_back(autograd_entry_);
return ret;
@@ -187,36 +199,36 @@ nnvm::Symbol NDArray::get_autograd_symbol() const {
#if MXNET_USE_ONEDNN == 1
-NDArray::NDArray(const mkldnn::memory::desc &md)
+NDArray::NDArray(const mkldnn::memory::desc& md)
: storage_type_(kDefaultStorage), autograd_entry_(nullptr) {
shape_ = mxnet::TShape(md.data.dims, md.data.dims + md.data.ndims);
dtype_ = get_mxnet_type(md.data.data_type);
- ptr_ = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_);
+ ptr_ = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_);
ptr_->CheckAndAlloc(md.get_size());
ptr_->mkl_mem_ = std::make_shared<MKLDNNMemory>(md, ptr_->shandle.dptr);
}
-NDArray::NDArray(const std::shared_ptr<mkldnn::memory> &mkldnn_mem)
+NDArray::NDArray(const std::shared_ptr<mkldnn::memory>& mkldnn_mem)
: storage_type_(kDefaultStorage), autograd_entry_(nullptr) {
- auto mem_desc = mkldnn_mem->get_desc();
- shape_ = mxnet::TShape(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims);
- dtype_ = get_mxnet_type(mem_desc.data.data_type);
- ptr_ = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_);
+ auto mem_desc = mkldnn_mem->get_desc();
+ shape_ = mxnet::TShape(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims);
+ dtype_ = get_mxnet_type(mem_desc.data.data_type);
+ ptr_ = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_);
ptr_->shandle.dptr = mkldnn_mem->get_data_handle();
ptr_->shandle.size = mem_desc.get_size();
- ptr_->delay_alloc = false;
- ptr_->mkl_mem_ = std::make_shared<MKLDNNMemory>(mkldnn_mem);
- ptr_->static_data = true;
+ ptr_->delay_alloc = false;
+ ptr_->mkl_mem_ = std::make_shared<MKLDNNMemory>(mkldnn_mem);
+ ptr_->static_data = true;
}
-NDArray NDArray::MKLDNNDataReshape(const mxnet::TShape &shape) const {
+NDArray NDArray::MKLDNNDataReshape(const mxnet::TShape& shape) const {
CHECK(!is_none()) << "NDArray is not initialized";
CHECK_GE(shape_.Size(), shape.Size())
- << "NDArray.Reshape: target shape size is larger current shape";
+ << "NDArray.Reshape: target shape size is larger current shape";
CHECK_EQ(storage_type(), kDefaultStorage);
if (!IsMKLDNNData()) {
NDArray ret = this->Detach();
- ret.shape_ = shape;
+ ret.shape_ = shape;
return ret;
} else {
NDArray ret(shape, ctx(), true, dtype());
@@ -224,32 +236,32 @@ NDArray NDArray::MKLDNNDataReshape(const mxnet::TShape &shape) const {
// be called in operators.
mkldnn_format_tag_t format = ptr_->mkl_mem_->GetDefaultFormat();
CHECK(ptr_->IsMKLDNN());
- mkldnn::memory::desc def_desc = ptr_->mkl_mem_->GetDesc(format);
- mkldnn::memory *def_mem = TmpMemMgr::Get()->Alloc(def_desc);
- MKLDNNStream *stream = MKLDNNStream::Get();
+ mkldnn::memory::desc def_desc = ptr_->mkl_mem_->GetDesc(format);
+ mkldnn::memory* def_mem = TmpMemMgr::Get()->Alloc(def_desc);
+ MKLDNNStream* stream = MKLDNNStream::Get();
std::shared_ptr<mkldnn::memory> curr_mem = ptr_->mkl_mem_->GetMem();
stream->RegisterMem(curr_mem);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *curr_mem},
- {MKLDNN_ARG_TO, *def_mem}});
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, *curr_mem}, {MKLDNN_ARG_TO, *def_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(*curr_mem, *def_mem), args);
// def_mem points to a memory region in the temp space. It's only valid
// inside an operator. As such, the returned NDArray can only be valid
// inside an operator and the shared point doesn't need to do anything
// when it's destroyed.
- auto tmp = std::shared_ptr<mkldnn::memory>(def_mem, [](mkldnn::memory *mem) {});
+ auto tmp = std::shared_ptr<mkldnn::memory>(def_mem, [](mkldnn::memory* mem) {});
ret.ptr_->mkl_mem_.reset(new MKLDNNMemory(tmp));
ret.ptr_->shandle.dptr = def_mem->get_data_handle();
ret.ptr_->shandle.size = def_mem->get_desc().get_size();
- ret.ptr_->delay_alloc = false;
- ret.ptr_->static_data = true;
- ret.byte_offset_ = byte_offset_;
- ret.reuse_ = false;
+ ret.ptr_->delay_alloc = false;
+ ret.ptr_->static_data = true;
+ ret.byte_offset_ = byte_offset_;
+ ret.reuse_ = false;
return ret;
}
}
#endif
-NDArray NDArray::Reshape(const mxnet::TShape &shape) const {
+NDArray NDArray::Reshape(const mxnet::TShape& shape) const {
CHECK(!is_none()) << "NDArray is not initialized";
if (Imperative::Get()->is_np_shape()) {
CHECK_EQ(shape_.Size(), shape.Size())
@@ -270,8 +282,8 @@ NDArray NDArray::Reshape(const mxnet::TShape &shape) const {
return ret;
}
-NDArray NDArray::ReshapeWithRecord(const mxnet::TShape &shape) {
- bool is_recording = Imperative::Get()->is_recording();
+NDArray NDArray::ReshapeWithRecord(const mxnet::TShape& shape) {
+ bool is_recording = Imperative::Get()->is_recording();
bool is_deferred_compute = Imperative::Get()->is_deferred_compute();
NDArray ret;
if (!is_deferred_compute) {
@@ -303,11 +315,11 @@ NDArray NDArray::ReshapeWithRecord(const mxnet::TShape &shape) {
std::ostringstream os;
os << shape;
if (!Imperative::Get()->is_np_shape()) {
- attrs.op = nnvm::Op::Get("Reshape");;
- attrs.dict.insert({"shape", os.str()});
+ attrs.op = nnvm::Op::Get("Reshape");
+ attrs.dict.insert({"shape", os.str()});
} else {
- attrs.op = nnvm::Op::Get("_np_reshape");;
- attrs.dict.insert({"newshape", os.str()});
+ attrs.op = nnvm::Op::Get("_np_reshape");
+ attrs.dict.insert({"newshape", os.str()});
}
attrs.op->attr_parser(&attrs);
std::vector<NDArray*> inputs(1, this), outputs(1, &ret);
@@ -322,22 +334,20 @@ NDArray NDArray::ReshapeWithRecord(const mxnet::TShape &shape) {
NDArray NDArray::Slice(index_t begin, index_t end) const {
CHECK(!is_none()) << "NDArray is empty";
- CHECK_LE(begin, end)
- << "Invalid slicing range [" << begin << ", " << end << ")";
+ CHECK_LE(begin, end) << "Invalid slicing range [" << begin << ", " << end << ")";
CHECK_GE(shape_[0], end) << "Slice end index out of range";
CHECK_EQ(storage_type(), kDefaultStorage);
- NDArray ret = this->Detach();
+ NDArray ret = this->Detach();
size_t length = shape_.ProdShape(1, shape_.ndim());
- MSHADOW_TYPE_SWITCH_WITH_BOOL(ret.dtype(), DType, {
- ret.byte_offset_ += begin * length * sizeof(DType);
- });
- ret.reuse_ = false;
+ MSHADOW_TYPE_SWITCH_WITH_BOOL(
+ ret.dtype(), DType, { ret.byte_offset_ += begin * length * sizeof(DType); });
+ ret.reuse_ = false;
ret.shape_[0] = end - begin;
return ret;
}
NDArray NDArray::SliceWithRecord(index_t begin, index_t end) {
- bool is_recording = Imperative::Get()->is_recording();
+ bool is_recording = Imperative::Get()->is_recording();
bool is_deferred_compute = Imperative::Get()->is_deferred_compute();
NDArray ret;
if (!is_deferred_compute) {
@@ -378,9 +388,9 @@ NDArray NDArray::SliceWithRecord(index_t begin, index_t end) {
NDArray NDArray::At(index_t idx) const {
CHECK(storage_type() == kDefaultStorage)
<< "Storage type " << storage_type() << " doesn't support At()";
- NDArray ret = this->Slice(idx, idx+1);
+ NDArray ret = this->Slice(idx, idx + 1);
if (shape_.ndim() > 1) {
- return ret.Reshape(mxnet::TShape(shape_.data()+1, shape_.data()+shape_.ndim()));
+ return ret.Reshape(mxnet::TShape(shape_.data() + 1, shape_.data() + shape_.ndim()));
} else {
return ret;
}
@@ -389,33 +399,34 @@ NDArray NDArray::At(index_t idx) const {
NDArray NDArray::AtWithRecord(index_t idx) {
CHECK(storage_type() == kDefaultStorage)
<< "Storage type " << storage_type() << " doesn't support At()";
- NDArray sliced = this->SliceWithRecord(idx, idx+1);
+ NDArray sliced = this->SliceWithRecord(idx, idx + 1);
if (shape_.ndim() > 1 || Imperative::Get()->is_np_shape()) {
// Imperative reshape with concrete shape
- NDArray reshaped = sliced.Reshape(mxnet::TShape(shape_.data()+1, shape_.data()+shape_.ndim()));
+ NDArray reshaped =
+ sliced.Reshape(mxnet::TShape(shape_.data() + 1, shape_.data() + shape_.ndim()));
// Record reshape with magic numbers
nnvm::NodeAttrs attrs;
std::ostringstream os;
if (!Imperative::Get()->is_np_shape()) {
- os << mxnet::TShape({-3, -2}); // See ndarray.py reshape for definition of magic numbers
- attrs.op = nnvm::Op::Get("Reshape");;
- attrs.dict.insert({"shape", os.str()});
+ os << mxnet::TShape({-3, -2}); // See ndarray.py reshape for definition of magic numbers
+ attrs.op = nnvm::Op::Get("Reshape");
+ attrs.dict.insert({"shape", os.str()});
} else {
- // See NumpyXReshapeInferShape for definition of magic numbers
- os << mxnet::TShape({-3, -4});
- attrs.op = nnvm::Op::Get("_npx_reshape");;
- attrs.dict.insert({"newshape", os.str()});
+ // See NumpyXReshapeInferShape for definition of magic numbers
+ os << mxnet::TShape({-3, -4});
+ attrs.op = nnvm::Op::Get("_npx_reshape");
+ attrs.dict.insert({"newshape", os.str()});
}
attrs.op->attr_parser(&attrs);
std::vector<NDArray*> inputs(1, &sliced), outputs(1, &reshaped);
- bool is_recording = Imperative::Get()->is_recording();
+ bool is_recording = Imperative::Get()->is_recording();
bool is_deferred_compute = Imperative::Get()->is_deferred_compute();
if (is_recording) {
- Imperative::Get()->RecordOp(std::move(attrs), inputs, outputs);
+ Imperative::Get()->RecordOp(std::move(attrs), inputs, outputs);
} else if (is_deferred_compute) {
- Imperative::Get()->RecordDeferredCompute(std::move(attrs), inputs, outputs);
+ Imperative::Get()->RecordDeferredCompute(std::move(attrs), inputs, outputs);
}
return reshaped;
@@ -451,20 +462,19 @@ struct NDArrayDLManager {
DLManagedTensor* NDArray::ToDLPack() const {
CHECK(!is_none()) << "NDArray is not initialized";
NDArrayDLManager* dlmanager(new NDArrayDLManager);
- dlmanager->handle = *this;
- dlmanager->tensor.dl_tensor = dlmanager->handle.data().dltensor();
+ dlmanager->handle = *this;
+ dlmanager->tensor.dl_tensor = dlmanager->handle.data().dltensor();
dlmanager->tensor.manager_ctx = dlmanager;
- dlmanager->tensor.deleter = [](DLManagedTensor* dlmanager){
+ dlmanager->tensor.deleter = [](DLManagedTensor* dlmanager) {
delete static_cast<NDArrayDLManager*>(dlmanager->manager_ctx);
};
return &(dlmanager->tensor);
}
NDArray NDArray::FromDLPack(const DLManagedTensor* tensor, bool transient_handle) {
- DLManagedTensor *tensor_copy = transient_handle
- ? new DLManagedTensor(*tensor)
- : const_cast<DLManagedTensor*>(tensor);
- auto deleter = [tensor_copy, transient_handle](){
+ DLManagedTensor* tensor_copy =
+ transient_handle ? new DLManagedTensor(*tensor) : const_cast<DLManagedTensor*>(tensor);
+ auto deleter = [tensor_copy, transient_handle]() {
if (tensor_copy->deleter != nullptr) {
tensor_copy->deleter(tensor_copy);
}
@@ -476,17 +486,17 @@ NDArray NDArray::FromDLPack(const DLManagedTensor* tensor, bool transient_handle
}
bool NDArray::fresh_out_grad() const {
- if (Imperative::AGInfo::IsNone(*this)) return false;
+ if (Imperative::AGInfo::IsNone(*this))
+ return false;
Imperative::AGInfo& info = Imperative::AGInfo::Get(autograd_entry_.node);
return info.fresh_out_grad;
}
-
void NDArray::set_fresh_out_grad(bool state) const {
CHECK(!Imperative::AGInfo::IsNone(*this))
- << "NDArray has not been marked as a variable and does not have gradient state";
+ << "NDArray has not been marked as a variable and does not have gradient state";
Imperative::AGInfo& info = Imperative::AGInfo::Get(autograd_entry_.node);
- info.fresh_out_grad = state;
+ info.fresh_out_grad = state;
}
#if MXNET_USE_ONEDNN == 1
@@ -516,7 +526,7 @@ void NDArray::Chunk::Reorder2Default() {
if (IsDefault())
return;
- mkldnn_format_tag_t format = mkl_mem_->GetDefaultFormat();
+ mkldnn_format_tag_t format = mkl_mem_->GetDefaultFormat();
mkldnn::memory::desc def_desc = mkl_mem_->GetDesc(format);
mkldnn_mem_ptr def_mem(new mkldnn::memory(def_desc, CpuEngine::Get()->get_engine()));
mkl_mem_->ReorderTo(def_mem.get());
@@ -528,7 +538,7 @@ void NDArray::Chunk::Reorder2Default() {
mkl_mem_ = nullptr;
}
-void NDArray::Chunk::MKLDNNDataReorder(const mkldnn::memory::desc &md) {
+void NDArray::Chunk::MKLDNNDataReorder(const mkldnn::memory::desc& md) {
// If the memory already uses the specified layout, don't do anything.
if (mkl_mem_ != nullptr && mkl_mem_->SameFormat(md))
return;
@@ -548,7 +558,7 @@ void NDArray::Chunk::MKLDNNDataReorder(const mkldnn::memory::desc &md) {
std::shared_ptr<mkldnn::memory> old_mem;
if (IsDefault()) {
mkldnn_format_tag_t def_format = GetDefaultFormat(md);
- mkldnn::memory::desc def_desc = GetDesc(md, def_format);
+ mkldnn::memory::desc def_desc = GetDesc(md, def_format);
old_mem.reset(new mkldnn::memory(def_desc, engine, shandle.dptr));
} else {
old_mem = this->mkl_mem_->GetMem();
@@ -565,12 +575,11 @@ void NDArray::Chunk::MKLDNNDataReorder(const mkldnn::memory::desc &md) {
mkl_mem_.reset(new MKLDNNMemory(md, shandle.dptr));
}
-void NDArray::Chunk::SetMKLMem(const mxnet::TShape &shape, int dtype) {
+void NDArray::Chunk::SetMKLMem(const mxnet::TShape& shape, int dtype) {
// The shape of the array and the one of the MKL memory may mismatch.
// For example, if the array stores parameters, the MKL memory may store data
// in 5 dimensions while the NDArray stores data in 4 dimensions.
- if (mkl_mem_ && mkl_mem_->GetDataHandle() == shandle.dptr
- && mkl_mem_->SameFormat(shape, dtype)) {
+ if (mkl_mem_ && mkl_mem_->GetDataHandle() == shandle.dptr && mkl_mem_->SameFormat(shape, dtype)) {
return;
}
@@ -585,12 +594,24 @@ void NDArray::Chunk::SetMKLMem(const mxnet::TShape &shape, int dtype) {
}
mkldnn::memory::format_tag layout = mkldnn::memory::format_tag::undef;
switch (dims.size()) {
- case 1: layout = mkldnn::memory::format_tag::a; break;
- case 2: layout = mkldnn::memory::format_tag::ab; break;
- case 3: layout = mkldnn::memory::format_tag::abc; break;
- case 4: layout = mkldnn::memory::format_tag::abcd; break;
- case 5: layout = mkldnn::memory::format_tag::abcde; break;
- case 6: layout = mkldnn::memory::format_tag::abcdef; break;
+ case 1:
+ layout = mkldnn::memory::format_tag::a;
+ break;
+ case 2:
+ layout = mkldnn::memory::format_tag::ab;
+ break;
+ case 3:
+ layout = mkldnn::memory::format_tag::abc;
+ break;
+ case 4:
+ layout = mkldnn::memory::format_tag::abcd;
+ break;
+ case 5:
+ layout = mkldnn::memory::format_tag::abcde;
+ break;
+ case 6:
+ layout = mkldnn::memory::format_tag::abcdef;
+ break;
default:
LOG(FATAL) << "Not implemented dimension (" << dims.size() << ") for MKLDNN";
}
@@ -603,12 +624,12 @@ void NDArray::Chunk::SetMKLMem(const mxnet::TShape &shape, int dtype) {
mkl_mem_.reset(new MKLDNNMemory(data_md, shandle.dptr));
}
-const mkldnn::memory *NDArray::GetMKLDNNData(const mkldnn::memory::desc &desc) const {
+const mkldnn::memory* NDArray::GetMKLDNNData(const mkldnn::memory::desc& desc) const {
if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
return nullptr;
}
- const mkldnn::memory *mem = GetMKLDNNData();
+ const mkldnn::memory* mem = GetMKLDNNData();
mkldnn::memory::desc desc1 = mem->get_desc();
// The MKL memory has the same format and shape as required,
// or both use the default format, we can return the MKL memory.
@@ -619,13 +640,12 @@ const mkldnn::memory *NDArray::GetMKLDNNData(const mkldnn::memory::desc &desc) c
}
}
-const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
- const mkldnn::memory::desc &new_desc) const {
+const mkldnn::memory* NDArray::GetMKLDNNDataReorder(const mkldnn::memory::desc& new_desc) const {
CHECK(storage_type() == kDefaultStorage);
- const mkldnn::memory *mem = GetMKLDNNData();
+ const mkldnn::memory* mem = GetMKLDNNData();
// If the memory descriptor matches, it's easy.
- MKLDNNStream *stream = MKLDNNStream::Get();
+ MKLDNNStream* stream = MKLDNNStream::Get();
if (mem->get_desc() == new_desc) {
return GetMKLDNNExact(mem, new_desc);
}
@@ -634,13 +654,13 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
// Now we need to determine if we should reorder the memory.
// If both use the default formats, we think we don't need to reorder.
if ((!mxnet::IsMKLDNN(old_desc)) && (!mxnet::IsMKLDNN(new_desc))) {
- mkldnn_mem_ptr ret(new mkldnn::memory(new_desc,
- CpuEngine::Get()->get_engine(), mem->get_data_handle()));
+ mkldnn_mem_ptr ret(
+ new mkldnn::memory(new_desc, CpuEngine::Get()->get_engine(), mem->get_data_handle()));
stream->RegisterMem(ret);
return ret.get();
} else if (same_shape(old_desc, new_desc)) {
// If they have the same shape, we can reorder data directly.
- mkldnn::memory *ret = TmpMemMgr::Get()->Alloc(new_desc);
+ mkldnn::memory* ret = TmpMemMgr::Get()->Alloc(new_desc);
std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *mem}, {MKLDNN_ARG_TO, *ret}});
stream->RegisterPrimArgs(mkldnn::reorder(*mem, *ret), args);
return ret;
@@ -651,14 +671,14 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
mxnet::TShape required_shape(new_desc.data.ndims, -1);
for (int i = 0; i < new_desc.data.ndims; i++)
required_shape[i] = new_desc.data.dims[i];
- NDArray reshaped = MKLDNNDataReshape(required_shape);
- const mkldnn::memory *ret = reshaped.GetMKLDNNData();
+ NDArray reshaped = MKLDNNDataReshape(required_shape);
+ const mkldnn::memory* ret = reshaped.GetMKLDNNData();
if (ret->get_desc() == new_desc) {
return GetMKLDNNExact(ret, new_desc);
} else {
- mkldnn::memory *ret2 = TmpMemMgr::Get()->Alloc(new_desc);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *ret},
- {MKLDNN_ARG_TO, *ret2}});
+ mkldnn::memory* ret2 = TmpMemMgr::Get()->Alloc(new_desc);
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, *ret}, {MKLDNN_ARG_TO, *ret2}});
stream->RegisterPrimArgs(mkldnn::reorder(*ret, *ret2), args);
return ret2;
}
@@ -676,17 +696,18 @@ NDArray NDArray::Reorder2Default() const {
// create new ndarray from mkldnn layout
mkldnn::memory::desc from_desc = ptr_->mkl_mem_->GetDesc();
mxnet::TShape tshape(from_desc.data.ndims, -1);
- for (int i = 0; i < from_desc.data.ndims; i++) tshape[i] = from_desc.data.dims[i];
+ for (int i = 0; i < from_desc.data.ndims; i++)
+ tshape[i] = from_desc.data.dims[i];
NDArray ret(tshape, ctx(), false, dtype());
- mkldnn_format_tag_t format = ptr_->mkl_mem_->GetDefaultFormat();
+ mkldnn_format_tag_t format = ptr_->mkl_mem_->GetDefaultFormat();
mkldnn::memory::desc def_desc = ptr_->mkl_mem_->GetDesc(format);
CHECK(ret.ptr_->shandle.size >= def_desc.get_size());
mkldnn::memory def_mem(def_desc, CpuEngine::Get()->get_engine(), ret.ptr_->shandle.dptr);
ptr_->mkl_mem_->ReorderTo(&def_mem);
// reshape as needed
- ret.shape_ = shape_;
+ ret.shape_ = shape_;
ret.byte_offset_ = byte_offset_;
- ret.reuse_ = false;
+ ret.reuse_ = false;
return ret;
}
@@ -706,19 +727,19 @@ void NDArray::SelfReorder2Default() {
for (int i = 0; i < from_desc.data.ndims; i++)
tshape[i] = from_desc.data.dims[i];
- const auto saved_shape = shape_;
+ const auto saved_shape = shape_;
const auto saved_byte_offset = byte_offset_;
- this->ReInit(kDefaultStorage, tshape, ctx(), dtype(), false);
+ this->ReInit(kDefaultStorage, tshape, ctx(), dtype(), false);
- mkldnn_format_tag_t format = mkl_mem->GetDefaultFormat();
+ mkldnn_format_tag_t format = mkl_mem->GetDefaultFormat();
mkldnn::memory::desc def_desc = mkl_mem->GetDesc(format);
CHECK(ptr_->shandle.size >= def_desc.get_size());
mkldnn::memory def_mem(def_desc, CpuEngine::Get()->get_engine(), ptr_->shandle.dptr);
mkl_mem->ReorderTo(&def_mem);
// reshape as needed
- shape_ = saved_shape;
+ shape_ = saved_shape;
byte_offset_ = saved_byte_offset;
- reuse_ = false;
+ reuse_ = false;
}
void NDArray::Reorder2DefaultAsync() const {
@@ -726,17 +747,22 @@ void NDArray::Reorder2DefaultAsync() const {
std::vector<Engine::VarHandle> mutable_vars(1, this->var());
NDArray tmp = *this;
Engine::Get()->PushAsync(
- [tmp](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- tmp.ptr_->Reorder2Default();
- on_complete();
- }, ctx(), const_vars, mutable_vars,
- FnProperty::kNormal, 0, "Reorder2Default");
+ [tmp](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ tmp.ptr_->Reorder2Default();
+ on_complete();
+ },
+ ctx(),
+ const_vars,
+ mutable_vars,
+ FnProperty::kNormal,
+ 0,
+ "Reorder2Default");
}
// now just support bf16->fp32
NDArray NDArray::Reorder2DefaultFloatFormat() const {
CHECK(storage_type() == kDefaultStorage && IsView() == false);
- if (dtype() != mshadow::kBfloat16) {
+ if (dtype() != mshadow::kBfloat16) {
return Reorder2Default();
}
NDArray ret(shape(), ctx(), false, mshadow::DataType<float>::kFlag);
@@ -747,24 +773,29 @@ NDArray NDArray::Reorder2DefaultFloatFormat() const {
return ret;
}
-void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::desc &desc) const {
+void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::desc& desc) const {
std::vector<Engine::VarHandle> const_vars;
std::vector<Engine::VarHandle> mutable_vars(1, this->var());
- NDArray tmp = *this;
+ NDArray tmp = *this;
const auto version = this->version();
Engine::Get()->PushAsync(
- [tmp, version, desc](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- // MXNet will try to reuse NDArray from memory planning, so we need to ensure
- // the NDArray is still holding the original trunk data.
- if (tmp.version() == version) {
- tmp.ptr_->MKLDNNDataReorder(desc);
- }
- on_complete();
- }, ctx(), const_vars, mutable_vars,
- FnProperty::kNormal, 0, "Reorder");
+ [tmp, version, desc](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ // MXNet will try to reuse NDArray from memory planning, so we need to ensure
+ // the NDArray is still holding the original trunk data.
+ if (tmp.version() == version) {
+ tmp.ptr_->MKLDNNDataReorder(desc);
+ }
+ on_complete();
+ },
+ ctx(),
+ const_vars,
+ mutable_vars,
+ FnProperty::kNormal,
+ 0,
+ "Reorder");
}
-const mkldnn::memory *NDArray::GetMKLDNNData() const {
+const mkldnn::memory* NDArray::GetMKLDNNData() const {
CHECK(storage_type() == kDefaultStorage);
const auto is_view = IsView();
if (IsMKLDNNData()) {
@@ -782,14 +813,14 @@ const mkldnn::memory *NDArray::GetMKLDNNData() const {
// If this is a view, we can't create a MKLDNN memory for the chunk
// because we don't have the complete data type and shape information for
// the chunk.
- void *off_addr = static_cast<char *>(ptr_->shandle.dptr) + byte_offset_;
+ void* off_addr = static_cast<char*>(ptr_->shandle.dptr) + byte_offset_;
// Create the primitive desc for the new mkldnn memory.
mkldnn::memory::dims dims(shape().ndim());
for (size_t i = 0; i < dims.size(); i++)
dims[i] = shape()[i];
- const auto cpp_format = static_cast<mkldnn::memory::format_tag>(
- GetDefaultFormat(shape().ndim()));
+ const auto cpp_format =
+ static_cast<mkldnn::memory::format_tag>(GetDefaultFormat(shape().ndim()));
mkldnn::memory::desc data_md(dims, get_mkldnn_type(dtype_), cpp_format);
std::shared_ptr<mkldnn::memory> ret(
new mkldnn::memory(data_md, CpuEngine::Get()->get_engine(), off_addr));
@@ -809,7 +840,7 @@ void NDArray::InvalidateMKLDNNData() {
ptr_->mkl_mem_ = nullptr;
}
-void NDArray::CopyFrom(const mkldnn::memory &mem) {
+void NDArray::CopyFrom(const mkldnn::memory& mem) {
CHECK(ptr_ != nullptr) << "The NDArray hasn't been initialized";
if (ptr_->mkl_mem_ && ptr_->mkl_mem_->GetRaw() == &mem)
return;
@@ -822,15 +853,15 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) {
if (IsMKLDNNData() && IsView())
ptr_->Reorder2Default();
- const mkldnn::memory *this_mem = GetMKLDNNData();
+ const mkldnn::memory* this_mem = GetMKLDNNData();
MKLDNNMemoryCopy(mem, this_mem);
}
-mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::desc &desc) {
+mkldnn::memory* NDArray::CreateMKLDNNData(const mkldnn::memory::desc& desc) {
if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc. "
- << "MKLDNN memory requests for " << desc.get_size() << " bytes, but got "
- << shape().Size() * GetTypeSize(dtype_) << " bytes from NDArray";
+ << "MKLDNN memory requests for " << desc.get_size() << " bytes, but got "
+ << shape().Size() * GetTypeSize(dtype_) << " bytes from NDArray";
return nullptr;
}
bool isDefaultFormat = IsDefaultFormat(desc);
@@ -843,8 +874,10 @@ mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::desc &desc) {
CHECK(ptr_->shandle.dptr);
// When this is a view and a user wants the default layout, we can simply
// create a new mkldnn memory that points to the right memory.
- std::shared_ptr<mkldnn::memory> mem(new mkldnn::memory(desc,
- CpuEngine::Get()->get_engine(), static_cast<char *>(ptr_->shandle.dptr) + byte_offset_));
+ std::shared_ptr<mkldnn::memory> mem(
+ new mkldnn::memory(desc,
+ CpuEngine::Get()->get_engine(),
+ static_cast<char*>(ptr_->shandle.dptr) + byte_offset_));
MKLDNNStream::Get()->RegisterMem(mem);
return mem.get();
} else if (IsView()) {
@@ -870,9 +903,9 @@ mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::desc &desc) {
return ptr_->mkl_mem_->GetRaw();
}
-void NDArray::UpdateMKLDNNMemDesc(const mkldnn::memory::desc &desc) {
- auto new_desc = desc;
- auto this_dtype = get_mkldnn_type(dtype());
+void NDArray::UpdateMKLDNNMemDesc(const mkldnn::memory::desc& desc) {
+ auto new_desc = desc;
+ auto this_dtype = get_mkldnn_type(dtype());
new_desc.data.data_type = static_cast<mkldnn_data_type_t>(this_dtype);
ptr_->mkl_mem_.reset(new MKLDNNMemory(new_desc, ptr_->shandle.dptr));
MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
@@ -883,12 +916,12 @@ void NDArray::UpdateMKLDNNMemDesc(const mkldnn::memory::desc &desc) {
void NDArray::SetTBlob() const {
CHECK(ptr_ != nullptr);
mxnet::TShape shape = shape_;
- char *dptr = static_cast<char*>(ptr_->shandle.dptr);
- auto stype = storage_type();
+ char* dptr = static_cast<char*>(ptr_->shandle.dptr);
+ auto stype = storage_type();
if (stype == kDefaultStorage) {
#if MXNET_USE_ONEDNN == 1
CHECK(!IsMKLDNNData()) << "We can't generate TBlob for MKLDNN data. "
- << "Please use Reorder2Default() to generate a new NDArray first";
+ << "Please use Reorder2Default() to generate a new NDArray first";
#endif
dptr += byte_offset_;
} else if (stype == kCSRStorage || stype == kRowSparseStorage) {
@@ -897,27 +930,24 @@ void NDArray::SetTBlob() const {
} else {
LOG(FATAL) << "unknown storage type " << stype;
}
- tblob_.dptr_ = dptr;
- tblob_.shape_ = shape;
+ tblob_.dptr_ = dptr;
+ tblob_.shape_ = shape;
tblob_.type_flag_ = dtype_;
tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id);
}
/*!
-* \brief run a ternary operation
-* \param lhs left operand
-* \param mhs middle operand
-* \param rhs right operand
-* \param out the output ndarray
-*/
-template<typename OP>
-void TernaryOp(const NDArray &lhs,
- const NDArray &mhs,
- const NDArray &rhs,
- NDArray *out) {
+ * \brief run a ternary operation
+ * \param lhs left operand
+ * \param mhs middle operand
+ * \param rhs right operand
+ * \param out the output ndarray
+ */
+template <typename OP>
+void TernaryOp(const NDArray& lhs, const NDArray& mhs, const NDArray& rhs, NDArray* out) {
// no check if all of them are on cpu
- if (lhs.ctx().dev_mask() != cpu::kDevMask || mhs.ctx().dev_mask() != cpu::kDevMask
- || rhs.ctx().dev_mask() != cpu::kDevMask) {
+ if (lhs.ctx().dev_mask() != cpu::kDevMask || mhs.ctx().dev_mask() != cpu::kDevMask ||
+ rhs.ctx().dev_mask() != cpu::kDevMask) {
CHECK((lhs.ctx() == mhs.ctx()) && (mhs.ctx() == rhs.ctx())) << "operands context mismatch";
}
// if out is none, allocate space
@@ -925,60 +955,75 @@ void TernaryOp(const NDArray &lhs,
*out = NDArray(OP::GetShape(lhs.shape(), mhs.shape(), rhs.shape()), lhs.ctx(), true);
} else {
// no check if both of them are on cpu
- if (lhs.ctx().dev_mask() != cpu::kDevMask ||
- out->ctx().dev_mask() != cpu::kDevMask) {
+ if (lhs.ctx().dev_mask() != cpu::kDevMask || out->ctx().dev_mask() != cpu::kDevMask) {
CHECK(out->ctx() == lhs.ctx()) << "target context mismatch";
}
CHECK(out->shape() == OP::GetShape(lhs.shape(), mhs.shape(), rhs.shape()))
- << "target shape mismatch";
+ << "target shape mismatch";
}
// important: callback must always capture by value
NDArray ret = *out;
// get the const variables
std::vector<Engine::VarHandle> const_vars;
- if (lhs.var() != ret.var()) const_vars.push_back(lhs.var());
- if (mhs.var() != ret.var()) const_vars.push_back(mhs.var());
- if (rhs.var() != ret.var()) const_vars.push_back(rhs.var());
+ if (lhs.var() != ret.var())
+ const_vars.push_back(lhs.var());
+ if (mhs.var() != ret.var())
+ const_vars.push_back(mhs.var());
+ if (rhs.var() != ret.var())
+ const_vars.push_back(rhs.var());
// redirect everything to mshadow operations
switch (lhs.ctx().dev_mask()) {
- case cpu::kDevMask: {
- Engine::Get()->PushSync([lhs, mhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::Eval<cpu, OP>(lhs.data(), mhs.data(), rhs.data(), &tmp, ctx);
- }, lhs.ctx(), const_vars, { ret.var() },
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
- break;
- }
+ case cpu::kDevMask: {
+ Engine::Get()->PushSync(
+ [lhs, mhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::Eval<cpu, OP>(lhs.data(), mhs.data(), rhs.data(), &tmp, ctx);
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
+ break;
+ }
#if MXNET_USE_CUDA
- case gpu::kDevMask: {
- Engine::Get()->PushSync([lhs, mhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::Eval<gpu, OP>(lhs.data(), mhs.data(), rhs.data(), &tmp, ctx);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, lhs.ctx(), const_vars, { ret.var() },
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
- break;
- }
+ case gpu::kDevMask: {
+ Engine::Get()->PushSync(
+ [lhs, mhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::Eval<gpu, OP>(lhs.data(), mhs.data(), rhs.data(), &tmp, ctx);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
+ break;
+ }
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
/*!
-* \brief Performs some preparation required to apply binary operators.
-* Checks context and shape of ndarrays, allocates space for output
-* and prepares const variables for engine
-* \param lhs left operand
-* \param rhs right operand
-* \param out the output ndarray
-* \param binary_op the real operation
-*/
-template<typename OP>
-std::vector<Engine::VarHandle> BinaryOpPrepare(const NDArray &lhs,
- const NDArray &rhs,
- NDArray *out) {
+ * \brief Performs some preparation required to apply binary operators.
+ * Checks context and shape of ndarrays, allocates space for output
+ * and prepares const variables for engine
+ * \param lhs left operand
+ * \param rhs right operand
+ * \param out the output ndarray
+ * \param binary_op the real operation
+ */
+template <typename OP>
+std::vector<Engine::VarHandle> BinaryOpPrepare(const NDArray& lhs,
+ const NDArray& rhs,
+ NDArray* out) {
// no check if both of them are on cpu
if (lhs.ctx().dev_mask() != cpu::kDevMask || rhs.ctx().dev_mask() != cpu::kDevMask) {
CHECK(lhs.ctx() == rhs.ctx()) << "operands context mismatch";
@@ -988,59 +1033,69 @@ std::vector<Engine::VarHandle> BinaryOpPrepare(const NDArray &lhs,
*out = NDArray(OP::GetShape(lhs.shape(), rhs.shape()), lhs.ctx(), true, lhs.dtype());
} else {
// no check if both of them are on cpu
- if (lhs.ctx().dev_mask() != cpu::kDevMask ||
- out->ctx().dev_mask() != cpu::kDevMask) {
+ if (lhs.ctx().dev_mask() != cpu::kDevMask || out->ctx().dev_mask() != cpu::kDevMask) {
CHECK(out->ctx() == lhs.ctx()) << "target context mismatch";
}
- CHECK(out->shape() == OP::GetShape(lhs.shape(), rhs.shape()))
- << "target shape mismatch";
+ CHECK(out->shape() == OP::GetShape(lhs.shape(), rhs.shape())) << "target shape mismatch";
}
std::vector<Engine::VarHandle> const_vars;
// prepare const variables for engine
- if (lhs.var() != out->var()) const_vars.push_back(lhs.var());
- if (rhs.var() != out->var()) const_vars.push_back(rhs.var());
+ if (lhs.var() != out->var())
+ const_vars.push_back(lhs.var());
+ if (rhs.var() != out->var())
+ const_vars.push_back(rhs.var());
return const_vars;
}
/*!
-* \brief run a binary operation using the kernel launch method
-* \param lhs left operand
-* \param rhs right operand
-* \param out the output ndarray
-* \param binary_op the real operation
-*/
-template<typename OP>
-void BinaryOpKernel(const NDArray &lhs,
- const NDArray &rhs,
- NDArray *out) {
+ * \brief run a binary operation using the kernel launch method
+ * \param lhs left operand
+ * \param rhs right operand
+ * \param out the output ndarray
+ * \param binary_op the real operation
+ */
+template <typename OP>
+void BinaryOpKernel(const NDArray& lhs, const NDArray& rhs, NDArray* out) {
std::vector<Engine::VarHandle> const_vars = BinaryOpPrepare<OP>(lhs, rhs, out);
// important: callback must always capture by value
NDArray ret = *out;
switch (lhs.ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
- ndarray::BinaryOpKernelImpl<OP>(s, lhs.data(), rhs.data(), &tmp);
- },
- lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
+ ndarray::BinaryOpKernelImpl<OP>(s, lhs.data(), rhs.data(), &tmp);
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
- ndarray::BinaryOpKernelImpl<OP>(s, lhs.data(), rhs.data(), &tmp);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
+ ndarray::BinaryOpKernelImpl<OP>(s, lhs.data(), rhs.data(), &tmp);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
-}
+ }
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
@@ -1051,71 +1106,89 @@ void BinaryOpKernel(const NDArray &lhs,
* \param out the output ndarray
* \param binary_op the real operation
*/
-template<typename OP>
-void BinaryOp(const NDArray &lhs,
- const NDArray &rhs,
- NDArray *out) {
+template <typename OP>
+void BinaryOp(const NDArray& lhs, const NDArray& rhs, NDArray* out) {
std::vector<Engine::VarHandle> const_vars = BinaryOpPrepare<OP>(lhs, rhs, out);
// important: callback must always capture by value
NDArray ret = *out;
// redirect everything to mshadow operations
switch (lhs.ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
TBlob tmp = ret.data();
ndarray::Eval<cpu, OP>(lhs.data(), rhs.data(), &tmp, ctx);
- }, lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::Eval<gpu, OP>(lhs.data(), rhs.data(), &tmp, ctx);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::Eval<gpu, OP>(lhs.data(), rhs.data(), &tmp, ctx);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
-void SetValueOp(const real_t &rhs, NDArray *out) {
+void SetValueOp(const real_t& rhs, NDArray* out) {
CHECK_NE(out->is_none(), true) << "Set value target must not be empty";
// important: callback must always capture by value
- NDArray ret = *out;
+ NDArray ret = *out;
const NDArrayStorageType stype = ret.storage_type();
- Engine::Get()->PushSync([rhs, ret, stype](RunContext ctx) {
- TBlob tmp = ret.data();
- switch (ret.ctx().dev_mask()) {
- case cpu::kDevMask: {
- if (stype == kDefaultStorage) {
- ndarray::Eval<cpu>(rhs, &tmp, ctx);
- } else {
- ndarray::Eval(ctx.get_stream<cpu>(), rhs, ret);
+ Engine::Get()->PushSync(
+ [rhs, ret, stype](RunContext ctx) {
+ TBlob tmp = ret.data();
+ switch (ret.ctx().dev_mask()) {
+ case cpu::kDevMask: {
+ if (stype == kDefaultStorage) {
+ ndarray::Eval<cpu>(rhs, &tmp, ctx);
+ } else {
+ ndarray::Eval(ctx.get_stream<cpu>(), rhs, ret);
+ }
+ break;
}
- break;
- }
#if MXNET_USE_CUDA
- case gpu::kDevMask: {
- if (stype == kDefaultStorage) {
- ndarray::Eval<gpu>(rhs, &tmp, ctx);
- } else {
- ndarray::Eval(ctx.get_stream<gpu>(), rhs, ret);
+ case gpu::kDevMask: {
+ if (stype == kDefaultStorage) {
+ ndarray::Eval<gpu>(rhs, &tmp, ctx);
+ } else {
+ ndarray::Eval(ctx.get_stream<gpu>(), rhs, ret);
+ }
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ break;
}
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- break;
- }
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
- }
- }, ret.ctx(), {}, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ }
+ },
+ ret.ctx(),
+ {},
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
}
/*!
@@ -1125,10 +1198,8 @@ void SetValueOp(const real_t &rhs, NDArray *out) {
* \param out the output ndarray
* \param binary_op the real
*/
-template<typename OP, bool reverse>
-void ScalarOp(const NDArray &lhs,
- const real_t &rhs,
- NDArray *out) {
+template <typename OP, bool reverse>
+void ScalarOp(const NDArray& lhs, const real_t& rhs, NDArray* out) {
if (out->is_none()) {
*out = NDArray(lhs.shape(), lhs.ctx(), true, lhs.dtype());
} else {
@@ -1139,47 +1210,69 @@ void ScalarOp(const NDArray &lhs,
NDArray ret = *out;
// get the const variables
std::vector<Engine::VarHandle> const_vars;
- if (lhs.var() != ret.var()) const_vars.push_back(lhs.var());
+ if (lhs.var() != ret.var())
+ const_vars.push_back(lhs.var());
// redirect everything to mshadow operations
switch (lhs.ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::Eval<cpu, OP, reverse>(lhs.data(), rhs, &tmp, ctx);
- }, lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::Eval<cpu, OP, reverse>(lhs.data(), rhs, &tmp, ctx);
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::Eval<gpu, OP, reverse>(lhs.data(), rhs, &tmp, ctx);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::Eval<gpu, OP, reverse>(lhs.data(), rhs, &tmp, ctx);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
size_t num_aux_data(NDArrayStorageType stype) {
size_t num = 0;
switch (stype) {
- case kDefaultStorage: num = 0; break;
- case kCSRStorage: num = 2; break;
- case kRowSparseStorage: num = 1; break;
- default: LOG(FATAL) << "Unknown storage type" << stype; break;
+ case kDefaultStorage:
+ num = 0;
+ break;
+ case kCSRStorage:
+ num = 2;
+ break;
+ case kRowSparseStorage:
+ num = 1;
+ break;
+ default:
+ LOG(FATAL) << "Unknown storage type" << stype;
+ break;
}
return num;
}
// Make a copy of a CSR NDArray
-template<typename from_xpu, typename to_xpu>
+template <typename from_xpu, typename to_xpu>
inline void CopyFromToCsrImpl(const NDArray& from, const NDArray& to, RunContext ctx) {
using namespace mshadow;
CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type";
@@ -1193,19 +1286,16 @@ inline void CopyFromToCsrImpl(const NDArray& from, const NDArray& to, RunContext
to.CheckAndAllocAuxData(csr::kIndPtr, from.aux_shape(csr::kIndPtr));
to.CheckAndAllocAuxData(csr::kIdx, from.aux_shape(csr::kIdx));
to.CheckAndAllocData(from.aux_shape(csr::kIdx));
- TBlob val = to.data();
+ TBlob val = to.data();
TBlob indptr = to.aux_data(csr::kIndPtr);
- TBlob idx = to.aux_data(csr::kIdx);
- ndarray::Copy<from_xpu, to_xpu>(from.data(), &val,
- from.ctx(), to.ctx(), ctx);
- ndarray::Copy<from_xpu, to_xpu>(from.aux_data(csr::kIndPtr), &indptr,
- from.ctx(), to.ctx(), ctx);
- ndarray::Copy<from_xpu, to_xpu>(from.aux_data(csr::kIdx), &idx,
- from.ctx(), to.ctx(), ctx);
+ TBlob idx = to.aux_data(csr::kIdx);
+ ndarray::Copy<from_xpu, to_xpu>(from.data(), &val, from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(from.aux_data(csr::kIndPtr), &indptr, from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(from.aux_data(csr::kIdx), &idx, from.ctx(), to.ctx(), ctx);
}
// Make a copy of a row-sparse NDArray
-template<typename from_xpu, typename to_xpu>
+template <typename from_xpu, typename to_xpu>
inline void CopyFromToRspImpl(const NDArray& from, const NDArray& to, RunContext ctx) {
using namespace mshadow;
CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type";
@@ -1219,14 +1309,12 @@ inline void CopyFromToRspImpl(const NDArray& from, const NDArray& to, RunContext
to.CheckAndAlloc({aux_shape});
TBlob val = to.data();
TBlob idx = to.aux_data(rowsparse::kIdx);
- ndarray::Copy<from_xpu, to_xpu>(from.data(), &val,
- from.ctx(), to.ctx(), ctx);
- ndarray::Copy<from_xpu, to_xpu>(from.aux_data(rowsparse::kIdx), &idx,
- from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(from.data(), &val, from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(from.aux_data(rowsparse::kIdx), &idx, from.ctx(), to.ctx(), ctx);
}
// Make a copy of a dense NDArray
-template<typename from_xpu, typename to_xpu>
+template <typename from_xpu, typename to_xpu>
inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext ctx) {
#if MXNET_USE_ONEDNN == 1
// If neither is MKLDNN, we can copy data normally.
@@ -1235,23 +1323,19 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
using namespace mshadow;
CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type";
TBlob tmp = to.data();
- ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp,
- from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp, from.ctx(), to.ctx(), ctx);
#if MXNET_USE_ONEDNN == 1
- } else if (SupportMKLDNN(from.dtype(), from.shape())
- && SupportMKLDNN(to.dtype(), to.shape())
- && from.ctx().dev_mask() == cpu::kDevMask
- && to.ctx().dev_mask() == cpu::kDevMask) {
+ } else if (SupportMKLDNN(from.dtype(), from.shape()) && SupportMKLDNN(to.dtype(), to.shape()) &&
+ from.ctx().dev_mask() == cpu::kDevMask && to.ctx().dev_mask() == cpu::kDevMask) {
// If we copy data directly, we need to make sure both NDArrays are supported
// by MKLDNN.
auto from_mem = from.GetMKLDNNData();
- auto to_mem = to.GetMKLDNNData();
+ auto to_mem = to.GetMKLDNNData();
if (from_mem->get_desc() == to_mem->get_desc()) {
- size_t size = std::min(from_mem->get_desc().get_size(),
- to_mem->get_desc().get_size());
+ size_t size = std::min(from_mem->get_desc().get_size(), to_mem->get_desc().get_size());
memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size);
} else {
- const_cast<NDArray &>(to).CopyFrom(*from_mem);
+ const_cast<NDArray&>(to).CopyFrom(*from_mem);
MKLDNNStream::Get()->Submit();
}
} else {
@@ -1261,7 +1345,7 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
NDArray tmp_from = from;
if (tmp_from.IsMKLDNNData()) {
// TODO(zhengda) tmp_from should be cached.
- tmp_from = NDArray(from.shape(), from.ctx(), false, from.dtype());
+ tmp_from = NDArray(from.shape(), from.ctx(), false, from.dtype());
auto tmp_mem = from.GetMKLDNNData();
tmp_from.CopyFrom(*tmp_mem);
MKLDNNStream::Get()->Submit();
@@ -1269,35 +1353,31 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
CHECK(tmp_from.IsDefaultData());
CHECK(to.IsDefaultData());
TBlob tmp = to.data();
- ndarray::Copy<from_xpu, to_xpu>(tmp_from.data(), &tmp,
- from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(tmp_from.data(), &tmp, from.ctx(), to.ctx(), ctx);
}
#endif
}
// Make a copy of an NDArray based on storage type
-template<typename from_xpu, typename to_xpu>
-void CopyFromToImpl(const NDArray& from, const NDArray& to,
- RunContext rctx, const std::vector<Resource>& requested) {
+template <typename from_xpu, typename to_xpu>
+void CopyFromToImpl(const NDArray& from,
+ const NDArray& to,
+ RunContext rctx,
+ const std::vector<Resource>& requested) {
using namespace std;
using namespace mshadow;
// if storage type doesn't match, cast the storage first
const NDArrayStorageType from_stype = from.storage_type();
- const NDArrayStorageType to_stype = to.storage_type();
- CHECK(from_stype == kDefaultStorage
- || to_stype == kDefaultStorage
- || from_stype == to_stype)
- << "Copying ndarray of stype = " << from_stype
- << " to stype = " << to_stype << " is not supported";
+ const NDArrayStorageType to_stype = to.storage_type();
+ CHECK(from_stype == kDefaultStorage || to_stype == kDefaultStorage || from_stype == to_stype)
+ << "Copying ndarray of stype = " << from_stype << " to stype = " << to_stype
+ << " is not supported";
const Context from_ctx = from.ctx();
- const Context to_ctx = to.ctx();
- bool is_train = Imperative::Get()->is_training();
-
- OpContext opctx{Imperative::Get()->is_recording(),
- is_train,
- rctx,
- engine::CallbackOnComplete(),
- requested};
+ const Context to_ctx = to.ctx();
+ bool is_train = Imperative::Get()->is_training();
+
+ OpContext opctx{
+ Imperative::Get()->is_recording(), is_train, rctx, engine::CallbackOnComplete(), requested};
if (from_ctx == to_ctx && from_stype != to_stype) {
// same ctx, different stypes, use cast op directly without copying
common::CastStorageDispatch<from_xpu>(opctx, from, to);
@@ -1305,7 +1385,7 @@ void CopyFromToImpl(const NDArray& from, const NDArray& to,
NDArray casted_nd; // an intermediate result before copying from to to
if (from_stype == to_stype) {
casted_nd = from; // same stype, no need to cast from
- } else { // different stypes on different ctx needs an temporary casted_nd
+ } else { // different stypes on different ctx needs an temporary casted_nd
const mxnet::TShape& shape = from.shape();
if (to_stype == kDefaultStorage) {
casted_nd = NDArray(shape, from_ctx);
@@ -1336,21 +1416,21 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_op
CHECK(from.shape() == to.shape())
<< "operands shape mismatch "
<< "from.shape = " << from.shape() << " to.shape=" << to.shape();
- CHECK(!mxnet::op::shape_is_none(from.shape()))
- << "source operands have undefined shape";
+ CHECK(!mxnet::op::shape_is_none(from.shape())) << "source operands have undefined shape";
// zero-size array, no need to copy
if (from.shape().Size() == 0U) {
return;
}
// important: callback must always capture by value
const Context from_ctx = from.ctx();
- const int a = from_ctx.dev_mask();
- const int b = to.ctx().dev_mask();
+ const int a = from_ctx.dev_mask();
+ const int b = to.ctx().dev_mask();
std::vector<Engine::VarHandle> const_vars;
- if (from.var() != to.var()) const_vars.push_back(from.var());
+ if (from.var() != to.var())
+ const_vars.push_back(from.var());
const NDArrayStorageType from_stype = from.storage_type();
- const NDArrayStorageType to_stype = to.storage_type();
+ const NDArrayStorageType to_stype = to.storage_type();
std::vector<Engine::VarHandle> mutable_vars(1, to.var());
@@ -1373,8 +1453,8 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_op
// request temp resource if cast_storage performs on GPU
if (a == gpu::kDevMask) {
- Resource rsc = ResourceManager::Get()->Request(from_ctx,
- ResourceRequest(ResourceRequest::kTempSpace));
+ Resource rsc =
+ ResourceManager::Get()->Request(from_ctx, ResourceRequest(ResourceRequest::kTempSpace));
requested.push_back(rsc);
mutable_vars.push_back(rsc.var);
}
@@ -1382,38 +1462,57 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_op
if (a == cpu::kDevMask && b == cpu::kDevMask) {
Engine::Get()->PushAsync(
- [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- CopyFromToImpl<cpu, cpu>(from, to, ctx, requested);
- on_complete();
- }, from.ctx(), const_vars, mutable_vars,
- FnProperty::kNormal, priority, "CopyCPU2CPU");
+ [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ CopyFromToImpl<cpu, cpu>(from, to, ctx, requested);
+ on_complete();
+ },
+ from.ctx(),
+ const_vars,
+ mutable_vars,
+ FnProperty::kNormal,
+ priority,
+ "CopyCPU2CPU");
} else {
#if MXNET_USE_CUDA
if (a == cpu::kDevMask && b == gpu::kDevMask) {
Engine::Get()->PushAsync(
- [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- CopyFromToImpl<cpu, gpu>(from, to, ctx, requested);
- ctx.get_stream<gpu>()->Wait();
- on_complete();
- }, to.ctx(), const_vars, mutable_vars,
- FnProperty::kCopyToGPU, priority, "CopyCPU2GPU");
+ [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ CopyFromToImpl<cpu, gpu>(from, to, ctx, requested);
+ ctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ to.ctx(),
+ const_vars,
+ mutable_vars,
+ FnProperty::kCopyToGPU,
+ priority,
+ "CopyCPU2GPU");
} else if (a == gpu::kDevMask && b == cpu::kDevMask) {
Engine::Get()->PushAsync(
- [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- CopyFromToImpl<gpu, cpu>(from, to, ctx, requested);
- ctx.get_stream<gpu>()->Wait();
- on_complete();
- }, from.ctx(), const_vars, mutable_vars,
- FnProperty::kCopyFromGPU, priority, "CopyGPU2CPU");
+ [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ CopyFromToImpl<gpu, cpu>(from, to, ctx, requested);
+ ctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ from.ctx(),
+ const_vars,
+ mutable_vars,
+ FnProperty::kCopyFromGPU,
+ priority,
+ "CopyGPU2CPU");
} else if (a == gpu::kDevMask && b == gpu::kDevMask) {
Engine::Get()->PushAsync(
- [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- CopyFromToImpl<gpu, gpu>(from, to, ctx, requested);
- ctx.get_stream<gpu>()->Wait();
- on_complete();
- }, from.ctx(), const_vars, mutable_vars,
- from.dtype() != to.dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
- priority, is_opr ? "_copyto_GPU2GPU" : "CopyGPU2GPU");
+ [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ CopyFromToImpl<gpu, gpu>(from, to, ctx, requested);
+ ctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ from.ctx(),
+ const_vars,
+ mutable_vars,
+ from.dtype() != to.dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
+ priority,
+ is_opr ? "_copyto_GPU2GPU" : "CopyGPU2GPU");
} else {
LOG(FATAL) << "unknown device mask";
}
@@ -1423,26 +1522,22 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_op
}
}
-
-void CopyFromTo(const NDArray& from, const NDArray *to, int priority) {
+void CopyFromTo(const NDArray& from, const NDArray* to, int priority) {
CopyFromTo(from, *to, priority);
}
-void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priority) {
+void ElementwiseSum(const std::vector<NDArray>& source, NDArray* out, int priority) {
std::vector<Engine::VarHandle> const_vars;
const_vars.reserve(source.size());
for (const auto& source_array : source) {
if (source_array.var() != out->var()) {
const_vars.push_back(source_array.var());
}
- CHECK_EQ(source_array.shape() , out->shape())
- << "operands shape mismatch";
+ CHECK_EQ(source_array.shape(), out->shape()) << "operands shape mismatch";
if (out->ctx().dev_mask() == Context::kCPU) {
- CHECK_EQ(source_array.ctx().dev_mask(), Context::kCPU)
- << "operands context mismatch";
+ CHECK_EQ(source_array.ctx().dev_mask(), Context::kCPU) << "operands context mismatch";
} else {
- CHECK_EQ(source_array.ctx(), out->ctx())
- << "operands context mismatch";
+ CHECK_EQ(source_array.ctx(), out->ctx()) << "operands context mismatch";
}
}
// important: callback must always capture by value
@@ -1453,67 +1548,84 @@ void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priori
if (stype == kDefaultStorage) {
switch (out->ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([source, ret](RunContext ctx) {
- std::vector<TBlob> source_tblob(source.size());
- for (size_t i = 0; i < source.size(); ++i) {
- source_tblob[i] = source[i].data();
- }
- TBlob tmp = ret.data();
- ndarray::ElementwiseSum<cpu>(source_tblob, &tmp, ctx);
- }, out->ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, priority, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [source, ret](RunContext ctx) {
+ std::vector<TBlob> source_tblob(source.size());
+ for (size_t i = 0; i < source.size(); ++i) {
+ source_tblob[i] = source[i].data();
+ }
+ TBlob tmp = ret.data();
+ ndarray::ElementwiseSum<cpu>(source_tblob, &tmp, ctx);
+ },
+ out->ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ priority,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([source, ret](RunContext ctx) {
- std::vector<TBlob> source_tblob(source.size());
- for (size_t i = 0; i < source.size(); ++i) {
- source_tblob[i] = source[i].data();
- }
- TBlob tmp = ret.data();
- ndarray::ElementwiseSum<gpu>(source_tblob, &tmp, ctx);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, out->ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, priority, "DenseElementwiseSum");
+ Engine::Get()->PushSync(
+ [source, ret](RunContext ctx) {
+ std::vector<TBlob> source_tblob(source.size());
+ for (size_t i = 0; i < source.size(); ++i) {
+ source_tblob[i] = source[i].data();
+ }
+ TBlob tmp = ret.data();
+ ndarray::ElementwiseSum<gpu>(source_tblob, &tmp, ctx);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ out->ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ priority,
+ "DenseElementwiseSum");
break;
}
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
} else if (stype == kRowSparseStorage) {
- Resource rsc = ResourceManager::Get()->Request(ret.ctx(),
- ResourceRequest(ResourceRequest::kTempSpace));
+ Resource rsc =
+ ResourceManager::Get()->Request(ret.ctx(), ResourceRequest(ResourceRequest::kTempSpace));
Engine::Get()->PushSync(
- [source, ret, rsc](RunContext rctx) {
- NDArray result = ret;
- switch (ret.ctx().dev_mask()) {
- case cpu::kDevMask: {
- mxnet::ndarray::ElementwiseSum(rctx.get_stream<cpu>(), rsc, source, &result);
- break;
- }
+ [source, ret, rsc](RunContext rctx) {
+ NDArray result = ret;
+ switch (ret.ctx().dev_mask()) {
+ case cpu::kDevMask: {
+ mxnet::ndarray::ElementwiseSum(rctx.get_stream<cpu>(), rsc, source, &result);
+ break;
+ }
#if MXNET_USE_CUDA
- case gpu::kDevMask: {
- mxnet::ndarray::ElementwiseSum(rctx.get_stream<gpu>(), rsc, source, &result);
- // wait for GPU operations to complete
- rctx.get_stream<gpu>()->Wait();
- break;
- }
+ case gpu::kDevMask: {
+ mxnet::ndarray::ElementwiseSum(rctx.get_stream<gpu>(), rsc, source, &result);
+ // wait for GPU operations to complete
+ rctx.get_stream<gpu>()->Wait();
+ break;
+ }
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
- }
- }, ret.ctx(), const_vars, {ret.var(), rsc.var},
- FnProperty::kNormal, priority, "RowSparseElementwiseSum");
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ }
+ },
+ ret.ctx(),
+ const_vars,
+ {ret.var(), rsc.var},
+ FnProperty::kNormal,
+ priority,
+ "RowSparseElementwiseSum");
} else {
LOG(FATAL) << "Not implemented for storage_type " << common::stype_string(stype);
}
}
-void ClipOp(const NDArray &src,
- const real_t &a_min, const real_t &a_max,
- NDArray *out) {
+void ClipOp(const NDArray& src, const real_t& a_min, const real_t& a_max, NDArray* out) {
if (out->is_none()) {
*out = NDArray(src.shape(), src.ctx(), true, src.dtype());
} else {
@@ -1522,99 +1634,123 @@ void ClipOp(const NDArray &src,
}
NDArray ret = *out;
std::vector<Engine::VarHandle> const_vars;
- if (src.var() != ret.var()) const_vars.push_back(src.var());
+ if (src.var() != ret.var())
+ const_vars.push_back(src.var());
switch (src.ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([src, a_min, a_max, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::EvalClip<cpu>(src.data(), a_min, a_max, &tmp, ctx);
- }, src.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [src, a_min, a_max, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::EvalClip<cpu>(src.data(), a_min, a_max, &tmp, ctx);
+ },
+ src.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
- #if MXNET_USE_CUDA
+#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([src, a_min, a_max, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::EvalClip<gpu>(src.data(), a_min, a_max, &tmp, ctx);
- }, src.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [src, a_min, a_max, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::EvalClip<gpu>(src.data(), a_min, a_max, &tmp, ctx);
+ },
+ src.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
- #endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+#endif
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
-template<typename Distribution>
-void SampleOP(const real_t &a,
- const real_t &b,
- NDArray *out) {
+template <typename Distribution>
+void SampleOP(const real_t& a, const real_t& b, NDArray* out) {
CHECK(!out->is_none());
- Resource resource = ResourceManager::Get()->Request(
- out->ctx(), ResourceRequest::kRandom);
+ Resource resource = ResourceManager::Get()->Request(out->ctx(), ResourceRequest::kRandom);
// important: callback must always capture by value
NDArray ret = *out;
// redirect everything to mshadow operations
switch (out->ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([a, b, resource, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::EvalRandom<cpu, Distribution>(a, b, resource, &tmp, ctx);
- }, out->ctx(), {}, {ret.var(), resource.var},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [a, b, resource, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::EvalRandom<cpu, Distribution>(a, b, resource, &tmp, ctx);
+ },
+ out->ctx(),
+ {},
+ {ret.var(), resource.var},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([a, b, resource, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::EvalRandom<gpu, Distribution>(a, b, resource, &tmp, ctx);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, out->ctx(), {}, {ret.var(), resource.var},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [a, b, resource, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::EvalRandom<gpu, Distribution>(a, b, resource, &tmp, ctx);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ out->ctx(),
+ {},
+ {ret.var(), resource.var},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
-void SampleUniform(real_t begin, real_t end, NDArray *out) {
+void SampleUniform(real_t begin, real_t end, NDArray* out) {
SampleOP<ndarray::UniformDistribution>(begin, end, out);
}
-void SampleGaussian(real_t mu, real_t sigma, NDArray *out) {
+void SampleGaussian(real_t mu, real_t sigma, NDArray* out) {
SampleOP<ndarray::GaussianDistribution>(mu, sigma, out);
}
-void SampleExponential(real_t lambda, NDArray *out) {
- if ( out->ctx().dev_mask() != cpu::kDevMask ) {
- LOG(FATAL) <<"exponential sampling only valid on cpu";
+void SampleExponential(real_t lambda, NDArray* out) {
+ if (out->ctx().dev_mask() != cpu::kDevMask) {
+ LOG(FATAL) << "exponential sampling only valid on cpu";
}
real_t dummy;
SampleOP<ndarray::ExponentialDistribution>(lambda, dummy, out);
}
-void SamplePoisson(real_t lambda, NDArray *out) {
- if ( out->ctx().dev_mask() != cpu::kDevMask ) {
- LOG(FATAL) <<"poisson sampling only valid on cpu";
+void SamplePoisson(real_t lambda, NDArray* out) {
+ if (out->ctx().dev_mask() != cpu::kDevMask) {
+ LOG(FATAL) << "poisson sampling only valid on cpu";
}
real_t dummy;
SampleOP<ndarray::PoissonDistribution>(lambda, dummy, out);
}
-void SampleNegBinomial(int32_t k, real_t p, NDArray *out) {
- if ( out->ctx().dev_mask() != cpu::kDevMask ) {
- LOG(FATAL) <<"negative binomial sampling only valid on cpu";
+void SampleNegBinomial(int32_t k, real_t p, NDArray* out) {
+ if (out->ctx().dev_mask() != cpu::kDevMask) {
+ LOG(FATAL) << "negative binomial sampling only valid on cpu";
}
SampleOP<ndarray::NegBinomialDistribution>(k, p, out);
}
-void SampleGenNegBinomial(real_t mu, real_t alpha, NDArray *out) {
- if ( out->ctx().dev_mask() != cpu::kDevMask ) {
- LOG(FATAL) <<"negative binomial sampling only valid on cpu";
+void SampleGenNegBinomial(real_t mu, real_t alpha, NDArray* out) {
+ if (out->ctx().dev_mask() != cpu::kDevMask) {
+ LOG(FATAL) << "negative binomial sampling only valid on cpu";
}
SampleOP<ndarray::GenNegBinomialDistribution>(mu, alpha, out);
}
@@ -1627,92 +1763,88 @@ void RandomSeed(Context ctx, uint32_t seed) {
ResourceManager::Get()->SeedRandom(ctx, seed);
}
-template<typename OP>
-inline NDArray BinaryOpRet(const NDArray &lhs,
- const NDArray &rhs) {
+template <typename OP>
+inline NDArray BinaryOpRet(const NDArray& lhs, const NDArray& rhs) {
NDArray ret;
BinaryOpKernel<OP>(lhs, rhs, &ret);
return ret;
}
-template<typename OP, bool reverse>
-inline NDArray ScalarOpRet(const NDArray &lhs,
- const real_t &rhs) {
+template <typename OP, bool reverse>
+inline NDArray ScalarOpRet(const NDArray& lhs, const real_t& rhs) {
NDArray ret;
ScalarOp<OP, reverse>(lhs, rhs, &ret);
return ret;
}
-template<typename OP>
-inline NDArray &BinaryOpApply(NDArray *dst,
- const NDArray &src) {
+template <typename OP>
+inline NDArray& BinaryOpApply(NDArray* dst, const NDArray& src) {
BinaryOpKernel<OP>(*dst, src, dst);
return *dst;
}
-template<typename OP>
-inline NDArray &ScalarOpApply(NDArray *dst,
- const real_t &src) {
+template <typename OP>
+inline NDArray& ScalarOpApply(NDArray* dst, const real_t& src) {
ScalarOp<OP, false>(*dst, src, dst);
return *dst;
}
// Binary
-NDArray operator+(const NDArray &lhs, const NDArray &rhs) {
+NDArray operator+(const NDArray& lhs, const NDArray& rhs) {
return BinaryOpRet<ndarray::Plus>(lhs, rhs);
}
-NDArray operator-(const NDArray &lhs, const NDArray &rhs) {
+NDArray operator-(const NDArray& lhs, const NDArray& rhs) {
return BinaryOpRet<ndarray::Minus>(lhs, rhs);
}
-NDArray operator*(const NDArray &lhs, const NDArray &rhs) {
+NDArray operator*(const NDArray& lhs, const NDArray& rhs) {
return BinaryOpRet<ndarray::Mul>(lhs, rhs);
}
-NDArray operator/(const NDArray &lhs, const NDArray &rhs) {
+NDArray operator/(const NDArray& lhs, const NDArray& rhs) {
return BinaryOpRet<ndarray::Div>(lhs, rhs);
}
// Scalar
-NDArray operator+(const NDArray &lhs, const real_t &rhs) {
+NDArray operator+(const NDArray& lhs, const real_t& rhs) {
return ScalarOpRet<ndarray::Plus, false>(lhs, rhs);
}
-NDArray operator-(const NDArray &lhs, const real_t &rhs) {
+NDArray operator-(const NDArray& lhs, const real_t& rhs) {
return ScalarOpRet<ndarray::Minus, false>(lhs, rhs);
}
-NDArray operator*(const NDArray &lhs, const real_t &rhs) {
+NDArray operator*(const NDArray& lhs, const real_t& rhs) {
return ScalarOpRet<ndarray::Mul, false>(lhs, rhs);
}
-NDArray operator/(const NDArray &lhs, const real_t &rhs) {
+NDArray operator/(const NDArray& lhs, const real_t& rhs) {
return ScalarOpRet<ndarray::Div, false>(lhs, rhs);
}
// Binary
-NDArray &NDArray::operator=(real_t scalar) {
+NDArray& NDArray::operator=(real_t scalar) {
SetValueOp(scalar, this);
return *this;
}
-NDArray &NDArray::operator+=(const NDArray &src) {
+NDArray& NDArray::operator+=(const NDArray& src) {
return BinaryOpApply<ndarray::Plus>(this, src);
}
-NDArray &NDArray::operator-=(const NDArray &src) {
+NDArray& NDArray::operator-=(const NDArray& src) {
return BinaryOpApply<ndarray::Minus>(this, src);
}
-NDArray &NDArray::operator*=(const NDArray &src) {
+NDArray& NDArray::operator*=(const NDArray& src) {
return BinaryOpApply<ndarray::Mul>(this, src);
}
-NDArray &NDArray::operator/=(const NDArray &src) {
+NDArray& NDArray::operator/=(const NDArray& src) {
return BinaryOpApply<ndarray::Div>(this, src);
}
// Scalar
-NDArray &NDArray::operator+=(const real_t &src) {
+NDArray& NDArray::operator+=(const real_t& src) {
return ScalarOpApply<ndarray::Plus>(this, src);
}
-NDArray &NDArray::operator-=(const real_t &src) {
+NDArray& NDArray::operator-=(const real_t& src) {
return ScalarOpApply<ndarray::Minus>(this, src);
}
-NDArray &NDArray::operator*=(const real_t &src) {
+NDArray& NDArray::operator*=(const real_t& src) {
return ScalarOpApply<ndarray::Mul>(this, src);
}
-NDArray &NDArray::operator/=(const real_t &src) {
+NDArray& NDArray::operator/=(const real_t& src) {
return ScalarOpApply<ndarray::Div>(this, src);
}
@@ -1726,7 +1858,7 @@ static const uint32_t NDARRAY_V2_MAGIC = 0xF993fac9;
// The ndarray must be saved and loaded within np shape semantics.
static const uint32_t NDARRAY_V3_MAGIC = 0xF993faca;
-void NDArray::Save(dmlc::Stream *strm) const {
+void NDArray::Save(dmlc::Stream* strm) const {
if (Imperative::Get()->is_np_shape()) {
CHECK_EQ(storage_type(), kDefaultStorage)
<< "only allow serializing ndarray of default storage type in np shape semantics";
@@ -1749,7 +1881,8 @@ void NDArray::Save(dmlc::Stream *strm) const {
// save shape
shape_.Save(strm);
- if (is_none()) return;
+ if (is_none())
+ return;
// save context
Context ctx = this->ctx();
@@ -1802,56 +1935,66 @@ void NDArray::Save(dmlc::Stream *strm) const {
}
}
-bool LegacyTShapeLoad(dmlc::Stream *strm, mxnet::TShape *shape, const uint32_t magic) {
+bool LegacyTShapeLoad(dmlc::Stream* strm, mxnet::TShape* shape, const uint32_t magic) {
switch (magic) {
case NDARRAY_V1_MAGIC:
return shape->Load(strm);
default:
// meet legacy mxnet::TShape, magic is ndim here
uint32_t ndim = magic;
- *shape = mxnet::TShape(ndim, -1);
+ *shape = mxnet::TShape(ndim, -1);
std::vector<uint32_t> buffer(ndim);
size_t nread = ndim * sizeof(uint32_t);
- if (strm->Read(buffer.data(), nread) != nread) return false;
+ if (strm->Read(buffer.data(), nread) != nread)
+ return false;
nnvm::ShapeTypeCast(buffer.begin(), buffer.end(), shape->begin());
return true;
}
}
-bool NDArray::LegacyLoad(dmlc::Stream *strm, const uint32_t magic) {
+bool NDArray::LegacyLoad(dmlc::Stream* strm, const uint32_t magic) {
// load shape
mxnet::TShape shape;
- if (!LegacyTShapeLoad(strm, &shape, magic)) return false;
+ if (!LegacyTShapeLoad(strm, &shape, magic))
+ return false;
if (mxnet::op::shape_is_none(shape)) {
- *this = NDArray(); return true;
+ *this = NDArray();
+ return true;
}
// load context
Context ctx;
- if (!ctx.Load(strm)) return false;
+ if (!ctx.Load(strm))
+ return false;
// load type flag
int32_t type_flag;
- if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag)) return false;
+ if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag))
+ return false;
// load data into CPU
NDArray temp(shape, Context::CPU(), false, type_flag);
- TBlob load_data = temp.data();
+ TBlob load_data = temp.data();
size_t type_size = mshadow::mshadow_sizeof(type_flag);
- size_t nread = type_size * shape.Size();
+ size_t nread = type_size * shape.Size();
- if (strm->Read(load_data.dptr_, nread) != nread) return false;
+ if (strm->Read(load_data.dptr_, nread) != nread)
+ return false;
if (ctx.dev_mask() == cpu::kDevMask) {
- *this = std::move(temp); return true;
+ *this = std::move(temp);
+ return true;
} else {
#if MXNET_USE_CUDA
- *this = temp.Copy(ctx); return true;
+ *this = temp.Copy(ctx);
+ return true;
#else
- *this = std::move(temp); return true;
+ *this = std::move(temp);
+ return true;
#endif
}
}
-bool NDArray::Load(dmlc::Stream *strm) {
+bool NDArray::Load(dmlc::Stream* strm) {
uint32_t magic;
- if (strm->Read(&magic, sizeof(uint32_t)) != sizeof(uint32_t)) return false;
+ if (strm->Read(&magic, sizeof(uint32_t)) != sizeof(uint32_t))
+ return false;
if (magic == NDARRAY_V3_MAGIC) {
CHECK(Imperative::Get()->is_np_shape())
<< "ndarray was saved in np shape semantics, must be loaded in the same semantics."
@@ -1870,7 +2013,8 @@ bool NDArray::Load(dmlc::Stream *strm) {
// load storage type
int32_t stype;
- if (strm->Read(&stype, sizeof(stype)) != sizeof(stype)) return false;
+ if (strm->Read(&stype, sizeof(stype)) != sizeof(stype))
+ return false;
if (Imperative::Get()->is_np_shape()) {
CHECK_EQ(stype, kDefaultStorage)
<< "only allow deserializing ndarray of default storage type in np shape semantics";
@@ -1880,28 +2024,33 @@ bool NDArray::Load(dmlc::Stream *strm) {
// load storage shape
mxnet::TShape sshape;
if (nad > 0) {
- if (!sshape.Load(strm)) return false;
+ if (!sshape.Load(strm))
+ return false;
}
// load shape
mxnet::TShape shape;
- if (!shape.Load(strm)) return false;
+ if (!shape.Load(strm))
+ return false;
if (Imperative::Get()->is_np_shape()) {
if (!shape_is_known(shape)) {
*this = NDArray();
return true;
}
} else if (shape.ndim() == 0) {
- *this = NDArray(); return true;
+ *this = NDArray();
+ return true;
}
// load context
Context ctx;
- if (!ctx.Load(strm)) return false;
+ if (!ctx.Load(strm))
+ return false;
// load type flag
int32_t type_flag;
- if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag)) return false;
+ if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag))
+ return false;
// load aux_types and aux_shapes
std::vector<int32_t> aux_types;
@@ -1911,9 +2060,11 @@ bool NDArray::Load(dmlc::Stream *strm) {
aux_shapes.resize(nad);
for (int i = 0; i < nad; ++i) {
// load aux_type(i)
- if (strm->Read(&aux_types[i], sizeof(aux_types[i])) != sizeof(aux_types[i])) return false;
+ if (strm->Read(&aux_types[i], sizeof(aux_types[i])) != sizeof(aux_types[i]))
+ return false;
// load aux_shapes(i)
- if (!aux_shapes[i].Load(strm)) return false;
+ if (!aux_shapes[i].Load(strm))
+ return false;
}
}
@@ -1922,39 +2073,50 @@ bool NDArray::Load(dmlc::Stream *strm) {
if (0 == nad) {
temp = NDArray(shape, Context::CPU(), false, type_flag);
} else {
- temp = NDArray(static_cast<NDArrayStorageType>(stype), shape,
- Context::CPU(), false, type_flag,
- aux_types, aux_shapes, sshape);
+ temp = NDArray(static_cast<NDArrayStorageType>(stype),
+ shape,
+ Context::CPU(),
+ false,
+ type_flag,
+ aux_types,
+ aux_shapes,
+ sshape);
}
// load data
- TBlob load_data = temp.data();
+ TBlob load_data = temp.data();
size_t type_size = mshadow::mshadow_sizeof(type_flag);
- size_t nread = type_size * load_data.Size();
- if (strm->Read(load_data.dptr_, nread) != nread) return false;
+ size_t nread = type_size * load_data.Size();
+ if (strm->Read(load_data.dptr_, nread) != nread)
+ return false;
// load aux_data
if (nad > 0) {
for (int i = 0; i < nad; ++i) {
load_data = temp.aux_data(i);
type_size = mshadow::mshadow_sizeof(load_data.type_flag_);
- nread = type_size * load_data.Size();
- if (strm->Read(load_data.dptr_, nread) != nread) return false;
+ nread = type_size * load_data.Size();
+ if (strm->Read(load_data.dptr_, nread) != nread)
+ return false;
}
}
if (ctx.dev_mask() == cpu::kDevMask) {
- *this = std::move(temp); return true;
+ *this = std::move(temp);
+ return true;
} else {
#if MXNET_USE_CUDA
int device_count = -1;
cudaGetDeviceCount(&device_count);
if (device_count > 0) {
- *this = temp.Copy(ctx); return true;
+ *this = temp.Copy(ctx);
+ return true;
} else {
- *this = std::move(temp); return true;
+ *this = std::move(temp);
+ return true;
}
#else
- *this = std::move(temp); return true;
+ *this = std::move(temp);
+ return true;
#endif
}
}
@@ -1971,22 +2133,14 @@ void NDArray::Save(dmlc::Stream* fo,
fo->Write(names);
}
-void NDArray::Load(dmlc::Stream* fi,
- std::vector<NDArray>* data,
- std::vector<std::string>* keys) {
+void NDArray::Load(dmlc::Stream* fi, std::vector<NDArray>* data, std::vector<std::string>* keys) {
uint64_t header, reserved;
- CHECK(fi->Read(&header))
- << "Invalid NDArray file format";
- CHECK(fi->Read(&reserved))
- << "Invalid NDArray file format";
- CHECK(header == kMXAPINDArrayListMagic)
- << "Invalid NDArray file format";
- CHECK(fi->Read(data))
- << "Invalid NDArray file format";
- CHECK(fi->Read(keys))
- << "Invalid NDArray file format";
- CHECK(keys->size() == 0 || keys->size() == data->size())
- << "Invalid NDArray file format";
+ CHECK(fi->Read(&header)) << "Invalid NDArray file format";
+ CHECK(fi->Read(&reserved)) << "Invalid NDArray file format";
+ CHECK(header == kMXAPINDArrayListMagic) << "Invalid NDArray file format";
+ CHECK(fi->Read(data)) << "Invalid NDArray file format";
+ CHECK(fi->Read(keys)) << "Invalid NDArray file format";
+ CHECK(keys->size() == 0 || keys->size() == data->size()) << "Invalid NDArray file format";
}
NDArray NDArray::Copy(Context ctx) const {
@@ -1994,8 +2148,14 @@ NDArray NDArray::Copy(Context ctx) const {
if (kDefaultStorage == storage_type()) {
ret = NDArray(shape(), ctx, false, dtype_);
} else if (kUndefinedStorage != storage_type()) {
- ret = NDArray(storage_type(), shape(), ctx, false, dtype_,
- ptr_->aux_types, ptr_->aux_shapes, storage_shape());
+ ret = NDArray(storage_type(),
+ shape(),
+ ctx,
+ false,
+ dtype_,
+ ptr_->aux_types,
+ ptr_->aux_shapes,
+ storage_shape());
} else {
LOG(FATAL) << "NDArray::Copy cannot copy undefined storage-type ndarray to ctx.dev_type="
<< ctx.dev_type << ", ctx.dev_id=" << ctx.dev_id;
@@ -2004,20 +2164,19 @@ NDArray NDArray::Copy(Context ctx) const {
return ret;
}
-void NDArray::SyncCopyFromCPU(const void *data, size_t size) const {
+void NDArray::SyncCopyFromCPU(const void* data, size_t size) const {
mxnet::TShape dshape = this->shape();
if (!features::is_enabled(features::INT64_TENSOR_SIZE)) {
- CHECK_LT(size, (int64_t{1} << 31) - 1) <<
- "[SyncCopyFromCPU] Size of tensor you are trying to allocate is larger than "
- "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
+ CHECK_LT(size, (int64_t{1} << 31) - 1)
+ << "[SyncCopyFromCPU] Size of tensor you are trying to allocate is larger than "
+ "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
}
- CHECK_EQ(dshape.Size(), size)
- << "Memory size do not match";
+ CHECK_EQ(dshape.Size(), size) << "Memory size do not match";
// zero-size array, no need to copy
if (size == 0U) {
return;
}
- TBlob src((void*)data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*)
+ TBlob src((void*)data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*)
if (this->ctx().dev_mask() == cpu::kDevMask) {
this->WaitToWrite();
@@ -2027,15 +2186,19 @@ void NDArray::SyncCopyFromCPU(const void *data, size_t size) const {
} else {
#if MXNET_USE_CUDA
Engine::Get()->PushAsync(
- [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
- TBlob dst = this->data();
- ndarray::Copy<cpu, gpu>(src, &dst,
- Context::CPU(), this->ctx(), rctx);
- // Wait GPU kernel to complete
- rctx.get_stream<gpu>()->Wait();
- on_complete();
- }, this->ctx(), {}, {this->var()},
- FnProperty::kCopyToGPU, 0, "SyncCopyCPU2GPU");
+ [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+ TBlob dst = this->data();
+ ndarray::Copy<cpu, gpu>(src, &dst, Context::CPU(), this->ctx(), rctx);
+ // Wait GPU kernel to complete
+ rctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ this->ctx(),
+ {},
+ {this->var()},
+ FnProperty::kCopyToGPU,
+ 0,
+ "SyncCopyCPU2GPU");
this->WaitToRead();
#else
LOG(FATAL) << "GPU is not enabled";
@@ -2081,51 +2244,71 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) {
this->CheckAndAllocAuxData(j, src_shape);
}
}
- TBlob dst_data = (j >= 0? this->aux_data(j) : this->data());
+ TBlob dst_data = (j >= 0 ? this->aux_data(j) : this->data());
CHECK_LE(src_shape.Size(), dst_data.shape_.Size());
return dst_data;
};
if (src_dev_mask == cpu::kDevMask && dst_dev_mask == cpu::kDevMask) {
- Engine::Get()->PushSync([&](RunContext rctx) {
- const TBlob src_data = (i >= 0? src.aux_data(i) : src.data());
- TBlob dst_data = get_dst_data(src_data.shape_);
- ndarray::Copy<cpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
- }, this->ctx(), const_vars, {this->var()},
- FnProperty::kNormal, 0, "SyncCopyFromNDArrayCPU2CPU");
+ Engine::Get()->PushSync(
+ [&](RunContext rctx) {
+ const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
+ TBlob dst_data = get_dst_data(src_data.shape_);
+ ndarray::Copy<cpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
+ },
+ this->ctx(),
+ const_vars,
+ {this->var()},
+ FnProperty::kNormal,
+ 0,
+ "SyncCopyFromNDArrayCPU2CPU");
} else {
#if MXNET_USE_CUDA
if (src_dev_mask == cpu::kDevMask && dst_dev_mask == gpu::kDevMask) {
Engine::Get()->PushAsync(
- [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
- const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
- TBlob dst_data = get_dst_data(src_data.shape_);
- ndarray::Copy<cpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
- rctx.get_stream<gpu>()->Wait();
- on_complete();
- }, this->ctx(), const_vars, {this->var()},
- FnProperty::kCopyToGPU, 0, "SyncCopyFromNDArrayCPU2GPU");
+ [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+ const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
+ TBlob dst_data = get_dst_data(src_data.shape_);
+ ndarray::Copy<cpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
+ rctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ this->ctx(),
+ const_vars,
+ {this->var()},
+ FnProperty::kCopyToGPU,
+ 0,
+ "SyncCopyFromNDArrayCPU2GPU");
} else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == cpu::kDevMask) {
Engine::Get()->PushAsync(
- [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
- const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
- TBlob dst_data = get_dst_data(src_data.shape_);
- ndarray::Copy<gpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
- rctx.get_stream<gpu>()->Wait();
- on_complete();
- }, src.ctx(), const_vars, {this->var()},
- FnProperty::kCopyFromGPU, 0, "SyncCopyFromNDArrayGPU2CPU");
+ [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+ const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
+ TBlob dst_data = get_dst_data(src_data.shape_);
+ ndarray::Copy<gpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
+ rctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ src.ctx(),
+ const_vars,
+ {this->var()},
+ FnProperty::kCopyFromGPU,
+ 0,
+ "SyncCopyFromNDArrayGPU2CPU");
} else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == gpu::kDevMask) {
Engine::Get()->PushAsync(
- [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
- const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
- TBlob dst_data = get_dst_data(src_data.shape_);
- ndarray::Copy<gpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
- rctx.get_stream<gpu>()->Wait();
- on_complete();
- }, this->ctx(), const_vars, {this->var()},
- src.dtype() != this->dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
- 0, "SyncCopyFromNDArrayGPU2GPU");
+ [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+ const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
+ TBlob dst_data = get_dst_data(src_data.shape_);
+ ndarray::Copy<gpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
+ rctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ this->ctx(),
+ const_vars,
+ {this->var()},
+ src.dtype() != this->dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
+ 0,
+ "SyncCopyFromNDArrayGPU2GPU");
} else {
LOG(FATAL) << "unknown device mask";
}
@@ -2144,20 +2327,19 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) {
WaitToRead();
}
-void NDArray::SyncCopyToCPU(void *data, size_t size) const {
+void NDArray::SyncCopyToCPU(void* data, size_t size) const {
mxnet::TShape dshape = this->shape();
if (!features::is_enabled(features::INT64_TENSOR_SIZE)) {
- CHECK_LT(size, (int64_t{1} << 31) - 1) <<
- "[SyncCopyToCPU] Size of tensor you are trying to allocate is larger than "
- "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
+ CHECK_LT(size, (int64_t{1} << 31) - 1)
+ << "[SyncCopyToCPU] Size of tensor you are trying to allocate is larger than "
+ "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
}
- CHECK_EQ(dshape.Size(), size)
- << "Memory size do not match";
+ CHECK_EQ(dshape.Size(), size) << "Memory size do not match";
// zero-size array, no need to copy
if (size == 0U) {
return;
}
- TBlob dst(data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*)
+ TBlob dst(data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*)
this->WaitToRead();
@@ -2168,19 +2350,22 @@ void NDArray::SyncCopyToCPU(void *data, size_t size) const {
if (src.IsMKLDNNData())
src = this->Reorder2Default();
#endif
- ndarray::Copy<cpu, cpu>(src.data(), &dst,
- Context::CPU(), Context::CPU(), rctx);
+ ndarray::Copy<cpu, cpu>(src.data(), &dst, Context::CPU(), Context::CPU(), rctx);
} else {
#if MXNET_USE_CUDA
Engine::Get()->PushAsync(
- [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
- ndarray::Copy<gpu, cpu>(this->data(), &dst,
- this->ctx(), Context::CPU(), rctx);
- // Wait GPU kernel to complete
- rctx.get_stream<gpu>()->Wait();
- on_complete();
- }, this->ctx(), {this->var()}, {},
- FnProperty::kCopyFromGPU, 0, "SyncCopyGPU2CPU");
+ [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+ ndarray::Copy<gpu, cpu>(this->data(), &dst, this->ctx(), Context::CPU(), rctx);
+ // Wait GPU kernel to complete
+ rctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ this->ctx(),
+ {this->var()},
+ {},
+ FnProperty::kCopyFromGPU,
+ 0,
+ "SyncCopyGPU2CPU");
this->WaitToWrite();
#else
LOG(FATAL) << "GPU is not enabled";
@@ -2192,17 +2377,27 @@ void NDArray::SyncCheckFormat(const bool full_check) const {
int32_t err = kNormalErr;
TBlob err_cpu(&err, mshadow::Shape1(1), cpu::kDevMask, 0);
if (this->ctx().dev_mask() == cpu::kDevMask) {
- Engine::Get()->PushSync([&](RunContext rctx) {
- common::CheckFormatWrapper<cpu>(rctx, *this, err_cpu, full_check);
- }, this->ctx(), {this->var()}, {},
- FnProperty::kNormal, 0, "CheckFormat");
+ Engine::Get()->PushSync(
+ [&](RunContext rctx) { common::CheckFormatWrapper<cpu>(rctx, *this, err_cpu, full_check); },
+ this->ctx(),
+ {this->var()},
+ {},
+ FnProperty::kNormal,
+ 0,
+ "CheckFormat");
} else {
#if MXNET_USE_CUDA
- Engine::Get()->PushSync([&](RunContext rctx) {
- common::CheckFormatWrapper<gpu>(rctx, *this, err_cpu, full_check);
- rctx.get_stream<gpu>()->Wait();
- }, this->ctx(), {this->var()}, {},
- FnProperty::kNormal, 0, "CheckFormat");
+ Engine::Get()->PushSync(
+ [&](RunContext rctx) {
+ common::CheckFormatWrapper<gpu>(rctx, *this, err_cpu, full_check);
+ rctx.get_stream<gpu>()->Wait();
+ },
+ this->ctx(),
+ {this->var()},
+ {},
+ FnProperty::kNormal,
+ 0,
+ "CheckFormat");
#else
LOG(FATAL) << "GPU is not enabled";
#endif
@@ -2210,65 +2405,65 @@ void NDArray::SyncCheckFormat(const bool full_check) const {
this->WaitToWrite();
CHECK_NE(err, kCSRShapeErr) << "Shape mismatch of this csr NDArray";
CHECK_NE(err, kCSRIndPtrErr)
- << "IndPtr of csr NDArray should be non-negative, in non-decreasing order, "
- << "start with 0, and end with value equal with size of indices.";
+ << "IndPtr of csr NDArray should be non-negative, in non-decreasing order, "
+ << "start with 0, and end with value equal with size of indices.";
CHECK_NE(err, kCSRIdxErr)
- << "Indices of csr NDArray should be non-negative, in ascending order per row "
- << " and less than the number of columns.";
+ << "Indices of csr NDArray should be non-negative, in ascending order per row "
+ << " and less than the number of columns.";
CHECK_NE(err, kRSPShapeErr) << "Shape mismatch of this row_sparse NDArray";
- CHECK_NE(err, kRSPIdxErr)
- << "Indices of row_sparse NDArray should be non-negative, "
- << "less than the size of first dimension and in ascending order";
+ CHECK_NE(err, kRSPIdxErr) << "Indices of row_sparse NDArray should be non-negative, "
+ << "less than the size of first dimension and in ascending order";
CHECK_EQ(err, kNormalErr) << "Check the validity of this sparse NDArray";
}
void NDArray::WaitToRead() const {
- if (is_none()) return;
+ if (is_none())
+ return;
Imperative::DCInfo::Compute(*this);
Engine::Get()->WaitForVar(ptr_->var);
}
void NDArray::WaitToWrite() const {
- if (is_none()) return;
+ if (is_none())
+ return;
Imperative::DCInfo::Compute(*this);
// Push an empty mutable function to flush all preceding reads to the variable.
Engine::Get()->PushAsync(
[](RunContext, Engine::CallbackOnComplete on_complete) { on_complete(); },
- Context{}, {}, {ptr_->var});
+ Context{},
+ {},
+ {ptr_->var});
Engine::Get()->WaitForVar(ptr_->var);
}
#if MXNET_PREDICT_ONLY == 0
// register API function
// those with underscore will be registered at NDArray
-MXNET_REGISTER_NDARRAY_FUN(_set_value)
-.set_function(SetValueOp);
-
-
-MXNET_REGISTER_NDARRAY_FUN(_onehot_encode)
-.set_function(BinaryOp<ndarray::OneHotEncode>);
+MXNET_REGISTER_NDARRAY_FUN(_set_value).set_function(SetValueOp);
+MXNET_REGISTER_NDARRAY_FUN(_onehot_encode).set_function(BinaryOp<ndarray::OneHotEncode>);
MXNET_REGISTER_NDARRAY_FUN(fill_element_0index)
-.set_function(TernaryOp<ndarray::MatFillRowElem>)
-.describe("Fill one element of each line(row for python, column for R/Julia)"
-" in lhs according to index indicated by rhs and values indicated by mhs."
-" This function assume rhs uses 0-based index.");
+ .set_function(TernaryOp<ndarray::MatFillRowElem>)
+ .describe(
+ "Fill one element of each line(row for python, column for R/Julia)"
+ " in lhs according to index indicated by rhs and values indicated by mhs."
+ " This function assume rhs uses 0-based index.");
// register API function
// those with underscore will be registered at NDArray
-void CopyFromToSimple(
- const nnvm::NodeAttrs& attrs,
- const OpContext& ctx,
- const std::vector<NDArray>& inputs,
- const std::vector<OpReqType>& req,
- const std::vector<NDArray>& outputs) {
+void CopyFromToSimple(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
CopyFromTo(inputs[0], outputs[0], 0, true);
}
-bool CopyToType(const nnvm::NodeAttrs &attrs, std::vector<int> *in_attrs,
- std::vector<int> *out_attrs) {
+bool CopyToType(const nnvm::NodeAttrs& attrs,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
CHECK_EQ(in_attrs->size(), 1U);
CHECK_EQ(out_attrs->size(), 1U);
int in_type = in_attrs->at(0);
@@ -2281,36 +2476,42 @@ bool CopyToType(const nnvm::NodeAttrs &attrs, std::vector<int> *in_attrs,
// copy function is special
// that we need to remove kAcceptEmptyMutateTarget from it
NNVM_REGISTER_OP(_copyto)
-.add_alias("_npi_copyto")
-.set_num_inputs(1)
-.set_num_outputs(1)
-.set_attr<mxnet::FInferShape>("FInferShape", op::ElemwiseShape<1, 1>)
-.set_attr<nnvm::FInferType>("FInferType", CopyToType)
-.set_attr<FInferStorageType>("FInferStorageType",
- [](const NodeAttrs& attrs,
- const int dev_mask,
- DispatchMode* dispatch_mode,
- std::vector<int>* in_attrs,
- std::vector<int>* out_attrs) {
- op::dispatch_mode_assign(dispatch_mode, DispatchMode::kFComputeEx);
- if (op::storage_type_is_none((*out_attrs)[0])) {
- (*out_attrs)[0] = (*in_attrs)[0];
- }
- return true;
- })
-.set_attr<FExecType>("FExecType", [](const NodeAttrs& attrs) {
- return ExecType::kCrossDeviceCopy;
- })
-.set_attr<nnvm::FGradient>("FGradient", op::ElemwiseGradUseNone{"_copyto"})
-.set_attr<bool>("TIsBackward", true)
-.set_attr<FComputeEx>("FComputeEx<cpu>", CopyFromToSimple)
-.set_attr<FComputeEx>("FComputeEx<gpu>", CopyFromToSimple)
-.add_argument("data", "NDArray", "input data");
-
-
-void Imdecode(NDArray *ret, NDArray mean, size_t index,
- size_t x0, size_t y0, size_t x1, size_t y1, size_t n_channels,
- size_t size, char *str_img) {
+ .add_alias("_npi_copyto")
+ .set_num_inputs(1)
+ .set_num_outputs(1)
+ .set_attr<mxnet::FInferShape>("FInferShape", op::ElemwiseShape<1, 1>)
+ .set_attr<nnvm::FInferType>("FInferType", CopyToType)
+ .set_attr<FInferStorageType>("FInferStorageType",
+ [](const NodeAttrs& attrs,
+ const int dev_mask,
+ DispatchMode* dispatch_mode,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
+ op::dispatch_mode_assign(dispatch_mode,
+ DispatchMode::kFComputeEx);
+ if (op::storage_type_is_none((*out_attrs)[0])) {
+ (*out_attrs)[0] = (*in_attrs)[0];
+ }
+ return true;
+ })
+ .set_attr<FExecType>("FExecType",
+ [](const NodeAttrs& attrs) { return ExecType::kCrossDeviceCopy; })
+ .set_attr<nnvm::FGradient>("FGradient", op::ElemwiseGradUseNone{"_copyto"})
+ .set_attr<bool>("TIsBackward", true)
+ .set_attr<FComputeEx>("FComputeEx<cpu>", CopyFromToSimple)
+ .set_attr<FComputeEx>("FComputeEx<gpu>", CopyFromToSimple)
+ .add_argument("data", "NDArray", "input data");
+
+void Imdecode(NDArray* ret,
+ NDArray mean,
+ size_t index,
+ size_t x0,
+ size_t y0,
+ size_t x1,
+ size_t y1,
+ size_t n_channels,
+ size_t size,
+ char* str_img) {
#if MXNET_USE_OPENCV
cv::Mat buf(1, size, CV_8U, str_img);
cv::Mat res = cv::imdecode(buf, n_channels == 1 ? 0 : -1);
@@ -2322,12 +2523,12 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index,
y0 = 0;
y1 = res.rows;
}
- CHECK(x1 <= static_cast<size_t>(res.cols) &&
- y1 <= static_cast<size_t>(res.rows));
+ CHECK(x1 <= static_cast<size_t>(res.cols) && y1 <= static_cast<size_t>(res.rows));
if (ret->is_none()) {
- *ret = NDArray(mshadow::Shape3(n_channels, y1-y0, x1-x0),
- Context::CPU(), false,
+ *ret = NDArray(mshadow::Shape3(n_channels, y1 - y0, x1 - x0),
+ Context::CPU(),
+ false,
mean.is_none() ? mshadow::default_type_flag : mean.dtype());
}
NDArray buff;
@@ -2335,19 +2536,19 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index,
buff = ret->Reshape(mshadow::Shape4(1, ret->shape()[0], ret->shape()[1], ret->shape()[2]));
} else {
CHECK_EQ(ret->shape().ndim(), 4U);
- buff = ret->Slice(index, index+1);
+ buff = ret->Slice(index, index + 1);
}
CHECK_EQ(buff.ctx().dev_mask(), Context::kCPU);
CHECK_EQ(n_channels, buff.shape()[1]);
- CHECK_EQ(y1-y0, buff.shape()[2]);
- CHECK_EQ(x1-x0, buff.shape()[3]);
+ CHECK_EQ(y1 - y0, buff.shape()[2]);
+ CHECK_EQ(x1 - x0, buff.shape()[3]);
buff.WaitToWrite();
if (mean.is_none()) {
MSHADOW_TYPE_SWITCH(buff.dtype(), DType, {
mshadow::Tensor<cpu, 4, DType> tensor = buff.data().get<cpu, 4, DType>();
- for (size_t i = 0; i < y1-y0; i++) {
- uchar* im_data = res.ptr<uchar>(y0+i) + res.channels()*x0;
- for (size_t j = 0; j < x1-x0; j++) {
+ for (size_t i = 0; i < y1 - y0; i++) {
+ uchar* im_data = res.ptr<uchar>(y0 + i) + res.channels() * x0;
+ for (size_t j = 0; j < x1 - x0; j++) {
for (size_t k = 0; k < n_channels; k++) {
tensor[0][k][i][j] = DType(im_data[k]); // NOLINT(*)
}
@@ -2364,10 +2565,10 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index,
mean.WaitToRead();
MSHADOW_TYPE_SWITCH(buff.dtype(), DType, {
mshadow::Tensor<cpu, 4, DType> tensor = buff.data().get<cpu, 4, DType>();
- mshadow::Tensor<cpu, 3, DType> tmean = mean.data().get<cpu, 3, DType>();
- for (size_t i = 0; i < y1-y0; i++) {
- uchar* im_data = res.ptr<uchar>(y0+i) + res.channels()*x0;
- for (size_t j = 0; j < x1-x0; j++) {
+ mshadow::Tensor<cpu, 3, DType> tmean = mean.data().get<cpu, 3, DType>();
+ for (size_t i = 0; i < y1 - y0; i++) {
+ uchar* im_data = res.ptr<uchar>(y0 + i) + res.channels() * x0;
+ for (size_t j = 0; j < x1 - x0; j++) {
for (size_t k = 0; k < n_channels; k++) {
tensor[0][k][i][j] = DType(im_data[k]) - tmean[k][i][j]; // NOLINT(*)
}
@@ -2382,31 +2583,36 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index,
}
MXNET_REGISTER_NDARRAY_FUN(_imdecode)
-.set_type_mask(kAcceptEmptyMutateTarget | kNDArrayArgBeforeScalar)
-.set_body([](NDArray **u, real_t *s, NDArray **out,
- int num_params, char **param_keys, char **param_vals) {
- CHECK_EQ(num_params, 1);
- Imdecode(out[0], *u[0],
- static_cast<size_t>(s[0]),
- static_cast<size_t>(s[1]),
- static_cast<size_t>(s[2]),
- static_cast<size_t>(s[3]),
- static_cast<size_t>(s[4]),
- static_cast<size_t>(s[5]),
- static_cast<size_t>(s[6]),
- param_vals[0]);
- })
-.set_num_use_vars(1)
-.set_num_scalars(7)
-.set_num_mutate_vars(1)
-.describe("Decode an image, clip to (x0, y0, x1, y1), subtract mean, and write to buffer")
-.add_argument("mean", "NDArray-or-Symbol", "image mean")
-.add_argument("index", "int", "buffer position for output")
-.add_argument("x0", "int", "x0")
-.add_argument("y0", "int", "y0")
-.add_argument("x1", "int", "x1")
-.add_argument("y1", "int", "y1")
-.add_argument("c", "int", "channel")
-.add_argument("size", "int", "length of str_img");
+ .set_type_mask(kAcceptEmptyMutateTarget | kNDArrayArgBeforeScalar)
+ .set_body([](NDArray** u,
+ real_t* s,
+ NDArray** out,
+ int num_params,
+ char** param_keys,
+ char** param_vals) {
+ CHECK_EQ(num_params, 1);
+ Imdecode(out[0],
+ *u[0],
+ static_cast<size_t>(s[0]),
+ static_cast<size_t>(s[1]),
+ static_cast<size_t>(s[2]),
+ static_cast<size_t>(s[3]),
+ static_cast<size_t>(s[4]),
+ static_cast<size_t>(s[5]),
+ static_cast<size_t>(s[6]),
+ param_vals[0]);
+ })
+ .set_num_use_vars(1)
+ .set_num_scalars(7)
+ .set_num_mutate_vars(1)
+ .describe("Decode an image, clip to (x0, y0, x1, y1), subtract mean, and write to buffer")
+ .add_argument("mean", "NDArray-or-Symbol", "image mean")
+ .add_argument("index", "int", "buffer position for output")
+ .add_argument("x0", "int", "x0")
+ .add_argument("y0", "int", "y0")
+ .add_argument("x1", "int", "x1")
+ .add_argument("y1", "int", "y1")
+ .add_argument("c", "int", "channel")
+ .add_argument("size", "int", "length of str_img");
#endif
} // namespace mxnet
diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h
index bb8313d..2a73994 100644
--- a/src/operator/nn/batch_norm-inl.h
+++ b/src/operator/nn/batch_norm-inl.h
@@ -29,29 +29,41 @@
#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <mxnet/operator.h>
+
#include <mshadow/base.h>
+
#include <map>
-#include <vector>
#include <string>
#include <utility>
+#include <vector>
+
#include "../mshadow_op.h"
-#include "../operator_common.h"
#include "../mxnet_op.h"
+#include "../operator_common.h"
#ifdef __GNUG__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
#endif
+/*! \brief inverse standard deviation <-> variance */
+#define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0 / std::sqrt((__var$) + DType(__eps$)))
+#define INVSTD_TO_VARIANCE(__invstd$, __eps$) ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$))
+
namespace mxnet {
namespace op {
namespace batchnorm {
-enum BatchNormOpInputs {kData, kGamma, kBeta, kInMovingMean,
- kInMovingVar}; // kGamma: weights, kBeta: biases
-enum BatchNormOpOutputs {kOut, kMean, kVar}; // req, out_data
-enum BatchNormOpResource {kTempSpace};
-enum BatchNormOpAuxiliary {kMovingMean, kMovingVar}; // aux_states
+enum BatchNormOpInputs {
+ kData,
+ kGamma,
+ kBeta,
+ kInMovingMean,
+ kInMovingVar
+}; // kGamma: weights, kBeta: biases
+enum BatchNormOpOutputs { kOut, kMean, kVar }; // req, out_data
+enum BatchNormOpResource { kTempSpace };
+enum BatchNormOpAuxiliary { kMovingMean, kMovingVar }; // aux_states
/*! \brief Default channel axis if none specified in the params */
constexpr int DEFAULT_AXIS = 1;
@@ -59,11 +71,18 @@ constexpr int DEFAULT_AXIS = 1;
/*! \brief Parameters for BatchNorm operator */
namespace quantized_batchnorm {
-enum QuantizedBatchNormOpInputs {kData, kGamma, kBeta, kInMovingMean,
- kInMovingVar, kDataMin, kDataMax};
-enum QuantizedBatchNormOutputs {kOut, kOutMin, kOutMax};
-enum QuantizedBatchNormOpAuxiliary {kMovingMean, kMovingVar};
-} // quantized_batchnorm
+enum QuantizedBatchNormOpInputs {
+ kData,
+ kGamma,
+ kBeta,
+ kInMovingMean,
+ kInMovingVar,
+ kDataMin,
+ kDataMax
+};
+enum QuantizedBatchNormOutputs { kOut, kOutMin, kOutMax };
+enum QuantizedBatchNormOpAuxiliary { kMovingMean, kMovingVar };
+} // namespace quantized_batchnorm
/*! \brief Parameters for BatchNoram operator */
struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
@@ -79,38 +98,42 @@ struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
dmlc::optional<float> max_calib_range; // max float value calculated from calibration dataset
DMLC_DECLARE_PARAMETER(BatchNormParam) {
- DMLC_DECLARE_FIELD(eps).set_default(1e-3f)
- .describe("Epsilon to prevent div 0. "
- "Must be no less than CUDNN_BN_MIN_EPSILON "
- "defined in cudnn.h when using cudnn (usually 1e-5)");
- DMLC_DECLARE_FIELD(momentum).set_default(0.9f)
- .describe("Momentum for moving average");
- DMLC_DECLARE_FIELD(fix_gamma).set_default(true)
- .describe("Fix gamma while training");
- DMLC_DECLARE_FIELD(use_global_stats).set_default(false)
- .describe("Whether use global moving statistics instead of local batch-norm. "
- "This will force change batch-norm into a scale shift operator.");
- DMLC_DECLARE_FIELD(output_mean_var).set_default(false)
- .describe("Output the mean and inverse std ");
- DMLC_DECLARE_FIELD(axis).set_default(mxnet::op::batchnorm::DEFAULT_AXIS)
- .describe("Specify which shape axis the channel is specified");
- DMLC_DECLARE_FIELD(cudnn_off).set_default(false)
- .describe("Do not select CUDNN operator, if available");
+ DMLC_DECLARE_FIELD(eps).set_default(1e-3f).describe(
+ "Epsilon to prevent div 0. "
+ "Must be no less than CUDNN_BN_MIN_EPSILON "
+ "defined in cudnn.h when using cudnn (usually 1e-5)");
+ DMLC_DECLARE_FIELD(momentum).set_default(0.9f).describe("Momentum for moving average");
+ DMLC_DECLARE_FIELD(fix_gamma).set_default(true).describe("Fix gamma while training");
+ DMLC_DECLARE_FIELD(use_global_stats)
+ .set_default(false)
+ .describe(
+ "Whether use global moving statistics instead of local batch-norm. "
+ "This will force change batch-norm into a scale shift operator.");
+ DMLC_DECLARE_FIELD(output_mean_var)
+ .set_default(false)
+ .describe("Output the mean and inverse std ");
+ DMLC_DECLARE_FIELD(axis)
+ .set_default(mxnet::op::batchnorm::DEFAULT_AXIS)
+ .describe("Specify which shape axis the channel is specified");
+ DMLC_DECLARE_FIELD(cudnn_off).set_default(false).describe(
+ "Do not select CUDNN operator, if available");
DMLC_DECLARE_FIELD(min_calib_range)
- .set_default(dmlc::optional<float>())
- .describe("The minimum scalar value in the form of float32 obtained "
- "through calibration. If present, it will be used to by "
- "quantized batch norm op to calculate primitive scale."
- "Note: this calib_range is to calib bn output.");
+ .set_default(dmlc::optional<float>())
+ .describe(
+ "The minimum scalar value in the form of float32 obtained "
+ "through calibration. If present, it will be used to by "
+ "quantized batch norm op to calculate primitive scale."
+ "Note: this calib_range is to calib bn output.");
DMLC_DECLARE_FIELD(max_calib_range)
- .set_default(dmlc::optional<float>())
- .describe("The maximum scalar value in the form of float32 obtained "
- "through calibration. If present, it will be used to by "
- "quantized batch norm op to calculate primitive scale."
- "Note: this calib_range is to calib bn output.");
+ .set_default(dmlc::optional<float>())
+ .describe(
+ "The maximum scalar value in the form of float32 obtained "
+ "through calibration. If present, it will be used to by "
+ "quantized batch norm op to calculate primitive scale."
+ "Note: this calib_range is to calib bn output.");
}
- bool operator==(const BatchNormParam &other) const {
+ bool operator==(const BatchNormParam& other) const {
bool flag = this->eps == other.eps && this->momentum == other.momentum &&
this->fix_gamma == other.fix_gamma &&
this->use_global_stats == other.use_global_stats &&
@@ -127,7 +150,7 @@ struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
}
void SetAttrDict(std::unordered_map<std::string, std::string>* dict) {
std::ostringstream eps_s, momentum_s, fix_gamma_s, use_global_stats_s, output_mean_var_s,
- axis_s, cudnn_off_s, min_calib_range_s, max_calib_range_s;
+ axis_s, cudnn_off_s, min_calib_range_s, max_calib_range_s;
eps_s << eps;
momentum_s << momentum;
fix_gamma_s << fix_gamma;
@@ -137,15 +160,15 @@ struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
cudnn_off_s << cudnn_off;
min_calib_range_s << min_calib_range;
max_calib_range_s << max_calib_range;
- (*dict)["eps"] = eps_s.str();
- (*dict)["momentum"] = momentum_s.str();
- (*dict)["fix_gamma"] = fix_gamma_s.str();
+ (*dict)["eps"] = eps_s.str();
+ (*dict)["momentum"] = momentum_s.str();
+ (*dict)["fix_gamma"] = fix_gamma_s.str();
(*dict)["use_global_stats"] = use_global_stats_s.str();
- (*dict)["output_mean_var"] = output_mean_var_s.str();
- (*dict)["axis"] = axis_s.str();
- (*dict)["cudnn_off"] = cudnn_off_s.str();
- (*dict)["min_calib_range"] = min_calib_range_s.str();
- (*dict)["max_calib_range"] = max_calib_range_s.str();
+ (*dict)["output_mean_var"] = output_mean_var_s.str();
+ (*dict)["axis"] = axis_s.str();
+ (*dict)["cudnn_off"] = cudnn_off_s.str();
+ (*dict)["min_calib_range"] = min_calib_range_s.str();
+ (*dict)["max_calib_range"] = max_calib_range_s.str();
}
};
@@ -153,15 +176,15 @@ struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
} // namespace mxnet
namespace std {
-template<>
+template <>
struct hash<mxnet::op::BatchNormParam> {
size_t operator()(const mxnet::op::BatchNormParam& val) {
size_t ret = 0;
- ret = dmlc::HashCombine(ret, val.momentum);
- ret = dmlc::HashCombine(ret, val.fix_gamma);
- ret = dmlc::HashCombine(ret, val.use_global_stats);
- ret = dmlc::HashCombine(ret, val.output_mean_var);
- ret = dmlc::HashCombine(ret, val.axis);
+ ret = dmlc::HashCombine(ret, val.momentum);
+ ret = dmlc::HashCombine(ret, val.fix_gamma);
+ ret = dmlc::HashCombine(ret, val.use_global_stats);
+ ret = dmlc::HashCombine(ret, val.output_mean_var);
+ ret = dmlc::HashCombine(ret, val.axis);
return ret;
}
};
@@ -175,40 +198,44 @@ static inline bool IsBNWriting(const OpReqType ort) {
}
template <typename xpu, typename DType, typename AccReal>
-void BatchNormForwardImpl(mshadow::Stream<cpu> *stream,
- const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &out_data,
- const std::vector<TBlob> &aux_states);
+void BatchNormForwardImpl(mshadow::Stream<cpu>* stream,
+ const OpContext& ctx,
+ const BatchNormParam& param,
+ const std::vector<TBlob>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& out_data,
+ const std::vector<TBlob>& aux_states);
template <typename xpu, typename DType, typename AccReal>
-void BatchNormBackwardImpl(mshadow::Stream<cpu> *stream,
- const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &out_grad,
- const std::vector<TBlob> &in_data,
- const std::vector<TBlob> &out_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &in_grad,
- const std::vector<TBlob> &aux_states);
+void BatchNormBackwardImpl(mshadow::Stream<cpu>* stream,
+ const OpContext& ctx,
+ const BatchNormParam& param,
+ const std::vector<TBlob>& out_grad,
+ const std::vector<TBlob>& in_data,
+ const std::vector<TBlob>& out_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& in_grad,
+ const std::vector<TBlob>& aux_states);
#if MXNET_USE_CUDA
template <typename xpu, typename DType, typename AccReal>
-void BatchNormForwardImpl(mshadow::Stream<gpu> *stream,
- const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &out_data,
- const std::vector<TBlob> &aux_states);
+void BatchNormForwardImpl(mshadow::Stream<gpu>* stream,
+ const OpContext& ctx,
+ const BatchNormParam& param,
+ const std::vector<TBlob>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& out_data,
+ const std::vector<TBlob>& aux_states);
template <typename xpu, typename DType, typename AccReal>
-void BatchNormBackwardImpl(mshadow::Stream<gpu> *stream,
- const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &out_grad,
- const std::vector<TBlob> &in_data,
- const std::vector<TBlob> &out_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &in_grad,
- const std::vector<TBlob> &aux_states);
+void BatchNormBackwardImpl(mshadow::Stream<gpu>* stream,
+ const OpContext& ctx,
+ const BatchNormParam& param,
+ const std::vector<TBlob>& out_grad,
+ const std::vector<TBlob>& in_data,
+ const std::vector<TBlob>& out_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& in_grad,
+ const std::vector<TBlob>& aux_states);
#endif // MXNET_USE_CUDA
/*!
@@ -223,11 +250,12 @@ void BatchNormBackwardImpl(mshadow::Stream<gpu> *stream,
* \sa OpReqType, OpContext
*/
template <typename xpu, typename DType, typename AccReal>
-void BatchNormForward(const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &out_data,
- const std::vector<TBlob> &aux_states) {
+void BatchNormForward(const OpContext& ctx,
+ const BatchNormParam& param,
+ const std::vector<TBlob>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& out_data,
+ const std::vector<TBlob>& aux_states) {
using namespace mshadow;
using namespace mshadow::expr;
@@ -241,9 +269,8 @@ void BatchNormForward(const OpContext &ctx, const BatchNormParam& param,
CHECK_GE(req.size(), 1U);
CHECK_EQ(req[batchnorm::kOut], kWriteTo);
}
- Stream<xpu> *s = ctx.get_stream<xpu>();
- BatchNormForwardImpl<xpu, DType, AccReal>(s, ctx, param, in_data, req,
- out_data, aux_states);
+ Stream<xpu>* s = ctx.get_stream<xpu>();
+ BatchNormForwardImpl<xpu, DType, AccReal>(s, ctx, param, in_data, req, out_data, aux_states);
}
/*!
@@ -275,10 +302,11 @@ void BatchNormForward(const OpContext &ctx, const BatchNormParam& param,
* \sa OperatorProperty, OpReqType, OpContext
*/
template <typename xpu, typename DType, typename AccReal>
-void BatchNormBackward(const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &outputs) {
+void BatchNormBackward(const OpContext& ctx,
+ const BatchNormParam& param,
+ const std::vector<TBlob>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& outputs) {
CHECK_EQ(inputs.size(), 8U);
CHECK_EQ(outputs.size(), 3U);
@@ -287,40 +315,39 @@ void BatchNormBackward(const OpContext &ctx, const BatchNormParam& param,
std::vector<TBlob> in_data(3);
std::vector<TBlob> aux_states(2);
- out_grad[0] = inputs[0];
- out_data[batchnorm::kMean] = inputs[1];
- out_data[batchnorm::kVar] = inputs[2];
- in_data[batchnorm::kData] = inputs[3];
- in_data[batchnorm::kGamma] = inputs[4];
- in_data[batchnorm::kBeta] = inputs[5];
+ out_grad[0] = inputs[0];
+ out_data[batchnorm::kMean] = inputs[1];
+ out_data[batchnorm::kVar] = inputs[2];
+ in_data[batchnorm::kData] = inputs[3];
+ in_data[batchnorm::kGamma] = inputs[4];
+ in_data[batchnorm::kBeta] = inputs[5];
aux_states[batchnorm::kMovingMean] = inputs[6];
- aux_states[batchnorm::kMovingVar] = inputs[7];
- const std::vector<TBlob> &in_grad = outputs;
- mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
- BatchNormBackwardImpl<xpu, DType, AccReal>(s, ctx, param, out_grad, in_data,
- out_data, req, in_grad, aux_states);
+ aux_states[batchnorm::kMovingVar] = inputs[7];
+ const std::vector<TBlob>& in_grad = outputs;
+ mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
+ BatchNormBackwardImpl<xpu, DType, AccReal>(
+ s, ctx, param, out_grad, in_data, out_data, req, in_grad, aux_states);
}
-template<typename xpu>
+template <typename xpu>
void BatchNormCompute(const nnvm::NodeAttrs& attrs,
- const OpContext& ctx, const std::vector<TBlob>& inputs,
+ const OpContext& ctx,
+ const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
CHECK_EQ(inputs.size(), 5U);
- std::vector<TBlob> in_data(inputs.begin(),
- inputs.begin() + batchnorm::kInMovingMean);
- std::vector<TBlob> aux_states(inputs.begin() + batchnorm::kInMovingMean,
- inputs.end());
+ std::vector<TBlob> in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean);
+ std::vector<TBlob> aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end());
MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, {
- BatchNormForward<xpu, DType, AccReal>(ctx, param, in_data, req, outputs,
- aux_states);
+ BatchNormForward<xpu, DType, AccReal>(ctx, param, in_data, req, outputs, aux_states);
});
}
-template<typename xpu>
+template <typename xpu>
void BatchNormGradCompute(const nnvm::NodeAttrs& attrs,
- const OpContext& ctx, const std::vector<TBlob>& inputs,
+ const OpContext& ctx,
+ const std::vector<TBlob>& inputs,
const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
CHECK_EQ(inputs.size(), 8U);
@@ -335,15 +362,15 @@ void BatchNormGradCompute(const nnvm::NodeAttrs& attrs,
namespace batchnorm {
-template<typename DType>
+template <typename DType>
class BNTensor3 {
enum { OUTER, CHANNEL, INNER, COUNT };
public:
inline BNTensor3(const TBlob& blob, const int indexOfChannel)
- : dptr_(blob.dptr<DType>())
- , indexOfChannel_(static_cast<size_t>(indexOfChannel < 0
- ? (static_cast<int>(blob.shape_.ndim()) + indexOfChannel)
+ : dptr_(blob.dptr<DType>()),
+ indexOfChannel_(static_cast<size_t>(
+ indexOfChannel < 0 ? (static_cast<int>(blob.shape_.ndim()) + indexOfChannel)
: indexOfChannel)) {
CHECK_EQ(blob.type_flag_, mshadow::DataType<DType>::kFlag);
shape_[OUTER] = 1;
@@ -351,23 +378,23 @@ class BNTensor3 {
shape_[OUTER] *= blob.shape_[i];
}
shape_[CHANNEL] = blob.shape_[indexOfChannel_];
- shape_[INNER] = 1;
+ shape_[INNER] = 1;
for (size_t i = indexOfChannel_ + 1, n = blob.shape_.ndim(); i < n; ++i) {
shape_[INNER] *= blob.shape_[i];
}
}
- inline BNTensor3(DType *p, const mxnet::TShape& shape, const int indexOfChannel)
- : dptr_(p)
- , indexOfChannel_(static_cast<size_t>(indexOfChannel < 0
- ? (static_cast<int>(shape.ndim()) + indexOfChannel)
- : indexOfChannel)) {
+ inline BNTensor3(DType* p, const mxnet::TShape& shape, const int indexOfChannel)
+ : dptr_(p),
+ indexOfChannel_(static_cast<size_t>(indexOfChannel < 0
+ ? (static_cast<int>(shape.ndim()) + indexOfChannel)
+ : indexOfChannel)) {
shape_[OUTER] = 1;
for (size_t i = 0; i < indexOfChannel_; ++i) {
shape_[OUTER] *= shape[i];
}
shape_[CHANNEL] = shape[indexOfChannel_];
- shape_[INNER] = 1;
+ shape_[INNER] = 1;
for (size_t i = indexOfChannel_ + 1, n = shape.ndim(); i < n; ++i) {
shape_[INNER] *= shape[i];
}
@@ -414,12 +441,10 @@ class BNTensor3 {
return (ChannelCount() - 1) * InnerSize();
}
- MSHADOW_XINLINE size_t offset(const size_t outer,
- const size_t channel,
- const size_t i) const {
+ MSHADOW_XINLINE size_t offset(const size_t outer, const size_t channel, const size_t i) const {
const size_t spatial_size = InnerSize();
- const size_t skip_length = SkipLengthToNextSameChannelData();
- size_t off = StartOffset(channel);
+ const size_t skip_length = SkipLengthToNextSameChannelData();
+ size_t off = StartOffset(channel);
off += outer * shape_[CHANNEL] * shape_[INNER];
const size_t skips = i / spatial_size;
off += (1 + skip_length) * skips;
@@ -427,9 +452,7 @@ class BNTensor3 {
return off;
}
- MSHADOW_XINLINE DType& get_ref(const size_t batch,
- const size_t channel,
- const size_t i) {
+ MSHADOW_XINLINE DType& get_ref(const size_t batch, const size_t channel, const size_t i) {
const size_t off = offset(batch, channel, i);
return dptr_[off];
}
@@ -441,7 +464,7 @@ class BNTensor3 {
return dptr_[off];
}
- DType *dptr_;
+ DType* dptr_;
size_t indexOfChannel_;
size_t shape_[COUNT];
};
diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc
index be0b015..e6f6fce 100644
--- a/src/operator/nn/batch_norm.cc
+++ b/src/operator/nn/batch_norm.cc
@@ -22,20 +22,18 @@
* \file batch_norm.cc
* \brief
* \author Bing Xu, Chris Olivier, Da Zheng
-*/
+ */
-#include "batch_norm-inl.h"
#include <nnvm/op_attr_types.h>
+
#include "../elemwise_op_common.h"
#include "../operator_common.h"
+
+#include "batch_norm-inl.h"
#if MXNET_USE_ONEDNN == 1
#include "./mkldnn/mkldnn_batch_norm-inl.h"
#endif
-/*! \brief inverse standard deviation <-> variance */
-#define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0/std::sqrt((__var$) + DType(__eps$)))
-#define INVSTD_TO_VARIANCE(__invstd$, __eps$) ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$))
-
namespace mxnet {
namespace op {
namespace batchnorm {
@@ -44,15 +42,15 @@ namespace batchnorm {
volatile bool disable_mkl = false;
/*! \brief Fast-foreach when you don't care about the position other than channel */
-template<typename DType, typename OnData>
-static inline void ForEachFast(const BNTensor3<DType> &tensor,
+template <typename DType, typename OnData>
+static inline void ForEachFast(const BNTensor3<DType>& tensor,
const size_t channel,
OnData onData) {
- const size_t num = tensor.OuterSize();
- const size_t matrixSize = tensor.InnerSize();
- const size_t skipLength = tensor.SkipLengthToNextSameChannelData();
+ const size_t num = tensor.OuterSize();
+ const size_t matrixSize = tensor.InnerSize();
+ const size_t skipLength = tensor.SkipLengthToNextSameChannelData();
const size_t startOffset = tensor.StartOffset(channel);
- DType *data = tensor.dptr_ + startOffset;
+ DType* data = tensor.dptr_ + startOffset;
for (size_t outer = 0; outer < num; ++outer) {
for (size_t i = 0; i < matrixSize; ++i) {
@@ -63,9 +61,9 @@ static inline void ForEachFast(const BNTensor3<DType> &tensor,
}
/*! \brief Fast-foreach when you don't care about the position other than channel */
-template<typename DType1, typename DType2, typename OnData>
-static inline void ForEachFast(const BNTensor3<DType1> &in_data,
- const BNTensor3<DType2> &out_data,
+template <typename DType1, typename DType2, typename OnData>
+static inline void ForEachFast(const BNTensor3<DType1>& in_data,
+ const BNTensor3<DType2>& out_data,
const size_t channel,
OnData onData) {
const size_t num = in_data.OuterSize();
@@ -73,22 +71,22 @@ static inline void ForEachFast(const BNTensor3<DType1> &in_data,
const size_t skipLength = in_data.SkipLengthToNextSameChannelData();
const size_t startOffset = in_data.StartOffset(channel);
- DType1 *data = in_data.dptr_ + startOffset;
- DType2 *odata = out_data.dptr_ + startOffset;
+ DType1* data = in_data.dptr_ + startOffset;
+ DType2* odata = out_data.dptr_ + startOffset;
for (size_t outer = 0; outer < num; ++outer) {
for (size_t i = 0; i < matrixSize; ++i) {
onData(data++, odata++);
}
- data += skipLength;
+ data += skipLength;
odata += skipLength;
}
}
-template<typename DType1, typename DType2, typename DType3, typename OnData>
-static inline void ForEachFast(const BNTensor3<DType1> &in_data,
- const BNTensor3<DType2> &in_data2,
- const BNTensor3<DType3> &out_data,
+template <typename DType1, typename DType2, typename DType3, typename OnData>
+static inline void ForEachFast(const BNTensor3<DType1>& in_data,
+ const BNTensor3<DType2>& in_data2,
+ const BNTensor3<DType3>& out_data,
const size_t channel,
OnData onData) {
const size_t num = in_data.OuterSize();
@@ -96,15 +94,15 @@ static inline void ForEachFast(const BNTensor3<DType1> &in_data,
const size_t skipLength = in_data.SkipLengthToNextSameChannelData();
const size_t startOffset = in_data.StartOffset(channel);
- DType1 *data = in_data.dptr_ + startOffset;
- DType2 *data2 = in_data2.dptr_ + startOffset;
- DType3 *odata = out_data.dptr_ + startOffset;
+ DType1* data = in_data.dptr_ + startOffset;
+ DType2* data2 = in_data2.dptr_ + startOffset;
+ DType3* odata = out_data.dptr_ + startOffset;
for (size_t outer = 0; outer < num; ++outer) {
for (size_t i = 0; i < matrixSize; ++i) {
onData(data++, data2++, odata++);
}
- data += skipLength;
+ data += skipLength;
data2 += skipLength;
odata += skipLength;
}
@@ -114,50 +112,50 @@ static inline void ForEachFast(const BNTensor3<DType1> &in_data,
/*! \brief Forward CPU */
template <typename xpu, typename DType, typename AccReal>
-void BatchNormForwardImpl(mshadow::Stream<cpu> *,
- const OpContext &ctx, const BatchNormParam& param_,
- const std::vector<TBlob> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &out_data,
- const std::vector<TBlob> &aux_states) {
+void BatchNormForwardImpl(mshadow::Stream<cpu>*,
+ const OpContext& ctx,
+ const BatchNormParam& param_,
+ const std::vector<TBlob>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& out_data,
+ const std::vector<TBlob>& aux_states) {
// Input
batchnorm::BNTensor3<DType> inputData(in_data[batchnorm::kData], param_.axis);
- const TBlob &weights = in_data[batchnorm::kGamma];
- const TBlob &bias = in_data[batchnorm::kBeta];
+ const TBlob& weights = in_data[batchnorm::kGamma];
+ const TBlob& bias = in_data[batchnorm::kBeta];
// Aux (Moving)
- const TBlob &runningMean = aux_states[batchnorm::kMovingMean];
- const TBlob &runningVariance = aux_states[batchnorm::kMovingVar];
+ const TBlob& runningMean = aux_states[batchnorm::kMovingMean];
+ const TBlob& runningVariance = aux_states[batchnorm::kMovingVar];
// Output
batchnorm::BNTensor3<DType> outputData(out_data[batchnorm::kOut], param_.axis);
- const TBlob &meanVector = out_data[batchnorm::kMean];
- const TBlob &varianceVector = out_data[batchnorm::kVar];
+ const TBlob& meanVector = out_data[batchnorm::kMean];
+ const TBlob& varianceVector = out_data[batchnorm::kVar];
- AccReal *mean = meanVector.dptr<AccReal>();
- AccReal *var = varianceVector.dptr<AccReal>();
+ AccReal* mean = meanVector.dptr<AccReal>();
+ AccReal* var = varianceVector.dptr<AccReal>();
const bool is_train_and_not_global_stats = ctx.is_train && !param_.use_global_stats;
- const size_t channelCount = inputData.ChannelCount();
- const size_t itemCountPerChannel = inputData.Size() / channelCount;
+ const size_t channelCount = inputData.ChannelCount();
+ const size_t itemCountPerChannel = inputData.Size() / channelCount;
- #pragma omp parallel for
+#pragma omp parallel for
for (int channel = 0; channel < static_cast<int>(channelCount); ++channel) {
if (is_train_and_not_global_stats) {
// compute mean per input
mean[channel] = 0;
- ForEachFast(inputData, channel, [mean, channel](const DType *in_data) {
- mean[channel] += *in_data; });
+ ForEachFast(
+ inputData, channel, [mean, channel](const DType* in_data) { mean[channel] += *in_data; });
mean[channel] /= itemCountPerChannel;
// compute variance per input
const AccReal thisMean = mean[channel];
- var[channel] = 0;
- ForEachFast(inputData, channel,
- [var, thisMean, channel](const DType *current_in_data) {
- const AccReal current = *current_in_data;
- var[channel] += (current - thisMean) * (current - thisMean);
- });
+ var[channel] = 0;
+ ForEachFast(inputData, channel, [var, thisMean, channel](const DType* current_in_data) {
+ const AccReal current = *current_in_data;
+ var[channel] += (current - thisMean) * (current - thisMean);
+ });
const AccReal sum = var[channel];
@@ -167,45 +165,49 @@ void BatchNormForwardImpl(mshadow::Stream<cpu> *,
invstd = 0;
} else {
const AccReal variance = sum / itemCountPerChannel;
- invstd = VARIANCE_TO_INVSTD(variance, param_.eps);
+ invstd = VARIANCE_TO_INVSTD(variance, param_.eps);
}
var[channel] = invstd;
} else {
- const AccReal *rm = runningMean.dptr<AccReal>();
- const AccReal *rv = runningVariance.dptr<AccReal>();
+ const AccReal* rm = runningMean.dptr<AccReal>();
+ const AccReal* rv = runningVariance.dptr<AccReal>();
mean[channel] = rm[channel];
- var[channel] = VARIANCE_TO_INVSTD(rv[channel], param_.eps);
+ var[channel] = VARIANCE_TO_INVSTD(rv[channel], param_.eps);
}
// compute output
- AccReal *w = weights.dptr<AccReal>();
- const AccReal *b = bias.dptr<AccReal>();
+ AccReal* w = weights.dptr<AccReal>();
+ const AccReal* b = bias.dptr<AccReal>();
- const AccReal thisMean = mean[channel];
+ const AccReal thisMean = mean[channel];
const AccReal thisInvstd = var[channel];
const AccReal thisWeight = w[channel];
- const AccReal thisBias = b[channel];
+ const AccReal thisBias = b[channel];
// note that var is still invstd
if (!param_.fix_gamma) {
if (IsBNWriting(req[batchnorm::kData])) {
- ForEachFast(inputData, outputData, channel,
- [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data,
- DType *out_data) {
- *out_data = static_cast<DType>(
- ((*in_data - thisMean) * thisInvstd) * thisWeight + thisBias);
- });
+ ForEachFast(
+ inputData,
+ outputData,
+ channel,
+ [thisWeight, thisBias, thisMean, thisInvstd](const DType* in_data, DType* out_data) {
+ *out_data =
+ static_cast<DType>(((*in_data - thisMean) * thisInvstd) * thisWeight + thisBias);
+ });
}
} else {
if (IsBNWriting(req[batchnorm::kGamma])) {
w[channel] = AccReal(1);
}
if (IsBNWriting(req[batchnorm::kData])) {
- ForEachFast(inputData, outputData, channel,
- [thisBias, thisMean, thisInvstd](const DType *in_data, DType *out_data) {
- *out_data = static_cast<DType>(
- ((*in_data - thisMean) * thisInvstd) + thisBias);
+ ForEachFast(inputData,
+ outputData,
+ channel,
+ [thisBias, thisMean, thisInvstd](const DType* in_data, DType* out_data) {
+ *out_data =
+ static_cast<DType>(((*in_data - thisMean) * thisInvstd) + thisBias);
});
}
}
@@ -213,78 +215,80 @@ void BatchNormForwardImpl(mshadow::Stream<cpu> *,
}
template <typename xpu, typename DType, typename AccReal>
-void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
- const OpContext &ctx, const BatchNormParam& param_,
- const std::vector<TBlob> &out_grad,
- const std::vector<TBlob> &in_data,
- const std::vector<TBlob> &out_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &in_grad,
- const std::vector<TBlob> &aux_states) {
+void BatchNormBackwardImpl(mshadow::Stream<cpu>*,
+ const OpContext& ctx,
+ const BatchNormParam& param_,
+ const std::vector<TBlob>& out_grad,
+ const std::vector<TBlob>& in_data,
+ const std::vector<TBlob>& out_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& in_grad,
+ const std::vector<TBlob>& aux_states) {
// Input Data
batchnorm::BNTensor3<DType> inputData(in_data[batchnorm::kData], param_.axis);
- const TBlob &weights = in_data[batchnorm::kGamma];
+ const TBlob& weights = in_data[batchnorm::kGamma];
// Input Grad
batchnorm::BNTensor3<DType> gradIn(in_grad[batchnorm::kData], param_.axis);
- const TBlob &gradWeight = in_grad[batchnorm::kGamma];
- const TBlob &gradBias = in_grad[batchnorm::kBeta];
+ const TBlob& gradWeight = in_grad[batchnorm::kGamma];
+ const TBlob& gradBias = in_grad[batchnorm::kBeta];
// Aux (Moving)
- const TBlob &runningMean = aux_states[batchnorm::kMovingMean];
- const TBlob &runningVariance = aux_states[batchnorm::kMovingVar];
+ const TBlob& runningMean = aux_states[batchnorm::kMovingMean];
+ const TBlob& runningVariance = aux_states[batchnorm::kMovingVar];
// Output
batchnorm::BNTensor3<DType> gradOut(out_grad[batchnorm::kOut], param_.axis);
- const TBlob &saveMean = out_data[batchnorm::kMean];
- const TBlob &saveStd = out_data[batchnorm::kVar];
+ const TBlob& saveMean = out_data[batchnorm::kMean];
+ const TBlob& saveStd = out_data[batchnorm::kVar];
const size_t channelCount = inputData.ChannelCount();
const size_t itemCount = inputData.Size() / channelCount;
// Avoid multiple dptr() call within the channel loop
- AccReal *runningMeanDataPtr = runningMean.dptr<AccReal>();
- AccReal *runningVarDataPtr = runningVariance.dptr<AccReal>();
- const AccReal *saveMeanDataPtr = saveMean.dptr<AccReal>();
- const AccReal *saveInvStdDataPtr = saveStd.dptr<AccReal>();
- AccReal *gradWeightData = gradWeight.dptr<AccReal>();
- AccReal *gradBiasData = gradBias.dptr<AccReal>();
+ AccReal* runningMeanDataPtr = runningMean.dptr<AccReal>();
+ AccReal* runningVarDataPtr = runningVariance.dptr<AccReal>();
+ const AccReal* saveMeanDataPtr = saveMean.dptr<AccReal>();
+ const AccReal* saveInvStdDataPtr = saveStd.dptr<AccReal>();
+ AccReal* gradWeightData = gradWeight.dptr<AccReal>();
+ AccReal* gradBiasData = gradBias.dptr<AccReal>();
const bool is_train_and_not_global_stats = ctx.is_train && !param_.use_global_stats;
- #pragma omp parallel for
+#pragma omp parallel for
for (int channel = 0; channel < static_cast<int>(channelCount); ++channel) {
- const AccReal *weight = weights.dptr<AccReal>();
- const AccReal w = !param_.fix_gamma ? weight[channel] : AccReal(1);
+ const AccReal* weight = weights.dptr<AccReal>();
+ const AccReal w = !param_.fix_gamma ? weight[channel] : AccReal(1);
AccReal mean, invstd;
if (is_train_and_not_global_stats) {
- mean = saveMeanDataPtr[channel];
- invstd = saveInvStdDataPtr[channel];
+ mean = saveMeanDataPtr[channel];
+ invstd = saveInvStdDataPtr[channel];
const AccReal variance = INVSTD_TO_VARIANCE(invstd, param_.eps);
// update running averages
- runningMeanDataPtr[channel] = runningMeanDataPtr[channel] * param_.momentum
- + mean * (AccReal(1) - param_.momentum);
+ runningMeanDataPtr[channel] =
+ runningMeanDataPtr[channel] * param_.momentum + mean * (AccReal(1) - param_.momentum);
- runningVarDataPtr[channel] = runningVarDataPtr[channel] * param_.momentum
- + variance * (AccReal(1) - param_.momentum);
+ runningVarDataPtr[channel] =
+ runningVarDataPtr[channel] * param_.momentum + variance * (AccReal(1) - param_.momentum);
} else {
- mean = runningMeanDataPtr[channel];
+ mean = runningMeanDataPtr[channel];
invstd = VARIANCE_TO_INVSTD(runningVarDataPtr[channel], param_.eps);
}
// sumGradOut over all gradOutput in feature plane
AccReal sumGradOut = 0;
- ForEachFast(gradOut, static_cast<size_t>(channel),
- [&sumGradOut](const DType *gradOut_data) {
- sumGradOut += *gradOut_data;
- });
+ ForEachFast(gradOut, static_cast<size_t>(channel), [&sumGradOut](const DType* gradOut_data) {
+ sumGradOut += *gradOut_data;
+ });
// dot product of the Q(X) and gradOuput
AccReal dotp = 0;
- ForEachFast(inputData, gradOut, static_cast<size_t>(channel),
- [&dotp, mean](const DType *thisInputData, const DType *gradOut_data) {
+ ForEachFast(inputData,
+ gradOut,
+ static_cast<size_t>(channel),
+ [&dotp, mean](const DType* thisInputData, const DType* gradOut_data) {
dotp += (*thisInputData - mean) * (*gradOut_data);
});
@@ -296,28 +300,34 @@ void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
// dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w
// projection of gradOutput on to output scaled by std
- const AccReal k = dotp * invstd * invstd / itemCount;
- const AccReal iw = invstd * w;
+ const AccReal k = dotp * invstd * invstd / itemCount;
+ const AccReal iw = invstd * w;
const AccReal gradMean = sumGradOut / itemCount;
if (req[batchnorm::kData] != kAddTo) {
- ForEachFast(inputData, gradIn, static_cast<size_t>(channel),
- [&mean, &k](const DType *inputDataPtr, DType *gradIn_data) {
+ ForEachFast(inputData,
+ gradIn,
+ static_cast<size_t>(channel),
+ [&mean, &k](const DType* inputDataPtr, DType* gradIn_data) {
*gradIn_data = (*inputDataPtr - mean) * k;
});
- ForEachFast(gradOut, gradIn, static_cast<size_t>(channel),
- [iw, gradMean](const DType *gradOut_data, DType *gradIn_data) {
+ ForEachFast(gradOut,
+ gradIn,
+ static_cast<size_t>(channel),
+ [iw, gradMean](const DType* gradOut_data, DType* gradIn_data) {
*gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * iw;
});
} else {
- ForEachFast(inputData, gradOut, gradIn, static_cast<size_t>(channel),
- [&mean, &k, iw, gradMean](const DType *inputDataPtr,
- const DType *gradOut_data,
- DType *gradIn_data) {
- DType normal_val = (*inputDataPtr - mean) * k;
- *gradIn_data += (*gradOut_data - gradMean -
- normal_val) * iw;
- });
+ ForEachFast(
+ inputData,
+ gradOut,
+ gradIn,
+ static_cast<size_t>(channel),
+ [&mean, &k, iw, gradMean](
+ const DType* inputDataPtr, const DType* gradOut_data, DType* gradIn_data) {
+ DType normal_val = (*inputDataPtr - mean) * k;
+ *gradIn_data += (*gradOut_data - gradMean - normal_val) * iw;
+ });
}
} else {
// when in evaluation mode
@@ -326,13 +336,17 @@ void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
// dL/dX = w / running_std
const AccReal iw = invstd * w;
if (req[batchnorm::kData] != kAddTo) {
- ForEachFast(gradOut, gradIn, static_cast<size_t>(channel),
- [iw](const DType *gradOut_data, DType *gradIn_data) {
+ ForEachFast(gradOut,
+ gradIn,
+ static_cast<size_t>(channel),
+ [iw](const DType* gradOut_data, DType* gradIn_data) {
*gradIn_data = *gradOut_data * iw;
});
} else {
- ForEachFast(gradOut, gradIn, static_cast<size_t>(channel),
- [iw](const DType *gradOut_data, DType *gradIn_data) {
+ ForEachFast(gradOut,
+ gradIn,
+ static_cast<size_t>(channel),
+ [iw](const DType* gradOut_data, DType* gradIn_data) {
*gradIn_data += *gradOut_data * iw;
});
}
@@ -357,20 +371,19 @@ void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
DMLC_REGISTER_PARAMETER(BatchNormParam);
static bool BatchNormShape(const nnvm::NodeAttrs& attrs,
- mxnet::ShapeVector *in_shape,
- mxnet::ShapeVector *out_shape) {
+ mxnet::ShapeVector* in_shape,
+ mxnet::ShapeVector* out_shape) {
const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
using namespace mshadow;
CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, MovingMean, MovingVar]";
CHECK_EQ(out_shape->size(), 3U);
- const mxnet::TShape &dshape = in_shape->at(batchnorm::kData);
+ const mxnet::TShape& dshape = in_shape->at(batchnorm::kData);
if (!mxnet::ndim_is_known(dshape)) {
return false;
}
- const size_t channelAxis = static_cast<size_t>(param.axis < 0
- ? static_cast<int>(dshape.ndim()) + param.axis
- : param.axis);
+ const size_t channelAxis = static_cast<size_t>(
+ param.axis < 0 ? static_cast<int>(dshape.ndim()) + param.axis : param.axis);
CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param.axis;
const index_t channelCount = dshape[channelAxis];
@@ -380,7 +393,6 @@ static bool BatchNormShape(const nnvm::NodeAttrs& attrs,
SHAPE_ASSIGN_CHECK(*in_shape, batchnorm::kInMovingMean, Shape1(channelCount)); // kMovingMean
SHAPE_ASSIGN_CHECK(*in_shape, batchnorm::kInMovingVar, Shape1(channelCount)); // kMovingVar
-
SHAPE_ASSIGN_CHECK(*out_shape, batchnorm::kOut, dshape);
SHAPE_ASSIGN_CHECK(*out_shape, batchnorm::kMean, Shape1(channelCount));
SHAPE_ASSIGN_CHECK(*out_shape, batchnorm::kVar, Shape1(channelCount));
@@ -389,7 +401,8 @@ static bool BatchNormShape(const nnvm::NodeAttrs& attrs,
}
static bool BatchNormType(const nnvm::NodeAttrs& attrs,
- std::vector<int> *in_type, std::vector<int> *out_type) {
+ std::vector<int>* in_type,
+ std::vector<int>* out_type) {
using namespace mshadow;
CHECK_GE(in_type->size(), 1U);
const size_t n_out = 3;
@@ -400,21 +413,21 @@ static bool BatchNormType(const nnvm::NodeAttrs& attrs,
int dtype = (*in_type)[0];
if (type_is_none(dtype)) {
// Input type is undefined, we try backward inference
- if (out_type->size() == 0 || type_is_none((*out_type)[0])) {
- // Neither the input nor the output are defined,
- // types cannot be infered for this op
- return false;
- } else {
- // Input type is undefined but output type is: backward inference
- dtype = (*out_type)[0];
- (*in_type)[0] = dtype;
- MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, {
- dtype_param = mshadow::DataType<AccRealX>::kFlag; });
- }
+ if (out_type->size() == 0 || type_is_none((*out_type)[0])) {
+ // Neither the input nor the output are defined,
+ // types cannot be infered for this op
+ return false;
+ } else {
+ // Input type is undefined but output type is: backward inference
+ dtype = (*out_type)[0];
+ (*in_type)[0] = dtype;
+ MSHADOW_REAL_TYPE_SWITCH_EX(
+ dtype, DTypeX, AccRealX, { dtype_param = mshadow::DataType<AccRealX>::kFlag; });
+ }
} else {
// Input type is defined but output type is not: forward inference
- MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, {
- dtype_param = mshadow::DataType<AccRealX>::kFlag; });
+ MSHADOW_REAL_TYPE_SWITCH_EX(
+ dtype, DTypeX, AccRealX, { dtype_param = mshadow::DataType<AccRealX>::kFlag; });
out_type->clear();
out_type->push_back(dtype);
for (size_t i = 1; i < n_out; ++i) {
@@ -434,29 +447,30 @@ static bool BatchNormType(const nnvm::NodeAttrs& attrs,
}
#if MXNET_USE_ONEDNN == 1
-static inline bool SupportMKLDNNBN(const NDArray &input, const BatchNormParam ¶m) {
- if (mxnet::op::batchnorm::disable_mkl) return false;
+static inline bool SupportMKLDNNBN(const NDArray& input, const BatchNormParam& param) {
+ if (mxnet::op::batchnorm::disable_mkl)
+ return false;
const mxnet::TShape shape = input.shape();
- const int ndim = shape.ndim();
- if (ndim == 0 || shape.Size() == 0) return false;
+ const int ndim = shape.ndim();
+ if (ndim == 0 || shape.Size() == 0)
+ return false;
const int dtype = input.dtype();
- return (dtype == mshadow::kFloat32 ||
- dtype == mshadow::kBfloat16) &&
- SupportStorageMKLDNN(input.storage_type());
+ return (dtype == mshadow::kFloat32 || dtype == mshadow::kBfloat16) &&
+ SupportStorageMKLDNN(input.storage_type());
}
-void BatchNormComputeExCPU(const nnvm::NodeAttrs &attrs,
- const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+void BatchNormComputeExCPU(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
CHECK_EQ(inputs.size(), 5U);
- const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
- bool fuse_relu = false;
+ const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+ bool fuse_relu = false;
if (SupportMKLDNNBN(inputs[0], param)) {
MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
MKLDNN_REAL_TYPE_SWITCH(inputs[0].dtype(), DTYPE, {
- MKLDNNBatchNormForward<DTYPE>(attrs, ctx, inputs, req, outputs, fuse_relu);
+ MKLDNNBatchNormForward<DTYPE>(attrs, ctx, inputs, req, outputs, fuse_relu);
});
MKLDNN_OPCHECK_RUN(BatchNormCompute<cpu>, attrs, ctx, inputs, req, outputs);
return;
@@ -464,45 +478,45 @@ void BatchNormComputeExCPU(const nnvm::NodeAttrs &attrs,
FallBackCompute(BatchNormCompute<cpu>, attrs, ctx, inputs, req, outputs);
}
-void BatchNormGradComputeExCPU(const nnvm::NodeAttrs &attrs,
- const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
- const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
- bool fuse_relu = false;
+void BatchNormGradComputeExCPU(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
+ const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+ bool fuse_relu = false;
if (SupportMKLDNNBN(inputs[0], param)) {
- MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
- MKLDNNBatchNormBackward<float>(attrs, ctx, inputs, req, outputs, fuse_relu);
- MKLDNN_OPCHECK_RUN(BatchNormGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
- return;
+ MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+ MKLDNNBatchNormBackward<float>(attrs, ctx, inputs, req, outputs, fuse_relu);
+ MKLDNN_OPCHECK_RUN(BatchNormGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+ return;
}
FallBackCompute(BatchNormGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
}
#endif
-static inline bool BatchNormStorageType(const nnvm::NodeAttrs &attrs,
+static inline bool BatchNormStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
- DispatchMode *dispatch_mode,
- std::vector<int> *in_attrs,
- std::vector<int> *out_attrs) {
- const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
+ DispatchMode* dispatch_mode,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
+ const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
bool dispatched = false;
#if MXNET_USE_ONEDNN == 1
if (!dispatched) {
- dispatched = MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode,
- in_attrs, out_attrs);
+ dispatched = MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
}
if (!MKLDNNEnvSet()) {
*dispatch_mode = DispatchMode::kFComputeFallback;
}
#else
for (int& v : *in_attrs)
- if (v == - 1) v = kDefaultStorage;
+ if (v == -1)
+ v = kDefaultStorage;
if (!dispatched && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) {
- dispatched = storage_type_assign(out_attrs, kDefaultStorage,
- dispatch_mode, DispatchMode::kFCompute);
+ dispatched =
+ storage_type_assign(out_attrs, kDefaultStorage, dispatch_mode, DispatchMode::kFCompute);
}
if (!dispatched) {
dispatched = dispatch_fallback(out_attrs, dispatch_mode);
@@ -532,10 +546,10 @@ std::vector<nnvm::NodeEntry> BatchNormGrad(const nnvm::ObjectPtr& n,
heads.emplace_back(n->inputs.at(batchnorm::kInMovingVar));
nnvm::ObjectPtr gnode = nnvm::Node::Create();
- gnode->inputs = std::move(heads);
+ gnode->inputs = std::move(heads);
gnode->control_deps.emplace_back(n);
- gnode->attrs = n->attrs;
- gnode->attrs.op = nnvm::Op::Get("_backward_BatchNorm");
+ gnode->attrs = n->attrs;
+ gnode->attrs.op = nnvm::Op::Get("_backward_BatchNorm");
gnode->attrs.name = n->attrs.name + "_backward";
// The input of batchnorm
std::vector<nnvm::NodeEntry> in_grad;
@@ -544,8 +558,8 @@ std::vector<nnvm::NodeEntry> BatchNormGrad(const nnvm::ObjectPtr& n,
in_grad.emplace_back(gnode, i, 0);
// attach no gradient node to forbid gradient on aux_state
nnvm::ObjectPtr ng = nnvm::Node::Create();
- ng->attrs.op = Op::Get("_NoGradient");
- ng->attrs.name = "NoGradient";
+ ng->attrs.op = Op::Get("_NoGradient");
+ ng->attrs.name = "NoGradient";
// the aux state of batchnorm
for (size_t i = 3; i < 5; ++i)
in_grad.emplace_back(ng);
@@ -553,8 +567,8 @@ std::vector<nnvm::NodeEntry> BatchNormGrad(const nnvm::ObjectPtr& n,
}
NNVM_REGISTER_OP(BatchNorm)
-.add_alias("_npx_batch_norm")
-.describe(R"code(Batch normalization.
+ .add_alias("_npx_batch_norm")
+ .describe(R"code(Batch normalization.
Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as
well as offset ``beta``.
@@ -604,73 +618,80 @@ then set ``gamma`` to 1 and its gradient to 0.
the sparse tensors will fallback.
)code" ADD_FILELINE)
-.set_num_inputs(5)
-.set_num_outputs(3)
-.set_attr_parser(ParamParser<BatchNormParam>)
-.set_attr<nnvm::FListInputNames>("FListInputNames",
- [](const NodeAttrs& attrs) {
- return std::vector<std::string>{"data", "gamma", "beta", "moving_mean", "moving_var"};
-})
-.set_attr<nnvm::FListOutputNames>("FListOutputNames",
- [](const NodeAttrs& attrs) {
- return std::vector<std::string>{"output", "mean", "var"};
-})
-.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
- [](const NodeAttrs& attrs) {
- const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
- return param.output_mean_var ? 3 : 1;
-})
-.set_attr<nnvm::FMutateInputs>("FMutateInputs", [](const nnvm::NodeAttrs& attrs) {
- return std::vector<uint32_t>{3, 4};
-})
-.set_attr<mxnet::FInferShape>("FInferShape", BatchNormShape)
-.set_attr<nnvm::FInferType>("FInferType", BatchNormType)
-.set_attr<FInferStorageType>("FInferStorageType", BatchNormStorageType)
-.set_attr<FCompute>("FCompute<cpu>", BatchNormCompute<cpu>)
+ .set_num_inputs(5)
+ .set_num_outputs(3)
+ .set_attr_parser(ParamParser<BatchNormParam>)
+ .set_attr<nnvm::FListInputNames>(
+ "FListInputNames",
+ [](const NodeAttrs& attrs) {
+ return std::vector<std::string>{"data", "gamma", "beta", "moving_mean", "moving_var"};
+ })
+ .set_attr<nnvm::FListOutputNames>("FListOutputNames",
+ [](const NodeAttrs& attrs) {
+ return std::vector<std::string>{"output", "mean", "var"};
+ })
+ .set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
+ [](const NodeAttrs& attrs) {
+ const BatchNormParam& param =
+ nnvm::get<BatchNormParam>(attrs.parsed);
+ return param.output_mean_var ? 3 : 1;
+ })
+ .set_attr<nnvm::FMutateInputs>("FMutateInputs",
+ [](const nnvm::NodeAttrs& attrs) {
+ return std::vector<uint32_t>{3, 4};
+ })
+ .set_attr<mxnet::FInferShape>("FInferShape", BatchNormShape)
+ .set_attr<nnvm::FInferType>("FInferType", BatchNormType)
+ .set_attr<FInferStorageType>("FInferStorageType", BatchNormStorageType)
+ .set_attr<FCompute>("FCompute<cpu>", BatchNormCompute<cpu>)
#if MXNET_USE_ONEDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormComputeExCPU)
+ .set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormComputeExCPU)
#endif
-.set_attr<nnvm::FGradient>("FGradient", BatchNormGrad)
+ .set_attr<nnvm::FGradient>("FGradient", BatchNormGrad)
#if MXNET_USE_ONEDNN == 1
-.set_attr<bool>("TIsMKLDNN", true)
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
- return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
+ .set_attr<bool>("TIsMKLDNN", true)
+ .set_attr<FResourceRequest>("FResourceRequest",
+ [](const NodeAttrs& n) {
+ return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+ })
#endif
-.add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization")
-.add_argument("gamma", "NDArray-or-Symbol", "gamma array")
-.add_argument("beta", "NDArray-or-Symbol", "beta array")
-.add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input")
-.add_argument("moving_var", "NDArray-or-Symbol", "running variance of input")
-.add_arguments(BatchNormParam::__FIELDS__())
-.set_attr<nnvm::FSetInputVarAttrOnCompose>(
- "FSetInputVarAttrOnCompose",
- [](const nnvm::NodeAttrs& attrs, nnvm::ObjectPtr var, const int index) {
- if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return;
- if (index == 3) {
- var->attrs.dict["__init__"] = "[\"zero\", {}]";
- } else if (index == 4) {
- var->attrs.dict["__init__"] = "[\"one\", {}]";
- }
- });
+ .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization")
+ .add_argument("gamma", "NDArray-or-Symbol", "gamma array")
+ .add_argument("beta", "NDArray-or-Symbol", "beta array")
+ .add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input")
+ .add_argument("moving_var", "NDArray-or-Symbol", "running variance of input")
+ .add_arguments(BatchNormParam::__FIELDS__())
+ .set_attr<nnvm::FSetInputVarAttrOnCompose>(
+ "FSetInputVarAttrOnCompose",
+ [](const nnvm::NodeAttrs& attrs, nnvm::ObjectPtr var, const int index) {
+ if (var->attrs.dict.find("__init__") != var->attrs.dict.end())
+ return;
+ if (index == 3) {
+ var->attrs.dict["__init__"] = "[\"zero\", {}]";
+ } else if (index == 4) {
+ var->attrs.dict["__init__"] = "[\"one\", {}]";
+ }
+ });
NNVM_REGISTER_OP(_backward_BatchNorm)
-.set_num_inputs(8)
-.set_num_outputs(3)
-.set_attr<nnvm::FMutateInputs>("FMutateInputs", [](const nnvm::NodeAttrs& attrs) {
- return std::vector<uint32_t>{6, 7}; // moving_mean, moving_var
-})
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr<FInferStorageType>("FInferStorageType", BatchNormStorageType)
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
- return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-.set_attr_parser(ParamParser<BatchNormParam>)
+ .set_num_inputs(8)
+ .set_num_outputs(3)
+ .set_attr<nnvm::FMutateInputs>("FMutateInputs",
+ [](const nnvm::NodeAttrs& attrs) {
+ return std::vector<uint32_t>{6, 7}; // moving_mean, moving_var
+ })
+ .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+ .set_attr<FInferStorageType>("FInferStorageType", BatchNormStorageType)
+ .set_attr<FResourceRequest>("FResourceRequest",
+ [](const NodeAttrs& n) {
+ return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+ })
+ .set_attr_parser(ParamParser<BatchNormParam>)
#if MXNET_USE_ONEDNN == 1
-.set_attr<bool>("TIsMKLDNN", true)
-.set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormGradComputeExCPU)
+ .set_attr<bool>("TIsMKLDNN", true)
+ .set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormGradComputeExCPU)
#endif
-.set_attr<FCompute>("FCompute<cpu>", BatchNormGradCompute<cpu>);
+ .set_attr<FCompute>("FCompute<cpu>", BatchNormGradCompute<cpu>);
} // namespace op
} // namespace mxnet
diff --git a/src/operator/nn/mkldnn/mkldnn_act-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h
index 0c6e856..ca3e4aa 100644
--- a/src/operator/nn/mkldnn/mkldnn_act-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_act-inl.h
@@ -22,17 +22,17 @@
* \file mkldnn_act-inl.h
* \brief MKLDNN Activation operator
* /author Zhiyuan Huang
-*/
+ */
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_
-
#if MXNET_USE_ONEDNN == 1
-#include <vector>
#include <utility>
-#include "../activation-inl.h"
+#include <vector>
+
#include "../../leaky_relu-inl.h"
+#include "../activation-inl.h"
namespace mxnet {
namespace op {
@@ -42,53 +42,56 @@ struct MKLDNNActParam {
float slope = 0.f;
bool operator==(const MKLDNNActParam& other) const {
- return this->alg == other.alg &&
- this->slope == other.slope;
+ return this->alg == other.alg && this->slope == other.slope;
}
};
mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param);
mkldnn::algorithm GetMKLDNNActAlgo(const LeakyReLUParam& param);
-mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(
- const MKLDNNActParam& param, bool is_train,
- const mkldnn::memory &input_mem);
+mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(const MKLDNNActParam& param,
+ bool is_train,
+ const mkldnn::memory& input_mem);
class MKLDNNActForward {
public:
const mkldnn::eltwise_forward::primitive_desc fwd_pd;
- MKLDNNActForward(const MKLDNNActParam& param, bool is_train,
- const NDArray &data, const mkldnn::memory &mem): fwd_pd(
- GetActFwdDescImpl(param, is_train, mem)) {
+ MKLDNNActForward(const MKLDNNActParam& param,
+ bool is_train,
+ const NDArray& data,
+ const mkldnn::memory& mem)
+ : fwd_pd(GetActFwdDescImpl(param, is_train, mem)) {
fwd_ = std::make_shared<mkldnn::eltwise_forward>(fwd_pd);
}
- const inline mkldnn::eltwise_forward &GetFwd() const;
+ const inline mkldnn::eltwise_forward& GetFwd() const;
private:
std::shared_ptr<mkldnn::eltwise_forward> fwd_;
};
typedef ParamOpSign<MKLDNNActParam> MKLDNNActSignature;
-MKLDNNActForward &GetActForward(const MKLDNNActParam& param,
- const OpContext &ctx, const NDArray &in_data,
- const mkldnn::memory &in_mem);
+MKLDNNActForward& GetActForward(const MKLDNNActParam& param,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const mkldnn::memory& in_mem);
-mkldnn::eltwise_backward::primitive_desc GetActBwdDescImpl(
- const MKLDNNActParam &param, const mkldnn::memory &input_mem,
- const mkldnn::memory &diff_dst_memory);
+mkldnn::eltwise_backward::primitive_desc GetActBwdDescImpl(const MKLDNNActParam& param,
+ const mkldnn::memory& input_mem,
+ const mkldnn::memory& diff_dst_memory);
class MKLDNNActBackward {
public:
const mkldnn::eltwise_backward::primitive_desc bwd_pd;
- explicit MKLDNNActBackward(const MKLDNNActParam &param, const NDArray &data,
- const mkldnn::memory &mem,
- const mkldnn::memory &diff_dst_memory): bwd_pd(
- GetActBwdDescImpl(param, mem, diff_dst_memory)) {
+ explicit MKLDNNActBackward(const MKLDNNActParam& param,
+ const NDArray& data,
+ const mkldnn::memory& mem,
+ const mkldnn::memory& diff_dst_memory)
+ : bwd_pd(GetActBwdDescImpl(param, mem, diff_dst_memory)) {
bwd_prim_ = std::make_shared<mkldnn::eltwise_backward>(bwd_pd);
}
- const inline mkldnn::eltwise_backward &GetBwd() const;
+ const inline mkldnn::eltwise_backward& GetBwd() const;
private:
std::shared_ptr<mkldnn::eltwise_backward> bwd_prim_;
@@ -97,12 +100,12 @@ class MKLDNNActBackward {
} // namespace mxnet
namespace std {
-template<>
+template <>
struct hash<mxnet::op::MKLDNNActParam> {
size_t operator()(const mxnet::op::MKLDNNActParam& val) {
size_t ret = 0;
- ret = dmlc::HashCombine(ret, static_cast<size_t>(val.alg));
- ret = dmlc::HashCombine(ret, val.slope);
+ ret = dmlc::HashCombine(ret, static_cast<size_t>(val.alg));
+ ret = dmlc::HashCombine(ret, val.slope);
return ret;
}
};
diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc
index 43c198f..afaf5e9 100644
--- a/src/operator/nn/mkldnn/mkldnn_act.cc
+++ b/src/operator/nn/mkldnn/mkldnn_act.cc
@@ -21,59 +21,57 @@
* \file mkldnn_act.cc
* \brief
* \author Da Zheng
-*/
+ */
#if MXNET_USE_ONEDNN == 1
#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <mxnet/operator.h>
+
#include <algorithm>
#include <map>
-#include <vector>
#include <string>
#include <utility>
+#include <vector>
+
+#include "./mkldnn_base-inl.h"
+
#include "../../operator_common.h"
+
#include "mkldnn_act-inl.h"
-#include "./mkldnn_base-inl.h"
namespace mxnet {
namespace op {
bool SupportMKLDNNAct(const ActivationParam& param) {
- return param.act_type == activation::kReLU
- || param.act_type == activation::kSigmoid
- || param.act_type == activation::kLogSigmoid
- || param.act_type == activation::kMish
- || param.act_type == activation::kSoftReLU
- || param.act_type == activation::kTanh;
+ return param.act_type == activation::kReLU || param.act_type == activation::kSigmoid ||
+ param.act_type == activation::kLogSigmoid || param.act_type == activation::kMish ||
+ param.act_type == activation::kSoftReLU || param.act_type == activation::kTanh;
}
-bool SupportMKLDNNAct(const ActivationParam& param, const NDArray &input) {
+bool SupportMKLDNNAct(const ActivationParam& param, const NDArray& input) {
// MKL-DNN Activation supports 1d, 2d, 3d, 4d and 5d data layout
- if ((input.shape().ndim() < 1) ||
- (input.shape().ndim() > 5) ||
+ if ((input.shape().ndim() < 1) || (input.shape().ndim() > 5) ||
!(input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16))
return false;
return SupportMKLDNNAct(param);
}
bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param) {
- return param.act_type == leakyrelu::kLeakyReLU
- || param.act_type == leakyrelu::kELU
- || param.act_type == leakyrelu::kGELU;
+ return param.act_type == leakyrelu::kLeakyReLU || param.act_type == leakyrelu::kELU ||
+ param.act_type == leakyrelu::kGELU;
}
-bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param, const NDArray &input) {
+bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param, const NDArray& input) {
// MKL-DNN Activation supports 1d, 2d, 3d, 4d and 5d data layout
- if ((input.shape().ndim() < 1) ||
- (input.shape().ndim() > 5) ||
+ if ((input.shape().ndim() < 1) || (input.shape().ndim() > 5) ||
!(input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16))
return false;
return SupportMKLDNNLeakyRelu(param);
}
-bool SupportQuantizedMKLDNNAct(const ActivationParam &param) {
+bool SupportQuantizedMKLDNNAct(const ActivationParam& param) {
// TODO(zhennan): Add more activation type when mkldnn supports.
// Remove this when it's identity to SupportMKLDNNAct.
return param.act_type == activation::kReLU;
@@ -113,26 +111,26 @@ mkldnn::algorithm GetMKLDNNActAlgo(const LeakyReLUParam& param) {
}
}
-mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(
- const MKLDNNActParam& param, bool is_train,
- const mkldnn::memory &input_mem) {
+mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(const MKLDNNActParam& param,
+ bool is_train,
+ const mkldnn::memory& input_mem) {
mkldnn::memory::desc data_md = input_mem.get_desc();
- auto cpu_engine = CpuEngine::Get()->get_engine();
- auto alg = param.alg;
+ auto cpu_engine = CpuEngine::Get()->get_engine();
+ auto alg = param.alg;
- auto prop = is_train ? mkldnn::prop_kind::forward_training :
- mkldnn::prop_kind::forward_scoring;
+ auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
auto desc = mkldnn::eltwise_forward::desc(prop, alg, data_md, param.slope);
return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine);
}
-const inline mkldnn::eltwise_forward &MKLDNNActForward::GetFwd() const {
+const inline mkldnn::eltwise_forward& MKLDNNActForward::GetFwd() const {
return *fwd_;
}
-MKLDNNActForward &GetActForward(const MKLDNNActParam& param,
- const OpContext &ctx, const NDArray &in_data,
- const mkldnn::memory &in_mem) {
+MKLDNNActForward& GetActForward(const MKLDNNActParam& param,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const mkldnn::memory& in_mem) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<MKLDNNActSignature, MKLDNNActForward, OpHash> fwds;
#else
@@ -151,72 +149,75 @@ MKLDNNActForward &GetActForward(const MKLDNNActParam& param,
return it->second;
}
-void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
- const NDArray &in_data, const OpReqType &req,
- const NDArray &out_data) {
+void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const OpReqType& req,
+ const NDArray& out_data) {
const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
MKLDNNActParam param_;
- param_.alg = GetMKLDNNActAlgo(param);
+ param_.alg = GetMKLDNNActAlgo(param);
const NDArray& in_buffer = in_data;
- MKLDNNStream *stream = MKLDNNStream::Get();
- auto input_mem = in_buffer.GetMKLDNNData();
- MKLDNNActForward &fwd = GetActForward(param_, ctx, in_buffer, *input_mem);
- auto out_mem_t = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer);
+ MKLDNNStream* stream = MKLDNNStream::Get();
+ auto input_mem = in_buffer.GetMKLDNNData();
+ MKLDNNActForward& fwd = GetActForward(param_, ctx, in_buffer, *input_mem);
+ auto out_mem_t = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer);
stream->RegisterPrimArgs(fwd.GetFwd(),
- {{ MKLDNN_ARG_SRC, *input_mem}, { MKLDNN_ARG_DST, *out_mem_t.second}});
+ {{MKLDNN_ARG_SRC, *input_mem}, {MKLDNN_ARG_DST, *out_mem_t.second}});
CommitOutput(out_data, out_mem_t);
stream->Submit();
}
-void MKLDNNLeakyReluForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
- const NDArray &in_data, const OpReqType &req,
- const NDArray &out_data) {
+void MKLDNNLeakyReluForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const OpReqType& req,
+ const NDArray& out_data) {
const LeakyReLUParam& param = nnvm::get<LeakyReLUParam>(attrs.parsed);
MKLDNNActParam param_;
- param_.alg = GetMKLDNNActAlgo(param);
+ param_.alg = GetMKLDNNActAlgo(param);
param_.slope = param.slope;
- NDArray in_buffer = in_data;
- MKLDNNStream *stream = MKLDNNStream::Get();
+ NDArray in_buffer = in_data;
+ MKLDNNStream* stream = MKLDNNStream::Get();
if (in_data.IsView() && in_data.IsMKLDNNData())
in_buffer = in_data.Reorder2Default();
- auto input_mem = in_buffer.GetMKLDNNData();
- MKLDNNActForward &fwd = GetActForward(param_, ctx, in_buffer, *input_mem);
- auto out_mem_t = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer);
+ auto input_mem = in_buffer.GetMKLDNNData();
+ MKLDNNActForward& fwd = GetActForward(param_, ctx, in_buffer, *input_mem);
+ auto out_mem_t = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer);
stream->RegisterPrimArgs(fwd.GetFwd(),
- {{ MKLDNN_ARG_SRC, *input_mem}, { MKLDNN_ARG_DST, *out_mem_t.second}});
+ {{MKLDNN_ARG_SRC, *input_mem}, {MKLDNN_ARG_DST, *out_mem_t.second}});
CommitOutput(out_data, out_mem_t);
stream->Submit();
}
-mkldnn::eltwise_backward::primitive_desc GetActBwdDescImpl(
- const MKLDNNActParam &param, const mkldnn::memory &input_mem,
- const mkldnn::memory &diff_dst_memory) {
+mkldnn::eltwise_backward::primitive_desc GetActBwdDescImpl(const MKLDNNActParam& param,
+ const mkldnn::memory& input_mem,
+ const mkldnn::memory& diff_dst_memory) {
mkldnn::memory::desc data_md = input_mem.get_desc();
mkldnn::memory::desc diff_md = diff_dst_memory.get_desc();
- auto cpu_engine = CpuEngine::Get()->get_engine();
- auto alg = param.alg;
+ auto cpu_engine = CpuEngine::Get()->get_engine();
+ auto alg = param.alg;
- mkldnn::eltwise_forward::desc fw_desc(mkldnn::prop_kind::forward_training,
- alg, data_md, param.slope);
+ mkldnn::eltwise_forward::desc fw_desc(
+ mkldnn::prop_kind::forward_training, alg, data_md, param.slope);
mkldnn::eltwise_forward::primitive_desc fw_pdesc(fw_desc, cpu_engine);
mkldnn::eltwise_backward::desc bw_desc(alg, diff_md, data_md, param.slope);
- mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine,
- fw_pdesc);
+ mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine, fw_pdesc);
return bw_pdesc;
}
-const inline mkldnn::eltwise_backward &MKLDNNActBackward::GetBwd() const {
+const inline mkldnn::eltwise_backward& MKLDNNActBackward::GetBwd() const {
return *bwd_prim_;
}
-static inline MKLDNNActBackward &GetActBackward(const MKLDNNActParam &param,
- const OpContext &ctx,
- const NDArray &in_data,
- const NDArray &out_grad,
- const mkldnn::memory &in_mem) {
+static inline MKLDNNActBackward& GetActBackward(const MKLDNNActParam& param,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const NDArray& out_grad,
+ const mkldnn::memory& in_mem) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<MKLDNNActSignature, MKLDNNActBackward, OpHash> bwds;
#else
@@ -236,37 +237,38 @@ static inline MKLDNNActBackward &GetActBackward(const MKLDNNActParam &param,
// For backward relu activation, it's okay to pass "out_data" as "in_data" to this
// function, since the computation only involes non-zeros.
-void MKLDNNActivationBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &inputs, const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
if (req[0] == kNullOp) {
return;
}
const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
// XXX: for y = relu(x), y is passed as "in_data" to Backward()
- const bool relu = param.act_type == activation::kReLU;
- const NDArray &out_buffer = inputs[0];
- const NDArray &in_buffer = relu ? inputs[1] : inputs[2];
- const NDArray &in_grad = outputs[0];
+ const bool relu = param.act_type == activation::kReLU;
+ const NDArray& out_buffer = inputs[0];
+ const NDArray& in_buffer = relu ? inputs[1] : inputs[2];
+ const NDArray& in_grad = outputs[0];
MKLDNNActParam param_;
param_.alg = GetMKLDNNActAlgo(param);
TmpMemMgr::Get()->Init(ctx.requested[activation::kTempSpace]);
auto diff_dst_memory = out_buffer.GetMKLDNNData();
- auto input_mem = in_buffer.GetMKLDNNData();
+ auto input_mem = in_buffer.GetMKLDNNData();
// We need to make sure the two inputs to eltwise_backward has the same memory
// descriptor. Otherwise, the perf will suffer.
if (input_mem->get_desc() != diff_dst_memory->get_desc()) {
input_mem = in_buffer.GetMKLDNNDataReorder(diff_dst_memory->get_desc());
}
- MKLDNNActBackward &bwd = GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem);
- MKLDNNStream *stream = MKLDNNStream::Get();
- mkldnn_args_map_t args = {{MKLDNN_ARG_SRC, *input_mem},
- {MKLDNN_ARG_DIFF_DST, *diff_dst_memory}};
+ MKLDNNActBackward& bwd = GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem);
+ MKLDNNStream* stream = MKLDNNStream::Get();
+ mkldnn_args_map_t args = {{MKLDNN_ARG_SRC, *input_mem}, {MKLDNN_ARG_DIFF_DST, *diff_dst_memory}};
if (req[0] != kAddTo) {
// req[0] is kWriteTo or kWriteInplace
auto diff_src_memory =
- const_cast<NDArray &>(in_grad).CreateMKLDNNData(bwd.bwd_pd.diff_src_desc());
+ const_cast<NDArray&>(in_grad).CreateMKLDNNData(bwd.bwd_pd.diff_src_desc());
args.insert({MKLDNN_ARG_DIFF_SRC, *diff_src_memory});
stream->RegisterPrimArgs(bwd.GetBwd(), args);
stream->Submit();
@@ -280,40 +282,38 @@ void MKLDNNActivationBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx
}
void MKLDNNLeakyReluBackward(const nnvm::NodeAttrs& attrs,
- const OpContext &ctx,
+ const OpContext& ctx,
const std::vector<NDArray>& inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
if (req[0] == kNullOp) {
return;
}
CHECK_EQ(inputs.size(), 2U);
CHECK_EQ(outputs.size(), 1U);
const NDArray& out_buffer = inputs[0];
- const NDArray& in_buffer = inputs[1];
- const NDArray &output = outputs[0];
+ const NDArray& in_buffer = inputs[1];
+ const NDArray& output = outputs[0];
const LeakyReLUParam& param = nnvm::get<LeakyReLUParam>(attrs.parsed);
MKLDNNActParam param_;
- param_.alg = GetMKLDNNActAlgo(param);
+ param_.alg = GetMKLDNNActAlgo(param);
param_.slope = param.slope;
TmpMemMgr::Get()->Init(ctx.requested[leakyrelu::kRandom]);
auto diff_dst_memory = out_buffer.GetMKLDNNData();
- auto input_mem = in_buffer.GetMKLDNNData();
+ auto input_mem = in_buffer.GetMKLDNNData();
// We need to make sure the two inputs to eltwise_backward has the same memory
// descriptor. Otherwise, the perf will suffer.
if (input_mem->get_desc() != diff_dst_memory->get_desc())
input_mem = in_buffer.GetMKLDNNDataReorder(diff_dst_memory->get_desc());
- MKLDNNActBackward &bwd =
- GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem);
- MKLDNNStream *stream = MKLDNNStream::Get();
- mkldnn_output_t diff_src_memory =
- CreateMKLDNNMem(output, bwd.bwd_pd.diff_src_desc(), req[0]);
- mkldnn_args_map_t args = {
- { MKLDNN_ARG_SRC, *input_mem },
- { MKLDNN_ARG_DIFF_DST, *diff_dst_memory },
- { MKLDNN_ARG_DIFF_SRC, *diff_src_memory.second },
+ MKLDNNActBackward& bwd = GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem);
+ MKLDNNStream* stream = MKLDNNStream::Get();
+ mkldnn_output_t diff_src_memory = CreateMKLDNNMem(output, bwd.bwd_pd.diff_src_desc(), req[0]);
+ mkldnn_args_map_t args = {
+ {MKLDNN_ARG_SRC, *input_mem},
+ {MKLDNN_ARG_DIFF_DST, *diff_dst_memory},
+ {MKLDNN_ARG_DIFF_SRC, *diff_src_memory.second},
};
stream->RegisterPrimArgs(bwd.GetBwd(), args);
CommitOutput(output, diff_src_memory);
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index 49a4ce4..2cef524 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -18,30 +18,30 @@
*/
/*******************************************************************************
-* Copyright 2016-2017 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkldnn_base-inl.h
-* \brief
-* \author young.jin.kim@intel.com
-* ashok.emani@intel.com
-* deepthi.karkada@intel.com
-* louis.feng@intel.com
-* adam.d.straw@intel.com
-* zhengda1936@gmail.com
-*
-*******************************************************************************/
+ * Copyright 2016-2017 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * \file mkldnn_base-inl.h
+ * \brief
+ * \author young.jin.kim@intel.com
+ * ashok.emani@intel.com
+ * deepthi.karkada@intel.com
+ * louis.feng@intel.com
+ * adam.d.straw@intel.com
+ * zhengda1936@gmail.com
+ *
+ *******************************************************************************/
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
@@ -54,28 +54,25 @@
#include <unordered_map>
#include <utility>
#include <vector>
+
#include "mkldnn.hpp"
#include "mxnet/graph_attr_types.h"
#include "mxnet/ndarray.h"
#include "mxnet/op_attr_types.h"
#include "mxnet/resource.h"
-#define MKLDNN_REAL_TYPE_SWITCH(type, DType, ...) \
- switch (type) { \
- case mshadow::kFloat32: \
- { \
- typedef float DType; \
- {__VA_ARGS__} \
- } \
- break; \
- case mshadow::kBfloat16: \
- { \
- typedef mshadow::bfloat::bf16_t DType; \
- {__VA_ARGS__} \
- } \
- break; \
- default: \
- LOG(FATAL) << "Unknown type enum " << type; \
+#define MKLDNN_REAL_TYPE_SWITCH(type, DType, ...) \
+ switch (type) { \
+ case mshadow::kFloat32: { \
+ typedef float DType; \
+ { __VA_ARGS__ } \
+ } break; \
+ case mshadow::kBfloat16: { \
+ typedef mshadow::bfloat::bf16_t DType; \
+ { __VA_ARGS__ } \
+ } break; \
+ default: \
+ LOG(FATAL) << "Unknown type enum " << type; \
}
namespace mxnet {
@@ -84,18 +81,20 @@ namespace mxnet {
// cpu_engine singleton
class CpuEngine {
public:
- static CpuEngine *Get() {
+ static CpuEngine* Get() {
// I's thread-safe in C++11.
// ensure same mkldnn engine is used across threads
static CpuEngine myInstance;
return &myInstance;
}
- CpuEngine(CpuEngine const &) = delete; // Copy construct
- CpuEngine(CpuEngine &&) = delete; // Move construct
- CpuEngine &operator=(CpuEngine const &) = delete; // Copy assign
- CpuEngine &operator=(CpuEngine &&) = delete; // Move assign
+ CpuEngine(CpuEngine const&) = delete; // Copy construct
+ CpuEngine(CpuEngine&&) = delete; // Move construct
+ CpuEngine& operator=(CpuEngine const&) = delete; // Copy assign
+ CpuEngine& operator=(CpuEngine&&) = delete; // Move assign
- mkldnn::engine &get_engine() { return _cpu_engine; }
+ mkldnn::engine& get_engine() {
+ return _cpu_engine;
+ }
protected:
CpuEngine() : _cpu_engine(mkldnn::engine::kind::cpu, 0) {}
@@ -134,10 +133,10 @@ struct data_type_enum<uint8_t> {
enum { type = static_cast<unsigned int>(mkldnn::memory::data_type::u8) };
};
-static inline bool SupportMKLDNNArray(int dtype, const mxnet::TShape &shape) {
- int ndim = shape.ndim();
+static inline bool SupportMKLDNNArray(int dtype, const mxnet::TShape& shape) {
+ int ndim = shape.ndim();
bool support = ndim == 1 || ndim == 2 || ndim == 4;
- support = support &&
+ support = support &&
(dtype == mshadow::kFloat32 || dtype == mshadow::kInt32 || dtype == mshadow::kInt8 ||
dtype == mshadow::kUint8 || dtype == mshadow::kBfloat16);
return support;
@@ -147,24 +146,23 @@ static inline bool SupportStorageMKLDNN(int stype) {
return stype == kDefaultStorage;
}
-static inline bool SupportMKLDNN(int dtype, const mxnet::TShape &shape) {
+static inline bool SupportMKLDNN(int dtype, const mxnet::TShape& shape) {
int ndim = shape.ndim();
if (ndim == 0 || shape.Size() == 0) {
// MKLDNN currently does not support 0-dim Tensor and 0-size Tensor
return false;
}
return (dtype == mshadow::kFloat32 || dtype == mshadow::kBfloat16) &&
- (ndim == 1 || ndim == 2 || ndim == 4);
+ (ndim == 1 || ndim == 2 || ndim == 4);
}
static inline bool SupportMKLDNNQuantize(int dtype) {
- return dtype == mshadow::kFloat32 || dtype == mshadow::kInt8 ||
- dtype == mshadow::kUint8 || dtype == mshadow::kBfloat16;
+ return dtype == mshadow::kFloat32 || dtype == mshadow::kInt8 || dtype == mshadow::kUint8 ||
+ dtype == mshadow::kBfloat16;
}
-static inline bool SupportMKLDNN(const NDArray &input) {
- return SupportMKLDNN(input.dtype(), input.shape())
- && SupportStorageMKLDNN(input.storage_type());
+static inline bool SupportMKLDNN(const NDArray& input) {
+ return SupportMKLDNN(input.dtype(), input.shape()) && SupportStorageMKLDNN(input.storage_type());
}
static inline bool MKLDNNEnvSet() {
@@ -178,9 +176,10 @@ static inline int GetMKLDNNCacheSize() {
}
// TODO(alex): (MXNET-1075) Will remove env variable and calculate cache size during runtime
-template<typename S, typename I, typename H>
-static typename std::unordered_map<S, I, H>::iterator AddToCache(
- std::unordered_map<S, I, H>* cache, const S &key, const I &item) {
+template <typename S, typename I, typename H>
+static typename std::unordered_map<S, I, H>::iterator AddToCache(std::unordered_map<S, I, H>* cache,
+ const S& key,
+ const I& item) {
int mkldnn_cache_size = GetMKLDNNCacheSize();
if (mkldnn_cache_size != -1 && static_cast<int>(cache->size()) > mkldnn_cache_size)
cache->erase(cache->begin());
@@ -192,7 +191,7 @@ static typename std::unordered_map<S, I, H>::iterator AddToCache(
/*
* This is to align address to a certain alignment.
*/
-void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space);
+void* AlignMem(void* mem, size_t size, size_t alignment, size_t* space);
namespace op {
struct ActivationParam;
@@ -204,29 +203,28 @@ struct SoftmaxOutputParam;
struct TransposeParam;
struct ReshapeParam;
bool SupportMKLDNNAct(const ActivationParam& param);
-bool SupportMKLDNNAct(const ActivationParam& param, const NDArray &input);
+bool SupportMKLDNNAct(const ActivationParam& param, const NDArray& input);
bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param);
-bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param, const NDArray &input);
-bool SupportQuantizedMKLDNNAct(const ActivationParam ¶m);
-bool SupportMKLDNNConv(const ConvolutionParam ¶ms, const NDArray &input);
-bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray &input);
-bool SupportMKLDNNSoftmax(const SoftmaxParam& param, const NDArray &input, const NDArray &output);
-bool SupportMKLDNNLogSoftmax(const SoftmaxParam& param, const NDArray &input,
- const NDArray &output);
-bool SupportMKLDNNSoftmaxOutput(const SoftmaxOutputParam ¶m);
-bool SupportMKLDNNTranspose(const TransposeParam& param, const NDArray &data);
-bool SupportMKLDNNBatchDot(const std::vector<NDArray> &inputs, const NDArray &output);
+bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param, const NDArray& input);
+bool SupportQuantizedMKLDNNAct(const ActivationParam& param);
+bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray& input);
+bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray& input);
+bool SupportMKLDNNSoftmax(const SoftmaxParam& param, const NDArray& input, const NDArray& output);
+bool SupportMKLDNNLogSoftmax(const SoftmaxParam& param,
+ const NDArray& input,
+ const NDArray& output);
+bool SupportMKLDNNSoftmaxOutput(const SoftmaxOutputParam& param);
+bool SupportMKLDNNTranspose(const TransposeParam& param, const NDArray& data);
+bool SupportMKLDNNBatchDot(const std::vector<NDArray>& inputs, const NDArray& output);
} // namespace op
static int GetTypeSize(int dtype) {
int size = -1;
- MSHADOW_TYPE_SWITCH(dtype, DType, {
- size = sizeof(DType);
- });
+ MSHADOW_TYPE_SWITCH(dtype, DType, { size = sizeof(DType); });
return size;
}
-static inline size_t GetArraySize(const NDArray &arr) {
+static inline size_t GetArraySize(const NDArray& arr) {
if (arr.IsMKLDNNData()) {
return arr.GetMKLDNNData()->get_desc().get_size();
}
@@ -251,7 +249,7 @@ static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) {
}
}
-template<typename T>
+template <typename T>
static inline mkldnn::memory::data_type get_mkldnn_type() {
return static_cast<mkldnn::memory::data_type>(data_type_enum<T>::type);
}
@@ -260,12 +258,11 @@ static inline mkldnn_data_type_t get_mkldnn_type_t(int dtype) {
return static_cast<mkldnn_data_type_t>(get_mkldnn_type(dtype));
}
-template<typename T>
+template <typename T>
static inline mkldnn_data_type_t get_mkldnn_type_t() {
return static_cast<mkldnn_data_type_t>(data_type_enum<T>::type);
}
-
static inline int get_mxnet_type(mkldnn_data_type_t dtype) {
auto mkldnn_dtype = static_cast<mkldnn::memory::data_type>(dtype);
switch (mkldnn_dtype) {
@@ -285,8 +282,9 @@ static inline int get_mxnet_type(mkldnn_data_type_t dtype) {
}
}
-static inline size_t GetMemDescSize(const mkldnn::memory::desc &md) {
- if (md.data.ndims == 0) return 0;
+static inline size_t GetMemDescSize(const mkldnn::memory::desc& md) {
+ if (md.data.ndims == 0)
+ return 0;
size_t ret = 1;
for (int i = 0; i < md.data.ndims; i++) {
@@ -297,19 +295,21 @@ static inline size_t GetMemDescSize(const mkldnn::memory::desc &md) {
return ret;
}
-inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int dtype = -1) {
+inline static mkldnn::memory::desc GetMemDesc(const NDArray& arr, int dtype = -1) {
int ndim = arr.shape().ndim();
mkldnn::memory::dims dims(ndim);
dtype = (dtype == -1) ? arr.dtype() : dtype;
- for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i];
+ for (size_t i = 0; i < dims.size(); i++)
+ dims[i] = arr.shape()[i];
return mkldnn::memory::desc{dims, get_mkldnn_type(dtype), mkldnn::memory::format_tag::any};
}
-inline static mkldnn::memory::desc GetFCWeightDesc(const NDArray &arr, int dtype = -1) {
+inline static mkldnn::memory::desc GetFCWeightDesc(const NDArray& arr, int dtype = -1) {
int ndim = arr.shape().ndim();
mkldnn::memory::dims dims(ndim);
dtype = (dtype == -1) ? arr.dtype() : dtype;
- for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i];
+ for (size_t i = 0; i < dims.size(); i++)
+ dims[i] = arr.shape()[i];
auto format = mkldnn::memory::format_tag::any;
// for batch 256 alexnet benchmark test
if (dims.size() == 2) {
@@ -319,7 +319,7 @@ inline static mkldnn::memory::desc GetFCWeightDesc(const NDArray &arr, int dtype
return mkldnn::memory::desc{dims, get_mkldnn_type(dtype), format};
}
-inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
+inline static mkldnn::memory::desc GetWeightDesc(const NDArray& arr,
int num_groups,
bool quantized = false) {
int dtype = quantized ? mshadow::kInt8 : arr.dtype();
@@ -340,25 +340,29 @@ inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
switch (ndim) {
case 3:
tz = mkldnn::memory::dims{
- num_groups, arr.shape()[N] / num_groups,
- arr.shape()[C], arr.shape()[H]};
+ num_groups, arr.shape()[N] / num_groups, arr.shape()[C], arr.shape()[H]};
break;
case 4:
- tz = mkldnn::memory::dims{
- num_groups, arr.shape()[N] / num_groups,
- arr.shape()[C], arr.shape()[H], arr.shape()[W]};
+ tz = mkldnn::memory::dims{num_groups,
+ arr.shape()[N] / num_groups,
+ arr.shape()[C],
+ arr.shape()[H],
+ arr.shape()[W]};
break;
case 5:
- tz = mkldnn::memory::dims{
- num_groups, arr.shape()[N] / num_groups,
- arr.shape()[C], arr.shape()[D], arr.shape()[H], arr.shape()[W]};
+ tz = mkldnn::memory::dims{num_groups,
+ arr.shape()[N] / num_groups,
+ arr.shape()[C],
+ arr.shape()[D],
+ arr.shape()[H],
+ arr.shape()[W]};
}
return mkldnn::memory::desc{tz, get_mkldnn_type(dtype), mkldnn::memory::format_tag::any};
}
}
-inline static bool CheckMKLDNNInputArrayIsView(const std::vector<NDArray> &inputs) {
- for (const auto &in : inputs) {
+inline static bool CheckMKLDNNInputArrayIsView(const std::vector<NDArray>& inputs) {
+ for (const auto& in : inputs) {
if (in.IsView() && in.IsMKLDNNData()) {
return true;
}
@@ -381,7 +385,7 @@ typedef std::shared_ptr<const mkldnn::memory> mkldnn_mem_const_ptr;
*/
class TmpMemMgr {
// This points to the memory buffer where we can allocate temp memory.
- char *curr_mem;
+ char* curr_mem;
// The total size of the temp memory.
size_t mem_size;
// This contains the current available memory size.
@@ -391,7 +395,7 @@ class TmpMemMgr {
const size_t alignment = kMKLDNNAlign;
public:
- static TmpMemMgr *Get() {
+ static TmpMemMgr* Get() {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local TmpMemMgr mgr;
#else
@@ -407,14 +411,14 @@ class TmpMemMgr {
}
void Reset() {
- curr_mem = nullptr;
+ curr_mem = nullptr;
curr_size = 0;
// We don't reset est_size and mem_size because est_size contains the
// estimated temp memory size from the last run and mem_size contains the
// memroy size allocated in the last run.
}
- void Init(const Resource &r) {
+ void Init(const Resource& r) {
// If the last time, if we estimate that we need more memory, we should the
// larger memory size.
mem_size = std::max(mem_size, est_size);
@@ -422,13 +426,13 @@ class TmpMemMgr {
// Let's allocate some extra memory. If we don't use some of them all the time,
// the OS won't physically allocate pages for them any way.
this->curr_size = mem_size * 2;
- this->curr_mem = static_cast<char *>(r.get_host_space_internal(this->curr_size));
+ this->curr_mem = static_cast<char*>(r.get_host_space_internal(this->curr_size));
}
// reset est_size, so we can start to estimate the temp memory size.
this->est_size = 0;
}
- mkldnn::memory *Alloc(const mkldnn::memory::desc &md);
+ mkldnn::memory* Alloc(const mkldnn::memory::desc& md);
};
typedef std::unordered_map<int, mkldnn::memory> mkldnn_args_map_t;
@@ -439,12 +443,11 @@ class MKLDNNStream {
mkldnn::stream s;
public:
- static MKLDNNStream *Get();
+ static MKLDNNStream* Get();
- MKLDNNStream(): s(CpuEngine::Get()->get_engine()) {}
+ MKLDNNStream() : s(CpuEngine::Get()->get_engine()) {}
- void RegisterPrimArgs(const mkldnn::primitive &prim,
- const mkldnn_args_map_t &args) {
+ void RegisterPrimArgs(const mkldnn::primitive& prim, const mkldnn_args_map_t& args) {
net_prim_args.emplace_back(prim, args);
}
@@ -463,7 +466,7 @@ class MKLDNNStream {
*/
void Submit(bool cleanup = true) {
if (!net_prim_args.empty()) {
- for (auto &v : net_prim_args) {
+ for (auto& v : net_prim_args) {
v.first.execute(s, v.second);
}
net_prim_args.clear();
@@ -484,22 +487,22 @@ enum OutDataOp {
AddBack,
};
-typedef std::pair<OutDataOp, mkldnn::memory *> mkldnn_output_t;
-void MKLDNNMemoryCopy(const mkldnn::memory &mem, const mkldnn::memory* this_mem);
+typedef std::pair<OutDataOp, mkldnn::memory*> mkldnn_output_t;
+void MKLDNNMemoryCopy(const mkldnn::memory& mem, const mkldnn::memory* this_mem);
/*
* Here we want to get MKLDNN memory whose desc is exactly the same as
* the given one. operator== can't guarantee that. == can return true even if
* the formats are different. I need to double check its format.
*/
-static inline mkldnn::memory *GetMKLDNNExact(
- const mkldnn::memory *mem, const mkldnn::memory::desc &desc) {
+static inline mkldnn::memory* GetMKLDNNExact(const mkldnn::memory* mem,
+ const mkldnn::memory::desc& desc) {
mkldnn::memory::desc src_desc = mem->get_desc();
if (desc == src_desc) {
- return const_cast<mkldnn::memory *>(mem);
+ return const_cast<mkldnn::memory*>(mem);
} else {
- std::shared_ptr<mkldnn::memory> ret(new mkldnn::memory(
- desc, CpuEngine::Get()->get_engine(), mem->get_data_handle()));
+ std::shared_ptr<mkldnn::memory> ret(
+ new mkldnn::memory(desc, CpuEngine::Get()->get_engine(), mem->get_data_handle()));
MKLDNNStream::Get()->RegisterMem(ret);
return ret.get();
}
@@ -516,29 +519,30 @@ static inline mkldnn::memory *GetMKLDNNExact(
* If these two functions are used, we have to call CommitOutput to write
* the output back to the output NDArray.
*/
-mkldnn_output_t CreateMKLDNNMem(const NDArray &out_arr,
- const mkldnn::memory::desc &desc,
- OpReqType req, const NDArray* in_arr = nullptr);
-mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &out_arr,
- const mkldnn::memory::desc &desc,
+mkldnn_output_t CreateMKLDNNMem(const NDArray& out_arr,
+ const mkldnn::memory::desc& desc,
+ OpReqType req,
+ const NDArray* in_arr = nullptr);
+mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray& out_arr,
+ const mkldnn::memory::desc& desc,
OpReqType req);
/* This function has to be used with one of the functions above. */
-void CommitOutput(const NDArray &arr, const mkldnn_output_t &res);
+void CommitOutput(const NDArray& arr, const mkldnn_output_t& res);
-const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups);
+const mkldnn::memory* GetWeights(const NDArray& arr, int num_groups);
-const mkldnn::memory *GetWeights(const NDArray &arr,
- const mkldnn::memory::desc &target_md,
+const mkldnn::memory* GetWeights(const NDArray& arr,
+ const mkldnn::memory::desc& target_md,
int num_groups);
-bool IsDefaultFormat(const mkldnn::memory::desc &desc);
-bool IsMKLDNN(const mkldnn::memory::desc &desc);
+bool IsDefaultFormat(const mkldnn::memory::desc& desc);
+bool IsMKLDNN(const mkldnn::memory::desc& desc);
-mkldnn_format_tag_t GetDefaultFormat(const mkldnn::memory::desc &md);
+mkldnn_format_tag_t GetDefaultFormat(const mkldnn::memory::desc& md);
mkldnn_format_tag_t GetDefaultFormat(int num_dims);
-mkldnn::memory::desc GetDesc(const mkldnn::memory::desc &md, const mkldnn_format_tag_t &format);
+mkldnn::memory::desc GetDesc(const mkldnn::memory::desc& md, const mkldnn_format_tag_t& format);
-inline bool same_shape(const mxnet::TShape &shape, const mkldnn_dims_t dims, int ndims) {
+inline bool same_shape(const mxnet::TShape& shape, const mkldnn_dims_t dims, int ndims) {
if (shape.ndim() != ndims)
return false;
for (int i = 0; i < ndims; i++)
@@ -547,8 +551,7 @@ inline bool same_shape(const mxnet::TShape &shape, const mkldnn_dims_t dims, int
return true;
}
-inline bool same_shape(const mkldnn::memory::desc &desc1,
- const mkldnn::memory::desc &desc2) {
+inline bool same_shape(const mkldnn::memory::desc& desc1, const mkldnn::memory::desc& desc2) {
if (desc1.data.ndims != desc2.data.ndims)
return false;
for (int i = 0; i < desc1.data.ndims; i++)
@@ -557,10 +560,9 @@ inline bool same_shape(const mkldnn::memory::desc &desc1,
return true;
}
-inline bool same_shape(const mxnet::TShape &shape, int dtype,
- const mkldnn::memory::desc &desc) {
- return same_shape(shape, desc.data.dims, desc.data.ndims)
- && get_mkldnn_type(dtype) == desc.data.data_type;
+inline bool same_shape(const mxnet::TShape& shape, int dtype, const mkldnn::memory::desc& desc) {
+ return same_shape(shape, desc.data.dims, desc.data.ndims) &&
+ get_mkldnn_type(dtype) == desc.data.data_type;
}
/*
@@ -571,25 +573,24 @@ inline bool same_shape(const mxnet::TShape &shape, int dtype,
class MKLDNNMemory {
std::shared_ptr<mkldnn::memory> mem;
mkldnn::memory::desc desc;
- size_t size; // The number of bytes.
+ size_t size; // The number of bytes.
public:
- MKLDNNMemory(mkldnn::memory::desc md, void *addr): desc(md) {
+ MKLDNNMemory(mkldnn::memory::desc md, void* addr) : desc(md) {
mem.reset(new mkldnn::memory(md, CpuEngine::Get()->get_engine(), addr));
size = desc.get_size();
}
- explicit MKLDNNMemory(std::shared_ptr<mkldnn::memory> mem): desc(
- mem->get_desc()) {
+ explicit MKLDNNMemory(std::shared_ptr<mkldnn::memory> mem) : desc(mem->get_desc()) {
this->mem = mem;
- size = desc.get_size();
+ size = desc.get_size();
}
- void SetDataHandle(void *handle) {
+ void SetDataHandle(void* handle) {
mem->set_data_handle(handle);
}
- void *GetDataHandle() const {
+ void* GetDataHandle() const {
return mem->get_data_handle();
}
@@ -597,7 +598,7 @@ class MKLDNNMemory {
return mem;
}
- mkldnn::memory *GetRaw() const {
+ mkldnn::memory* GetRaw() const {
return mem.get();
}
@@ -609,13 +610,15 @@ class MKLDNNMemory {
return mem->get_desc();
}
- mkldnn::memory::desc GetDesc(mkldnn_format_tag_t format,
- mkldnn::memory::data_type data_type = mkldnn::memory::data_type::undef) const {
+ mkldnn::memory::desc GetDesc(
+ mkldnn_format_tag_t format,
+ mkldnn::memory::data_type data_type = mkldnn::memory::data_type::undef) const {
mkldnn::memory::dims dims(desc.data.dims, desc.data.dims + desc.data.ndims);
- mkldnn::memory::data_type cpp_type = (data_type == mkldnn::memory::data_type::undef)
- ? static_cast<mkldnn::memory::data_type>(desc.data.data_type) : data_type;
- mkldnn::memory::desc data_md(dims, cpp_type,
- static_cast<mkldnn::memory::format_tag>(format));
+ mkldnn::memory::data_type cpp_type =
+ (data_type == mkldnn::memory::data_type::undef)
+ ? static_cast<mkldnn::memory::data_type>(desc.data.data_type)
+ : data_type;
+ mkldnn::memory::desc data_md(dims, cpp_type, static_cast<mkldnn::memory::format_tag>(format));
return data_md;
}
@@ -631,25 +634,26 @@ class MKLDNNMemory {
return mem->get_desc() == md;
}
- bool SameFormat(const mxnet::TShape &shape, int dtype) const {
+ bool SameFormat(const mxnet::TShape& shape, int dtype) const {
return same_shape(shape, dtype, desc);
}
- void ReorderTo(mkldnn::memory *other) const {
+ void ReorderTo(mkldnn::memory* other) const {
mkldnn::stream s(CpuEngine::Get()->get_engine());
mkldnn::reorder(*mem, *other).execute(s, *mem, *other);
}
};
// reorder mkldnn src to dst format dtype
-void ReorderTo(const mkldnn::memory *src, const mkldnn::memory *dst);
+void ReorderTo(const mkldnn::memory* src, const mkldnn::memory* dst);
template <typename Compute, typename AttrState>
-void FallBackCompute(Compute fn, const AttrState &attrs,
- const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs);
+void FallBackCompute(Compute fn,
+ const AttrState& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs);
/*
* This class is used to check the correctness of MKLDNN operators.
@@ -662,66 +666,69 @@ class OpCheck {
public:
OpCheck(bool backward, size_t num_checks) {
- this->backward = backward;
+ this->backward = backward;
this->num_checks = num_checks;
}
- void Init(const std::vector<mxnet::NDArray> &inputs_,
- const std::vector<mxnet::NDArray> &outputs_);
+ void Init(const std::vector<mxnet::NDArray>& inputs_,
+ const std::vector<mxnet::NDArray>& outputs_);
- void Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const std::vector<mxnet::NDArray> &inputs_,
- const std::vector<mxnet::OpReqType> &req,
- const std::vector<mxnet::NDArray> &outputs_);
+ void Run(mxnet::FCompute fn,
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const std::vector<mxnet::NDArray>& inputs_,
+ const std::vector<mxnet::OpReqType>& req,
+ const std::vector<mxnet::NDArray>& outputs_);
- void CopyResult(const std::vector<mxnet::NDArray> &outputs_,
- const std::vector<size_t>& indice);
+ void CopyResult(const std::vector<mxnet::NDArray>& outputs_, const std::vector<size_t>& indice);
};
-bool MKLDNNStorageType(const nnvm::NodeAttrs &attrs,
+bool MKLDNNStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
bool support_mkldnn,
- DispatchMode *dispatch_mode,
- std::vector<int> *in_attrs,
- std::vector<int> *out_attrs);
-
-#define MKLDNN_OPCHECK_INIT(backward, num_checks, inputs, outputs) \
- static bool debug = dmlc::GetEnv("MXNET_ONEDNN_DEBUG", false); \
- OpCheck check(backward, num_checks); \
- if (debug) check.Init(inputs, outputs);
-
-#define MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs) \
- if (debug) check.Run(fn, attrs, ctx, inputs, req, outputs);
+ DispatchMode* dispatch_mode,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs);
+
+#define MKLDNN_OPCHECK_INIT(backward, num_checks, inputs, outputs) \
+ static bool debug = dmlc::GetEnv("MXNET_ONEDNN_DEBUG", false); \
+ OpCheck check(backward, num_checks); \
+ if (debug) \
+ check.Init(inputs, outputs);
+
+#define MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs) \
+ if (debug) \
+ check.Run(fn, attrs, ctx, inputs, req, outputs);
#define MKLDNN_OPCHECK_COPY_RESULT(outputs, indice) \
- if (debug) check.CopyResult(outputs, indice);
+ if (debug) \
+ check.CopyResult(outputs, indice);
struct MKLDNNPostEltwiseParam {
mkldnn::algorithm alg = mkldnn::algorithm::undef;
- float scale = 1.f;
- float alpha = 0.f;
- float beta = 1.f;
+ float scale = 1.f;
+ float alpha = 0.f;
+ float beta = 1.f;
};
void MKLDNNRun(mxnet::FComputeEx fn,
- const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const std::vector<mxnet::NDArray> &inputs_,
- const std::vector<mxnet::OpReqType> &req,
- const std::vector<mxnet::NDArray> &outputs_);
-
-using FComputeExUnary = std::function<void (const nnvm::NodeAttrs& attrs,
- const OpContext& ctx,
- const NDArray& input,
- const OpReqType& req,
- const NDArray& output)>;
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const std::vector<mxnet::NDArray>& inputs_,
+ const std::vector<mxnet::OpReqType>& req,
+ const std::vector<mxnet::NDArray>& outputs_);
+
+using FComputeExUnary = std::function<void(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const NDArray& input,
+ const OpReqType& req,
+ const NDArray& output)>;
void MKLDNNRun(FComputeExUnary fn,
- const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const mxnet::NDArray &inputs_,
- const mxnet::OpReqType &req,
- const mxnet::NDArray &outputs_);
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const mxnet::NDArray& inputs_,
+ const mxnet::OpReqType& req,
+ const mxnet::NDArray& outputs_);
} // namespace mxnet
#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc
index f825700..5415e9e 100644
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -20,14 +20,16 @@
#if MXNET_USE_ONEDNN == 1
#include <atomic>
+
#include "./mkldnn_base-inl.h"
#include "./mkldnn_ops-inl.h"
+
#include "../../../common/exec_utils.h"
#include "../../operator_common.h"
namespace mxnet {
-MKLDNNStream *MKLDNNStream::Get() {
+MKLDNNStream* MKLDNNStream::Get() {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local MKLDNNStream stream;
#else
@@ -36,7 +38,7 @@ MKLDNNStream *MKLDNNStream::Get() {
return &stream;
}
-void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space) {
+void* AlignMem(void* mem, size_t size, size_t alignment, size_t* space) {
if (size > *space)
return nullptr;
intptr_t addr = reinterpret_cast<intptr_t>(mem);
@@ -51,13 +53,13 @@ void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space) {
addr += padding;
*space -= padding;
CHECK_EQ(addr % alignment, 0);
- return reinterpret_cast<void *>(addr);
+ return reinterpret_cast<void*>(addr);
}
-mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::desc &md) {
+mkldnn::memory* TmpMemMgr::Alloc(const mkldnn::memory::desc& md) {
// We need to include the size of the memory used for alignment.
this->est_size += md.get_size() + alignment;
- void *mem = AlignMem(this->curr_mem, md.get_size(), alignment, &this->curr_size);
+ void* mem = AlignMem(this->curr_mem, md.get_size(), alignment, &this->curr_size);
if (mem) {
// The memory is allocated from the temporary memory space in the
// operator. It'll only become invalid after we exit from the operator.
@@ -65,7 +67,7 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::desc &md) {
MKLDNNStream::Get()->RegisterMem(ret);
CHECK_EQ(mem, mem);
this->curr_size -= md.get_size();
- this->curr_mem = static_cast<char *>(mem) + md.get_size();
+ this->curr_mem = static_cast<char*>(mem) + md.get_size();
return ret.get();
} else {
// If curr_mem has been initialized and we still reach here, it means the current
@@ -76,8 +78,8 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::desc &md) {
// required space size. It will be allocated at next call.
if (this->curr_mem && dmlc::GetEnv("MXNET_ONEDNN_DEBUG", false)) {
LOG(WARNING) << "mkl-dnn debug message: The rest of the temporary space is not "
- << "adequate for allocating " << md.get_size() << " bytes. Thus, mkl-dnn "
- << "allocate the space by itself.";
+ << "adequate for allocating " << md.get_size() << " bytes. Thus, mkl-dnn "
+ << "allocate the space by itself.";
}
mkldnn_mem_ptr ret(new mkldnn::memory(md, CpuEngine::Get()->get_engine()));
MKLDNNStream::Get()->RegisterMem(ret);
@@ -85,97 +87,93 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::desc &md) {
}
}
-void MKLDNNMemoryCopy(const mkldnn::memory &mem, const mkldnn::memory* this_mem) {
- MKLDNNStream *stream = MKLDNNStream::Get();
- mkldnn::memory::desc from_desc = mem.get_desc();
- mkldnn::memory::desc this_desc = this_mem->get_desc();
+void MKLDNNMemoryCopy(const mkldnn::memory& mem, const mkldnn::memory* this_mem) {
+ MKLDNNStream* stream = MKLDNNStream::Get();
+ mkldnn::memory::desc from_desc = mem.get_desc();
+ mkldnn::memory::desc this_desc = this_mem->get_desc();
mkldnn_format_tag_t from_def_format = GetDefaultFormat(from_desc);
mkldnn_format_tag_t this_def_format = GetDefaultFormat(this_desc);
if (!same_shape(this_desc, from_desc) && IsDefaultFormat(from_desc)) {
// In this case, we can simply create a new MKLDNN memory for the required
// shape.
- mkldnn::memory::dims dims(this_desc.data.dims,
- this_desc.data.dims + this_desc.data.ndims);
+ mkldnn::memory::dims dims(this_desc.data.dims, this_desc.data.dims + this_desc.data.ndims);
auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
- mkldnn::memory::desc data_md(dims, this_dtype,
- static_cast<mkldnn::memory::format_tag>(this_def_format));
+ mkldnn::memory::desc data_md(
+ dims, this_dtype, static_cast<mkldnn::memory::format_tag>(this_def_format));
mkldnn_mem_ptr tmp_mem(new mkldnn::memory(data_md, mem.get_engine(), mem.get_data_handle()));
stream->RegisterMem(tmp_mem);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *tmp_mem},
- {MKLDNN_ARG_TO, *this_mem}});
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, *tmp_mem}, {MKLDNN_ARG_TO, *this_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(*tmp_mem, *this_mem), args);
} else if (!same_shape(this_desc, from_desc)) {
// In this case, the source memory stores data in a customized layout. We
// need to reorganize the data in memory before we can reshape.
mkldnn::memory::desc def_desc = GetDesc(from_desc, from_def_format);
- mkldnn::memory *def_mem = TmpMemMgr::Get()->Alloc(def_desc);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, mem},
- {MKLDNN_ARG_TO, *def_mem}});
+ mkldnn::memory* def_mem = TmpMemMgr::Get()->Alloc(def_desc);
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *def_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(mem, *def_mem), args);
// Now we can reshape it
- mkldnn_mem_ptr tmp_mem(new mkldnn::memory(this_desc,
- mem.get_engine(), def_mem->get_data_handle()));
+ mkldnn_mem_ptr tmp_mem(
+ new mkldnn::memory(this_desc, mem.get_engine(), def_mem->get_data_handle()));
stream->RegisterMem(tmp_mem);
args = {{MKLDNN_ARG_FROM, *tmp_mem}, {MKLDNN_ARG_TO, *this_mem}};
stream->RegisterPrimArgs(mkldnn::reorder(*tmp_mem, *this_mem), args);
-} else if (this_desc == from_desc) {
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, mem},
- {MKLDNN_ARG_TO, *this_mem}});
+ } else if (this_desc == from_desc) {
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *this_mem}});
// If the layout is the same, we can just copy data.
stream->RegisterPrimArgs(mkldnn::reorder(mem, *this_mem), args);
-} else {
+ } else {
// If both are not using the default layouts. There isn't much we can do,
// other than reorder data layout directly.
if (!IsDefaultFormat(this_desc) && !IsDefaultFormat(from_desc)) {
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, mem},
- {MKLDNN_ARG_TO, *this_mem}});
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *this_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(mem, *this_mem), args);
} else if (IsDefaultFormat(this_desc)) {
// If the dest mem uses the default memory layout, we can simply use
// the default format of the source memory to improve perf of reorder.
mkldnn::memory::desc desc = GetDesc(from_desc, from_def_format);
- mkldnn_mem_ptr tmp_mem(new mkldnn::memory(desc,
- mem.get_engine(), this_mem->get_data_handle()));
+ mkldnn_mem_ptr tmp_mem(
+ new mkldnn::memory(desc, mem.get_engine(), this_mem->get_data_handle()));
stream->RegisterMem(tmp_mem);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, mem},
- {MKLDNN_ARG_TO, *tmp_mem}});
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *tmp_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(mem, *tmp_mem), args);
} else {
// If the src mem uses the default memory layout, we can use
// the default format of the source memory to improve perf.
mkldnn::memory::desc desc = GetDesc(this_desc, this_def_format);
- mkldnn_mem_ptr tmp_mem(new mkldnn::memory(desc,
- this_mem->get_engine(), mem.get_data_handle()));
+ mkldnn_mem_ptr tmp_mem(
+ new mkldnn::memory(desc, this_mem->get_engine(), mem.get_data_handle()));
stream->RegisterMem(tmp_mem);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *tmp_mem},
- {MKLDNN_ARG_TO, *this_mem}});
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, *tmp_mem}, {MKLDNN_ARG_TO, *this_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(*tmp_mem, *this_mem), args);
}
}
}
-bool CanWriteTo(const NDArray &out_arr,
- const NDArray &in_arr,
- const mkldnn::memory::desc &desc) {
- auto in_mem = in_arr.GetMKLDNNData();
- bool add_same = in_mem->get_data_handle() == out_arr.GetMKLDNNData()->get_data_handle();
- bool pdesc_same = out_arr.GetMKLDNNData()->get_desc() == desc &&
- in_mem->get_desc() == desc;
+bool CanWriteTo(const NDArray& out_arr, const NDArray& in_arr, const mkldnn::memory::desc& desc) {
+ auto in_mem = in_arr.GetMKLDNNData();
+ bool add_same = in_mem->get_data_handle() == out_arr.GetMKLDNNData()->get_data_handle();
+ bool pdesc_same = out_arr.GetMKLDNNData()->get_desc() == desc && in_mem->get_desc() == desc;
return add_same && pdesc_same;
}
-mkldnn_output_t CreateMKLDNNMem(const NDArray &out_arr,
- const mkldnn::memory::desc &desc,
+mkldnn_output_t CreateMKLDNNMem(const NDArray& out_arr,
+ const mkldnn::memory::desc& desc,
OpReqType req,
const NDArray* in_arr) {
if (kAddTo == req) {
auto tmp = TmpMemMgr::Get()->Alloc(desc);
return mkldnn_output_t(OutDataOp::AddBack, tmp);
} else if (kWriteInplace == req && in_arr != nullptr && CanWriteTo(out_arr, *in_arr, desc)) {
- mkldnn::memory *mem = const_cast<NDArray &>(out_arr).CreateMKLDNNData(desc);
+ mkldnn::memory* mem = const_cast<NDArray&>(out_arr).CreateMKLDNNData(desc);
// mem is nullptr if out_arr is view and desc is MKLDNN format.
// need to Reorder2Default before calling CreateMKLDNNMem
CHECK(mem != nullptr);
@@ -184,7 +182,7 @@ mkldnn_output_t CreateMKLDNNMem(const NDArray &out_arr,
auto tmp = TmpMemMgr::Get()->Alloc(desc);
return mkldnn_output_t(OutDataOp::CopyBack, tmp);
} else if (kWriteTo == req) {
- mkldnn::memory *mem = const_cast<NDArray &>(out_arr).CreateMKLDNNData(desc);
+ mkldnn::memory* mem = const_cast<NDArray&>(out_arr).CreateMKLDNNData(desc);
if (nullptr == mem) {
auto tmp = TmpMemMgr::Get()->Alloc(desc);
return mkldnn_output_t(OutDataOp::CopyBack, tmp);
@@ -195,8 +193,8 @@ mkldnn_output_t CreateMKLDNNMem(const NDArray &out_arr,
return mkldnn_output_t(OutDataOp::Noop, tmp);
}
-mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &out_arr,
- const mkldnn::memory::desc &desc,
+mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray& out_arr,
+ const mkldnn::memory::desc& desc,
OpReqType req) {
if (kAddTo == req) {
auto tmp = TmpMemMgr::Get()->Alloc(desc);
@@ -205,9 +203,9 @@ mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &out_arr,
auto tmp = TmpMemMgr::Get()->Alloc(desc);
return mkldnn_output_t(OutDataOp::CopyBack, tmp);
} else {
- mkldnn::memory *mem = nullptr;
+ mkldnn::memory* mem = nullptr;
if (IsDefaultFormat(desc)) {
- mem = const_cast<NDArray &>(out_arr).CreateMKLDNNData(desc);
+ mem = const_cast<NDArray&>(out_arr).CreateMKLDNNData(desc);
}
if (mem == nullptr) {
auto tmp = TmpMemMgr::Get()->Alloc(desc);
@@ -218,29 +216,29 @@ mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &out_arr,
}
}
-void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) {
+void CommitOutput(const NDArray& arr, const mkldnn_output_t& res) {
if (res.first == CopyBack) {
- const_cast<NDArray &>(arr).CopyFrom(*res.second);
+ const_cast<NDArray&>(arr).CopyFrom(*res.second);
} else if (res.first == AddBack) {
auto res_memory = res.second;
- auto target_pd = arr.GetMKLDNNData()->get_desc();
- auto mem = arr.GetMKLDNNData(res.second->get_desc());
+ auto target_pd = arr.GetMKLDNNData()->get_desc();
+ auto mem = arr.GetMKLDNNData(res.second->get_desc());
if (mem == nullptr) {
auto tmp_memory = TmpMemMgr::Get()->Alloc(target_pd);
MKLDNNMemoryCopy(*res_memory, tmp_memory);
res_memory = tmp_memory;
- mem = arr.GetMKLDNNData();
+ mem = arr.GetMKLDNNData();
}
op::MKLDNNSum(*mem, *res_memory, *mem);
}
}
-const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups) {
+const mkldnn::memory* GetWeights(const NDArray& arr, int num_groups) {
const auto type = get_mkldnn_type(arr.dtype());
- auto tz = mkldnn::memory::dims{0};
+ auto tz = mkldnn::memory::dims{0};
auto format_tag = mkldnn::memory::format_tag::undef;
- auto engine = CpuEngine::Get()->get_engine();
- const int ndim = arr.shape().ndim();
+ auto engine = CpuEngine::Get()->get_engine();
+ const int ndim = arr.shape().ndim();
int O = 0, I = 1, H = 2, W = 3;
int D = -1;
if (ndim == 5) {
@@ -249,35 +247,38 @@ const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups) {
W = 4;
}
if (ndim == 2) {
- tz = mkldnn::memory::dims{arr.shape()[O], arr.shape()[I]};
+ tz = mkldnn::memory::dims{arr.shape()[O], arr.shape()[I]};
format_tag = mkldnn::memory::format_tag::oi;
} else if (ndim == 3) {
- tz = num_groups > 1
- ? mkldnn::memory::dims{num_groups, arr.shape()[O] / num_groups,
- arr.shape()[I], arr.shape()[H]}
- : mkldnn::memory::dims{arr.shape()[O],
- arr.shape()[I], arr.shape()[H]};
- format_tag = num_groups > 1 ? mkldnn::memory::format_tag::goiw
- : mkldnn::memory::format_tag::oiw;
+ tz = num_groups > 1 ? mkldnn::memory::dims{num_groups,
+ arr.shape()[O] / num_groups,
+ arr.shape()[I],
+ arr.shape()[H]}
+ : mkldnn::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H]};
+ format_tag =
+ num_groups > 1 ? mkldnn::memory::format_tag::goiw : mkldnn::memory::format_tag::oiw;
} else if (ndim == 4) {
tz = num_groups > 1
- ? mkldnn::memory::dims{num_groups, arr.shape()[O] / num_groups,
- arr.shape()[I], arr.shape()[H],
+ ? mkldnn::memory::dims{num_groups,
+ arr.shape()[O] / num_groups,
+ arr.shape()[I],
+ arr.shape()[H],
arr.shape()[W]}
- : mkldnn::memory::dims{
- arr.shape()[O], arr.shape()[I], arr.shape()[H], arr.shape()[W]};
- format_tag = num_groups > 1 ? mkldnn::memory::format_tag::goihw
- : mkldnn::memory::format_tag::oihw;
+ : mkldnn::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H], arr.shape()[W]};
+ format_tag =
+ num_groups > 1 ? mkldnn::memory::format_tag::goihw : mkldnn::memory::format_tag::oihw;
} else if (ndim == 5) {
tz = num_groups > 1
- ? mkldnn::memory::dims{num_groups, arr.shape()[O] / num_groups,
- arr.shape()[I], arr.shape()[D],
- arr.shape()[H], arr.shape()[W]}
+ ? mkldnn::memory::dims{num_groups,
+ arr.shape()[O] / num_groups,
+ arr.shape()[I],
+ arr.shape()[D],
+ arr.shape()[H],
+ arr.shape()[W]}
: mkldnn::memory::dims{
- arr.shape()[O], arr.shape()[I], arr.shape()[D],
- arr.shape()[H], arr.shape()[W]};
- format_tag = num_groups > 1 ? mkldnn::memory::format_tag::goidhw
- : mkldnn::memory::format_tag::oidhw;
+ arr.shape()[O], arr.shape()[I], arr.shape()[D], arr.shape()[H], arr.shape()[W]};
+ format_tag =
+ num_groups > 1 ? mkldnn::memory::format_tag::goidhw : mkldnn::memory::format_tag::oidhw;
} else {
LOG(FATAL) << "The weight array has an unsupported number of dimensions";
}
@@ -285,37 +286,39 @@ const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups) {
return arr.GetMKLDNNData(md);
}
-const mkldnn::memory *GetWeights(const NDArray &arr,
- const mkldnn::memory::desc &target_desc, int num_groups) {
- const mkldnn::memory *mem = arr.GetMKLDNNData(target_desc);
+const mkldnn::memory* GetWeights(const NDArray& arr,
+ const mkldnn::memory::desc& target_desc,
+ int num_groups) {
+ const mkldnn::memory* mem = arr.GetMKLDNNData(target_desc);
// If the weight array already uses the target layout, simply return it directly.
- if (mem) return mem;
+ if (mem)
+ return mem;
mem = GetWeights(arr, num_groups);
- if (mem == nullptr) mem = arr.GetMKLDNNDataReorder(target_desc);
- if (mem->get_desc() == target_desc) return mem;
+ if (mem == nullptr)
+ mem = arr.GetMKLDNNDataReorder(target_desc);
+ if (mem->get_desc() == target_desc)
+ return mem;
auto ret = TmpMemMgr::Get()->Alloc(target_desc);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *mem},
- {MKLDNN_ARG_TO, *ret}});
+ std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *mem}, {MKLDNN_ARG_TO, *ret}});
MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(*mem, *ret), args);
return ret;
}
-
// default: block and dims' stride increase monotonically
// mkldnn: 1.winograd 2.rnn packed 3. block and dims'stride is not increase monotonically
-bool IsMKLDNN(const mkldnn::memory::desc &desc) {
+bool IsMKLDNN(const mkldnn::memory::desc& desc) {
bool rslt = true;
if (desc.data.format_kind == mkldnn_blocked) {
if (desc.data.format_desc.blocking.inner_nblks == 0) {
int i = 0;
- for (i = 0; i < desc.data.ndims-1; i++) {
- if (desc.data.format_desc.blocking.strides[i]
- < desc.data.format_desc.blocking.strides[i + 1]) {
+ for (i = 0; i < desc.data.ndims - 1; i++) {
+ if (desc.data.format_desc.blocking.strides[i] <
+ desc.data.format_desc.blocking.strides[i + 1]) {
break;
}
}
- if (i == desc.data.ndims-1) {
+ if (i == desc.data.ndims - 1) {
rslt = false;
}
}
@@ -325,34 +328,40 @@ bool IsMKLDNN(const mkldnn::memory::desc &desc) {
mkldnn_format_tag_t GetDefaultFormat(int num_dims) {
switch (num_dims) {
- case 1: return mkldnn_a;
- case 2: return mkldnn_ab;
- case 3: return mkldnn_abc;
- case 4: return mkldnn_abcd;
- case 5: return mkldnn_abcde;
- case 6: return mkldnn_abcdef;
+ case 1:
+ return mkldnn_a;
+ case 2:
+ return mkldnn_ab;
+ case 3:
+ return mkldnn_abc;
+ case 4:
+ return mkldnn_abcd;
+ case 5:
+ return mkldnn_abcde;
+ case 6:
+ return mkldnn_abcdef;
default:
LOG(FATAL) << "Not implemented dimension (" << num_dims << ") for MKLDNN";
return mkldnn_format_tag_undef;
}
}
-mkldnn_format_tag_t GetDefaultFormat(const mkldnn::memory::desc &desc) {
+mkldnn_format_tag_t GetDefaultFormat(const mkldnn::memory::desc& desc) {
return GetDefaultFormat(desc.data.ndims);
}
-bool IsDefaultFormat(const mkldnn::memory::desc &desc) {
+bool IsDefaultFormat(const mkldnn::memory::desc& desc) {
bool rslt = false;
if (desc.data.format_kind == mkldnn_blocked) {
if (desc.data.format_desc.blocking.inner_nblks == 0) {
int i = 0;
- for (i = 0; i < desc.data.ndims-1; i++) {
- if (desc.data.format_desc.blocking.strides[i]
- < desc.data.format_desc.blocking.strides[i + 1]) {
+ for (i = 0; i < desc.data.ndims - 1; i++) {
+ if (desc.data.format_desc.blocking.strides[i] <
+ desc.data.format_desc.blocking.strides[i + 1]) {
break;
}
}
- if (i == desc.data.ndims-1) {
+ if (i == desc.data.ndims - 1) {
rslt = true;
}
}
@@ -360,20 +369,18 @@ bool IsDefaultFormat(const mkldnn::memory::desc &desc) {
return rslt;
}
-mkldnn::memory::desc GetDesc(const mkldnn::memory::desc &desc,
- const mkldnn_format_tag_t &format) {
+mkldnn::memory::desc GetDesc(const mkldnn::memory::desc& desc, const mkldnn_format_tag_t& format) {
mkldnn::memory::dims dims(desc.data.ndims);
for (size_t i = 0; i < dims.size(); i++)
dims[i] = desc.data.dims[i];
mkldnn::memory::format_tag cpp_format = static_cast<mkldnn::memory::format_tag>(format);
- mkldnn::memory::data_type cpp_type = static_cast<mkldnn::memory::data_type>(
- desc.data.data_type);
+ mkldnn::memory::data_type cpp_type = static_cast<mkldnn::memory::data_type>(desc.data.data_type);
mkldnn::memory::desc data_md(dims, cpp_type, cpp_format);
return mkldnn::memory::desc(dims, cpp_type, cpp_format);
}
// reorder mkldnn src to dst format dtype
-void ReorderTo(const mkldnn::memory *src, const mkldnn::memory *dst) {
+void ReorderTo(const mkldnn::memory* src, const mkldnn::memory* dst) {
mkldnn::stream s(CpuEngine::Get()->get_engine());
auto new_src = *src;
auto new_dst = *dst;
@@ -381,11 +388,12 @@ void ReorderTo(const mkldnn::memory *src, const mkldnn::memory *dst) {
}
template <typename Compute, typename AttrState>
-void FallBackCompute(Compute fn, const AttrState &attrs_states,
- const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+void FallBackCompute(Compute fn,
+ const AttrState& attrs_states,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
std::vector<TBlob> in_blobs(inputs.size());
std::vector<NDArray> in_bufs;
std::vector<OpReqType> new_req = req;
@@ -427,7 +435,7 @@ void FallBackCompute(Compute fn, const AttrState &attrs_states,
// ensure output does not use mkldnn mem.
// for inplace, we already converted & copied input above.
if ((req[i] == kWriteTo) || (req[i] == kWriteInplace)) {
- const_cast<NDArray &>(output).InvalidateMKLDNNData();
+ const_cast<NDArray&>(output).InvalidateMKLDNNData();
if (req[i] == kWriteInplace) {
new_req[i] = kWriteTo;
}
@@ -454,18 +462,20 @@ void FallBackCompute(Compute fn, const AttrState &attrs_states,
}
}
-template<typename DType>
-void print_diff(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2) {
- DType *data1 = reinterpret_cast<DType *>(arr1.data().dptr_);
- DType *data2 = reinterpret_cast<DType *>(arr2.data().dptr_);
+template <typename DType>
+void print_diff(const mxnet::NDArray& arr1, const mxnet::NDArray& arr2) {
+ DType* data1 = reinterpret_cast<DType*>(arr1.data().dptr_);
+ DType* data2 = reinterpret_cast<DType*>(arr2.data().dptr_);
for (size_t i = 0; i < arr1.shape().Size(); i++)
std::cout << data1[i] - data2[i] << ", ";
std::cout << std::endl;
}
-template<typename DType>
-static bool SimilarArray(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2,
- DType rtol, DType atol) {
+template <typename DType>
+static bool SimilarArray(const mxnet::NDArray& arr1,
+ const mxnet::NDArray& arr2,
+ DType rtol,
+ DType atol) {
if (arr1.shape().Size() != arr2.shape().Size())
return false;
@@ -476,21 +486,21 @@ static bool SimilarArray(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2,
// But we shouldn't reorder data in the original array.
NDArray buf1, buf2;
if (arr1.IsMKLDNNData()) {
- buf1 = NDArray(arr1.shape(), arr1.ctx(), false, arr1.dtype());
+ buf1 = NDArray(arr1.shape(), arr1.ctx(), false, arr1.dtype());
auto mem = arr1.GetMKLDNNData();
buf1.CopyFrom(*mem);
}
if (arr2.IsMKLDNNData()) {
- buf2 = NDArray(arr2.shape(), arr2.ctx(), false, arr2.dtype());
+ buf2 = NDArray(arr2.shape(), arr2.ctx(), false, arr2.dtype());
auto mem = arr2.GetMKLDNNData();
buf2.CopyFrom(*mem);
}
MKLDNNStream::Get()->Submit();
- DType *data1 = reinterpret_cast<DType *>(
- arr1.IsMKLDNNData() ? buf1.data().dptr_: arr1.data().dptr_);
- DType *data2 = reinterpret_cast<DType *>(
- arr2.IsMKLDNNData() ? buf2.data().dptr_: arr2.data().dptr_);
+ DType* data1 =
+ reinterpret_cast<DType*>(arr1.IsMKLDNNData() ? buf1.data().dptr_ : arr1.data().dptr_);
+ DType* data2 =
+ reinterpret_cast<DType*>(arr2.IsMKLDNNData() ? buf2.data().dptr_ : arr2.data().dptr_);
std::atomic<bool> success(true);
#pragma omp parallel for
#ifdef _MSC_VER
@@ -508,39 +518,42 @@ static bool SimilarArray(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2,
return success.load();
}
-template void FallBackCompute(void (*)(nnvm::NodeAttrs const &, OpContext const &,
- std::vector<TBlob, std::allocator<TBlob> > const &,
- std::vector<OpReqType, std::allocator<OpReqType> > const &,
- std::vector<TBlob, std::allocator<TBlob> > const &),
- nnvm::NodeAttrs const &, OpContext const &,
- std::vector<NDArray, std::allocator<NDArray> > const &,
- std::vector<OpReqType, std::allocator<OpReqType> > const &,
- std::vector<NDArray, std::allocator<NDArray> > const &);
-
-template void FallBackCompute(void (*)(OpStatePtr const &, OpContext const &,
- std::vector<TBlob, std::allocator<TBlob> > const &,
- std::vector<OpReqType, std::allocator<OpReqType> > const &,
- std::vector<TBlob, std::allocator<TBlob> > const &),
- OpStatePtr const &, OpContext const &,
- std::vector<NDArray, std::allocator<NDArray> > const &,
- std::vector<OpReqType, std::allocator<OpReqType> > const &,
- std::vector<NDArray, std::allocator<NDArray> > const &);
-
-void OpCheck::Init(const std::vector<mxnet::NDArray> &inputs_,
- const std::vector<mxnet::NDArray> &outputs_) {
+template void FallBackCompute(void (*)(nnvm::NodeAttrs const&,
+ OpContext const&,
+ std::vector<TBlob, std::allocator<TBlob> > const&,
+ std::vector<OpReqType, std::allocator<OpReqType> > const&,
+ std::vector<TBlob, std::allocator<TBlob> > const&),
+ nnvm::NodeAttrs const&,
+ OpContext const&,
+ std::vector<NDArray, std::allocator<NDArray> > const&,
+ std::vector<OpReqType, std::allocator<OpReqType> > const&,
+ std::vector<NDArray, std::allocator<NDArray> > const&);
+
+template void FallBackCompute(void (*)(OpStatePtr const&,
+ OpContext const&,
+ std::vector<TBlob, std::allocator<TBlob> > const&,
+ std::vector<OpReqType, std::allocator<OpReqType> > const&,
+ std::vector<TBlob, std::allocator<TBlob> > const&),
+ OpStatePtr const&,
+ OpContext const&,
+ std::vector<NDArray, std::allocator<NDArray> > const&,
+ std::vector<OpReqType, std::allocator<OpReqType> > const&,
+ std::vector<NDArray, std::allocator<NDArray> > const&);
+
+void OpCheck::Init(const std::vector<mxnet::NDArray>& inputs_,
+ const std::vector<mxnet::NDArray>& outputs_) {
auto ctx = inputs_[0].ctx();
CHECK(!MKLDNNStream::Get()->HasOps());
for (size_t i = 0; i < inputs_.size(); i++) {
NDArray data = inputs_[i];
inputs.emplace_back(data.shape(), ctx, false, data.dtype());
if (data.IsMKLDNNData() && data.IsView())
- data = data.Reorder2Default();
+ data = data.Reorder2Default();
auto mem = data.GetMKLDNNData();
inputs[i].CopyFrom(*mem);
}
for (size_t i = 0; i < outputs_.size(); i++) {
- outputs.emplace_back(outputs_[i].shape(), ctx,
- false, outputs_[i].dtype());
+ outputs.emplace_back(outputs_[i].shape(), ctx, false, outputs_[i].dtype());
if (backward) {
auto mem = outputs_[i].GetMKLDNNData();
outputs[i].CopyFrom(*mem);
@@ -549,18 +562,20 @@ void OpCheck::Init(const std::vector<mxnet::NDArray> &inputs_,
MKLDNNStream::Get()->Submit();
}
-void OpCheck::Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const std::vector<mxnet::NDArray> &inputs_,
- const std::vector<mxnet::OpReqType> &req,
- const std::vector<mxnet::NDArray> &outputs_) {
+void OpCheck::Run(mxnet::FCompute fn,
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const std::vector<mxnet::NDArray>& inputs_,
+ const std::vector<mxnet::OpReqType>& req,
+ const std::vector<mxnet::NDArray>& outputs_) {
static auto& is_excluded = Op::GetAttr<bool>("TExcludeMKLDNNDebug");
if (is_excluded.get(attrs.op, false)) {
LOG(WARNING) << attrs.op->name << " not checked. TExcludeMKLDNNDebug flag present";
return;
}
std::vector<mxnet::TBlob> in_blobs(inputs.size());
- for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data();
+ for (size_t i = 0; i < in_blobs.size(); i++)
+ in_blobs[i] = inputs[i].data();
std::vector<mxnet::TBlob> out_blobs(outputs.size());
for (size_t i = 0; i < out_blobs.size(); i++)
out_blobs[i] = outputs[i].data();
@@ -568,14 +583,14 @@ void OpCheck::Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs,
if (dmlc::GetEnv("MXNET_ONEDNN_DEBUG", false))
LOG(INFO) << "test " << attrs.op->name;
size_t num = std::min(outputs.size(), outputs_.size());
- num = std::min(num_checks, num);
+ num = std::min(num_checks, num);
for (size_t i = 0; i < num; i++) {
// We don't need to compare if it doesn't need to output data.
if (req[i] == kNullOp)
continue;
MSHADOW_TYPE_SWITCH(outputs[i].dtype(), DType, {
- bool similar = SimilarArray<DType>(outputs[i], outputs_[i], static_cast<DType>(1e-2),
- static_cast<DType>(1e-2));
+ bool similar = SimilarArray<DType>(
+ outputs[i], outputs_[i], static_cast<DType>(1e-2), static_cast<DType>(1e-2));
if (!similar) {
LOG(ERROR) << attrs.op->name << " fails";
}
@@ -584,10 +599,10 @@ void OpCheck::Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs,
}
}
-void OpCheck::CopyResult(const std::vector<mxnet::NDArray> &outputs_,
- const std::vector<size_t> &indice) {
+void OpCheck::CopyResult(const std::vector<mxnet::NDArray>& outputs_,
+ const std::vector<size_t>& indice) {
CHECK(!MKLDNNStream::Get()->HasOps());
- auto non_const_outputs_ = const_cast<std::vector<mxnet::NDArray> &>(outputs_);
+ auto non_const_outputs_ = const_cast<std::vector<mxnet::NDArray>&>(outputs_);
for (auto i = indice.begin(); i != indice.end(); ++i) {
auto mem = outputs[*i].GetMKLDNNData();
non_const_outputs_[*i].CopyFrom(*mem);
@@ -595,14 +610,15 @@ void OpCheck::CopyResult(const std::vector<mxnet::NDArray> &outputs_,
MKLDNNStream::Get()->Submit();
}
-bool MKLDNNStorageType(const nnvm::NodeAttrs &attrs,
+bool MKLDNNStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
bool support_mkldnn,
- DispatchMode *dispatch_mode,
- std::vector<int> *in_attrs,
- std::vector<int> *out_attrs) {
+ DispatchMode* dispatch_mode,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
for (int& v : *in_attrs)
- if (v == - 1) v = kDefaultStorage;
+ if (v == -1)
+ v = kDefaultStorage;
DispatchMode wanted_mode;
#if MXNET_USE_ONEDNN == 1
@@ -616,8 +632,8 @@ bool MKLDNNStorageType(const nnvm::NodeAttrs &attrs,
bool dispatched = false;
if (!dispatched && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) {
- dispatched = op::storage_type_assign(out_attrs, mxnet::kDefaultStorage,
- dispatch_mode, wanted_mode);
+ dispatched =
+ op::storage_type_assign(out_attrs, mxnet::kDefaultStorage, dispatch_mode, wanted_mode);
}
if (!dispatched) {
dispatched = op::dispatch_fallback(out_attrs, dispatch_mode);
@@ -625,10 +641,10 @@ bool MKLDNNStorageType(const nnvm::NodeAttrs &attrs,
return dispatched;
}
-inline static const std::vector<NDArray> GetMKLDNNInputArray(const std::vector<NDArray> &inputs) {
+inline static const std::vector<NDArray> GetMKLDNNInputArray(const std::vector<NDArray>& inputs) {
std::vector<NDArray> ret;
ret.reserve(inputs.size());
- for (const auto &in : inputs) {
+ for (const auto& in : inputs) {
if (in.IsView() && in.IsMKLDNNData()) {
ret.push_back(in.Reorder2Default());
} else {
@@ -639,11 +655,11 @@ inline static const std::vector<NDArray> GetMKLDNNInputArray(const std::vector<N
}
void MKLDNNRun(mxnet::FComputeEx fn,
- const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const std::vector<mxnet::NDArray> &inputs,
- const std::vector<mxnet::OpReqType> &req,
- const std::vector<mxnet::NDArray> &outputs) {
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const std::vector<mxnet::NDArray>& inputs,
+ const std::vector<mxnet::OpReqType>& req,
+ const std::vector<mxnet::NDArray>& outputs) {
if (CheckMKLDNNInputArrayIsView(inputs)) {
const auto mkldnn_inputs = GetMKLDNNInputArray(inputs);
fn(attrs, ctx, mkldnn_inputs, req, outputs);
@@ -653,11 +669,11 @@ void MKLDNNRun(mxnet::FComputeEx fn,
}
void MKLDNNRun(FComputeExUnary fn,
- const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const mxnet::NDArray &input,
- const mxnet::OpReqType &req,
- const mxnet::NDArray &output) {
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const mxnet::NDArray& input,
+ const mxnet::OpReqType& req,
+ const mxnet::NDArray& output) {
auto mkldnn_input = input;
if (input.IsView() && input.IsMKLDNNData()) {
mkldnn_input = input.Reorder2Default();
diff --git a/src/operator/nn/mkldnn/mkldnn_batch_dot-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_dot-inl.h
index 16b7ade..34c3eb9 100644
--- a/src/operator/nn/mkldnn/mkldnn_batch_dot-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_batch_dot-inl.h
@@ -29,30 +29,33 @@
#include <numeric>
#include <utility>
#include <vector>
-#include "../../tensor/dot-inl.h"
+
#include "./mkldnn_base-inl.h"
#include "./mkldnn_ops-inl.h"
+#include "../../tensor/dot-inl.h"
+
namespace mxnet {
namespace op {
-using batch_dot_fwd_t = mkldnn::matmul;
+using batch_dot_fwd_t = mkldnn::matmul;
using batch_dot_fwd_pd_t = mkldnn::matmul::primitive_desc;
typedef ParamOpSign<DotParam> BatchDotSignature;
class MKLDNNBatchDotFwd {
public:
- static MKLDNNBatchDotFwd &GetCached(const DotParam ¶m,
- const std::vector<NDArray> &inputs,
- const std::vector<NDArray> &outputs);
+ static MKLDNNBatchDotFwd& GetCached(const DotParam& param,
+ const std::vector<NDArray>& inputs,
+ const std::vector<NDArray>& outputs);
- MKLDNNBatchDotFwd(const DotParam ¶m, const std::vector<NDArray> &inputs,
- const std::vector<NDArray> &outputs);
+ MKLDNNBatchDotFwd(const DotParam& param,
+ const std::vector<NDArray>& inputs,
+ const std::vector<NDArray>& outputs);
- void Execute(const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs);
+ void Execute(const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs);
private:
std::shared_ptr<batch_dot_fwd_t> fwd;
diff --git a/src/operator/nn/mkldnn/mkldnn_batch_dot.cc b/src/operator/nn/mkldnn/mkldnn_batch_dot.cc
index a5f0cf1..f7c93ef 100644
--- a/src/operator/nn/mkldnn/mkldnn_batch_dot.cc
+++ b/src/operator/nn/mkldnn/mkldnn_batch_dot.cc
@@ -28,28 +28,26 @@
namespace mxnet {
namespace op {
-bool SupportMKLDNNBatchDot(const std::vector<NDArray> &inputs,
- const NDArray &output) {
+bool SupportMKLDNNBatchDot(const std::vector<NDArray>& inputs, const NDArray& output) {
return inputs[0].shape().Size() != 0 && inputs[1].shape().Size() != 0 &&
output.shape().Size() != 0 &&
- (inputs[0].dtype() == mshadow::kFloat32 ||
- inputs[0].dtype() == mshadow::kBfloat16);
+ (inputs[0].dtype() == mshadow::kFloat32 || inputs[0].dtype() == mshadow::kBfloat16);
}
-void MKLDNNBatchDotForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
- const DotParam ¶m = nnvm::get<DotParam>(attrs.parsed);
- MKLDNNBatchDotFwd &fwd = MKLDNNBatchDotFwd::GetCached(param, inputs, outputs);
+void MKLDNNBatchDotForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
+ const DotParam& param = nnvm::get<DotParam>(attrs.parsed);
+ MKLDNNBatchDotFwd& fwd = MKLDNNBatchDotFwd::GetCached(param, inputs, outputs);
fwd.Execute(inputs, req, outputs);
}
-MKLDNNBatchDotFwd &MKLDNNBatchDotFwd::GetCached(
- const DotParam ¶m, const std::vector<NDArray> &inputs,
- const std::vector<NDArray> &outputs) {
- using batch_dot_fwd_map =
- std::unordered_map<BatchDotSignature, MKLDNNBatchDotFwd, OpHash>;
+MKLDNNBatchDotFwd& MKLDNNBatchDotFwd::GetCached(const DotParam& param,
+ const std::vector<NDArray>& inputs,
+ const std::vector<NDArray>& outputs) {
+ using batch_dot_fwd_map = std::unordered_map<BatchDotSignature, MKLDNNBatchDotFwd, OpHash>;
#if DMLC_CXX11_THREAD_LOCAL
static thread_local batch_dot_fwd_map fwds;
#else
@@ -69,52 +67,48 @@ MKLDNNBatchDotFwd &MKLDNNBatchDotFwd::GetCached(
return it->second;
}
-MKLDNNBatchDotFwd::MKLDNNBatchDotFwd(const DotParam ¶m,
- const std::vector<NDArray> &inputs,
- const std::vector<NDArray> &outputs) {
- auto shape = inputs[0].shape();
- auto ndim = shape.ndim();
+MKLDNNBatchDotFwd::MKLDNNBatchDotFwd(const DotParam& param,
+ const std::vector<NDArray>& inputs,
+ const std::vector<NDArray>& outputs) {
+ auto shape = inputs[0].shape();
+ auto ndim = shape.ndim();
auto bigDim = shape[0];
for (size_t i = 1; i < ndim - 2; ++i) {
bigDim *= shape[i];
}
- auto GetMemoryDesc = [&ndim, &bigDim](const NDArray &tensor,
- const bool transpose) {
+ auto GetMemoryDesc = [&ndim, &bigDim](const NDArray& tensor, const bool transpose) {
auto shape = tensor.shape();
if (transpose) {
- return mkldnn::memory::desc(
- mkldnn::memory::dims{bigDim, shape[ndim - 1], shape[ndim - 2]},
- get_mkldnn_type(tensor.dtype()), mkldnn::memory::format_tag::acb);
+ return mkldnn::memory::desc(mkldnn::memory::dims{bigDim, shape[ndim - 1], shape[ndim - 2]},
+ get_mkldnn_type(tensor.dtype()),
+ mkldnn::memory::format_tag::acb);
} else {
- return mkldnn::memory::desc(
- mkldnn::memory::dims{bigDim, shape[ndim - 2], shape[ndim - 1]},
- get_mkldnn_type(tensor.dtype()), mkldnn::memory::format_tag::any);
+ return mkldnn::memory::desc(mkldnn::memory::dims{bigDim, shape[ndim - 2], shape[ndim - 1]},
+ get_mkldnn_type(tensor.dtype()),
+ mkldnn::memory::format_tag::any);
}
};
- mkldnn::memory::desc data_md = GetMemoryDesc(inputs[0], param.transpose_a);
+ mkldnn::memory::desc data_md = GetMemoryDesc(inputs[0], param.transpose_a);
mkldnn::memory::desc weights_md = GetMemoryDesc(inputs[1], param.transpose_b);
mkldnn::memory::desc out_md({bigDim, data_md.dims()[1], weights_md.dims()[2]},
get_mkldnn_type(outputs[0].dtype()),
mkldnn::memory::format_tag::any);
mkldnn::matmul::desc fwd_desc(data_md, weights_md, out_md);
- fwd_pd = std::make_shared<batch_dot_fwd_pd_t>(
- fwd_desc, mxnet::CpuEngine::Get()->get_engine());
- fwd = std::make_shared<batch_dot_fwd_t>(*fwd_pd);
+ fwd_pd = std::make_shared<batch_dot_fwd_pd_t>(fwd_desc, mxnet::CpuEngine::Get()->get_engine());
+ fwd = std::make_shared<batch_dot_fwd_t>(*fwd_pd);
}
-void MKLDNNBatchDotFwd::Execute(const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+void MKLDNNBatchDotFwd::Execute(const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
auto engine = mxnet::CpuEngine::Get()->get_engine();
- auto data = mkldnn::memory(fwd_pd->src_desc(), engine,
- reinterpret_cast<void *>(inputs[0].data().dptr_));
- auto weights =
- mkldnn::memory(fwd_pd->weights_desc(), engine,
- reinterpret_cast<void *>(inputs[1].data().dptr_));
- mkldnn_output_t out_mem =
- CreateMKLDNNMem(outputs[0], fwd_pd->dst_desc(), req[0], &inputs[0]);
+ auto data =
+ mkldnn::memory(fwd_pd->src_desc(), engine, reinterpret_cast<void*>(inputs[0].data().dptr_));
+ auto weights = mkldnn::memory(
+ fwd_pd->weights_desc(), engine, reinterpret_cast<void*>(inputs[1].data().dptr_));
+ mkldnn_output_t out_mem = CreateMKLDNNMem(outputs[0], fwd_pd->dst_desc(), req[0], &inputs[0]);
mkldnn_args_map_t args = {
{MKLDNN_ARG_SRC, data},
diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
index 5a6f84c..2a4b2bf 100644
--- a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
@@ -21,56 +21,57 @@
* \file mkldnn_batch_norm.cc
* \brief
* \author Tao Lv
-*/
+ */
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
#if MXNET_USE_ONEDNN == 1
-#include <vector>
-#include <utility>
#include <mkldnn.hpp>
-#include "../batch_norm-inl.h"
-#include "./mkldnn_ops-inl.h"
+
+#include <utility>
+#include <vector>
+
#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
+
+#include "../batch_norm-inl.h"
-#define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0/std::sqrt((__var$) + DType(__eps$)))
-#define INVSTD_TO_VARIANCE(__invstd$, __eps$) ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$))
namespace mxnet {
namespace op {
-typedef mkldnn::batch_normalization_forward::primitive_desc t_bn_f_pdesc;
-typedef mkldnn::batch_normalization_forward::desc t_bn_f_desc;
-typedef mkldnn::batch_normalization_backward::primitive_desc t_bn_b_pdesc;
-typedef mkldnn::batch_normalization_backward::desc t_bn_b_desc;
+typedef mkldnn::batch_normalization_forward::primitive_desc t_bn_f_pdesc;
+typedef mkldnn::batch_normalization_forward::desc t_bn_f_desc;
+typedef mkldnn::batch_normalization_backward::primitive_desc t_bn_b_pdesc;
+typedef mkldnn::batch_normalization_backward::desc t_bn_b_desc;
-inline static mkldnn::normalization_flags _GetFlags(const std::vector<NDArray> &in_data,
- const std::vector<NDArray> &aux_states,
+inline static mkldnn::normalization_flags _GetFlags(const std::vector<NDArray>& in_data,
+ const std::vector<NDArray>& aux_states,
bool is_train_and_not_global_stats,
bool fuse_relu) {
mkldnn::normalization_flags flags = static_cast<mkldnn::normalization_flags>(0U);
if (in_data.size() == 3U) {
- flags |= mkldnn::normalization_flags::use_scale_shift;
+ flags |= mkldnn::normalization_flags::use_scale_shift;
}
// aux_states[0]: inMean
// aux_states[1]: inVariance
if (aux_states.size() == 2U && !is_train_and_not_global_stats) {
- flags |= mkldnn::normalization_flags::use_global_stats;
+ flags |= mkldnn::normalization_flags::use_global_stats;
}
if (fuse_relu) {
- flags |= mkldnn::normalization_flags::fuse_norm_relu;
+ flags |= mkldnn::normalization_flags::fuse_norm_relu;
}
return flags;
}
-inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory &data_mem,
+inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory& data_mem,
bool is_train,
float eps,
mkldnn::normalization_flags flags) {
- auto data_md = data_mem.get_desc();
- auto engine = CpuEngine::Get()->get_engine();
+ auto data_md = data_mem.get_desc();
+ auto engine = CpuEngine::Get()->get_engine();
if (is_train) {
t_bn_f_desc bnFwd_desc(mkldnn::prop_kind::forward_training, data_md, eps, flags);
@@ -81,15 +82,15 @@ inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory &data_mem,
}
}
-inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory &data_mem,
- const mkldnn::memory &diff_mem,
+inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory& data_mem,
+ const mkldnn::memory& diff_mem,
float eps,
mkldnn::normalization_flags flags) {
- auto data_md = data_mem.get_desc();
- auto diff_md = diff_mem.get_desc();
- auto engine = CpuEngine::Get()->get_engine();
+ auto data_md = data_mem.get_desc();
+ auto diff_md = diff_mem.get_desc();
+ auto engine = CpuEngine::Get()->get_engine();
- t_bn_b_desc bnBwd_desc(mkldnn::prop_kind::backward, diff_md, data_md, eps, flags);
+ t_bn_b_desc bnBwd_desc(mkldnn::prop_kind::backward, diff_md, data_md, eps, flags);
return t_bn_b_pdesc(bnBwd_desc, engine, _GetFwd(data_mem, true, eps, flags));
}
@@ -102,28 +103,29 @@ class MKLDNNBNForward {
t_bn_f_pdesc pd;
public:
- MKLDNNBNForward(const t_bn_f_pdesc &_pd, bool is_train_and_not_global_stats): pd(_pd) {
+ MKLDNNBNForward(const t_bn_f_pdesc& _pd, bool is_train_and_not_global_stats) : pd(_pd) {
weight_m.reset(new mkldnn::memory(pd.weights_desc(), CpuEngine::Get()->get_engine()));
fwd.reset(new mkldnn::batch_normalization_forward(pd));
this->is_train_and_not_global_stats = is_train_and_not_global_stats;
}
- const mkldnn::memory &GetWeight() const {
+ const mkldnn::memory& GetWeight() const {
return *weight_m;
}
- const t_bn_f_pdesc &GetPd() const {
+ const t_bn_f_pdesc& GetPd() const {
return pd;
}
- const mkldnn::batch_normalization_forward &GetFwd() const {
+ const mkldnn::batch_normalization_forward& GetFwd() const {
return *fwd;
}
};
-template<typename DType>
-static MKLDNNBNForward &GetBNForward(const BatchNormParam& param,
- const OpContext &ctx, const mkldnn::memory *data_mem,
+template <typename DType>
+static MKLDNNBNForward& GetBNForward(const BatchNormParam& param,
+ const OpContext& ctx,
+ const mkldnn::memory* data_mem,
mkldnn::normalization_flags flags) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<MKLDNNBNSignature, MKLDNNBNForward, OpHash> fwds;
@@ -137,8 +139,7 @@ static MKLDNNBNForward &GetBNForward(const BatchNormParam& param,
auto it = fwds.find(key);
if (it == fwds.end()) {
- auto fwd_pd = _GetFwd(*data_mem, ctx.is_train,
- param.eps, flags);
+ auto fwd_pd = _GetFwd(*data_mem, ctx.is_train, param.eps, flags);
MKLDNNBNForward fwd(fwd_pd, ctx.is_train && !param.use_global_stats);
it = AddToCache(&fwds, key, fwd);
}
@@ -146,10 +147,13 @@ static MKLDNNBNForward &GetBNForward(const BatchNormParam& param,
}
template <typename DType>
-void MKLDNNBatchNormForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &inputs, const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs, bool fuse_relu) {
- const BatchNormParam ¶m = nnvm::get<BatchNormParam>(attrs.parsed);
+void MKLDNNBatchNormForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs,
+ bool fuse_relu) {
+ const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
std::vector<NDArray> in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean);
mxnet::TShape shape = inputs[batchnorm::kData].shape();
@@ -159,96 +163,92 @@ void MKLDNNBatchNormForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
if (param.axis != 1 || shape.ndim() != 4) {
// reshape to (N, C, 1, D)
mxnet::TShape new_shape{
- static_cast<index_t>(shape.ProdShape(0, real_axis)),
- shape[real_axis],
- 1,
- static_cast<index_t>(shape.ProdShape(real_axis + 1,
- static_cast<int>(shape.ndim())))
- };
+ static_cast<index_t>(shape.ProdShape(0, real_axis)),
+ shape[real_axis],
+ 1,
+ static_cast<index_t>(shape.ProdShape(real_axis + 1, static_cast<int>(shape.ndim())))};
in_data[batchnorm::kData] = in_data[batchnorm::kData].Reshape(new_shape);
- out = out.Reshape(new_shape);
+ out = out.Reshape(new_shape);
}
const std::vector<NDArray> aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end());
TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]);
- mkldnn::normalization_flags flags = _GetFlags(in_data,
- aux_states,
- ctx.is_train && !param.use_global_stats,
- fuse_relu);
- NDArray &data = in_data[batchnorm::kData];
+ mkldnn::normalization_flags flags =
+ _GetFlags(in_data, aux_states, ctx.is_train && !param.use_global_stats, fuse_relu);
+ NDArray& data = in_data[batchnorm::kData];
if (data.IsMKLDNNData() && data.IsView())
data = data.Reorder2Default();
auto data_mem = data.GetMKLDNNData();
- auto &fwd = GetBNForward<DType>(param, ctx, data_mem, flags);
+ auto& fwd = GetBNForward<DType>(param, ctx, data_mem, flags);
// for output memory
- auto out_mem = const_cast<NDArray &>(out).CreateMKLDNNData(fwd.GetPd().dst_desc());
+ auto out_mem = const_cast<NDArray&>(out).CreateMKLDNNData(fwd.GetPd().dst_desc());
// mxnet will always use scale shift.
// But if fix_gamma is true, then all scale elements will be set to 1.0f
if (static_cast<int>(flags) & static_cast<int>(mkldnn::normalization_flags::use_scale_shift)) {
- const NDArray &gamma = in_data[batchnorm::kGamma];
- const NDArray &beta = in_data[batchnorm::kBeta];
+ const NDArray& gamma = in_data[batchnorm::kGamma];
+ const NDArray& beta = in_data[batchnorm::kBeta];
CHECK_EQ(gamma.storage_type(), mxnet::kDefaultStorage);
CHECK_EQ(beta.storage_type(), mxnet::kDefaultStorage);
- const mkldnn::memory &weight_mem = fwd.GetWeight();
- float* weight_buf = reinterpret_cast<float *>(weight_mem.get_data_handle());
+ const mkldnn::memory& weight_mem = fwd.GetWeight();
+ float* weight_buf = reinterpret_cast<float*>(weight_mem.get_data_handle());
index_t channels_ = data.shape()[1];
CHECK(weight_mem.get_desc().get_size() == channels_ * sizeof(float) * 2);
- float* weight_ptr = gamma.data().dptr<float>();
- float* bias_ptr = beta.data().dptr<float>();
+ float* weight_ptr = gamma.data().dptr<float>();
+ float* bias_ptr = beta.data().dptr<float>();
const size_t copy_size = sizeof(weight_buf[0]) * channels_;
if (!param.fix_gamma) {
memcpy(weight_buf, weight_ptr, copy_size);
memcpy(&weight_buf[channels_], bias_ptr, copy_size);
} else if (IsBNWriting(req[batchnorm::kGamma])) {
for (index_t i = 0; i < channels_; i++) {
- weight_buf[i] = 1.0f;
- weight_ptr[i] = 1.0f;
+ weight_buf[i] = 1.0f;
+ weight_ptr[i] = 1.0f;
weight_buf[channels_ + i] = bias_ptr[i]; // bias
}
} else {
for (index_t i = 0; i < channels_; i++) {
- weight_buf[i] = 1.0f;
+ weight_buf[i] = 1.0f;
weight_buf[channels_ + i] = bias_ptr[i]; // bias
}
}
mkldnn_args_map_t net_args;
- net_args[MKLDNN_ARG_SRC] = *data_mem;
+ net_args[MKLDNN_ARG_SRC] = *data_mem;
net_args[MKLDNN_ARG_SCALE_SHIFT] = weight_mem;
- net_args[MKLDNN_ARG_DST] = *out_mem;
+ net_args[MKLDNN_ARG_DST] = *out_mem;
if (fuse_relu) {
- const NDArray *workspace = nullptr;
- workspace = &outputs[3];
- auto engine = CpuEngine::Get()->get_engine();
+ const NDArray* workspace = nullptr;
+ workspace = &outputs[3];
+ auto engine = CpuEngine::Get()->get_engine();
if (workspace == nullptr) {
- LOG(FATAL) << "MKLDNN BatchNorm: incorrect workspace input";
+ LOG(FATAL) << "MKLDNN BatchNorm: incorrect workspace input";
}
- auto ws = std::make_shared<mkldnn::memory>(fwd.GetPd().workspace_desc(),
- engine, workspace->GetMKLDNNData()->get_data_handle());
+ auto ws = std::make_shared<mkldnn::memory>(
+ fwd.GetPd().workspace_desc(), engine, workspace->GetMKLDNNData()->get_data_handle());
net_args[MKLDNN_ARG_WORKSPACE] = *ws;
}
if (!ctx.is_train || param.use_global_stats) {
- float* omean = outputs[batchnorm::kMean].data().dptr<float>();
- float* ovar = outputs[batchnorm::kVar].data().dptr<float>();
- float* inmean = aux_states[batchnorm::kMovingMean].data().dptr<float>();
- float* invar = aux_states[batchnorm::kMovingVar].data().dptr<float>();
+ float* omean = outputs[batchnorm::kMean].data().dptr<float>();
+ float* ovar = outputs[batchnorm::kVar].data().dptr<float>();
+ float* inmean = aux_states[batchnorm::kMovingMean].data().dptr<float>();
+ float* invar = aux_states[batchnorm::kMovingVar].data().dptr<float>();
// to align with origin implmentation: batch_norm.cc: L164
for (index_t i = 0; i < channels_; i++) {
omean[i] = inmean[i];
- ovar[i] = VARIANCE_TO_INVSTD(invar[i], param.eps);
+ ovar[i] = VARIANCE_TO_INVSTD(invar[i], param.eps);
}
- net_args[MKLDNN_ARG_MEAN] = *(aux_states[batchnorm::kMovingMean].GetMKLDNNData());
+ net_args[MKLDNN_ARG_MEAN] = *(aux_states[batchnorm::kMovingMean].GetMKLDNNData());
net_args[MKLDNN_ARG_VARIANCE] = *(aux_states[batchnorm::kMovingVar].GetMKLDNNData());
MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args);
MKLDNNStream::Get()->Submit();
} else { // training
- const NDArray &outMean = outputs[batchnorm::kMean];
- const NDArray &outVar = outputs[batchnorm::kVar];
- net_args[MKLDNN_ARG_MEAN] = *(outMean.GetMKLDNNData());
+ const NDArray& outMean = outputs[batchnorm::kMean];
+ const NDArray& outVar = outputs[batchnorm::kVar];
+ net_args[MKLDNN_ARG_MEAN] = *(outMean.GetMKLDNNData());
net_args[MKLDNN_ARG_VARIANCE] = *(outVar.GetMKLDNNData());
MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args);
MKLDNNStream::Get()->Submit();
@@ -271,25 +271,34 @@ class MKLDNNBNBackward {
public:
const t_bn_b_pdesc pd;
- explicit MKLDNNBNBackward(const t_bn_b_pdesc &_pd)
+ explicit MKLDNNBNBackward(const t_bn_b_pdesc& _pd)
: weight_m(new mkldnn::memory(_pd.weights_desc(), CpuEngine::Get()->get_engine())),
gradw_m(new mkldnn::memory(_pd.diff_weights_desc(), CpuEngine::Get()->get_engine())),
pd(_pd) {
bwd.reset(new mkldnn::batch_normalization_backward(pd));
}
- const mkldnn::memory &GetWeight() const { return *weight_m; }
+ const mkldnn::memory& GetWeight() const {
+ return *weight_m;
+ }
- const mkldnn::memory &GetGradw() const { return *gradw_m; }
+ const mkldnn::memory& GetGradw() const {
+ return *gradw_m;
+ }
- const mkldnn::batch_normalization_backward &GetBwd() const { return *bwd; }
+ const mkldnn::batch_normalization_backward& GetBwd() const {
+ return *bwd;
+ }
};
template <typename DType>
-static MKLDNNBNBackward &GetBNBackward(
- const BatchNormParam ¶m, const OpContext &ctx, const NDArray &in_data,
- const mkldnn::memory &in_mem, const NDArray &diff_data,
- const mkldnn::memory &diff_mem, mkldnn::normalization_flags flags) {
+static MKLDNNBNBackward& GetBNBackward(const BatchNormParam& param,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const mkldnn::memory& in_mem,
+ const NDArray& diff_data,
+ const mkldnn::memory& diff_mem,
+ mkldnn::normalization_flags flags) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<MKLDNNBNSignature, MKLDNNBNBackward, OpHash> bwds;
#else
@@ -310,41 +319,42 @@ static MKLDNNBNBackward &GetBNBackward(
}
template <typename DType>
-void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &inputs, const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs, bool fuse_relu) {
+void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs,
+ bool fuse_relu) {
if (fuse_relu) {
CHECK_EQ(inputs.size(), 9U);
} else {
CHECK_EQ(inputs.size(), 8U);
}
- const BatchNormParam ¶m = nnvm::get<BatchNormParam>(attrs.parsed);
+ const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
std::vector<NDArray> out_grad(1);
std::vector<NDArray> out_data(3);
std::vector<NDArray> in_data(3);
std::vector<NDArray> aux_states(2);
- out_grad[0] = inputs[0];
- out_data[batchnorm::kMean] = inputs[1];
- out_data[batchnorm::kVar] = inputs[2];
- in_data[batchnorm::kData] = inputs[3];
- in_data[batchnorm::kGamma] = inputs[4];
- in_data[batchnorm::kBeta] = inputs[5];
- aux_states[batchnorm::kMovingMean] = inputs[6];
- aux_states[batchnorm::kMovingVar] = inputs[7];
- const std::vector<NDArray> &in_grad = outputs;
+ out_grad[0] = inputs[0];
+ out_data[batchnorm::kMean] = inputs[1];
+ out_data[batchnorm::kVar] = inputs[2];
+ in_data[batchnorm::kData] = inputs[3];
+ in_data[batchnorm::kGamma] = inputs[4];
+ in_data[batchnorm::kBeta] = inputs[5];
+ aux_states[batchnorm::kMovingMean] = inputs[6];
+ aux_states[batchnorm::kMovingVar] = inputs[7];
+ const std::vector<NDArray>& in_grad = outputs;
TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]);
- mkldnn::normalization_flags flags = _GetFlags(in_data,
- aux_states,
- ctx.is_train && !param.use_global_stats,
- fuse_relu);
-
- NDArray data = in_data[batchnorm::kData];
- NDArray diff = out_grad[batchnorm::kOut];
- NDArray gradIn = in_grad[batchnorm::kData];
- const NDArray &moving_mean = aux_states[batchnorm::kMovingMean];
- const NDArray &moving_var = aux_states[batchnorm::kMovingVar];
- const NDArray &out_mean = out_data[batchnorm::kMean];
- const NDArray &out_var = out_data[batchnorm::kVar];
+ mkldnn::normalization_flags flags =
+ _GetFlags(in_data, aux_states, ctx.is_train && !param.use_global_stats, fuse_relu);
+
+ NDArray data = in_data[batchnorm::kData];
+ NDArray diff = out_grad[batchnorm::kOut];
+ NDArray gradIn = in_grad[batchnorm::kData];
+ const NDArray& moving_mean = aux_states[batchnorm::kMovingMean];
+ const NDArray& moving_var = aux_states[batchnorm::kMovingVar];
+ const NDArray& out_mean = out_data[batchnorm::kMean];
+ const NDArray& out_var = out_data[batchnorm::kVar];
CHECK(out_mean.IsDefaultData());
CHECK(out_var.IsDefaultData());
@@ -357,36 +367,34 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
if (param.axis != 1 || shape.ndim() != 4) {
// reshape to (N, C, 1, D)
mxnet::TShape new_shape{
- static_cast<index_t>(shape.ProdShape(0, real_axis)),
- shape[real_axis],
- 1,
- static_cast<index_t>(shape.ProdShape(real_axis + 1,
- static_cast<int>(shape.ndim())))
- };
- data = data.Reshape(new_shape);
- diff = diff.Reshape(new_shape);
+ static_cast<index_t>(shape.ProdShape(0, real_axis)),
+ shape[real_axis],
+ 1,
+ static_cast<index_t>(shape.ProdShape(real_axis + 1, static_cast<int>(shape.ndim())))};
+ data = data.Reshape(new_shape);
+ diff = diff.Reshape(new_shape);
gradIn = gradIn.Reshape(new_shape);
}
- auto data_mem = data.GetMKLDNNData();
- auto diff_mem = diff.GetMKLDNNData();
+ auto data_mem = data.GetMKLDNNData();
+ auto diff_mem = diff.GetMKLDNNData();
// MKLDNN batchnorm should run on special layouts. If one of them isn't, we
// should reorder them.
if (data.IsDefaultData())
data_mem = data.GetMKLDNNDataReorder(diff_mem->get_desc());
else if (diff.IsDefaultData())
diff_mem = diff.GetMKLDNNDataReorder(data_mem->get_desc());
- auto &bwd = GetBNBackward<DType>(param, ctx, data, *data_mem, diff, *diff_mem, flags);
- auto gradi_mem = CreateMKLDNNMem(const_cast<NDArray &>(gradIn),
- bwd.pd.diff_src_desc(), req[batchnorm::kData]);
+ auto& bwd = GetBNBackward<DType>(param, ctx, data, *data_mem, diff, *diff_mem, flags);
+ auto gradi_mem =
+ CreateMKLDNNMem(const_cast<NDArray&>(gradIn), bwd.pd.diff_src_desc(), req[batchnorm::kData]);
if (static_cast<int>(flags) & static_cast<int>(mkldnn::normalization_flags::use_scale_shift)) {
- const NDArray &gamma = in_data[batchnorm::kGamma];
- const NDArray &beta = in_data[batchnorm::kBeta];
- DType *weight_buf = reinterpret_cast<DType *>(bwd.GetWeight().get_data_handle());
- index_t channels_ = data.shape()[1];
- DType *weight_ptr = gamma.data().dptr<DType>();
- DType* bias_ptr = beta.data().dptr<DType>();
+ const NDArray& gamma = in_data[batchnorm::kGamma];
+ const NDArray& beta = in_data[batchnorm::kBeta];
+ DType* weight_buf = reinterpret_cast<DType*>(bwd.GetWeight().get_data_handle());
+ index_t channels_ = data.shape()[1];
+ DType* weight_ptr = gamma.data().dptr<DType>();
+ DType* bias_ptr = beta.data().dptr<DType>();
const size_t copy_size = sizeof(DType) * channels_;
if (!param.fix_gamma) {
memcpy(weight_buf, weight_ptr, copy_size);
@@ -398,15 +406,15 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
memcpy(&weight_buf[channels_], bias_ptr, copy_size);
}
mkldnn_args_map_t net_args;
- net_args[MKLDNN_ARG_SRC] = *data_mem;
- net_args[MKLDNN_ARG_DIFF_SRC] = *gradi_mem.second;
- net_args[MKLDNN_ARG_SCALE_SHIFT] = bwd.GetWeight();
+ net_args[MKLDNN_ARG_SRC] = *data_mem;
+ net_args[MKLDNN_ARG_DIFF_SRC] = *gradi_mem.second;
+ net_args[MKLDNN_ARG_SCALE_SHIFT] = bwd.GetWeight();
net_args[MKLDNN_ARG_DIFF_SCALE_SHIFT] = bwd.GetGradw();
- net_args[MKLDNN_ARG_DIFF_DST] = *diff_mem;
+ net_args[MKLDNN_ARG_DIFF_DST] = *diff_mem;
if (fuse_relu) {
- const NDArray *workspace = nullptr;
- workspace = &inputs[8];
+ const NDArray* workspace = nullptr;
+ workspace = &inputs[8];
if (workspace != nullptr) {
net_args[MKLDNN_ARG_WORKSPACE] = *(workspace->GetMKLDNNData());
}
@@ -414,26 +422,24 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
// training but no input mean and variance
if (ctx.is_train && !param.use_global_stats) {
- DType* moving_mean_ptr = moving_mean.data().dptr<DType>();
- DType* moving_var_ptr = moving_var.data().dptr<DType>();
- DType* out_mean_ptr = out_mean.data().dptr<DType>();
- DType* out_var_ptr = out_var.data().dptr<DType>();
+ DType* moving_mean_ptr = moving_mean.data().dptr<DType>();
+ DType* moving_var_ptr = moving_var.data().dptr<DType>();
+ DType* out_mean_ptr = out_mean.data().dptr<DType>();
+ DType* out_var_ptr = out_var.data().dptr<DType>();
mkldnn::memory var_mem(bwd.pd.variance_desc(), CpuEngine::Get()->get_engine());
- DType *tmp_var_ptr = reinterpret_cast<DType *>(var_mem.get_data_handle());
+ DType* tmp_var_ptr = reinterpret_cast<DType*>(var_mem.get_data_handle());
DType minus_mom = (1.0f - param.momentum);
for (index_t i = 0; i < channels_; i++) {
- moving_mean_ptr[i] = moving_mean_ptr[i] * param.momentum +
- out_mean_ptr[i] * minus_mom;
- float variance = INVSTD_TO_VARIANCE(out_var_ptr[i], param.eps);
- tmp_var_ptr[i] = variance;
- moving_var_ptr[i] = moving_var_ptr[i] * param.momentum +
- variance * minus_mom;
+ moving_mean_ptr[i] = moving_mean_ptr[i] * param.momentum + out_mean_ptr[i] * minus_mom;
+ float variance = INVSTD_TO_VARIANCE(out_var_ptr[i], param.eps);
+ tmp_var_ptr[i] = variance;
+ moving_var_ptr[i] = moving_var_ptr[i] * param.momentum + variance * minus_mom;
}
- net_args[MKLDNN_ARG_MEAN] = *(out_mean.GetMKLDNNData());
+ net_args[MKLDNN_ARG_MEAN] = *(out_mean.GetMKLDNNData());
net_args[MKLDNN_ARG_VARIANCE] = var_mem;
} else {
- net_args[MKLDNN_ARG_MEAN] = *(moving_mean.GetMKLDNNData());
+ net_args[MKLDNN_ARG_MEAN] = *(moving_mean.GetMKLDNNData());
net_args[MKLDNN_ARG_VARIANCE] = *(moving_var.GetMKLDNNData());
}
MKLDNNStream::Get()->RegisterPrimArgs(bwd.GetBwd(), net_args);
@@ -441,9 +447,9 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
MKLDNNStream::Get()->Submit();
// copy data from gradw_mem to in_grad[1] and in_grad[2]
- DType *gw_buf = reinterpret_cast<DType *>(bwd.GetGradw().get_data_handle());
- DType *w_grad_1 = in_grad[batchnorm::kGamma].data().dptr<DType>();
- DType *w_grad_2 = in_grad[batchnorm::kBeta].data().dptr<DType>();
+ DType* gw_buf = reinterpret_cast<DType*>(bwd.GetGradw().get_data_handle());
+ DType* w_grad_1 = in_grad[batchnorm::kGamma].data().dptr<DType>();
+ DType* w_grad_2 = in_grad[batchnorm::kBeta].data().dptr<DType>();
// the gradient of gamma
if (!param.fix_gamma) {
@@ -467,7 +473,7 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
if (req[batchnorm::kBeta] != kAddTo) {
memcpy(w_grad_2, &gw_buf[channels_], copy_size);
} else {
- DType *grad_beta = &gw_buf[channels_];
+ DType* grad_beta = &gw_buf[channels_];
for (index_t i = 0; i < channels_; i++) {
w_grad_2[i] += grad_beta[i];
}
diff --git a/src/operator/nn/mkldnn/mkldnn_concat-inl.h b/src/operator/nn/mkldnn/mkldnn_concat-inl.h
index e23625a..a78b5a6 100644
--- a/src/operator/nn/mkldnn/mkldnn_concat-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_concat-inl.h
@@ -21,17 +21,18 @@
* \file mkldnn_concat-inl.h
* \brief
* \author
-*/
+ */
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONCAT_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONCAT_INL_H_
-
#if MXNET_USE_ONEDNN == 1
-#include <vector>
#include <utility>
-#include "../concat-inl.h"
-#include "./mkldnn_ops-inl.h"
+#include <vector>
+
#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
+
+#include "../concat-inl.h"
namespace mxnet {
namespace op {
@@ -40,17 +41,19 @@ class MKLDNNConcatFwd {
public:
mkldnn::concat::primitive_desc fwd_pd;
- MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc> &data_md);
+ MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc>& data_md);
- const mkldnn::concat &GetFwd() const { return *fwd_; }
+ const mkldnn::concat& GetFwd() const {
+ return *fwd_;
+ }
private:
std::shared_ptr<mkldnn::concat> fwd_;
};
-static MKLDNNConcatFwd &GetConcatForward(
- int concat_dim, const std::vector<NDArray> &in_data,
- const std::vector<mkldnn::memory::desc> &data_md) {
+static MKLDNNConcatFwd& GetConcatForward(int concat_dim,
+ const std::vector<NDArray>& in_data,
+ const std::vector<mkldnn::memory::desc>& data_md) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<OpSignature, MKLDNNConcatFwd, OpHash> fwds;
#else
diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc
index ff60c28..69dad1d 100644
--- a/src/operator/nn/mkldnn/mkldnn_concat.cc
+++ b/src/operator/nn/mkldnn/mkldnn_concat.cc
@@ -21,7 +21,7 @@
* \file mkldnn_concat.cc
* \brief
* \author
-*/
+ */
#if MXNET_USE_ONEDNN == 1
#include "mkldnn_concat-inl.h"
@@ -29,15 +29,16 @@
namespace mxnet {
namespace op {
-static inline bool IsUsingPadding(const mkldnn::memory::desc &dst_md) {
+static inline bool IsUsingPadding(const mkldnn::memory::desc& dst_md) {
// make sure a blocked format is used (at least one dimension is blocked)
- bool is_blocked_format = dst_md.data.format_kind == mkldnn_blocked &&
- dst_md.data.format_desc.blocking.inner_nblks > 0;
- return is_blocked_format && !std::equal(dst_md.data.dims, dst_md.data.dims + dst_md.data.ndims,
- dst_md.data.padded_dims);
+ bool is_blocked_format =
+ dst_md.data.format_kind == mkldnn_blocked && dst_md.data.format_desc.blocking.inner_nblks > 0;
+ return is_blocked_format &&
+ !std::equal(
+ dst_md.data.dims, dst_md.data.dims + dst_md.data.ndims, dst_md.data.padded_dims);
}
-MKLDNNConcatFwd::MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc> &data_md)
+MKLDNNConcatFwd::MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc>& data_md)
: fwd_pd(concat_dim, data_md, CpuEngine::Get()->get_engine()) {
// MKL-DNN introduced padded formats since 0.15 which require more memory
// compared to the actual size of the tensor. Currently, MKL-DNN operators
@@ -45,39 +46,39 @@ MKLDNNConcatFwd::MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memor
// format that has the expected memory size requirements (a plain format)
// When fwd_pd uses padding, impose a plain format
- const auto &dst_md = fwd_pd.dst_desc();
+ const auto& dst_md = fwd_pd.dst_desc();
if (IsUsingPadding(dst_md)) {
- auto plain_dst_tag = static_cast<mkldnn::memory::format_tag>(
- GetDefaultFormat(dst_md.data.ndims));
+ auto plain_dst_tag =
+ static_cast<mkldnn::memory::format_tag>(GetDefaultFormat(dst_md.data.ndims));
auto plain_dst_md = mkldnn::memory::desc(dst_md.dims(), dst_md.data_type(), plain_dst_tag);
- fwd_pd = mkldnn::concat::primitive_desc(plain_dst_md, concat_dim, data_md,
- CpuEngine::Get()->get_engine());
+ fwd_pd = mkldnn::concat::primitive_desc(
+ plain_dst_md, concat_dim, data_md, CpuEngine::Get()->get_engine());
}
fwd_ = std::make_shared<mkldnn::concat>(fwd_pd);
}
-void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
- const std::vector<NDArray> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &out_data) {
+void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& out_data) {
TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]);
const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
- const int num_in_data = param.num_args;
- const int concat_dim = param.dim;
+ const int num_in_data = param.num_args;
+ const int concat_dim = param.dim;
std::vector<mkldnn::memory::desc> data_md;
- std::vector<const mkldnn::memory *> data_mem;
+ std::vector<const mkldnn::memory*> data_mem;
data_md.reserve(num_in_data);
data_mem.reserve(num_in_data);
for (int i = 0; i < num_in_data; i++) {
- const mkldnn::memory *tmp_mem = in_data[i].GetMKLDNNData();
- mkldnn::memory::desc tmp_md = tmp_mem->get_desc();
+ const mkldnn::memory* tmp_mem = in_data[i].GetMKLDNNData();
+ mkldnn::memory::desc tmp_md = tmp_mem->get_desc();
data_md.push_back(tmp_md);
data_mem.push_back(tmp_mem);
}
- MKLDNNConcatFwd &fwd = GetConcatForward(concat_dim, in_data, data_md);
- mxnet::mkldnn_output_t out_mem = CreateMKLDNNMem(out_data[concat_enum::kOut],
- fwd.fwd_pd.dst_desc(),
- req[concat_enum::kOut]);
+ MKLDNNConcatFwd& fwd = GetConcatForward(concat_dim, in_data, data_md);
+ mxnet::mkldnn_output_t out_mem =
+ CreateMKLDNNMem(out_data[concat_enum::kOut], fwd.fwd_pd.dst_desc(), req[concat_enum::kOut]);
std::unordered_map<int, mkldnn::memory> net_args;
net_args.insert({MKLDNN_ARG_DST, *out_mem.second});
for (int i = 0; i < num_in_data; i++) {
@@ -88,35 +89,34 @@ void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
MKLDNNStream::Get()->Submit();
}
-void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& outputs) {
TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]);
const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
- const int num_in_data = param.num_args;
- const int axis = param.dim;
- const auto gradz_mem = inputs[0].GetMKLDNNData();
+ const int num_in_data = param.num_args;
+ const int axis = param.dim;
+ const auto gradz_mem = inputs[0].GetMKLDNNData();
/* init the offset */
mkldnn::memory::dims offsets(outputs[0].shape().ndim());
- for (auto &v : offsets) {
+ for (auto& v : offsets) {
v = 0;
}
for (int i = 0; i < num_in_data; i++) {
mkldnn::memory::dims diff_src_tz(outputs[i].shape().begin(), outputs[i].shape().end());
auto diff_src_md = outputs[i].GetMKLDNNData()->get_desc();
- auto gradi_mem = CreateMKLDNNMem(outputs[i], diff_src_md, req[i]);
+ auto gradi_mem = CreateMKLDNNMem(outputs[i], diff_src_md, req[i]);
auto from_md = gradz_mem->get_desc().submemory_desc(diff_src_tz, offsets);
- auto from_mem = new mkldnn::memory(from_md, gradz_mem->get_engine(),
- gradz_mem->get_data_handle());
+ auto from_mem =
+ new mkldnn::memory(from_md, gradz_mem->get_engine(), gradz_mem->get_data_handle());
offsets[axis] += diff_src_tz[axis];
- std::unordered_map<int, mkldnn::memory> net_args({
- {MKLDNN_ARG_FROM, *gradz_mem},
- {MKLDNN_ARG_TO, *gradi_mem.second}
- });
+ std::unordered_map<int, mkldnn::memory> net_args(
+ {{MKLDNN_ARG_FROM, *gradz_mem}, {MKLDNN_ARG_TO, *gradi_mem.second}});
MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(*from_mem, *gradi_mem.second), net_args);
CommitOutput(outputs[i], gradi_mem);
}
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution-inl.h b/src/operator/nn/mkldnn/mkldnn_convolution-inl.h
index dfa365f..4197a01 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_convolution-inl.h
@@ -20,18 +20,20 @@
/*!
* \file mkldnn_convolution-inl.h
* \brief
-*/
+ */
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONVOLUTION_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONVOLUTION_INL_H_
#if MXNET_USE_ONEDNN == 1
-#include <vector>
#include <utility>
-#include "../convolution-inl.h"
-#include "./mkldnn_ops-inl.h"
+#include <vector>
+
#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
+
+#include "../convolution-inl.h"
namespace mxnet {
namespace op {
@@ -48,28 +50,26 @@ struct MKLDNNConvParam : public dmlc::Parameter<MKLDNNConvParam> {
dmlc::optional<float> max_calib_range; // max float value calculated from calibration dataset
DMLC_DECLARE_PARAMETER(MKLDNNConvParam) {
- DMLC_DECLARE_FIELD(with_bn).set_default(false)
- .describe("Add post batchnorm.");
- DMLC_DECLARE_FIELD(with_act).set_default(false)
- .describe("Add post activation");
- DMLC_DECLARE_FIELD(with_sum).set_default(false)
- .describe("Add post sum");
- DMLC_DECLARE_FIELD(with_postsum_act).set_default(false)
- .describe("Add post activation after sum");
- DMLC_DECLARE_FIELD(quantized).set_default(false)
- .describe("enable quantization");
- DMLC_DECLARE_FIELD(dedup_sum).set_default(false).
- describe("deduplicated sum input");
+ DMLC_DECLARE_FIELD(with_bn).set_default(false).describe("Add post batchnorm.");
+ DMLC_DECLARE_FIELD(with_act).set_default(false).describe("Add post activation");
+ DMLC_DECLARE_FIELD(with_sum).set_default(false).describe("Add post sum");
+ DMLC_DECLARE_FIELD(with_postsum_act)
+ .set_default(false)
+ .describe("Add post activation after sum");
+ DMLC_DECLARE_FIELD(quantized).set_default(false).describe("enable quantization");
+ DMLC_DECLARE_FIELD(dedup_sum).set_default(false).describe("deduplicated sum input");
DMLC_DECLARE_FIELD(min_calib_range)
- .set_default(dmlc::optional<float>())
- .describe("The minimum scalar value in the form of float32 obtained "
- "through calibration. If present, it will be used to by "
- "quantized convolution op to calculate primitive scale");
+ .set_default(dmlc::optional<float>())
+ .describe(
+ "The minimum scalar value in the form of float32 obtained "
+ "through calibration. If present, it will be used to by "
+ "quantized convolution op to calculate primitive scale");
DMLC_DECLARE_FIELD(max_calib_range)
- .set_default(dmlc::optional<float>())
- .describe("The maximum scalar value in the form of float32 obtained "
- "through calibration. If present, it will be used to by "
- "quantized convolution op to calculate primitive scale");
+ .set_default(dmlc::optional<float>())
+ .describe(
+ "The maximum scalar value in the form of float32 obtained "
+ "through calibration. If present, it will be used to by "
+ "quantized convolution op to calculate primitive scale");
}
};
@@ -83,17 +83,29 @@ struct MKLDNNConvFullParam {
};
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
- const ConvolutionParam ¶m, const bool is_train, const NDArray &data, const NDArray &weight,
- const NDArray *bias, const NDArray &output);
+ const ConvolutionParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output);
class MKLDNNConvForward {
public:
- MKLDNNConvForward(const MKLDNNConvFullParam ¶m, const bool is_train, const NDArray &data,
- const NDArray &weight, const NDArray *bias, const NDArray &output);
-
- const mkldnn::convolution_forward &GetFwd() const { return *fwd_; }
+ MKLDNNConvForward(const MKLDNNConvFullParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output);
+
+ const mkldnn::convolution_forward& GetFwd() const {
+ return *fwd_;
+ }
- const mkldnn::convolution_forward::primitive_desc &GetPd() const { return *pd_; }
+ const mkldnn::convolution_forward::primitive_desc& GetPd() const {
+ return *pd_;
+ }
private:
std::shared_ptr<mkldnn::convolution_forward> fwd_;
@@ -102,37 +114,47 @@ class MKLDNNConvForward {
typedef ParamOpSign<ConvolutionParam> MKLDNNConvSignature;
-MKLDNNConvForward &GetConvFwd(const MKLDNNConvFullParam ¶m, const bool is_train,
- const NDArray &data, const NDArray &weight, const NDArray *bias,
- const NDArray &output);
-
-void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam ¶m,
- const OpContext &ctx,
- MKLDNNConvForward *fwd,
- const std::vector<NDArray> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &out_data);
-
-void MKLDNNConvolutionForward(const nnvm::NodeAttrs &attrs,
- const OpContext &ctx,
- const std::vector<NDArray> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &out_data);
+MKLDNNConvForward& GetConvFwd(const MKLDNNConvFullParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output);
+
+void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam& param,
+ const OpContext& ctx,
+ MKLDNNConvForward* fwd,
+ const std::vector<NDArray>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& out_data);
+
+void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& out_data);
class MKLDNNConvBackward {
public:
- MKLDNNConvBackward(const MKLDNNConvFullParam ¶m, const NDArray &data, const NDArray &weight,
- const NDArray *bias, const NDArray &output);
-
- const mkldnn::convolution_backward_data &GetBwdData() const { return *bwd_data_; }
+ MKLDNNConvBackward(const MKLDNNConvFullParam& param,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output);
+
+ const mkldnn::convolution_backward_data& GetBwdData() const {
+ return *bwd_data_;
+ }
- const mkldnn::convolution_backward_weights &GetBwdWeights() const { return *bwd_weight_; }
+ const mkldnn::convolution_backward_weights& GetBwdWeights() const {
+ return *bwd_weight_;
+ }
- const mkldnn::convolution_backward_data::primitive_desc &GetDataPd() const {
+ const mkldnn::convolution_backward_data::primitive_desc& GetDataPd() const {
return *bwd_data_pd_;
}
- const mkldnn::convolution_backward_weights::primitive_desc &GetWeightsPd() const {
+ const mkldnn::convolution_backward_weights::primitive_desc& GetWeightsPd() const {
return *bwd_weight_pd_;
}
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc
index 1aa4137..7180ebd 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -25,42 +25,40 @@
#if MXNET_USE_ONEDNN == 1
-#include "../convolution-inl.h"
-#include "./mkldnn_ops-inl.h"
#include "./mkldnn_base-inl.h"
#include "./mkldnn_convolution-inl.h"
+#include "./mkldnn_ops-inl.h"
+
+#include "../convolution-inl.h"
namespace mxnet {
namespace op {
DMLC_REGISTER_PARAMETER(MKLDNNConvParam);
-bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray &input) {
- if ((params.kernel.ndim() != 1) &&
- (params.kernel.ndim() != 2) &&
- (params.kernel.ndim() != 3))
+bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray& input) {
+ if ((params.kernel.ndim() != 1) && (params.kernel.ndim() != 2) && (params.kernel.ndim() != 3))
return false;
return SupportMKLDNNQuantize(input.dtype()) &&
- ((input.shape().ndim() == 3) ||
- (input.shape().ndim() == 4) ||
+ ((input.shape().ndim() == 3) || (input.shape().ndim() == 4) ||
(input.shape().ndim() == 5));
}
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
- const MKLDNNConvFullParam ¶m,
- const bool is_train,
- const NDArray &data,
- const NDArray &weights,
- const NDArray *bias,
- const NDArray &output) {
+ const MKLDNNConvFullParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weights,
+ const NDArray* bias,
+ const NDArray& output) {
auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
- auto data_md = GetMemDesc(data);
+ auto data_md = GetMemDesc(data);
auto weight_md = GetWeightDesc(weights, param.conv_param.num_group, param.mkldnn_param.quantized);
- auto out_md = GetMemDesc(output);
+ auto out_md = GetMemDesc(output);
auto bias_md =
bias ? (param.mkldnn_param.quantized ? GetMemDesc(*bias, mshadow::kInt32) : GetMemDesc(*bias))
: mkldnn::memory::desc{
- {}, mkldnn::memory::data_type::undef, mkldnn::memory::format_tag::any};
+ {}, mkldnn::memory::data_type::undef, mkldnn::memory::format_tag::any};
auto bias_md_ptr = bias ? &bias_md : nullptr;
mkldnn::memory::dims strides(param.conv_param.kernel.ndim());
@@ -90,20 +88,20 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
padding[1] = param.conv_param.pad[1];
padding[2] = param.conv_param.pad[2];
} else {
- LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size "
- << param.conv_param.kernel.ndim() << ", supporting only 1 or 2 or 3.";
+ LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size " << param.conv_param.kernel.ndim()
+ << ", supporting only 1 or 2 or 3.";
}
mkldnn::primitive_attr attr;
mkldnn::post_ops ops;
if (param.mkldnn_param.with_act) {
- const auto &act_param = param.act_param;
+ const auto& act_param = param.act_param;
ops.append_eltwise(act_param.scale, act_param.alg, act_param.alpha, act_param.beta);
}
if (param.mkldnn_param.with_sum) {
ops.append_sum(param.sum_scale);
}
if (param.mkldnn_param.with_postsum_act) {
- const auto &act_param = param.postsum_act_param;
+ const auto& act_param = param.postsum_act_param;
ops.append_eltwise(act_param.scale, act_param.alg, act_param.alpha, act_param.beta);
}
attr.set_post_ops(ops);
@@ -112,42 +110,54 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
int mask = (param.requantize_scales.size() > 1) ? 2 : 0;
attr.set_output_scales(mask, param.requantize_scales);
}
- auto GetConvFwdPd = [¶m, &data, &weights, &output,
- &attr](const mkldnn::convolution_forward::desc &desc) {
- auto engine = CpuEngine::Get()->get_engine();
- try {
- // MKL-DNN introduced padded formats since 0.15 which require more memory
- // compared to the actual size of the tensor. Currently, MKL-DNN operators
- // still reuse memory from memory planning, so here we need to select a
- // suboptimal kernel for computation that has the expected memory size requirements
- auto conv_pd =
- std::make_shared<mkldnn::convolution_forward::primitive_desc>(desc, attr, engine);
- while (conv_pd->dst_desc().get_size() != GetArraySize(output) ||
- conv_pd->src_desc().get_size() != GetArraySize(data) ||
- (!param.mkldnn_param.quantized &&
- conv_pd->weights_desc().get_size() != GetArraySize(weights))) {
- // next_impl() will visit desc and engine, please make sure they are still alive here.
- CHECK(conv_pd->next_impl()) << "No convolution implementation for this request.";
- }
- return conv_pd;
- } catch (mkldnn::error &e) {
- if (e.status == mkldnn_unimplemented && param.mkldnn_param.quantized) {
- LOG(ERROR) << "AVX512-BW support or Intel(R) MKL dependency is "
- "required for int8 convolution";
- } else {
- LOG(ERROR) << e.message;
- }
- throw;
- }
- };
+ auto GetConvFwdPd =
+ [¶m, &data, &weights, &output, &attr](const mkldnn::convolution_forward::desc& desc) {
+ auto engine = CpuEngine::Get()->get_engine();
+ try {
+ // MKL-DNN introduced padded formats since 0.15 which require more memory
+ // compared to the actual size of the tensor. Currently, MKL-DNN operators
+ // still reuse memory from memory planning, so here we need to select a
+ // suboptimal kernel for computation that has the expected memory size requirements
+ auto conv_pd =
+ std::make_shared<mkldnn::convolution_forward::primitive_desc>(desc, attr, engine);
+ while (conv_pd->dst_desc().get_size() != GetArraySize(output) ||
+ conv_pd->src_desc().get_size() != GetArraySize(data) ||
+ (!param.mkldnn_param.quantized &&
+ conv_pd->weights_desc().get_size() != GetArraySize(weights))) {
+ // next_impl() will visit desc and engine, please make sure they are still alive here.
+ CHECK(conv_pd->next_impl()) << "No convolution implementation for this request.";
+ }
+ return conv_pd;
+ } catch (mkldnn::error& e) {
+ if (e.status == mkldnn_unimplemented && param.mkldnn_param.quantized) {
+ LOG(ERROR) << "AVX512-BW support or Intel(R) MKL dependency is "
+ "required for int8 convolution";
+ } else {
+ LOG(ERROR) << e.message;
+ }
+ throw;
+ }
+ };
if (param.conv_param.dilate.ndim() == 0 && bias_md_ptr == nullptr) {
- mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md,
- weight_md, out_md, strides, padding, padding);
+ mkldnn::convolution_forward::desc desc(prop,
+ mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ padding,
+ padding);
return GetConvFwdPd(desc);
} else if (param.conv_param.dilate.ndim() == 0) {
- mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md,
- weight_md, *bias_md_ptr, out_md, strides, padding,
+ mkldnn::convolution_forward::desc desc(prop,
+ mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ *bias_md_ptr,
+ out_md,
+ strides,
+ padding,
padding);
return GetConvFwdPd(desc);
} else {
@@ -166,25 +176,42 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
<< ", supporting only 1 or 2 or 3.";
}
if (bias_md_ptr == nullptr) {
- mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md,
- weight_md, out_md, strides, dilates, padding, padding);
+ mkldnn::convolution_forward::desc desc(prop,
+ mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
return GetConvFwdPd(desc);
} else {
- mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md,
- weight_md, *bias_md_ptr, out_md, strides, dilates,
- padding, padding);
+ mkldnn::convolution_forward::desc desc(prop,
+ mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ *bias_md_ptr,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
return GetConvFwdPd(desc);
}
}
}
static std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetConvBwdData(
- const ConvolutionParam ¶m, const NDArray &data, const NDArray &weight,
- const NDArray &output, const mkldnn::convolution_forward::primitive_desc &fwd_pd) {
- auto data_md = GetMemDesc(data);
+ const ConvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray& output,
+ const mkldnn::convolution_forward::primitive_desc& fwd_pd) {
+ auto data_md = GetMemDesc(data);
auto weight_md = GetWeightDesc(weight, param.num_group);
- auto out_md = GetMemDesc(output);
- auto engine = CpuEngine::Get()->get_engine();
+ auto out_md = GetMemDesc(output);
+ auto engine = CpuEngine::Get()->get_engine();
mkldnn::memory::dims strides(param.kernel.ndim());
mkldnn::memory::dims padding(param.kernel.ndim());
if (param.kernel.ndim() == 1) {
@@ -216,8 +243,8 @@ static std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetCon
<< ", supporting only 1 or 2 or 3.";
}
- auto GetConvBwdDataPd = [&data, &weight, &output,
- &fwd_pd](const mkldnn::convolution_backward_data::desc &desc) {
+ auto GetConvBwdDataPd = [&data, &weight, &output, &fwd_pd](
+ const mkldnn::convolution_backward_data::desc& desc) {
auto engine = CpuEngine::Get()->get_engine();
try {
// MKL-DNN introduced padded formats since 0.15 which require more memory
@@ -233,15 +260,20 @@ static std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetCon
CHECK(conv_pd->next_impl()) << "No convolution backward implementation for this request.";
}
return conv_pd;
- } catch (mkldnn::error &e) {
+ } catch (mkldnn::error& e) {
LOG(ERROR) << e.message;
throw;
}
};
if (param.dilate.ndim() == 0) {
- mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, data_md,
- weight_md, out_md, strides, padding, padding);
+ mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ padding,
+ padding);
return GetConvBwdDataPd(desc);
} else {
mkldnn::memory::dims dilates(param.kernel.ndim());
@@ -255,23 +287,32 @@ static std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetCon
dilates[1] = param.dilate[1] - 1;
dilates[2] = param.dilate[2] - 1;
} else {
- LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size "
- << param.dilate.ndim() << ", supporting only 1 or 2 or 3.";
+ LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size " << param.dilate.ndim()
+ << ", supporting only 1 or 2 or 3.";
}
- mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, data_md,
- weight_md, out_md, strides, dilates, padding,
+ mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
padding);
return GetConvBwdDataPd(desc);
}
}
static std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> GetConvBwdWeights(
- const ConvolutionParam ¶m, const NDArray &data, const NDArray &weight, const NDArray *bias,
- const NDArray &output, const mkldnn::convolution_forward::primitive_desc &fwd_pd) {
- auto data_md = GetMemDesc(data);
+ const ConvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output,
+ const mkldnn::convolution_forward::primitive_desc& fwd_pd) {
+ auto data_md = GetMemDesc(data);
auto weight_md = GetWeightDesc(weight, param.num_group);
- auto out_md = GetMemDesc(output);
- auto engine = CpuEngine::Get()->get_engine();
+ auto out_md = GetMemDesc(output);
+ auto engine = CpuEngine::Get()->get_engine();
mkldnn::memory::dims strides(param.kernel.ndim());
mkldnn::memory::dims padding(param.kernel.ndim());
if (param.kernel.ndim() == 1) {
@@ -303,8 +344,8 @@ static std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> Get
<< ", supporting only 1 or 2 or 3.";
}
- auto GetConvBwdWeightsPd = [&data, &weight, &output,
- &fwd_pd](const mkldnn::convolution_backward_weights::desc &desc) {
+ auto GetConvBwdWeightsPd = [&data, &weight, &output, &fwd_pd](
+ const mkldnn::convolution_backward_weights::desc& desc) {
auto engine = CpuEngine::Get()->get_engine();
try {
// MKL-DNN introduced padded formats since 0.15 which require more memory
@@ -320,20 +361,30 @@ static std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> Get
CHECK(conv_pd->next_impl()) << "No convolution backward implementation for this request.";
}
return conv_pd;
- } catch (mkldnn::error &e) {
+ } catch (mkldnn::error& e) {
LOG(ERROR) << e.message;
throw;
}
};
if (param.dilate.ndim() == 0 && bias == nullptr) {
- mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, data_md,
- weight_md, out_md, strides, padding, padding);
+ mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ padding,
+ padding);
return GetConvBwdWeightsPd(desc);
} else if (param.dilate.ndim() == 0) {
auto bias_md = GetMemDesc(*bias);
- mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, data_md,
- weight_md, bias_md, out_md, strides, padding,
+ mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ bias_md,
+ out_md,
+ strides,
+ padding,
padding);
return GetConvBwdWeightsPd(desc);
} else {
@@ -348,34 +399,51 @@ static std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> Get
dilates[1] = param.dilate[1] - 1;
dilates[2] = param.dilate[2] - 1;
} else {
- LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size "
- << param.dilate.ndim() << ", supporting only 1 or 2 or 3.";
+ LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size " << param.dilate.ndim()
+ << ", supporting only 1 or 2 or 3.";
}
if (bias == nullptr) {
mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
- data_md, weight_md, out_md, strides, dilates,
- padding, padding);
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
return GetConvBwdWeightsPd(desc);
} else {
auto bias_md = GetMemDesc(*bias);
mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
- data_md, weight_md, bias_md, out_md, strides,
- dilates, padding, padding);
+ data_md,
+ weight_md,
+ bias_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
return GetConvBwdWeightsPd(desc);
}
}
}
-MKLDNNConvForward::MKLDNNConvForward(const MKLDNNConvFullParam ¶m, const bool is_train,
- const NDArray &data, const NDArray &weight,
- const NDArray *bias, const NDArray &output)
+MKLDNNConvForward::MKLDNNConvForward(const MKLDNNConvFullParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output)
: pd_(GetConvFwdImpl(param, is_train, data, weight, bias, output)) {
fwd_ = std::make_shared<mkldnn::convolution_forward>(GetPd());
}
-MKLDNNConvForward &GetConvFwd(const MKLDNNConvFullParam ¶m, const bool is_train,
- const NDArray &data, const NDArray &weight, const NDArray *bias,
- const NDArray &output) {
+MKLDNNConvForward& GetConvFwd(const MKLDNNConvFullParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output) {
using conv_fwd_map = std::unordered_map<MKLDNNConvSignature, MKLDNNConvForward, OpHash>;
#if DMLC_CXX11_THREAD_LOCAL
static thread_local conv_fwd_map fwds;
@@ -391,29 +459,31 @@ MKLDNNConvForward &GetConvFwd(const MKLDNNConvFullParam ¶m, const bool is_tr
key.AddSign(data);
key.AddSign(weight);
key.AddSign(output);
- if (bias) key.AddSign(*bias);
+ if (bias)
+ key.AddSign(*bias);
auto it = fwds.find(key);
if (it == fwds.end()) {
auto fwd = MKLDNNConvForward(param, is_train, data, weight, bias, output);
- it = AddToCache(&fwds, key, fwd);
+ it = AddToCache(&fwds, key, fwd);
}
return it->second;
}
-void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam ¶m, const OpContext &ctx,
- MKLDNNConvForward *fwd,
- const std::vector<NDArray> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &out_data) {
+void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam& param,
+ const OpContext& ctx,
+ MKLDNNConvForward* fwd,
+ const std::vector<NDArray>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& out_data) {
TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
- auto &data = in_data[conv::kData];
- auto &weight = in_data[conv::kWeight];
+ auto& data = in_data[conv::kData];
+ auto& weight = in_data[conv::kWeight];
bool no_bias = param.conv_param.no_bias && !param.mkldnn_param.with_bn;
auto data_mem = data.GetMKLDNNDataReorder(fwd->GetPd().src_desc());
- const mkldnn::memory *weight_mem;
+ const mkldnn::memory* weight_mem;
if (ctx.is_train) {
// TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it to the default format
// for now.
@@ -436,14 +506,14 @@ void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam ¶m, const
mkldnn_output_t out_mem;
if (param.mkldnn_param.with_sum) {
out_mem = mkldnn_output_t(OutDataOp::Noop,
- const_cast<mkldnn::memory *>(out_data[conv::kOut].GetMKLDNNData()));
+ const_cast<mkldnn::memory*>(out_data[conv::kOut].GetMKLDNNData()));
} else {
out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd->GetPd().dst_desc(), req[conv::kOut]);
}
mkldnn_args_map_t net_args;
if (!no_bias) {
- const mkldnn::memory *bias_mem = in_data[conv::kBias].GetMKLDNNData();
+ const mkldnn::memory* bias_mem = in_data[conv::kBias].GetMKLDNNData();
net_args.insert({MKLDNN_ARG_BIAS, *bias_mem});
}
@@ -455,32 +525,40 @@ void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam ¶m, const
MKLDNNStream::Get()->Submit();
}
-void MKLDNNConvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &out_data) {
+void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& out_data) {
MKLDNNConvFullParam param;
param.conv_param = nnvm::get<ConvolutionParam>(attrs.parsed);
param.mkldnn_param.Init(std::unordered_map<std::string, std::string>());
- auto &fwd =
- GetConvFwd(param, ctx.is_train, in_data[conv::kData], in_data[conv::kWeight],
- param.conv_param.no_bias ? nullptr : &in_data[conv::kBias], out_data[conv::kOut]);
+ auto& fwd = GetConvFwd(param,
+ ctx.is_train,
+ in_data[conv::kData],
+ in_data[conv::kWeight],
+ param.conv_param.no_bias ? nullptr : &in_data[conv::kBias],
+ out_data[conv::kOut]);
MKLDNNConvolutionForwardFullFeature(param, ctx, &fwd, in_data, req, out_data);
}
-MKLDNNConvBackward::MKLDNNConvBackward(const MKLDNNConvFullParam ¶m, const NDArray &data,
- const NDArray &weight, const NDArray *bias,
- const NDArray &output) {
+MKLDNNConvBackward::MKLDNNConvBackward(const MKLDNNConvFullParam& param,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output) {
const auto fwd_pd = GetConvFwdImpl(param, true, data, weight, bias, output);
- bwd_data_pd_ = GetConvBwdData(param.conv_param, data, weight, output, *fwd_pd);
- bwd_weight_pd_ = GetConvBwdWeights(param.conv_param, data, weight, bias, output, *fwd_pd);
- bwd_data_ = std::make_shared<mkldnn::convolution_backward_data>(GetDataPd());
- bwd_weight_ = std::make_shared<mkldnn::convolution_backward_weights>(GetWeightsPd());
+ bwd_data_pd_ = GetConvBwdData(param.conv_param, data, weight, output, *fwd_pd);
+ bwd_weight_pd_ = GetConvBwdWeights(param.conv_param, data, weight, bias, output, *fwd_pd);
+ bwd_data_ = std::make_shared<mkldnn::convolution_backward_data>(GetDataPd());
+ bwd_weight_ = std::make_shared<mkldnn::convolution_backward_weights>(GetWeightsPd());
}
-static inline MKLDNNConvBackward &GetConvBwd(const MKLDNNConvFullParam ¶m, const NDArray &data,
- const NDArray &weight, const NDArray *bias,
- const NDArray &output) {
+static inline MKLDNNConvBackward& GetConvBwd(const MKLDNNConvFullParam& param,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output) {
using mkldnn_conv_bwd_map = std::unordered_map<MKLDNNConvSignature, MKLDNNConvBackward, OpHash>;
#if DMLC_CXX11_THREAD_LOCAL
static thread_local mkldnn_conv_bwd_map bwds;
@@ -495,40 +573,42 @@ static inline MKLDNNConvBackward &GetConvBwd(const MKLDNNConvFullParam ¶m, c
key.AddSign(data);
key.AddSign(weight);
key.AddSign(output);
- if (bias) key.AddSign(*bias);
+ if (bias)
+ key.AddSign(*bias);
auto it = bwds.find(key);
if (it == bwds.end()) {
auto bwd = MKLDNNConvBackward(param, data, weight, bias, output);
- it = AddToCache(&bwds, key, bwd);
+ it = AddToCache(&bwds, key, bwd);
}
return it->second;
}
-void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& outputs) {
TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
- const std::vector<NDArray> &in_grad = outputs;
+ const std::vector<NDArray>& in_grad = outputs;
MKLDNNConvFullParam full_param;
full_param.conv_param = nnvm::get<ConvolutionParam>(attrs.parsed);
full_param.mkldnn_param.Init(std::unordered_map<std::string, std::string>());
- auto &data = inputs[conv::kData + 1];
- auto &weight = inputs[conv::kWeight + 1];
- const auto *bias = full_param.conv_param.no_bias ? nullptr : &inputs[conv::kBias + 1];
- auto &out_grad = inputs[conv::kOut];
+ auto& data = inputs[conv::kData + 1];
+ auto& weight = inputs[conv::kWeight + 1];
+ const auto* bias = full_param.conv_param.no_bias ? nullptr : &inputs[conv::kBias + 1];
+ auto& out_grad = inputs[conv::kOut];
- const ConvolutionParam ¶m = full_param.conv_param;
+ const ConvolutionParam& param = full_param.conv_param;
CHECK_NE(req[conv::kWeight], kWriteInplace) << "cannot write weight inplace";
- MKLDNNConvBackward &convBwd = GetConvBwd(full_param, data, weight, bias, out_grad);
- auto out_grad_mem = out_grad.GetMKLDNNDataReorder(convBwd.GetDataPd().diff_dst_desc());
+ MKLDNNConvBackward& convBwd = GetConvBwd(full_param, data, weight, bias, out_grad);
+ auto out_grad_mem = out_grad.GetMKLDNNDataReorder(convBwd.GetDataPd().diff_dst_desc());
if (req[conv::kData]) {
- auto weight_mem = GetWeights(weight, convBwd.GetDataPd().weights_desc(), param.num_group);
- auto in_grad_mem = CreateMKLDNNMem(in_grad[conv::kData], convBwd.GetDataPd().diff_src_desc(),
- req[conv::kData]);
+ auto weight_mem = GetWeights(weight, convBwd.GetDataPd().weights_desc(), param.num_group);
+ auto in_grad_mem = CreateMKLDNNMem(
+ in_grad[conv::kData], convBwd.GetDataPd().diff_src_desc(), req[conv::kData]);
MKLDNNStream::Get()->RegisterPrimArgs(convBwd.GetBwdData(),
{{MKLDNN_ARG_DIFF_DST, *out_grad_mem},
{MKLDNN_ARG_WEIGHTS, *weight_mem},
@@ -537,11 +617,11 @@ void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ct
}
auto req_weight = req.size() > conv::kWeight ? req.at(conv::kWeight) : kNullOp;
- auto req_bias = req.size() > conv::kBias ? req.at(conv::kBias) : kNullOp;
+ auto req_bias = req.size() > conv::kBias ? req.at(conv::kBias) : kNullOp;
if (req_weight || req_bias) {
if (convBwd.GetDataPd().diff_dst_desc() != convBwd.GetWeightsPd().diff_dst_desc())
out_grad_mem = out_grad.GetMKLDNNDataReorder(convBwd.GetWeightsPd().diff_dst_desc());
- auto data_mem = data.GetMKLDNNDataReorder(convBwd.GetWeightsPd().src_desc());
+ auto data_mem = data.GetMKLDNNDataReorder(convBwd.GetWeightsPd().src_desc());
auto in_grad_weight = CreateMKLDNNWeightGrad(
in_grad[conv::kWeight], convBwd.GetWeightsPd().diff_weights_desc(), req[conv::kWeight]);
@@ -550,9 +630,8 @@ void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ct
{MKLDNN_ARG_DIFF_WEIGHTS, *in_grad_weight.second}};
mkldnn_output_t in_grad_bias;
if (!param.no_bias) {
- in_grad_bias = CreateMKLDNNMem(in_grad[conv::kBias],
- convBwd.GetWeightsPd().diff_bias_desc(),
- req[conv::kBias]);
+ in_grad_bias = CreateMKLDNNMem(
+ in_grad[conv::kBias], convBwd.GetWeightsPd().diff_bias_desc(), req[conv::kBias]);
net_args.insert({MKLDNN_ARG_DIFF_BIAS, *in_grad_bias.second});
}
MKLDNNStream::Get()->RegisterPrimArgs(convBwd.GetBwdWeights(), net_args);
diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/mkldnn/mkldnn_copy.cc
index 8f8ee66..813016d 100644
--- a/src/operator/nn/mkldnn/mkldnn_copy.cc
+++ b/src/operator/nn/mkldnn/mkldnn_copy.cc
@@ -21,19 +21,22 @@
* \file mkldnn_copy.cc
* \brief
* \author
-*/
+ */
-#include "./mkldnn_ops-inl.h"
#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
#if MXNET_USE_ONEDNN == 1
namespace mxnet {
namespace op {
-void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
- const NDArray &in_data, const OpReqType &req,
- const NDArray &out_data) {
- if (req == kNullOp || req == kWriteInplace) return;
+void MKLDNNCopy(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const OpReqType& req,
+ const NDArray& out_data) {
+ if (req == kNullOp || req == kWriteInplace)
+ return;
TmpMemMgr::Get()->Init(ctx.requested[0]);
auto in_mem = in_data.GetMKLDNNData();
if (req == kAddTo) {
@@ -41,16 +44,16 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
// We should try and force the input memory has the same format
// as the input output. If not, we'll have to reorder memory.
auto out_mem = out_data.GetMKLDNNData();
- in_mem = in_data.GetMKLDNNData(out_mem ->get_desc());
+ in_mem = in_data.GetMKLDNNData(out_mem->get_desc());
if (in_mem == nullptr)
in_mem = in_data.GetMKLDNNDataReorder(out_mem->get_desc());
MKLDNNSum(*out_mem, *in_mem, *out_mem);
} else {
- const_cast<NDArray &>(out_data).CopyFrom(*in_mem);
+ const_cast<NDArray&>(out_data).CopyFrom(*in_mem);
}
MKLDNNStream::Get()->Submit();
}
-} // namespace op
-} // namespace mxnet
+} // namespace op
+} // namespace mxnet
#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
index ef508cf..f188f9f 100644
--- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
@@ -24,46 +24,54 @@
#if MXNET_USE_ONEDNN == 1
-#include "../deconvolution-inl.h"
#include "./mkldnn_base-inl.h"
#include "./mkldnn_ops-inl.h"
+#include "../deconvolution-inl.h"
+
namespace mxnet {
namespace op {
-bool SupportMKLDNNDeconv(const DeconvolutionParam ¶ms,
- const NDArray &input) {
- if (params.kernel.ndim() != 2) return false;
- return (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16)
- && input.shape().ndim() == 4;
+bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray& input) {
+ if (params.kernel.ndim() != 2)
+ return false;
+ return (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16) &&
+ input.shape().ndim() == 4;
}
static inline mkldnn::memory::desc GetBiasDesc(mkldnn::memory::desc md) {
mkldnn::memory::dims dims(1);
// This is deconvolution on 4D data. The second dimension is the channel.
dims[0] = md.data.dims[1];
- return mkldnn::memory::desc(
- dims, static_cast<mkldnn::memory::data_type>(md.data.data_type),
- mkldnn::memory::format_tag::any);
+ return mkldnn::memory::desc(dims,
+ static_cast<mkldnn::memory::data_type>(md.data.data_type),
+ mkldnn::memory::format_tag::any);
}
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetDeconvBwd_(
- const mkldnn::memory::desc &data_md, const mkldnn::memory::desc &weights_md,
- bool has_bias, const mkldnn::memory::desc &out_md,
- const mkldnn::engine &engine, const mkldnn::memory::dims &strides,
- const mkldnn::memory::dims &padding, const mkldnn::memory::dims &dilates) {
+ const mkldnn::memory::desc& data_md,
+ const mkldnn::memory::desc& weights_md,
+ bool has_bias,
+ const mkldnn::memory::desc& out_md,
+ const mkldnn::engine& engine,
+ const mkldnn::memory::dims& strides,
+ const mkldnn::memory::dims& padding,
+ const mkldnn::memory::dims& dilates) {
// MKL-DNN introduced padded formats since 0.15 which require more memory
// compared to the actual size of the tensor. Currently, MKL-DNN operators
// still reuse memory from memory planning, so here we need to select a
// suboptimal kernel for computation that has the expected memory size requirements
if (!has_bias) {
- mkldnn::convolution_forward::desc desc(
- mkldnn::prop_kind::forward_training,
- mkldnn::algorithm::convolution_direct, out_md, weights_md, data_md,
- strides, dilates, padding, padding);
- auto deconv_pd =
- std::make_shared<mkldnn::convolution_forward::primitive_desc>(desc,
- engine);
+ mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training,
+ mkldnn::algorithm::convolution_direct,
+ out_md,
+ weights_md,
+ data_md,
+ strides,
+ dilates,
+ padding,
+ padding);
+ auto deconv_pd = std::make_shared<mkldnn::convolution_forward::primitive_desc>(desc, engine);
while (deconv_pd->dst_desc().get_size() != GetMemDescSize(data_md) ||
deconv_pd->src_desc().get_size() != GetMemDescSize(out_md) ||
deconv_pd->weights_desc().get_size() != GetMemDescSize(weights_md)) {
@@ -72,13 +80,17 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetDeconvBwd_(
return deconv_pd;
} else {
auto bias_md = GetBiasDesc(data_md);
- mkldnn::convolution_forward::desc desc(
- mkldnn::prop_kind::forward_training,
- mkldnn::algorithm::convolution_direct, out_md, weights_md, bias_md,
- data_md, strides, dilates, padding, padding);
- auto deconv_pd =
- std::make_shared<mkldnn::convolution_forward::primitive_desc>(desc,
- engine);
+ mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training,
+ mkldnn::algorithm::convolution_direct,
+ out_md,
+ weights_md,
+ bias_md,
+ data_md,
+ strides,
+ dilates,
+ padding,
+ padding);
+ auto deconv_pd = std::make_shared<mkldnn::convolution_forward::primitive_desc>(desc, engine);
while (deconv_pd->dst_desc().get_size() != GetMemDescSize(data_md) ||
deconv_pd->src_desc().get_size() != GetMemDescSize(out_md) ||
deconv_pd->weights_desc().get_size() != GetMemDescSize(weights_md)) {
@@ -88,13 +100,16 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetDeconvBwd_(
}
}
-std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc>
-GetDeconvFwdImpl(const DeconvolutionParam ¶m, const NDArray &data,
- const NDArray &weights, bool has_bias, const NDArray &output) {
- auto data_md = GetMemDesc(data);
+std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetDeconvFwdImpl(
+ const DeconvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weights,
+ bool has_bias,
+ const NDArray& output) {
+ auto data_md = GetMemDesc(data);
auto weight_md = GetWeightDesc(weights, param.num_group);
- auto out_md = GetMemDesc(output);
- auto engine = CpuEngine::Get()->get_engine();
+ auto out_md = GetMemDesc(output);
+ auto engine = CpuEngine::Get()->get_engine();
CHECK_GE(param.stride.ndim(), 2);
CHECK_GE(param.pad.ndim(), 2);
CHECK_GE(param.dilate.ndim(), 2);
@@ -107,14 +122,18 @@ GetDeconvFwdImpl(const DeconvolutionParam &param, const NDArray &data,
mkldnn::memory::dims dilate{0, 0};
dilate[0] = param.dilate[0] - 1;
dilate[1] = param.dilate[1] - 1;
- auto bwd_pd = GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine,
- strides, padding, dilate);
- mkldnn::convolution_backward_data::desc desc(
- mkldnn::algorithm::convolution_direct, out_md, weight_md, data_md,
- strides, dilate, padding, padding);
+ auto bwd_pd =
+ GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine, strides, padding, dilate);
+ mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
+ out_md,
+ weight_md,
+ data_md,
+ strides,
+ dilate,
+ padding,
+ padding);
auto deconv_pd =
- std::make_shared<mkldnn::convolution_backward_data::primitive_desc>(
- desc, engine, *bwd_pd);
+ std::make_shared<mkldnn::convolution_backward_data::primitive_desc>(desc, engine, *bwd_pd);
// MKL-DNN introduced padded formats since 0.15 which require more memory
// compared to the actual size of the tensor. Currently, MKL-DNN operators
// still reuse memory from memory planning, so here we need to select a
@@ -127,14 +146,16 @@ GetDeconvFwdImpl(const DeconvolutionParam &param, const NDArray &data,
return deconv_pd;
}
-std::shared_ptr<mkldnn::convolution_forward::primitive_desc>
-GetDeconvBwdDataImpl(const DeconvolutionParam &param, const NDArray &data,
- const NDArray &weights, bool has_bias,
- const NDArray &output) {
- auto data_md = GetMemDesc(data);
+std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetDeconvBwdDataImpl(
+ const DeconvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weights,
+ bool has_bias,
+ const NDArray& output) {
+ auto data_md = GetMemDesc(data);
auto weight_md = GetWeightDesc(weights, param.num_group);
- auto out_md = GetMemDesc(output);
- auto engine = CpuEngine::Get()->get_engine();
+ auto out_md = GetMemDesc(output);
+ auto engine = CpuEngine::Get()->get_engine();
CHECK_GE(param.stride.ndim(), 2);
CHECK_GE(param.pad.ndim(), 2);
CHECK_GE(param.dilate.ndim(), 2);
@@ -147,19 +168,20 @@ GetDeconvBwdDataImpl(const DeconvolutionParam &param, const NDArray &data,
mkldnn::memory::dims dilate{0, 0};
dilate[0] = param.dilate[0] - 1;
dilate[1] = param.dilate[1] - 1;
- return GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine, strides,
- padding, dilate);
+ return GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine, strides, padding, dilate);
}
-std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc>
-GetDeconvBwdWeightsImpl(
- const DeconvolutionParam &param, const NDArray &data,
- const NDArray &weights, bool has_bias, const NDArray &output,
- const mkldnn::convolution_forward::primitive_desc &fwd_pd) {
- auto data_md = GetMemDesc(data);
+std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> GetDeconvBwdWeightsImpl(
+ const DeconvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weights,
+ bool has_bias,
+ const NDArray& output,
+ const mkldnn::convolution_forward::primitive_desc& fwd_pd) {
+ auto data_md = GetMemDesc(data);
auto weight_md = GetWeightDesc(weights, param.num_group);
- auto out_md = GetMemDesc(output);
- auto engine = CpuEngine::Get()->get_engine();
+ auto out_md = GetMemDesc(output);
+ auto engine = CpuEngine::Get()->get_engine();
CHECK_GE(param.stride.ndim(), 2);
CHECK_GE(param.pad.ndim(), 2);
CHECK_GE(param.dilate.ndim(), 2);
@@ -178,31 +200,38 @@ GetDeconvBwdWeightsImpl(
// still reuse memory from memory planning, so here we need to select a
// suboptimal kernel for computation that has the expected memory size requirements
if (!has_bias) {
- mkldnn::convolution_backward_weights::desc desc(
- mkldnn::algorithm::convolution_direct, out_md, weight_md, data_md,
- strides, dilate, padding, padding);
- auto deconv_pd =
- std::make_shared<mkldnn::convolution_backward_weights::primitive_desc>(
- desc, engine, fwd_pd);
+ mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+ out_md,
+ weight_md,
+ data_md,
+ strides,
+ dilate,
+ padding,
+ padding);
+ auto deconv_pd = std::make_shared<mkldnn::convolution_backward_weights::primitive_desc>(
+ desc, engine, fwd_pd);
while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(data_md) ||
deconv_pd->src_desc().get_size() != GetMemDescSize(out_md) ||
- deconv_pd->diff_weights_desc().get_size() !=
- GetMemDescSize(weight_md)) {
+ deconv_pd->diff_weights_desc().get_size() != GetMemDescSize(weight_md)) {
CHECK(deconv_pd->next_impl()) << "No implementation";
}
return deconv_pd;
} else {
auto bias_md = GetBiasDesc(data_md);
- mkldnn::convolution_backward_weights::desc desc(
- mkldnn::algorithm::convolution_direct, out_md, weight_md, bias_md,
- data_md, strides, dilate, padding, padding);
- auto deconv_pd =
- std::make_shared<mkldnn::convolution_backward_weights::primitive_desc>(
- desc, engine, fwd_pd);
+ mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+ out_md,
+ weight_md,
+ bias_md,
+ data_md,
+ strides,
+ dilate,
+ padding,
+ padding);
+ auto deconv_pd = std::make_shared<mkldnn::convolution_backward_weights::primitive_desc>(
+ desc, engine, fwd_pd);
while (deconv_pd->diff_dst_desc().get_size() != GetMemDescSize(data_md) ||
deconv_pd->src_desc().get_size() != GetMemDescSize(out_md) ||
- deconv_pd->diff_weights_desc().get_size() !=
- GetMemDescSize(weight_md)) {
+ deconv_pd->diff_weights_desc().get_size() != GetMemDescSize(weight_md)) {
CHECK(deconv_pd->next_impl()) << "No implementation";
}
return deconv_pd;
@@ -211,12 +240,16 @@ GetDeconvBwdWeightsImpl(
class MKLDNNDeconvForward {
public:
- MKLDNNDeconvForward(const DeconvolutionParam &param, const NDArray &data,
- const NDArray &weights, bool has_bias,
- const NDArray &output);
- const mkldnn::convolution_backward_data &GetFwd() const { return *fwd; }
+ MKLDNNDeconvForward(const DeconvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weights,
+ bool has_bias,
+ const NDArray& output);
+ const mkldnn::convolution_backward_data& GetFwd() const {
+ return *fwd;
+ }
- const mkldnn::convolution_backward_data::primitive_desc &GetPd() const {
+ const mkldnn::convolution_backward_data::primitive_desc& GetPd() const {
return *fwd_pd;
}
@@ -225,45 +258,45 @@ class MKLDNNDeconvForward {
std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> fwd_pd;
}; // class MKLDNNDeconvForward
-MKLDNNDeconvForward::MKLDNNDeconvForward(const DeconvolutionParam &param,
- const NDArray &data,
- const NDArray &weights, bool has_bias,
- const NDArray &output)
+MKLDNNDeconvForward::MKLDNNDeconvForward(const DeconvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weights,
+ bool has_bias,
+ const NDArray& output)
: fwd_pd(GetDeconvFwdImpl(param, data, weights, has_bias, output)) {
fwd = std::make_shared<mkldnn::convolution_backward_data>(GetPd());
}
-static void MKLDNNDeconvFwdBiasPostProcess(
- const DeconvolutionParam &param, const OpContext &ctx, const NDArray &bias,
- const std::vector<NDArray> &out_data) {
+static void MKLDNNDeconvFwdBiasPostProcess(const DeconvolutionParam& param,
+ const OpContext& ctx,
+ const NDArray& bias,
+ const std::vector<NDArray>& out_data) {
// add bias, broadcast bias to dim 1: channel
if (!param.no_bias) {
// MKLDNN only supports float right now.
typedef float DType;
- Stream<cpu> *s = ctx.get_stream<cpu>();
+ Stream<cpu>* s = ctx.get_stream<cpu>();
Tensor<cpu, 1, DType> b = bias.data().get<cpu, 1, DType>(s);
// The output data is stored in a special MKLDNN format,
// converts its format to the default format.
// Unfortunately, MKLDNN doesn't support broadcast.
- auto out_data_def = out_data[deconv::kOut].Reorder2Default();
+ auto out_data_def = out_data[deconv::kOut].Reorder2Default();
Tensor<cpu, 4, DType> out_cpu = out_data_def.data().get<cpu, 4, DType>(s);
out_cpu += mshadow::expr::broadcast<1>(b, out_cpu.shape_);
}
}
-MKLDNNDeconvForward &GetDeconvFwd(const nnvm::NodeAttrs &attrs,
- const NDArray &data, const NDArray &weights,
- const NDArray *bias, const NDArray &output) {
+MKLDNNDeconvForward& GetDeconvFwd(const nnvm::NodeAttrs& attrs,
+ const NDArray& data,
+ const NDArray& weights,
+ const NDArray* bias,
+ const NDArray& output) {
#if DMLC_CXX11_THREAD_LOCAL
- static thread_local std::unordered_map<DeconvSignature, MKLDNNDeconvForward,
- OpHash>
- fwds;
+ static thread_local std::unordered_map<DeconvSignature, MKLDNNDeconvForward, OpHash> fwds;
#else
- static MX_THREAD_LOCAL
- std::unordered_map<DeconvSignature, MKLDNNDeconvForward, OpHash>
- fwds;
+ static MX_THREAD_LOCAL std::unordered_map<DeconvSignature, MKLDNNDeconvForward, OpHash> fwds;
#endif
- const DeconvolutionParam &param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+ const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
DeconvSignature key(param);
// Here we can sign the conv op with NDArray because conv primitive will
// decide the right layout for the, so we only need to get the shape and the
@@ -271,34 +304,34 @@ MKLDNNDeconvForward &GetDeconvFwd(const nnvm::NodeAttrs &attrs,
key.AddSign(data);
key.AddSign(weights);
key.AddSign(output);
- if (bias) key.AddSign(*bias);
+ if (bias)
+ key.AddSign(*bias);
auto it = fwds.find(key);
if (it == fwds.end()) {
bool has_bias = (bias != nullptr);
- auto fwd = MKLDNNDeconvForward(param, data, weights, has_bias, output);
- it = AddToCache(&fwds, key, fwd);
+ auto fwd = MKLDNNDeconvForward(param, data, weights, has_bias, output);
+ it = AddToCache(&fwds, key, fwd);
}
return it->second;
}
-void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs,
- const OpContext &ctx,
- const std::vector<NDArray> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &out_data) {
+void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& out_data) {
TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]);
- const DeconvolutionParam &param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+ const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
- auto &data = in_data[deconv::kData];
- auto &weight = in_data[deconv::kWeight];
- const NDArray *bias = param.no_bias ? nullptr : &in_data[deconv::kBias];
+ auto& data = in_data[deconv::kData];
+ auto& weight = in_data[deconv::kWeight];
+ const NDArray* bias = param.no_bias ? nullptr : &in_data[deconv::kBias];
- MKLDNNDeconvForward &fwd =
- GetDeconvFwd(attrs, data, weight, bias, out_data[deconv::kOut]);
+ MKLDNNDeconvForward& fwd = GetDeconvFwd(attrs, data, weight, bias, out_data[deconv::kOut]);
auto data_mem = data.GetMKLDNNDataReorder(fwd.GetPd().diff_dst_desc());
- const mkldnn::memory *weight_mem;
+ const mkldnn::memory* weight_mem;
if (ctx.is_train) {
// TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it
// to the default format for now.
@@ -306,8 +339,7 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs,
// This asks the engine to change the layout of the weight array after
// it's used.
weight.Reorder2DefaultAsync();
- weight_mem =
- GetWeights(weight, fwd.GetPd().weights_desc(), param.num_group);
+ weight_mem = GetWeights(weight, fwd.GetPd().weights_desc(), param.num_group);
} else {
// For inference, we want to reorder the weight array so we don't need to
// reorder data every time.
@@ -315,8 +347,7 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs,
// We also need to modify the layout on the original weight array. The
// data conversion happens after the weight array is used.
weight.MKLDNNDataReorderAsync(fwd.GetPd().weights_desc());
- weight_mem =
- GetWeights(weight, fwd.GetPd().weights_desc(), param.num_group);
+ weight_mem = GetWeights(weight, fwd.GetPd().weights_desc(), param.num_group);
} else {
weight_mem = weight.GetMKLDNNData();
@@ -324,8 +355,7 @@ void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs,
}
}
mkldnn_output_t out_mem;
- out_mem = CreateMKLDNNMem(out_data[deconv::kOut], fwd.GetPd().diff_src_desc(),
- req[deconv::kOut]);
+ out_mem = CreateMKLDNNMem(out_data[deconv::kOut], fwd.GetPd().diff_src_desc(), req[deconv::kOut]);
mkldnn_args_map_t net_args;
@@ -344,34 +374,38 @@ class MKLDNNDeconvBackwardData {
public:
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> bwd_pd;
- MKLDNNDeconvBackwardData(const DeconvolutionParam &param, const NDArray &data,
- const NDArray &weights, const NDArray &output);
+ MKLDNNDeconvBackwardData(const DeconvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weights,
+ const NDArray& output);
- const mkldnn::convolution_forward &GetBwd() const { return *bwd; }
- const mkldnn::convolution_forward::primitive_desc &GetDataPd() const {
+ const mkldnn::convolution_forward& GetBwd() const {
+ return *bwd;
+ }
+ const mkldnn::convolution_forward::primitive_desc& GetDataPd() const {
return *bwd_pd;
}
};
-MKLDNNDeconvBackwardData::MKLDNNDeconvBackwardData(
- const DeconvolutionParam &param, const NDArray &data,
- const NDArray &weights, const NDArray &output)
+MKLDNNDeconvBackwardData::MKLDNNDeconvBackwardData(const DeconvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weights,
+ const NDArray& output)
: bwd_pd(GetDeconvBwdDataImpl(param, data, weights, false, output)) {
bwd = std::make_shared<mkldnn::convolution_forward>(GetDataPd());
}
typedef ParamOpSign<DeconvolutionParam> MKLDNNDeconvSignature;
-static inline MKLDNNDeconvBackwardData &GetDeconvBwdData(
- const DeconvolutionParam &param, const NDArray &data,
- const NDArray &weights, const NDArray &output) {
+static inline MKLDNNDeconvBackwardData& GetDeconvBwdData(const DeconvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weights,
+ const NDArray& output) {
#if DMLC_CXX11_THREAD_LOCAL
- static thread_local std::unordered_map<MKLDNNDeconvSignature,
- MKLDNNDeconvBackwardData, OpHash>
+ static thread_local std::unordered_map<MKLDNNDeconvSignature, MKLDNNDeconvBackwardData, OpHash>
bwds;
#else
- static MX_THREAD_LOCAL std::unordered_map<MKLDNNDeconvSignature,
- MKLDNNDeconvBackwardData, OpHash>
+ static MX_THREAD_LOCAL std::unordered_map<MKLDNNDeconvSignature, MKLDNNDeconvBackwardData, OpHash>
bwds;
#endif
MKLDNNDeconvSignature key(param);
@@ -385,7 +419,7 @@ static inline MKLDNNDeconvBackwardData &GetDeconvBwdData(
auto it = bwds.find(key);
if (it == bwds.end()) {
auto bwd = MKLDNNDeconvBackwardData(param, data, weights, output);
- it = AddToCache(&bwds, key, bwd);
+ it = AddToCache(&bwds, key, bwd);
}
return it->second;
}
@@ -394,40 +428,43 @@ class MKLDNNDeconvBackwardWeights {
std::shared_ptr<mkldnn::convolution_backward_weights> bwd;
public:
- std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc>
- bwd_data_pd;
- MKLDNNDeconvBackwardWeights(
- const DeconvolutionParam &param, const NDArray &data,
- const NDArray &weights, const NDArray &output,
- const mkldnn::convolution_forward::primitive_desc &bwd_data_pd);
- const mkldnn::convolution_backward_weights &GetBwd() const { return *bwd; }
- const mkldnn::convolution_backward_weights::primitive_desc &GetWeightsPd()
- const {
+ std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> bwd_data_pd;
+ MKLDNNDeconvBackwardWeights(const DeconvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weights,
+ const NDArray& output,
+ const mkldnn::convolution_forward::primitive_desc& bwd_data_pd);
+ const mkldnn::convolution_backward_weights& GetBwd() const {
+ return *bwd;
+ }
+ const mkldnn::convolution_backward_weights::primitive_desc& GetWeightsPd() const {
return *bwd_data_pd;
}
};
MKLDNNDeconvBackwardWeights::MKLDNNDeconvBackwardWeights(
- const DeconvolutionParam &param, const NDArray &data,
- const NDArray &weights, const NDArray &output,
- const mkldnn::convolution_forward::primitive_desc &bwd_data_pd)
- : bwd_data_pd(GetDeconvBwdWeightsImpl(param, data, weights, false, output,
- bwd_data_pd)) {
+ const DeconvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weights,
+ const NDArray& output,
+ const mkldnn::convolution_forward::primitive_desc& bwd_data_pd)
+ : bwd_data_pd(GetDeconvBwdWeightsImpl(param, data, weights, false, output, bwd_data_pd)) {
bwd = std::make_shared<mkldnn::convolution_backward_weights>(GetWeightsPd());
}
-static inline MKLDNNDeconvBackwardWeights &GetDeconvBwdWeights(
- const DeconvolutionParam &param, const NDArray &data,
- const NDArray &weights, const NDArray &output,
- const mkldnn::convolution_forward::primitive_desc &bwd_data_pd) {
+static inline MKLDNNDeconvBackwardWeights& GetDeconvBwdWeights(
+ const DeconvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weights,
+ const NDArray& output,
+ const mkldnn::convolution_forward::primitive_desc& bwd_data_pd) {
#if DMLC_CXX11_THREAD_LOCAL
- static thread_local std::unordered_map<MKLDNNDeconvSignature,
- MKLDNNDeconvBackwardWeights, OpHash>
+ static thread_local std::unordered_map<MKLDNNDeconvSignature, MKLDNNDeconvBackwardWeights, OpHash>
bwds;
#else
- static MX_THREAD_LOCAL std::unordered_map<MKLDNNDeconvSignature,
- MKLDNNDeconvBackwardWeights, OpHash>
- bwds;
+ static MX_THREAD_LOCAL
+ std::unordered_map<MKLDNNDeconvSignature, MKLDNNDeconvBackwardWeights, OpHash>
+ bwds;
#endif
MKLDNNDeconvSignature key(param);
// Here we can sign the conv op with NDArray because conv primitive will
@@ -439,42 +476,35 @@ static inline MKLDNNDeconvBackwardWeights &GetDeconvBwdWeights(
auto it = bwds.find(key);
if (it == bwds.end()) {
- auto bwd =
- MKLDNNDeconvBackwardWeights(param, data, weights, output, bwd_data_pd);
- auto ins_ret = bwds.insert(
- std::pair<MKLDNNDeconvSignature, MKLDNNDeconvBackwardWeights>(key,
- bwd));
+ auto bwd = MKLDNNDeconvBackwardWeights(param, data, weights, output, bwd_data_pd);
+ auto ins_ret =
+ bwds.insert(std::pair<MKLDNNDeconvSignature, MKLDNNDeconvBackwardWeights>(key, bwd));
CHECK(ins_ret.second);
it = ins_ret.first;
}
return it->second;
}
-void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs,
- const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]);
- const std::vector<NDArray> &in_grad = outputs;
- const DeconvolutionParam &param = nnvm::get<DeconvolutionParam>(attrs.parsed);
-
- auto &data = inputs[deconv::kData + 1];
- auto &weight = inputs[deconv::kWeight + 1];
- auto &out_grad = inputs[deconv::kOut];
-
- CHECK_NE(req[deconv::kWeight], kWriteInplace)
- << "cannot write weight inplace";
- MKLDNNDeconvBackwardData &bwd_data =
- GetDeconvBwdData(param, data, weight, inputs[deconv::kOut]);
- auto out_grad_mem =
- out_grad.GetMKLDNNDataReorder(bwd_data.GetDataPd().src_desc());
+ const std::vector<NDArray>& in_grad = outputs;
+ const DeconvolutionParam& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+
+ auto& data = inputs[deconv::kData + 1];
+ auto& weight = inputs[deconv::kWeight + 1];
+ auto& out_grad = inputs[deconv::kOut];
+
+ CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace";
+ MKLDNNDeconvBackwardData& bwd_data = GetDeconvBwdData(param, data, weight, inputs[deconv::kOut]);
+ auto out_grad_mem = out_grad.GetMKLDNNDataReorder(bwd_data.GetDataPd().src_desc());
if (req[deconv::kData]) {
- auto weight_mem = GetWeights(weight, bwd_data.GetDataPd().weights_desc(),
- param.num_group);
- auto in_grad_mem =
- CreateMKLDNNMem(in_grad[deconv::kData], bwd_data.GetDataPd().dst_desc(),
- req[deconv::kData]);
+ auto weight_mem = GetWeights(weight, bwd_data.GetDataPd().weights_desc(), param.num_group);
+ auto in_grad_mem = CreateMKLDNNMem(
+ in_grad[deconv::kData], bwd_data.GetDataPd().dst_desc(), req[deconv::kData]);
mkldnn_args_map_t net_args = {{MKLDNN_ARG_SRC, *out_grad_mem},
{MKLDNN_ARG_WEIGHTS, *weight_mem},
{MKLDNN_ARG_DST, *in_grad_mem.second}};
@@ -482,22 +512,18 @@ void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs,
CommitOutput(in_grad[deconv::kData], in_grad_mem);
}
if (req[deconv::kWeight]) {
- MKLDNNDeconvBackwardWeights &bwd_weights = GetDeconvBwdWeights(
- param, data, weight, inputs[deconv::kOut], bwd_data.GetDataPd());
- if (bwd_data.GetDataPd().src_desc() !=
- bwd_weights.GetWeightsPd().src_desc())
- out_grad_mem =
- out_grad.GetMKLDNNDataReorder(bwd_weights.GetWeightsPd().src_desc());
- auto data_mem =
- data.GetMKLDNNDataReorder(bwd_weights.GetWeightsPd().diff_dst_desc());
- auto in_grad_weight = CreateMKLDNNWeightGrad(
- in_grad[deconv::kWeight],
- bwd_weights.GetWeightsPd().diff_weights_desc(), req[deconv::kWeight]);
-
- mkldnn_args_map_t net_args = {
- {MKLDNN_ARG_SRC, *out_grad_mem},
- {MKLDNN_ARG_DIFF_DST, *data_mem},
- {MKLDNN_ARG_DIFF_WEIGHTS, *in_grad_weight.second}};
+ MKLDNNDeconvBackwardWeights& bwd_weights =
+ GetDeconvBwdWeights(param, data, weight, inputs[deconv::kOut], bwd_data.GetDataPd());
+ if (bwd_data.GetDataPd().src_desc() != bwd_weights.GetWeightsPd().src_desc())
+ out_grad_mem = out_grad.GetMKLDNNDataReorder(bwd_weights.GetWeightsPd().src_desc());
+ auto data_mem = data.GetMKLDNNDataReorder(bwd_weights.GetWeightsPd().diff_dst_desc());
+ auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[deconv::kWeight],
+ bwd_weights.GetWeightsPd().diff_weights_desc(),
+ req[deconv::kWeight]);
+
+ mkldnn_args_map_t net_args = {{MKLDNN_ARG_SRC, *out_grad_mem},
+ {MKLDNN_ARG_DIFF_DST, *data_mem},
+ {MKLDNN_ARG_DIFF_WEIGHTS, *in_grad_weight.second}};
MKLDNNStream::Get()->RegisterPrimArgs(bwd_weights.GetBwd(), net_args);
CommitOutput(in_grad[deconv::kWeight], in_grad_weight);
}
@@ -505,9 +531,8 @@ void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs,
if (!param.no_bias) {
typedef float DType;
- Stream<cpu> *s = ctx.get_stream<cpu>();
- Tensor<cpu, 1, DType> gbias =
- in_grad[deconv::kBias].data().get<cpu, 1, DType>(s);
+ Stream<cpu>* s = ctx.get_stream<cpu>();
+ Tensor<cpu, 1, DType> gbias = in_grad[deconv::kBias].data().get<cpu, 1, DType>(s);
NDArray temp = inputs[deconv::kOut];
if (temp.IsMKLDNNData()) {
@@ -515,8 +540,7 @@ void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs,
}
Tensor<cpu, 4, DType> grad = temp.data().get<cpu, 4, DType>(s);
- Assign(gbias, req[deconv::kBias],
- mshadow::expr::sumall_except_dim<1>(grad));
+ Assign(gbias, req[deconv::kBias], mshadow::expr::sumall_except_dim<1>(grad));
}
}
diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected-inl.h b/src/operator/nn/mkldnn/mkldnn_fully_connected-inl.h
index a91a4f6..caf9b1b 100644
--- a/src/operator/nn/mkldnn/mkldnn_fully_connected-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_fully_connected-inl.h
@@ -22,22 +22,24 @@
* \file mkldnn_fully_connected-inl.h
* \brief Common functions used by MKLDNN (Quantized) FullyConnected operator
* \author Ciyong Chen
-*/
+ */
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_FULLY_CONNECTED_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_FULLY_CONNECTED_INL_H_
#if MXNET_USE_ONEDNN == 1
-#include <vector>
#include <string>
-#include "../fully_connected-inl.h"
+#include <vector>
+
#include "./mkldnn_base-inl.h"
+#include "../fully_connected-inl.h"
+
namespace mxnet {
namespace op {
-struct MKLDNNFCParam: public dmlc::Parameter<MKLDNNFCParam> {
+struct MKLDNNFCParam : public dmlc::Parameter<MKLDNNFCParam> {
bool quantized;
bool enable_float_output;
bool with_eltwise;
@@ -46,25 +48,29 @@ struct MKLDNNFCParam: public dmlc::Parameter<MKLDNNFCParam> {
dmlc::optional<bool> channel_wise_quantize;
DMLC_DECLARE_PARAMETER(MKLDNNFCParam) {
- DMLC_DECLARE_FIELD(quantized).set_default(false)
- .describe("Whether it's a quantized FullyConnected operator");
- DMLC_DECLARE_FIELD(enable_float_output).set_default(false)
- .describe("Whether to enable float32 output");
- DMLC_DECLARE_FIELD(with_eltwise).set_default(false)
- .describe("Whether there's a post with_eltwise after FullyConnected operator");
+ DMLC_DECLARE_FIELD(quantized).set_default(false).describe(
+ "Whether it's a quantized FullyConnected operator");
+ DMLC_DECLARE_FIELD(enable_float_output)
+ .set_default(false)
+ .describe("Whether to enable float32 output");
+ DMLC_DECLARE_FIELD(with_eltwise)
+ .set_default(false)
+ .describe("Whether there's a post with_eltwise after FullyConnected operator");
DMLC_DECLARE_FIELD(min_calib_range)
- .set_default(dmlc::optional<float>())
- .describe("The minimum scalar value in the form of float32 obtained "
- "through calibration. If present, it will be used to by "
- "quantized fullyconnected op to calculate primitive scale");
+ .set_default(dmlc::optional<float>())
+ .describe(
+ "The minimum scalar value in the form of float32 obtained "
+ "through calibration. If present, it will be used to by "
+ "quantized fullyconnected op to calculate primitive scale");
DMLC_DECLARE_FIELD(max_calib_range)
- .set_default(dmlc::optional<float>())
- .describe("The maximum scalar value in the form of float32 obtained "
- "through calibration. If present, it will be used to by "
- "quantized fullyconnected op to calculate primitive scale");
+ .set_default(dmlc::optional<float>())
+ .describe(
+ "The maximum scalar value in the form of float32 obtained "
+ "through calibration. If present, it will be used to by "
+ "quantized fullyconnected op to calculate primitive scale");
DMLC_DECLARE_FIELD(channel_wise_quantize)
- .set_default(dmlc::optional<bool>())
- .describe("Whether support channel-wise-quantize for weight.");
+ .set_default(dmlc::optional<bool>())
+ .describe("Whether support channel-wise-quantize for weight.");
}
};
@@ -75,24 +81,28 @@ struct MKLDNNFCFullParam {
std::vector<float> output_scales = {0.0f};
};
-mkldnn::inner_product_forward::primitive_desc GetFCFwdImpl(
- const MKLDNNFCFullParam &full_param, const bool is_train,
- const NDArray &data, const NDArray &weight, const NDArray *bias,
- const mkldnn::memory::desc &out_md);
+mkldnn::inner_product_forward::primitive_desc GetFCFwdImpl(const MKLDNNFCFullParam& full_param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const mkldnn::memory::desc& out_md);
class MKLDNNFullyConnectedForward {
public:
mkldnn::inner_product_forward::primitive_desc fwd_pd;
- MKLDNNFullyConnectedForward(const MKLDNNFCFullParam &full_param, const bool is_train,
- const NDArray &data, const NDArray &weight,
- const NDArray *bias,
- const mkldnn::memory::desc &out_md)
+ MKLDNNFullyConnectedForward(const MKLDNNFCFullParam& full_param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const mkldnn::memory::desc& out_md)
: fwd_pd(GetFCFwdImpl(full_param, is_train, data, weight, bias, out_md)) {
- fwd_ = std::make_shared<mkldnn::inner_product_forward>(fwd_pd);
- }
+ fwd_ = std::make_shared<mkldnn::inner_product_forward>(fwd_pd);
... 13426 lines suppressed ...