Posted to commits@mxnet.apache.org by zh...@apache.org on 2021/07/16 14:29:12 UTC
[incubator-mxnet] branch v1.x updated: Auto-formatter to keep the same coding style (#20356)
This is an automated email from the ASF dual-hosted git repository.
zhasheng pushed a commit to branch v1.x
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/v1.x by this push:
new 0ae1f0c Auto-formatter to keep the same coding style (#20356)
0ae1f0c is described below
commit 0ae1f0cc6788841dad85aad48699909353f90100
Author: mozga <ma...@intel.com>
AuthorDate: Fri Jul 16 16:27:12 2021 +0200
Auto-formatter to keep the same coding style (#20356)
* This pull request contains coding-style changes
* Sanity changes
* Sanity changes: NDArray file
* Remove duplicated #defines
* CUDA batch_norm: removed a duplication
* BinPackParameters was added
* Clang-formatter: constructor param in one line (see the sketch after this list)
* Conflict: fix
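
For readers who want a concrete picture of what these clang-format settings do, below is a minimal, self-contained sketch. It is not part of the commit; the Widget class is hypothetical, and the exact option values in the repository's .clang-format are not shown here. It illustrates the layout the diff applies throughout (compare the NDArray constructors further down): with parameter bin-packing disabled (the commit mentions BinPackParameters), each constructor parameter and each member initializer gets its own line, and references are written as "Type& x" rather than "Type &x".

// Hypothetical example, BEFORE this change: parameters bin-packed across
// lines, several member initializers per line, "Type &x" reference style.
//
// class Widget {
//  public:
//   Widget(const std::string &name, int capacity,
//          bool lazy = false, int flags = 0)
//       : name_(name), capacity_(capacity),
//         lazy_(lazy), flags_(flags) {
//   }
// };

#include <string>

// AFTER this change, the layout clang-format produces with the new settings:
// one parameter per line, one member initializer per line, "Type& x".
class Widget {
 public:
  Widget(const std::string& name,
         int capacity,
         bool lazy = false,
         int flags = 0)
      : name_(name),
        capacity_(capacity),
        lazy_(lazy),
        flags_(flags) {}

 private:
  std::string name_;
  int capacity_;
  bool lazy_;
  int flags_;
};

int main() {
  Widget w("example", 8);  // compiles; the formatting is the point here
  (void)w;
  return 0;
}

The same pattern can be seen in the real diff below, for example in the NDArray(const mxnet::TShape& shape, Context ctx, ...) constructor in include/mxnet/ndarray.h.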
---
include/mxnet/ndarray.h | 668 ++++----
src/ndarray/ndarray.cc | 1715 +++++++++++---------
src/operator/nn/batch_norm-inl.h | 278 ++--
src/operator/nn/batch_norm.cc | 537 +++---
src/operator/nn/batch_norm.cu | 4 -
src/operator/nn/mkldnn/mkldnn_act-inl.h | 57 +-
src/operator/nn/mkldnn/mkldnn_act.cc | 192 ++-
src/operator/nn/mkldnn/mkldnn_base-inl.h | 425 ++---
src/operator/nn/mkldnn/mkldnn_base.cc | 406 ++---
src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h | 310 ++--
src/operator/nn/mkldnn/mkldnn_concat-inl.h | 20 +-
src/operator/nn/mkldnn/mkldnn_concat.cc | 74 +-
src/operator/nn/mkldnn/mkldnn_convolution-inl.h | 124 +-
src/operator/nn/mkldnn/mkldnn_convolution.cc | 415 +++--
src/operator/nn/mkldnn/mkldnn_copy.cc | 23 +-
src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h | 325 ++--
src/operator/nn/mkldnn/mkldnn_deconvolution.cc | 207 +--
.../nn/mkldnn/mkldnn_fully_connected-inl.h | 121 +-
src/operator/nn/mkldnn/mkldnn_fully_connected.cc | 257 +--
src/operator/nn/mkldnn/mkldnn_log_softmax.cc | 157 +-
src/operator/nn/mkldnn/mkldnn_lrn-inl.h | 206 ++-
src/operator/nn/mkldnn/mkldnn_ops-inl.h | 187 ++-
src/operator/nn/mkldnn/mkldnn_pooling-inl.h | 127 +-
src/operator/nn/mkldnn/mkldnn_pooling.cc | 220 +--
src/operator/nn/mkldnn/mkldnn_reshape-inl.h | 18 +-
src/operator/nn/mkldnn/mkldnn_reshape.cc | 78 +-
src/operator/nn/mkldnn/mkldnn_rnn-inl.h | 327 ++--
src/operator/nn/mkldnn/mkldnn_rnn.cc | 844 +++++-----
src/operator/nn/mkldnn/mkldnn_slice-inl.h | 23 +-
src/operator/nn/mkldnn/mkldnn_slice.cc | 50 +-
src/operator/nn/mkldnn/mkldnn_softmax.cc | 133 +-
src/operator/nn/mkldnn/mkldnn_softmax_output.cc | 64 +-
src/operator/nn/mkldnn/mkldnn_sum.cc | 63 +-
src/operator/nn/mkldnn/mkldnn_transpose.cc | 54 +-
src/operator/operator_common.h | 346 ++--
.../quantization/mkldnn/mkldnn_dequantize-inl.h | 57 +-
.../quantization/mkldnn/mkldnn_quantize-inl.h | 35 +-
.../quantization/mkldnn/mkldnn_quantize_v2-inl.h | 74 +-
.../quantization/mkldnn/mkldnn_quantized_act.cc | 9 +-
.../mkldnn/mkldnn_quantized_batch_norm.cc | 100 +-
.../quantization/mkldnn/mkldnn_quantized_concat.cc | 38 +-
.../quantization/mkldnn/mkldnn_quantized_conv.cc | 54 +-
.../mkldnn/mkldnn_quantized_elemwise_add.cc | 153 +-
.../mkldnn/mkldnn_quantized_flatten.cc | 24 +-
.../mkldnn/mkldnn_quantized_fully_connected.cc | 62 +-
.../quantization/mkldnn/mkldnn_quantized_ops-inl.h | 11 +-
.../mkldnn/mkldnn_quantized_pooling.cc | 21 +-
.../quantization/mkldnn/mkldnn_requantize-inl.h | 62 +-
src/operator/subgraph/mkldnn/mkldnn_common.h | 57 +-
src/operator/subgraph/mkldnn/mkldnn_conv-inl.h | 5 +-
src/operator/subgraph/mkldnn/mkldnn_conv.cc | 534 +++---
.../subgraph/mkldnn/mkldnn_conv_property.h | 101 +-
.../mkldnn_elemwisemul_post_quantize_property.h | 80 +-
src/operator/subgraph/mkldnn/mkldnn_fc-inl.h | 21 +-
src/operator/subgraph/mkldnn/mkldnn_fc.cc | 594 +++----
.../mkldnn/mkldnn_fc_post_quantize_property.h | 78 +-
src/operator/subgraph/mkldnn/mkldnn_fc_property.h | 80 +-
src/operator/subgraph/mkldnn/mkldnn_fc_sum_fuse.h | 140 +-
.../mkldnn_post_quantize_align_scale_property.h | 97 +-
.../mkldnn/mkldnn_post_quantize_property.h | 48 +-
.../subgraph/mkldnn/mkldnn_subgraph_property.cc | 23 +-
.../subgraph/mkldnn/mkldnn_transformer-inl.h | 33 +-
src/operator/subgraph/mkldnn/mkldnn_transformer.cc | 715 ++++----
.../mkldnn_transformer_post_quantize_property.h | 81 +-
.../subgraph/mkldnn/mkldnn_transformer_property.h | 59 +-
src/operator/tensor/amp_cast.cc | 278 ++--
src/operator/tensor/cast_storage-inl.h | 222 +--
tests/cpp/include/test_mkldnn.h | 262 +--
tests/cpp/operator/mkldnn_operator_test.cc | 829 +++++-----
69 files changed, 7476 insertions(+), 6616 deletions(-)
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index c55e49e..0febc65 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -26,23 +26,23 @@
#define MXNET_NDARRAY_H_
#include <dmlc/base.h>
-#include <dmlc/logging.h>
#include <dmlc/io.h>
-#include <dmlc/type_traits.h>
+#include <dmlc/logging.h>
#include <dmlc/registry.h>
+#include <dmlc/type_traits.h>
#include <nnvm/node.h>
-#include <vector>
-#include <map>
-#include <string>
+
#include <algorithm>
+#include <map>
#include <memory>
-#include <algorithm>
+#include <string>
+#include <vector>
#if MXNET_USE_MKLDNN == 1
#include <mkldnn.hpp>
#endif
#include "./base.h"
-#include "./storage.h"
#include "./engine.h"
+#include "./storage.h"
// check c++11
#if DMLC_USE_CXX11 == 0
#error "cxx11 was required for ndarray module"
@@ -51,11 +51,11 @@
namespace mxnet {
// enum for storage types
namespace csr {
-enum CSRAuxType {kIndPtr, kIdx};
+enum CSRAuxType { kIndPtr, kIdx };
}
namespace rowsparse {
-enum RowSparseAuxType {kIdx};
+enum RowSparseAuxType { kIdx };
}
enum NDArrayStorageType {
@@ -82,9 +82,7 @@ class MKLDNNMemory;
class NDArray {
public:
/*! \brief default constructor */
- NDArray()
- : entry_(nullptr) {
- }
+ NDArray() : entry_(nullptr) {}
/*!
* \brief constructs a new dynamic NDArray
* \param shape the shape of array
@@ -92,20 +90,25 @@ class NDArray {
* \param delay_alloc whether delay the allocation
* \param dtype data type of this ndarray
*/
- NDArray(const mxnet::TShape &shape, Context ctx,
- bool delay_alloc = false, int dtype = mshadow::default_type_flag)
+ NDArray(const mxnet::TShape& shape,
+ Context ctx,
+ bool delay_alloc = false,
+ int dtype = mshadow::default_type_flag)
: ptr_(std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype)),
shape_(shape),
dtype_(dtype),
storage_type_(kDefaultStorage),
- entry_(nullptr) {
- }
+ entry_(nullptr) {}
/*! \brief constructor for NDArray with storage type
*/
- NDArray(const NDArrayStorageType stype, const mxnet::TShape &shape, Context ctx,
- bool delay_alloc = true, int dtype = mshadow::default_type_flag,
- std::vector<int> aux_types = {}, mxnet::ShapeVector aux_shapes = {},
- mxnet::TShape storage_shape = mxnet::TShape(mshadow::Shape1(0)));
+ NDArray(const NDArrayStorageType stype,
+ const mxnet::TShape& shape,
+ Context ctx,
+ bool delay_alloc = true,
+ int dtype = mshadow::default_type_flag,
+ std::vector<int> aux_types = {},
+ mxnet::ShapeVector aux_shapes = {},
+ mxnet::TShape storage_shape = mxnet::TShape(mshadow::Shape1(0)));
/*!
* \brief constructs a new dynamic NDArray whose shape is unknown,
* hence the NDArray is inherently lazily created
@@ -117,8 +120,7 @@ class NDArray {
shape_(),
dtype_(dtype),
storage_type_(kDefaultStorage),
- entry_(nullptr) {
- }
+ entry_(nullptr) {}
/*!
* \brief constructing a static NDArray that shares data with TBlob
* Use with caution: allocate ONLY ONE NDArray for each TBlob,
@@ -126,31 +128,31 @@ class NDArray {
* \param data the memory content of static data
* \param dev_id the device id this tensor sits at
*/
- NDArray(const TBlob &data, int dev_id)
+ NDArray(const TBlob& data, int dev_id)
: ptr_(std::make_shared<Chunk>(data, dev_id)),
shape_(data.shape_),
dtype_(data.type_flag_),
storage_type_(kDefaultStorage),
- entry_(nullptr) {
- }
+ entry_(nullptr) {}
/*!
- * \brief constructing a static NDArray that shares data with TBlob which is with deleter
- * Use with caution: allocate ONLY ONE NDArray for each TBlob,
+ * \brief constructing a static NDArray that shares data with TBlob which is
+ * with deleter Use with caution: allocate ONLY ONE NDArray for each TBlob,
* make sure the memory region is available through out the life of NDArray
* \param data the memory content of static data
* \param dev_id the device id this tensor sits at
* \param deleter the function pointer of custom deleter
*/
- NDArray(const TBlob &data, int dev_id, const std::function<void()>& deleter)
- : ptr_(new Chunk(data, dev_id), [deleter](Chunk *p) {
- deleter(); // call custom deleter
- delete p; // delete Chunk object
- }),
+ NDArray(const TBlob& data, int dev_id, const std::function<void()>& deleter)
+ : ptr_(new Chunk(data, dev_id),
+ [deleter](Chunk* p) {
+ deleter(); // call custom deleter
+ delete p; // delete Chunk object
+ }),
shape_(data.shape_),
- dtype_(data.type_flag_), storage_type_(kDefaultStorage),
- entry_(nullptr) {
- }
+ dtype_(data.type_flag_),
+ storage_type_(kDefaultStorage),
+ entry_(nullptr) {}
/*! \brief create ndarray from shared memory */
NDArray(int shared_pid, int shared_id, const mxnet::TShape& shape, int dtype)
@@ -158,12 +160,11 @@ class NDArray {
shape_(shape),
dtype_(dtype),
storage_type_(kDefaultStorage),
- entry_(nullptr) {
- }
+ entry_(nullptr) {}
/*!
- * \brief constructing a static NDArray of non-default storage that shares data with TBlob
- * Use with caution: allocate ONLY ONE NDArray for each TBlob,
+ * \brief constructing a static NDArray of non-default storage that shares
+ * data with TBlob Use with caution: allocate ONLY ONE NDArray for each TBlob,
* make sure the memory region is available through out the life of NDArray
* \param stype the storage type of NDArray
* \param shape the shape of NDArray
@@ -171,24 +172,27 @@ class NDArray {
* \param aux_data the memory content of static aux data
* \param dev_id the device id this tensor sits at
*/
- NDArray(const NDArrayStorageType stype, const mxnet::TShape &shape,
- const TBlob &data, const std::vector<TBlob> &aux_data, int dev_id)
+ NDArray(const NDArrayStorageType stype,
+ const mxnet::TShape& shape,
+ const TBlob& data,
+ const std::vector<TBlob>& aux_data,
+ int dev_id)
: ptr_(std::make_shared<Chunk>(stype, data, aux_data, dev_id)),
shape_(shape),
dtype_(data.type_flag_),
storage_type_(stype),
- entry_(nullptr) {
- }
+ entry_(nullptr) {}
/*!
- * \brief initialize the NDArray, assuming it is not assigned a meaningful shape before
- * \param shape the shape of the NDArray
+ * \brief initialize the NDArray, assuming it is not assigned a meaningful
+ * shape before \param shape the shape of the NDArray
*/
- void Init(const mxnet::TShape &shape) {
+ void Init(const mxnet::TShape& shape) {
ptr_->Init(shape, this->dtype_);
this->shape_ = shape;
}
/*!
- * \brief set the correct shape of NDArray directly from the storage_shape of its own chunk.
+ * \brief set the correct shape of NDArray directly from the storage_shape of
+ * its own chunk.
*/
void SetShapeFromChunk();
/*
@@ -210,10 +214,8 @@ class NDArray {
/* \brief Check whether the two arrays are the same array */
inline bool IsSame(const NDArray& other) const {
- return ptr_ == other.ptr_ &&
- shape_ == other.shape_ &&
- byte_offset_ == other.byte_offset_ &&
- dtype_ == other.dtype_;
+ return ptr_ == other.ptr_ && shape_ == other.shape_ && byte_offset_ == other.byte_offset_ &&
+ dtype_ == other.dtype_;
}
/*!
@@ -224,13 +226,13 @@ class NDArray {
}
/*!
* \return the shape of underlying chunk which stores the NDArray data/value.
- * It is only intended for non-default storage. For row-sparse storage, it is the shape of
- * the tensor which stores the non-zero values.
+ * It is only intended for non-default storage. For row-sparse storage, it is
+ * the shape of the tensor which stores the non-zero values.
*/
- inline const mxnet::TShape &storage_shape() const {
+ inline const mxnet::TShape& storage_shape() const {
CHECK(ptr_ != nullptr);
CHECK_NE(storage_type(), kDefaultStorage)
- << "storage_shape() is not intended for kDefaultStorage.";
+ << "storage_shape() is not intended for kDefaultStorage.";
return ptr_->storage_shape;
}
@@ -240,22 +242,20 @@ class NDArray {
* \return the shape of aux data at given index
*/
inline const mxnet::TShape& aux_shape(size_t index) const {
- CHECK_NE(storage_type(), kDefaultStorage)
- << "aux_shape() is not intended for kDefaultStorage.";
+ CHECK_NE(storage_type(), kDefaultStorage) << "aux_shape() is not intended for kDefaultStorage.";
return ptr_->aux_shapes[index];
}
/* \return the shapes of all aux data */
const mxnet::ShapeVector& aux_shapes() const {
CHECK_NE(storage_type(), kDefaultStorage)
- << "aux_shapes() is not intended for kDefaultStorage.";
+ << "aux_shapes() is not intended for kDefaultStorage.";
return ptr_->aux_shapes;
}
/*! returns the dtypes of all aux data */
const std::vector<int>& aux_types() const {
- CHECK_NE(storage_type(), kDefaultStorage)
- << "aux_types() is not intended for kDefaultStorage.";
+ CHECK_NE(storage_type(), kDefaultStorage) << "aux_types() is not intended for kDefaultStorage.";
return ptr_->aux_types;
}
@@ -268,7 +268,7 @@ class NDArray {
*/
inline void set_aux_shape(size_t index, const mxnet::TShape& shape) const {
CHECK_NE(storage_type(), kDefaultStorage)
- << "set_aux_shape() is not intended for kDefaultStorage.";
+ << "set_aux_shape() is not intended for kDefaultStorage.";
ptr_->set_aux_shape(index, shape);
}
@@ -276,7 +276,8 @@ class NDArray {
* \return the data TBlob
*/
inline const TBlob& data() const {
- if (storage_type() == kDefaultStorage) CheckAndAlloc();
+ if (storage_type() == kDefaultStorage)
+ CheckAndAlloc();
SetTBlob();
return tblob_;
}
@@ -292,24 +293,26 @@ class NDArray {
auto stype = storage_type();
TBlob res;
auto shape = aux_shape(i);
- auto type = aux_type(i);
+ auto type = aux_type(i);
MSHADOW_TYPE_SWITCH(type, DType, {
auto dptr = static_cast<DType*>(ptr_->aux_handles[i].dptr);
CHECK(stype == kRowSparseStorage || stype == kCSRStorage)
- << "Unexpected storage type: " << stype;
+ << "Unexpected storage type: " << stype;
res = TBlob(dptr, shape, ptr_->aux_handles[i].ctx.dev_mask(), type);
});
return res;
}
/*!
- * \return the context of NDArray, this function is only valid when the NDArray is not empty
+ * \return the context of NDArray, this function is only valid when the
+ * NDArray is not empty
*/
inline Context ctx() const {
CHECK(!is_none());
return ptr_->shandle.ctx;
}
/*!
- * \return the data type of NDArray, this function is only valid when the NDArray is not empty
+ * \return the data type of NDArray, this function is only valid when the
+ * NDArray is not empty
*/
inline int dtype() const {
return dtype_;
@@ -330,24 +333,25 @@ class NDArray {
bool fresh_out_grad() const;
/*! \return updated grad state in entry_ */
void set_fresh_out_grad(bool state) const;
- /*! \brief Returns true if a sparse ndarray's aux_data and storage are initialized
- * Throws an exception if the indices array shape is inconsistent
+ /*! \brief Returns true if a sparse ndarray's aux_data and storage are
+ * initialized Throws an exception if the indices array shape is inconsistent
* Returns false if the indices array is empty(nnz = 0) for csr/row_sparse
*/
inline bool storage_initialized() const {
- if (is_none()) return false;
+ if (is_none())
+ return false;
auto stype = storage_type();
CHECK_NE(stype, kDefaultStorage)
- << "storage_initialized() is not intended for kDefaultStorage.";
+ << "storage_initialized() is not intended for kDefaultStorage.";
if (stype == kRowSparseStorage) {
CHECK_EQ(aux_shape(rowsparse::kIdx)[0], storage_shape()[0])
- << "inconsistent storage shape " << storage_shape()
- << " vs. aux shape " << aux_shape(rowsparse::kIdx);
+ << "inconsistent storage shape " << storage_shape() << " vs. aux shape "
+ << aux_shape(rowsparse::kIdx);
return aux_shape(rowsparse::kIdx).Size() != 0;
} else if (stype == kCSRStorage) {
CHECK_EQ(aux_shape(csr::kIdx)[0], storage_shape()[0])
- << "inconsistent storage shape " << storage_shape()
- << " vs. aux shape " << aux_shape(csr::kIdx);
+ << "inconsistent storage shape " << storage_shape() << " vs. aux shape "
+ << aux_shape(csr::kIdx);
return aux_shape(csr::kIdx).Size() != 0;
} else {
LOG(FATAL) << "Unknown storage type";
@@ -366,7 +370,8 @@ class NDArray {
* to current NDArray are finished, and read can be performed.
*/
inline void WaitToRead() const {
- if (is_none()) return;
+ if (is_none())
+ return;
Engine::Get()->WaitForVar(ptr_->var);
}
/*!
@@ -374,15 +379,17 @@ class NDArray {
* to current NDArray are finished, and write can be performed.
*/
inline void WaitToWrite() const {
- if (is_none()) return;
+ if (is_none())
+ return;
/*!
* Push an empty mutable function to flush all preceding reads to the
* variable.
*/
Engine::Get()->PushAsync(
- [](RunContext, Engine::CallbackOnComplete on_complete) {
- on_complete();
- }, Context{}, {}, {ptr_->var});
+ [](RunContext, Engine::CallbackOnComplete on_complete) { on_complete(); },
+ Context{},
+ {},
+ {ptr_->var});
Engine::Get()->WaitForVar(ptr_->var);
}
/*! \return the associated variable of the ndarray.*/
@@ -401,81 +408,81 @@ class NDArray {
* \brief save the content into binary stream
* \param strm the output stream
*/
- void Save(dmlc::Stream *strm) const;
+ void Save(dmlc::Stream* strm) const;
/*!
* \brief load ndarrays before supporting sparse ndarrays
* \param strm the output stream
* \param magic the magic number used for version control
*/
- bool LegacyLoad(dmlc::Stream *strm, const uint32_t magic);
+ bool LegacyLoad(dmlc::Stream* strm, const uint32_t magic);
/*!
* \brief load the content from binary stream
* \param strm the output stream
* \return whether the load is successful
*/
- bool Load(dmlc::Stream *strm);
+ bool Load(dmlc::Stream* strm);
/*!
* \brief set all the elements in ndarray to be scalar
* \param scalar the scalar to set
* \return reference of self
*/
- NDArray &operator=(real_t scalar);
+ NDArray& operator=(real_t scalar);
/*!
* \brief elementwise add to current space
* this mutate the current NDArray
* \param src the data to add
* \return reference of self
*/
- NDArray &operator+=(const NDArray &src);
+ NDArray& operator+=(const NDArray& src);
/*!
* \brief elementwise add to current space
* this mutate the current NDArray
* \param src the data to add
* \return reference of self
*/
- NDArray &operator+=(const real_t &src);
+ NDArray& operator+=(const real_t& src);
/*!
* \brief elementwise subtract from current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator-=(const NDArray &src);
+ NDArray& operator-=(const NDArray& src);
/*!
* \brief elementwise subtract from current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator-=(const real_t &src);
+ NDArray& operator-=(const real_t& src);
/*!
* \brief elementwise multiplication to current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator*=(const NDArray &src);
+ NDArray& operator*=(const NDArray& src);
/*!
* \brief elementwise multiplication to current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator*=(const real_t &src);
+ NDArray& operator*=(const real_t& src);
/*!
* \brief elementwise division from current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator/=(const NDArray &src);
+ NDArray& operator/=(const NDArray& src);
/*!
* \brief elementwise division from current ndarray
* this mutate the current NDArray
* \param src the data to subtract
* \return reference of self
*/
- NDArray &operator/=(const real_t &src);
+ NDArray& operator/=(const real_t& src);
/*!
* \brief return a new copy this NDArray
* \param ctx the new context of this NDArray
@@ -492,12 +499,12 @@ class NDArray {
* \param data the data source to copy from.
* \param size the size of the source array, in sizeof(DType) not raw btyes.
*/
- void SyncCopyFromCPU(const void *data, size_t size) const;
+ void SyncCopyFromCPU(const void* data, size_t size) const;
/*!
* \brief Copy from src.data()/aux_data(i) to this->data()/aux_data(j)
*/
- void SyncCopyFromNDArray(const NDArray &src, int i = -1, int j = -1);
+ void SyncCopyFromNDArray(const NDArray& src, int i = -1, int j = -1);
/*!
* \brief Do a synchronize copy to a contiguous CPU memory region.
@@ -507,14 +514,15 @@ class NDArray {
* not wrapped by NDArray(thus dependency not being tracked).
*
* \param data the data source to copyinto.
- * \param size the memory size we want to copy into, in sizeof(DType) not raw btyes.
+ * \param size the memory size we want to copy into, in sizeof(DType) not raw
+ * btyes.
*/
- void SyncCopyToCPU(void *data, size_t size) const;
+ void SyncCopyToCPU(void* data, size_t size) const;
/*!
- * \brief check whether the NDArray format is valid
- * \param full_check if `True`, rigorous check, O(N) operations
- * Otherwise basic check, O(1) operations
- */
+ * \brief check whether the NDArray format is valid
+ * \param full_check if `True`, rigorous check, O(N) operations
+ * Otherwise basic check, O(1) operations
+ */
void SyncCheckFormat(const bool full_check) const;
/*!
* \brief Slice a NDArray
@@ -561,18 +569,16 @@ class NDArray {
* \param dtype The data type.
* \return NDArray in new shape and type.
*/
- inline NDArray AsArray(const mxnet::TShape &shape, int dtype) const {
- CHECK_EQ(storage_type(), kDefaultStorage)
- << "AsArray is intended only for kDefaultStorage.";
- CHECK_GE(ptr_->shandle.size,
- shape.Size() * mshadow::mshadow_sizeof(dtype))
+ inline NDArray AsArray(const mxnet::TShape& shape, int dtype) const {
+ CHECK_EQ(storage_type(), kDefaultStorage) << "AsArray is intended only for kDefaultStorage.";
+ CHECK_GE(ptr_->shandle.size, shape.Size() * mshadow::mshadow_sizeof(dtype))
<< "NDArray.AsArray: target memory size is bigger";
// We can't reuse memory in a view.
CHECK(!IsView());
NDArray ret = *this;
- ret.shape_ = shape;
- ret.dtype_ = dtype;
- ret.reuse_ = true;
+ ret.shape_ = shape;
+ ret.dtype_ = dtype;
+ ret.reuse_ = true;
return ret;
}
@@ -597,13 +603,13 @@ class NDArray {
static NDArray FromDLPack(const DLManagedTensor* tensor, bool transient_handle);
/*!
- * \brief Update ndarray chunk storage handles using existing ndarray storage handles
- * Also update the aux_handle, aux_shapes and aux_types.
- * This is specifically used for custom op to update the inputs and outputs from
- * the temporary ndarray which stores intermediate custom op results.
- * Should be used with caution elsewhere. Supports only CSR and RSP formats.
+ * \brief Update ndarray chunk storage handles using existing ndarray storage
+ * handles Also update the aux_handle, aux_shapes and aux_types. This is
+ * specifically used for custom op to update the inputs and outputs from the
+ * temporary ndarray which stores intermediate custom op results. Should be
+ * used with caution elsewhere. Supports only CSR and RSP formats.
*/
- inline void SparseUpdateChunk(const NDArray &arr) const {
+ inline void SparseUpdateChunk(const NDArray& arr) const {
CHECK(shape_ == arr.shape_) << "ndarray shape is different from the target";
CHECK(dtype_ == arr.dtype_) << "ndarray dtype is different from the target";
auto stype = arr.storage_type();
@@ -611,24 +617,24 @@ class NDArray {
<< "Only to be used with CSR and RSP storage types";
// swap shandles between src and dst
Storage::Handle shandle_dst = arr.ptr_->shandle;
- arr.ptr_->shandle = ptr_->shandle;
- ptr_->shandle = shandle_dst;
+ arr.ptr_->shandle = ptr_->shandle;
+ ptr_->shandle = shandle_dst;
ptr_->storage_shape = arr.ptr_->storage_shape;
- ptr_->storage_type = arr.ptr_->storage_type;
- ptr_->ctx = arr.ptr_->ctx;
+ ptr_->storage_type = arr.ptr_->storage_type;
+ ptr_->ctx = arr.ptr_->ctx;
// swap aux_handles between src and dst
size_t aux_idx = 0;
CHECK(ptr_->aux_handles.size() == arr.ptr_->aux_handles.size())
<< "ndarray number of aux_handles is different from target";
- for (auto &aux_handle : arr.ptr_->aux_handles) {
- Storage::Handle aux_dst = ptr_->aux_handles[aux_idx];
+ for (auto& aux_handle : arr.ptr_->aux_handles) {
+ Storage::Handle aux_dst = ptr_->aux_handles[aux_idx];
ptr_->aux_handles[aux_idx] = aux_handle;
- aux_handle = aux_dst;
+ aux_handle = aux_dst;
aux_idx++;
}
- ptr_->aux_types = arr.ptr_->aux_types;
+ ptr_->aux_types = arr.ptr_->aux_types;
ptr_->aux_shapes = arr.ptr_->aux_shapes;
}
@@ -637,13 +643,13 @@ class NDArray {
* \param shape new shape
* \return NDArray in new shape
*/
- NDArray Reshape(const mxnet::TShape &shape) const;
+ NDArray Reshape(const mxnet::TShape& shape) const;
/*!
* \brief Get an reshaped NDArray. Supports autograd recording
* \param shape new shape
* \return NDArray in new shape
*/
- NDArray ReshapeWithRecord(const mxnet::TShape &shape);
+ NDArray ReshapeWithRecord(const mxnet::TShape& shape);
/*!
* \brief Return a copy of this NDArray without autograd history
*/
@@ -670,7 +676,8 @@ class NDArray {
* storage type and effectively changes the ndarray's shape_.
* Note: This function is named as this to avoid overload conflict
* with CheckAndAlloc(const mxnet::ShapeVector &aux_shapes), since
- * mxnet::TShape tmp = some_shape is equivalent to mxnet::TShape tmp = {some_shape}.
+ * mxnet::TShape tmp = some_shape is equivalent to mxnet::TShape tmp =
+ * {some_shape}.
*/
void ReshapeAndAlloc(const mxnet::TShape& shape) {
CHECK_EQ(storage_type(), kDefaultStorage);
@@ -683,19 +690,19 @@ class NDArray {
* \brief Alloc memory for non-default storage
* aux_shape is only known at run time
*/
- inline void CheckAndAlloc(const mxnet::ShapeVector &aux_shapes) const {
+ inline void CheckAndAlloc(const mxnet::ShapeVector& aux_shapes) const {
CHECK_NE(storage_type(), kDefaultStorage)
- << "CheckAndAlloc(aux_shapes) is not intended for kDefaultStorage";
+ << "CheckAndAlloc(aux_shapes) is not intended for kDefaultStorage";
ptr_->CheckAndAlloc(shape_, aux_shapes, dtype_);
}
- inline void CheckAndAllocData(const mxnet::TShape &storage_shape) const {
+ inline void CheckAndAllocData(const mxnet::TShape& storage_shape) const {
CHECK_NE(storage_type(), kDefaultStorage)
- << "CheckAndAllocData is not intended for kDefaultStorage";
+ << "CheckAndAllocData is not intended for kDefaultStorage";
ptr_->CheckAndAllocData(storage_shape, dtype_);
}
- inline void CheckAndAllocAuxData(size_t i, const mxnet::TShape &aux_shape) const {
+ inline void CheckAndAllocAuxData(size_t i, const mxnet::TShape& aux_shape) const {
CHECK_NE(storage_type(), kDefaultStorage)
- << "CheckAndAllocAuxData is not intended for kDefaultStorage";
+ << "CheckAndAllocAuxData is not intended for kDefaultStorage";
ptr_->CheckAndAllocAuxData(i, aux_shape);
}
@@ -704,12 +711,12 @@ class NDArray {
* Create NDArray from mkldnn memory.
* mkldnn_mem The mkldnn memory to be managed.
*/
- explicit NDArray(const std::shared_ptr<mkldnn::memory> &mkldnn_mem);
+ explicit NDArray(const std::shared_ptr<mkldnn::memory>& mkldnn_mem);
/*
* Create NDArray from mkldnn memory descriptor.
* mem_pd The mkldnn memory descriptor to be created.
*/
- explicit NDArray(const mkldnn::memory::desc &md);
+ explicit NDArray(const mkldnn::memory::desc& md);
/*
* Test if the data is stored in one of special MKLDNN format.
*/
@@ -732,29 +739,29 @@ class NDArray {
/*
* This function returns mkldnn::memory with the default primitive_desc.
*/
- const mkldnn::memory *GetMKLDNNData() const;
+ const mkldnn::memory* GetMKLDNNData() const;
/*
* This function returns mkldnn::memory with the given primitive_desc
- * as long as the array size meets the required size in the given primitive_desc.
+ * as long as the array size meets the required size in the given
+ * primitive_desc.
*/
- const mkldnn::memory *GetMKLDNNData(const mkldnn::memory::desc &md) const;
+ const mkldnn::memory* GetMKLDNNData(const mkldnn::memory::desc& md) const;
/*
* This function returns mkldnn::memory with the given primitive_desc.
* The returned mkldnn::memory will have the same physical layout as
* the given primitive_desc.
*/
- const mkldnn::memory *GetMKLDNNDataReorder(
- const mkldnn::memory::desc &md) const;
+ const mkldnn::memory* GetMKLDNNDataReorder(const mkldnn::memory::desc& md) const;
/*
* This function copies data from mkldnn memory.
*/
- void CopyFrom(const mkldnn::memory &mem);
+ void CopyFrom(const mkldnn::memory& mem);
/*
* This function allocates memory for array and creates mkldnn memory
* with the specified format.
*/
- mkldnn::memory *CreateMKLDNNData(const mkldnn::memory::desc &md);
+ mkldnn::memory* CreateMKLDNNData(const mkldnn::memory::desc& md);
/*
* These are the async version of the methods above.
@@ -762,7 +769,7 @@ class NDArray {
* the array are complete.
*/
void Reorder2DefaultAsync() const;
- void MKLDNNDataReorderAsync(const mkldnn::memory::desc &md) const;
+ void MKLDNNDataReorderAsync(const mkldnn::memory::desc& md) const;
/*
* This creates a new NDArray with the reordered data.
@@ -770,7 +777,7 @@ class NDArray {
*/
NDArray Reorder2Default() const;
- /*
+ /*
* This creates a new NDArray using f32 with the reordered data.
* It doesn't affect the data of the original NDArray.
*/
@@ -788,12 +795,12 @@ class NDArray {
* which can be expensive.
* It's used by FullyConnected right now.
*/
- NDArray MKLDNNDataReshape(const mxnet::TShape &shape) const;
+ NDArray MKLDNNDataReshape(const mxnet::TShape& shape) const;
- /*!
+ /*!
* \ Fix mkldnn memory descriptor mismatch from NDArray.
*/
- void UpdateMKLDNNMemDesc(const mkldnn::memory::desc &desc);
+ void UpdateMKLDNNMemDesc(const mkldnn::memory::desc& desc);
#endif
/*!
@@ -811,15 +818,14 @@ class NDArray {
* \param data the NDArrays to be loaded
* \param keys the name of the NDArray, if saved in the file.
*/
- static void Load(dmlc::Stream* fi,
- std::vector<NDArray>* data,
- std::vector<std::string>* keys);
+ static void Load(dmlc::Stream* fi, std::vector<NDArray>* data, std::vector<std::string>* keys);
private:
friend class Imperative;
/*! \brief the real data chunk that backs NDArray */
// shandle is used to store the actual values in the NDArray
- // aux_handles store the aux data(such as indices) if it's needed by non-default storage.
+ // aux_handles store the aux data(such as indices) if it's needed by
+ // non-default storage.
struct Chunk {
/*! \brief storage handle from storage engine.
for non-default storage, shandle stores the data(value) array.
@@ -844,52 +850,58 @@ class NDArray {
*/
/*! \brief construct from static data */
bool static_data;
- /*! \brief whether data allocation is delayed. This doesn't indicate whether aux data
- allocation is delayed. */
+ /*! \brief whether data allocation is delayed. This doesn't indicate whether
+ aux data allocation is delayed. */
bool delay_alloc;
- // the type of the storage. The storage_type is never kUndefinedStorage once the chunk
- // is constructed.
+ // the type of the storage. The storage_type is never kUndefinedStorage once
+ // the chunk is constructed.
NDArrayStorageType storage_type = kDefaultStorage;
/*! \brief type of aux */
std::vector<int> aux_types;
// context of data
Context ctx;
// The shape of the chunk data.
- // This might not be the same shape as the NDArray, since the storage may be sparse.
- // The default value for storage_shape is {0} when an empty non-default NDArray is created.
+ // This might not be the same shape as the NDArray, since the storage may be
+ // sparse. The default value for storage_shape is {0} when an empty
+ // non-default NDArray is created.
mxnet::TShape storage_shape;
- // The shape of aux data. The default value for the shape depends on the type of storage.
- // If aux_shapes[i].Size() is zero, aux data i is empty.
+ // The shape of aux data. The default value for the shape depends on the
+ // type of storage. If aux_shapes[i].Size() is zero, aux data i is empty.
mxnet::ShapeVector aux_shapes;
/*! \brief Reference to the storage to ensure proper destruct order */
std::shared_ptr<Storage> storage_ref_;
- /*! \brief Reference to the engine to ensure we cleanup without calling a destructed engine */
+ /*! \brief Reference to the engine to ensure we cleanup without calling a
+ * destructed engine */
std::weak_ptr<Engine> engine_ref_;
-
/*! \brief default constructor */
- Chunk() : static_data(true), delay_alloc(false),
- storage_ref_(Storage::_GetSharedRef()),
- engine_ref_(Engine::_GetSharedRef()) {}
+ Chunk()
+ : static_data(true),
+ delay_alloc(false),
+ storage_ref_(Storage::_GetSharedRef()),
+ engine_ref_(Engine::_GetSharedRef()) {}
/*! \brief construct a new chunk */
Chunk(mxnet::TShape shape, Context ctx_, bool delay_alloc_, int dtype)
- : static_data(false), delay_alloc(true), ctx(ctx_),
+ : static_data(false),
+ delay_alloc(true),
+ ctx(ctx_),
storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {
storage_shape = shape;
if (shape_is_known(storage_shape)) {
shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
}
- var = Engine::Get()->NewVariable();
+ var = Engine::Get()->NewVariable();
shandle.ctx = ctx_;
if (!delay_alloc_) {
this->CheckAndAlloc();
}
}
- Chunk(const TBlob &data, int dev_id)
- : static_data(true), delay_alloc(false),
+ Chunk(const TBlob& data, int dev_id)
+ : static_data(true),
+ delay_alloc(false),
storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {
CHECK(storage_type == kDefaultStorage);
@@ -901,35 +913,45 @@ class NDArray {
ctx = Context::GPU(dev_id);
}
// init shandle
- shandle.ctx = ctx;
- shandle.dptr = data.dptr_;
- shandle.size = data.shape_.Size() * mshadow::mshadow_sizeof(data.type_flag_);
+ shandle.ctx = ctx;
+ shandle.dptr = data.dptr_;
+ shandle.size = data.shape_.Size() * mshadow::mshadow_sizeof(data.type_flag_);
storage_shape = data.shape_;
}
Chunk(int shared_pid, int shared_id, const mxnet::TShape& shape, int dtype)
- : static_data(false), delay_alloc(false),
+ : static_data(false),
+ delay_alloc(false),
storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {
- var = Engine::Get()->NewVariable();
- ctx = Context::CPUShared(0);
- shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
- shandle.ctx = ctx;
+ var = Engine::Get()->NewVariable();
+ ctx = Context::CPUShared(0);
+ shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
+ shandle.ctx = ctx;
shandle.shared_pid = shared_pid;
- shandle.shared_id = shared_id;
+ shandle.shared_id = shared_id;
Storage::Get()->Alloc(&shandle);
storage_shape = shape;
}
// Constructor for a non-default storage chunk
- Chunk(NDArrayStorageType storage_type_, const mxnet::TShape &storage_shape_, Context ctx_,
- bool delay_alloc_, int dtype, const std::vector<int> &aux_types_,
- const mxnet::ShapeVector &aux_shapes_)
- : static_data(false), delay_alloc(delay_alloc_), storage_type(storage_type_),
- aux_types(aux_types_), ctx(ctx_), storage_shape(storage_shape_),
- aux_shapes(aux_shapes_), storage_ref_(Storage::_GetSharedRef()),
+ Chunk(NDArrayStorageType storage_type_,
+ const mxnet::TShape& storage_shape_,
+ Context ctx_,
+ bool delay_alloc_,
+ int dtype,
+ const std::vector<int>& aux_types_,
+ const mxnet::ShapeVector& aux_shapes_)
+ : static_data(false),
+ delay_alloc(delay_alloc_),
+ storage_type(storage_type_),
+ aux_types(aux_types_),
+ ctx(ctx_),
+ storage_shape(storage_shape_),
+ aux_shapes(aux_shapes_),
+ storage_ref_(Storage::_GetSharedRef()),
engine_ref_(Engine::_GetSharedRef()) {
shandle.ctx = ctx;
- var = Engine::Get()->NewVariable();
+ var = Engine::Get()->NewVariable();
// aux_handles always reflect the correct number of aux data
for (size_t i = 0; i < aux_shapes.size(); i++) {
CheckAndAllocAuxData(i, aux_shapes[i]);
@@ -942,10 +964,15 @@ class NDArray {
}
}
- Chunk(const NDArrayStorageType storage_type_, const TBlob &data,
- const std::vector<TBlob> &aux_data, int dev_id)
- : static_data(true), delay_alloc(false), storage_type(storage_type_),
- storage_ref_(Storage::_GetSharedRef()), engine_ref_(Engine::_GetSharedRef()) {
+ Chunk(const NDArrayStorageType storage_type_,
+ const TBlob& data,
+ const std::vector<TBlob>& aux_data,
+ int dev_id)
+ : static_data(true),
+ delay_alloc(false),
+ storage_type(storage_type_),
+ storage_ref_(Storage::_GetSharedRef()),
+ engine_ref_(Engine::_GetSharedRef()) {
using namespace mshadow;
CHECK_NE(storage_type, kDefaultStorage);
// init var
@@ -958,14 +985,14 @@ class NDArray {
ctx = Context::GPU(dev_id);
}
// init shandle
- shandle.ctx = ctx;
- shandle.dptr = data.dptr_;
- shandle.size = data.shape_.Size() * mshadow_sizeof(data.type_flag_);
+ shandle.ctx = ctx;
+ shandle.dptr = data.dptr_;
+ shandle.size = data.shape_.Size() * mshadow_sizeof(data.type_flag_);
storage_shape = data.shape_;
// init aux handles
- for (const auto &aux : aux_data) {
+ for (const auto& aux : aux_data) {
Storage::Handle aux_handle;
- aux_handle.ctx = ctx;
+ aux_handle.ctx = ctx;
aux_handle.dptr = aux.dptr_;
aux_handle.size = aux.shape_.Size() * mshadow_sizeof(aux.type_flag_);
aux_handles.push_back(aux_handle);
@@ -974,7 +1001,8 @@ class NDArray {
}
}
- /*! \brief set the shape for ith aux data, and update storage shape if necessary */
+ /*! \brief set the shape for ith aux data, and update storage shape if
+ * necessary */
inline void set_aux_shape(const size_t i, const mxnet::TShape& shape) {
aux_shapes[i] = shape;
if (storage_shape.ndim() >= 0) {
@@ -1019,14 +1047,16 @@ class NDArray {
#endif
}
}
- /*! \brief initialize the shape and dtype, assuming it is not initialized before. */
- void Init(const mxnet::TShape &shape, int dtype) {
- auto size = shape.Size();
+ /*! \brief initialize the shape and dtype, assuming it is not initialized
+ * before. */
+ void Init(const mxnet::TShape& shape, int dtype) {
+ auto size = shape.Size();
storage_shape = shape;
- shandle.size = size * mshadow::mshadow_sizeof(dtype);
+ shandle.size = size * mshadow::mshadow_sizeof(dtype);
this->CheckAndAlloc();
}
- inline void CheckAndAlloc(const mxnet::TShape &shape, const mxnet::ShapeVector &aux_shapes,
+ inline void CheckAndAlloc(const mxnet::TShape& shape,
+ const mxnet::ShapeVector& aux_shapes,
int dtype) {
// calculate size, perform allocation
if (kRowSparseStorage == storage_type) {
@@ -1044,21 +1074,20 @@ class NDArray {
LOG(FATAL) << "Storage type " << storage_type << " not implemented for CheckAndAlloc";
}
}
- // create storage handle for data based on shape and dtype, assuming ctx is set
- // storage shape is also updated
- // if data is already allocated, try reuse the storage. Otherwise, free the current one
- // and allocate new storage
- void CheckAndAllocData(const mxnet::TShape &shape, int dtype);
+ // create storage handle for data based on shape and dtype, assuming ctx is
+ // set storage shape is also updated if data is already allocated, try reuse
+ // the storage. Otherwise, free the current one and allocate new storage
+ void CheckAndAllocData(const mxnet::TShape& shape, int dtype);
#if MXNET_USE_MKLDNN == 1
// Have MKL memory reference to the data in the default storage
// or create memory for MKLDNN.
- void SetMKLMem(const mxnet::TShape &shape, int dtype);
+ void SetMKLMem(const mxnet::TShape& shape, int dtype);
// If the data is stored in MKLDNN layout, we reorder data in mkl_mem_ and
// save the result in shandle.
void Reorder2Default();
// Reroder data to a specified layout.
- void MKLDNNDataReorder(const mkldnn::memory::desc &md);
+ void MKLDNNDataReorder(const mkldnn::memory::desc& md);
bool IsMKLDNN() const;
bool IsDefault() const;
#endif
@@ -1066,14 +1095,14 @@ class NDArray {
// create storage handle for aux data based on shape
// this function assumes ctx, aux shapes and aux types are set
// aux shape is also updated
- // if aux data is already allocated, try reuse the storage. Otherwise, free the current one
- // and allocate new storage
- inline void CheckAndAllocAuxData(size_t i, const mxnet::TShape &shape) {
+ // if aux data is already allocated, try reuse the storage. Otherwise, free
+ // the current one and allocate new storage
+ inline void CheckAndAllocAuxData(size_t i, const mxnet::TShape& shape) {
CHECK_EQ(shape.ndim(), 1) << "shape must be 1D in CheckAndAllocAuxData";
CHECK_NE(storage_type, kUndefinedStorage)
- << "storage type cannot be kUndefinedStorage in CheckAndAllocAuxData";
+ << "storage type cannot be kUndefinedStorage in CheckAndAllocAuxData";
CHECK_NE(storage_type, kDefaultStorage)
- << "storage type cannot be kDefaultStorage in CheckAndAllocAuxData";
+ << "storage type cannot be kDefaultStorage in CheckAndAllocAuxData";
if (aux_handles.size() <= i) {
aux_handles.resize(i + 1);
}
@@ -1133,7 +1162,7 @@ size_t num_aux_data(NDArrayStorageType stype);
* \note The function name explicitly marks the order of from and to
* due to different possible convention carried by copy function.
*/
-void CopyFromTo(const NDArray &from, const NDArray *to, int priority = 0);
+void CopyFromTo(const NDArray& from, const NDArray* to, int priority = 0);
/*!
* \brief issue an copy operation from one NDArray to another
@@ -1143,20 +1172,19 @@ void CopyFromTo(const NDArray &from, const NDArray *to, int priority = 0);
* \param from the ndarray we want to copy data from
* \param to the target ndarray
* \param priority Priority of the action.
- * \param is_opr whether it is invoked by an operator. For example, false if invoked from
- KVStore, true if invoked from `_copyto` operator.
+ * \param is_opr whether it is invoked by an operator. For example, false if
+ invoked from KVStore, true if invoked from `_copyto` operator.
* \note The function name explicitly marks the order of from and to
* due to different possible convention carried by copy function.
*/
-void CopyFromTo(const NDArray &from, const NDArray& to, int priority = 0, bool is_opr = false);
+void CopyFromTo(const NDArray& from, const NDArray& to, int priority = 0, bool is_opr = false);
/*!
- * \brief Perform elementwise sum over each data from source, store result into out.
- * \param source the ndarray we want to sum
- * \param out the target ndarray
+ * \brief Perform elementwise sum over each data from source, store result into
+ * out. \param source the ndarray we want to sum \param out the target ndarray
* \param priority Priority of the action.
*/
-void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priority = 0);
+void ElementwiseSum(const std::vector<NDArray>& source, NDArray* out, int priority = 0);
/*!
* \brief elementwise add
@@ -1164,56 +1192,56 @@ void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priori
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator+(const NDArray &lhs, const NDArray &rhs);
+NDArray operator+(const NDArray& lhs, const NDArray& rhs);
/*!
* \brief elementwise add
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator+(const NDArray &lhs, const real_t &rhs);
+NDArray operator+(const NDArray& lhs, const real_t& rhs);
/*!
* \brief elementwise subtraction
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator-(const NDArray &lhs, const NDArray &rhs);
+NDArray operator-(const NDArray& lhs, const NDArray& rhs);
/*!
* \brief elementwise subtraction
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator-(const NDArray &lhs, const real_t &rhs);
+NDArray operator-(const NDArray& lhs, const real_t& rhs);
/*!
* \brief elementwise multiplication
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator*(const NDArray &lhs, const NDArray &rhs); \
+NDArray operator*(const NDArray& lhs, const NDArray& rhs);
/*!
* \brief elementwise multiplication
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator*(const NDArray &lhs, const real_t &rhs);
+NDArray operator*(const NDArray& lhs, const real_t& rhs);
/*!
* \brief elementwise division
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator/(const NDArray &lhs, const NDArray &rhs);
+NDArray operator/(const NDArray& lhs, const NDArray& rhs);
/*!
* \brief elementwise division
* \param lhs left operand
* \param rhs right operand
* \return a new result ndarray
*/
-NDArray operator/(const NDArray &lhs, const real_t &rhs);
+NDArray operator/(const NDArray& lhs, const real_t& rhs);
/*!
* \brief Seed all random number generator in mxnet.
@@ -1231,60 +1259,59 @@ void RandomSeed(Context ctx, uint32_t seed);
* \param end upper bound of distribution.
* \param out output NDArray.
*/
-void SampleUniform(real_t begin, real_t end, NDArray *out);
+void SampleUniform(real_t begin, real_t end, NDArray* out);
/*!
* \brief Sample gaussian distribution for each elements of out.
* \param mu mean of gaussian distribution.
* \param sigma standard deviation of gaussian distribution.
* \param out output NDArray.
*/
-void SampleGaussian(real_t mu, real_t sigma, NDArray *out);
+void SampleGaussian(real_t mu, real_t sigma, NDArray* out);
/*!
* \brief Sample gamma distribution for each elements of out.
* \param alpha parameter (shape) of the gamma distribution
* \param beta parameter (scale) of the gamma distribution
* \param out output NDArray.
*/
-void SampleGamma(real_t alpha, real_t beta, NDArray *out);
+void SampleGamma(real_t alpha, real_t beta, NDArray* out);
/*!
* \brief Sample exponential distribution for each elements of out.
* \param lambda parameter (rate) of the exponential distribution
* \param out output NDArray.
*/
-void SampleExponential(real_t lambda, NDArray *out);
+void SampleExponential(real_t lambda, NDArray* out);
/*!
* \brief Sample Poisson distribution for each elements of out.
* \param lambda parameter (rate) of the Poisson distribution
* \param out output NDArray.
*/
-void SamplePoisson(real_t lambda, NDArray *out);
+void SamplePoisson(real_t lambda, NDArray* out);
/*!
* \brief Sample negative binomial distribution for each elements of out.
* \param k failure limit
* \param p success probability
* \param out output NDArray.
*/
-void SampleNegBinomial(int32_t k, real_t p, NDArray *out);
+void SampleNegBinomial(int32_t k, real_t p, NDArray* out);
/*!
- * \brief Sample generalized negative binomial distribution for each elements of out.
- * \param mu parameter (mean) of the distribution
- * \param alpha parameter (over dispersion) of the distribution
- * \param out output NDArray.
+ * \brief Sample generalized negative binomial distribution for each elements of
+ * out. \param mu parameter (mean) of the distribution \param alpha parameter
+ * (over dispersion) of the distribution \param out output NDArray.
*/
-void SampleGenNegBinomial(real_t mu, real_t alpha, NDArray *out);
-
+void SampleGenNegBinomial(real_t mu, real_t alpha, NDArray* out);
//--------------------------------------------------------------
// The following part are API Registration of NDArray functions.
//--------------------------------------------------------------
/*! \brief definition of NDArray function */
-typedef std::function<void (NDArray **used_vars,
- real_t *scalars,
- NDArray **mutate_vars,
- int num_params,
- char **param_keys,
- char **param_vals)> NDArrayAPIFunction;
+typedef std::function<void(NDArray** used_vars,
+ real_t* scalars,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals)>
+ NDArrayAPIFunction;
/*! \brief mask information on how functions can be exposed */
enum NDArrayFunctionTypeMask {
/*! \brief all the use_vars should go before scalar */
@@ -1303,8 +1330,7 @@ enum NDArrayFunctionTypeMask {
};
/*! \brief Registry entry for NDArrayFunction */
struct NDArrayFunctionReg
- : public dmlc::FunctionRegEntryBase<NDArrayFunctionReg,
- NDArrayAPIFunction> {
+ : public dmlc::FunctionRegEntryBase<NDArrayFunctionReg, NDArrayAPIFunction> {
/*! \brief number of variable used by this function */
unsigned num_use_vars;
/*! \brief number of variable mutated by this function */
@@ -1316,44 +1342,45 @@ struct NDArrayFunctionReg
/*!
* \brief constructor
*/
- NDArrayFunctionReg()
- : num_use_vars(0),
- num_mutate_vars(0),
- num_scalars(0),
- type_mask(0) {}
+ NDArrayFunctionReg() : num_use_vars(0), num_mutate_vars(0), num_scalars(0), type_mask(0) {}
/*!
* \brief set the function body to a NDArray setvalue function
* this will also auto set the parameters correctly
* \param fsetvalue function body to set
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_function(void (*fsetvalue)(const real_t &rhs,
- NDArray *out)) {
- body = [fsetvalue] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
- (*fsetvalue)(s[0], mutate_vars[0]);
- };
- num_mutate_vars = 1; num_scalars = 1;
+ inline NDArrayFunctionReg& set_function(void (*fsetvalue)(const real_t& rhs, NDArray* out)) {
+ body = [fsetvalue](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) { (*fsetvalue)(s[0], mutate_vars[0]); };
+
+ num_mutate_vars = 1;
+ num_scalars = 1;
this->add_argument("src", "real_t", "Source input to the function.");
return *this;
}
/*!
- * \brief set the function body to a ternary NDArray function
- * this will also auto set the parameters correctly
- * \param fternary function body to set
- * \return ref to the registered entry, used to set properties
- */
- inline NDArrayFunctionReg &set_function(void(*fternary)(const NDArray &lhs,
- const NDArray &mhs,
- const NDArray &rhs,
- NDArray *out)) {
- body = [fternary](NDArray **used_vars,
- real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
+ * \brief set the function body to a ternary NDArray function
+ * this will also auto set the parameters correctly
+ * \param fternary function body to set
+ * \return ref to the registered entry, used to set properties
+ */
+ inline NDArrayFunctionReg& set_function(
+ void (*fternary)(const NDArray& lhs, const NDArray& mhs, const NDArray& rhs, NDArray* out)) {
+ body = [fternary](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) {
(*fternary)(*used_vars[0], *used_vars[1], *used_vars[2], mutate_vars[0]);
};
- num_use_vars = 3; num_mutate_vars = 1;
- type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
+ num_use_vars = 3;
+ num_mutate_vars = 1;
+ type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
this->add_argument("lhs", "NDArray", "Left operand to the function.");
this->add_argument("mhs", "NDArray", "Middle operand to the function.");
this->add_argument("rhs", "NDArray", "Right operand to the function.");
@@ -1365,15 +1392,20 @@ struct NDArrayFunctionReg
* \param fbinary function body to set
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_function(void (*fbinary)(const NDArray &lhs,
- const NDArray &rhs,
- NDArray *out)) {
- body = [fbinary] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
+ inline NDArrayFunctionReg& set_function(void (*fbinary)(const NDArray& lhs,
+ const NDArray& rhs,
+ NDArray* out)) {
+ body = [fbinary](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) {
(*fbinary)(*used_vars[0], *used_vars[1], mutate_vars[0]);
};
- num_use_vars = 2; num_mutate_vars = 1;
- type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
+ num_use_vars = 2;
+ num_mutate_vars = 1;
+ type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
this->add_argument("lhs", "NDArray", "Left operand to the function.");
this->add_argument("rhs", "NDArray", "Right operand to the function.");
return *this;
@@ -1384,15 +1416,20 @@ struct NDArrayFunctionReg
* \param fscalar function body to set
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_function(void (*fscalar)(const NDArray &lhs,
- const real_t &rhs,
- NDArray *out)) {
- body = [fscalar] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
- (*fscalar)(*used_vars[0], s[0], mutate_vars[0]);
- };
- num_use_vars = 1; num_mutate_vars = 1; num_scalars = 1;
- type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
+ inline NDArrayFunctionReg& set_function(void (*fscalar)(const NDArray& lhs,
+ const real_t& rhs,
+ NDArray* out)) {
+ body = [fscalar](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) { (*fscalar)(*used_vars[0], s[0], mutate_vars[0]); };
+
+ num_use_vars = 1;
+ num_mutate_vars = 1;
+ num_scalars = 1;
+ type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
this->add_argument("lhs", "NDArray", "Left operand to the function.");
this->add_argument("rhs", "real_t", "Right operand to the function.");
return *this;
@@ -1403,14 +1440,17 @@ struct NDArrayFunctionReg
* \param funary function body to set
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_function(void (*funary)(const NDArray &src,
- NDArray *out)) {
- body = [funary] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
- (*funary)(*used_vars[0], mutate_vars[0]);
- };
- num_use_vars = 1; num_mutate_vars = 1;
- type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
+ inline NDArrayFunctionReg& set_function(void (*funary)(const NDArray& src, NDArray* out)) {
+ body = [funary](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) { (*funary)(*used_vars[0], mutate_vars[0]); };
+
+ num_use_vars = 1;
+ num_mutate_vars = 1;
+ type_mask = kNDArrayArgBeforeScalar | kAcceptEmptyMutateTarget;
this->add_argument("src", "NDArray", "Source input to the function.");
return *this;
}
@@ -1420,13 +1460,17 @@ struct NDArrayFunctionReg
* \param fgeneric function body to set
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_function(
- void (*fgeneric)(NDArray **used_vars,
- real_t *s,
- NDArray **mutate_vars,
- const std::map<std::string, std::string>& param)) {
- body = [fgeneric] (NDArray **used_vars, real_t *s, NDArray **mutate_vars,
- int num_params, char **param_keys, char **param_vals) {
+ inline NDArrayFunctionReg& set_function(
+ void (*fgeneric)(NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ const std::map<std::string, std::string>& param)) {
+ body = [fgeneric](NDArray** used_vars,
+ real_t* s,
+ NDArray** mutate_vars,
+ int num_params,
+ char** param_keys,
+ char** param_vals) {
std::map<std::string, std::string> param;
for (int i = 0; i < num_params; ++i) {
param[param_keys[i]] = param_vals[i];
@@ -1440,32 +1484,36 @@ struct NDArrayFunctionReg
* \param n number of mutate variablesx
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_num_use_vars(unsigned n) {
- num_use_vars = n; return *this;
+ inline NDArrayFunctionReg& set_num_use_vars(unsigned n) {
+ num_use_vars = n;
+ return *this;
}
/*!
* \brief set the number of mutate variables
* \param n number of mutate variablesx
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_num_mutate_vars(unsigned n) {
- num_mutate_vars = n; return *this;
+ inline NDArrayFunctionReg& set_num_mutate_vars(unsigned n) {
+ num_mutate_vars = n;
+ return *this;
}
/*!
* \brief set the number of scalar arguments
* \param n number of scalar arguments
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_num_scalars(unsigned n) {
- num_scalars = n; return *this;
+ inline NDArrayFunctionReg& set_num_scalars(unsigned n) {
+ num_scalars = n;
+ return *this;
}
/*!
* \brief set type mask
* \param tmask typemask
* \return ref to the registered entry, used to set properties
*/
- inline NDArrayFunctionReg &set_type_mask(int tmask) {
- type_mask = tmask; return *this;
+ inline NDArrayFunctionReg& set_type_mask(int tmask) {
+ type_mask = tmask;
+ return *this;
}
}; // NDArrayFunctionReg
@@ -1480,7 +1528,7 @@ struct NDArrayFunctionReg
*
* \endcode
*/
-#define MXNET_REGISTER_NDARRAY_FUN(name) \
+#define MXNET_REGISTER_NDARRAY_FUN(name) \
DMLC_REGISTRY_REGISTER(::mxnet::NDArrayFunctionReg, NDArrayFunctionReg, name)
} // namespace mxnet
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index d16b38e..c7188e3 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -23,19 +23,20 @@
 * \brief ndarray module of mxnet
*/
#include <dmlc/io.h>
-#include <dmlc/memory_io.h>
#include <dmlc/logging.h>
+#include <dmlc/memory_io.h>
#include <dmlc/registry.h>
+#include <mshadow/tensor.h>
#include <mxnet/base.h>
+#include <mxnet/imperative.h>
#include <mxnet/ndarray.h>
#include <mxnet/resource.h>
-#include <mxnet/imperative.h>
-#include <mshadow/tensor.h>
-#include "./ndarray_function.h"
+
#include "../common/utils.h"
-#include "../operator/tensor/matrix_op-inl.h"
-#include "../operator/tensor/init_op.h"
#include "../operator/nn/mkldnn/mkldnn_base-inl.h"
+#include "../operator/tensor/init_op.h"
+#include "../operator/tensor/matrix_op-inl.h"
+#include "./ndarray_function.h"
#if MXNET_USE_OPENCV
#include <opencv2/opencv.hpp>
@@ -47,13 +48,17 @@ DMLC_REGISTRY_ENABLE(::mxnet::NDArrayFunctionReg);
namespace mxnet {
-NDArray::NDArray(const NDArrayStorageType stype, const mxnet::TShape &shape, Context ctx,
- bool delay_alloc, int dtype, std::vector<int> aux_types,
- mxnet::ShapeVector aux_shapes, mxnet::TShape storage_shape) : shape_(shape),
- dtype_(dtype), storage_type_(stype), entry_(nullptr) {
+NDArray::NDArray(const NDArrayStorageType stype,
+ const mxnet::TShape& shape,
+ Context ctx,
+ bool delay_alloc,
+ int dtype,
+ std::vector<int> aux_types,
+ mxnet::ShapeVector aux_shapes,
+ mxnet::TShape storage_shape)
+ : shape_(shape), dtype_(dtype), storage_type_(stype), entry_(nullptr) {
// Assign default aux types if not given
- if (aux_types.size() == 0
- && stype != kDefaultStorage) {
+ if (aux_types.size() == 0 && stype != kDefaultStorage) {
if (stype == kRowSparseStorage) {
aux_types = {mshadow::kInt64};
} else if (stype == kCSRStorage) {
@@ -64,8 +69,7 @@ NDArray::NDArray(const NDArrayStorageType stype, const mxnet::TShape &shape, Con
}
// Assign default shapes if not given
// unknown shapes are initialized as {0} such that Size() would return 0
- if (aux_shapes.size() == 0
- && stype != kDefaultStorage) {
+ if (aux_shapes.size() == 0 && stype != kDefaultStorage) {
if (stype == kRowSparseStorage) {
aux_shapes = {mxnet::TShape(mshadow::Shape1(0))};
} else if (stype == kCSRStorage) {
@@ -75,10 +79,9 @@ NDArray::NDArray(const NDArrayStorageType stype, const mxnet::TShape &shape, Con
LOG(FATAL) << "Unknown storage type " << stype;
}
}
- if (storage_shape.Size() == 0
- && stype != kDefaultStorage) {
+ if (storage_shape.Size() == 0 && stype != kDefaultStorage) {
if (stype == kRowSparseStorage) {
- storage_shape = shape;
+ storage_shape = shape;
storage_shape[0] = aux_shapes[rowsparse::kIdx][0];
} else if (stype == kCSRStorage) {
storage_shape = aux_shapes[csr::kIdx];
@@ -89,8 +92,8 @@ NDArray::NDArray(const NDArrayStorageType stype, const mxnet::TShape &shape, Con
if (stype == kDefaultStorage)
ptr_ = std::make_shared<Chunk>(shape, ctx, delay_alloc, dtype);
else
- ptr_ = std::make_shared<Chunk>(stype, storage_shape, ctx, delay_alloc,
- dtype, aux_types, aux_shapes);
+ ptr_ = std::make_shared<Chunk>(
+ stype, storage_shape, ctx, delay_alloc, dtype, aux_types, aux_shapes);
}
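A sketch of what the default-filling branches in this constructor give you when the aux vectors are left empty; every value below is illustrative:

  // Row-sparse array: empty aux_types/aux_shapes fall back to
  // {mshadow::kInt64} indices and a zero-length index shape, and the
  // storage shape is then derived from `shape` and that index length.
  mxnet::NDArray rsp(mxnet::kRowSparseStorage,
                     mxnet::TShape(mshadow::Shape2(8, 4)),
                     mxnet::Context::CPU(),
                     /*delay_alloc=*/true,
                     mshadow::kFloat32,
                     /*aux_types=*/{},
                     /*aux_shapes=*/{},
                     /*storage_shape=*/mxnet::TShape(mshadow::Shape1(0)));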
void NDArray::SetShapeFromChunk() {
@@ -111,38 +114,41 @@ struct ChunkMem {
NDArray::Chunk::~Chunk() {
bool skip_free = static_data || delay_alloc;
ChunkMem mem;
- mem.h = this->shandle;
+ mem.h = this->shandle;
mem.aux_h = this->aux_handles;
#if MXNET_USE_MKLDNN == 1
// We want to delete mkldnn memory after deleting the variable.
mem.mem = this->mkl_mem_;
#endif
if (auto engine = engine_ref_.lock()) {
- engine->DeleteVariable([mem, skip_free](RunContext s) {
- if (skip_free == false) {
+ engine->DeleteVariable(
+ [mem, skip_free](RunContext s) {
+ if (skip_free == false) {
#if MXNET_USE_MKLDNN == 1
- if (mem.mem) {
- CHECK_LE(mem.mem->GetSize(), mem.h.size);
- CHECK_EQ(mem.mem->GetDataHandle(), mem.h.dptr);
- }
+ if (mem.mem) {
+ CHECK_LE(mem.mem->GetSize(), mem.h.size);
+ CHECK_EQ(mem.mem->GetDataHandle(), mem.h.dptr);
+ }
#endif
- Storage::Get()->Free(mem.h);
- for (const auto &aux : mem.aux_h) {
- Storage::Get()->Free(aux);
- }
- }
- }, shandle.ctx, var);
+ Storage::Get()->Free(mem.h);
+ for (const auto& aux : mem.aux_h) {
+ Storage::Get()->Free(aux);
+ }
+ }
+ },
+ shandle.ctx,
+ var);
}
}
-void NDArray::Chunk::CheckAndAllocData(const mxnet::TShape &shape, int dtype) {
- CHECK_NE(aux_shapes.size(), 0)
- << "data is expected to be allocated after aux_data";
+void NDArray::Chunk::CheckAndAllocData(const mxnet::TShape& shape, int dtype) {
+ CHECK_NE(aux_shapes.size(), 0) << "data is expected to be allocated after aux_data";
auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype);
if (!features::is_enabled(features::INT64_TENSOR_SIZE)) {
- CHECK_LT(shape.Size(), (int64_t{1} << 31) - 1) <<
- "[CheckAndAllocData] Size of tensor you are trying to allocate is larger than "
- "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
+ CHECK_LT(shape.Size(), (int64_t{1} << 31) - 1)
+ << "[CheckAndAllocData] Size of tensor you are trying to allocate is "
+ "larger than "
+ "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
}
if (shandle.size < dbytes) {
// free storage
@@ -160,7 +166,8 @@ void NDArray::Chunk::CheckAndAllocData(const mxnet::TShape &shape, int dtype) {
}
NDArray NDArray::grad() const {
- if (Imperative::AGInfo::IsNone(*this)) return NDArray();
+ if (Imperative::AGInfo::IsNone(*this))
+ return NDArray();
Imperative::AGInfo& info = Imperative::AGInfo::Get(entry_.node);
if (info.out_grads.size()) {
CHECK_EQ(info.out_grads.size(), 1);
@@ -171,7 +178,8 @@ NDArray NDArray::grad() const {
nnvm::Symbol NDArray::get_autograd_symbol() const {
CHECK(!Imperative::AGInfo::IsNone(*this))
- << "NDArray is not part of a computation graph. Did you forget to turn on recording?";
+ << "NDArray is not part of a computation graph. Did you forget to turn "
+ "on recording?";
nnvm::Symbol ret;
ret.outputs.emplace_back(entry_);
return ret;
@@ -179,36 +187,35 @@ nnvm::Symbol NDArray::get_autograd_symbol() const {
#if MXNET_USE_MKLDNN == 1
-NDArray::NDArray(const mkldnn::memory::desc &md)
- : storage_type_(kDefaultStorage), entry_(nullptr) {
+NDArray::NDArray(const mkldnn::memory::desc& md) : storage_type_(kDefaultStorage), entry_(nullptr) {
shape_ = mxnet::TShape(md.data.dims, md.data.dims + md.data.ndims);
dtype_ = get_mxnet_type(md.data.data_type);
- ptr_ = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_);
+ ptr_ = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_);
ptr_->CheckAndAlloc(md.get_size());
ptr_->mkl_mem_ = std::make_shared<MKLDNNMemory>(md, ptr_->shandle.dptr);
}
-NDArray::NDArray(const std::shared_ptr<mkldnn::memory> &mkldnn_mem)
+NDArray::NDArray(const std::shared_ptr<mkldnn::memory>& mkldnn_mem)
: storage_type_(kDefaultStorage), entry_(nullptr) {
- auto mem_desc = mkldnn_mem->get_desc();
- shape_ = mxnet::TShape(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims);
- dtype_ = get_mxnet_type(mem_desc.data.data_type);
- ptr_ = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_);
+ auto mem_desc = mkldnn_mem->get_desc();
+ shape_ = mxnet::TShape(mem_desc.data.dims, mem_desc.data.dims + mem_desc.data.ndims);
+ dtype_ = get_mxnet_type(mem_desc.data.data_type);
+ ptr_ = std::make_shared<Chunk>(shape_, Context::CPU(), true, dtype_);
ptr_->shandle.dptr = mkldnn_mem->get_data_handle();
ptr_->shandle.size = mem_desc.get_size();
- ptr_->delay_alloc = false;
- ptr_->mkl_mem_ = std::make_shared<MKLDNNMemory>(mkldnn_mem);
- ptr_->static_data = true;
+ ptr_->delay_alloc = false;
+ ptr_->mkl_mem_ = std::make_shared<MKLDNNMemory>(mkldnn_mem);
+ ptr_->static_data = true;
}
-NDArray NDArray::MKLDNNDataReshape(const mxnet::TShape &shape) const {
+NDArray NDArray::MKLDNNDataReshape(const mxnet::TShape& shape) const {
CHECK(!is_none()) << "NDArray is not initialized";
CHECK_GE(shape_.Size(), shape.Size())
- << "NDArray.Reshape: target shape size is larger current shape";
+ << "NDArray.Reshape: target shape size is larger current shape";
CHECK_EQ(storage_type(), kDefaultStorage);
if (!IsMKLDNNData()) {
NDArray ret = this->Detach();
- ret.shape_ = shape;
+ ret.shape_ = shape;
return ret;
} else {
NDArray ret(shape, ctx(), true, dtype());
@@ -216,32 +223,32 @@ NDArray NDArray::MKLDNNDataReshape(const mxnet::TShape &shape) const {
// be called in operators.
mkldnn_format_tag_t format = ptr_->mkl_mem_->GetDefaultFormat();
CHECK(ptr_->IsMKLDNN());
- mkldnn::memory::desc def_desc = ptr_->mkl_mem_->GetDesc(format);
- mkldnn::memory *def_mem = TmpMemMgr::Get()->Alloc(def_desc);
- MKLDNNStream *stream = MKLDNNStream::Get();
+ mkldnn::memory::desc def_desc = ptr_->mkl_mem_->GetDesc(format);
+ mkldnn::memory* def_mem = TmpMemMgr::Get()->Alloc(def_desc);
+ MKLDNNStream* stream = MKLDNNStream::Get();
std::shared_ptr<mkldnn::memory> curr_mem = ptr_->mkl_mem_->GetMem();
stream->RegisterMem(curr_mem);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *curr_mem},
- {MKLDNN_ARG_TO, *def_mem}});
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, *curr_mem}, {MKLDNN_ARG_TO, *def_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(*curr_mem, *def_mem), args);
// def_mem points to a memory region in the temp space. It's only valid
// inside an operator. As such, the returned NDArray can only be valid
// inside an operator and the shared point doesn't need to do anything
// when it's destroyed.
- auto tmp = std::shared_ptr<mkldnn::memory>(def_mem, [](mkldnn::memory *mem) {});
+ auto tmp = std::shared_ptr<mkldnn::memory>(def_mem, [](mkldnn::memory* mem) {});
ret.ptr_->mkl_mem_.reset(new MKLDNNMemory(tmp));
ret.ptr_->shandle.dptr = def_mem->get_data_handle();
ret.ptr_->shandle.size = def_mem->get_desc().get_size();
- ret.ptr_->delay_alloc = false;
- ret.ptr_->static_data = true;
- ret.byte_offset_ = byte_offset_;
- ret.reuse_ = false;
+ ret.ptr_->delay_alloc = false;
+ ret.ptr_->static_data = true;
+ ret.byte_offset_ = byte_offset_;
+ ret.reuse_ = false;
return ret;
}
}
#endif
-NDArray NDArray::Reshape(const mxnet::TShape &shape) const {
+NDArray NDArray::Reshape(const mxnet::TShape& shape) const {
CHECK(!is_none()) << "NDArray is not initialized";
if (Imperative::Get()->is_np_shape()) {
CHECK_EQ(shape_.Size(), shape.Size())
@@ -249,7 +256,8 @@ NDArray NDArray::Reshape(const mxnet::TShape &shape) const {
<< "current shape.";
} else {
CHECK_GE(shape_.Size(), shape.Size())
- << "NDArray.Reshape: target shape size is larger than the current shape";
+ << "NDArray.Reshape: target shape size is larger than the current "
+ "shape";
}
NDArray ret = this->Detach();
// If the shape doesn't change, we can just return it now.
@@ -262,15 +270,16 @@ NDArray NDArray::Reshape(const mxnet::TShape &shape) const {
return ret;
}
-NDArray NDArray::ReshapeWithRecord(const mxnet::TShape &shape) {
+NDArray NDArray::ReshapeWithRecord(const mxnet::TShape& shape) {
NDArray ret = this->Reshape(shape);
- if (!Imperative::Get()->is_recording()) return ret;
+ if (!Imperative::Get()->is_recording())
+ return ret;
CHECK_EQ(shape_.Size(), shape.Size())
- << "NDArray.Reshape: target shape must have the same size as "
- << "current shape when recording with autograd.";
+ << "NDArray.Reshape: target shape must have the same size as "
+ << "current shape when recording with autograd.";
nnvm::NodeAttrs attrs;
- attrs.op = nnvm::Op::Get("Reshape");;
+ attrs.op = nnvm::Op::Get("Reshape");
std::ostringstream os;
os << shape;
attrs.dict.insert({"shape", os.str()});
@@ -282,23 +291,22 @@ NDArray NDArray::ReshapeWithRecord(const mxnet::TShape &shape) {
NDArray NDArray::Slice(index_t begin, index_t end) const {
CHECK(!is_none()) << "NDArray is empty";
- CHECK_LE(begin, end)
- << "Invalid slicing range [" << begin << ", " << end << ")";
+ CHECK_LE(begin, end) << "Invalid slicing range [" << begin << ", " << end << ")";
CHECK_GE(shape_[0], end) << "Slice end index out of range";
CHECK_EQ(storage_type(), kDefaultStorage);
- NDArray ret = this->Detach();
+ NDArray ret = this->Detach();
size_t length = shape_.ProdShape(1, shape_.ndim());
- MSHADOW_TYPE_SWITCH_WITH_BOOL(ret.dtype(), DType, {
- ret.byte_offset_ += begin * length * sizeof(DType);
- });
- ret.reuse_ = false;
+ MSHADOW_TYPE_SWITCH_WITH_BOOL(
+ ret.dtype(), DType, { ret.byte_offset_ += begin * length * sizeof(DType); });
+ ret.reuse_ = false;
ret.shape_[0] = end - begin;
return ret;
}
NDArray NDArray::SliceWithRecord(index_t begin, index_t end) {
NDArray ret = this->Slice(begin, end);
- if (!Imperative::Get()->is_recording()) return ret;
+ if (!Imperative::Get()->is_recording())
+ return ret;
// fake a slice op
nnvm::NodeAttrs attrs;
attrs.op = nnvm::Op::Get("slice");
@@ -313,9 +321,9 @@ NDArray NDArray::SliceWithRecord(index_t begin, index_t end) {
NDArray NDArray::At(index_t idx) const {
CHECK(storage_type() == kDefaultStorage)
<< "Storage type " << storage_type() << " doesn't support At()";
- NDArray ret = this->Slice(idx, idx+1);
+ NDArray ret = this->Slice(idx, idx + 1);
if (shape_.ndim() > 1) {
- return ret.Reshape(mxnet::TShape(shape_.data()+1, shape_.data()+shape_.ndim()));
+ return ret.Reshape(mxnet::TShape(shape_.data() + 1, shape_.data() + shape_.ndim()));
} else {
return ret;
}
@@ -324,9 +332,9 @@ NDArray NDArray::At(index_t idx) const {
NDArray NDArray::AtWithRecord(index_t idx) {
CHECK(storage_type() == kDefaultStorage)
<< "Storage type " << storage_type() << " doesn't support At()";
- NDArray ret = this->SliceWithRecord(idx, idx+1);
+ NDArray ret = this->SliceWithRecord(idx, idx + 1);
if (shape_.ndim() > 1 || Imperative::Get()->is_np_shape()) {
- return ret.ReshapeWithRecord(mxnet::TShape(shape_.data()+1, shape_.data()+shape_.ndim()));
+ return ret.ReshapeWithRecord(mxnet::TShape(shape_.data() + 1, shape_.data() + shape_.ndim()));
} else {
return ret;
}
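Slice() and At() above only adjust byte_offset_ and the leading dimension, so both return views over the same storage. A small sketch of the resulting shapes (names are illustrative):

  NDArray a(mxnet::TShape(mshadow::Shape2(3, 4)), mxnet::Context::CPU());
  NDArray rows = a.Slice(1, 3);  // shares storage, shape becomes (2, 4)
  NDArray row  = a.At(1);        // Slice(1, 2) plus a Reshape, shape becomes (4,)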
@@ -359,20 +367,19 @@ struct NDArrayDLManager {
DLManagedTensor* NDArray::ToDLPack() const {
CHECK(!is_none()) << "NDArray is not initialized";
NDArrayDLManager* dlmanager(new NDArrayDLManager);
- dlmanager->handle = *this;
- dlmanager->tensor.dl_tensor = dlmanager->handle.data().dltensor();
+ dlmanager->handle = *this;
+ dlmanager->tensor.dl_tensor = dlmanager->handle.data().dltensor();
dlmanager->tensor.manager_ctx = dlmanager;
- dlmanager->tensor.deleter = [](DLManagedTensor* dlmanager){
+ dlmanager->tensor.deleter = [](DLManagedTensor* dlmanager) {
delete static_cast<NDArrayDLManager*>(dlmanager->manager_ctx);
};
return &(dlmanager->tensor);
}
NDArray NDArray::FromDLPack(const DLManagedTensor* tensor, bool transient_handle) {
- DLManagedTensor *tensor_copy = transient_handle
- ? new DLManagedTensor(*tensor)
- : const_cast<DLManagedTensor*>(tensor);
- auto deleter = [tensor_copy, transient_handle](){
+ DLManagedTensor* tensor_copy =
+ transient_handle ? new DLManagedTensor(*tensor) : const_cast<DLManagedTensor*>(tensor);
+ auto deleter = [tensor_copy, transient_handle]() {
if (tensor_copy->deleter != nullptr) {
tensor_copy->deleter(tensor_copy);
}
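A sketch of a round trip through ToDLPack()/FromDLPack() above, assuming a dense CPU array; the variable names are illustrative:

  NDArray a(mxnet::TShape(mshadow::Shape1(16)), mxnet::Context::CPU());
  DLManagedTensor* dl = a.ToDLPack();           // keeps `a` alive via manager_ctx
  NDArray b = NDArray::FromDLPack(dl, false);   // views the same buffer; its
                                                // deleter later invokes dl->deleter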
@@ -384,17 +391,18 @@ NDArray NDArray::FromDLPack(const DLManagedTensor* tensor, bool transient_handle
}
bool NDArray::fresh_out_grad() const {
- if (Imperative::AGInfo::IsNone(*this)) return false;
+ if (Imperative::AGInfo::IsNone(*this))
+ return false;
Imperative::AGInfo& info = Imperative::AGInfo::Get(entry_.node);
return info.fresh_out_grad;
}
-
void NDArray::set_fresh_out_grad(bool state) const {
CHECK(!Imperative::AGInfo::IsNone(*this))
- << "NDArray has not been marked as a variable and does not have gradient state";
+ << "NDArray has not been marked as a variable and does not have gradient "
+ "state";
Imperative::AGInfo& info = Imperative::AGInfo::Get(entry_.node);
- info.fresh_out_grad = state;
+ info.fresh_out_grad = state;
}
#if MXNET_USE_MKLDNN == 1
@@ -424,7 +432,7 @@ void NDArray::Chunk::Reorder2Default() {
if (IsDefault())
return;
- mkldnn_format_tag_t format = mkl_mem_->GetDefaultFormat();
+ mkldnn_format_tag_t format = mkl_mem_->GetDefaultFormat();
mkldnn::memory::desc def_desc = mkl_mem_->GetDesc(format);
mkldnn_mem_ptr def_mem(new mkldnn::memory(def_desc, CpuEngine::Get()->get_engine()));
mkl_mem_->ReorderTo(def_mem.get());
@@ -436,7 +444,7 @@ void NDArray::Chunk::Reorder2Default() {
mkl_mem_ = nullptr;
}
-void NDArray::Chunk::MKLDNNDataReorder(const mkldnn::memory::desc &md) {
+void NDArray::Chunk::MKLDNNDataReorder(const mkldnn::memory::desc& md) {
// If the memory already uses the specified layout, don't do anything.
if (mkl_mem_ != nullptr && mkl_mem_->SameFormat(md))
return;
@@ -456,7 +464,7 @@ void NDArray::Chunk::MKLDNNDataReorder(const mkldnn::memory::desc &md) {
std::shared_ptr<mkldnn::memory> old_mem;
if (IsDefault()) {
mkldnn_format_tag_t def_format = GetDefaultFormat(md);
- mkldnn::memory::desc def_desc = GetDesc(md, def_format);
+ mkldnn::memory::desc def_desc = GetDesc(md, def_format);
old_mem.reset(new mkldnn::memory(def_desc, engine, shandle.dptr));
} else {
old_mem = this->mkl_mem_->GetMem();
@@ -473,12 +481,11 @@ void NDArray::Chunk::MKLDNNDataReorder(const mkldnn::memory::desc &md) {
mkl_mem_.reset(new MKLDNNMemory(md, shandle.dptr));
}
-void NDArray::Chunk::SetMKLMem(const mxnet::TShape &shape, int dtype) {
+void NDArray::Chunk::SetMKLMem(const mxnet::TShape& shape, int dtype) {
// The shape of the array and the one of the MKL memory may mismatch.
// For example, if the array stores parameters, the MKL memory may store data
// in 5 dimensions while the NDArray stores data in 4 dimensions.
- if (mkl_mem_ && mkl_mem_->GetDataHandle() == shandle.dptr
- && mkl_mem_->SameFormat(shape, dtype)) {
+ if (mkl_mem_ && mkl_mem_->GetDataHandle() == shandle.dptr && mkl_mem_->SameFormat(shape, dtype)) {
return;
}
@@ -493,12 +500,24 @@ void NDArray::Chunk::SetMKLMem(const mxnet::TShape &shape, int dtype) {
}
mkldnn::memory::format_tag layout = mkldnn::memory::format_tag::undef;
switch (dims.size()) {
- case 1: layout = mkldnn::memory::format_tag::a; break;
- case 2: layout = mkldnn::memory::format_tag::ab; break;
- case 3: layout = mkldnn::memory::format_tag::abc; break;
- case 4: layout = mkldnn::memory::format_tag::abcd; break;
- case 5: layout = mkldnn::memory::format_tag::abcde; break;
- case 6: layout = mkldnn::memory::format_tag::abcdef; break;
+ case 1:
+ layout = mkldnn::memory::format_tag::a;
+ break;
+ case 2:
+ layout = mkldnn::memory::format_tag::ab;
+ break;
+ case 3:
+ layout = mkldnn::memory::format_tag::abc;
+ break;
+ case 4:
+ layout = mkldnn::memory::format_tag::abcd;
+ break;
+ case 5:
+ layout = mkldnn::memory::format_tag::abcde;
+ break;
+ case 6:
+ layout = mkldnn::memory::format_tag::abcdef;
+ break;
default:
LOG(FATAL) << "Not implemented dimension (" << dims.size() << ") for MKLDNN";
}
@@ -511,12 +530,12 @@ void NDArray::Chunk::SetMKLMem(const mxnet::TShape &shape, int dtype) {
mkl_mem_.reset(new MKLDNNMemory(data_md, shandle.dptr));
}
-const mkldnn::memory *NDArray::GetMKLDNNData(const mkldnn::memory::desc &desc) const {
+const mkldnn::memory* NDArray::GetMKLDNNData(const mkldnn::memory::desc& desc) const {
if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc";
return nullptr;
}
- const mkldnn::memory *mem = GetMKLDNNData();
+ const mkldnn::memory* mem = GetMKLDNNData();
mkldnn::memory::desc desc1 = mem->get_desc();
// The MKL memory has the same format and shape as required,
// or both use the default format, we can return the MKL memory.
@@ -527,13 +546,12 @@ const mkldnn::memory *NDArray::GetMKLDNNData(const mkldnn::memory::desc &desc) c
}
}
-const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
- const mkldnn::memory::desc &new_desc) const {
+const mkldnn::memory* NDArray::GetMKLDNNDataReorder(const mkldnn::memory::desc& new_desc) const {
CHECK(storage_type() == kDefaultStorage);
- const mkldnn::memory *mem = GetMKLDNNData();
+ const mkldnn::memory* mem = GetMKLDNNData();
// If the memory descriptor matches, it's easy.
- MKLDNNStream *stream = MKLDNNStream::Get();
+ MKLDNNStream* stream = MKLDNNStream::Get();
if (mem->get_desc() == new_desc) {
return GetMKLDNNExact(mem, new_desc);
}
@@ -542,13 +560,13 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
// Now we need to determine if we should reorder the memory.
// If both use the default formats, we think we don't need to reorder.
if ((!mxnet::IsMKLDNN(old_desc)) && (!mxnet::IsMKLDNN(new_desc))) {
- mkldnn_mem_ptr ret(new mkldnn::memory(new_desc,
- CpuEngine::Get()->get_engine(), mem->get_data_handle()));
+ mkldnn_mem_ptr ret(
+ new mkldnn::memory(new_desc, CpuEngine::Get()->get_engine(), mem->get_data_handle()));
stream->RegisterMem(ret);
return ret.get();
} else if (same_shape(old_desc, new_desc)) {
// If they have the same shape, we can reorder data directly.
- mkldnn::memory *ret = TmpMemMgr::Get()->Alloc(new_desc);
+ mkldnn::memory* ret = TmpMemMgr::Get()->Alloc(new_desc);
std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *mem}, {MKLDNN_ARG_TO, *ret}});
stream->RegisterPrimArgs(mkldnn::reorder(*mem, *ret), args);
return ret;
@@ -559,14 +577,14 @@ const mkldnn::memory *NDArray::GetMKLDNNDataReorder(
mxnet::TShape required_shape(new_desc.data.ndims, -1);
for (int i = 0; i < new_desc.data.ndims; i++)
required_shape[i] = new_desc.data.dims[i];
- NDArray reshaped = MKLDNNDataReshape(required_shape);
- const mkldnn::memory *ret = reshaped.GetMKLDNNData();
+ NDArray reshaped = MKLDNNDataReshape(required_shape);
+ const mkldnn::memory* ret = reshaped.GetMKLDNNData();
if (ret->get_desc() == new_desc) {
return GetMKLDNNExact(ret, new_desc);
} else {
- mkldnn::memory *ret2 = TmpMemMgr::Get()->Alloc(new_desc);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *ret},
- {MKLDNN_ARG_TO, *ret2}});
+ mkldnn::memory* ret2 = TmpMemMgr::Get()->Alloc(new_desc);
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, *ret}, {MKLDNN_ARG_TO, *ret2}});
stream->RegisterPrimArgs(mkldnn::reorder(*ret, *ret2), args);
return ret2;
}
@@ -584,17 +602,18 @@ NDArray NDArray::Reorder2Default() const {
// create new ndarray from mkldnn layout
mkldnn::memory::desc from_desc = ptr_->mkl_mem_->GetDesc();
mxnet::TShape tshape(from_desc.data.ndims, -1);
- for (int i = 0; i < from_desc.data.ndims; i++) tshape[i] = from_desc.data.dims[i];
+ for (int i = 0; i < from_desc.data.ndims; i++)
+ tshape[i] = from_desc.data.dims[i];
NDArray ret(tshape, ctx(), false, dtype());
- mkldnn_format_tag_t format = ptr_->mkl_mem_->GetDefaultFormat();
+ mkldnn_format_tag_t format = ptr_->mkl_mem_->GetDefaultFormat();
mkldnn::memory::desc def_desc = ptr_->mkl_mem_->GetDesc(format);
CHECK(ret.ptr_->shandle.size >= def_desc.get_size());
mkldnn::memory def_mem(def_desc, CpuEngine::Get()->get_engine(), ret.ptr_->shandle.dptr);
ptr_->mkl_mem_->ReorderTo(&def_mem);
// reshape as needed
- ret.shape_ = shape_;
+ ret.shape_ = shape_;
ret.byte_offset_ = byte_offset_;
- ret.reuse_ = false;
+ ret.reuse_ = false;
return ret;
}
@@ -603,17 +622,22 @@ void NDArray::Reorder2DefaultAsync() const {
std::vector<Engine::VarHandle> mutable_vars(1, this->var());
NDArray tmp = *this;
Engine::Get()->PushAsync(
- [tmp](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- tmp.ptr_->Reorder2Default();
- on_complete();
- }, ctx(), const_vars, mutable_vars,
- FnProperty::kNormal, 0, "Reorder2Default");
+ [tmp](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ tmp.ptr_->Reorder2Default();
+ on_complete();
+ },
+ ctx(),
+ const_vars,
+ mutable_vars,
+ FnProperty::kNormal,
+ 0,
+ "Reorder2Default");
}
// now just support bf16->fp32
NDArray NDArray::Reorder2DefaultFloatFormat() const {
CHECK(storage_type() == kDefaultStorage && IsView() == false);
- if (dtype() != mshadow::kBfloat16) {
+ if (dtype() != mshadow::kBfloat16) {
return Reorder2Default();
}
NDArray ret(shape(), ctx(), false, mshadow::DataType<float>::kFlag);
@@ -624,24 +648,29 @@ NDArray NDArray::Reorder2DefaultFloatFormat() const {
return ret;
}
-void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::desc &desc) const {
+void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::desc& desc) const {
std::vector<Engine::VarHandle> const_vars;
std::vector<Engine::VarHandle> mutable_vars(1, this->var());
- NDArray tmp = *this;
+ NDArray tmp = *this;
const auto version = this->version();
Engine::Get()->PushAsync(
- [tmp, version, desc](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- // MXNet will try to reuse NDArray from memory planning, so we need to ensure
- // the NDArray is still holding the original trunk data.
- if (tmp.version() == version) {
- tmp.ptr_->MKLDNNDataReorder(desc);
- }
- on_complete();
- }, ctx(), const_vars, mutable_vars,
- FnProperty::kNormal, 0, "Reorder");
+ [tmp, version, desc](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ // MXNet will try to reuse NDArray from memory planning, so we need to
+ // ensure the NDArray is still holding the original trunk data.
+ if (tmp.version() == version) {
+ tmp.ptr_->MKLDNNDataReorder(desc);
+ }
+ on_complete();
+ },
+ ctx(),
+ const_vars,
+ mutable_vars,
+ FnProperty::kNormal,
+ 0,
+ "Reorder");
}
-const mkldnn::memory *NDArray::GetMKLDNNData() const {
+const mkldnn::memory* NDArray::GetMKLDNNData() const {
CHECK(storage_type() == kDefaultStorage);
bool is_view = IsView();
if (IsMKLDNNData()) {
@@ -657,13 +686,13 @@ const mkldnn::memory *NDArray::GetMKLDNNData() const {
// because we don't have the complete data type and shape information for
// the chunk.
CheckAndAlloc();
- void *off_addr = static_cast<char *>(ptr_->shandle.dptr) + byte_offset_;
+ void* off_addr = static_cast<char*>(ptr_->shandle.dptr) + byte_offset_;
// Create the primitive desc for the new mkldnn memory.
mkldnn::memory::dims dims(shape().ndim());
for (size_t i = 0; i < dims.size(); i++)
dims[i] = shape()[i];
- mkldnn::memory::format_tag cpp_format = static_cast<mkldnn::memory::format_tag>(
- GetDefaultFormat(shape().ndim()));
+ mkldnn::memory::format_tag cpp_format =
+ static_cast<mkldnn::memory::format_tag>(GetDefaultFormat(shape().ndim()));
mkldnn::memory::data_type cpp_type = get_mkldnn_type(dtype_);
mkldnn::memory::desc data_md(dims, cpp_type, cpp_format);
std::shared_ptr<mkldnn::memory> ret(
@@ -686,7 +715,7 @@ void NDArray::InvalidateMKLDNNData() {
ptr_->mkl_mem_ = nullptr;
}
-void NDArray::CopyFrom(const mkldnn::memory &mem) {
+void NDArray::CopyFrom(const mkldnn::memory& mem) {
CHECK(ptr_ != nullptr) << "The NDArray hasn't been initialized";
if (ptr_->mkl_mem_ && ptr_->mkl_mem_->GetRaw() == &mem)
return;
@@ -699,15 +728,16 @@ void NDArray::CopyFrom(const mkldnn::memory &mem) {
if (IsMKLDNNData() && IsView())
ptr_->Reorder2Default();
- const mkldnn::memory *this_mem = GetMKLDNNData();
+ const mkldnn::memory* this_mem = GetMKLDNNData();
MKLDNNMemoryCopy(mem, this_mem);
}
-mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::desc &desc) {
+mkldnn::memory* NDArray::CreateMKLDNNData(const mkldnn::memory::desc& desc) {
if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) {
- LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc. "
- << "MKLDNN memory requests for " << desc.get_size() << " bytes, but got "
- << shape().Size() * GetTypeSize(dtype_) << " bytes from NDArray";
+ LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN "
+ "memory desc. "
+ << "MKLDNN memory requests for " << desc.get_size() << " bytes, but got "
+ << shape().Size() * GetTypeSize(dtype_) << " bytes from NDArray";
return nullptr;
}
bool isDefaultFormat = IsDefaultFormat(desc);
@@ -720,8 +750,10 @@ mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::desc &desc) {
CHECK(ptr_->shandle.dptr);
// When this is a view and a user wants the default layout, we can simply
// create a new mkldnn memory that points to the right memory.
- std::shared_ptr<mkldnn::memory> mem(new mkldnn::memory(desc,
- CpuEngine::Get()->get_engine(), static_cast<char *>(ptr_->shandle.dptr) + byte_offset_));
+ std::shared_ptr<mkldnn::memory> mem(
+ new mkldnn::memory(desc,
+ CpuEngine::Get()->get_engine(),
+ static_cast<char*>(ptr_->shandle.dptr) + byte_offset_));
MKLDNNStream::Get()->RegisterMem(mem);
return mem.get();
} else if (IsView()) {
@@ -747,9 +779,9 @@ mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::desc &desc) {
return ptr_->mkl_mem_->GetRaw();
}
-void NDArray::UpdateMKLDNNMemDesc(const mkldnn::memory::desc &desc) {
- auto new_desc = desc;
- auto this_dtype = get_mkldnn_type(dtype());
+void NDArray::UpdateMKLDNNMemDesc(const mkldnn::memory::desc& desc) {
+ auto new_desc = desc;
+ auto this_dtype = get_mkldnn_type(dtype());
new_desc.data.data_type = static_cast<mkldnn_data_type_t>(this_dtype);
ptr_->mkl_mem_.reset(new MKLDNNMemory(new_desc, ptr_->shandle.dptr));
MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_->GetMem());
@@ -760,12 +792,12 @@ void NDArray::UpdateMKLDNNMemDesc(const mkldnn::memory::desc &desc) {
void NDArray::SetTBlob() const {
CHECK(ptr_ != nullptr);
mxnet::TShape shape = shape_;
- char *dptr = static_cast<char*>(ptr_->shandle.dptr);
- auto stype = storage_type();
+ char* dptr = static_cast<char*>(ptr_->shandle.dptr);
+ auto stype = storage_type();
if (stype == kDefaultStorage) {
#if MXNET_USE_MKLDNN == 1
CHECK(!IsMKLDNNData()) << "We can't generate TBlob for MKLDNN data. "
- << "Please use Reorder2Default() to generate a new NDArray first";
+ << "Please use Reorder2Default() to generate a new NDArray first";
#endif
dptr += byte_offset_;
} else if (stype == kCSRStorage || stype == kRowSparseStorage) {
@@ -774,27 +806,24 @@ void NDArray::SetTBlob() const {
} else {
LOG(FATAL) << "unknown storage type " << stype;
}
- tblob_.dptr_ = dptr;
- tblob_.shape_ = shape;
+ tblob_.dptr_ = dptr;
+ tblob_.shape_ = shape;
tblob_.type_flag_ = dtype_;
tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id);
}
/*!
-* \brief run a ternary operation
-* \param lhs left operand
-* \param mhs middle operand
-* \param rhs right operand
-* \param out the output ndarray
-*/
-template<typename OP>
-void TernaryOp(const NDArray &lhs,
- const NDArray &mhs,
- const NDArray &rhs,
- NDArray *out) {
+ * \brief run a ternary operation
+ * \param lhs left operand
+ * \param mhs middle operand
+ * \param rhs right operand
+ * \param out the output ndarray
+ */
+template <typename OP>
+void TernaryOp(const NDArray& lhs, const NDArray& mhs, const NDArray& rhs, NDArray* out) {
// no check if all of them are on cpu
- if (lhs.ctx().dev_mask() != cpu::kDevMask || mhs.ctx().dev_mask() != cpu::kDevMask
- || rhs.ctx().dev_mask() != cpu::kDevMask) {
+ if (lhs.ctx().dev_mask() != cpu::kDevMask || mhs.ctx().dev_mask() != cpu::kDevMask ||
+ rhs.ctx().dev_mask() != cpu::kDevMask) {
CHECK((lhs.ctx() == mhs.ctx()) && (mhs.ctx() == rhs.ctx())) << "operands context mismatch";
}
// if out is none, allocate space
@@ -802,60 +831,75 @@ void TernaryOp(const NDArray &lhs,
*out = NDArray(OP::GetShape(lhs.shape(), mhs.shape(), rhs.shape()), lhs.ctx(), true);
} else {
// no check if both of them are on cpu
- if (lhs.ctx().dev_mask() != cpu::kDevMask ||
- out->ctx().dev_mask() != cpu::kDevMask) {
+ if (lhs.ctx().dev_mask() != cpu::kDevMask || out->ctx().dev_mask() != cpu::kDevMask) {
CHECK(out->ctx() == lhs.ctx()) << "target context mismatch";
}
CHECK(out->shape() == OP::GetShape(lhs.shape(), mhs.shape(), rhs.shape()))
- << "target shape mismatch";
+ << "target shape mismatch";
}
// important: callback must always capture by value
NDArray ret = *out;
// get the const variables
std::vector<Engine::VarHandle> const_vars;
- if (lhs.var() != ret.var()) const_vars.push_back(lhs.var());
- if (mhs.var() != ret.var()) const_vars.push_back(mhs.var());
- if (rhs.var() != ret.var()) const_vars.push_back(rhs.var());
+ if (lhs.var() != ret.var())
+ const_vars.push_back(lhs.var());
+ if (mhs.var() != ret.var())
+ const_vars.push_back(mhs.var());
+ if (rhs.var() != ret.var())
+ const_vars.push_back(rhs.var());
// redirect everything to mshadow operations
switch (lhs.ctx().dev_mask()) {
- case cpu::kDevMask: {
- Engine::Get()->PushSync([lhs, mhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::Eval<cpu, OP>(lhs.data(), mhs.data(), rhs.data(), &tmp, ctx);
- }, lhs.ctx(), const_vars, { ret.var() },
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
- break;
- }
+ case cpu::kDevMask: {
+ Engine::Get()->PushSync(
+ [lhs, mhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::Eval<cpu, OP>(lhs.data(), mhs.data(), rhs.data(), &tmp, ctx);
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
+ break;
+ }
#if MXNET_USE_CUDA
- case gpu::kDevMask: {
- Engine::Get()->PushSync([lhs, mhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::Eval<gpu, OP>(lhs.data(), mhs.data(), rhs.data(), &tmp, ctx);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, lhs.ctx(), const_vars, { ret.var() },
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
- break;
- }
+ case gpu::kDevMask: {
+ Engine::Get()->PushSync(
+ [lhs, mhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::Eval<gpu, OP>(lhs.data(), mhs.data(), rhs.data(), &tmp, ctx);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
+ break;
+ }
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
/*!
-* \brief Performs some preparation required to apply binary operators.
-* Checks context and shape of ndarrays, allocates space for output
-* and prepares const variables for engine
-* \param lhs left operand
-* \param rhs right operand
-* \param out the output ndarray
-* \param binary_op the real operation
-*/
-template<typename OP>
-std::vector<Engine::VarHandle> BinaryOpPrepare(const NDArray &lhs,
- const NDArray &rhs,
- NDArray *out) {
+ * \brief Performs some preparation required to apply binary operators.
+ * Checks context and shape of ndarrays, allocates space for output
+ * and prepares const variables for engine
+ * \param lhs left operand
+ * \param rhs right operand
+ * \param out the output ndarray
+ * \param binary_op the real operation
+ */
+template <typename OP>
+std::vector<Engine::VarHandle> BinaryOpPrepare(const NDArray& lhs,
+ const NDArray& rhs,
+ NDArray* out) {
// no check if both of them are on cpu
if (lhs.ctx().dev_mask() != cpu::kDevMask || rhs.ctx().dev_mask() != cpu::kDevMask) {
CHECK(lhs.ctx() == rhs.ctx()) << "operands context mismatch";
@@ -865,59 +909,69 @@ std::vector<Engine::VarHandle> BinaryOpPrepare(const NDArray &lhs,
*out = NDArray(OP::GetShape(lhs.shape(), rhs.shape()), lhs.ctx(), true, lhs.dtype());
} else {
// no check if both of them are on cpu
- if (lhs.ctx().dev_mask() != cpu::kDevMask ||
- out->ctx().dev_mask() != cpu::kDevMask) {
+ if (lhs.ctx().dev_mask() != cpu::kDevMask || out->ctx().dev_mask() != cpu::kDevMask) {
CHECK(out->ctx() == lhs.ctx()) << "target context mismatch";
}
- CHECK(out->shape() == OP::GetShape(lhs.shape(), rhs.shape()))
- << "target shape mismatch";
+ CHECK(out->shape() == OP::GetShape(lhs.shape(), rhs.shape())) << "target shape mismatch";
}
std::vector<Engine::VarHandle> const_vars;
// prepare const variables for engine
- if (lhs.var() != out->var()) const_vars.push_back(lhs.var());
- if (rhs.var() != out->var()) const_vars.push_back(rhs.var());
+ if (lhs.var() != out->var())
+ const_vars.push_back(lhs.var());
+ if (rhs.var() != out->var())
+ const_vars.push_back(rhs.var());
return const_vars;
}
/*!
-* \brief run a binary operation using the kernel launch method
-* \param lhs left operand
-* \param rhs right operand
-* \param out the output ndarray
-* \param binary_op the real operation
-*/
-template<typename OP>
-void BinaryOpKernel(const NDArray &lhs,
- const NDArray &rhs,
- NDArray *out) {
+ * \brief run a binary operation using the kernel launch method
+ * \param lhs left operand
+ * \param rhs right operand
+ * \param out the output ndarray
+ * \param binary_op the real operation
+ */
+template <typename OP>
+void BinaryOpKernel(const NDArray& lhs, const NDArray& rhs, NDArray* out) {
std::vector<Engine::VarHandle> const_vars = BinaryOpPrepare<OP>(lhs, rhs, out);
// important: callback must always capture by value
NDArray ret = *out;
switch (lhs.ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
- ndarray::BinaryOpKernelImpl<OP>(s, lhs.data(), rhs.data(), &tmp);
- },
- lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ mshadow::Stream<cpu>* s = ctx.get_stream<cpu>();
+ ndarray::BinaryOpKernelImpl<OP>(s, lhs.data(), rhs.data(), &tmp);
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
- ndarray::BinaryOpKernelImpl<OP>(s, lhs.data(), rhs.data(), &tmp);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ mshadow::Stream<gpu>* s = ctx.get_stream<gpu>();
+ ndarray::BinaryOpKernelImpl<OP>(s, lhs.data(), rhs.data(), &tmp);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
-}
+ }
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
@@ -928,71 +982,89 @@ void BinaryOpKernel(const NDArray &lhs,
* \param out the output ndarray
* \param binary_op the real operation
*/
-template<typename OP>
-void BinaryOp(const NDArray &lhs,
- const NDArray &rhs,
- NDArray *out) {
+template <typename OP>
+void BinaryOp(const NDArray& lhs, const NDArray& rhs, NDArray* out) {
std::vector<Engine::VarHandle> const_vars = BinaryOpPrepare<OP>(lhs, rhs, out);
// important: callback must always capture by value
NDArray ret = *out;
// redirect everything to mshadow operations
switch (lhs.ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
TBlob tmp = ret.data();
ndarray::Eval<cpu, OP>(lhs.data(), rhs.data(), &tmp, ctx);
- }, lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::Eval<gpu, OP>(lhs.data(), rhs.data(), &tmp, ctx);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::Eval<gpu, OP>(lhs.data(), rhs.data(), &tmp, ctx);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
-void SetValueOp(const real_t &rhs, NDArray *out) {
+void SetValueOp(const real_t& rhs, NDArray* out) {
CHECK_NE(out->is_none(), true) << "Set value target must not be empty";
// important: callback must always capture by value
- NDArray ret = *out;
+ NDArray ret = *out;
const NDArrayStorageType stype = ret.storage_type();
- Engine::Get()->PushSync([rhs, ret, stype](RunContext ctx) {
- TBlob tmp = ret.data();
- switch (ret.ctx().dev_mask()) {
- case cpu::kDevMask: {
- if (stype == kDefaultStorage) {
- ndarray::Eval<cpu>(rhs, &tmp, ctx);
- } else {
- ndarray::Eval(ctx.get_stream<cpu>(), rhs, ret);
+ Engine::Get()->PushSync(
+ [rhs, ret, stype](RunContext ctx) {
+ TBlob tmp = ret.data();
+ switch (ret.ctx().dev_mask()) {
+ case cpu::kDevMask: {
+ if (stype == kDefaultStorage) {
+ ndarray::Eval<cpu>(rhs, &tmp, ctx);
+ } else {
+ ndarray::Eval(ctx.get_stream<cpu>(), rhs, ret);
+ }
+ break;
}
- break;
- }
#if MXNET_USE_CUDA
- case gpu::kDevMask: {
- if (stype == kDefaultStorage) {
- ndarray::Eval<gpu>(rhs, &tmp, ctx);
- } else {
- ndarray::Eval(ctx.get_stream<gpu>(), rhs, ret);
+ case gpu::kDevMask: {
+ if (stype == kDefaultStorage) {
+ ndarray::Eval<gpu>(rhs, &tmp, ctx);
+ } else {
+ ndarray::Eval(ctx.get_stream<gpu>(), rhs, ret);
+ }
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ break;
}
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- break;
- }
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
- }
- }, ret.ctx(), {}, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ }
+ },
+ ret.ctx(),
+ {},
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
}
/*!
@@ -1002,10 +1074,8 @@ void SetValueOp(const real_t &rhs, NDArray *out) {
* \param out the output ndarray
 * \param binary_op the real operation
*/
-template<typename OP, bool reverse>
-void ScalarOp(const NDArray &lhs,
- const real_t &rhs,
- NDArray *out) {
+template <typename OP, bool reverse>
+void ScalarOp(const NDArray& lhs, const real_t& rhs, NDArray* out) {
if (out->is_none()) {
*out = NDArray(lhs.shape(), lhs.ctx(), true, lhs.dtype());
} else {
@@ -1016,47 +1086,69 @@ void ScalarOp(const NDArray &lhs,
NDArray ret = *out;
// get the const variables
std::vector<Engine::VarHandle> const_vars;
- if (lhs.var() != ret.var()) const_vars.push_back(lhs.var());
+ if (lhs.var() != ret.var())
+ const_vars.push_back(lhs.var());
// redirect everything to mshadow operations
switch (lhs.ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::Eval<cpu, OP, reverse>(lhs.data(), rhs, &tmp, ctx);
- }, lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::Eval<cpu, OP, reverse>(lhs.data(), rhs, &tmp, ctx);
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([lhs, rhs, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::Eval<gpu, OP, reverse>(lhs.data(), rhs, &tmp, ctx);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, lhs.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [lhs, rhs, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::Eval<gpu, OP, reverse>(lhs.data(), rhs, &tmp, ctx);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ lhs.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
size_t num_aux_data(NDArrayStorageType stype) {
size_t num = 0;
switch (stype) {
- case kDefaultStorage: num = 0; break;
- case kCSRStorage: num = 2; break;
- case kRowSparseStorage: num = 1; break;
- default: LOG(FATAL) << "Unknown storage type" << stype; break;
+ case kDefaultStorage:
+ num = 0;
+ break;
+ case kCSRStorage:
+ num = 2;
+ break;
+ case kRowSparseStorage:
+ num = 1;
+ break;
+ default:
+ LOG(FATAL) << "Unknown storage type " << stype;
+ break;
}
return num;
}
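The mapping implemented by the switch above, spelled out; the call itself is a sketch:

  // kDefaultStorage   -> 0 aux arrays
  // kCSRStorage       -> 2 (indptr and column indices)
  // kRowSparseStorage -> 1 (row indices)
  size_t n = num_aux_data(kCSRStorage);  // n == 2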
// Make a copy of a CSR NDArray
-template<typename from_xpu, typename to_xpu>
+template <typename from_xpu, typename to_xpu>
inline void CopyFromToCsrImpl(const NDArray& from, const NDArray& to, RunContext ctx) {
using namespace mshadow;
CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type";
@@ -1070,19 +1162,16 @@ inline void CopyFromToCsrImpl(const NDArray& from, const NDArray& to, RunContext
to.CheckAndAllocAuxData(csr::kIndPtr, from.aux_shape(csr::kIndPtr));
to.CheckAndAllocAuxData(csr::kIdx, from.aux_shape(csr::kIdx));
to.CheckAndAllocData(from.aux_shape(csr::kIdx));
- TBlob val = to.data();
+ TBlob val = to.data();
TBlob indptr = to.aux_data(csr::kIndPtr);
- TBlob idx = to.aux_data(csr::kIdx);
- ndarray::Copy<from_xpu, to_xpu>(from.data(), &val,
- from.ctx(), to.ctx(), ctx);
- ndarray::Copy<from_xpu, to_xpu>(from.aux_data(csr::kIndPtr), &indptr,
- from.ctx(), to.ctx(), ctx);
- ndarray::Copy<from_xpu, to_xpu>(from.aux_data(csr::kIdx), &idx,
- from.ctx(), to.ctx(), ctx);
+ TBlob idx = to.aux_data(csr::kIdx);
+ ndarray::Copy<from_xpu, to_xpu>(from.data(), &val, from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(from.aux_data(csr::kIndPtr), &indptr, from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(from.aux_data(csr::kIdx), &idx, from.ctx(), to.ctx(), ctx);
}
// Make a copy of a row-sparse NDArray
-template<typename from_xpu, typename to_xpu>
+template <typename from_xpu, typename to_xpu>
inline void CopyFromToRspImpl(const NDArray& from, const NDArray& to, RunContext ctx) {
using namespace mshadow;
CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type";
@@ -1096,14 +1185,12 @@ inline void CopyFromToRspImpl(const NDArray& from, const NDArray& to, RunContext
to.CheckAndAlloc({aux_shape});
TBlob val = to.data();
TBlob idx = to.aux_data(rowsparse::kIdx);
- ndarray::Copy<from_xpu, to_xpu>(from.data(), &val,
- from.ctx(), to.ctx(), ctx);
- ndarray::Copy<from_xpu, to_xpu>(from.aux_data(rowsparse::kIdx), &idx,
- from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(from.data(), &val, from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(from.aux_data(rowsparse::kIdx), &idx, from.ctx(), to.ctx(), ctx);
}
// Make a copy of a dense NDArray
-template<typename from_xpu, typename to_xpu>
+template <typename from_xpu, typename to_xpu>
inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext ctx) {
#if MXNET_USE_MKLDNN == 1
// If neither is MKLDNN, we can copy data normally.
@@ -1112,23 +1199,19 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
using namespace mshadow;
CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type";
TBlob tmp = to.data();
- ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp,
- from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(from.data(), &tmp, from.ctx(), to.ctx(), ctx);
#if MXNET_USE_MKLDNN == 1
- } else if (SupportMKLDNN(from.dtype(), from.shape())
- && SupportMKLDNN(to.dtype(), to.shape())
- && from.ctx().dev_mask() == cpu::kDevMask
- && to.ctx().dev_mask() == cpu::kDevMask) {
- // If we copy data directly, we need to make sure both NDArrays are supported
- // by MKLDNN.
+ } else if (SupportMKLDNN(from.dtype(), from.shape()) && SupportMKLDNN(to.dtype(), to.shape()) &&
+ from.ctx().dev_mask() == cpu::kDevMask && to.ctx().dev_mask() == cpu::kDevMask) {
+ // If we copy data directly, we need to make sure both NDArrays are
+ // supported by MKLDNN.
auto from_mem = from.GetMKLDNNData();
- auto to_mem = to.GetMKLDNNData();
+ auto to_mem = to.GetMKLDNNData();
if (from_mem->get_desc() == to_mem->get_desc()) {
- size_t size = std::min(from_mem->get_desc().get_size(),
- to_mem->get_desc().get_size());
+ size_t size = std::min(from_mem->get_desc().get_size(), to_mem->get_desc().get_size());
memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size);
} else {
- const_cast<NDArray &>(to).CopyFrom(*from_mem);
+ const_cast<NDArray&>(to).CopyFrom(*from_mem);
MKLDNNStream::Get()->Submit();
}
} else {
@@ -1138,7 +1221,7 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
NDArray tmp_from = from;
if (tmp_from.IsMKLDNNData()) {
// TODO(zhengda) tmp_from should be cached.
- tmp_from = NDArray(from.shape(), from.ctx(), false, from.dtype());
+ tmp_from = NDArray(from.shape(), from.ctx(), false, from.dtype());
auto tmp_mem = from.GetMKLDNNData();
tmp_from.CopyFrom(*tmp_mem);
MKLDNNStream::Get()->Submit();
@@ -1146,35 +1229,31 @@ inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext
CHECK(tmp_from.IsDefaultData());
CHECK(to.IsDefaultData());
TBlob tmp = to.data();
- ndarray::Copy<from_xpu, to_xpu>(tmp_from.data(), &tmp,
- from.ctx(), to.ctx(), ctx);
+ ndarray::Copy<from_xpu, to_xpu>(tmp_from.data(), &tmp, from.ctx(), to.ctx(), ctx);
}
#endif
}
// Make a copy of an NDArray based on storage type
-template<typename from_xpu, typename to_xpu>
-void CopyFromToImpl(const NDArray& from, const NDArray& to,
- RunContext rctx, const std::vector<Resource>& requested) {
+template <typename from_xpu, typename to_xpu>
+void CopyFromToImpl(const NDArray& from,
+ const NDArray& to,
+ RunContext rctx,
+ const std::vector<Resource>& requested) {
using namespace std;
using namespace mshadow;
// if storage type doesn't match, cast the storage first
const NDArrayStorageType from_stype = from.storage_type();
- const NDArrayStorageType to_stype = to.storage_type();
- CHECK(from_stype == kDefaultStorage
- || to_stype == kDefaultStorage
- || from_stype == to_stype)
- << "Copying ndarray of stype = " << from_stype
- << " to stype = " << to_stype << " is not supported";
+ const NDArrayStorageType to_stype = to.storage_type();
+ CHECK(from_stype == kDefaultStorage || to_stype == kDefaultStorage || from_stype == to_stype)
+ << "Copying ndarray of stype = " << from_stype << " to stype = " << to_stype
+ << " is not supported";
const Context from_ctx = from.ctx();
- const Context to_ctx = to.ctx();
- bool is_train = Imperative::Get()->is_training();
-
- OpContext opctx{Imperative::Get()->is_recording(),
- is_train,
- rctx,
- engine::CallbackOnComplete(),
- requested};
+ const Context to_ctx = to.ctx();
+ bool is_train = Imperative::Get()->is_training();
+
+ OpContext opctx{
+ Imperative::Get()->is_recording(), is_train, rctx, engine::CallbackOnComplete(), requested};
if (from_ctx == to_ctx && from_stype != to_stype) {
// same ctx, different stypes, use cast op directly without copying
common::CastStorageDispatch<from_xpu>(opctx, from, to);
@@ -1182,7 +1261,7 @@ void CopyFromToImpl(const NDArray& from, const NDArray& to,
NDArray casted_nd; // an intermediate result before copying from to to
if (from_stype == to_stype) {
casted_nd = from; // same stype, no need to cast from
- } else { // different stypes on different ctx needs an temporary casted_nd
+ } else { // different stypes on different ctx needs an temporary casted_nd
const mxnet::TShape& shape = from.shape();
if (to_stype == kDefaultStorage) {
casted_nd = NDArray(shape, from_ctx);
@@ -1213,21 +1292,21 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_op
CHECK(from.shape() == to.shape())
<< "operands shape mismatch "
<< "from.shape = " << from.shape() << " to.shape=" << to.shape();
- CHECK(!mxnet::op::shape_is_none(from.shape()))
- << "source operands have undefined shape";
+ CHECK(!mxnet::op::shape_is_none(from.shape())) << "source operands have undefined shape";
// zero-size array, no need to copy
if (from.shape().Size() == 0U) {
return;
}
// important: callback must always capture by value
const Context from_ctx = from.ctx();
- const int a = from_ctx.dev_mask();
- const int b = to.ctx().dev_mask();
+ const int a = from_ctx.dev_mask();
+ const int b = to.ctx().dev_mask();
std::vector<Engine::VarHandle> const_vars;
- if (from.var() != to.var()) const_vars.push_back(from.var());
+ if (from.var() != to.var())
+ const_vars.push_back(from.var());
const NDArrayStorageType from_stype = from.storage_type();
- const NDArrayStorageType to_stype = to.storage_type();
+ const NDArrayStorageType to_stype = to.storage_type();
std::vector<Engine::VarHandle> mutable_vars(1, to.var());
@@ -1250,8 +1329,8 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_op
// request temp resource if cast_storage performs on GPU
if (a == gpu::kDevMask) {
- Resource rsc = ResourceManager::Get()->Request(from_ctx,
- ResourceRequest(ResourceRequest::kTempSpace));
+ Resource rsc =
+ ResourceManager::Get()->Request(from_ctx, ResourceRequest(ResourceRequest::kTempSpace));
requested.push_back(rsc);
mutable_vars.push_back(rsc.var);
}
@@ -1259,38 +1338,57 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_op
if (a == cpu::kDevMask && b == cpu::kDevMask) {
Engine::Get()->PushAsync(
- [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- CopyFromToImpl<cpu, cpu>(from, to, ctx, requested);
- on_complete();
- }, from.ctx(), const_vars, mutable_vars,
- FnProperty::kNormal, priority, "CopyCPU2CPU");
+ [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ CopyFromToImpl<cpu, cpu>(from, to, ctx, requested);
+ on_complete();
+ },
+ from.ctx(),
+ const_vars,
+ mutable_vars,
+ FnProperty::kNormal,
+ priority,
+ "CopyCPU2CPU");
} else {
#if MXNET_USE_CUDA
if (a == cpu::kDevMask && b == gpu::kDevMask) {
Engine::Get()->PushAsync(
- [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- CopyFromToImpl<cpu, gpu>(from, to, ctx, requested);
- ctx.get_stream<gpu>()->Wait();
- on_complete();
- }, to.ctx(), const_vars, mutable_vars,
- FnProperty::kCopyToGPU, priority, "CopyCPU2GPU");
+ [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ CopyFromToImpl<cpu, gpu>(from, to, ctx, requested);
+ ctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ to.ctx(),
+ const_vars,
+ mutable_vars,
+ FnProperty::kCopyToGPU,
+ priority,
+ "CopyCPU2GPU");
} else if (a == gpu::kDevMask && b == cpu::kDevMask) {
Engine::Get()->PushAsync(
- [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- CopyFromToImpl<gpu, cpu>(from, to, ctx, requested);
- ctx.get_stream<gpu>()->Wait();
- on_complete();
- }, from.ctx(), const_vars, mutable_vars,
- FnProperty::kCopyFromGPU, priority, "CopyGPU2CPU");
+ [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ CopyFromToImpl<gpu, cpu>(from, to, ctx, requested);
+ ctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ from.ctx(),
+ const_vars,
+ mutable_vars,
+ FnProperty::kCopyFromGPU,
+ priority,
+ "CopyGPU2CPU");
} else if (a == gpu::kDevMask && b == gpu::kDevMask) {
Engine::Get()->PushAsync(
- [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
- CopyFromToImpl<gpu, gpu>(from, to, ctx, requested);
- ctx.get_stream<gpu>()->Wait();
- on_complete();
- }, from.ctx(), const_vars, mutable_vars,
- from.dtype() != to.dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
- priority, is_opr ? "_copyto_GPU2GPU" : "CopyGPU2GPU");
+ [from, to, requested](RunContext ctx, Engine::CallbackOnComplete on_complete) {
+ CopyFromToImpl<gpu, gpu>(from, to, ctx, requested);
+ ctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ from.ctx(),
+ const_vars,
+ mutable_vars,
+ from.dtype() != to.dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
+ priority,
+ is_opr ? "_copyto_GPU2GPU" : "CopyGPU2GPU");
} else {
LOG(FATAL) << "unknown device mask";
}
@@ -1300,26 +1398,22 @@ void CopyFromTo(const NDArray& from, const NDArray& to, int priority, bool is_op
}
}
-
-void CopyFromTo(const NDArray& from, const NDArray *to, int priority) {
+void CopyFromTo(const NDArray& from, const NDArray* to, int priority) {
CopyFromTo(from, *to, priority);
}
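A sketch of the asynchronous copy entry point above; shapes, contexts and the priority are illustrative:

  NDArray src(mxnet::TShape(mshadow::Shape1(8)), mxnet::Context::CPU());
  NDArray dst(mxnet::TShape(mshadow::Shape1(8)), mxnet::Context::CPU());
  CopyFromTo(src, dst, /*priority=*/0);  // pushed to the engine, returns immediately
  dst.WaitToRead();                      // block until the copy has finished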
-void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priority) {
+void ElementwiseSum(const std::vector<NDArray>& source, NDArray* out, int priority) {
std::vector<Engine::VarHandle> const_vars;
const_vars.reserve(source.size());
for (const auto& source_array : source) {
if (source_array.var() != out->var()) {
const_vars.push_back(source_array.var());
}
- CHECK_EQ(source_array.shape() , out->shape())
- << "operands shape mismatch";
+ CHECK_EQ(source_array.shape(), out->shape()) << "operands shape mismatch";
if (out->ctx().dev_mask() == Context::kCPU) {
- CHECK_EQ(source_array.ctx().dev_mask(), Context::kCPU)
- << "operands context mismatch";
+ CHECK_EQ(source_array.ctx().dev_mask(), Context::kCPU) << "operands context mismatch";
} else {
- CHECK_EQ(source_array.ctx(), out->ctx())
- << "operands context mismatch";
+ CHECK_EQ(source_array.ctx(), out->ctx()) << "operands context mismatch";
}
}
// important: callback must always capture by value
@@ -1330,67 +1424,84 @@ void ElementwiseSum(const std::vector<NDArray> &source, NDArray *out, int priori
if (stype == kDefaultStorage) {
switch (out->ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([source, ret](RunContext ctx) {
- std::vector<TBlob> source_tblob(source.size());
- for (size_t i = 0; i < source.size(); ++i) {
- source_tblob[i] = source[i].data();
- }
- TBlob tmp = ret.data();
- ndarray::ElementwiseSum<cpu>(source_tblob, &tmp, ctx);
- }, out->ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, priority, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [source, ret](RunContext ctx) {
+ std::vector<TBlob> source_tblob(source.size());
+ for (size_t i = 0; i < source.size(); ++i) {
+ source_tblob[i] = source[i].data();
+ }
+ TBlob tmp = ret.data();
+ ndarray::ElementwiseSum<cpu>(source_tblob, &tmp, ctx);
+ },
+ out->ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ priority,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([source, ret](RunContext ctx) {
- std::vector<TBlob> source_tblob(source.size());
- for (size_t i = 0; i < source.size(); ++i) {
- source_tblob[i] = source[i].data();
- }
- TBlob tmp = ret.data();
- ndarray::ElementwiseSum<gpu>(source_tblob, &tmp, ctx);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, out->ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, priority, "DenseElementwiseSum");
+ Engine::Get()->PushSync(
+ [source, ret](RunContext ctx) {
+ std::vector<TBlob> source_tblob(source.size());
+ for (size_t i = 0; i < source.size(); ++i) {
+ source_tblob[i] = source[i].data();
+ }
+ TBlob tmp = ret.data();
+ ndarray::ElementwiseSum<gpu>(source_tblob, &tmp, ctx);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ out->ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ priority,
+ "DenseElementwiseSum");
break;
}
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
} else if (stype == kRowSparseStorage) {
- Resource rsc = ResourceManager::Get()->Request(ret.ctx(),
- ResourceRequest(ResourceRequest::kTempSpace));
+ Resource rsc =
+ ResourceManager::Get()->Request(ret.ctx(), ResourceRequest(ResourceRequest::kTempSpace));
Engine::Get()->PushSync(
- [source, ret, rsc](RunContext rctx) {
- NDArray result = ret;
- switch (ret.ctx().dev_mask()) {
- case cpu::kDevMask: {
- mxnet::ndarray::ElementwiseSum(rctx.get_stream<cpu>(), rsc, source, &result);
- break;
- }
+ [source, ret, rsc](RunContext rctx) {
+ NDArray result = ret;
+ switch (ret.ctx().dev_mask()) {
+ case cpu::kDevMask: {
+ mxnet::ndarray::ElementwiseSum(rctx.get_stream<cpu>(), rsc, source, &result);
+ break;
+ }
#if MXNET_USE_CUDA
- case gpu::kDevMask: {
- mxnet::ndarray::ElementwiseSum(rctx.get_stream<gpu>(), rsc, source, &result);
- // wait for GPU operations to complete
- rctx.get_stream<gpu>()->Wait();
- break;
- }
+ case gpu::kDevMask: {
+ mxnet::ndarray::ElementwiseSum(rctx.get_stream<gpu>(), rsc, source, &result);
+ // wait for GPU operations to complete
+ rctx.get_stream<gpu>()->Wait();
+ break;
+ }
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
- }
- }, ret.ctx(), const_vars, {ret.var(), rsc.var},
- FnProperty::kNormal, priority, "RowSparseElementwiseSum");
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ }
+ },
+ ret.ctx(),
+ const_vars,
+ {ret.var(), rsc.var},
+ FnProperty::kNormal,
+ priority,
+ "RowSparseElementwiseSum");
} else {
LOG(FATAL) << "Not implemented for storage_type " << common::stype_string(stype);
}
}
-void ClipOp(const NDArray &src,
- const real_t &a_min, const real_t &a_max,
- NDArray *out) {
+void ClipOp(const NDArray& src, const real_t& a_min, const real_t& a_max, NDArray* out) {
if (out->is_none()) {
*out = NDArray(src.shape(), src.ctx(), true, src.dtype());
} else {
@@ -1399,99 +1510,123 @@ void ClipOp(const NDArray &src,
}
NDArray ret = *out;
std::vector<Engine::VarHandle> const_vars;
- if (src.var() != ret.var()) const_vars.push_back(src.var());
+ if (src.var() != ret.var())
+ const_vars.push_back(src.var());
switch (src.ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([src, a_min, a_max, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::EvalClip<cpu>(src.data(), a_min, a_max, &tmp, ctx);
- }, src.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [src, a_min, a_max, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::EvalClip<cpu>(src.data(), a_min, a_max, &tmp, ctx);
+ },
+ src.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
- #if MXNET_USE_CUDA
+#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([src, a_min, a_max, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::EvalClip<gpu>(src.data(), a_min, a_max, &tmp, ctx);
- }, src.ctx(), const_vars, {ret.var()},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [src, a_min, a_max, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::EvalClip<gpu>(src.data(), a_min, a_max, &tmp, ctx);
+ },
+ src.ctx(),
+ const_vars,
+ {ret.var()},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
- #endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+#endif
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
-template<typename Distribution>
-void SampleOP(const real_t &a,
- const real_t &b,
- NDArray *out) {
+template <typename Distribution>
+void SampleOP(const real_t& a, const real_t& b, NDArray* out) {
CHECK(!out->is_none());
- Resource resource = ResourceManager::Get()->Request(
- out->ctx(), ResourceRequest::kRandom);
+ Resource resource = ResourceManager::Get()->Request(out->ctx(), ResourceRequest::kRandom);
// important: callback must always capture by value
NDArray ret = *out;
// redirect everything to mshadow operations
switch (out->ctx().dev_mask()) {
case cpu::kDevMask: {
- Engine::Get()->PushSync([a, b, resource, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::EvalRandom<cpu, Distribution>(a, b, resource, &tmp, ctx);
- }, out->ctx(), {}, {ret.var(), resource.var},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [a, b, resource, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::EvalRandom<cpu, Distribution>(a, b, resource, &tmp, ctx);
+ },
+ out->ctx(),
+ {},
+ {ret.var(), resource.var},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#if MXNET_USE_CUDA
case gpu::kDevMask: {
- Engine::Get()->PushSync([a, b, resource, ret](RunContext ctx) {
- TBlob tmp = ret.data();
- ndarray::EvalRandom<gpu, Distribution>(a, b, resource, &tmp, ctx);
- // Wait GPU kernel to complete
- ctx.get_stream<gpu>()->Wait();
- }, out->ctx(), {}, {ret.var(), resource.var},
- FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME);
+ Engine::Get()->PushSync(
+ [a, b, resource, ret](RunContext ctx) {
+ TBlob tmp = ret.data();
+ ndarray::EvalRandom<gpu, Distribution>(a, b, resource, &tmp, ctx);
+ // Wait GPU kernel to complete
+ ctx.get_stream<gpu>()->Wait();
+ },
+ out->ctx(),
+ {},
+ {ret.var(), resource.var},
+ FnProperty::kNormal,
+ 0,
+ PROFILER_MESSAGE_FUNCNAME);
break;
}
#endif
- default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+ default:
+ LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
}
}
-void SampleUniform(real_t begin, real_t end, NDArray *out) {
+void SampleUniform(real_t begin, real_t end, NDArray* out) {
SampleOP<ndarray::UniformDistribution>(begin, end, out);
}
-void SampleGaussian(real_t mu, real_t sigma, NDArray *out) {
+void SampleGaussian(real_t mu, real_t sigma, NDArray* out) {
SampleOP<ndarray::GaussianDistribution>(mu, sigma, out);
}
-void SampleExponential(real_t lambda, NDArray *out) {
- if ( out->ctx().dev_mask() != cpu::kDevMask ) {
- LOG(FATAL) <<"exponential sampling only valid on cpu";
+void SampleExponential(real_t lambda, NDArray* out) {
+ if (out->ctx().dev_mask() != cpu::kDevMask) {
+ LOG(FATAL) << "exponential sampling only valid on cpu";
}
real_t dummy;
SampleOP<ndarray::ExponentialDistribution>(lambda, dummy, out);
}
-void SamplePoisson(real_t lambda, NDArray *out) {
- if ( out->ctx().dev_mask() != cpu::kDevMask ) {
- LOG(FATAL) <<"poisson sampling only valid on cpu";
+void SamplePoisson(real_t lambda, NDArray* out) {
+ if (out->ctx().dev_mask() != cpu::kDevMask) {
+ LOG(FATAL) << "poisson sampling only valid on cpu";
}
real_t dummy;
SampleOP<ndarray::PoissonDistribution>(lambda, dummy, out);
}
-void SampleNegBinomial(int32_t k, real_t p, NDArray *out) {
- if ( out->ctx().dev_mask() != cpu::kDevMask ) {
- LOG(FATAL) <<"negative binomial sampling only valid on cpu";
+void SampleNegBinomial(int32_t k, real_t p, NDArray* out) {
+ if (out->ctx().dev_mask() != cpu::kDevMask) {
+ LOG(FATAL) << "negative binomial sampling only valid on cpu";
}
SampleOP<ndarray::NegBinomialDistribution>(k, p, out);
}
-void SampleGenNegBinomial(real_t mu, real_t alpha, NDArray *out) {
- if ( out->ctx().dev_mask() != cpu::kDevMask ) {
- LOG(FATAL) <<"negative binomial sampling only valid on cpu";
+void SampleGenNegBinomial(real_t mu, real_t alpha, NDArray* out) {
+ if (out->ctx().dev_mask() != cpu::kDevMask) {
+ LOG(FATAL) << "negative binomial sampling only valid on cpu";
}
SampleOP<ndarray::GenNegBinomialDistribution>(mu, alpha, out);
}
@@ -1504,92 +1639,88 @@ void RandomSeed(Context ctx, uint32_t seed) {
ResourceManager::Get()->SeedRandom(ctx, seed);
}
-template<typename OP>
-inline NDArray BinaryOpRet(const NDArray &lhs,
- const NDArray &rhs) {
+template <typename OP>
+inline NDArray BinaryOpRet(const NDArray& lhs, const NDArray& rhs) {
NDArray ret;
BinaryOpKernel<OP>(lhs, rhs, &ret);
return ret;
}
-template<typename OP, bool reverse>
-inline NDArray ScalarOpRet(const NDArray &lhs,
- const real_t &rhs) {
+template <typename OP, bool reverse>
+inline NDArray ScalarOpRet(const NDArray& lhs, const real_t& rhs) {
NDArray ret;
ScalarOp<OP, reverse>(lhs, rhs, &ret);
return ret;
}
-template<typename OP>
-inline NDArray &BinaryOpApply(NDArray *dst,
- const NDArray &src) {
+template <typename OP>
+inline NDArray& BinaryOpApply(NDArray* dst, const NDArray& src) {
BinaryOpKernel<OP>(*dst, src, dst);
return *dst;
}
-template<typename OP>
-inline NDArray &ScalarOpApply(NDArray *dst,
- const real_t &src) {
+template <typename OP>
+inline NDArray& ScalarOpApply(NDArray* dst, const real_t& src) {
ScalarOp<OP, false>(*dst, src, dst);
return *dst;
}
// Binary
-NDArray operator+(const NDArray &lhs, const NDArray &rhs) {
+NDArray operator+(const NDArray& lhs, const NDArray& rhs) {
return BinaryOpRet<ndarray::Plus>(lhs, rhs);
}
-NDArray operator-(const NDArray &lhs, const NDArray &rhs) {
+NDArray operator-(const NDArray& lhs, const NDArray& rhs) {
return BinaryOpRet<ndarray::Minus>(lhs, rhs);
}
-NDArray operator*(const NDArray &lhs, const NDArray &rhs) {
+NDArray operator*(const NDArray& lhs, const NDArray& rhs) {
return BinaryOpRet<ndarray::Mul>(lhs, rhs);
}
-NDArray operator/(const NDArray &lhs, const NDArray &rhs) {
+NDArray operator/(const NDArray& lhs, const NDArray& rhs) {
return BinaryOpRet<ndarray::Div>(lhs, rhs);
}
// Scalar
-NDArray operator+(const NDArray &lhs, const real_t &rhs) {
+NDArray operator+(const NDArray& lhs, const real_t& rhs) {
return ScalarOpRet<ndarray::Plus, false>(lhs, rhs);
}
-NDArray operator-(const NDArray &lhs, const real_t &rhs) {
+NDArray operator-(const NDArray& lhs, const real_t& rhs) {
return ScalarOpRet<ndarray::Minus, false>(lhs, rhs);
}
-NDArray operator*(const NDArray &lhs, const real_t &rhs) {
+NDArray operator*(const NDArray& lhs, const real_t& rhs) {
return ScalarOpRet<ndarray::Mul, false>(lhs, rhs);
}
-NDArray operator/(const NDArray &lhs, const real_t &rhs) {
+NDArray operator/(const NDArray& lhs, const real_t& rhs) {
return ScalarOpRet<ndarray::Div, false>(lhs, rhs);
}
// Binary
-NDArray &NDArray::operator=(real_t scalar) {
+NDArray& NDArray::operator=(real_t scalar) {
SetValueOp(scalar, this);
return *this;
}
-NDArray &NDArray::operator+=(const NDArray &src) {
+NDArray& NDArray::operator+=(const NDArray& src) {
return BinaryOpApply<ndarray::Plus>(this, src);
}
-NDArray &NDArray::operator-=(const NDArray &src) {
+NDArray& NDArray::operator-=(const NDArray& src) {
return BinaryOpApply<ndarray::Minus>(this, src);
}
-NDArray &NDArray::operator*=(const NDArray &src) {
+NDArray& NDArray::operator*=(const NDArray& src) {
return BinaryOpApply<ndarray::Mul>(this, src);
}
-NDArray &NDArray::operator/=(const NDArray &src) {
+NDArray& NDArray::operator/=(const NDArray& src) {
return BinaryOpApply<ndarray::Div>(this, src);
}
// Scalar
-NDArray &NDArray::operator+=(const real_t &src) {
+NDArray& NDArray::operator+=(const real_t& src) {
return ScalarOpApply<ndarray::Plus>(this, src);
}
-NDArray &NDArray::operator-=(const real_t &src) {
+NDArray& NDArray::operator-=(const real_t& src) {
return ScalarOpApply<ndarray::Minus>(this, src);
}
-NDArray &NDArray::operator*=(const real_t &src) {
+NDArray& NDArray::operator*=(const real_t& src) {
return ScalarOpApply<ndarray::Mul>(this, src);
}
-NDArray &NDArray::operator/=(const real_t &src) {
+NDArray& NDArray::operator/=(const real_t& src) {
return ScalarOpApply<ndarray::Div>(this, src);
}
@@ -1603,10 +1734,11 @@ static const uint32_t NDARRAY_V2_MAGIC = 0xF993fac9;
// The ndarray must be saved and loaded within np shape semantics.
static const uint32_t NDARRAY_V3_MAGIC = 0xF993faca;
-void NDArray::Save(dmlc::Stream *strm) const {
+void NDArray::Save(dmlc::Stream* strm) const {
if (Imperative::Get()->is_np_shape()) {
CHECK_EQ(storage_type(), kDefaultStorage)
- << "only allow serializing ndarray of default storage type in np shape semantics";
+ << "only allow serializing ndarray of default storage type in np shape "
+ "semantics";
strm->Write(NDARRAY_V3_MAGIC);
} else {
// write magic number to mark this version
@@ -1626,7 +1758,8 @@ void NDArray::Save(dmlc::Stream *strm) const {
// save shape
shape_.Save(strm);
- if (is_none()) return;
+ if (is_none())
+ return;
// save context
Context ctx = this->ctx();
@@ -1679,66 +1812,82 @@ void NDArray::Save(dmlc::Stream *strm) const {
}
}
-bool LegacyTShapeLoad(dmlc::Stream *strm, mxnet::TShape *shape, const uint32_t magic) {
+bool LegacyTShapeLoad(dmlc::Stream* strm, mxnet::TShape* shape, const uint32_t magic) {
switch (magic) {
case NDARRAY_V1_MAGIC:
return shape->Load(strm);
default:
// meet legacy mxnet::TShape, magic is ndim here
uint32_t ndim = magic;
- *shape = mxnet::TShape(ndim, -1);
+ *shape = mxnet::TShape(ndim, -1);
std::vector<uint32_t> buffer(ndim);
size_t nread = ndim * sizeof(uint32_t);
- if (strm->Read(buffer.data(), nread) != nread) return false;
+ if (strm->Read(buffer.data(), nread) != nread)
+ return false;
nnvm::ShapeTypeCast(buffer.begin(), buffer.end(), shape->begin());
return true;
}
}
-bool NDArray::LegacyLoad(dmlc::Stream *strm, const uint32_t magic) {
+bool NDArray::LegacyLoad(dmlc::Stream* strm, const uint32_t magic) {
// load shape
mxnet::TShape shape;
- if (!LegacyTShapeLoad(strm, &shape, magic)) return false;
+ if (!LegacyTShapeLoad(strm, &shape, magic))
+ return false;
if (mxnet::op::shape_is_none(shape)) {
- *this = NDArray(); return true;
+ *this = NDArray();
+ return true;
}
// load context
Context ctx;
- if (!ctx.Load(strm)) return false;
+ if (!ctx.Load(strm))
+ return false;
// load type flag
int32_t type_flag;
- if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag)) return false;
+ if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag))
+ return false;
// load data into CPU
NDArray temp(shape, Context::CPU(), false, type_flag);
- TBlob load_data = temp.data();
+ TBlob load_data = temp.data();
size_t type_size = mshadow::mshadow_sizeof(type_flag);
- size_t nread = type_size * shape.Size();
+ size_t nread = type_size * shape.Size();
- if (strm->Read(load_data.dptr_, nread) != nread) return false;
+ if (strm->Read(load_data.dptr_, nread) != nread)
+ return false;
if (ctx.dev_mask() == cpu::kDevMask) {
- *this = std::move(temp); return true;
+ *this = std::move(temp);
+ return true;
} else {
#if MXNET_USE_CUDA
- *this = temp.Copy(ctx); return true;
+ *this = temp.Copy(ctx);
+ return true;
#else
- *this = std::move(temp); return true;
+ *this = std::move(temp);
+ return true;
#endif
}
}
-bool NDArray::Load(dmlc::Stream *strm) {
+bool NDArray::Load(dmlc::Stream* strm) {
uint32_t magic;
- if (strm->Read(&magic, sizeof(uint32_t)) != sizeof(uint32_t)) return false;
+ if (strm->Read(&magic, sizeof(uint32_t)) != sizeof(uint32_t))
+ return false;
if (magic == NDARRAY_V3_MAGIC) {
CHECK(Imperative::Get()->is_np_shape())
- << "ndarray was saved in np shape semantics, must be loaded in the same semantics."
- " Please turn on np shape semantics in Python using `with np_shape(True)`"
- " or decorator `use_np_shape` to scope the code of loading the ndarray.";
+ << "ndarray was saved in np shape semantics, must be loaded in the "
+ "same semantics."
+ " Please turn on np shape semantics in Python using `with "
+ "np_shape(True)`"
+ " or decorator `use_np_shape` to scope the code of loading the "
+ "ndarray.";
} else {
- // when the flag is global on, skip the check since it would be always global on.
+ // when the flag is global on, skip the check since it would be always
+ // global on.
CHECK(Imperative::Get()->is_np_shape() == GlobalOn || !Imperative::Get()->is_np_shape())
- << "ndarray was not saved in np shape semantics, but being loaded in np shape semantics."
- " Please turn off np shape semantics in Python using `with np_shape(False)`"
+ << "ndarray was not saved in np shape semantics, but being loaded in "
+ "np shape semantics."
+ " Please turn off np shape semantics in Python using `with "
+ "np_shape(False)`"
" to scope the code of loading the ndarray.";
}
if (magic != NDARRAY_V2_MAGIC && magic != NDARRAY_V3_MAGIC) {
@@ -1747,38 +1896,45 @@ bool NDArray::Load(dmlc::Stream *strm) {
// load storage type
int32_t stype;
- if (strm->Read(&stype, sizeof(stype)) != sizeof(stype)) return false;
+ if (strm->Read(&stype, sizeof(stype)) != sizeof(stype))
+ return false;
if (Imperative::Get()->is_np_shape()) {
CHECK_EQ(stype, kDefaultStorage)
- << "only allow deserializing ndarray of default storage type in np shape semantics";
+ << "only allow deserializing ndarray of default storage type in np "
+ "shape semantics";
}
const int32_t nad = num_aux_data(static_cast<NDArrayStorageType>(stype));
// load storage shape
mxnet::TShape sshape;
if (nad > 0) {
- if (!sshape.Load(strm)) return false;
+ if (!sshape.Load(strm))
+ return false;
}
// load shape
mxnet::TShape shape;
- if (!shape.Load(strm)) return false;
+ if (!shape.Load(strm))
+ return false;
if (Imperative::Get()->is_np_shape()) {
if (!shape_is_known(shape)) {
*this = NDArray();
return true;
}
} else if (shape.ndim() == 0) {
- *this = NDArray(); return true;
+ *this = NDArray();
+ return true;
}
// load context
Context ctx;
- if (!ctx.Load(strm)) return false;
+ if (!ctx.Load(strm))
+ return false;
// load type flag
int32_t type_flag;
- if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag)) return false;
+ if (strm->Read(&type_flag, sizeof(type_flag)) != sizeof(type_flag))
+ return false;
// load aux_types and aux_shapes
std::vector<int32_t> aux_types;
@@ -1788,9 +1944,11 @@ bool NDArray::Load(dmlc::Stream *strm) {
aux_shapes.resize(nad);
for (int i = 0; i < nad; ++i) {
// load aux_type(i)
- if (strm->Read(&aux_types[i], sizeof(aux_types[i])) != sizeof(aux_types[i])) return false;
+ if (strm->Read(&aux_types[i], sizeof(aux_types[i])) != sizeof(aux_types[i]))
+ return false;
// load aux_shapes(i)
- if (!aux_shapes[i].Load(strm)) return false;
+ if (!aux_shapes[i].Load(strm))
+ return false;
}
}
@@ -1799,39 +1957,50 @@ bool NDArray::Load(dmlc::Stream *strm) {
if (0 == nad) {
temp = NDArray(shape, Context::CPU(), false, type_flag);
} else {
- temp = NDArray(static_cast<NDArrayStorageType>(stype), shape,
- Context::CPU(), false, type_flag,
- aux_types, aux_shapes, sshape);
+ temp = NDArray(static_cast<NDArrayStorageType>(stype),
+ shape,
+ Context::CPU(),
+ false,
+ type_flag,
+ aux_types,
+ aux_shapes,
+ sshape);
}
// load data
- TBlob load_data = temp.data();
+ TBlob load_data = temp.data();
size_t type_size = mshadow::mshadow_sizeof(type_flag);
- size_t nread = type_size * load_data.Size();
- if (strm->Read(load_data.dptr_, nread) != nread) return false;
+ size_t nread = type_size * load_data.Size();
+ if (strm->Read(load_data.dptr_, nread) != nread)
+ return false;
// load aux_data
if (nad > 0) {
for (int i = 0; i < nad; ++i) {
load_data = temp.aux_data(i);
type_size = mshadow::mshadow_sizeof(load_data.type_flag_);
- nread = type_size * load_data.Size();
- if (strm->Read(load_data.dptr_, nread) != nread) return false;
+ nread = type_size * load_data.Size();
+ if (strm->Read(load_data.dptr_, nread) != nread)
+ return false;
}
}
if (ctx.dev_mask() == cpu::kDevMask) {
- *this = std::move(temp); return true;
+ *this = std::move(temp);
+ return true;
} else {
#if MXNET_USE_CUDA
int device_count = -1;
cudaGetDeviceCount(&device_count);
if (device_count > 0) {
- *this = temp.Copy(ctx); return true;
+ *this = temp.Copy(ctx);
+ return true;
} else {
- *this = std::move(temp); return true;
+ *this = std::move(temp);
+ return true;
}
#else
- *this = std::move(temp); return true;
+ *this = std::move(temp);
+ return true;
#endif
}
}
@@ -1848,22 +2017,14 @@ void NDArray::Save(dmlc::Stream* fo,
fo->Write(names);
}
-void NDArray::Load(dmlc::Stream* fi,
- std::vector<NDArray>* data,
- std::vector<std::string>* keys) {
+void NDArray::Load(dmlc::Stream* fi, std::vector<NDArray>* data, std::vector<std::string>* keys) {
uint64_t header, reserved;
- CHECK(fi->Read(&header))
- << "Invalid NDArray file format";
- CHECK(fi->Read(&reserved))
- << "Invalid NDArray file format";
- CHECK(header == kMXAPINDArrayListMagic)
- << "Invalid NDArray file format";
- CHECK(fi->Read(data))
- << "Invalid NDArray file format";
- CHECK(fi->Read(keys))
- << "Invalid NDArray file format";
- CHECK(keys->size() == 0 || keys->size() == data->size())
- << "Invalid NDArray file format";
+ CHECK(fi->Read(&header)) << "Invalid NDArray file format";
+ CHECK(fi->Read(&reserved)) << "Invalid NDArray file format";
+ CHECK(header == kMXAPINDArrayListMagic) << "Invalid NDArray file format";
+ CHECK(fi->Read(data)) << "Invalid NDArray file format";
+ CHECK(fi->Read(keys)) << "Invalid NDArray file format";
+ CHECK(keys->size() == 0 || keys->size() == data->size()) << "Invalid NDArray file format";
}
NDArray NDArray::Copy(Context ctx) const {
@@ -1871,30 +2032,37 @@ NDArray NDArray::Copy(Context ctx) const {
if (kDefaultStorage == storage_type()) {
ret = NDArray(shape(), ctx, true, dtype_);
} else if (kUndefinedStorage != storage_type()) {
- ret = NDArray(storage_type(), shape(), ctx, true, dtype_,
- ptr_->aux_types, ptr_->aux_shapes, storage_shape());
+ ret = NDArray(storage_type(),
+ shape(),
+ ctx,
+ true,
+ dtype_,
+ ptr_->aux_types,
+ ptr_->aux_shapes,
+ storage_shape());
} else {
- LOG(FATAL) << "NDArray::Copy cannot copy undefined storage-type ndarray to ctx.dev_type="
+ LOG(FATAL) << "NDArray::Copy cannot copy undefined storage-type ndarray to "
+ "ctx.dev_type="
<< ctx.dev_type << ", ctx.dev_id=" << ctx.dev_id;
}
CopyFromTo(*this, ret);
return ret;
}
-void NDArray::SyncCopyFromCPU(const void *data, size_t size) const {
+void NDArray::SyncCopyFromCPU(const void* data, size_t size) const {
mxnet::TShape dshape = this->shape();
if (!features::is_enabled(features::INT64_TENSOR_SIZE)) {
- CHECK_LT(size, (int64_t{1} << 31) - 1) <<
- "[SyncCopyFromCPU] Size of tensor you are trying to allocate is larger than "
- "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
+ CHECK_LT(size, (int64_t{1} << 31) - 1)
+ << "[SyncCopyFromCPU] Size of tensor you are trying to allocate is "
+ "larger than "
+ "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
}
- CHECK_EQ(dshape.Size(), size)
- << "Memory size do not match";
+ CHECK_EQ(dshape.Size(), size) << "Memory size do not match";
// zero-size array, no need to copy
if (size == 0U) {
return;
}
- TBlob src((void*)data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*)
+ TBlob src((void*)data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*)
if (this->ctx().dev_mask() == cpu::kDevMask) {
this->WaitToWrite();
@@ -1904,15 +2072,19 @@ void NDArray::SyncCopyFromCPU(const void *data, size_t size) const {
} else {
#if MXNET_USE_CUDA
Engine::Get()->PushAsync(
- [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
- TBlob dst = this->data();
- ndarray::Copy<cpu, gpu>(src, &dst,
- Context::CPU(), this->ctx(), rctx);
- // Wait GPU kernel to complete
- rctx.get_stream<gpu>()->Wait();
- on_complete();
- }, this->ctx(), {}, {this->var()},
- FnProperty::kCopyToGPU, 0, "SyncCopyCPU2GPU");
+ [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+ TBlob dst = this->data();
+ ndarray::Copy<cpu, gpu>(src, &dst, Context::CPU(), this->ctx(), rctx);
+ // Wait GPU kernel to complete
+ rctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ this->ctx(),
+ {},
+ {this->var()},
+ FnProperty::kCopyToGPU,
+ 0,
+ "SyncCopyCPU2GPU");
this->WaitToRead();
#else
LOG(FATAL) << "GPU is not enabled";
@@ -1958,51 +2130,71 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) {
this->CheckAndAllocAuxData(j, src_shape);
}
}
- TBlob dst_data = (j >= 0? this->aux_data(j) : this->data());
+ TBlob dst_data = (j >= 0 ? this->aux_data(j) : this->data());
CHECK_LE(src_shape.Size(), dst_data.shape_.Size());
return dst_data;
};
if (src_dev_mask == cpu::kDevMask && dst_dev_mask == cpu::kDevMask) {
- Engine::Get()->PushSync([&](RunContext rctx) {
- const TBlob src_data = (i >= 0? src.aux_data(i) : src.data());
- TBlob dst_data = get_dst_data(src_data.shape_);
- ndarray::Copy<cpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
- }, this->ctx(), const_vars, {this->var()},
- FnProperty::kNormal, 0, "SyncCopyFromNDArrayCPU2CPU");
+ Engine::Get()->PushSync(
+ [&](RunContext rctx) {
+ const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
+ TBlob dst_data = get_dst_data(src_data.shape_);
+ ndarray::Copy<cpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
+ },
+ this->ctx(),
+ const_vars,
+ {this->var()},
+ FnProperty::kNormal,
+ 0,
+ "SyncCopyFromNDArrayCPU2CPU");
} else {
#if MXNET_USE_CUDA
if (src_dev_mask == cpu::kDevMask && dst_dev_mask == gpu::kDevMask) {
Engine::Get()->PushAsync(
- [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
- const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
- TBlob dst_data = get_dst_data(src_data.shape_);
- ndarray::Copy<cpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
- rctx.get_stream<gpu>()->Wait();
- on_complete();
- }, this->ctx(), const_vars, {this->var()},
- FnProperty::kCopyToGPU, 0, "SyncCopyFromNDArrayCPU2GPU");
+ [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+ const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
+ TBlob dst_data = get_dst_data(src_data.shape_);
+ ndarray::Copy<cpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
+ rctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ this->ctx(),
+ const_vars,
+ {this->var()},
+ FnProperty::kCopyToGPU,
+ 0,
+ "SyncCopyFromNDArrayCPU2GPU");
} else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == cpu::kDevMask) {
Engine::Get()->PushAsync(
- [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
- const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
- TBlob dst_data = get_dst_data(src_data.shape_);
- ndarray::Copy<gpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
- rctx.get_stream<gpu>()->Wait();
- on_complete();
- }, src.ctx(), const_vars, {this->var()},
- FnProperty::kCopyFromGPU, 0, "SyncCopyFromNDArrayGPU2CPU");
+ [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+ const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
+ TBlob dst_data = get_dst_data(src_data.shape_);
+ ndarray::Copy<gpu, cpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
+ rctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ src.ctx(),
+ const_vars,
+ {this->var()},
+ FnProperty::kCopyFromGPU,
+ 0,
+ "SyncCopyFromNDArrayGPU2CPU");
} else if (src_dev_mask == gpu::kDevMask && dst_dev_mask == gpu::kDevMask) {
Engine::Get()->PushAsync(
- [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
- const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
- TBlob dst_data = get_dst_data(src_data.shape_);
- ndarray::Copy<gpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
- rctx.get_stream<gpu>()->Wait();
- on_complete();
- }, this->ctx(), const_vars, {this->var()},
- src.dtype() != this->dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
- 0, "SyncCopyFromNDArrayGPU2GPU");
+ [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+ const TBlob src_data = (i >= 0 ? src.aux_data(i) : src.data());
+ TBlob dst_data = get_dst_data(src_data.shape_);
+ ndarray::Copy<gpu, gpu>(src_data, &dst_data, src.ctx(), this->ctx(), rctx);
+ rctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ this->ctx(),
+ const_vars,
+ {this->var()},
+ src.dtype() != this->dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU,
+ 0,
+ "SyncCopyFromNDArrayGPU2GPU");
} else {
LOG(FATAL) << "unknown device mask";
}
@@ -2021,20 +2213,20 @@ void NDArray::SyncCopyFromNDArray(const NDArray& src, int i, int j) {
WaitToRead();
}
-void NDArray::SyncCopyToCPU(void *data, size_t size) const {
+void NDArray::SyncCopyToCPU(void* data, size_t size) const {
mxnet::TShape dshape = this->shape();
if (!features::is_enabled(features::INT64_TENSOR_SIZE)) {
- CHECK_LT(size, (int64_t{1} << 31) - 1) <<
- "[SyncCopyToCPU] Size of tensor you are trying to allocate is larger than "
- "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
+ CHECK_LT(size, (int64_t{1} << 31) - 1)
+ << "[SyncCopyToCPU] Size of tensor you are trying to allocate is "
+ "larger than "
+ "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
}
- CHECK_EQ(dshape.Size(), size)
- << "Memory size do not match";
+ CHECK_EQ(dshape.Size(), size) << "Memory size do not match";
// zero-size array, no need to copy
if (size == 0U) {
return;
}
- TBlob dst(data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*)
+ TBlob dst(data, dshape, cpu::kDevMask, this->dtype_, 0); // NOLINT(*)
if (this->ctx().dev_mask() == cpu::kDevMask) {
this->WaitToRead();
@@ -2044,19 +2236,22 @@ void NDArray::SyncCopyToCPU(void *data, size_t size) const {
if (src.IsMKLDNNData())
src = this->Reorder2Default();
#endif
- ndarray::Copy<cpu, cpu>(src.data(), &dst,
- Context::CPU(), Context::CPU(), rctx);
+ ndarray::Copy<cpu, cpu>(src.data(), &dst, Context::CPU(), Context::CPU(), rctx);
} else {
#if MXNET_USE_CUDA
Engine::Get()->PushAsync(
- [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
- ndarray::Copy<gpu, cpu>(this->data(), &dst,
- this->ctx(), Context::CPU(), rctx);
- // Wait GPU kernel to complete
- rctx.get_stream<gpu>()->Wait();
- on_complete();
- }, this->ctx(), {this->var()}, {},
- FnProperty::kCopyFromGPU, 0, "SyncCopyGPU2CPU");
+ [&](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+ ndarray::Copy<gpu, cpu>(this->data(), &dst, this->ctx(), Context::CPU(), rctx);
+ // Wait GPU kernel to complete
+ rctx.get_stream<gpu>()->Wait();
+ on_complete();
+ },
+ this->ctx(),
+ {this->var()},
+ {},
+ FnProperty::kCopyFromGPU,
+ 0,
+ "SyncCopyGPU2CPU");
this->WaitToWrite();
#else
LOG(FATAL) << "GPU is not enabled";
@@ -2068,101 +2263,114 @@ void NDArray::SyncCheckFormat(const bool full_check) const {
int32_t err = kNormalErr;
TBlob err_cpu(&err, mshadow::Shape1(1), cpu::kDevMask, 0);
if (this->ctx().dev_mask() == cpu::kDevMask) {
- Engine::Get()->PushSync([&](RunContext rctx) {
- common::CheckFormatWrapper<cpu>(rctx, *this, err_cpu, full_check);
- }, this->ctx(), {this->var()}, {},
- FnProperty::kNormal, 0, "CheckFormat");
+ Engine::Get()->PushSync(
+ [&](RunContext rctx) { common::CheckFormatWrapper<cpu>(rctx, *this, err_cpu, full_check); },
+ this->ctx(),
+ {this->var()},
+ {},
+ FnProperty::kNormal,
+ 0,
+ "CheckFormat");
} else {
#if MXNET_USE_CUDA
- Engine::Get()->PushSync([&](RunContext rctx) {
- common::CheckFormatWrapper<gpu>(rctx, *this, err_cpu, full_check);
- rctx.get_stream<gpu>()->Wait();
- }, this->ctx(), {this->var()}, {},
- FnProperty::kNormal, 0, "CheckFormat");
+ Engine::Get()->PushSync(
+ [&](RunContext rctx) {
+ common::CheckFormatWrapper<gpu>(rctx, *this, err_cpu, full_check);
+ rctx.get_stream<gpu>()->Wait();
+ },
+ this->ctx(),
+ {this->var()},
+ {},
+ FnProperty::kNormal,
+ 0,
+ "CheckFormat");
#else
LOG(FATAL) << "GPU is not enabled";
#endif
}
this->WaitToWrite();
CHECK_NE(err, kCSRShapeErr) << "Shape mismatch of this csr NDArray";
- CHECK_NE(err, kCSRIndPtrErr)
- << "IndPtr of csr NDArray should be non-negative, in non-decreasing order, "
- << "start with 0, and end with value equal with size of indices.";
- CHECK_NE(err, kCSRIdxErr)
- << "Indices of csr NDArray should be non-negative, in ascending order per row "
- << " and less than the number of columns.";
+ CHECK_NE(err, kCSRIndPtrErr) << "IndPtr of csr NDArray should be non-negative, in non-decreasing "
+ "order, "
+ << "start with 0, and end with value equal with size of indices.";
+ CHECK_NE(err, kCSRIdxErr) << "Indices of csr NDArray should be non-negative, "
+ "in ascending order per row "
+ << " and less than the number of columns.";
CHECK_NE(err, kRSPShapeErr) << "Shape mismatch of this row_sparse NDArray";
- CHECK_NE(err, kRSPIdxErr)
- << "Indices of row_sparse NDArray should be non-negative, "
- << "less than the size of first dimension and in ascending order";
+ CHECK_NE(err, kRSPIdxErr) << "Indices of row_sparse NDArray should be non-negative, "
+ << "less than the size of first dimension and in ascending order";
CHECK_EQ(err, kNormalErr) << "Check the validity of this sparse NDArray";
}
#if MXNET_PREDICT_ONLY == 0
// register API function
// those with underscore will be registered at NDArray
-MXNET_REGISTER_NDARRAY_FUN(_set_value)
-.set_function(SetValueOp);
-
-
-MXNET_REGISTER_NDARRAY_FUN(_onehot_encode)
-.set_function(BinaryOp<ndarray::OneHotEncode>);
+MXNET_REGISTER_NDARRAY_FUN(_set_value).set_function(SetValueOp);
+MXNET_REGISTER_NDARRAY_FUN(_onehot_encode).set_function(BinaryOp<ndarray::OneHotEncode>);
MXNET_REGISTER_NDARRAY_FUN(fill_element_0index)
-.set_function(TernaryOp<ndarray::MatFillRowElem>)
-.describe("Fill one element of each line(row for python, column for R/Julia)"
-" in lhs according to index indicated by rhs and values indicated by mhs."
-" This function assume rhs uses 0-based index.");
+ .set_function(TernaryOp<ndarray::MatFillRowElem>)
+ .describe(
+ "Fill one element of each line(row for python, column for R/Julia)"
+ " in lhs according to index indicated by rhs and values indicated by "
+ "mhs."
+ " This function assume rhs uses 0-based index.");
// register API function
// those with underscore will be registered at NDArray
-void CopyFromToSimple(
- const nnvm::NodeAttrs& attrs,
- const OpContext& ctx,
- const std::vector<NDArray>& inputs,
- const std::vector<OpReqType>& req,
- const std::vector<NDArray>& outputs) {
+void CopyFromToSimple(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
CopyFromTo(inputs[0], outputs[0], 0, true);
}
// copy function is special
// that we need to remove kAcceptEmptyMutateTarget from it
NNVM_REGISTER_OP(_copyto)
-.add_alias("_npi_copyto")
-.set_num_inputs(1)
-.set_num_outputs(1)
-.set_attr<mxnet::FInferShape>("FInferShape", op::ElemwiseShape<1, 1>)
-.set_attr<nnvm::FInferType>("FInferType",
- [](const NodeAttrs& attrs, std::vector<int> *in_type, std::vector<int> *out_type) {
- return !op::type_is_none((*in_type)[0]) && !op::type_is_none((*out_type)[0]);
- })
-.set_attr<FInferStorageType>("FInferStorageType",
- [](const NodeAttrs& attrs,
- const int dev_mask,
- DispatchMode* dispatch_mode,
- std::vector<int>* in_attrs,
- std::vector<int>* out_attrs) {
- op::dispatch_mode_assign(dispatch_mode, DispatchMode::kFComputeEx);
- if (op::storage_type_is_none((*out_attrs)[0])) {
- (*out_attrs)[0] = (*in_attrs)[0];
- }
- return true;
- })
-.set_attr<FExecType>("FExecType", [](const NodeAttrs& attrs) {
- return ExecType::kCrossDeviceCopy;
- })
-.set_attr<nnvm::FGradient>("FGradient", op::ElemwiseGradUseNone{"_copyto"})
-.set_attr<bool>("TIsBackward", true)
-.set_attr<FComputeEx>("FComputeEx<cpu>", CopyFromToSimple)
-.set_attr<FComputeEx>("FComputeEx<gpu>", CopyFromToSimple)
-.add_argument("data", "NDArray", "input data");
-
-
-void Imdecode(NDArray *ret, NDArray mean, size_t index,
- size_t x0, size_t y0, size_t x1, size_t y1, size_t n_channels,
- size_t size, char *str_img) {
+ .add_alias("_npi_copyto")
+ .set_num_inputs(1)
+ .set_num_outputs(1)
+ .set_attr<mxnet::FInferShape>("FInferShape", op::ElemwiseShape<1, 1>)
+ .set_attr<nnvm::FInferType>(
+ "FInferType",
+ [](const NodeAttrs& attrs, std::vector<int>* in_type, std::vector<int>* out_type) {
+ return !op::type_is_none((*in_type)[0]) && !op::type_is_none((*out_type)[0]);
+ })
+ .set_attr<FInferStorageType>("FInferStorageType",
+ [](const NodeAttrs& attrs,
+ const int dev_mask,
+ DispatchMode* dispatch_mode,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
+ op::dispatch_mode_assign(dispatch_mode,
+ DispatchMode::kFComputeEx);
+ if (op::storage_type_is_none((*out_attrs)[0])) {
+ (*out_attrs)[0] = (*in_attrs)[0];
+ }
+ return true;
+ })
+ .set_attr<FExecType>("FExecType",
+ [](const NodeAttrs& attrs) { return ExecType::kCrossDeviceCopy; })
+ .set_attr<nnvm::FGradient>("FGradient", op::ElemwiseGradUseNone{"_copyto"})
+ .set_attr<bool>("TIsBackward", true)
+ .set_attr<FComputeEx>("FComputeEx<cpu>", CopyFromToSimple)
+ .set_attr<FComputeEx>("FComputeEx<gpu>", CopyFromToSimple)
+ .add_argument("data", "NDArray", "input data");
+
+void Imdecode(NDArray* ret,
+ NDArray mean,
+ size_t index,
+ size_t x0,
+ size_t y0,
+ size_t x1,
+ size_t y1,
+ size_t n_channels,
+ size_t size,
+ char* str_img) {
#if MXNET_USE_OPENCV
cv::Mat buf(1, size, CV_8U, str_img);
cv::Mat res = cv::imdecode(buf, n_channels == 1 ? 0 : -1);
@@ -2174,12 +2382,12 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index,
y0 = 0;
y1 = res.rows;
}
- CHECK(x1 <= static_cast<size_t>(res.cols) &&
- y1 <= static_cast<size_t>(res.rows));
+ CHECK(x1 <= static_cast<size_t>(res.cols) && y1 <= static_cast<size_t>(res.rows));
if (ret->is_none()) {
- *ret = NDArray(mshadow::Shape3(n_channels, y1-y0, x1-x0),
- Context::CPU(), false,
+ *ret = NDArray(mshadow::Shape3(n_channels, y1 - y0, x1 - x0),
+ Context::CPU(),
+ false,
mean.is_none() ? mshadow::default_type_flag : mean.dtype());
}
NDArray buff;
@@ -2187,19 +2395,19 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index,
buff = ret->Reshape(mshadow::Shape4(1, ret->shape()[0], ret->shape()[1], ret->shape()[2]));
} else {
CHECK_EQ(ret->shape().ndim(), 4U);
- buff = ret->Slice(index, index+1);
+ buff = ret->Slice(index, index + 1);
}
CHECK_EQ(buff.ctx().dev_mask(), Context::kCPU);
CHECK_EQ(n_channels, buff.shape()[1]);
- CHECK_EQ(y1-y0, buff.shape()[2]);
- CHECK_EQ(x1-x0, buff.shape()[3]);
+ CHECK_EQ(y1 - y0, buff.shape()[2]);
+ CHECK_EQ(x1 - x0, buff.shape()[3]);
buff.WaitToWrite();
if (mean.is_none()) {
MSHADOW_TYPE_SWITCH(buff.dtype(), DType, {
mshadow::Tensor<cpu, 4, DType> tensor = buff.data().get<cpu, 4, DType>();
- for (size_t i = 0; i < y1-y0; i++) {
- uchar* im_data = res.ptr<uchar>(y0+i) + res.channels()*x0;
- for (size_t j = 0; j < x1-x0; j++) {
+ for (size_t i = 0; i < y1 - y0; i++) {
+ uchar* im_data = res.ptr<uchar>(y0 + i) + res.channels() * x0;
+ for (size_t j = 0; j < x1 - x0; j++) {
for (size_t k = 0; k < n_channels; k++) {
tensor[0][k][i][j] = DType(im_data[k]); // NOLINT(*)
}
@@ -2216,10 +2424,10 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index,
mean.WaitToRead();
MSHADOW_TYPE_SWITCH(buff.dtype(), DType, {
mshadow::Tensor<cpu, 4, DType> tensor = buff.data().get<cpu, 4, DType>();
- mshadow::Tensor<cpu, 3, DType> tmean = mean.data().get<cpu, 3, DType>();
- for (size_t i = 0; i < y1-y0; i++) {
- uchar* im_data = res.ptr<uchar>(y0+i) + res.channels()*x0;
- for (size_t j = 0; j < x1-x0; j++) {
+ mshadow::Tensor<cpu, 3, DType> tmean = mean.data().get<cpu, 3, DType>();
+ for (size_t i = 0; i < y1 - y0; i++) {
+ uchar* im_data = res.ptr<uchar>(y0 + i) + res.channels() * x0;
+ for (size_t j = 0; j < x1 - x0; j++) {
for (size_t k = 0; k < n_channels; k++) {
tensor[0][k][i][j] = DType(im_data[k]) - tmean[k][i][j]; // NOLINT(*)
}
@@ -2234,31 +2442,38 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index,
}
MXNET_REGISTER_NDARRAY_FUN(_imdecode)
-.set_type_mask(kAcceptEmptyMutateTarget | kNDArrayArgBeforeScalar)
-.set_body([](NDArray **u, real_t *s, NDArray **out,
- int num_params, char **param_keys, char **param_vals) {
- CHECK_EQ(num_params, 1);
- Imdecode(out[0], *u[0],
- static_cast<size_t>(s[0]),
- static_cast<size_t>(s[1]),
- static_cast<size_t>(s[2]),
- static_cast<size_t>(s[3]),
- static_cast<size_t>(s[4]),
- static_cast<size_t>(s[5]),
- static_cast<size_t>(s[6]),
- param_vals[0]);
- })
-.set_num_use_vars(1)
-.set_num_scalars(7)
-.set_num_mutate_vars(1)
-.describe("Decode an image, clip to (x0, y0, x1, y1), subtract mean, and write to buffer")
-.add_argument("mean", "NDArray-or-Symbol", "image mean")
-.add_argument("index", "int", "buffer position for output")
-.add_argument("x0", "int", "x0")
-.add_argument("y0", "int", "y0")
-.add_argument("x1", "int", "x1")
-.add_argument("y1", "int", "y1")
-.add_argument("c", "int", "channel")
-.add_argument("size", "int", "length of str_img");
+ .set_type_mask(kAcceptEmptyMutateTarget | kNDArrayArgBeforeScalar)
+ .set_body([](NDArray** u,
+ real_t* s,
+ NDArray** out,
+ int num_params,
+ char** param_keys,
+ char** param_vals) {
+ CHECK_EQ(num_params, 1);
+ Imdecode(out[0],
+ *u[0],
+ static_cast<size_t>(s[0]),
+ static_cast<size_t>(s[1]),
+ static_cast<size_t>(s[2]),
+ static_cast<size_t>(s[3]),
+ static_cast<size_t>(s[4]),
+ static_cast<size_t>(s[5]),
+ static_cast<size_t>(s[6]),
+ param_vals[0]);
+ })
+ .set_num_use_vars(1)
+ .set_num_scalars(7)
+ .set_num_mutate_vars(1)
+ .describe(
+ "Decode an image, clip to (x0, y0, x1, y1), subtract mean, and write "
+ "to buffer")
+ .add_argument("mean", "NDArray-or-Symbol", "image mean")
+ .add_argument("index", "int", "buffer position for output")
+ .add_argument("x0", "int", "x0")
+ .add_argument("y0", "int", "y0")
+ .add_argument("x1", "int", "x1")
+ .add_argument("y1", "int", "y1")
+ .add_argument("c", "int", "channel")
+ .add_argument("size", "int", "length of str_img");
#endif
} // namespace mxnet
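[Editorial note, not part of the commit: the bulk of the ndarray.cc hunks above reformat calls to the engine's PushSync/PushAsync so that the lambda and each subsequent argument sit on their own line. For orientation only, a minimal sketch of that call shape, assuming the usual mxnet/engine.h declarations and a hypothetical NDArray `ret` already in scope, looks like this:

  Engine::Get()->PushSync(
      [ret](RunContext rctx) {
        // Runs on an engine worker thread; NDArrays are captured by value so
        // they stay alive until the function has finished (see the
        // "callback must always capture by value" comment in the diff).
        TBlob out = ret.data();
        // ... fill `out` using rctx ...
      },
      ret.ctx(),            // context the function executes on
      {},                   // const (read-only) engine variables
      {ret.var()},          // mutable engine variables written by the function
      FnProperty::kNormal,  // scheduling hint
      0,                    // priority
      "ExampleOp");         // profiler / debug name

PushAsync, as used in the CopyFromTo and SyncCopy hunks, takes the same trailing arguments, but its lambda additionally receives an Engine::CallbackOnComplete that must be invoked once the asynchronous work is done.]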
diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h
index 485b3b3..dae66bc 100644
--- a/src/operator/nn/batch_norm-inl.h
+++ b/src/operator/nn/batch_norm-inl.h
@@ -43,15 +43,24 @@
#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
#endif
+/*! \brief inverse standard deviation <-> variance */
+#define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0 / std::sqrt((__var$) + DType(__eps$)))
+#define INVSTD_TO_VARIANCE(__invstd$, __eps$) ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$))
+
namespace mxnet {
namespace op {
namespace batchnorm {
-enum BatchNormOpInputs {kData, kGamma, kBeta, kInMovingMean,
- kInMovingVar}; // kGamma: weights, kBeta: biases
-enum BatchNormOpOutputs {kOut, kMean, kVar}; // req, out_data
-enum BatchNormOpResource {kTempSpace};
-enum BatchNormOpAuxiliary {kMovingMean, kMovingVar}; // aux_states
+enum BatchNormOpInputs {
+ kData,
+ kGamma,
+ kBeta,
+ kInMovingMean,
+ kInMovingVar
+}; // kGamma: weights, kBeta: biases
+enum BatchNormOpOutputs { kOut, kMean, kVar }; // req, out_data
+enum BatchNormOpResource { kTempSpace };
+enum BatchNormOpAuxiliary { kMovingMean, kMovingVar }; // aux_states
/*! \brief Default channel axis if none specified in the params */
constexpr int DEFAULT_AXIS = 1;
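[Editorial note, not part of the commit: the VARIANCE_TO_INVSTD and INVSTD_TO_VARIANCE macros hoisted to the top of this header are exact inverses of each other: the first computes invstd = 1 / sqrt(var + eps), the second recovers var = 1 / invstd^2 - eps. As a quick sanity check with numbers of our own choosing (not taken from the patch): for var = 0.24 and eps = 0.01, the first yields 1 / sqrt(0.25) = 2.0, and feeding 2.0 back through the second gives 1 / 4 - 0.01 = 0.24 again.]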
@@ -59,11 +68,18 @@ constexpr int DEFAULT_AXIS = 1;
/*! \brief Parameters for BatchNorm operator */
namespace quantized_batchnorm {
-enum QuantizedBatchNormOpInputs {kData, kGamma, kBeta, kInMovingMean,
- kInMovingVar, kDataMin, kDataMax};
-enum QuantizedBatchNormOutputs {kOut, kOutMin, kOutMax};
-enum QuantizedBatchNormOpAuxiliary {kMovingMean, kMovingVar};
-} // quantized_batchnorm
+enum QuantizedBatchNormOpInputs {
+ kData,
+ kGamma,
+ kBeta,
+ kInMovingMean,
+ kInMovingVar,
+ kDataMin,
+ kDataMax
+};
+enum QuantizedBatchNormOutputs { kOut, kOutMin, kOutMax };
+enum QuantizedBatchNormOpAuxiliary { kMovingMean, kMovingVar };
+} // namespace quantized_batchnorm
/*! \brief Parameters for BatchNoram operator */
struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
@@ -79,38 +95,42 @@ struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
dmlc::optional<float> max_calib_range; // max float value calculated from calibration dataset
DMLC_DECLARE_PARAMETER(BatchNormParam) {
- DMLC_DECLARE_FIELD(eps).set_default(1e-3f)
- .describe("Epsilon to prevent div 0. "
- "Must be no less than CUDNN_BN_MIN_EPSILON "
- "defined in cudnn.h when using cudnn (usually 1e-5)");
- DMLC_DECLARE_FIELD(momentum).set_default(0.9f)
- .describe("Momentum for moving average");
- DMLC_DECLARE_FIELD(fix_gamma).set_default(true)
- .describe("Fix gamma while training");
- DMLC_DECLARE_FIELD(use_global_stats).set_default(false)
- .describe("Whether use global moving statistics instead of local batch-norm. "
- "This will force change batch-norm into a scale shift operator.");
- DMLC_DECLARE_FIELD(output_mean_var).set_default(false)
- .describe("Output the mean and inverse std ");
- DMLC_DECLARE_FIELD(axis).set_default(mxnet::op::batchnorm::DEFAULT_AXIS)
- .describe("Specify which shape axis the channel is specified");
- DMLC_DECLARE_FIELD(cudnn_off).set_default(false)
- .describe("Do not select CUDNN operator, if available");
+ DMLC_DECLARE_FIELD(eps).set_default(1e-3f).describe(
+ "Epsilon to prevent div 0. "
+ "Must be no less than CUDNN_BN_MIN_EPSILON "
+ "defined in cudnn.h when using cudnn (usually 1e-5)");
+ DMLC_DECLARE_FIELD(momentum).set_default(0.9f).describe("Momentum for moving average");
+ DMLC_DECLARE_FIELD(fix_gamma).set_default(true).describe("Fix gamma while training");
+ DMLC_DECLARE_FIELD(use_global_stats)
+ .set_default(false)
+ .describe(
+ "Whether use global moving statistics instead of local batch-norm. "
+ "This will force change batch-norm into a scale shift operator.");
+ DMLC_DECLARE_FIELD(output_mean_var)
+ .set_default(false)
+ .describe("Output the mean and inverse std ");
+ DMLC_DECLARE_FIELD(axis)
+ .set_default(mxnet::op::batchnorm::DEFAULT_AXIS)
+ .describe("Specify which shape axis the channel is specified");
+ DMLC_DECLARE_FIELD(cudnn_off).set_default(false).describe(
+ "Do not select CUDNN operator, if available");
DMLC_DECLARE_FIELD(min_calib_range)
- .set_default(dmlc::optional<float>())
- .describe("The minimum scalar value in the form of float32 obtained "
- "through calibration. If present, it will be used to by "
- "quantized batch norm op to calculate primitive scale."
- "Note: this calib_range is to calib bn output.");
+ .set_default(dmlc::optional<float>())
+ .describe(
+ "The minimum scalar value in the form of float32 obtained "
+ "through calibration. If present, it will be used to by "
+ "quantized batch norm op to calculate primitive scale."
+ "Note: this calib_range is to calib bn output.");
DMLC_DECLARE_FIELD(max_calib_range)
- .set_default(dmlc::optional<float>())
- .describe("The maximum scalar value in the form of float32 obtained "
- "through calibration. If present, it will be used to by "
- "quantized batch norm op to calculate primitive scale."
- "Note: this calib_range is to calib bn output.");
+ .set_default(dmlc::optional<float>())
+ .describe(
+ "The maximum scalar value in the form of float32 obtained "
+ "through calibration. If present, it will be used to by "
+ "quantized batch norm op to calculate primitive scale."
+ "Note: this calib_range is to calib bn output.");
}
- bool operator==(const BatchNormParam &other) const {
+ bool operator==(const BatchNormParam& other) const {
bool flag = this->eps == other.eps && this->momentum == other.momentum &&
this->fix_gamma == other.fix_gamma &&
this->use_global_stats == other.use_global_stats &&
@@ -131,15 +151,15 @@ struct BatchNormParam : public dmlc::Parameter<BatchNormParam> {
} // namespace mxnet
namespace std {
-template<>
+template <>
struct hash<mxnet::op::BatchNormParam> {
size_t operator()(const mxnet::op::BatchNormParam& val) {
size_t ret = 0;
- ret = dmlc::HashCombine(ret, val.momentum);
- ret = dmlc::HashCombine(ret, val.fix_gamma);
- ret = dmlc::HashCombine(ret, val.use_global_stats);
- ret = dmlc::HashCombine(ret, val.output_mean_var);
- ret = dmlc::HashCombine(ret, val.axis);
+ ret = dmlc::HashCombine(ret, val.momentum);
+ ret = dmlc::HashCombine(ret, val.fix_gamma);
+ ret = dmlc::HashCombine(ret, val.use_global_stats);
+ ret = dmlc::HashCombine(ret, val.output_mean_var);
+ ret = dmlc::HashCombine(ret, val.axis);
return ret;
}
};
@@ -153,40 +173,30 @@ static inline bool IsBNWriting(const OpReqType ort) {
}
template <typename xpu, typename DType, typename AccReal>
-void BatchNormForwardImpl(mshadow::Stream<cpu> *stream,
- const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &out_data,
- const std::vector<TBlob> &aux_states);
+void BatchNormForwardImpl(mshadow::Stream<cpu>* stream, const OpContext& ctx,
+ const BatchNormParam& param, const std::vector<TBlob>& in_data,
+ const std::vector<OpReqType>& req, const std::vector<TBlob>& out_data,
+ const std::vector<TBlob>& aux_states);
template <typename xpu, typename DType, typename AccReal>
-void BatchNormBackwardImpl(mshadow::Stream<cpu> *stream,
- const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &out_grad,
- const std::vector<TBlob> &in_data,
- const std::vector<TBlob> &out_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &in_grad,
- const std::vector<TBlob> &aux_states);
+void BatchNormBackwardImpl(mshadow::Stream<cpu>* stream, const OpContext& ctx,
+ const BatchNormParam& param, const std::vector<TBlob>& out_grad,
+ const std::vector<TBlob>& in_data, const std::vector<TBlob>& out_data,
+ const std::vector<OpReqType>& req, const std::vector<TBlob>& in_grad,
+ const std::vector<TBlob>& aux_states);
#if MXNET_USE_CUDA
template <typename xpu, typename DType, typename AccReal>
-void BatchNormForwardImpl(mshadow::Stream<gpu> *stream,
- const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &out_data,
- const std::vector<TBlob> &aux_states);
+void BatchNormForwardImpl(mshadow::Stream<gpu>* stream, const OpContext& ctx,
+ const BatchNormParam& param, const std::vector<TBlob>& in_data,
+ const std::vector<OpReqType>& req, const std::vector<TBlob>& out_data,
+ const std::vector<TBlob>& aux_states);
template <typename xpu, typename DType, typename AccReal>
-void BatchNormBackwardImpl(mshadow::Stream<gpu> *stream,
- const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &out_grad,
- const std::vector<TBlob> &in_data,
- const std::vector<TBlob> &out_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &in_grad,
- const std::vector<TBlob> &aux_states);
+void BatchNormBackwardImpl(mshadow::Stream<gpu>* stream, const OpContext& ctx,
+ const BatchNormParam& param, const std::vector<TBlob>& out_grad,
+ const std::vector<TBlob>& in_data, const std::vector<TBlob>& out_data,
+ const std::vector<OpReqType>& req, const std::vector<TBlob>& in_grad,
+ const std::vector<TBlob>& aux_states);
#endif // MXNET_USE_CUDA
/*!
@@ -201,11 +211,9 @@ void BatchNormBackwardImpl(mshadow::Stream<gpu> *stream,
* \sa OpReqType, OpContext
*/
template <typename xpu, typename DType, typename AccReal>
-void BatchNormForward(const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &out_data,
- const std::vector<TBlob> &aux_states) {
+void BatchNormForward(const OpContext& ctx, const BatchNormParam& param,
+ const std::vector<TBlob>& in_data, const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& out_data, const std::vector<TBlob>& aux_states) {
using namespace mshadow;
using namespace mshadow::expr;
@@ -219,9 +227,8 @@ void BatchNormForward(const OpContext &ctx, const BatchNormParam& param,
CHECK_GE(req.size(), 1U);
CHECK_EQ(req[batchnorm::kOut], kWriteTo);
}
- Stream<xpu> *s = ctx.get_stream<xpu>();
- BatchNormForwardImpl<xpu, DType, AccReal>(s, ctx, param, in_data, req,
- out_data, aux_states);
+ Stream<xpu>* s = ctx.get_stream<xpu>();
+ BatchNormForwardImpl<xpu, DType, AccReal>(s, ctx, param, in_data, req, out_data, aux_states);
}
/*!
@@ -253,10 +260,9 @@ void BatchNormForward(const OpContext &ctx, const BatchNormParam& param,
* \sa OperatorProperty, OpReqType, OpContext
*/
template <typename xpu, typename DType, typename AccReal>
-void BatchNormBackward(const OpContext &ctx, const BatchNormParam& param,
- const std::vector<TBlob> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &outputs) {
+void BatchNormBackward(const OpContext& ctx, const BatchNormParam& param,
+ const std::vector<TBlob>& inputs, const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& outputs) {
CHECK_EQ(inputs.size(), 8U);
CHECK_EQ(outputs.size(), 3U);
@@ -265,41 +271,36 @@ void BatchNormBackward(const OpContext &ctx, const BatchNormParam& param,
std::vector<TBlob> in_data(3);
std::vector<TBlob> aux_states(2);
- out_grad[0] = inputs[0];
- out_data[batchnorm::kMean] = inputs[1];
- out_data[batchnorm::kVar] = inputs[2];
- in_data[batchnorm::kData] = inputs[3];
- in_data[batchnorm::kGamma] = inputs[4];
- in_data[batchnorm::kBeta] = inputs[5];
+ out_grad[0] = inputs[0];
+ out_data[batchnorm::kMean] = inputs[1];
+ out_data[batchnorm::kVar] = inputs[2];
+ in_data[batchnorm::kData] = inputs[3];
+ in_data[batchnorm::kGamma] = inputs[4];
+ in_data[batchnorm::kBeta] = inputs[5];
aux_states[batchnorm::kMovingMean] = inputs[6];
- aux_states[batchnorm::kMovingVar] = inputs[7];
- const std::vector<TBlob> &in_grad = outputs;
- mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
- BatchNormBackwardImpl<xpu, DType, AccReal>(s, ctx, param, out_grad, in_data,
- out_data, req, in_grad, aux_states);
+ aux_states[batchnorm::kMovingVar] = inputs[7];
+ const std::vector<TBlob>& in_grad = outputs;
+ mshadow::Stream<xpu>* s = ctx.get_stream<xpu>();
+ BatchNormBackwardImpl<xpu, DType, AccReal>(s, ctx, param, out_grad, in_data, out_data, req,
+ in_grad, aux_states);
}
-template<typename xpu>
-void BatchNormCompute(const nnvm::NodeAttrs& attrs,
- const OpContext& ctx, const std::vector<TBlob>& inputs,
- const std::vector<OpReqType>& req,
+template <typename xpu>
+void BatchNormCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
+ const std::vector<TBlob>& inputs, const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
CHECK_EQ(inputs.size(), 5U);
- std::vector<TBlob> in_data(inputs.begin(),
- inputs.begin() + batchnorm::kInMovingMean);
- std::vector<TBlob> aux_states(inputs.begin() + batchnorm::kInMovingMean,
- inputs.end());
+ std::vector<TBlob> in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean);
+ std::vector<TBlob> aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end());
MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, {
- BatchNormForward<xpu, DType, AccReal>(ctx, param, in_data, req, outputs,
- aux_states);
+ BatchNormForward<xpu, DType, AccReal>(ctx, param, in_data, req, outputs, aux_states);
});
}
-template<typename xpu>
-void BatchNormGradCompute(const nnvm::NodeAttrs& attrs,
- const OpContext& ctx, const std::vector<TBlob>& inputs,
- const std::vector<OpReqType>& req,
+template <typename xpu>
+void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
+ const std::vector<TBlob>& inputs, const std::vector<OpReqType>& req,
const std::vector<TBlob>& outputs) {
CHECK_EQ(inputs.size(), 8U);
const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
@@ -313,15 +314,15 @@ void BatchNormGradCompute(const nnvm::NodeAttrs& attrs,
namespace batchnorm {
-template<typename DType>
+template <typename DType>
class BNTensor3 {
enum { OUTER, CHANNEL, INNER, COUNT };
public:
inline BNTensor3(const TBlob& blob, const int indexOfChannel)
- : dptr_(blob.dptr<DType>())
- , indexOfChannel_(static_cast<size_t>(indexOfChannel < 0
- ? (static_cast<int>(blob.shape_.ndim()) + indexOfChannel)
+ : dptr_(blob.dptr<DType>()),
+ indexOfChannel_(static_cast<size_t>(
+ indexOfChannel < 0 ? (static_cast<int>(blob.shape_.ndim()) + indexOfChannel)
: indexOfChannel)) {
CHECK_EQ(blob.type_flag_, mshadow::DataType<DType>::kFlag);
shape_[OUTER] = 1;
@@ -329,31 +330,29 @@ class BNTensor3 {
shape_[OUTER] *= blob.shape_[i];
}
shape_[CHANNEL] = blob.shape_[indexOfChannel_];
- shape_[INNER] = 1;
+ shape_[INNER] = 1;
for (size_t i = indexOfChannel_ + 1, n = blob.shape_.ndim(); i < n; ++i) {
shape_[INNER] *= blob.shape_[i];
}
}
- inline BNTensor3(DType *p, const mxnet::TShape& shape, const int indexOfChannel)
- : dptr_(p)
- , indexOfChannel_(static_cast<size_t>(indexOfChannel < 0
- ? (static_cast<int>(shape.ndim()) + indexOfChannel)
- : indexOfChannel)) {
+ inline BNTensor3(DType* p, const mxnet::TShape& shape, const int indexOfChannel)
+ : dptr_(p),
+ indexOfChannel_(static_cast<size_t>(indexOfChannel < 0
+ ? (static_cast<int>(shape.ndim()) + indexOfChannel)
+ : indexOfChannel)) {
shape_[OUTER] = 1;
for (size_t i = 0; i < indexOfChannel_; ++i) {
shape_[OUTER] *= shape[i];
}
shape_[CHANNEL] = shape[indexOfChannel_];
- shape_[INNER] = 1;
+ shape_[INNER] = 1;
for (size_t i = indexOfChannel_ + 1, n = shape.ndim(); i < n; ++i) {
shape_[INNER] *= shape[i];
}
}
- MSHADOW_FORCE_INLINE bool IsEmpty() const {
- return dptr_ == nullptr;
- }
+ MSHADOW_FORCE_INLINE bool IsEmpty() const { return dptr_ == nullptr; }
MSHADOW_XINLINE size_t Size() const {
size_t n = 1;
@@ -363,22 +362,14 @@ class BNTensor3 {
return n;
}
- MSHADOW_XINLINE size_t ChannelCount() const {
- return shape_[CHANNEL];
- }
+ MSHADOW_XINLINE size_t ChannelCount() const { return shape_[CHANNEL]; }
- MSHADOW_XINLINE size_t OuterSize() const {
- return shape_[OUTER];
- }
+ MSHADOW_XINLINE size_t OuterSize() const { return shape_[OUTER]; }
- MSHADOW_XINLINE size_t InnerSize() const {
- return shape_[INNER];
- }
+ MSHADOW_XINLINE size_t InnerSize() const { return shape_[INNER]; }
/*! \brief start of a given channel's spatial data */
- MSHADOW_XINLINE size_t StartOffset(const size_t channel) const {
- return channel * InnerSize();
- }
+ MSHADOW_XINLINE size_t StartOffset(const size_t channel) const { return channel * InnerSize(); }
/*! \brief This is the amount to skip to next same-channel data
* This is the number of bytes to skip from one past the end of the current spatial data
@@ -392,12 +383,10 @@ class BNTensor3 {
return (ChannelCount() - 1) * InnerSize();
}
- MSHADOW_XINLINE size_t offset(const size_t outer,
- const size_t channel,
- const size_t i) const {
+ MSHADOW_XINLINE size_t offset(const size_t outer, const size_t channel, const size_t i) const {
const size_t spatial_size = InnerSize();
- const size_t skip_length = SkipLengthToNextSameChannelData();
- size_t off = StartOffset(channel);
+ const size_t skip_length = SkipLengthToNextSameChannelData();
+ size_t off = StartOffset(channel);
off += outer * shape_[CHANNEL] * shape_[INNER];
const size_t skips = i / spatial_size;
off += (1 + skip_length) * skips;
@@ -405,21 +394,18 @@ class BNTensor3 {
return off;
}
- MSHADOW_XINLINE DType& get_ref(const size_t batch,
- const size_t channel,
- const size_t i) {
+ MSHADOW_XINLINE DType& get_ref(const size_t batch, const size_t channel, const size_t i) {
const size_t off = offset(batch, channel, i);
return dptr_[off];
}
- MSHADOW_XINLINE const DType& get_ref(const size_t batch,
- const size_t channel,
+ MSHADOW_XINLINE const DType& get_ref(const size_t batch, const size_t channel,
const size_t i) const {
const size_t off = offset(batch, channel, i);
return dptr_[off];
}
- DType *dptr_;
+ DType* dptr_;
size_t indexOfChannel_;
size_t shape_[COUNT];
};
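For reference, BNTensor3 above collapses a tensor with an arbitrary channel axis into an (outer, channel, inner) view so that batch-norm statistics can be accumulated per channel regardless of layout. A minimal standalone sketch of that index arithmetic, assuming a dense contiguous layout and ignoring the skip-length bookkeeping of the real class (illustration only, not the MXNet code):

// Map a logical (outer, channel, inner) index to a flat offset for a tensor
// collapsed to [outer_size, channel_count, inner_size].
#include <cstddef>
#include <iostream>
#include <vector>

std::size_t FlatOffset(std::size_t outer, std::size_t channel, std::size_t inner,
                       std::size_t channel_count, std::size_t inner_size) {
  return (outer * channel_count + channel) * inner_size + inner;
}

int main() {
  // Example: NCHW with N=2, C=3, H=W=4 and channel axis 1 collapses to
  // outer=2, channel=3, inner=16.
  const std::size_t outer_size = 2, channel_count = 3, inner_size = 16;
  std::vector<float> data(outer_size * channel_count * inner_size, 0.f);
  // Touch every element of channel 1, mirroring a per-channel ForEachFast pass.
  for (std::size_t o = 0; o < outer_size; ++o)
    for (std::size_t i = 0; i < inner_size; ++i)
      data[FlatOffset(o, 1, i, channel_count, inner_size)] += 1.f;
  std::cout << data[inner_size] << std::endl;  // first element of channel 1, outer 0
  return 0;
}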
diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc
index 1bbdfa6..6ffbc66 100644
--- a/src/operator/nn/batch_norm.cc
+++ b/src/operator/nn/batch_norm.cc
@@ -22,20 +22,17 @@
* \file batch_norm.cc
* \brief
* \author Bing Xu, Chris Olivier, Da Zheng
-*/
+ */
-#include "batch_norm-inl.h"
#include <nnvm/op_attr_types.h>
+
#include "../elemwise_op_common.h"
#include "../operator_common.h"
+#include "batch_norm-inl.h"
#if MXNET_USE_MKLDNN == 1
#include "./mkldnn/mkldnn_batch_norm-inl.h"
#endif
-/*! \brief inverse standard deviation <-> variance */
-#define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0/std::sqrt((__var$) + DType(__eps$)))
-#define INVSTD_TO_VARIANCE(__invstd$, __eps$) ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$))
-
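The duplicated macros removed here (the same pair is also dropped from batch_norm.cu below) express the conversion between a variance and an inverse standard deviation that the forward and backward passes rely on; written out:

\[
\text{invstd} = \frac{1}{\sqrt{\sigma^2 + \varepsilon}},
\qquad
\sigma^2 = \frac{1}{\text{invstd}^2} - \varepsilon
\]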
namespace mxnet {
namespace op {
namespace batchnorm {
@@ -43,16 +40,17 @@ namespace batchnorm {
/*! \brief Global disable of batchnorm mkl operator for unit testing */
volatile bool disable_mkl = false;
-/*! \brief Fast-foreach when you don't care about the position other than channel */
-template<typename DType, typename OnData>
-static inline void ForEachFast(const BNTensor3<DType> &tensor,
+/*! \brief Fast-foreach when you don't care about the position other than
+ * channel */
+template <typename DType, typename OnData>
+static inline void ForEachFast(const BNTensor3<DType>& tensor,
const size_t channel,
OnData onData) {
- const size_t num = tensor.OuterSize();
- const size_t matrixSize = tensor.InnerSize();
- const size_t skipLength = tensor.SkipLengthToNextSameChannelData();
+ const size_t num = tensor.OuterSize();
+ const size_t matrixSize = tensor.InnerSize();
+ const size_t skipLength = tensor.SkipLengthToNextSameChannelData();
const size_t startOffset = tensor.StartOffset(channel);
- DType *data = tensor.dptr_ + startOffset;
+ DType* data = tensor.dptr_ + startOffset;
for (size_t outer = 0; outer < num; ++outer) {
for (size_t i = 0; i < matrixSize; ++i) {
@@ -62,10 +60,11 @@ static inline void ForEachFast(const BNTensor3<DType> &tensor,
}
}
-/*! \brief Fast-foreach when you don't care about the position other than channel */
-template<typename DType1, typename DType2, typename OnData>
-static inline void ForEachFast(const BNTensor3<DType1> &in_data,
- const BNTensor3<DType2> &out_data,
+/*! \brief Fast-foreach when you don't care about the position other than
+ * channel */
+template <typename DType1, typename DType2, typename OnData>
+static inline void ForEachFast(const BNTensor3<DType1>& in_data,
+ const BNTensor3<DType2>& out_data,
const size_t channel,
OnData onData) {
const size_t num = in_data.OuterSize();
@@ -73,22 +72,22 @@ static inline void ForEachFast(const BNTensor3<DType1> &in_data,
const size_t skipLength = in_data.SkipLengthToNextSameChannelData();
const size_t startOffset = in_data.StartOffset(channel);
- DType1 *data = in_data.dptr_ + startOffset;
- DType2 *odata = out_data.dptr_ + startOffset;
+ DType1* data = in_data.dptr_ + startOffset;
+ DType2* odata = out_data.dptr_ + startOffset;
for (size_t outer = 0; outer < num; ++outer) {
for (size_t i = 0; i < matrixSize; ++i) {
onData(data++, odata++);
}
- data += skipLength;
+ data += skipLength;
odata += skipLength;
}
}
-template<typename DType1, typename DType2, typename DType3, typename OnData>
-static inline void ForEachFast(const BNTensor3<DType1> &in_data,
- const BNTensor3<DType2> &in_data2,
- const BNTensor3<DType3> &out_data,
+template <typename DType1, typename DType2, typename DType3, typename OnData>
+static inline void ForEachFast(const BNTensor3<DType1>& in_data,
+ const BNTensor3<DType2>& in_data2,
+ const BNTensor3<DType3>& out_data,
const size_t channel,
OnData onData) {
const size_t num = in_data.OuterSize();
@@ -96,15 +95,15 @@ static inline void ForEachFast(const BNTensor3<DType1> &in_data,
const size_t skipLength = in_data.SkipLengthToNextSameChannelData();
const size_t startOffset = in_data.StartOffset(channel);
- DType1 *data = in_data.dptr_ + startOffset;
- DType2 *data2 = in_data2.dptr_ + startOffset;
- DType3 *odata = out_data.dptr_ + startOffset;
+ DType1* data = in_data.dptr_ + startOffset;
+ DType2* data2 = in_data2.dptr_ + startOffset;
+ DType3* odata = out_data.dptr_ + startOffset;
for (size_t outer = 0; outer < num; ++outer) {
for (size_t i = 0; i < matrixSize; ++i) {
onData(data++, data2++, odata++);
}
- data += skipLength;
+ data += skipLength;
data2 += skipLength;
odata += skipLength;
}
@@ -114,50 +113,50 @@ static inline void ForEachFast(const BNTensor3<DType1> &in_data,
/*! \brief Forward CPU */
template <typename xpu, typename DType, typename AccReal>
-void BatchNormForwardImpl(mshadow::Stream<cpu> *,
- const OpContext &ctx, const BatchNormParam& param_,
- const std::vector<TBlob> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &out_data,
- const std::vector<TBlob> &aux_states) {
+void BatchNormForwardImpl(mshadow::Stream<cpu>*,
+ const OpContext& ctx,
+ const BatchNormParam& param_,
+ const std::vector<TBlob>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& out_data,
+ const std::vector<TBlob>& aux_states) {
// Input
batchnorm::BNTensor3<DType> inputData(in_data[batchnorm::kData], param_.axis);
- const TBlob &weights = in_data[batchnorm::kGamma];
- const TBlob &bias = in_data[batchnorm::kBeta];
+ const TBlob& weights = in_data[batchnorm::kGamma];
+ const TBlob& bias = in_data[batchnorm::kBeta];
// Aux (Moving)
- const TBlob &runningMean = aux_states[batchnorm::kMovingMean];
- const TBlob &runningVariance = aux_states[batchnorm::kMovingVar];
+ const TBlob& runningMean = aux_states[batchnorm::kMovingMean];
+ const TBlob& runningVariance = aux_states[batchnorm::kMovingVar];
// Output
batchnorm::BNTensor3<DType> outputData(out_data[batchnorm::kOut], param_.axis);
- const TBlob &meanVector = out_data[batchnorm::kMean];
- const TBlob &varianceVector = out_data[batchnorm::kVar];
+ const TBlob& meanVector = out_data[batchnorm::kMean];
+ const TBlob& varianceVector = out_data[batchnorm::kVar];
- AccReal *mean = meanVector.dptr<AccReal>();
- AccReal *var = varianceVector.dptr<AccReal>();
+ AccReal* mean = meanVector.dptr<AccReal>();
+ AccReal* var = varianceVector.dptr<AccReal>();
const bool is_train_and_not_global_stats = ctx.is_train && !param_.use_global_stats;
- const size_t channelCount = inputData.ChannelCount();
- const size_t itemCountPerChannel = inputData.Size() / channelCount;
+ const size_t channelCount = inputData.ChannelCount();
+ const size_t itemCountPerChannel = inputData.Size() / channelCount;
- #pragma omp parallel for
+#pragma omp parallel for
for (int channel = 0; channel < static_cast<int>(channelCount); ++channel) {
if (is_train_and_not_global_stats) {
// compute mean per input
mean[channel] = 0;
- ForEachFast(inputData, channel, [mean, channel](const DType *in_data) {
- mean[channel] += *in_data; });
+ ForEachFast(
+ inputData, channel, [mean, channel](const DType* in_data) { mean[channel] += *in_data; });
mean[channel] /= itemCountPerChannel;
// compute variance per input
const AccReal thisMean = mean[channel];
- var[channel] = 0;
- ForEachFast(inputData, channel,
- [var, thisMean, channel](const DType *current_in_data) {
- const AccReal current = *current_in_data;
- var[channel] += (current - thisMean) * (current - thisMean);
- });
+ var[channel] = 0;
+ ForEachFast(inputData, channel, [var, thisMean, channel](const DType* current_in_data) {
+ const AccReal current = *current_in_data;
+ var[channel] += (current - thisMean) * (current - thisMean);
+ });
const AccReal sum = var[channel];
@@ -167,125 +166,130 @@ void BatchNormForwardImpl(mshadow::Stream<cpu> *,
invstd = 0;
} else {
const AccReal variance = sum / itemCountPerChannel;
- invstd = VARIANCE_TO_INVSTD(variance, param_.eps);
+ invstd = VARIANCE_TO_INVSTD(variance, param_.eps);
}
var[channel] = invstd;
} else {
- const AccReal *rm = runningMean.dptr<AccReal>();
- const AccReal *rv = runningVariance.dptr<AccReal>();
+ const AccReal* rm = runningMean.dptr<AccReal>();
+ const AccReal* rv = runningVariance.dptr<AccReal>();
mean[channel] = rm[channel];
- var[channel] = VARIANCE_TO_INVSTD(rv[channel], param_.eps);
+ var[channel] = VARIANCE_TO_INVSTD(rv[channel], param_.eps);
}
// compute output
- AccReal *w = weights.dptr<AccReal>();
- const AccReal *b = bias.dptr<AccReal>();
+ AccReal* w = weights.dptr<AccReal>();
+ const AccReal* b = bias.dptr<AccReal>();
- const AccReal thisMean = mean[channel];
+ const AccReal thisMean = mean[channel];
const AccReal thisInvstd = var[channel];
const AccReal thisWeight = w[channel];
- const AccReal thisBias = b[channel];
+ const AccReal thisBias = b[channel];
// note that var is still invstd
if (!param_.fix_gamma) {
if (IsBNWriting(req[batchnorm::kData])) {
- ForEachFast(inputData, outputData, channel,
- [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data,
- DType *out_data) {
- *out_data = static_cast<DType>(
- ((*in_data - thisMean) * thisInvstd) * thisWeight + thisBias);
- });
+ ForEachFast(
+ inputData,
+ outputData,
+ channel,
+ [thisWeight, thisBias, thisMean, thisInvstd](const DType* in_data, DType* out_data) {
+ *out_data =
+ static_cast<DType>(((*in_data - thisMean) * thisInvstd) * thisWeight + thisBias);
+ });
}
} else {
if (IsBNWriting(req[batchnorm::kGamma])) {
w[channel] = AccReal(1);
}
if (IsBNWriting(req[batchnorm::kData])) {
- ForEachFast(inputData, outputData, channel,
- [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data,
- DType *out_data) {
- *out_data = static_cast<DType>(
- ((*in_data - thisMean) * thisInvstd) + thisBias);
- });
+ ForEachFast(
+ inputData,
+ outputData,
+ channel,
+ [thisWeight, thisBias, thisMean, thisInvstd](const DType* in_data, DType* out_data) {
+ *out_data = static_cast<DType>(((*in_data - thisMean) * thisInvstd) + thisBias);
+ });
}
}
}
}
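Concretely, the training branch of the loop above computes a per-channel mean and (biased) variance over all outer*inner elements and then normalizes with the inverse standard deviation; the use_global_stats branch applies the same normalization but takes mean and invstd from the moving statistics. A simplified single-channel reference, with a plain std::vector standing in for the BNTensor3 view (an illustration, not the MXNet code path):

// Single-channel batch-norm forward: y = gamma * (x - mean) * invstd + beta.
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

void BatchNormChannelForward(const std::vector<float>& x, float gamma, float beta,
                             float eps, std::vector<float>* y) {
  const float n = static_cast<float>(x.size());
  float mean = 0.f;
  for (float v : x) mean += v;
  mean /= n;
  float var = 0.f;
  for (float v : x) var += (v - mean) * (v - mean);
  var /= n;
  const float invstd = 1.f / std::sqrt(var + eps);  // VARIANCE_TO_INVSTD
  y->resize(x.size());
  for (std::size_t i = 0; i < x.size(); ++i)
    (*y)[i] = gamma * (x[i] - mean) * invstd + beta;
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f}, y;
  BatchNormChannelForward(x, /*gamma=*/1.f, /*beta=*/0.f, /*eps=*/1e-5f, &y);
  for (float v : y) std::printf("%f\n", v);  // approx. -1.34, -0.45, 0.45, 1.34
  return 0;
}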
template <typename xpu, typename DType, typename AccReal>
-void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
- const OpContext &ctx, const BatchNormParam& param_,
- const std::vector<TBlob> &out_grad,
- const std::vector<TBlob> &in_data,
- const std::vector<TBlob> &out_data,
- const std::vector<OpReqType> &req,
- const std::vector<TBlob> &in_grad,
- const std::vector<TBlob> &aux_states) {
+void BatchNormBackwardImpl(mshadow::Stream<cpu>*,
+ const OpContext& ctx,
+ const BatchNormParam& param_,
+ const std::vector<TBlob>& out_grad,
+ const std::vector<TBlob>& in_data,
+ const std::vector<TBlob>& out_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<TBlob>& in_grad,
+ const std::vector<TBlob>& aux_states) {
// Input Data
batchnorm::BNTensor3<DType> inputData(in_data[batchnorm::kData], param_.axis);
- const TBlob &weights = in_data[batchnorm::kGamma];
+ const TBlob& weights = in_data[batchnorm::kGamma];
// Input Grad
batchnorm::BNTensor3<DType> gradIn(in_grad[batchnorm::kData], param_.axis);
- const TBlob &gradWeight = in_grad[batchnorm::kGamma];
- const TBlob &gradBias = in_grad[batchnorm::kBeta];
+ const TBlob& gradWeight = in_grad[batchnorm::kGamma];
+ const TBlob& gradBias = in_grad[batchnorm::kBeta];
// Aux (Moving)
- const TBlob &runningMean = aux_states[batchnorm::kMovingMean];
- const TBlob &runningVariance = aux_states[batchnorm::kMovingVar];
+ const TBlob& runningMean = aux_states[batchnorm::kMovingMean];
+ const TBlob& runningVariance = aux_states[batchnorm::kMovingVar];
// Output
batchnorm::BNTensor3<DType> gradOut(out_grad[batchnorm::kOut], param_.axis);
- const TBlob &saveMean = out_data[batchnorm::kMean];
- const TBlob &saveStd = out_data[batchnorm::kVar];
+ const TBlob& saveMean = out_data[batchnorm::kMean];
+ const TBlob& saveStd = out_data[batchnorm::kVar];
const size_t channelCount = inputData.ChannelCount();
const size_t itemCount = inputData.Size() / channelCount;
// Avoid multiple dptr() call within the channel loop
- AccReal *runningMeanDataPtr = runningMean.dptr<AccReal>();
- AccReal *runningVarDataPtr = runningVariance.dptr<AccReal>();
- const AccReal *saveMeanDataPtr = saveMean.dptr<AccReal>();
- const AccReal *saveInvStdDataPtr = saveStd.dptr<AccReal>();
- AccReal *gradWeightData = gradWeight.dptr<AccReal>();
- AccReal *gradBiasData = gradBias.dptr<AccReal>();
+ AccReal* runningMeanDataPtr = runningMean.dptr<AccReal>();
+ AccReal* runningVarDataPtr = runningVariance.dptr<AccReal>();
+ const AccReal* saveMeanDataPtr = saveMean.dptr<AccReal>();
+ const AccReal* saveInvStdDataPtr = saveStd.dptr<AccReal>();
+ AccReal* gradWeightData = gradWeight.dptr<AccReal>();
+ AccReal* gradBiasData = gradBias.dptr<AccReal>();
const bool is_train_and_not_global_stats = ctx.is_train && !param_.use_global_stats;
- #pragma omp parallel for
+#pragma omp parallel for
for (int channel = 0; channel < static_cast<int>(channelCount); ++channel) {
- const AccReal *weight = weights.dptr<AccReal>();
- const AccReal w = !param_.fix_gamma ? weight[channel] : AccReal(1);
+ const AccReal* weight = weights.dptr<AccReal>();
+ const AccReal w = !param_.fix_gamma ? weight[channel] : AccReal(1);
AccReal mean, invstd;
if (is_train_and_not_global_stats) {
- mean = saveMeanDataPtr[channel];
- invstd = saveInvStdDataPtr[channel];
+ mean = saveMeanDataPtr[channel];
+ invstd = saveInvStdDataPtr[channel];
const AccReal variance = INVSTD_TO_VARIANCE(invstd, param_.eps);
// update running averages
- runningMeanDataPtr[channel] = runningMeanDataPtr[channel] * param_.momentum
- + mean * (AccReal(1) - param_.momentum);
+ runningMeanDataPtr[channel] =
+ runningMeanDataPtr[channel] * param_.momentum + mean * (AccReal(1) - param_.momentum);
- runningVarDataPtr[channel] = runningVarDataPtr[channel] * param_.momentum
- + variance * (AccReal(1) - param_.momentum);
+ runningVarDataPtr[channel] =
+ runningVarDataPtr[channel] * param_.momentum + variance * (AccReal(1) - param_.momentum);
} else {
- mean = runningMeanDataPtr[channel];
+ mean = runningMeanDataPtr[channel];
invstd = VARIANCE_TO_INVSTD(runningVarDataPtr[channel], param_.eps);
}
// sumGradOut over all gradOutput in feature plane
AccReal sumGradOut = 0;
- ForEachFast(gradOut, static_cast<size_t>(channel),
- [&sumGradOut](const DType *gradOut_data) {
- sumGradOut += *gradOut_data;
- });
+ ForEachFast(gradOut, static_cast<size_t>(channel), [&sumGradOut](const DType* gradOut_data) {
+ sumGradOut += *gradOut_data;
+ });
// dot product of the Q(X) and gradOuput
AccReal dotp = 0;
- ForEachFast(inputData, gradOut, static_cast<size_t>(channel),
- [&dotp, mean](const DType *thisInputData, const DType *gradOut_data) {
+ ForEachFast(inputData,
+ gradOut,
+ static_cast<size_t>(channel),
+ [&dotp, mean](const DType* thisInputData, const DType* gradOut_data) {
dotp += (*thisInputData - mean) * (*gradOut_data);
});
@@ -297,28 +301,34 @@ void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
// dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w
// projection of gradOutput on to output scaled by std
- const AccReal k = dotp * invstd * invstd / itemCount;
- const AccReal iw = invstd * w;
+ const AccReal k = dotp * invstd * invstd / itemCount;
+ const AccReal iw = invstd * w;
const AccReal gradMean = sumGradOut / itemCount;
if (req[batchnorm::kData] != kAddTo) {
- ForEachFast(inputData, gradIn, static_cast<size_t>(channel),
- [&mean, &k](const DType *inputDataPtr, DType *gradIn_data) {
+ ForEachFast(inputData,
+ gradIn,
+ static_cast<size_t>(channel),
+ [&mean, &k](const DType* inputDataPtr, DType* gradIn_data) {
*gradIn_data = (*inputDataPtr - mean) * k;
});
- ForEachFast(gradOut, gradIn, static_cast<size_t>(channel),
- [iw, gradMean](const DType *gradOut_data, DType *gradIn_data) {
+ ForEachFast(gradOut,
+ gradIn,
+ static_cast<size_t>(channel),
+ [iw, gradMean](const DType* gradOut_data, DType* gradIn_data) {
*gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * iw;
});
} else {
- ForEachFast(inputData, gradOut, gradIn, static_cast<size_t>(channel),
- [&mean, &k, iw, gradMean](const DType *inputDataPtr,
- const DType *gradOut_data,
- DType *gradIn_data) {
- DType normal_val = (*inputDataPtr - mean) * k;
- *gradIn_data += (*gradOut_data - gradMean -
- normal_val) * iw;
- });
+ ForEachFast(
+ inputData,
+ gradOut,
+ gradIn,
+ static_cast<size_t>(channel),
+ [&mean, &k, iw, gradMean](
+ const DType* inputDataPtr, const DType* gradOut_data, DType* gradIn_data) {
+ DType normal_val = (*inputDataPtr - mean) * k;
+ *gradIn_data += (*gradOut_data - gradMean - normal_val) * iw;
+ });
}
} else {
// when in evaluation mode
@@ -327,13 +337,17 @@ void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
// dL/dX = w / running_std
const AccReal iw = invstd * w;
if (req[batchnorm::kData] != kAddTo) {
- ForEachFast(gradOut, gradIn, static_cast<size_t>(channel),
- [iw](const DType *gradOut_data, DType *gradIn_data) {
+ ForEachFast(gradOut,
+ gradIn,
+ static_cast<size_t>(channel),
+ [iw](const DType* gradOut_data, DType* gradIn_data) {
*gradIn_data = *gradOut_data * iw;
});
} else {
- ForEachFast(gradOut, gradIn, static_cast<size_t>(channel),
- [iw](const DType *gradOut_data, DType *gradIn_data) {
+ ForEachFast(gradOut,
+ gradIn,
+ static_cast<size_t>(channel),
+ [iw](const DType* gradOut_data, DType* gradIn_data) {
*gradIn_data += *gradOut_data * iw;
});
}
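Written out, the gradients produced by the training-mode branch above are the standard batch-norm ones (with w = gamma and invstd = 1/sqrt(var + eps)), and the moving statistics are blended with momentum m as in the update a few lines earlier; in the use_global_stats branch the data gradient reduces to gamma * invstd * dL/dy, as the final pair of ForEachFast calls in the evaluation-mode path shows:

\[
\hat{x}_i = (x_i - \mu)\,\text{invstd},\qquad
\frac{\partial L}{\partial \beta} = \sum_i \frac{\partial L}{\partial y_i},\qquad
\frac{\partial L}{\partial \gamma} = \sum_i \hat{x}_i\,\frac{\partial L}{\partial y_i}
\]
\[
\frac{\partial L}{\partial x_i} = \gamma\,\text{invstd}\left(
\frac{\partial L}{\partial y_i}
- \frac{1}{N}\sum_j \frac{\partial L}{\partial y_j}
- \frac{\hat{x}_i}{N}\sum_j \hat{x}_j\,\frac{\partial L}{\partial y_j}\right)
\]
\[
\mu_{\text{run}} \leftarrow m\,\mu_{\text{run}} + (1-m)\,\mu,\qquad
\sigma^2_{\text{run}} \leftarrow m\,\sigma^2_{\text{run}} + (1-m)\,\sigma^2
\]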
@@ -358,28 +372,27 @@ void BatchNormBackwardImpl(mshadow::Stream<cpu> *,
DMLC_REGISTER_PARAMETER(BatchNormParam);
static bool BatchNormShape(const nnvm::NodeAttrs& attrs,
- mxnet::ShapeVector *in_shape,
- mxnet::ShapeVector *out_shape) {
+ mxnet::ShapeVector* in_shape,
+ mxnet::ShapeVector* out_shape) {
const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
using namespace mshadow;
CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, MovingMean, MovingVar]";
CHECK_EQ(out_shape->size(), 3U);
- const mxnet::TShape &dshape = in_shape->at(batchnorm::kData);
+ const mxnet::TShape& dshape = in_shape->at(batchnorm::kData);
if (!mxnet::ndim_is_known(dshape)) {
return false;
}
- const size_t channelAxis = static_cast<size_t>(param.axis < 0
- ? static_cast<int>(dshape.ndim()) + param.axis
- : param.axis);
+ const size_t channelAxis = static_cast<size_t>(
+ param.axis < 0 ? static_cast<int>(dshape.ndim()) + param.axis : param.axis);
CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param.axis;
const index_t channelCount = dshape[channelAxis];
- in_shape->at(batchnorm::kGamma) = mxnet::TShape(Shape1(channelCount));
- in_shape->at(batchnorm::kBeta) = mxnet::TShape(Shape1(channelCount));
+ in_shape->at(batchnorm::kGamma) = mxnet::TShape(Shape1(channelCount));
+ in_shape->at(batchnorm::kBeta) = mxnet::TShape(Shape1(channelCount));
in_shape->at(batchnorm::kInMovingMean) = mxnet::TShape(Shape1(channelCount)); // kMovingMean
- in_shape->at(batchnorm::kInMovingVar) = mxnet::TShape(Shape1(channelCount)); // kMovingVar
+ in_shape->at(batchnorm::kInMovingVar) = mxnet::TShape(Shape1(channelCount)); // kMovingVar
out_shape->clear();
out_shape->push_back(dshape); // kOut
@@ -390,32 +403,33 @@ static bool BatchNormShape(const nnvm::NodeAttrs& attrs,
}
static bool BatchNormType(const nnvm::NodeAttrs& attrs,
- std::vector<int> *in_type, std::vector<int> *out_type) {
+ std::vector<int>* in_type,
+ std::vector<int>* out_type) {
using namespace mshadow;
CHECK_GE(in_type->size(), 1U);
const size_t n_out = 3;
- // For float16 input type beta, gamma, mean, and average are stored in float32.
- // For other input types, these parameters have the same type as input
- // NOTE: This requirement is from cuDNN (v. 4 and 5)
+ // For float16 input type beta, gamma, mean, and average are stored in
+ // float32. For other input types, these parameters have the same type as
+ // input NOTE: This requirement is from cuDNN (v. 4 and 5)
int dtype_param;
int dtype = (*in_type)[0];
if (type_is_none(dtype)) {
// Input type is undefined, we try backward inference
- if (out_type->size() == 0 || type_is_none((*out_type)[0])) {
- // Neither the input nor the output are defined,
- // types cannot be inferred for this op
- return false;
- } else {
- // Input type is undefined but output type is: backward inference
- dtype = (*out_type)[0];
- (*in_type)[0] = dtype;
- MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, {
- dtype_param = mshadow::DataType<AccRealX>::kFlag; });
- }
+ if (out_type->size() == 0 || type_is_none((*out_type)[0])) {
+ // Neither the input nor the output are defined,
+ // types cannot be inferred for this op
+ return false;
+ } else {
+ // Input type is undefined but output type is: backward inference
+ dtype = (*out_type)[0];
+ (*in_type)[0] = dtype;
+ MSHADOW_REAL_TYPE_SWITCH_EX(
+ dtype, DTypeX, AccRealX, { dtype_param = mshadow::DataType<AccRealX>::kFlag; });
+ }
} else {
// Input type is defined but output type is not: forward inference
- MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, {
- dtype_param = mshadow::DataType<AccRealX>::kFlag; });
+ MSHADOW_REAL_TYPE_SWITCH_EX(
+ dtype, DTypeX, AccRealX, { dtype_param = mshadow::DataType<AccRealX>::kFlag; });
out_type->clear();
out_type->push_back(dtype);
for (size_t i = 1; i < n_out; ++i) {
@@ -435,29 +449,30 @@ static bool BatchNormType(const nnvm::NodeAttrs& attrs,
}
#if MXNET_USE_MKLDNN == 1
-static inline bool SupportMKLDNNBN(const NDArray &input, const BatchNormParam &param) {
- if (mxnet::op::batchnorm::disable_mkl) return false;
+static inline bool SupportMKLDNNBN(const NDArray& input, const BatchNormParam& param) {
+ if (mxnet::op::batchnorm::disable_mkl)
+ return false;
const mxnet::TShape shape = input.shape();
- const int ndim = shape.ndim();
- if (ndim == 0 || shape.Size() == 0) return false;
+ const int ndim = shape.ndim();
+ if (ndim == 0 || shape.Size() == 0)
+ return false;
const int dtype = input.dtype();
- return (dtype == mshadow::kFloat32 ||
- dtype == mshadow::kBfloat16) &&
- SupportStorageMKLDNN(input.storage_type());
+ return (dtype == mshadow::kFloat32 || dtype == mshadow::kBfloat16) &&
+ SupportStorageMKLDNN(input.storage_type());
}
-void BatchNormComputeExCPU(const nnvm::NodeAttrs &attrs,
- const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+void BatchNormComputeExCPU(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
CHECK_EQ(inputs.size(), 5U);
- const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
- bool fuse_relu = false;
+ const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+ bool fuse_relu = false;
if (SupportMKLDNNBN(inputs[0], param)) {
MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
MKLDNN_REAL_TYPE_SWITCH(inputs[0].dtype(), DTYPE, {
- MKLDNNBatchNormForward<DTYPE>(attrs, ctx, inputs, req, outputs, fuse_relu);
+ MKLDNNBatchNormForward<DTYPE>(attrs, ctx, inputs, req, outputs, fuse_relu);
});
MKLDNN_OPCHECK_RUN(BatchNormCompute<cpu>, attrs, ctx, inputs, req, outputs);
return;
@@ -465,52 +480,53 @@ void BatchNormComputeExCPU(const nnvm::NodeAttrs &attrs,
FallBackCompute(BatchNormCompute<cpu>, attrs, ctx, inputs, req, outputs);
}
-void BatchNormGradComputeExCPU(const nnvm::NodeAttrs &attrs,
- const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
- const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
- bool fuse_relu = false;
+void BatchNormGradComputeExCPU(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
+ const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
+ bool fuse_relu = false;
if (SupportMKLDNNBN(inputs[0], param)) {
- MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
- MKLDNNBatchNormBackward<float>(attrs, ctx, inputs, req, outputs, fuse_relu);
- MKLDNN_OPCHECK_RUN(BatchNormGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
- return;
+ MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
+ MKLDNNBatchNormBackward<float>(attrs, ctx, inputs, req, outputs, fuse_relu);
+ MKLDNN_OPCHECK_RUN(BatchNormGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
+ return;
}
FallBackCompute(BatchNormGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
}
#endif
-static inline bool BatchNormStorageType(const nnvm::NodeAttrs &attrs,
+static inline bool BatchNormStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
- DispatchMode *dispatch_mode,
- std::vector<int> *in_attrs,
- std::vector<int> *out_attrs) {
- const BatchNormParam &param = nnvm::get<BatchNormParam>(attrs.parsed);
+ DispatchMode* dispatch_mode,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
+ const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
bool dispatched = false;
#if MXNET_USE_MKLDNN == 1
if (!dispatched) {
- dispatched = MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode,
- in_attrs, out_attrs);
+ dispatched = MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
}
if (!MKLDNNEnvSet()) {
*dispatch_mode = DispatchMode::kFComputeFallback;
}
#else
for (int& v : *in_attrs)
- if (v == - 1) v = kDefaultStorage;
+ if (v == -1)
+ v = kDefaultStorage;
if (!dispatched && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) {
- dispatched = storage_type_assign(out_attrs, kDefaultStorage,
- dispatch_mode, DispatchMode::kFCompute);
+ dispatched =
+ storage_type_assign(out_attrs, kDefaultStorage, dispatch_mode, DispatchMode::kFCompute);
}
if (!dispatched) {
dispatched = dispatch_fallback(out_attrs, dispatch_mode);
}
#endif
if (!common::ContainsOnlyStorage(*in_attrs, kDefaultStorage) && param.fix_gamma) {
- LOG(FATAL) << "fix_gamma=True is not supported for sparse ndarrays. Tracked at #11647";
+ LOG(FATAL) << "fix_gamma=True is not supported for sparse ndarrays. "
+ "Tracked at #11647";
}
return dispatched;
}
@@ -533,10 +549,10 @@ std::vector<nnvm::NodeEntry> BatchNormGrad(const nnvm::ObjectPtr& n,
heads.emplace_back(n->inputs.at(batchnorm::kInMovingVar));
nnvm::ObjectPtr gnode = nnvm::Node::Create();
- gnode->inputs = std::move(heads);
+ gnode->inputs = std::move(heads);
gnode->control_deps.emplace_back(n);
- gnode->attrs = n->attrs;
- gnode->attrs.op = nnvm::Op::Get("_backward_BatchNorm");
+ gnode->attrs = n->attrs;
+ gnode->attrs.op = nnvm::Op::Get("_backward_BatchNorm");
gnode->attrs.name = n->attrs.name + "_backward";
// The input of batchnorm
std::vector<nnvm::NodeEntry> in_grad;
@@ -545,8 +561,8 @@ std::vector<nnvm::NodeEntry> BatchNormGrad(const nnvm::ObjectPtr& n,
in_grad.emplace_back(gnode, i, 0);
// attach no gradient node to forbid gradient on aux_state
nnvm::ObjectPtr ng = nnvm::Node::Create();
- ng->attrs.op = Op::Get("_NoGradient");
- ng->attrs.name = "NoGradient";
+ ng->attrs.op = Op::Get("_NoGradient");
+ ng->attrs.name = "NoGradient";
// the aux state of batchnorm
for (size_t i = 3; i < 5; ++i)
in_grad.emplace_back(ng);
@@ -554,8 +570,8 @@ std::vector<nnvm::NodeEntry> BatchNormGrad(const nnvm::ObjectPtr& n,
}
NNVM_REGISTER_OP(BatchNorm)
-.add_alias("_npx_batch_norm")
-.describe(R"code(Batch normalization.
+ .add_alias("_npx_batch_norm")
+ .describe(R"code(Batch normalization.
Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as
well as offset ``beta``.
@@ -605,75 +621,82 @@ then set ``gamma`` to 1 and its gradient to 0.
the sparse tensors will fallback.
)code" ADD_FILELINE)
-.set_num_inputs(5)
-.set_num_outputs(3)
-.set_attr_parser(ParamParser<BatchNormParam>)
-.set_attr<nnvm::FListInputNames>("FListInputNames",
- [](const NodeAttrs& attrs) {
- return std::vector<std::string>{"data", "gamma", "beta", "moving_mean", "moving_var"};
-})
-.set_attr<nnvm::FListOutputNames>("FListOutputNames",
- [](const NodeAttrs& attrs) {
- return std::vector<std::string>{"output", "mean", "var"};
-})
-.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
- [](const NodeAttrs& attrs) {
- const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
- return param.output_mean_var ? 3 : 1;
-})
-.set_attr<nnvm::FMutateInputs>("FMutateInputs", [](const nnvm::NodeAttrs& attrs) {
- return std::vector<uint32_t>{3, 4};
-})
-.set_attr<mxnet::FInferShape>("FInferShape", BatchNormShape)
-.set_attr<nnvm::FInferType>("FInferType", BatchNormType)
-.set_attr<FInferStorageType>("FInferStorageType", BatchNormStorageType)
-.set_attr<FCompute>("FCompute<cpu>", BatchNormCompute<cpu>)
+ .set_num_inputs(5)
+ .set_num_outputs(3)
+ .set_attr_parser(ParamParser<BatchNormParam>)
+ .set_attr<nnvm::FListInputNames>(
+ "FListInputNames",
+ [](const NodeAttrs& attrs) {
+ return std::vector<std::string>{"data", "gamma", "beta", "moving_mean", "moving_var"};
+ })
+ .set_attr<nnvm::FListOutputNames>("FListOutputNames",
+ [](const NodeAttrs& attrs) {
+ return std::vector<std::string>{"output", "mean", "var"};
+ })
+ .set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs",
+ [](const NodeAttrs& attrs) {
+ const BatchNormParam& param =
+ nnvm::get<BatchNormParam>(attrs.parsed);
+ return param.output_mean_var ? 3 : 1;
+ })
+ .set_attr<nnvm::FMutateInputs>("FMutateInputs",
+ [](const nnvm::NodeAttrs& attrs) {
+ return std::vector<uint32_t>{3, 4};
+ })
+ .set_attr<mxnet::FInferShape>("FInferShape", BatchNormShape)
+ .set_attr<nnvm::FInferType>("FInferType", BatchNormType)
+ .set_attr<FInferStorageType>("FInferStorageType", BatchNormStorageType)
+ .set_attr<FCompute>("FCompute<cpu>", BatchNormCompute<cpu>)
#if MXNET_USE_MKLDNN == 1
-.set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormComputeExCPU)
+ .set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormComputeExCPU)
#endif
-.set_attr<nnvm::FGradient>("FGradient", BatchNormGrad)
+ .set_attr<nnvm::FGradient>("FGradient", BatchNormGrad)
#if MXNET_USE_MKLDNN == 1
-.set_attr<bool>("TIsMKLDNN", true)
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
- return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
+ .set_attr<bool>("TIsMKLDNN", true)
+ .set_attr<FResourceRequest>("FResourceRequest",
+ [](const NodeAttrs& n) {
+ return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+ })
#endif
-.add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization")
-.add_argument("gamma", "NDArray-or-Symbol", "gamma array")
-.add_argument("beta", "NDArray-or-Symbol", "beta array")
-.add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input")
-.add_argument("moving_var", "NDArray-or-Symbol", "running variance of input")
-.add_arguments(BatchNormParam::__FIELDS__())
-.set_attr<nnvm::FSetInputVarAttrOnCompose>(
- "FSetInputVarAttrOnCompose",
- [](const nnvm::NodeAttrs& attrs, nnvm::ObjectPtr var, const int index) {
- if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return;
- if (index == 3) {
- var->attrs.dict["__init__"] = "[\"zero\", {}]";
- } else if (index == 4) {
- var->attrs.dict["__init__"] = "[\"one\", {}]";
- }
- });
+ .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization")
+ .add_argument("gamma", "NDArray-or-Symbol", "gamma array")
+ .add_argument("beta", "NDArray-or-Symbol", "beta array")
+ .add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input")
+ .add_argument("moving_var", "NDArray-or-Symbol", "running variance of input")
+ .add_arguments(BatchNormParam::__FIELDS__())
+ .set_attr<nnvm::FSetInputVarAttrOnCompose>(
+ "FSetInputVarAttrOnCompose",
+ [](const nnvm::NodeAttrs& attrs, nnvm::ObjectPtr var, const int index) {
+ if (var->attrs.dict.find("__init__") != var->attrs.dict.end())
+ return;
+ if (index == 3) {
+ var->attrs.dict["__init__"] = "[\"zero\", {}]";
+ } else if (index == 4) {
+ var->attrs.dict["__init__"] = "[\"one\", {}]";
+ }
+ });
NNVM_REGISTER_OP(_backward_BatchNorm)
-.set_num_inputs(8)
-.set_num_outputs(3)
-.set_attr<nnvm::FMutateInputs>("FMutateInputs", [](const nnvm::NodeAttrs& attrs) {
- return std::vector<uint32_t>{6, 7}; // moving_mean, moving_var
-})
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr<FInferStorageType>("FInferStorageType", BatchNormStorageType)
+ .set_num_inputs(8)
+ .set_num_outputs(3)
+ .set_attr<nnvm::FMutateInputs>("FMutateInputs",
+ [](const nnvm::NodeAttrs& attrs) {
+ return std::vector<uint32_t>{6, 7}; // moving_mean, moving_var
+ })
+ .set_attr<nnvm::TIsBackward>("TIsBackward", true)
+ .set_attr<FInferStorageType>("FInferStorageType", BatchNormStorageType)
#if MXNET_USE_MKLDNN == 1
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
- return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
+ .set_attr<FResourceRequest>("FResourceRequest",
+ [](const NodeAttrs& n) {
+ return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+ })
#endif
-.set_attr_parser(ParamParser<BatchNormParam>)
+ .set_attr_parser(ParamParser<BatchNormParam>)
#if MXNET_USE_MKLDNN == 1
-.set_attr<bool>("TIsMKLDNN", true)
-.set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormGradComputeExCPU)
+ .set_attr<bool>("TIsMKLDNN", true)
+ .set_attr<FComputeEx>("FComputeEx<cpu>", BatchNormGradComputeExCPU)
#endif
-.set_attr<FCompute>("FCompute<cpu>", BatchNormGradCompute<cpu>);
+ .set_attr<FCompute>("FCompute<cpu>", BatchNormGradCompute<cpu>);
} // namespace op
} // namespace mxnet
diff --git a/src/operator/nn/batch_norm.cu b/src/operator/nn/batch_norm.cu
index 40d677a..0c70f85 100644
--- a/src/operator/nn/batch_norm.cu
+++ b/src/operator/nn/batch_norm.cu
@@ -47,10 +47,6 @@
using namespace mxnet;
-/*! \brief inverse standard deviation <-> variance */
-#define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0/sqrt((__var$) + DType(__eps$)))
-#define INVSTD_TO_VARIANCE(__invstd$, __eps$) ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$))
-
namespace mxnet {
namespace op {
namespace batchnorm {
diff --git a/src/operator/nn/mkldnn/mkldnn_act-inl.h b/src/operator/nn/mkldnn/mkldnn_act-inl.h
index 70bf16a..ad5d70e 100644
--- a/src/operator/nn/mkldnn/mkldnn_act-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_act-inl.h
@@ -22,17 +22,17 @@
* \file mkldnn_act-inl.h
* \brief MKLDNN Activation operator
 * \author Zhiyuan Huang
-*/
+ */
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_ACT_INL_H_
-
#if MXNET_USE_MKLDNN == 1
-#include <vector>
#include <utility>
-#include "../activation-inl.h"
+#include <vector>
+
#include "../../leaky_relu-inl.h"
+#include "../activation-inl.h"
namespace mxnet {
namespace op {
@@ -42,53 +42,56 @@ struct MKLDNNActParam {
float slope = 0.f;
bool operator==(const MKLDNNActParam& other) const {
- return this->alg == other.alg &&
- this->slope == other.slope;
+ return this->alg == other.alg && this->slope == other.slope;
}
};
mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param);
mkldnn::algorithm GetMKLDNNActAlgo(const LeakyReLUParam& param);
-mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(
- const MKLDNNActParam& param, bool is_train,
- const mkldnn::memory &input_mem);
+mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(const MKLDNNActParam& param,
+ bool is_train,
+ const mkldnn::memory& input_mem);
class MKLDNNActForward {
public:
const mkldnn::eltwise_forward::primitive_desc fwd_pd;
- MKLDNNActForward(const MKLDNNActParam& param, bool is_train,
- const NDArray &data, const mkldnn::memory &mem): fwd_pd(
- GetActFwdDescImpl(param, is_train, mem)) {
+ MKLDNNActForward(const MKLDNNActParam& param,
+ bool is_train,
+ const NDArray& data,
+ const mkldnn::memory& mem)
+ : fwd_pd(GetActFwdDescImpl(param, is_train, mem)) {
fwd_ = std::make_shared<mkldnn::eltwise_forward>(fwd_pd);
}
- const inline mkldnn::eltwise_forward &GetFwd() const;
+ const inline mkldnn::eltwise_forward& GetFwd() const;
private:
std::shared_ptr<mkldnn::eltwise_forward> fwd_;
};
typedef ParamOpSign<MKLDNNActParam> MKLDNNActSignature;
-MKLDNNActForward &GetActForward(const MKLDNNActParam& param,
- const OpContext &ctx, const NDArray &in_data,
- const mkldnn::memory &in_mem);
+MKLDNNActForward& GetActForward(const MKLDNNActParam& param,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const mkldnn::memory& in_mem);
-mkldnn::eltwise_backward::primitive_desc GetActBwdDescImpl(
- const MKLDNNActParam &param, const mkldnn::memory &input_mem,
- const mkldnn::memory &diff_dst_memory);
+mkldnn::eltwise_backward::primitive_desc GetActBwdDescImpl(const MKLDNNActParam& param,
+ const mkldnn::memory& input_mem,
+ const mkldnn::memory& diff_dst_memory);
class MKLDNNActBackward {
public:
const mkldnn::eltwise_backward::primitive_desc bwd_pd;
- explicit MKLDNNActBackward(const MKLDNNActParam &param, const NDArray &data,
- const mkldnn::memory &mem,
- const mkldnn::memory &diff_dst_memory): bwd_pd(
- GetActBwdDescImpl(param, mem, diff_dst_memory)) {
+ explicit MKLDNNActBackward(const MKLDNNActParam& param,
+ const NDArray& data,
+ const mkldnn::memory& mem,
+ const mkldnn::memory& diff_dst_memory)
+ : bwd_pd(GetActBwdDescImpl(param, mem, diff_dst_memory)) {
bwd_prim_ = std::make_shared<mkldnn::eltwise_backward>(bwd_pd);
}
- const inline mkldnn::eltwise_backward &GetBwd() const;
+ const inline mkldnn::eltwise_backward& GetBwd() const;
private:
std::shared_ptr<mkldnn::eltwise_backward> bwd_prim_;
@@ -97,12 +100,12 @@ class MKLDNNActBackward {
} // namespace mxnet
namespace std {
-template<>
+template <>
struct hash<mxnet::op::MKLDNNActParam> {
size_t operator()(const mxnet::op::MKLDNNActParam& val) {
size_t ret = 0;
- ret = dmlc::HashCombine(ret, static_cast<size_t>(val.alg));
- ret = dmlc::HashCombine(ret, val.slope);
+ ret = dmlc::HashCombine(ret, static_cast<size_t>(val.alg));
+ ret = dmlc::HashCombine(ret, val.slope);
return ret;
}
};
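GetActForward and GetActBackward, declared above, hand back primitives that are cached per thread and keyed by an MKLDNNActSignature derived from the activation parameters (the unordered_map lookups appear in mkldnn_act.cc just below). A minimal sketch of that lookup-or-create pattern, with toy key and primitive types standing in for the MKLDNN ones:

// Signature-keyed, thread-local cache: build a primitive once per unique key,
// reuse it afterwards. Toy types only; the real code hashes MKLDNNActParam
// (and the input array) into the key.
#include <cstdio>
#include <string>
#include <unordered_map>

struct Primitive {
  explicit Primitive(int alg) : alg_(alg) { std::printf("built primitive %d\n", alg); }
  int alg_;
};

Primitive& GetCachedPrimitive(const std::string& key, int alg) {
  static thread_local std::unordered_map<std::string, Primitive> cache;
  auto it = cache.find(key);
  if (it == cache.end())
    it = cache.emplace(key, Primitive(alg)).first;  // construct once per unique key
  return it->second;
}

int main() {
  GetCachedPrimitive("relu/f32/1x3x224x224", 0);
  GetCachedPrimitive("relu/f32/1x3x224x224", 0);  // cache hit: nothing is rebuilt
  return 0;
}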
diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc
index 29ff8d9..6f4ac3d 100644
--- a/src/operator/nn/mkldnn/mkldnn_act.cc
+++ b/src/operator/nn/mkldnn/mkldnn_act.cc
@@ -21,57 +21,54 @@
* \file mkldnn_act.cc
* \brief
* \author Da Zheng
-*/
+ */
#if MXNET_USE_MKLDNN == 1
#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <mxnet/operator.h>
+
#include <algorithm>
#include <map>
-#include <vector>
#include <string>
#include <utility>
+#include <vector>
+
#include "../../operator_common.h"
-#include "mkldnn_act-inl.h"
#include "./mkldnn_base-inl.h"
+#include "mkldnn_act-inl.h"
namespace mxnet {
namespace op {
bool SupportMKLDNNAct(const ActivationParam& param) {
- return param.act_type == activation::kReLU
- || param.act_type == activation::kSigmoid
- || param.act_type == activation::kSoftReLU
- || param.act_type == activation::kTanh;
+ return param.act_type == activation::kReLU || param.act_type == activation::kSigmoid ||
+ param.act_type == activation::kSoftReLU || param.act_type == activation::kTanh;
}
-bool SupportMKLDNNAct(const ActivationParam& param, const NDArray &input) {
+bool SupportMKLDNNAct(const ActivationParam& param, const NDArray& input) {
// MKL-DNN Activation supports 1d, 2d, 3d, 4d and 5d data layout
- if ((input.shape().ndim() < 1) ||
- (input.shape().ndim() > 5) ||
+ if ((input.shape().ndim() < 1) || (input.shape().ndim() > 5) ||
!(input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16))
return false;
return SupportMKLDNNAct(param);
}
bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param) {
- return param.act_type == leakyrelu::kLeakyReLU
- || param.act_type == leakyrelu::kELU
- || param.act_type == leakyrelu::kGELU;
+ return param.act_type == leakyrelu::kLeakyReLU || param.act_type == leakyrelu::kELU ||
+ param.act_type == leakyrelu::kGELU;
}
-bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param, const NDArray &input) {
+bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param, const NDArray& input) {
// MKL-DNN Activation supports 1d, 2d, 3d, 4d and 5d data layout
- if ((input.shape().ndim() < 1) ||
- (input.shape().ndim() > 5) ||
+ if ((input.shape().ndim() < 1) || (input.shape().ndim() > 5) ||
!(input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16))
return false;
return SupportMKLDNNLeakyRelu(param);
}
-bool SupportQuantizedMKLDNNAct(const ActivationParam &param) {
+bool SupportQuantizedMKLDNNAct(const ActivationParam& param) {
// TODO(zhennan): Add more activation type when mkldnn supports.
// Remove this when it's identity to SupportMKLDNNAct.
return param.act_type == activation::kReLU;
@@ -107,26 +104,26 @@ mkldnn::algorithm GetMKLDNNActAlgo(const LeakyReLUParam& param) {
}
}
-mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(
- const MKLDNNActParam& param, bool is_train,
- const mkldnn::memory &input_mem) {
+mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl(const MKLDNNActParam& param,
+ bool is_train,
+ const mkldnn::memory& input_mem) {
mkldnn::memory::desc data_md = input_mem.get_desc();
- auto cpu_engine = CpuEngine::Get()->get_engine();
- auto alg = param.alg;
+ auto cpu_engine = CpuEngine::Get()->get_engine();
+ auto alg = param.alg;
- auto prop = is_train ? mkldnn::prop_kind::forward_training :
- mkldnn::prop_kind::forward_scoring;
+ auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
auto desc = mkldnn::eltwise_forward::desc(prop, alg, data_md, param.slope);
return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine);
}
-const inline mkldnn::eltwise_forward &MKLDNNActForward::GetFwd() const {
+const inline mkldnn::eltwise_forward& MKLDNNActForward::GetFwd() const {
return *fwd_;
}
-MKLDNNActForward &GetActForward(const MKLDNNActParam& param,
- const OpContext &ctx, const NDArray &in_data,
- const mkldnn::memory &in_mem) {
+MKLDNNActForward& GetActForward(const MKLDNNActParam& param,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const mkldnn::memory& in_mem) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<MKLDNNActSignature, MKLDNNActForward, OpHash> fwds;
#else
@@ -145,72 +142,75 @@ MKLDNNActForward &GetActForward(const MKLDNNActParam& param,
return it->second;
}
-void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
- const NDArray &in_data, const OpReqType &req,
- const NDArray &out_data) {
+void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const OpReqType& req,
+ const NDArray& out_data) {
const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
MKLDNNActParam param_;
- param_.alg = GetMKLDNNActAlgo(param);
+ param_.alg = GetMKLDNNActAlgo(param);
const NDArray& in_buffer = in_data;
- MKLDNNStream *stream = MKLDNNStream::Get();
- auto input_mem = in_buffer.GetMKLDNNData();
- MKLDNNActForward &fwd = GetActForward(param_, ctx, in_buffer, *input_mem);
- auto out_mem_t = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer);
+ MKLDNNStream* stream = MKLDNNStream::Get();
+ auto input_mem = in_buffer.GetMKLDNNData();
+ MKLDNNActForward& fwd = GetActForward(param_, ctx, in_buffer, *input_mem);
+ auto out_mem_t = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer);
stream->RegisterPrimArgs(fwd.GetFwd(),
- {{ MKLDNN_ARG_SRC, *input_mem}, { MKLDNN_ARG_DST, *out_mem_t.second}});
+ {{MKLDNN_ARG_SRC, *input_mem}, {MKLDNN_ARG_DST, *out_mem_t.second}});
CommitOutput(out_data, out_mem_t);
stream->Submit();
}
-void MKLDNNLeakyReluForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
- const NDArray &in_data, const OpReqType &req,
- const NDArray &out_data) {
+void MKLDNNLeakyReluForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const OpReqType& req,
+ const NDArray& out_data) {
const LeakyReLUParam& param = nnvm::get<LeakyReLUParam>(attrs.parsed);
MKLDNNActParam param_;
- param_.alg = GetMKLDNNActAlgo(param);
+ param_.alg = GetMKLDNNActAlgo(param);
param_.slope = param.slope;
- NDArray in_buffer = in_data;
- MKLDNNStream *stream = MKLDNNStream::Get();
+ NDArray in_buffer = in_data;
+ MKLDNNStream* stream = MKLDNNStream::Get();
if (in_data.IsView() && in_data.IsMKLDNNData())
in_buffer = in_data.Reorder2Default();
- auto input_mem = in_buffer.GetMKLDNNData();
- MKLDNNActForward &fwd = GetActForward(param_, ctx, in_buffer, *input_mem);
- auto out_mem_t = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer);
+ auto input_mem = in_buffer.GetMKLDNNData();
+ MKLDNNActForward& fwd = GetActForward(param_, ctx, in_buffer, *input_mem);
+ auto out_mem_t = CreateMKLDNNMem(out_data, fwd.fwd_pd.dst_desc(), req, &in_buffer);
stream->RegisterPrimArgs(fwd.GetFwd(),
- {{ MKLDNN_ARG_SRC, *input_mem}, { MKLDNN_ARG_DST, *out_mem_t.second}});
+ {{MKLDNN_ARG_SRC, *input_mem}, {MKLDNN_ARG_DST, *out_mem_t.second}});
CommitOutput(out_data, out_mem_t);
stream->Submit();
}
-mkldnn::eltwise_backward::primitive_desc GetActBwdDescImpl(
- const MKLDNNActParam &param, const mkldnn::memory &input_mem,
- const mkldnn::memory &diff_dst_memory) {
+mkldnn::eltwise_backward::primitive_desc GetActBwdDescImpl(const MKLDNNActParam& param,
+ const mkldnn::memory& input_mem,
+ const mkldnn::memory& diff_dst_memory) {
mkldnn::memory::desc data_md = input_mem.get_desc();
mkldnn::memory::desc diff_md = diff_dst_memory.get_desc();
- auto cpu_engine = CpuEngine::Get()->get_engine();
- auto alg = param.alg;
+ auto cpu_engine = CpuEngine::Get()->get_engine();
+ auto alg = param.alg;
- mkldnn::eltwise_forward::desc fw_desc(mkldnn::prop_kind::forward_training,
- alg, data_md, param.slope);
+ mkldnn::eltwise_forward::desc fw_desc(
+ mkldnn::prop_kind::forward_training, alg, data_md, param.slope);
mkldnn::eltwise_forward::primitive_desc fw_pdesc(fw_desc, cpu_engine);
mkldnn::eltwise_backward::desc bw_desc(alg, diff_md, data_md, param.slope);
- mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine,
- fw_pdesc);
+ mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine, fw_pdesc);
return bw_pdesc;
}
-const inline mkldnn::eltwise_backward &MKLDNNActBackward::GetBwd() const {
+const inline mkldnn::eltwise_backward& MKLDNNActBackward::GetBwd() const {
return *bwd_prim_;
}
-static inline MKLDNNActBackward &GetActBackward(const MKLDNNActParam &param,
- const OpContext &ctx,
- const NDArray &in_data,
- const NDArray &out_grad,
- const mkldnn::memory &in_mem) {
+static inline MKLDNNActBackward& GetActBackward(const MKLDNNActParam& param,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const NDArray& out_grad,
+ const mkldnn::memory& in_mem) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<MKLDNNActSignature, MKLDNNActBackward, OpHash> bwds;
#else
@@ -228,38 +228,38 @@ static inline MKLDNNActBackward &GetActBackward(const MKLDNNActParam &param,
return it->second;
}
-// For backward relu activation, it's okay to pass "out_data" as "in_data" to this
-// function, since the computation only involves non-zeros.
-void MKLDNNActivationBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &inputs, const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+// For backward relu activation, it's okay to pass "out_data" as "in_data" to
+// this function, since the computation only involves non-zeros.
+void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
if (req[0] == kNullOp) {
return;
}
const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed);
// XXX: for y = relu(x), y is passed as "in_data" to Backward()
- const bool relu = param.act_type == activation::kReLU;
- const NDArray &out_buffer = inputs[0];
- const NDArray &in_buffer = relu ? inputs[1] : inputs[2];
- const NDArray &in_grad = outputs[0];
+ const bool relu = param.act_type == activation::kReLU;
+ const NDArray& out_buffer = inputs[0];
+ const NDArray& in_buffer = relu ? inputs[1] : inputs[2];
+ const NDArray& in_grad = outputs[0];
MKLDNNActParam param_;
param_.alg = GetMKLDNNActAlgo(param);
TmpMemMgr::Get()->Init(ctx.requested[activation::kTempSpace]);
auto diff_dst_memory = out_buffer.GetMKLDNNData();
- auto input_mem = in_buffer.GetMKLDNNData();
+ auto input_mem = in_buffer.GetMKLDNNData();
 // We need to make sure the two inputs to eltwise_backward have the same memory
// descriptor. Otherwise, the perf will suffer.
if (input_mem->get_desc() != diff_dst_memory->get_desc())
input_mem = in_buffer.GetMKLDNNDataReorder(diff_dst_memory->get_desc());
- MKLDNNActBackward &bwd =
- GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem);
- MKLDNNStream *stream = MKLDNNStream::Get();
- mkldnn_output_t diff_src_memory =
- CreateMKLDNNMem(in_grad, bwd.bwd_pd.diff_src_desc(), req[0]);
- mkldnn_args_map_t args = {
- { MKLDNN_ARG_SRC, *input_mem },
- { MKLDNN_ARG_DIFF_DST, *diff_dst_memory },
- { MKLDNN_ARG_DIFF_SRC, *diff_src_memory.second },
+ MKLDNNActBackward& bwd = GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem);
+ MKLDNNStream* stream = MKLDNNStream::Get();
+ mkldnn_output_t diff_src_memory = CreateMKLDNNMem(in_grad, bwd.bwd_pd.diff_src_desc(), req[0]);
+ mkldnn_args_map_t args = {
+ {MKLDNN_ARG_SRC, *input_mem},
+ {MKLDNN_ARG_DIFF_DST, *diff_dst_memory},
+ {MKLDNN_ARG_DIFF_SRC, *diff_src_memory.second},
};
stream->RegisterPrimArgs(bwd.GetBwd(), args);
CommitOutput(in_grad, diff_src_memory);
@@ -267,40 +267,38 @@ void MKLDNNActivationBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx
}
void MKLDNNLeakyReluBackward(const nnvm::NodeAttrs& attrs,
- const OpContext &ctx,
+ const OpContext& ctx,
const std::vector<NDArray>& inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
if (req[0] == kNullOp) {
return;
}
CHECK_EQ(inputs.size(), 2U);
CHECK_EQ(outputs.size(), 1U);
const NDArray& out_buffer = inputs[0];
- const NDArray& in_buffer = inputs[1];
- const NDArray &output = outputs[0];
+ const NDArray& in_buffer = inputs[1];
+ const NDArray& output = outputs[0];
const LeakyReLUParam& param = nnvm::get<LeakyReLUParam>(attrs.parsed);
MKLDNNActParam param_;
- param_.alg = GetMKLDNNActAlgo(param);
+ param_.alg = GetMKLDNNActAlgo(param);
param_.slope = param.slope;
TmpMemMgr::Get()->Init(ctx.requested[leakyrelu::kRandom]);
auto diff_dst_memory = out_buffer.GetMKLDNNData();
- auto input_mem = in_buffer.GetMKLDNNData();
+ auto input_mem = in_buffer.GetMKLDNNData();
 // We need to make sure the two inputs to eltwise_backward have the same memory
// descriptor. Otherwise, the perf will suffer.
if (input_mem->get_desc() != diff_dst_memory->get_desc())
input_mem = in_buffer.GetMKLDNNDataReorder(diff_dst_memory->get_desc());
- MKLDNNActBackward &bwd =
- GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem);
- MKLDNNStream *stream = MKLDNNStream::Get();
- mkldnn_output_t diff_src_memory =
- CreateMKLDNNMem(output, bwd.bwd_pd.diff_src_desc(), req[0]);
- mkldnn_args_map_t args = {
- { MKLDNN_ARG_SRC, *input_mem },
- { MKLDNN_ARG_DIFF_DST, *diff_dst_memory },
- { MKLDNN_ARG_DIFF_SRC, *diff_src_memory.second },
+ MKLDNNActBackward& bwd = GetActBackward(param_, ctx, in_buffer, out_buffer, *input_mem);
+ MKLDNNStream* stream = MKLDNNStream::Get();
+ mkldnn_output_t diff_src_memory = CreateMKLDNNMem(output, bwd.bwd_pd.diff_src_desc(), req[0]);
+ mkldnn_args_map_t args = {
+ {MKLDNN_ARG_SRC, *input_mem},
+ {MKLDNN_ARG_DIFF_DST, *diff_dst_memory},
+ {MKLDNN_ARG_DIFF_SRC, *diff_src_memory.second},
};
stream->RegisterPrimArgs(bwd.GetBwd(), args);
CommitOutput(output, diff_src_memory);
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index cb30b0b..48c7445 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -18,30 +18,30 @@
*/
/*******************************************************************************
-* Copyright 2016-2017 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*
-* \file mkldnn_base-inl.h
-* \brief
-* \author young.jin.kim@intel.com
-* ashok.emani@intel.com
-* deepthi.karkada@intel.com
-* louis.feng@intel.com
-* adam.d.straw@intel.com
-* zhengda1936@gmail.com
-*
-*******************************************************************************/
+ * Copyright 2016-2017 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * \file mkldnn_base-inl.h
+ * \brief
+ * \author young.jin.kim@intel.com
+ * ashok.emani@intel.com
+ * deepthi.karkada@intel.com
+ * louis.feng@intel.com
+ * adam.d.straw@intel.com
+ * zhengda1936@gmail.com
+ *
+ *******************************************************************************/
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_
@@ -54,28 +54,25 @@
#include <unordered_map>
#include <utility>
#include <vector>
+
#include "mkldnn.hpp"
#include "mxnet/graph_attr_types.h"
#include "mxnet/ndarray.h"
#include "mxnet/op_attr_types.h"
#include "mxnet/resource.h"
-#define MKLDNN_REAL_TYPE_SWITCH(type, DType, ...) \
- switch (type) { \
- case mshadow::kFloat32: \
- { \
- typedef float DType; \
- {__VA_ARGS__} \
- } \
- break; \
- case mshadow::kBfloat16: \
- { \
- typedef mshadow::bfloat::bf16_t DType; \
- {__VA_ARGS__} \
- } \
- break; \
- default: \
- LOG(FATAL) << "Unknown type enum " << type; \
+#define MKLDNN_REAL_TYPE_SWITCH(type, DType, ...) \
+ switch (type) { \
+ case mshadow::kFloat32: { \
+ typedef float DType; \
+ { __VA_ARGS__ } \
+ } break; \
+ case mshadow::kBfloat16: { \
+ typedef mshadow::bfloat::bf16_t DType; \
+ { __VA_ARGS__ } \
+ } break; \
+ default: \
+ LOG(FATAL) << "Unknown type enum " << type; \
}
namespace mxnet {
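Note: MKLDNN_REAL_TYPE_SWITCH expands the given block once per supported real type (fp32 and bf16), typedef-ing DType inside each case. A hedged sketch of a typical call site (hypothetical function, not in this patch):

    // Hypothetical call site: initialize a raw buffer whose element type is
    // only known at run time; the macro binds DType in each dtype branch.
    void FillOnes(void* ptr, size_t n, int dtype) {
      MKLDNN_REAL_TYPE_SWITCH(dtype, DType, {
        DType* data = static_cast<DType*>(ptr);
        for (size_t i = 0; i < n; ++i)
          data[i] = static_cast<DType>(1.0f);
      });
    }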
@@ -84,18 +81,20 @@ namespace mxnet {
// cpu_engine singleton
class CpuEngine {
public:
- static CpuEngine *Get() {
+ static CpuEngine* Get() {
    // It's thread-safe in C++11.
// ensure same mkldnn engine is used across threads
static CpuEngine myInstance;
return &myInstance;
}
- CpuEngine(CpuEngine const &) = delete; // Copy construct
- CpuEngine(CpuEngine &&) = delete; // Move construct
- CpuEngine &operator=(CpuEngine const &) = delete; // Copy assign
- CpuEngine &operator=(CpuEngine &&) = delete; // Move assign
+ CpuEngine(CpuEngine const&) = delete; // Copy construct
+ CpuEngine(CpuEngine&&) = delete; // Move construct
+ CpuEngine& operator=(CpuEngine const&) = delete; // Copy assign
+ CpuEngine& operator=(CpuEngine&&) = delete; // Move assign
- mkldnn::engine &get_engine() { return _cpu_engine; }
+ mkldnn::engine& get_engine() {
+ return _cpu_engine;
+ }
protected:
CpuEngine() : _cpu_engine(mkldnn::engine::kind::cpu, 0) {}
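Note: CpuEngine is a Meyers singleton (the function-local static is initialized exactly once, thread-safely under C++11) with copy and move disabled, so every oneDNN object in the library shares one CPU engine. A one-line usage sketch (hypothetical helper):

    // Hypothetical helper: wrap an existing buffer in an mkldnn::memory bound
    // to the process-wide CPU engine.
    mkldnn::memory WrapBuffer(const mkldnn::memory::desc& md, void* handle) {
      return mkldnn::memory(md, CpuEngine::Get()->get_engine(), handle);
    }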
@@ -134,10 +133,10 @@ struct data_type_enum<uint8_t> {
enum { type = static_cast<unsigned int>(mkldnn::memory::data_type::u8) };
};
-static inline bool SupportMKLDNNArray(int dtype, const mxnet::TShape &shape) {
- int ndim = shape.ndim();
+static inline bool SupportMKLDNNArray(int dtype, const mxnet::TShape& shape) {
+ int ndim = shape.ndim();
bool support = ndim == 1 || ndim == 2 || ndim == 4;
- support = support &&
+ support = support &&
(dtype == mshadow::kFloat32 || dtype == mshadow::kInt32 || dtype == mshadow::kInt8 ||
dtype == mshadow::kUint8 || dtype == mshadow::kBfloat16);
return support;
@@ -147,24 +146,23 @@ static inline bool SupportStorageMKLDNN(int stype) {
return stype == kDefaultStorage;
}
-static inline bool SupportMKLDNN(int dtype, const mxnet::TShape &shape) {
+static inline bool SupportMKLDNN(int dtype, const mxnet::TShape& shape) {
int ndim = shape.ndim();
if (ndim == 0 || shape.Size() == 0) {
// MKLDNN currently does not support 0-dim Tensor and 0-size Tensor
return false;
}
return (dtype == mshadow::kFloat32 || dtype == mshadow::kBfloat16) &&
- (ndim == 1 || ndim == 2 || ndim == 4);
+ (ndim == 1 || ndim == 2 || ndim == 4);
}
static inline bool SupportMKLDNNQuantize(int dtype) {
- return dtype == mshadow::kFloat32 || dtype == mshadow::kInt8 ||
- dtype == mshadow::kUint8 || dtype == mshadow::kBfloat16;
+ return dtype == mshadow::kFloat32 || dtype == mshadow::kInt8 || dtype == mshadow::kUint8 ||
+ dtype == mshadow::kBfloat16;
}
-static inline bool SupportMKLDNN(const NDArray &input) {
- return SupportMKLDNN(input.dtype(), input.shape())
- && SupportStorageMKLDNN(input.storage_type());
+static inline bool SupportMKLDNN(const NDArray& input) {
+ return SupportMKLDNN(input.dtype(), input.shape()) && SupportStorageMKLDNN(input.storage_type());
}
static inline bool MKLDNNEnvSet() {
@@ -177,10 +175,12 @@ static inline int GetMKLDNNCacheSize() {
return mkldnn_cache_size;
}
-// TODO(alex): (MXNET-1075) Will remove env variable and calculate cache size during runtime
-template<typename S, typename I, typename H>
-static typename std::unordered_map<S, I, H>::iterator AddToCache(
- std::unordered_map<S, I, H>* cache, const S &key, const I &item) {
+// TODO(alex): (MXNET-1075) Will remove env variable and calculate cache size
+// during runtime
+template <typename S, typename I, typename H>
+static typename std::unordered_map<S, I, H>::iterator AddToCache(std::unordered_map<S, I, H>* cache,
+ const S& key,
+ const I& item) {
int mkldnn_cache_size = GetMKLDNNCacheSize();
if (mkldnn_cache_size != -1 && static_cast<int>(cache->size()) > mkldnn_cache_size)
cache->erase(cache->begin());
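Note: AddToCache puts a very coarse bound on the per-thread primitive caches: once the map grows past GetMKLDNNCacheSize() entries, an arbitrary entry (cache->begin()) is evicted before the new item is inserted, and -1 means unbounded. A hedged sketch of the usual lookup-or-create pattern built on top of it (generic names, not from this patch):

    // Illustrative lookup-or-create helper over a signature-keyed cache.
    // Sig/Prim/Hash stand in for the concrete signature, primitive-wrapper and
    // hash types used by the individual operators.
    template <typename Sig, typename Prim, typename Hash, typename Create>
    Prim& LookupOrCreate(std::unordered_map<Sig, Prim, Hash>* cache,
                         const Sig& key,
                         Create create) {
      auto it = cache->find(key);
      if (it == cache->end())
        it = AddToCache(cache, key, create());  // may evict one entry first
      return it->second;
    }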
@@ -192,7 +192,7 @@ static typename std::unordered_map<S, I, H>::iterator AddToCache(
/*
* This is to align address to a certain alignment.
*/
-void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space);
+void* AlignMem(void* mem, size_t size, size_t alignment, size_t* space);
namespace op {
struct ActivationParam;
@@ -204,29 +204,28 @@ struct SoftmaxOutputParam;
struct TransposeParam;
struct ReshapeParam;
bool SupportMKLDNNAct(const ActivationParam& param);
-bool SupportMKLDNNAct(const ActivationParam& param, const NDArray &input);
+bool SupportMKLDNNAct(const ActivationParam& param, const NDArray& input);
bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param);
-bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param, const NDArray &input);
-bool SupportQuantizedMKLDNNAct(const ActivationParam ¶m);
-bool SupportMKLDNNConv(const ConvolutionParam ¶ms, const NDArray &input);
-bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray &input);
-bool SupportMKLDNNSoftmax(const SoftmaxParam& param, const NDArray &input, const NDArray &output);
-bool SupportMKLDNNLogSoftmax(const SoftmaxParam& param, const NDArray &input,
- const NDArray &output);
-bool SupportMKLDNNSoftmaxOutput(const SoftmaxOutputParam ¶m);
-bool SupportMKLDNNTranspose(const TransposeParam& param, const NDArray &data);
-bool SupportMKLDNNBatchDot(const std::vector<NDArray> &inputs, const NDArray &output);
+bool SupportMKLDNNLeakyRelu(const LeakyReLUParam& param, const NDArray& input);
+bool SupportQuantizedMKLDNNAct(const ActivationParam& param);
+bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray& input);
+bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray& input);
+bool SupportMKLDNNSoftmax(const SoftmaxParam& param, const NDArray& input, const NDArray& output);
+bool SupportMKLDNNLogSoftmax(const SoftmaxParam& param,
+ const NDArray& input,
+ const NDArray& output);
+bool SupportMKLDNNSoftmaxOutput(const SoftmaxOutputParam& param);
+bool SupportMKLDNNTranspose(const TransposeParam& param, const NDArray& data);
+bool SupportMKLDNNBatchDot(const std::vector<NDArray>& inputs, const NDArray& output);
} // namespace op
static int GetTypeSize(int dtype) {
int size = -1;
- MSHADOW_TYPE_SWITCH(dtype, DType, {
- size = sizeof(DType);
- });
+ MSHADOW_TYPE_SWITCH(dtype, DType, { size = sizeof(DType); });
return size;
}
-static inline size_t GetArraySize(const NDArray &arr) {
+static inline size_t GetArraySize(const NDArray& arr) {
if (arr.IsMKLDNNData()) {
return arr.GetMKLDNNData()->get_desc().get_size();
}
@@ -251,7 +250,7 @@ static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) {
}
}
-template<typename T>
+template <typename T>
static inline mkldnn::memory::data_type get_mkldnn_type() {
return static_cast<mkldnn::memory::data_type>(data_type_enum<T>::type);
}
@@ -260,12 +259,11 @@ static inline mkldnn_data_type_t get_mkldnn_type_t(int dtype) {
return static_cast<mkldnn_data_type_t>(get_mkldnn_type(dtype));
}
-template<typename T>
+template <typename T>
static inline mkldnn_data_type_t get_mkldnn_type_t() {
return static_cast<mkldnn_data_type_t>(data_type_enum<T>::type);
}
-
static inline int get_mxnet_type(mkldnn_data_type_t dtype) {
auto mkldnn_dtype = static_cast<mkldnn::memory::data_type>(dtype);
switch (mkldnn_dtype) {
@@ -285,8 +283,9 @@ static inline int get_mxnet_type(mkldnn_data_type_t dtype) {
}
}
-static inline size_t GetMemDescSize(const mkldnn::memory::desc &md) {
- if (md.data.ndims == 0) return 0;
+static inline size_t GetMemDescSize(const mkldnn::memory::desc& md) {
+ if (md.data.ndims == 0)
+ return 0;
size_t ret = 1;
for (int i = 0; i < md.data.ndims; i++) {
@@ -297,19 +296,21 @@ static inline size_t GetMemDescSize(const mkldnn::memory::desc &md) {
return ret;
}
-inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int dtype = -1) {
+inline static mkldnn::memory::desc GetMemDesc(const NDArray& arr, int dtype = -1) {
int ndim = arr.shape().ndim();
mkldnn::memory::dims dims(ndim);
dtype = (dtype == -1) ? arr.dtype() : dtype;
- for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i];
+ for (size_t i = 0; i < dims.size(); i++)
+ dims[i] = arr.shape()[i];
return mkldnn::memory::desc{dims, get_mkldnn_type(dtype), mkldnn::memory::format_tag::any};
}
-inline static mkldnn::memory::desc GetFCWeightDesc(const NDArray &arr, int dtype = -1) {
+inline static mkldnn::memory::desc GetFCWeightDesc(const NDArray& arr, int dtype = -1) {
int ndim = arr.shape().ndim();
mkldnn::memory::dims dims(ndim);
dtype = (dtype == -1) ? arr.dtype() : dtype;
- for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i];
+ for (size_t i = 0; i < dims.size(); i++)
+ dims[i] = arr.shape()[i];
auto format = mkldnn::memory::format_tag::any;
// for batch 256 alexnet benchmark test
if (dims.size() == 2) {
@@ -319,7 +320,7 @@ inline static mkldnn::memory::desc GetFCWeightDesc(const NDArray &arr, int dtype
return mkldnn::memory::desc{dims, get_mkldnn_type(dtype), format};
}
-inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
+inline static mkldnn::memory::desc GetWeightDesc(const NDArray& arr,
int num_groups,
bool quantized = false) {
int dtype = quantized ? mshadow::kInt8 : arr.dtype();
@@ -340,25 +341,29 @@ inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
switch (ndim) {
case 3:
tz = mkldnn::memory::dims{
- num_groups, arr.shape()[N] / num_groups,
- arr.shape()[C], arr.shape()[H]};
+ num_groups, arr.shape()[N] / num_groups, arr.shape()[C], arr.shape()[H]};
break;
case 4:
- tz = mkldnn::memory::dims{
- num_groups, arr.shape()[N] / num_groups,
- arr.shape()[C], arr.shape()[H], arr.shape()[W]};
+ tz = mkldnn::memory::dims{num_groups,
+ arr.shape()[N] / num_groups,
+ arr.shape()[C],
+ arr.shape()[H],
+ arr.shape()[W]};
break;
case 5:
- tz = mkldnn::memory::dims{
- num_groups, arr.shape()[N] / num_groups,
- arr.shape()[C], arr.shape()[D], arr.shape()[H], arr.shape()[W]};
+ tz = mkldnn::memory::dims{num_groups,
+ arr.shape()[N] / num_groups,
+ arr.shape()[C],
+ arr.shape()[D],
+ arr.shape()[H],
+ arr.shape()[W]};
}
return mkldnn::memory::desc{tz, get_mkldnn_type(dtype), mkldnn::memory::format_tag::any};
}
}
-inline static bool CheckMKLDNNInputArrayIsView(const std::vector<NDArray> &inputs) {
- for (const auto &in : inputs) {
+inline static bool CheckMKLDNNInputArrayIsView(const std::vector<NDArray>& inputs) {
+ for (const auto& in : inputs) {
if (in.IsView() && in.IsMKLDNNData()) {
return true;
}
@@ -381,7 +386,7 @@ typedef std::shared_ptr<const mkldnn::memory> mkldnn_mem_const_ptr;
*/
class TmpMemMgr {
// This points to the memory buffer where we can allocate temp memory.
- char *curr_mem;
+ char* curr_mem;
// The total size of the temp memory.
size_t mem_size;
// This contains the current available memory size.
@@ -391,7 +396,7 @@ class TmpMemMgr {
const size_t alignment = kMKLDNNAlign;
public:
- static TmpMemMgr *Get() {
+ static TmpMemMgr* Get() {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local TmpMemMgr mgr;
#else
@@ -407,44 +412,43 @@ class TmpMemMgr {
}
void Reset() {
- curr_mem = nullptr;
+ curr_mem = nullptr;
curr_size = 0;
// We don't reset est_size and mem_size because est_size contains the
// estimated temp memory size from the last run and mem_size contains the
    // memory size allocated in the last run.
}
- void Init(const Resource &r) {
+ void Init(const Resource& r) {
    // If we estimated last time that we need more memory, we should use the
    // larger memory size.
mem_size = std::max(mem_size, est_size);
if (mem_size > 0) {
- // Let's allocate some extra memory. If we don't use some of them all the time,
- // the OS won't physically allocate pages for them any way.
+ // Let's allocate some extra memory. If we don't use some of them all the
+ // time, the OS won't physically allocate pages for them any way.
this->curr_size = mem_size * 2;
- this->curr_mem = static_cast<char *>(r.get_host_space_internal(this->curr_size));
+ this->curr_mem = static_cast<char*>(r.get_host_space_internal(this->curr_size));
}
// reset est_size, so we can start to estimate the temp memory size.
this->est_size = 0;
}
- mkldnn::memory *Alloc(const mkldnn::memory::desc &md);
+ mkldnn::memory* Alloc(const mkldnn::memory::desc& md);
};
typedef std::unordered_map<int, mkldnn::memory> mkldnn_args_map_t;
class MKLDNNStream {
- std::vector<std::pair<mkldnn::primitive, mkldnn_args_map_t> > net_prim_args;
+ std::vector<std::pair<mkldnn::primitive, mkldnn_args_map_t>> net_prim_args;
// Here we hold all memory related to the operators in the stream.
- std::vector<std::shared_ptr<const mkldnn::memory> > mem_holder;
+ std::vector<std::shared_ptr<const mkldnn::memory>> mem_holder;
mkldnn::stream s;
public:
- static MKLDNNStream *Get();
+ static MKLDNNStream* Get();
- MKLDNNStream(): s(CpuEngine::Get()->get_engine()) {}
+ MKLDNNStream() : s(CpuEngine::Get()->get_engine()) {}
- void RegisterPrimArgs(const mkldnn::primitive &prim,
- const mkldnn_args_map_t &args) {
+ void RegisterPrimArgs(const mkldnn::primitive& prim, const mkldnn_args_map_t& args) {
net_prim_args.emplace_back(prim, args);
}
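Note: the stream collects (primitive, args) pairs via RegisterPrimArgs and executes them all in Submit() below; memory objects registered through RegisterMem are kept alive until the stream is submitted. A minimal sketch of the register-then-submit flow (hypothetical function, not in this patch):

    // Hypothetical flow: queue one reorder on the per-thread stream and then
    // execute everything that has been registered so far.
    void CopyThroughStream(const mkldnn::memory& src, const mkldnn::memory& dst) {
      mkldnn_args_map_t args = {{MKLDNN_ARG_FROM, src}, {MKLDNN_ARG_TO, dst}};
      MKLDNNStream* stream = MKLDNNStream::Get();
      stream->RegisterPrimArgs(mkldnn::reorder(src, dst), args);
      stream->Submit();
    }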
@@ -463,7 +467,7 @@ class MKLDNNStream {
*/
void Submit(bool cleanup = true) {
if (!net_prim_args.empty()) {
- for (auto &v : net_prim_args) {
+ for (auto& v : net_prim_args) {
v.first.execute(s, v.second);
}
net_prim_args.clear();
@@ -484,22 +488,22 @@ enum OutDataOp {
AddBack,
};
-typedef std::pair<OutDataOp, mkldnn::memory *> mkldnn_output_t;
-void MKLDNNMemoryCopy(const mkldnn::memory &mem, const mkldnn::memory* this_mem);
+typedef std::pair<OutDataOp, mkldnn::memory*> mkldnn_output_t;
+void MKLDNNMemoryCopy(const mkldnn::memory& mem, const mkldnn::memory* this_mem);
/*
* Here we want to get MKLDNN memory whose desc is exactly the same as
* the given one. operator== can't guarantee that. == can return true even if
* the formats are different. I need to double check its format.
*/
-static inline mkldnn::memory *GetMKLDNNExact(
- const mkldnn::memory *mem, const mkldnn::memory::desc &desc) {
+static inline mkldnn::memory* GetMKLDNNExact(const mkldnn::memory* mem,
+ const mkldnn::memory::desc& desc) {
mkldnn::memory::desc src_desc = mem->get_desc();
if (desc == src_desc) {
- return const_cast<mkldnn::memory *>(mem);
+ return const_cast<mkldnn::memory*>(mem);
} else {
- std::shared_ptr<mkldnn::memory> ret(new mkldnn::memory(
- desc, CpuEngine::Get()->get_engine(), mem->get_data_handle()));
+ std::shared_ptr<mkldnn::memory> ret(
+ new mkldnn::memory(desc, CpuEngine::Get()->get_engine(), mem->get_data_handle()));
MKLDNNStream::Get()->RegisterMem(ret);
return ret.get();
}
@@ -516,27 +520,29 @@ static inline mkldnn::memory *GetMKLDNNExact(
* If these two functions are used, we have to call CommitOutput to write
* the output back to the output NDArray.
*/
-mkldnn_output_t CreateMKLDNNMem(const NDArray &out_arr,
- const mkldnn::memory::desc &desc,
- OpReqType req, const NDArray* in_arr = nullptr);
-mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &out_arr,
- const mkldnn::memory::desc &desc,
+mkldnn_output_t CreateMKLDNNMem(const NDArray& out_arr,
+ const mkldnn::memory::desc& desc,
+ OpReqType req,
+ const NDArray* in_arr = nullptr);
+mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray& out_arr,
+ const mkldnn::memory::desc& desc,
OpReqType req);
/* This function has to be used with one of the functions above. */
-void CommitOutput(const NDArray &arr, const mkldnn_output_t &res);
+void CommitOutput(const NDArray& arr, const mkldnn_output_t& res);
-static inline void InvalidateOutputs(const std::vector<NDArray> &arrs,
- const std::vector<OpReqType> &reqs) {
+static inline void InvalidateOutputs(const std::vector<NDArray>& arrs,
+ const std::vector<OpReqType>& reqs) {
for (size_t i = 0; i < arrs.size(); i++) {
if (reqs[i] == kWriteTo || reqs[i] == kNullOp) {
- const_cast<NDArray &>(arrs[i]).InvalidateMKLDNNData();
+ const_cast<NDArray&>(arrs[i]).InvalidateMKLDNNData();
}
}
}
-// TODO(alexzai): (MXNET-856) Remove helper function after subgraph feature added
-static inline void CreateDefaultInputs(const std::vector<NDArray> &arrs,
- std::vector<NDArray> *out_arrs) {
+// TODO(alexzai): (MXNET-856) Remove helper function after subgraph feature
+// added
+static inline void CreateDefaultInputs(const std::vector<NDArray>& arrs,
+ std::vector<NDArray>* out_arrs) {
out_arrs->clear();
for (size_t i = 0; i < arrs.size(); ++i) {
if (arrs[i].IsMKLDNNData())
@@ -546,20 +552,20 @@ static inline void CreateDefaultInputs(const std::vector<NDArray> &arrs,
}
}
-const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups);
+const mkldnn::memory* GetWeights(const NDArray& arr, int num_groups);
-const mkldnn::memory *GetWeights(const NDArray &arr,
- const mkldnn::memory::desc &target_md,
+const mkldnn::memory* GetWeights(const NDArray& arr,
+ const mkldnn::memory::desc& target_md,
int num_groups);
-bool IsDefaultFormat(const mkldnn::memory::desc &desc);
-bool IsMKLDNN(const mkldnn::memory::desc &desc);
+bool IsDefaultFormat(const mkldnn::memory::desc& desc);
+bool IsMKLDNN(const mkldnn::memory::desc& desc);
-mkldnn_format_tag_t GetDefaultFormat(const mkldnn::memory::desc &md);
+mkldnn_format_tag_t GetDefaultFormat(const mkldnn::memory::desc& md);
mkldnn_format_tag_t GetDefaultFormat(int num_dims);
-mkldnn::memory::desc GetDesc(const mkldnn::memory::desc &md, const mkldnn_format_tag_t &format);
+mkldnn::memory::desc GetDesc(const mkldnn::memory::desc& md, const mkldnn_format_tag_t& format);
-inline bool same_shape(const mxnet::TShape &shape, const mkldnn_dims_t dims, int ndims) {
+inline bool same_shape(const mxnet::TShape& shape, const mkldnn_dims_t dims, int ndims) {
if (shape.ndim() != ndims)
return false;
for (int i = 0; i < ndims; i++)
@@ -568,8 +574,7 @@ inline bool same_shape(const mxnet::TShape &shape, const mkldnn_dims_t dims, int
return true;
}
-inline bool same_shape(const mkldnn::memory::desc &desc1,
- const mkldnn::memory::desc &desc2) {
+inline bool same_shape(const mkldnn::memory::desc& desc1, const mkldnn::memory::desc& desc2) {
if (desc1.data.ndims != desc2.data.ndims)
return false;
for (int i = 0; i < desc1.data.ndims; i++)
@@ -578,10 +583,9 @@ inline bool same_shape(const mkldnn::memory::desc &desc1,
return true;
}
-inline bool same_shape(const mxnet::TShape &shape, int dtype,
- const mkldnn::memory::desc &desc) {
- return same_shape(shape, desc.data.dims, desc.data.ndims)
- && get_mkldnn_type(dtype) == desc.data.data_type;
+inline bool same_shape(const mxnet::TShape& shape, int dtype, const mkldnn::memory::desc& desc) {
+ return same_shape(shape, desc.data.dims, desc.data.ndims) &&
+ get_mkldnn_type(dtype) == desc.data.data_type;
}
/*
@@ -592,25 +596,24 @@ inline bool same_shape(const mxnet::TShape &shape, int dtype,
class MKLDNNMemory {
std::shared_ptr<mkldnn::memory> mem;
mkldnn::memory::desc desc;
- size_t size; // The number of bytes.
+ size_t size; // The number of bytes.
public:
- MKLDNNMemory(mkldnn::memory::desc md, void *addr): desc(md) {
+ MKLDNNMemory(mkldnn::memory::desc md, void* addr) : desc(md) {
mem.reset(new mkldnn::memory(md, CpuEngine::Get()->get_engine(), addr));
size = desc.get_size();
}
- explicit MKLDNNMemory(std::shared_ptr<mkldnn::memory> mem): desc(
- mem->get_desc()) {
+ explicit MKLDNNMemory(std::shared_ptr<mkldnn::memory> mem) : desc(mem->get_desc()) {
this->mem = mem;
- size = desc.get_size();
+ size = desc.get_size();
}
- void SetDataHandle(void *handle) {
+ void SetDataHandle(void* handle) {
mem->set_data_handle(handle);
}
- void *GetDataHandle() const {
+ void* GetDataHandle() const {
return mem->get_data_handle();
}
@@ -618,7 +621,7 @@ class MKLDNNMemory {
return mem;
}
- mkldnn::memory *GetRaw() const {
+ mkldnn::memory* GetRaw() const {
return mem.get();
}
@@ -630,13 +633,15 @@ class MKLDNNMemory {
return mem->get_desc();
}
- mkldnn::memory::desc GetDesc(mkldnn_format_tag_t format,
- mkldnn::memory::data_type data_type = mkldnn::memory::data_type::undef) const {
+ mkldnn::memory::desc GetDesc(
+ mkldnn_format_tag_t format,
+ mkldnn::memory::data_type data_type = mkldnn::memory::data_type::undef) const {
mkldnn::memory::dims dims(desc.data.dims, desc.data.dims + desc.data.ndims);
- mkldnn::memory::data_type cpp_type = (data_type == mkldnn::memory::data_type::undef)
- ? static_cast<mkldnn::memory::data_type>(desc.data.data_type) : data_type;
- mkldnn::memory::desc data_md(dims, cpp_type,
- static_cast<mkldnn::memory::format_tag>(format));
+ mkldnn::memory::data_type cpp_type =
+ (data_type == mkldnn::memory::data_type::undef)
+ ? static_cast<mkldnn::memory::data_type>(desc.data.data_type)
+ : data_type;
+ mkldnn::memory::desc data_md(dims, cpp_type, static_cast<mkldnn::memory::format_tag>(format));
return data_md;
}
@@ -652,25 +657,26 @@ class MKLDNNMemory {
return mem->get_desc() == md;
}
- bool SameFormat(const mxnet::TShape &shape, int dtype) const {
+ bool SameFormat(const mxnet::TShape& shape, int dtype) const {
return same_shape(shape, dtype, desc);
}
- void ReorderTo(mkldnn::memory *other) const {
+ void ReorderTo(mkldnn::memory* other) const {
mkldnn::stream s(CpuEngine::Get()->get_engine());
mkldnn::reorder(*mem, *other).execute(s, *mem, *other);
}
};
// reorder mkldnn src to dst format dtype
-void ReorderTo(const mkldnn::memory *src, const mkldnn::memory *dst);
+void ReorderTo(const mkldnn::memory* src, const mkldnn::memory* dst);
template <typename Compute, typename AttrState>
-void FallBackCompute(Compute fn, const AttrState &attrs,
- const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs);
+void FallBackCompute(Compute fn,
+ const AttrState& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs);
/*
* This class is used to check the correctness of MKLDNN operators.
@@ -683,66 +689,69 @@ class OpCheck {
public:
OpCheck(bool backward, size_t num_checks) {
- this->backward = backward;
+ this->backward = backward;
this->num_checks = num_checks;
}
- void Init(const std::vector<mxnet::NDArray> &inputs_,
- const std::vector<mxnet::NDArray> &outputs_);
+ void Init(const std::vector<mxnet::NDArray>& inputs_,
+ const std::vector<mxnet::NDArray>& outputs_);
- void Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const std::vector<mxnet::NDArray> &inputs_,
- const std::vector<mxnet::OpReqType> &req,
- const std::vector<mxnet::NDArray> &outputs_);
+ void Run(mxnet::FCompute fn,
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const std::vector<mxnet::NDArray>& inputs_,
+ const std::vector<mxnet::OpReqType>& req,
+ const std::vector<mxnet::NDArray>& outputs_);
- void CopyResult(const std::vector<mxnet::NDArray> &outputs_,
- const std::vector<size_t>& indice);
+ void CopyResult(const std::vector<mxnet::NDArray>& outputs_, const std::vector<size_t>& indice);
};
-bool MKLDNNStorageType(const nnvm::NodeAttrs &attrs,
+bool MKLDNNStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
bool support_mkldnn,
- DispatchMode *dispatch_mode,
- std::vector<int> *in_attrs,
- std::vector<int> *out_attrs);
-
-#define MKLDNN_OPCHECK_INIT(backward, num_checks, inputs, outputs) \
- static bool debug = dmlc::GetEnv("MXNET_MKLDNN_DEBUG", false); \
- OpCheck check(backward, num_checks); \
- if (debug) check.Init(inputs, outputs);
-
-#define MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs) \
- if (debug) check.Run(fn, attrs, ctx, inputs, req, outputs);
+ DispatchMode* dispatch_mode,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs);
+
+#define MKLDNN_OPCHECK_INIT(backward, num_checks, inputs, outputs) \
+ static bool debug = dmlc::GetEnv("MXNET_MKLDNN_DEBUG", false); \
+ OpCheck check(backward, num_checks); \
+ if (debug) \
+ check.Init(inputs, outputs);
+
+#define MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs) \
+ if (debug) \
+ check.Run(fn, attrs, ctx, inputs, req, outputs);
#define MKLDNN_OPCHECK_COPY_RESULT(outputs, indice) \
- if (debug) check.CopyResult(outputs, indice);
+ if (debug) \
+ check.CopyResult(outputs, indice);
struct MKLDNNPostEltwiseParam {
mkldnn::algorithm alg = mkldnn::algorithm::undef;
- float scale = 1.f;
- float alpha = 0.f;
- float beta = 1.f;
+ float scale = 1.f;
+ float alpha = 0.f;
+ float beta = 1.f;
};
void MKLDNNRun(mxnet::FComputeEx fn,
- const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const std::vector<mxnet::NDArray> &inputs_,
- const std::vector<mxnet::OpReqType> &req,
- const std::vector<mxnet::NDArray> &outputs_);
-
-using FComputeExUnary = std::function<void (const nnvm::NodeAttrs& attrs,
- const OpContext& ctx,
- const NDArray& input,
- const OpReqType& req,
- const NDArray& output)>;
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const std::vector<mxnet::NDArray>& inputs_,
+ const std::vector<mxnet::OpReqType>& req,
+ const std::vector<mxnet::NDArray>& outputs_);
+
+using FComputeExUnary = std::function<void(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const NDArray& input,
+ const OpReqType& req,
+ const NDArray& output)>;
void MKLDNNRun(FComputeExUnary fn,
- const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const mxnet::NDArray &inputs_,
- const mxnet::OpReqType &req,
- const mxnet::NDArray &outputs_);
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const mxnet::NDArray& inputs_,
+ const mxnet::OpReqType& req,
+ const mxnet::NDArray& outputs_);
} // namespace mxnet
#endif
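Note: most MKLDNN operators in this tree combine the helpers declared above in one pattern: initialize TmpMemMgr from the op's temp-space resource, pick an output memory with CreateMKLDNNMem (honoring req: write-to, in-place, or add-to), register the primitive on the stream, then CommitOutput and Submit. A condensed, hypothetical FComputeEx body, illustrative only and not part of this patch:

    // Hypothetical copy-like operator showing the usual helper flow.
    void SketchComputeEx(const nnvm::NodeAttrs& attrs,
                         const OpContext& ctx,
                         const std::vector<NDArray>& inputs,
                         const std::vector<OpReqType>& req,
                         const std::vector<NDArray>& outputs) {
      TmpMemMgr::Get()->Init(ctx.requested[0]);  // scratch space for reorders
      const mkldnn::memory* in_mem = inputs[0].GetMKLDNNData();
      // Destination is the output NDArray itself, an in-place alias, or a
      // temporary buffer; CommitOutput copies or accumulates back as required.
      mkldnn_output_t out = CreateMKLDNNMem(outputs[0], in_mem->get_desc(), req[0], &inputs[0]);
      mkldnn_args_map_t args = {{MKLDNN_ARG_FROM, *in_mem}, {MKLDNN_ARG_TO, *out.second}};
      MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(*in_mem, *out.second), args);
      CommitOutput(outputs[0], out);
      MKLDNNStream::Get()->Submit();
    }

When MXNET_MKLDNN_DEBUG is set, the MKLDNN_OPCHECK_* macros above additionally replay the reference FCompute implementation on copies of the inputs and compare the results against the MKLDNN outputs.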
diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc
index 0cea4ef..5a65c94 100644
--- a/src/operator/nn/mkldnn/mkldnn_base.cc
+++ b/src/operator/nn/mkldnn/mkldnn_base.cc
@@ -20,14 +20,15 @@
#if MXNET_USE_MKLDNN == 1
#include <atomic>
-#include "./mkldnn_base-inl.h"
-#include "./mkldnn_ops-inl.h"
+
#include "../../../common/exec_utils.h"
#include "../../operator_common.h"
+#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
namespace mxnet {
-MKLDNNStream *MKLDNNStream::Get() {
+MKLDNNStream* MKLDNNStream::Get() {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local MKLDNNStream stream;
#else
@@ -36,7 +37,7 @@ MKLDNNStream *MKLDNNStream::Get() {
return &stream;
}
-void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space) {
+void* AlignMem(void* mem, size_t size, size_t alignment, size_t* space) {
if (size > *space)
return nullptr;
intptr_t addr = reinterpret_cast<intptr_t>(mem);
@@ -51,13 +52,13 @@ void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space) {
addr += padding;
*space -= padding;
CHECK_EQ(addr % alignment, 0);
- return reinterpret_cast<void *>(addr);
+ return reinterpret_cast<void*>(addr);
}
-mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::desc &md) {
+mkldnn::memory* TmpMemMgr::Alloc(const mkldnn::memory::desc& md) {
// We need to include the size of the memory used for alignment.
this->est_size += md.get_size() + alignment;
- void *mem = AlignMem(this->curr_mem, md.get_size(), alignment, &this->curr_size);
+ void* mem = AlignMem(this->curr_mem, md.get_size(), alignment, &this->curr_size);
if (mem) {
// The memory is allocated from the temporary memory space in the
// operator. It'll only become invalid after we exit from the operator.
@@ -65,19 +66,20 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::desc &md) {
MKLDNNStream::Get()->RegisterMem(ret);
CHECK_EQ(mem, mem);
this->curr_size -= md.get_size();
- this->curr_mem = static_cast<char *>(mem) + md.get_size();
+ this->curr_mem = static_cast<char*>(mem) + md.get_size();
return ret.get();
} else {
- // If curr_mem has been initialized and we still reach here, it means the current
- // allocated memory isn't enough. But it doesn't matter for multiple invokes of a
- // operator, as the TmpMemMgr could estimate the space at the first iteration and
- // then re-requests abundant space from MXNet resource. MKL-DNN could allocate
- // the space by itself. Thus, we just let it continue for estimating the maximum
- // required space size. It will be allocated at next call.
+ // If curr_mem has been initialized and we still reach here, it means the
+ // current allocated memory isn't enough. But it doesn't matter for multiple
+    // invokes of an operator, as the TmpMemMgr could estimate the space at the
+ // first iteration and then re-requests abundant space from MXNet resource.
+ // MKL-DNN could allocate the space by itself. Thus, we just let it continue
+ // for estimating the maximum required space size. It will be allocated at
+ // next call.
if (this->curr_mem && dmlc::GetEnv("MXNET_MKLDNN_DEBUG", false)) {
LOG(WARNING) << "mkl-dnn debug message: The rest of the temporary space is not "
- << "adequate for allocating " << md.get_size() << " bytes. Thus, mkl-dnn "
- << "allocate the space by itself.";
+ << "adequate for allocating " << md.get_size() << " bytes. Thus, mkl-dnn "
+ << "allocate the space by itself.";
}
mkldnn_mem_ptr ret(new mkldnn::memory(md, CpuEngine::Get()->get_engine()));
MKLDNNStream::Get()->RegisterMem(ret);
@@ -85,97 +87,93 @@ mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::desc &md) {
}
}
-void MKLDNNMemoryCopy(const mkldnn::memory &mem, const mkldnn::memory* this_mem) {
- MKLDNNStream *stream = MKLDNNStream::Get();
- mkldnn::memory::desc from_desc = mem.get_desc();
- mkldnn::memory::desc this_desc = this_mem->get_desc();
+void MKLDNNMemoryCopy(const mkldnn::memory& mem, const mkldnn::memory* this_mem) {
+ MKLDNNStream* stream = MKLDNNStream::Get();
+ mkldnn::memory::desc from_desc = mem.get_desc();
+ mkldnn::memory::desc this_desc = this_mem->get_desc();
mkldnn_format_tag_t from_def_format = GetDefaultFormat(from_desc);
mkldnn_format_tag_t this_def_format = GetDefaultFormat(this_desc);
if (!same_shape(this_desc, from_desc) && IsDefaultFormat(from_desc)) {
// In this case, we can simply create a new MKLDNN memory for the required
// shape.
- mkldnn::memory::dims dims(this_desc.data.dims,
- this_desc.data.dims + this_desc.data.ndims);
+ mkldnn::memory::dims dims(this_desc.data.dims, this_desc.data.dims + this_desc.data.ndims);
auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
- mkldnn::memory::desc data_md(dims, this_dtype,
- static_cast<mkldnn::memory::format_tag>(this_def_format));
+ mkldnn::memory::desc data_md(
+ dims, this_dtype, static_cast<mkldnn::memory::format_tag>(this_def_format));
mkldnn_mem_ptr tmp_mem(new mkldnn::memory(data_md, mem.get_engine(), mem.get_data_handle()));
stream->RegisterMem(tmp_mem);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *tmp_mem},
- {MKLDNN_ARG_TO, *this_mem}});
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, *tmp_mem}, {MKLDNN_ARG_TO, *this_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(*tmp_mem, *this_mem), args);
} else if (!same_shape(this_desc, from_desc)) {
// In this case, the source memory stores data in a customized layout. We
// need to reorganize the data in memory before we can reshape.
mkldnn::memory::desc def_desc = GetDesc(from_desc, from_def_format);
- mkldnn::memory *def_mem = TmpMemMgr::Get()->Alloc(def_desc);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, mem},
- {MKLDNN_ARG_TO, *def_mem}});
+ mkldnn::memory* def_mem = TmpMemMgr::Get()->Alloc(def_desc);
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *def_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(mem, *def_mem), args);
// Now we can reshape it
- mkldnn_mem_ptr tmp_mem(new mkldnn::memory(this_desc,
- mem.get_engine(), def_mem->get_data_handle()));
+ mkldnn_mem_ptr tmp_mem(
+ new mkldnn::memory(this_desc, mem.get_engine(), def_mem->get_data_handle()));
stream->RegisterMem(tmp_mem);
args = {{MKLDNN_ARG_FROM, *tmp_mem}, {MKLDNN_ARG_TO, *this_mem}};
stream->RegisterPrimArgs(mkldnn::reorder(*tmp_mem, *this_mem), args);
-} else if (this_desc == from_desc) {
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, mem},
- {MKLDNN_ARG_TO, *this_mem}});
+ } else if (this_desc == from_desc) {
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *this_mem}});
// If the layout is the same, we can just copy data.
stream->RegisterPrimArgs(mkldnn::reorder(mem, *this_mem), args);
-} else {
+ } else {
// If both are not using the default layouts. There isn't much we can do,
// other than reorder data layout directly.
if (!IsDefaultFormat(this_desc) && !IsDefaultFormat(from_desc)) {
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, mem},
- {MKLDNN_ARG_TO, *this_mem}});
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *this_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(mem, *this_mem), args);
} else if (IsDefaultFormat(this_desc)) {
// If the dest mem uses the default memory layout, we can simply use
// the default format of the source memory to improve perf of reorder.
mkldnn::memory::desc desc = GetDesc(from_desc, from_def_format);
- mkldnn_mem_ptr tmp_mem(new mkldnn::memory(desc,
- mem.get_engine(), this_mem->get_data_handle()));
+ mkldnn_mem_ptr tmp_mem(
+ new mkldnn::memory(desc, mem.get_engine(), this_mem->get_data_handle()));
stream->RegisterMem(tmp_mem);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, mem},
- {MKLDNN_ARG_TO, *tmp_mem}});
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, mem}, {MKLDNN_ARG_TO, *tmp_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(mem, *tmp_mem), args);
} else {
// If the src mem uses the default memory layout, we can use
// the default format of the source memory to improve perf.
mkldnn::memory::desc desc = GetDesc(this_desc, this_def_format);
- mkldnn_mem_ptr tmp_mem(new mkldnn::memory(desc,
- this_mem->get_engine(), mem.get_data_handle()));
+ mkldnn_mem_ptr tmp_mem(
+ new mkldnn::memory(desc, this_mem->get_engine(), mem.get_data_handle()));
stream->RegisterMem(tmp_mem);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *tmp_mem},
- {MKLDNN_ARG_TO, *this_mem}});
+ std::unordered_map<int, mkldnn::memory> args(
+ {{MKLDNN_ARG_FROM, *tmp_mem}, {MKLDNN_ARG_TO, *this_mem}});
stream->RegisterPrimArgs(mkldnn::reorder(*tmp_mem, *this_mem), args);
}
}
}
-bool CanWriteTo(const NDArray &out_arr,
- const NDArray &in_arr,
- const mkldnn::memory::desc &desc) {
- auto in_mem = in_arr.GetMKLDNNData();
- bool add_same = in_mem->get_data_handle() == out_arr.GetMKLDNNData()->get_data_handle();
- bool pdesc_same = out_arr.GetMKLDNNData()->get_desc() == desc &&
- in_mem->get_desc() == desc;
+bool CanWriteTo(const NDArray& out_arr, const NDArray& in_arr, const mkldnn::memory::desc& desc) {
+ auto in_mem = in_arr.GetMKLDNNData();
+ bool add_same = in_mem->get_data_handle() == out_arr.GetMKLDNNData()->get_data_handle();
+ bool pdesc_same = out_arr.GetMKLDNNData()->get_desc() == desc && in_mem->get_desc() == desc;
return add_same && pdesc_same;
}
-mkldnn_output_t CreateMKLDNNMem(const NDArray &out_arr,
- const mkldnn::memory::desc &desc,
+mkldnn_output_t CreateMKLDNNMem(const NDArray& out_arr,
+ const mkldnn::memory::desc& desc,
OpReqType req,
const NDArray* in_arr) {
if (kAddTo == req) {
auto tmp = TmpMemMgr::Get()->Alloc(desc);
return mkldnn_output_t(OutDataOp::AddBack, tmp);
} else if (kWriteInplace == req && in_arr != nullptr && CanWriteTo(out_arr, *in_arr, desc)) {
- mkldnn::memory *mem = const_cast<NDArray &>(out_arr).CreateMKLDNNData(desc);
+ mkldnn::memory* mem = const_cast<NDArray&>(out_arr).CreateMKLDNNData(desc);
// mem is nullptr if out_arr is view and desc is MKLDNN format.
// need to Reorder2Default before calling CreateMKLDNNMem
CHECK(mem != nullptr);
@@ -184,7 +182,7 @@ mkldnn_output_t CreateMKLDNNMem(const NDArray &out_arr,
auto tmp = TmpMemMgr::Get()->Alloc(desc);
return mkldnn_output_t(OutDataOp::CopyBack, tmp);
} else if (kWriteTo == req) {
- mkldnn::memory *mem = const_cast<NDArray &>(out_arr).CreateMKLDNNData(desc);
+ mkldnn::memory* mem = const_cast<NDArray&>(out_arr).CreateMKLDNNData(desc);
if (nullptr == mem) {
auto tmp = TmpMemMgr::Get()->Alloc(desc);
return mkldnn_output_t(OutDataOp::CopyBack, tmp);
@@ -195,8 +193,8 @@ mkldnn_output_t CreateMKLDNNMem(const NDArray &out_arr,
return mkldnn_output_t(OutDataOp::Noop, tmp);
}
-mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &out_arr,
- const mkldnn::memory::desc &desc,
+mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray& out_arr,
+ const mkldnn::memory::desc& desc,
OpReqType req) {
if (kAddTo == req) {
auto tmp = TmpMemMgr::Get()->Alloc(desc);
@@ -205,9 +203,9 @@ mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &out_arr,
auto tmp = TmpMemMgr::Get()->Alloc(desc);
return mkldnn_output_t(OutDataOp::CopyBack, tmp);
} else {
- mkldnn::memory *mem = nullptr;
+ mkldnn::memory* mem = nullptr;
if (IsDefaultFormat(desc)) {
- mem = const_cast<NDArray &>(out_arr).CreateMKLDNNData(desc);
+ mem = const_cast<NDArray&>(out_arr).CreateMKLDNNData(desc);
}
if (mem == nullptr) {
auto tmp = TmpMemMgr::Get()->Alloc(desc);
@@ -218,29 +216,29 @@ mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &out_arr,
}
}
-void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) {
+void CommitOutput(const NDArray& arr, const mkldnn_output_t& res) {
if (res.first == CopyBack) {
- const_cast<NDArray &>(arr).CopyFrom(*res.second);
+ const_cast<NDArray&>(arr).CopyFrom(*res.second);
} else if (res.first == AddBack) {
auto res_memory = res.second;
- auto target_pd = arr.GetMKLDNNData()->get_desc();
- auto mem = arr.GetMKLDNNData(res.second->get_desc());
+ auto target_pd = arr.GetMKLDNNData()->get_desc();
+ auto mem = arr.GetMKLDNNData(res.second->get_desc());
if (mem == nullptr) {
auto tmp_memory = TmpMemMgr::Get()->Alloc(target_pd);
MKLDNNMemoryCopy(*res_memory, tmp_memory);
res_memory = tmp_memory;
- mem = arr.GetMKLDNNData();
+ mem = arr.GetMKLDNNData();
}
op::MKLDNNSum(*mem, *res_memory, *mem);
}
}
-const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups) {
+const mkldnn::memory* GetWeights(const NDArray& arr, int num_groups) {
const auto type = get_mkldnn_type(arr.dtype());
- auto tz = mkldnn::memory::dims{0};
+ auto tz = mkldnn::memory::dims{0};
auto format_tag = mkldnn::memory::format_tag::undef;
- auto engine = CpuEngine::Get()->get_engine();
- const int ndim = arr.shape().ndim();
+ auto engine = CpuEngine::Get()->get_engine();
+ const int ndim = arr.shape().ndim();
int O = 0, I = 1, H = 2, W = 3;
int D = -1;
if (ndim == 5) {
@@ -249,35 +247,38 @@ const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups) {
W = 4;
}
if (ndim == 2) {
- tz = mkldnn::memory::dims{arr.shape()[O], arr.shape()[I]};
+ tz = mkldnn::memory::dims{arr.shape()[O], arr.shape()[I]};
format_tag = mkldnn::memory::format_tag::oi;
} else if (ndim == 3) {
- tz = num_groups > 1
- ? mkldnn::memory::dims{num_groups, arr.shape()[O] / num_groups,
- arr.shape()[I], arr.shape()[H]}
- : mkldnn::memory::dims{arr.shape()[O],
- arr.shape()[I], arr.shape()[H]};
- format_tag = num_groups > 1 ? mkldnn::memory::format_tag::goiw
- : mkldnn::memory::format_tag::oiw;
+ tz = num_groups > 1 ? mkldnn::memory::dims{num_groups,
+ arr.shape()[O] / num_groups,
+ arr.shape()[I],
+ arr.shape()[H]}
+ : mkldnn::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H]};
+ format_tag =
+ num_groups > 1 ? mkldnn::memory::format_tag::goiw : mkldnn::memory::format_tag::oiw;
} else if (ndim == 4) {
tz = num_groups > 1
- ? mkldnn::memory::dims{num_groups, arr.shape()[O] / num_groups,
- arr.shape()[I], arr.shape()[H],
+ ? mkldnn::memory::dims{num_groups,
+ arr.shape()[O] / num_groups,
+ arr.shape()[I],
+ arr.shape()[H],
arr.shape()[W]}
- : mkldnn::memory::dims{
- arr.shape()[O], arr.shape()[I], arr.shape()[H], arr.shape()[W]};
- format_tag = num_groups > 1 ? mkldnn::memory::format_tag::goihw
- : mkldnn::memory::format_tag::oihw;
+ : mkldnn::memory::dims{arr.shape()[O], arr.shape()[I], arr.shape()[H], arr.shape()[W]};
+ format_tag =
+ num_groups > 1 ? mkldnn::memory::format_tag::goihw : mkldnn::memory::format_tag::oihw;
} else if (ndim == 5) {
tz = num_groups > 1
- ? mkldnn::memory::dims{num_groups, arr.shape()[O] / num_groups,
- arr.shape()[I], arr.shape()[D],
- arr.shape()[H], arr.shape()[W]}
+ ? mkldnn::memory::dims{num_groups,
+ arr.shape()[O] / num_groups,
+ arr.shape()[I],
+ arr.shape()[D],
+ arr.shape()[H],
+ arr.shape()[W]}
: mkldnn::memory::dims{
- arr.shape()[O], arr.shape()[I], arr.shape()[D],
- arr.shape()[H], arr.shape()[W]};
- format_tag = num_groups > 1 ? mkldnn::memory::format_tag::goidhw
- : mkldnn::memory::format_tag::oidhw;
+ arr.shape()[O], arr.shape()[I], arr.shape()[D], arr.shape()[H], arr.shape()[W]};
+ format_tag =
+ num_groups > 1 ? mkldnn::memory::format_tag::goidhw : mkldnn::memory::format_tag::oidhw;
} else {
LOG(FATAL) << "The weight array has an unsupported number of dimensions";
}
@@ -285,37 +286,41 @@ const mkldnn::memory *GetWeights(const NDArray &arr, int num_groups) {
return arr.GetMKLDNNData(md);
}
-const mkldnn::memory *GetWeights(const NDArray &arr,
- const mkldnn::memory::desc &target_desc, int num_groups) {
- const mkldnn::memory *mem = arr.GetMKLDNNData(target_desc);
- // If the weight array already uses the target layout, simply return it directly.
- if (mem) return mem;
+const mkldnn::memory* GetWeights(const NDArray& arr,
+ const mkldnn::memory::desc& target_desc,
+ int num_groups) {
+ const mkldnn::memory* mem = arr.GetMKLDNNData(target_desc);
+ // If the weight array already uses the target layout, simply return it
+ // directly.
+ if (mem)
+ return mem;
mem = GetWeights(arr, num_groups);
- if (mem == nullptr) mem = arr.GetMKLDNNDataReorder(target_desc);
- if (mem->get_desc() == target_desc) return mem;
+ if (mem == nullptr)
+ mem = arr.GetMKLDNNDataReorder(target_desc);
+ if (mem->get_desc() == target_desc)
+ return mem;
auto ret = TmpMemMgr::Get()->Alloc(target_desc);
- std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *mem},
- {MKLDNN_ARG_TO, *ret}});
+ std::unordered_map<int, mkldnn::memory> args({{MKLDNN_ARG_FROM, *mem}, {MKLDNN_ARG_TO, *ret}});
MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(*mem, *ret), args);
return ret;
}
-
// default: block and dims' stride increase monotonically
-// mkldnn: 1.winograd 2.rnn packed 3. block and dims'stride is not increase monotonically
-bool IsMKLDNN(const mkldnn::memory::desc &desc) {
+// mkldnn: 1.winograd 2.rnn packed 3. block and dims'stride is not increase
+// monotonically
+bool IsMKLDNN(const mkldnn::memory::desc& desc) {
bool rslt = true;
if (desc.data.format_kind == mkldnn_blocked) {
if (desc.data.format_desc.blocking.inner_nblks == 0) {
int i = 0;
- for (i = 0; i < desc.data.ndims-1; i++) {
- if (desc.data.format_desc.blocking.strides[i]
- < desc.data.format_desc.blocking.strides[i + 1]) {
+ for (i = 0; i < desc.data.ndims - 1; i++) {
+ if (desc.data.format_desc.blocking.strides[i] <
+ desc.data.format_desc.blocking.strides[i + 1]) {
break;
}
}
- if (i == desc.data.ndims-1) {
+ if (i == desc.data.ndims - 1) {
rslt = false;
}
}
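Note: per the comment above, a "default" (plain) layout has no inner blocking and its strides never increase from the outermost to the innermost dimension; blocked, winograd and rnn-packed layouts are the ones IsMKLDNN()/IsDefaultFormat() are meant to tell apart. A small hedged illustration of the same stride check over explicit strides:

    // For a plain row-major NCHW tensor of shape (n, c, h, w) the strides are
    // (c*h*w, h*w, w, 1): non-increasing from left to right, so the layout is
    // "default". Blocked layouts (inner_nblks != 0, e.g. nChw16c) are excluded
    // earlier; this stride check additionally rejects permuted plain layouts.
    bool StridesAreDefault(const std::vector<size_t>& strides) {
      for (size_t i = 0; i + 1 < strides.size(); ++i)
        if (strides[i] < strides[i + 1])
          return false;  // an inner dimension strides farther than an outer one
      return true;
    }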
@@ -325,34 +330,40 @@ bool IsMKLDNN(const mkldnn::memory::desc &desc) {
mkldnn_format_tag_t GetDefaultFormat(int num_dims) {
switch (num_dims) {
- case 1: return mkldnn_a;
- case 2: return mkldnn_ab;
- case 3: return mkldnn_abc;
- case 4: return mkldnn_abcd;
- case 5: return mkldnn_abcde;
- case 6: return mkldnn_abcdef;
+ case 1:
+ return mkldnn_a;
+ case 2:
+ return mkldnn_ab;
+ case 3:
+ return mkldnn_abc;
+ case 4:
+ return mkldnn_abcd;
+ case 5:
+ return mkldnn_abcde;
+ case 6:
+ return mkldnn_abcdef;
default:
LOG(FATAL) << "Not implemented dimension (" << num_dims << ") for MKLDNN";
return mkldnn_format_tag_undef;
}
}
-mkldnn_format_tag_t GetDefaultFormat(const mkldnn::memory::desc &desc) {
+mkldnn_format_tag_t GetDefaultFormat(const mkldnn::memory::desc& desc) {
return GetDefaultFormat(desc.data.ndims);
}
-bool IsDefaultFormat(const mkldnn::memory::desc &desc) {
+bool IsDefaultFormat(const mkldnn::memory::desc& desc) {
bool rslt = false;
if (desc.data.format_kind == mkldnn_blocked) {
if (desc.data.format_desc.blocking.inner_nblks == 0) {
int i = 0;
- for (i = 0; i < desc.data.ndims-1; i++) {
- if (desc.data.format_desc.blocking.strides[i]
- < desc.data.format_desc.blocking.strides[i + 1]) {
+ for (i = 0; i < desc.data.ndims - 1; i++) {
+ if (desc.data.format_desc.blocking.strides[i] <
+ desc.data.format_desc.blocking.strides[i + 1]) {
break;
}
}
- if (i == desc.data.ndims-1) {
+ if (i == desc.data.ndims - 1) {
rslt = true;
}
}
@@ -360,20 +371,18 @@ bool IsDefaultFormat(const mkldnn::memory::desc &desc) {
return rslt;
}
-mkldnn::memory::desc GetDesc(const mkldnn::memory::desc &desc,
- const mkldnn_format_tag_t &format) {
+mkldnn::memory::desc GetDesc(const mkldnn::memory::desc& desc, const mkldnn_format_tag_t& format) {
mkldnn::memory::dims dims(desc.data.ndims);
for (size_t i = 0; i < dims.size(); i++)
dims[i] = desc.data.dims[i];
mkldnn::memory::format_tag cpp_format = static_cast<mkldnn::memory::format_tag>(format);
- mkldnn::memory::data_type cpp_type = static_cast<mkldnn::memory::data_type>(
- desc.data.data_type);
+ mkldnn::memory::data_type cpp_type = static_cast<mkldnn::memory::data_type>(desc.data.data_type);
mkldnn::memory::desc data_md(dims, cpp_type, cpp_format);
return mkldnn::memory::desc(dims, cpp_type, cpp_format);
}
// reorder mkldnn src to dst format dtype
-void ReorderTo(const mkldnn::memory *src, const mkldnn::memory *dst) {
+void ReorderTo(const mkldnn::memory* src, const mkldnn::memory* dst) {
mkldnn::stream s(CpuEngine::Get()->get_engine());
auto new_src = *src;
auto new_dst = *dst;
@@ -381,11 +390,12 @@ void ReorderTo(const mkldnn::memory *src, const mkldnn::memory *dst) {
}
template <typename Compute, typename AttrState>
-void FallBackCompute(Compute fn, const AttrState &attrs_states,
- const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+void FallBackCompute(Compute fn,
+ const AttrState& attrs_states,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
std::vector<TBlob> in_blobs(inputs.size());
std::vector<NDArray> in_bufs;
std::vector<OpReqType> new_req = req;
@@ -427,7 +437,7 @@ void FallBackCompute(Compute fn, const AttrState &attrs_states,
// ensure output does not use mkldnn mem.
// for inplace, we already converted & copied input above.
if ((req[i] == kWriteTo) || (req[i] == kWriteInplace)) {
- const_cast<NDArray &>(output).InvalidateMKLDNNData();
+ const_cast<NDArray&>(output).InvalidateMKLDNNData();
if (req[i] == kWriteInplace) {
new_req[i] = kWriteTo;
}
@@ -454,18 +464,20 @@ void FallBackCompute(Compute fn, const AttrState &attrs_states,
}
}
-template<typename DType>
-void print_diff(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2) {
- DType *data1 = reinterpret_cast<DType *>(arr1.data().dptr_);
- DType *data2 = reinterpret_cast<DType *>(arr2.data().dptr_);
+template <typename DType>
+void print_diff(const mxnet::NDArray& arr1, const mxnet::NDArray& arr2) {
+ DType* data1 = reinterpret_cast<DType*>(arr1.data().dptr_);
+ DType* data2 = reinterpret_cast<DType*>(arr2.data().dptr_);
for (size_t i = 0; i < arr1.shape().Size(); i++)
std::cout << data1[i] - data2[i] << ", ";
std::cout << std::endl;
}
-template<typename DType>
-static bool SimilarArray(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2,
- DType rtol, DType atol) {
+template <typename DType>
+static bool SimilarArray(const mxnet::NDArray& arr1,
+ const mxnet::NDArray& arr2,
+ DType rtol,
+ DType atol) {
if (arr1.shape().Size() != arr2.shape().Size())
return false;
@@ -476,21 +488,21 @@ static bool SimilarArray(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2,
// But we shouldn't reorder data in the original array.
NDArray buf1, buf2;
if (arr1.IsMKLDNNData()) {
- buf1 = NDArray(arr1.shape(), arr1.ctx(), false, arr1.dtype());
+ buf1 = NDArray(arr1.shape(), arr1.ctx(), false, arr1.dtype());
auto mem = arr1.GetMKLDNNData();
buf1.CopyFrom(*mem);
}
if (arr2.IsMKLDNNData()) {
- buf2 = NDArray(arr2.shape(), arr2.ctx(), false, arr2.dtype());
+ buf2 = NDArray(arr2.shape(), arr2.ctx(), false, arr2.dtype());
auto mem = arr2.GetMKLDNNData();
buf2.CopyFrom(*mem);
}
MKLDNNStream::Get()->Submit();
- DType *data1 = reinterpret_cast<DType *>(
- arr1.IsMKLDNNData() ? buf1.data().dptr_: arr1.data().dptr_);
- DType *data2 = reinterpret_cast<DType *>(
- arr2.IsMKLDNNData() ? buf2.data().dptr_: arr2.data().dptr_);
+ DType* data1 =
+ reinterpret_cast<DType*>(arr1.IsMKLDNNData() ? buf1.data().dptr_ : arr1.data().dptr_);
+ DType* data2 =
+ reinterpret_cast<DType*>(arr2.IsMKLDNNData() ? buf2.data().dptr_ : arr2.data().dptr_);
std::atomic<bool> success(true);
#pragma omp parallel for
#ifdef _MSC_VER
@@ -505,39 +517,42 @@ static bool SimilarArray(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2,
return success.load();
}
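Note: the per-element comparison itself falls outside this hunk; it conventionally follows the numpy-style allclose form, sketched below under that assumption (helper name and use of <cmath> are illustrative):

    // Assumed allclose-style predicate for one element pair:
    //   |a - b| <= atol + rtol * |b|
    template <typename DType>
    bool ElementClose(DType a, DType b, DType rtol, DType atol) {
      const double diff = std::abs(static_cast<double>(a) - static_cast<double>(b));
      return diff <= static_cast<double>(atol) +
                     static_cast<double>(rtol) * std::abs(static_cast<double>(b));
    }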
-template void FallBackCompute(void (*)(nnvm::NodeAttrs const &, OpContext const &,
- std::vector<TBlob, std::allocator<TBlob> > const &,
- std::vector<OpReqType, std::allocator<OpReqType> > const &,
- std::vector<TBlob, std::allocator<TBlob> > const &),
- nnvm::NodeAttrs const &, OpContext const &,
- std::vector<NDArray, std::allocator<NDArray> > const &,
- std::vector<OpReqType, std::allocator<OpReqType> > const &,
- std::vector<NDArray, std::allocator<NDArray> > const &);
-
-template void FallBackCompute(void (*)(OpStatePtr const &, OpContext const &,
- std::vector<TBlob, std::allocator<TBlob> > const &,
- std::vector<OpReqType, std::allocator<OpReqType> > const &,
- std::vector<TBlob, std::allocator<TBlob> > const &),
- OpStatePtr const &, OpContext const &,
- std::vector<NDArray, std::allocator<NDArray> > const &,
- std::vector<OpReqType, std::allocator<OpReqType> > const &,
- std::vector<NDArray, std::allocator<NDArray> > const &);
-
-void OpCheck::Init(const std::vector<mxnet::NDArray> &inputs_,
- const std::vector<mxnet::NDArray> &outputs_) {
+template void FallBackCompute(void (*)(nnvm::NodeAttrs const&,
+ OpContext const&,
+ std::vector<TBlob, std::allocator<TBlob>> const&,
+ std::vector<OpReqType, std::allocator<OpReqType>> const&,
+ std::vector<TBlob, std::allocator<TBlob>> const&),
+ nnvm::NodeAttrs const&,
+ OpContext const&,
+ std::vector<NDArray, std::allocator<NDArray>> const&,
+ std::vector<OpReqType, std::allocator<OpReqType>> const&,
+ std::vector<NDArray, std::allocator<NDArray>> const&);
+
+template void FallBackCompute(void (*)(OpStatePtr const&,
+ OpContext const&,
+ std::vector<TBlob, std::allocator<TBlob>> const&,
+ std::vector<OpReqType, std::allocator<OpReqType>> const&,
+ std::vector<TBlob, std::allocator<TBlob>> const&),
+ OpStatePtr const&,
+ OpContext const&,
+ std::vector<NDArray, std::allocator<NDArray>> const&,
+ std::vector<OpReqType, std::allocator<OpReqType>> const&,
+ std::vector<NDArray, std::allocator<NDArray>> const&);
+
+void OpCheck::Init(const std::vector<mxnet::NDArray>& inputs_,
+ const std::vector<mxnet::NDArray>& outputs_) {
auto ctx = inputs_[0].ctx();
CHECK(!MKLDNNStream::Get()->HasOps());
for (size_t i = 0; i < inputs_.size(); i++) {
NDArray data = inputs_[i];
inputs.emplace_back(data.shape(), ctx, false, data.dtype());
if (data.IsMKLDNNData() && data.IsView())
- data = data.Reorder2Default();
+ data = data.Reorder2Default();
auto mem = data.GetMKLDNNData();
inputs[i].CopyFrom(*mem);
}
for (size_t i = 0; i < outputs_.size(); i++) {
- outputs.emplace_back(outputs_[i].shape(), ctx,
- false, outputs_[i].dtype());
+ outputs.emplace_back(outputs_[i].shape(), ctx, false, outputs_[i].dtype());
if (backward) {
auto mem = outputs_[i].GetMKLDNNData();
outputs[i].CopyFrom(*mem);
@@ -546,18 +561,20 @@ void OpCheck::Init(const std::vector<mxnet::NDArray> &inputs_,
MKLDNNStream::Get()->Submit();
}
-void OpCheck::Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const std::vector<mxnet::NDArray> &inputs_,
- const std::vector<mxnet::OpReqType> &req,
- const std::vector<mxnet::NDArray> &outputs_) {
+void OpCheck::Run(mxnet::FCompute fn,
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const std::vector<mxnet::NDArray>& inputs_,
+ const std::vector<mxnet::OpReqType>& req,
+ const std::vector<mxnet::NDArray>& outputs_) {
static auto& is_excluded = Op::GetAttr<bool>("TExcludeMKLDNNDebug");
if (is_excluded.get(attrs.op, false)) {
LOG(WARNING) << attrs.op->name << " not checked. TExcludeMKLDNNDebug flag present";
return;
}
std::vector<mxnet::TBlob> in_blobs(inputs.size());
- for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data();
+ for (size_t i = 0; i < in_blobs.size(); i++)
+ in_blobs[i] = inputs[i].data();
std::vector<mxnet::TBlob> out_blobs(outputs.size());
for (size_t i = 0; i < out_blobs.size(); i++)
out_blobs[i] = outputs[i].data();
@@ -565,7 +582,7 @@ void OpCheck::Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs,
if (dmlc::GetEnv("MXNET_MKLDNN_DEBUG", false))
LOG(INFO) << "test " << attrs.op->name;
size_t num = std::min(outputs.size(), outputs_.size());
- num = std::min(num_checks, num);
+ num = std::min(num_checks, num);
for (size_t i = 0; i < num; i++) {
// We don't need to compare if it doesn't need to output data.
if (req[i] == kNullOp)
@@ -580,10 +597,10 @@ void OpCheck::Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs,
}
}
-void OpCheck::CopyResult(const std::vector<mxnet::NDArray> &outputs_,
- const std::vector<size_t> &indice) {
+void OpCheck::CopyResult(const std::vector<mxnet::NDArray>& outputs_,
+ const std::vector<size_t>& indice) {
CHECK(!MKLDNNStream::Get()->HasOps());
- auto non_const_outputs_ = const_cast<std::vector<mxnet::NDArray> &>(outputs_);
+ auto non_const_outputs_ = const_cast<std::vector<mxnet::NDArray>&>(outputs_);
for (auto i = indice.begin(); i != indice.end(); ++i) {
auto mem = outputs[*i].GetMKLDNNData();
non_const_outputs_[*i].CopyFrom(*mem);
@@ -591,14 +608,15 @@ void OpCheck::CopyResult(const std::vector<mxnet::NDArray> &outputs_,
MKLDNNStream::Get()->Submit();
}
-bool MKLDNNStorageType(const nnvm::NodeAttrs &attrs,
+bool MKLDNNStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
bool support_mkldnn,
- DispatchMode *dispatch_mode,
- std::vector<int> *in_attrs,
- std::vector<int> *out_attrs) {
+ DispatchMode* dispatch_mode,
+ std::vector<int>* in_attrs,
+ std::vector<int>* out_attrs) {
for (int& v : *in_attrs)
- if (v == - 1) v = kDefaultStorage;
+ if (v == -1)
+ v = kDefaultStorage;
DispatchMode wanted_mode;
#if MXNET_USE_MKLDNN == 1
@@ -612,8 +630,8 @@ bool MKLDNNStorageType(const nnvm::NodeAttrs &attrs,
bool dispatched = false;
if (!dispatched && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) {
- dispatched = op::storage_type_assign(out_attrs, mxnet::kDefaultStorage,
- dispatch_mode, wanted_mode);
+ dispatched =
+ op::storage_type_assign(out_attrs, mxnet::kDefaultStorage, dispatch_mode, wanted_mode);
}
if (!dispatched) {
dispatched = op::dispatch_fallback(out_attrs, dispatch_mode);
@@ -621,10 +639,10 @@ bool MKLDNNStorageType(const nnvm::NodeAttrs &attrs,
return dispatched;
}
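Note: operators typically plug this into their storage-type inference through a thin wrapper that fixes the support_mkldnn flag; a hedged sketch with an illustrative name, not part of this patch:

    // Illustrative per-operator wrapper around MKLDNNStorageType; such a
    // function is what gets registered as the op's FInferStorageType attribute.
    static bool SketchStorageType(const nnvm::NodeAttrs& attrs,
                                  const int dev_mask,
                                  DispatchMode* dispatch_mode,
                                  std::vector<int>* in_attrs,
                                  std::vector<int>* out_attrs) {
      return MKLDNNStorageType(attrs, dev_mask, /*support_mkldnn=*/true,
                               dispatch_mode, in_attrs, out_attrs);
    }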
-inline static const std::vector<NDArray> GetMKLDNNInputArray(const std::vector<NDArray> &inputs) {
+inline static const std::vector<NDArray> GetMKLDNNInputArray(const std::vector<NDArray>& inputs) {
std::vector<NDArray> ret;
ret.reserve(inputs.size());
- for (const auto &in : inputs) {
+ for (const auto& in : inputs) {
if (in.IsView() && in.IsMKLDNNData()) {
ret.push_back(in.Reorder2Default());
} else {
@@ -635,11 +653,11 @@ inline static const std::vector<NDArray> GetMKLDNNInputArray(const std::vector<N
}
void MKLDNNRun(mxnet::FComputeEx fn,
- const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const std::vector<mxnet::NDArray> &inputs,
- const std::vector<mxnet::OpReqType> &req,
- const std::vector<mxnet::NDArray> &outputs) {
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const std::vector<mxnet::NDArray>& inputs,
+ const std::vector<mxnet::OpReqType>& req,
+ const std::vector<mxnet::NDArray>& outputs) {
if (CheckMKLDNNInputArrayIsView(inputs)) {
const auto mkldnn_inputs = GetMKLDNNInputArray(inputs);
fn(attrs, ctx, mkldnn_inputs, req, outputs);
@@ -649,11 +667,11 @@ void MKLDNNRun(mxnet::FComputeEx fn,
}
void MKLDNNRun(FComputeExUnary fn,
- const nnvm::NodeAttrs &attrs,
- const mxnet::OpContext &ctx,
- const mxnet::NDArray &input,
- const mxnet::OpReqType &req,
- const mxnet::NDArray &output) {
+ const nnvm::NodeAttrs& attrs,
+ const mxnet::OpContext& ctx,
+ const mxnet::NDArray& input,
+ const mxnet::OpReqType& req,
+ const mxnet::NDArray& output) {
auto mkldnn_input = input;
if (input.IsView() && input.IsMKLDNNData()) {
mkldnn_input = input.Reorder2Default();
diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
index 75c7c4d..67d7841 100644
--- a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h
@@ -11,7 +11,7 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY92
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
@@ -21,56 +21,55 @@
* \file mkldnn_batch_norm.cc
* \brief
* \author Tao Lv
-*/
+ */
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_
#if MXNET_USE_MKLDNN == 1
-#include <vector>
-#include <utility>
#include <mkldnn.hpp>
+#include <utility>
+#include <vector>
+
#include "../batch_norm-inl.h"
-#include "./mkldnn_ops-inl.h"
#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
-#define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0/std::sqrt((__var$) + DType(__eps$)))
-#define INVSTD_TO_VARIANCE(__invstd$, __eps$) ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$))
namespace mxnet {
namespace op {
-typedef mkldnn::batch_normalization_forward::primitive_desc t_bn_f_pdesc;
-typedef mkldnn::batch_normalization_forward::desc t_bn_f_desc;
-typedef mkldnn::batch_normalization_backward::primitive_desc t_bn_b_pdesc;
-typedef mkldnn::batch_normalization_backward::desc t_bn_b_desc;
+typedef mkldnn::batch_normalization_forward::primitive_desc t_bn_f_pdesc;
+typedef mkldnn::batch_normalization_forward::desc t_bn_f_desc;
+typedef mkldnn::batch_normalization_backward::primitive_desc t_bn_b_pdesc;
+typedef mkldnn::batch_normalization_backward::desc t_bn_b_desc;
-inline static mkldnn::normalization_flags _GetFlags(const std::vector<NDArray> &in_data,
- const std::vector<NDArray> &aux_states,
+inline static mkldnn::normalization_flags _GetFlags(const std::vector<NDArray>& in_data,
+ const std::vector<NDArray>& aux_states,
bool is_train_and_not_global_stats,
bool fuse_relu) {
mkldnn::normalization_flags flags = static_cast<mkldnn::normalization_flags>(0U);
if (in_data.size() == 3U) {
- flags |= mkldnn::normalization_flags::use_scale_shift;
+ flags |= mkldnn::normalization_flags::use_scale_shift;
}
// aux_states[0]: inMean
// aux_states[1]: inVariance
if (aux_states.size() == 2U && !is_train_and_not_global_stats) {
- flags |= mkldnn::normalization_flags::use_global_stats;
+ flags |= mkldnn::normalization_flags::use_global_stats;
}
if (fuse_relu) {
- flags |= mkldnn::normalization_flags::fuse_norm_relu;
+ flags |= mkldnn::normalization_flags::fuse_norm_relu;
}
return flags;
}
-inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory &data_mem,
+inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory& data_mem,
bool is_train,
float eps,
mkldnn::normalization_flags flags) {
- auto data_md = data_mem.get_desc();
- auto engine = CpuEngine::Get()->get_engine();
+ auto data_md = data_mem.get_desc();
+ auto engine = CpuEngine::Get()->get_engine();
if (is_train) {
t_bn_f_desc bnFwd_desc(mkldnn::prop_kind::forward_training, data_md, eps, flags);
@@ -81,15 +80,15 @@ inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory &data_mem,
}
}
-inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory &data_mem,
- const mkldnn::memory &diff_mem,
+inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory& data_mem,
+ const mkldnn::memory& diff_mem,
float eps,
mkldnn::normalization_flags flags) {
- auto data_md = data_mem.get_desc();
- auto diff_md = diff_mem.get_desc();
- auto engine = CpuEngine::Get()->get_engine();
+ auto data_md = data_mem.get_desc();
+ auto diff_md = diff_mem.get_desc();
+ auto engine = CpuEngine::Get()->get_engine();
- t_bn_b_desc bnBwd_desc(mkldnn::prop_kind::backward, diff_md, data_md, eps, flags);
+ t_bn_b_desc bnBwd_desc(mkldnn::prop_kind::backward, diff_md, data_md, eps, flags);
return t_bn_b_pdesc(bnBwd_desc, engine, _GetFwd(data_mem, true, eps, flags));
}
@@ -102,28 +101,29 @@ class MKLDNNBNForward {
t_bn_f_pdesc pd;
public:
- MKLDNNBNForward(const t_bn_f_pdesc &_pd, bool is_train_and_not_global_stats): pd(_pd) {
+ MKLDNNBNForward(const t_bn_f_pdesc& _pd, bool is_train_and_not_global_stats) : pd(_pd) {
weight_m.reset(new mkldnn::memory(pd.weights_desc(), CpuEngine::Get()->get_engine()));
fwd.reset(new mkldnn::batch_normalization_forward(pd));
this->is_train_and_not_global_stats = is_train_and_not_global_stats;
}
- const mkldnn::memory &GetWeight() const {
+ const mkldnn::memory& GetWeight() const {
return *weight_m;
}
- const t_bn_f_pdesc &GetPd() const {
+ const t_bn_f_pdesc& GetPd() const {
return pd;
}
- const mkldnn::batch_normalization_forward &GetFwd() const {
+ const mkldnn::batch_normalization_forward& GetFwd() const {
return *fwd;
}
};
-template<typename DType>
-static MKLDNNBNForward &GetBNForward(const BatchNormParam& param,
- const OpContext &ctx, const mkldnn::memory *data_mem,
+template <typename DType>
+static MKLDNNBNForward& GetBNForward(const BatchNormParam& param,
+ const OpContext& ctx,
+ const mkldnn::memory* data_mem,
mkldnn::normalization_flags flags) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<MKLDNNBNSignature, MKLDNNBNForward, OpHash> fwds;
@@ -137,8 +137,7 @@ static MKLDNNBNForward &GetBNForward(const BatchNormParam& param,
auto it = fwds.find(key);
if (it == fwds.end()) {
- auto fwd_pd = _GetFwd(*data_mem, ctx.is_train,
- param.eps, flags);
+ auto fwd_pd = _GetFwd(*data_mem, ctx.is_train, param.eps, flags);
MKLDNNBNForward fwd(fwd_pd, ctx.is_train && !param.use_global_stats);
it = AddToCache(&fwds, key, fwd);
}
@@ -146,10 +145,13 @@ static MKLDNNBNForward &GetBNForward(const BatchNormParam& param,
}
template <typename DType>
-void MKLDNNBatchNormForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &inputs, const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs, bool fuse_relu) {
- const BatchNormParam ¶m = nnvm::get<BatchNormParam>(attrs.parsed);
+void MKLDNNBatchNormForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs,
+ bool fuse_relu) {
+ const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
std::vector<NDArray> in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean);
mxnet::TShape shape = inputs[batchnorm::kData].shape();
@@ -159,96 +161,92 @@ void MKLDNNBatchNormForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
if (param.axis != 1 || shape.ndim() != 4) {
// reshape to (N, C, 1, D)
mxnet::TShape new_shape{
- static_cast<dim_t>(shape.ProdShape(0, real_axis)),
- shape[real_axis],
- 1,
- static_cast<dim_t>(shape.ProdShape(real_axis + 1,
- static_cast<int>(shape.ndim())))
- };
+ static_cast<dim_t>(shape.ProdShape(0, real_axis)),
+ shape[real_axis],
+ 1,
+ static_cast<dim_t>(shape.ProdShape(real_axis + 1, static_cast<int>(shape.ndim())))};
in_data[batchnorm::kData] = in_data[batchnorm::kData].Reshape(new_shape);
- out = out.Reshape(new_shape);
+ out = out.Reshape(new_shape);
}
const std::vector<NDArray> aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end());
TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]);
- mkldnn::normalization_flags flags = _GetFlags(in_data,
- aux_states,
- ctx.is_train && !param.use_global_stats,
- fuse_relu);
- NDArray &data = in_data[batchnorm::kData];
+ mkldnn::normalization_flags flags =
+ _GetFlags(in_data, aux_states, ctx.is_train && !param.use_global_stats, fuse_relu);
+ NDArray& data = in_data[batchnorm::kData];
if (data.IsMKLDNNData() && data.IsView())
data = data.Reorder2Default();
auto data_mem = data.GetMKLDNNData();
- auto &fwd = GetBNForward<DType>(param, ctx, data_mem, flags);
+ auto& fwd = GetBNForward<DType>(param, ctx, data_mem, flags);
// for output memory
- auto out_mem = const_cast<NDArray &>(out).CreateMKLDNNData(fwd.GetPd().dst_desc());
+ auto out_mem = const_cast<NDArray&>(out).CreateMKLDNNData(fwd.GetPd().dst_desc());
// mxnet will always use scale shift.
// But if fix_gamma is true, then all scale elements will be set to 1.0f
if (static_cast<int>(flags) & static_cast<int>(mkldnn::normalization_flags::use_scale_shift)) {
- const NDArray &gamma = in_data[batchnorm::kGamma];
- const NDArray &beta = in_data[batchnorm::kBeta];
+ const NDArray& gamma = in_data[batchnorm::kGamma];
+ const NDArray& beta = in_data[batchnorm::kBeta];
CHECK_EQ(gamma.storage_type(), mxnet::kDefaultStorage);
CHECK_EQ(beta.storage_type(), mxnet::kDefaultStorage);
- const mkldnn::memory &weight_mem = fwd.GetWeight();
- float* weight_buf = reinterpret_cast<float *>(weight_mem.get_data_handle());
+ const mkldnn::memory& weight_mem = fwd.GetWeight();
+ float* weight_buf = reinterpret_cast<float*>(weight_mem.get_data_handle());
nnvm::dim_t channels_ = data.shape()[1];
CHECK(weight_mem.get_desc().get_size() == channels_ * sizeof(float) * 2);
- float* weight_ptr = gamma.data().dptr<float>();
- float* bias_ptr = beta.data().dptr<float>();
+ float* weight_ptr = gamma.data().dptr<float>();
+ float* bias_ptr = beta.data().dptr<float>();
const size_t copy_size = sizeof(weight_buf[0]) * channels_;
if (!param.fix_gamma) {
memcpy(weight_buf, weight_ptr, copy_size);
memcpy(&weight_buf[channels_], bias_ptr, copy_size);
} else if (IsBNWriting(req[batchnorm::kGamma])) {
for (int i = 0; i < channels_; i++) {
- weight_buf[i] = 1.0f;
- weight_ptr[i] = 1.0f;
+ weight_buf[i] = 1.0f;
+ weight_ptr[i] = 1.0f;
weight_buf[channels_ + i] = bias_ptr[i]; // bias
}
} else {
for (int i = 0; i < channels_; i++) {
- weight_buf[i] = 1.0f;
+ weight_buf[i] = 1.0f;
weight_buf[channels_ + i] = bias_ptr[i]; // bias
}
}
mkldnn_args_map_t net_args;
- net_args[MKLDNN_ARG_SRC] = *data_mem;
+ net_args[MKLDNN_ARG_SRC] = *data_mem;
net_args[MKLDNN_ARG_SCALE_SHIFT] = weight_mem;
- net_args[MKLDNN_ARG_DST] = *out_mem;
+ net_args[MKLDNN_ARG_DST] = *out_mem;
if (fuse_relu) {
- const NDArray *workspace = nullptr;
- workspace = &outputs[3];
- auto engine = CpuEngine::Get()->get_engine();
+ const NDArray* workspace = nullptr;
+ workspace = &outputs[3];
+ auto engine = CpuEngine::Get()->get_engine();
if (workspace == nullptr) {
- LOG(FATAL) << "MKLDNN BatchNorm: incorrect workspace input";
+ LOG(FATAL) << "MKLDNN BatchNorm: incorrect workspace input";
}
- auto ws = std::make_shared<mkldnn::memory>(fwd.GetPd().workspace_desc(),
- engine, workspace->GetMKLDNNData()->get_data_handle());
+ auto ws = std::make_shared<mkldnn::memory>(
+ fwd.GetPd().workspace_desc(), engine, workspace->GetMKLDNNData()->get_data_handle());
net_args[MKLDNN_ARG_WORKSPACE] = *ws;
}
if (!ctx.is_train || param.use_global_stats) {
- float* omean = outputs[batchnorm::kMean].data().dptr<float>();
- float* ovar = outputs[batchnorm::kVar].data().dptr<float>();
- float* inmean = aux_states[batchnorm::kMovingMean].data().dptr<float>();
- float* invar = aux_states[batchnorm::kMovingVar].data().dptr<float>();
+ float* omean = outputs[batchnorm::kMean].data().dptr<float>();
+ float* ovar = outputs[batchnorm::kVar].data().dptr<float>();
+ float* inmean = aux_states[batchnorm::kMovingMean].data().dptr<float>();
+ float* invar = aux_states[batchnorm::kMovingVar].data().dptr<float>();
// to align with origin implmentation: batch_norm.cc: L164
for (int i = 0; i < channels_; i++) {
omean[i] = inmean[i];
- ovar[i] = VARIANCE_TO_INVSTD(invar[i], param.eps);
+ ovar[i] = VARIANCE_TO_INVSTD(invar[i], param.eps);
}
- net_args[MKLDNN_ARG_MEAN] = *(aux_states[batchnorm::kMovingMean].GetMKLDNNData());
+ net_args[MKLDNN_ARG_MEAN] = *(aux_states[batchnorm::kMovingMean].GetMKLDNNData());
net_args[MKLDNN_ARG_VARIANCE] = *(aux_states[batchnorm::kMovingVar].GetMKLDNNData());
MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args);
MKLDNNStream::Get()->Submit();
} else { // training
- const NDArray &outMean = outputs[batchnorm::kMean];
- const NDArray &outVar = outputs[batchnorm::kVar];
- net_args[MKLDNN_ARG_MEAN] = *(outMean.GetMKLDNNData());
+ const NDArray& outMean = outputs[batchnorm::kMean];
+ const NDArray& outVar = outputs[batchnorm::kVar];
+ net_args[MKLDNN_ARG_MEAN] = *(outMean.GetMKLDNNData());
net_args[MKLDNN_ARG_VARIANCE] = *(outVar.GetMKLDNNData());
MKLDNNStream::Get()->RegisterPrimArgs(fwd.GetFwd(), net_args);
MKLDNNStream::Get()->Submit();
@@ -271,25 +269,34 @@ class MKLDNNBNBackward {
public:
const t_bn_b_pdesc pd;
- explicit MKLDNNBNBackward(const t_bn_b_pdesc &_pd)
+ explicit MKLDNNBNBackward(const t_bn_b_pdesc& _pd)
: weight_m(new mkldnn::memory(_pd.weights_desc(), CpuEngine::Get()->get_engine())),
gradw_m(new mkldnn::memory(_pd.diff_weights_desc(), CpuEngine::Get()->get_engine())),
pd(_pd) {
bwd.reset(new mkldnn::batch_normalization_backward(pd));
}
- const mkldnn::memory &GetWeight() const { return *weight_m; }
+ const mkldnn::memory& GetWeight() const {
+ return *weight_m;
+ }
- const mkldnn::memory &GetGradw() const { return *gradw_m; }
+ const mkldnn::memory& GetGradw() const {
+ return *gradw_m;
+ }
- const mkldnn::batch_normalization_backward &GetBwd() const { return *bwd; }
+ const mkldnn::batch_normalization_backward& GetBwd() const {
+ return *bwd;
+ }
};
template <typename DType>
-static MKLDNNBNBackward &GetBNBackward(
- const BatchNormParam ¶m, const OpContext &ctx, const NDArray &in_data,
- const mkldnn::memory &in_mem, const NDArray &diff_data,
- const mkldnn::memory &diff_mem, mkldnn::normalization_flags flags) {
+static MKLDNNBNBackward& GetBNBackward(const BatchNormParam& param,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const mkldnn::memory& in_mem,
+ const NDArray& diff_data,
+ const mkldnn::memory& diff_mem,
+ mkldnn::normalization_flags flags) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<MKLDNNBNSignature, MKLDNNBNBackward, OpHash> bwds;
#else
@@ -310,41 +317,42 @@ static MKLDNNBNBackward &GetBNBackward(
}
template <typename DType>
-void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &inputs, const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs, bool fuse_relu) {
+void MKLDNNBatchNormBackward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs,
+ bool fuse_relu) {
if (fuse_relu) {
CHECK_EQ(inputs.size(), 9U);
} else {
CHECK_EQ(inputs.size(), 8U);
}
- const BatchNormParam ¶m = nnvm::get<BatchNormParam>(attrs.parsed);
+ const BatchNormParam& param = nnvm::get<BatchNormParam>(attrs.parsed);
std::vector<NDArray> out_grad(1);
std::vector<NDArray> out_data(3);
std::vector<NDArray> in_data(3);
std::vector<NDArray> aux_states(2);
- out_grad[0] = inputs[0];
- out_data[batchnorm::kMean] = inputs[1];
- out_data[batchnorm::kVar] = inputs[2];
- in_data[batchnorm::kData] = inputs[3];
- in_data[batchnorm::kGamma] = inputs[4];
- in_data[batchnorm::kBeta] = inputs[5];
- aux_states[batchnorm::kMovingMean] = inputs[6];
- aux_states[batchnorm::kMovingVar] = inputs[7];
- const std::vector<NDArray> &in_grad = outputs;
+ out_grad[0] = inputs[0];
+ out_data[batchnorm::kMean] = inputs[1];
+ out_data[batchnorm::kVar] = inputs[2];
+ in_data[batchnorm::kData] = inputs[3];
+ in_data[batchnorm::kGamma] = inputs[4];
+ in_data[batchnorm::kBeta] = inputs[5];
+ aux_states[batchnorm::kMovingMean] = inputs[6];
+ aux_states[batchnorm::kMovingVar] = inputs[7];
+ const std::vector<NDArray>& in_grad = outputs;
TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]);
- mkldnn::normalization_flags flags = _GetFlags(in_data,
- aux_states,
- ctx.is_train && !param.use_global_stats,
- fuse_relu);
-
- NDArray data = in_data[batchnorm::kData];
- NDArray diff = out_grad[batchnorm::kOut];
- NDArray gradIn = in_grad[batchnorm::kData];
- const NDArray &moving_mean = aux_states[batchnorm::kMovingMean];
- const NDArray &moving_var = aux_states[batchnorm::kMovingVar];
- const NDArray &out_mean = out_data[batchnorm::kMean];
- const NDArray &out_var = out_data[batchnorm::kVar];
+ mkldnn::normalization_flags flags =
+ _GetFlags(in_data, aux_states, ctx.is_train && !param.use_global_stats, fuse_relu);
+
+ NDArray data = in_data[batchnorm::kData];
+ NDArray diff = out_grad[batchnorm::kOut];
+ NDArray gradIn = in_grad[batchnorm::kData];
+ const NDArray& moving_mean = aux_states[batchnorm::kMovingMean];
+ const NDArray& moving_var = aux_states[batchnorm::kMovingVar];
+ const NDArray& out_mean = out_data[batchnorm::kMean];
+ const NDArray& out_var = out_data[batchnorm::kVar];
CHECK(out_mean.IsDefaultData());
CHECK(out_var.IsDefaultData());
@@ -357,36 +365,34 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
if (param.axis != 1 || shape.ndim() != 4) {
// reshape to (N, C, 1, D)
mxnet::TShape new_shape{
- static_cast<dim_t>(shape.ProdShape(0, real_axis)),
- shape[real_axis],
- 1,
- static_cast<dim_t>(shape.ProdShape(real_axis + 1,
- static_cast<int>(shape.ndim())))
- };
- data = data.Reshape(new_shape);
- diff = diff.Reshape(new_shape);
+ static_cast<dim_t>(shape.ProdShape(0, real_axis)),
+ shape[real_axis],
+ 1,
+ static_cast<dim_t>(shape.ProdShape(real_axis + 1, static_cast<int>(shape.ndim())))};
+ data = data.Reshape(new_shape);
+ diff = diff.Reshape(new_shape);
gradIn = gradIn.Reshape(new_shape);
}
- auto data_mem = data.GetMKLDNNData();
- auto diff_mem = diff.GetMKLDNNData();
+ auto data_mem = data.GetMKLDNNData();
+ auto diff_mem = diff.GetMKLDNNData();
// MKLDNN batchnorm should run on special layouts. If one of them isn't, we
// should reorder them.
if (data.IsDefaultData())
data_mem = data.GetMKLDNNDataReorder(diff_mem->get_desc());
else if (diff.IsDefaultData())
diff_mem = diff.GetMKLDNNDataReorder(data_mem->get_desc());
- auto &bwd = GetBNBackward<DType>(param, ctx, data, *data_mem, diff, *diff_mem, flags);
- auto gradi_mem = CreateMKLDNNMem(const_cast<NDArray &>(gradIn),
- bwd.pd.diff_src_desc(), req[batchnorm::kData]);
+ auto& bwd = GetBNBackward<DType>(param, ctx, data, *data_mem, diff, *diff_mem, flags);
+ auto gradi_mem =
+ CreateMKLDNNMem(const_cast<NDArray&>(gradIn), bwd.pd.diff_src_desc(), req[batchnorm::kData]);
if (static_cast<int>(flags) & static_cast<int>(mkldnn::normalization_flags::use_scale_shift)) {
- const NDArray &gamma = in_data[batchnorm::kGamma];
- const NDArray &beta = in_data[batchnorm::kBeta];
- DType *weight_buf = reinterpret_cast<DType *>(bwd.GetWeight().get_data_handle());
- nnvm::dim_t channels_ = data.shape()[1];
- DType *weight_ptr = gamma.data().dptr<DType>();
- DType* bias_ptr = beta.data().dptr<DType>();
+ const NDArray& gamma = in_data[batchnorm::kGamma];
+ const NDArray& beta = in_data[batchnorm::kBeta];
+ DType* weight_buf = reinterpret_cast<DType*>(bwd.GetWeight().get_data_handle());
+ nnvm::dim_t channels_ = data.shape()[1];
+ DType* weight_ptr = gamma.data().dptr<DType>();
+ DType* bias_ptr = beta.data().dptr<DType>();
const size_t copy_size = sizeof(DType) * channels_;
if (!param.fix_gamma) {
memcpy(weight_buf, weight_ptr, copy_size);
@@ -398,15 +404,15 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
memcpy(&weight_buf[channels_], bias_ptr, copy_size);
}
mkldnn_args_map_t net_args;
- net_args[MKLDNN_ARG_SRC] = *data_mem;
- net_args[MKLDNN_ARG_DIFF_SRC] = *gradi_mem.second;
- net_args[MKLDNN_ARG_SCALE_SHIFT] = bwd.GetWeight();
+ net_args[MKLDNN_ARG_SRC] = *data_mem;
+ net_args[MKLDNN_ARG_DIFF_SRC] = *gradi_mem.second;
+ net_args[MKLDNN_ARG_SCALE_SHIFT] = bwd.GetWeight();
net_args[MKLDNN_ARG_DIFF_SCALE_SHIFT] = bwd.GetGradw();
- net_args[MKLDNN_ARG_DIFF_DST] = *diff_mem;
+ net_args[MKLDNN_ARG_DIFF_DST] = *diff_mem;
if (fuse_relu) {
- const NDArray *workspace = nullptr;
- workspace = &inputs[8];
+ const NDArray* workspace = nullptr;
+ workspace = &inputs[8];
if (workspace != nullptr) {
net_args[MKLDNN_ARG_WORKSPACE] = *(workspace->GetMKLDNNData());
}
@@ -414,26 +420,24 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
// training but no input mean and variance
if (ctx.is_train && !param.use_global_stats) {
- DType* moving_mean_ptr = moving_mean.data().dptr<DType>();
- DType* moving_var_ptr = moving_var.data().dptr<DType>();
- DType* out_mean_ptr = out_mean.data().dptr<DType>();
- DType* out_var_ptr = out_var.data().dptr<DType>();
+ DType* moving_mean_ptr = moving_mean.data().dptr<DType>();
+ DType* moving_var_ptr = moving_var.data().dptr<DType>();
+ DType* out_mean_ptr = out_mean.data().dptr<DType>();
+ DType* out_var_ptr = out_var.data().dptr<DType>();
mkldnn::memory var_mem(bwd.pd.variance_desc(), CpuEngine::Get()->get_engine());
- DType *tmp_var_ptr = reinterpret_cast<DType *>(var_mem.get_data_handle());
+ DType* tmp_var_ptr = reinterpret_cast<DType*>(var_mem.get_data_handle());
DType minus_mom = (1.0f - param.momentum);
for (int i = 0; i < channels_; i++) {
- moving_mean_ptr[i] = moving_mean_ptr[i] * param.momentum +
- out_mean_ptr[i] * minus_mom;
- float variance = INVSTD_TO_VARIANCE(out_var_ptr[i], param.eps);
- tmp_var_ptr[i] = variance;
- moving_var_ptr[i] = moving_var_ptr[i] * param.momentum +
- variance * minus_mom;
+ moving_mean_ptr[i] = moving_mean_ptr[i] * param.momentum + out_mean_ptr[i] * minus_mom;
+ float variance = INVSTD_TO_VARIANCE(out_var_ptr[i], param.eps);
+ tmp_var_ptr[i] = variance;
+ moving_var_ptr[i] = moving_var_ptr[i] * param.momentum + variance * minus_mom;
}
- net_args[MKLDNN_ARG_MEAN] = *(out_mean.GetMKLDNNData());
+ net_args[MKLDNN_ARG_MEAN] = *(out_mean.GetMKLDNNData());
net_args[MKLDNN_ARG_VARIANCE] = var_mem;
} else {
- net_args[MKLDNN_ARG_MEAN] = *(moving_mean.GetMKLDNNData());
+ net_args[MKLDNN_ARG_MEAN] = *(moving_mean.GetMKLDNNData());
net_args[MKLDNN_ARG_VARIANCE] = *(moving_var.GetMKLDNNData());
}
MKLDNNStream::Get()->RegisterPrimArgs(bwd.GetBwd(), net_args);
@@ -441,9 +445,9 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
MKLDNNStream::Get()->Submit();
// copy data from gradw_mem to in_grad[1] and in_grad[2]
- DType *gw_buf = reinterpret_cast<DType *>(bwd.GetGradw().get_data_handle());
- DType *w_grad_1 = in_grad[batchnorm::kGamma].data().dptr<DType>();
- DType *w_grad_2 = in_grad[batchnorm::kBeta].data().dptr<DType>();
+ DType* gw_buf = reinterpret_cast<DType*>(bwd.GetGradw().get_data_handle());
+ DType* w_grad_1 = in_grad[batchnorm::kGamma].data().dptr<DType>();
+ DType* w_grad_2 = in_grad[batchnorm::kBeta].data().dptr<DType>();
// the gradient of gamma
if (!param.fix_gamma) {
@@ -467,7 +471,7 @@ void MKLDNNBatchNormBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
if (req[batchnorm::kBeta] != kAddTo) {
memcpy(w_grad_2, &gw_buf[channels_], copy_size);
} else {
- DType *grad_beta = &gw_buf[channels_];
+ DType* grad_beta = &gw_buf[channels_];
for (int i = 0; i < channels_; i++) {
w_grad_2[i] += grad_beta[i];
}
diff --git a/src/operator/nn/mkldnn/mkldnn_concat-inl.h b/src/operator/nn/mkldnn/mkldnn_concat-inl.h
index 66cb851..14f980a 100644
--- a/src/operator/nn/mkldnn/mkldnn_concat-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_concat-inl.h
@@ -21,17 +21,17 @@
* \file mkldnn_concat-inl.h
* \brief
* \author
-*/
+ */
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONCAT_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONCAT_INL_H_
-
#if MXNET_USE_MKLDNN == 1
-#include <vector>
#include <utility>
+#include <vector>
+
#include "../concat-inl.h"
-#include "./mkldnn_ops-inl.h"
#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
namespace mxnet {
namespace op {
@@ -40,17 +40,19 @@ class MKLDNNConcatFwd {
public:
mkldnn::concat::primitive_desc fwd_pd;
- MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc> &data_md);
+ MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc>& data_md);
- const mkldnn::concat &GetFwd() const { return *fwd_; }
+ const mkldnn::concat& GetFwd() const {
+ return *fwd_;
+ }
private:
std::shared_ptr<mkldnn::concat> fwd_;
};
-static MKLDNNConcatFwd &GetConcatForward(
- int concat_dim, const std::vector<NDArray> &in_data,
- const std::vector<mkldnn::memory::desc> &data_md) {
+static MKLDNNConcatFwd& GetConcatForward(int concat_dim,
+ const std::vector<NDArray>& in_data,
+ const std::vector<mkldnn::memory::desc>& data_md) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<OpSignature, MKLDNNConcatFwd, OpHash> fwds;
#else
diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc
index 1dd2dc3..689888a 100644
--- a/src/operator/nn/mkldnn/mkldnn_concat.cc
+++ b/src/operator/nn/mkldnn/mkldnn_concat.cc
@@ -21,7 +21,7 @@
* \file mkldnn_concat.cc
* \brief
* \author
-*/
+ */
#if MXNET_USE_MKLDNN == 1
#include "mkldnn_concat-inl.h"
@@ -29,15 +29,16 @@
namespace mxnet {
namespace op {
-static inline bool IsUsingPadding(const mkldnn::memory::desc &dst_md) {
+static inline bool IsUsingPadding(const mkldnn::memory::desc& dst_md) {
// make sure a blocked format is used (at least one dimension is blocked)
- bool is_blocked_format = dst_md.data.format_kind == mkldnn_blocked &&
- dst_md.data.format_desc.blocking.inner_nblks > 0;
- return is_blocked_format && !std::equal(dst_md.data.dims, dst_md.data.dims + dst_md.data.ndims,
- dst_md.data.padded_dims);
+ bool is_blocked_format =
+ dst_md.data.format_kind == mkldnn_blocked && dst_md.data.format_desc.blocking.inner_nblks > 0;
+ return is_blocked_format &&
+ !std::equal(
+ dst_md.data.dims, dst_md.data.dims + dst_md.data.ndims, dst_md.data.padded_dims);
}
-MKLDNNConcatFwd::MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc> &data_md)
+MKLDNNConcatFwd::MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memory::desc>& data_md)
: fwd_pd(concat_dim, data_md, CpuEngine::Get()->get_engine()) {
// MKL-DNN introduced padded formats since 0.15 which require more memory
// compared to the actual size of the tensor. Currently, MKL-DNN operators
@@ -45,39 +46,39 @@ MKLDNNConcatFwd::MKLDNNConcatFwd(int concat_dim, const std::vector<mkldnn::memor
// format that has the expected memory size requirements (a plain format)
// When fwd_pd uses padding, impose a plain format
- const auto &dst_md = fwd_pd.dst_desc();
+ const auto& dst_md = fwd_pd.dst_desc();
if (IsUsingPadding(dst_md)) {
- auto plain_dst_tag = static_cast<mkldnn::memory::format_tag>(
- GetDefaultFormat(dst_md.data.ndims));
+ auto plain_dst_tag =
+ static_cast<mkldnn::memory::format_tag>(GetDefaultFormat(dst_md.data.ndims));
auto plain_dst_md = mkldnn::memory::desc(dst_md.dims(), dst_md.data_type(), plain_dst_tag);
- fwd_pd = mkldnn::concat::primitive_desc(plain_dst_md, concat_dim, data_md,
- CpuEngine::Get()->get_engine());
+ fwd_pd = mkldnn::concat::primitive_desc(
+ plain_dst_md, concat_dim, data_md, CpuEngine::Get()->get_engine());
}
fwd_ = std::make_shared<mkldnn::concat>(fwd_pd);
}
-void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
- const std::vector<NDArray> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &out_data) {
+void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& out_data) {
TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]);
const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
- const int num_in_data = param.num_args;
- const int concat_dim = param.dim;
+ const int num_in_data = param.num_args;
+ const int concat_dim = param.dim;
std::vector<mkldnn::memory::desc> data_md;
- std::vector<const mkldnn::memory *> data_mem;
+ std::vector<const mkldnn::memory*> data_mem;
data_md.reserve(num_in_data);
data_mem.reserve(num_in_data);
for (int i = 0; i < num_in_data; i++) {
- const mkldnn::memory *tmp_mem = in_data[i].GetMKLDNNData();
- mkldnn::memory::desc tmp_md = tmp_mem->get_desc();
+ const mkldnn::memory* tmp_mem = in_data[i].GetMKLDNNData();
+ mkldnn::memory::desc tmp_md = tmp_mem->get_desc();
data_md.push_back(tmp_md);
data_mem.push_back(tmp_mem);
}
- MKLDNNConcatFwd &fwd = GetConcatForward(concat_dim, in_data, data_md);
- mxnet::mkldnn_output_t out_mem = CreateMKLDNNMem(out_data[concat_enum::kOut],
- fwd.fwd_pd.dst_desc(),
- req[concat_enum::kOut]);
+ MKLDNNConcatFwd& fwd = GetConcatForward(concat_dim, in_data, data_md);
+ mxnet::mkldnn_output_t out_mem =
+ CreateMKLDNNMem(out_data[concat_enum::kOut], fwd.fwd_pd.dst_desc(), req[concat_enum::kOut]);
std::unordered_map<int, mkldnn::memory> net_args;
net_args.insert({MKLDNN_ARG_DST, *out_mem.second});
for (int i = 0; i < num_in_data; i++) {
@@ -88,35 +89,34 @@ void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
MKLDNNStream::Get()->Submit();
}
-void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& outputs) {
TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]);
const ConcatParam& param = nnvm::get<ConcatParam>(attrs.parsed);
- const int num_in_data = param.num_args;
- const int axis = param.dim;
- const auto gradz_mem = inputs[0].GetMKLDNNData();
+ const int num_in_data = param.num_args;
+ const int axis = param.dim;
+ const auto gradz_mem = inputs[0].GetMKLDNNData();
/* init the offset */
mkldnn::memory::dims offsets(outputs[0].shape().ndim());
- for (auto &v : offsets) {
+ for (auto& v : offsets) {
v = 0;
}
for (int i = 0; i < num_in_data; i++) {
mkldnn::memory::dims diff_src_tz(outputs[i].shape().begin(), outputs[i].shape().end());
auto diff_src_md = outputs[i].GetMKLDNNData()->get_desc();
- auto gradi_mem = CreateMKLDNNMem(outputs[i], diff_src_md, req[i]);
+ auto gradi_mem = CreateMKLDNNMem(outputs[i], diff_src_md, req[i]);
auto from_md = gradz_mem->get_desc().submemory_desc(diff_src_tz, offsets);
- auto from_mem = new mkldnn::memory(from_md, gradz_mem->get_engine(),
- gradz_mem->get_data_handle());
+ auto from_mem =
+ new mkldnn::memory(from_md, gradz_mem->get_engine(), gradz_mem->get_data_handle());
offsets[axis] += diff_src_tz[axis];
- std::unordered_map<int, mkldnn::memory> net_args({
- {MKLDNN_ARG_FROM, *gradz_mem},
- {MKLDNN_ARG_TO, *gradi_mem.second}
- });
+ std::unordered_map<int, mkldnn::memory> net_args(
+ {{MKLDNN_ARG_FROM, *gradz_mem}, {MKLDNN_ARG_TO, *gradi_mem.second}});
MKLDNNStream::Get()->RegisterPrimArgs(mkldnn::reorder(*from_mem, *gradi_mem.second), net_args);
CommitOutput(outputs[i], gradi_mem);
}
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution-inl.h b/src/operator/nn/mkldnn/mkldnn_convolution-inl.h
index ac2d316..4292677 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_convolution-inl.h
@@ -20,18 +20,19 @@
/*!
* \file mkldnn_convolution-inl.h
* \brief
-*/
+ */
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONVOLUTION_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_CONVOLUTION_INL_H_
#if MXNET_USE_MKLDNN == 1
-#include <vector>
#include <utility>
+#include <vector>
+
#include "../convolution-inl.h"
-#include "./mkldnn_ops-inl.h"
#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
namespace mxnet {
namespace op {
@@ -47,26 +48,25 @@ struct MKLDNNConvParam : public dmlc::Parameter<MKLDNNConvParam> {
dmlc::optional<float> max_calib_range; // max float value calculated from calibration dataset
DMLC_DECLARE_PARAMETER(MKLDNNConvParam) {
- DMLC_DECLARE_FIELD(with_bn).set_default(false)
- .describe("Add post batchnorm.");
- DMLC_DECLARE_FIELD(with_act).set_default(false)
- .describe("Add post activation");
- DMLC_DECLARE_FIELD(with_sum).set_default(false)
- .describe("Add post sum");
- DMLC_DECLARE_FIELD(with_postsum_act).set_default(false)
- .describe("Add post activation after sum");
- DMLC_DECLARE_FIELD(quantized).set_default(false)
- .describe("enable quantization");
+ DMLC_DECLARE_FIELD(with_bn).set_default(false).describe("Add post batchnorm.");
+ DMLC_DECLARE_FIELD(with_act).set_default(false).describe("Add post activation");
+ DMLC_DECLARE_FIELD(with_sum).set_default(false).describe("Add post sum");
+ DMLC_DECLARE_FIELD(with_postsum_act)
+ .set_default(false)
+ .describe("Add post activation after sum");
+ DMLC_DECLARE_FIELD(quantized).set_default(false).describe("enable quantization");
DMLC_DECLARE_FIELD(min_calib_range)
- .set_default(dmlc::optional<float>())
- .describe("The minimum scalar value in the form of float32 obtained "
- "through calibration. If present, it will be used to by "
- "quantized convolution op to calculate primitive scale");
+ .set_default(dmlc::optional<float>())
+ .describe(
+ "The minimum scalar value in the form of float32 obtained "
+ "through calibration. If present, it will be used to by "
+ "quantized convolution op to calculate primitive scale");
DMLC_DECLARE_FIELD(max_calib_range)
- .set_default(dmlc::optional<float>())
- .describe("The maximum scalar value in the form of float32 obtained "
- "through calibration. If present, it will be used to by "
- "quantized convolution op to calculate primitive scale");
+ .set_default(dmlc::optional<float>())
+ .describe(
+ "The maximum scalar value in the form of float32 obtained "
+ "through calibration. If present, it will be used to by "
+ "quantized convolution op to calculate primitive scale");
}
};
@@ -80,17 +80,29 @@ struct MKLDNNConvFullParam {
};
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
- const ConvolutionParam ¶m, const bool is_train, const NDArray &data, const NDArray &weight,
- const NDArray *bias, const NDArray &output);
+ const ConvolutionParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output);
class MKLDNNConvForward {
public:
- MKLDNNConvForward(const MKLDNNConvFullParam ¶m, const bool is_train, const NDArray &data,
- const NDArray &weight, const NDArray *bias, const NDArray &output);
-
- const mkldnn::convolution_forward &GetFwd() const { return *fwd_; }
+ MKLDNNConvForward(const MKLDNNConvFullParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output);
+
+ const mkldnn::convolution_forward& GetFwd() const {
+ return *fwd_;
+ }
- const mkldnn::convolution_forward::primitive_desc &GetPd() const { return *pd_; }
+ const mkldnn::convolution_forward::primitive_desc& GetPd() const {
+ return *pd_;
+ }
private:
std::shared_ptr<mkldnn::convolution_forward> fwd_;
@@ -99,37 +111,47 @@ class MKLDNNConvForward {
typedef ParamOpSign<ConvolutionParam> MKLDNNConvSignature;
-MKLDNNConvForward &GetConvFwd(const MKLDNNConvFullParam ¶m, const bool is_train,
- const NDArray &data, const NDArray &weight, const NDArray *bias,
- const NDArray &output);
-
-void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam ¶m,
- const OpContext &ctx,
- MKLDNNConvForward *fwd,
- const std::vector<NDArray> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &out_data);
-
-void MKLDNNConvolutionForward(const nnvm::NodeAttrs &attrs,
- const OpContext &ctx,
- const std::vector<NDArray> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &out_data);
+MKLDNNConvForward& GetConvFwd(const MKLDNNConvFullParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output);
+
+void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam& param,
+ const OpContext& ctx,
+ MKLDNNConvForward* fwd,
+ const std::vector<NDArray>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& out_data);
+
+void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& out_data);
class MKLDNNConvBackward {
public:
- MKLDNNConvBackward(const MKLDNNConvFullParam ¶m, const NDArray &data, const NDArray &weight,
- const NDArray *bias, const NDArray &output);
-
- const mkldnn::convolution_backward_data &GetBwdData() const { return *bwd_data_; }
+ MKLDNNConvBackward(const MKLDNNConvFullParam& param,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output);
+
+ const mkldnn::convolution_backward_data& GetBwdData() const {
+ return *bwd_data_;
+ }
- const mkldnn::convolution_backward_weights &GetBwdWeights() const { return *bwd_weight_; }
+ const mkldnn::convolution_backward_weights& GetBwdWeights() const {
+ return *bwd_weight_;
+ }
- const mkldnn::convolution_backward_data::primitive_desc &GetDataPd() const {
+ const mkldnn::convolution_backward_data::primitive_desc& GetDataPd() const {
return *bwd_data_pd_;
}
- const mkldnn::convolution_backward_weights::primitive_desc &GetWeightsPd() const {
+ const mkldnn::convolution_backward_weights::primitive_desc& GetWeightsPd() const {
return *bwd_weight_pd_;
}
diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc
index b042bd2..966ba21 100644
--- a/src/operator/nn/mkldnn/mkldnn_convolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc
@@ -26,41 +26,38 @@
#if MXNET_USE_MKLDNN == 1
#include "../convolution-inl.h"
-#include "./mkldnn_ops-inl.h"
#include "./mkldnn_base-inl.h"
#include "./mkldnn_convolution-inl.h"
+#include "./mkldnn_ops-inl.h"
namespace mxnet {
namespace op {
DMLC_REGISTER_PARAMETER(MKLDNNConvParam);
-bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray &input) {
- if ((params.kernel.ndim() != 1) &&
- (params.kernel.ndim() != 2) &&
- (params.kernel.ndim() != 3))
+bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray& input) {
+ if ((params.kernel.ndim() != 1) && (params.kernel.ndim() != 2) && (params.kernel.ndim() != 3))
return false;
return SupportMKLDNNQuantize(input.dtype()) &&
- ((input.shape().ndim() == 3) ||
- (input.shape().ndim() == 4) ||
+ ((input.shape().ndim() == 3) || (input.shape().ndim() == 4) ||
(input.shape().ndim() == 5));
}
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
- const MKLDNNConvFullParam ¶m,
- const bool is_train,
- const NDArray &data,
- const NDArray &weights,
- const NDArray *bias,
- const NDArray &output) {
+ const MKLDNNConvFullParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weights,
+ const NDArray* bias,
+ const NDArray& output) {
auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring;
- auto data_md = GetMemDesc(data);
+ auto data_md = GetMemDesc(data);
auto weight_md = GetWeightDesc(weights, param.conv_param.num_group, param.mkldnn_param.quantized);
- auto out_md = GetMemDesc(output);
+ auto out_md = GetMemDesc(output);
auto bias_md =
bias ? (param.mkldnn_param.quantized ? GetMemDesc(*bias, mshadow::kInt32) : GetMemDesc(*bias))
: mkldnn::memory::desc{
- {}, mkldnn::memory::data_type::undef, mkldnn::memory::format_tag::any};
+ {}, mkldnn::memory::data_type::undef, mkldnn::memory::format_tag::any};
auto bias_md_ptr = bias ? &bias_md : nullptr;
mkldnn::memory::dims strides(param.conv_param.kernel.ndim());
@@ -90,20 +87,20 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
padding[1] = param.conv_param.pad[1];
padding[2] = param.conv_param.pad[2];
} else {
- LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size "
- << param.conv_param.kernel.ndim() << ", supporting only 1 or 2 or 3.";
+ LOG(FATAL) << "Unexpected MKL-DNN Conv kernel size " << param.conv_param.kernel.ndim()
+ << ", supporting only 1 or 2 or 3.";
}
mkldnn::primitive_attr attr;
mkldnn::post_ops ops;
if (param.mkldnn_param.with_act) {
- const auto &act_param = param.act_param;
+ const auto& act_param = param.act_param;
ops.append_eltwise(act_param.scale, act_param.alg, act_param.alpha, act_param.beta);
}
if (param.mkldnn_param.with_sum) {
ops.append_sum(param.sum_scale);
}
if (param.mkldnn_param.with_postsum_act) {
- const auto &act_param = param.postsum_act_param;
+ const auto& act_param = param.postsum_act_param;
ops.append_eltwise(act_param.scale, act_param.alg, act_param.alpha, act_param.beta);
}
attr.set_post_ops(ops);
@@ -112,42 +109,56 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
int mask = (param.requantize_scales.size() > 1) ? 2 : 0;
attr.set_output_scales(mask, param.requantize_scales);
}
- auto GetConvFwdPd = [¶m, &data, &weights, &output,
- &attr](const mkldnn::convolution_forward::desc &desc) {
- auto engine = CpuEngine::Get()->get_engine();
- try {
- // MKL-DNN introduced padded formats since 0.15 which require more memory
- // compared to the actual size of the tensor. Currently, MKL-DNN operators
- // still reuse memory from memory planning, so here we need to select a
- // suboptimal kernel for computation that has the expected memory size requirements
- auto conv_pd =
- std::make_shared<mkldnn::convolution_forward::primitive_desc>(desc, attr, engine);
- while (conv_pd->dst_desc().get_size() != GetArraySize(output) ||
- conv_pd->src_desc().get_size() != GetArraySize(data) ||
- (!param.mkldnn_param.quantized &&
- conv_pd->weights_desc().get_size() != GetArraySize(weights))) {
- // next_impl() will visit desc and engine, please make sure they are still alive here.
- CHECK(conv_pd->next_impl()) << "No convolution implementation for this request.";
- }
- return conv_pd;
- } catch (mkldnn::error &e) {
- if (e.status == mkldnn_unimplemented && param.mkldnn_param.quantized) {
- LOG(ERROR) << "AVX512-BW support or Intel(R) MKL dependency is "
- "required for int8 convolution";
- } else {
- LOG(ERROR) << e.message;
- }
- throw;
- }
- };
+ auto GetConvFwdPd =
+ [¶m, &data, &weights, &output, &attr](const mkldnn::convolution_forward::desc& desc) {
+ auto engine = CpuEngine::Get()->get_engine();
+ try {
+ // MKL-DNN introduced padded formats since 0.15 which require more memory
+ // compared to the actual size of the tensor. Currently, MKL-DNN operators
+ // still reuse memory from memory planning, so here we need to select a
+ // suboptimal kernel for computation that has the expected memory size
+ // requirements
+ auto conv_pd =
+ std::make_shared<mkldnn::convolution_forward::primitive_desc>(desc, attr, engine);
+ while (conv_pd->dst_desc().get_size() != GetArraySize(output) ||
+ conv_pd->src_desc().get_size() != GetArraySize(data) ||
+ (!param.mkldnn_param.quantized &&
+ conv_pd->weights_desc().get_size() != GetArraySize(weights))) {
+ // next_impl() will visit desc and engine, please make sure they are
+ // still alive here.
+ CHECK(conv_pd->next_impl()) << "No convolution implementation for this request.";
+ }
+ return conv_pd;
+ } catch (mkldnn::error& e) {
+ if (e.status == mkldnn_unimplemented && param.mkldnn_param.quantized) {
+ LOG(ERROR) << "AVX512-BW support or Intel(R) MKL dependency is "
+ "required for int8 convolution";
+ } else {
+ LOG(ERROR) << e.message;
+ }
+ throw;
+ }
+ };
if (param.conv_param.dilate.ndim() == 0 && bias_md_ptr == nullptr) {
- mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md,
- weight_md, out_md, strides, padding, padding);
+ mkldnn::convolution_forward::desc desc(prop,
+ mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ padding,
+ padding);
return GetConvFwdPd(desc);
} else if (param.conv_param.dilate.ndim() == 0) {
- mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md,
- weight_md, *bias_md_ptr, out_md, strides, padding,
+ mkldnn::convolution_forward::desc desc(prop,
+ mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ *bias_md_ptr,
+ out_md,
+ strides,
+ padding,
padding);
return GetConvFwdPd(desc);
} else {
@@ -166,25 +177,42 @@ std::shared_ptr<mkldnn::convolution_forward::primitive_desc> GetConvFwdImpl(
<< ", supporting only 1 or 2 or 3.";
}
if (bias_md_ptr == nullptr) {
- mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md,
- weight_md, out_md, strides, dilates, padding, padding);
+ mkldnn::convolution_forward::desc desc(prop,
+ mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
return GetConvFwdPd(desc);
} else {
- mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, data_md,
- weight_md, *bias_md_ptr, out_md, strides, dilates,
- padding, padding);
+ mkldnn::convolution_forward::desc desc(prop,
+ mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ *bias_md_ptr,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
return GetConvFwdPd(desc);
}
}
}
static std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetConvBwdData(
- const ConvolutionParam ¶m, const NDArray &data, const NDArray &weight,
- const NDArray &output, const mkldnn::convolution_forward::primitive_desc &fwd_pd) {
- auto data_md = GetMemDesc(data);
+ const ConvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray& output,
+ const mkldnn::convolution_forward::primitive_desc& fwd_pd) {
+ auto data_md = GetMemDesc(data);
auto weight_md = GetWeightDesc(weight, param.num_group);
- auto out_md = GetMemDesc(output);
- auto engine = CpuEngine::Get()->get_engine();
+ auto out_md = GetMemDesc(output);
+ auto engine = CpuEngine::Get()->get_engine();
mkldnn::memory::dims strides(param.kernel.ndim());
mkldnn::memory::dims padding(param.kernel.ndim());
if (param.kernel.ndim() == 1) {
@@ -216,32 +244,39 @@ static std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetCon
<< ", supporting only 1 or 2 or 3.";
}
- auto GetConvBwdDataPd = [&data, &weight, &output,
- &fwd_pd](const mkldnn::convolution_backward_data::desc &desc) {
+ auto GetConvBwdDataPd = [&data, &weight, &output, &fwd_pd](
+ const mkldnn::convolution_backward_data::desc& desc) {
auto engine = CpuEngine::Get()->get_engine();
try {
// MKL-DNN introduced padded formats since 0.15 which require more memory
// compared to the actual size of the tensor. Currently, MKL-DNN operators
// still reuse memory from memory planning, so here we need to select a
- // suboptimal kernel for computation that has the expected memory size requirements
+ // suboptimal kernel for computation that has the expected memory size
+ // requirements
auto conv_pd =
std::make_shared<mkldnn::convolution_backward_data::primitive_desc>(desc, engine, fwd_pd);
while (conv_pd->diff_dst_desc().get_size() != GetArraySize(output) ||
conv_pd->diff_src_desc().get_size() != GetArraySize(data) ||
conv_pd->weights_desc().get_size() != GetArraySize(weight)) {
- // next_impl() will visit desc and engine, please make sure they are still alive here.
+ // next_impl() will visit desc and engine, please make sure they are
+ // still alive here.
CHECK(conv_pd->next_impl()) << "No convolution backward implementation for this request.";
}
return conv_pd;
- } catch (mkldnn::error &e) {
+ } catch (mkldnn::error& e) {
LOG(ERROR) << e.message;
throw;
}
};
if (param.dilate.ndim() == 0) {
- mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, data_md,
- weight_md, out_md, strides, padding, padding);
+ mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ padding,
+ padding);
return GetConvBwdDataPd(desc);
} else {
mkldnn::memory::dims dilates(param.kernel.ndim());
@@ -255,23 +290,32 @@ static std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc> GetCon
dilates[1] = param.dilate[1] - 1;
dilates[2] = param.dilate[2] - 1;
} else {
- LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size "
- << param.dilate.ndim() << ", supporting only 1 or 2 or 3.";
+ LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size " << param.dilate.ndim()
+ << ", supporting only 1 or 2 or 3.";
}
- mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, data_md,
- weight_md, out_md, strides, dilates, padding,
+ mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
padding);
return GetConvBwdDataPd(desc);
}
}
static std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> GetConvBwdWeights(
- const ConvolutionParam ¶m, const NDArray &data, const NDArray &weight, const NDArray *bias,
- const NDArray &output, const mkldnn::convolution_forward::primitive_desc &fwd_pd) {
- auto data_md = GetMemDesc(data);
+ const ConvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output,
+ const mkldnn::convolution_forward::primitive_desc& fwd_pd) {
+ auto data_md = GetMemDesc(data);
auto weight_md = GetWeightDesc(weight, param.num_group);
- auto out_md = GetMemDesc(output);
- auto engine = CpuEngine::Get()->get_engine();
+ auto out_md = GetMemDesc(output);
+ auto engine = CpuEngine::Get()->get_engine();
mkldnn::memory::dims strides(param.kernel.ndim());
mkldnn::memory::dims padding(param.kernel.ndim());
if (param.kernel.ndim() == 1) {
@@ -303,37 +347,49 @@ static std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> Get
<< ", supporting only 1 or 2 or 3.";
}
- auto GetConvBwdWeightsPd = [&data, &weight, &output,
- &fwd_pd](const mkldnn::convolution_backward_weights::desc &desc) {
+ auto GetConvBwdWeightsPd = [&data, &weight, &output, &fwd_pd](
+ const mkldnn::convolution_backward_weights::desc& desc) {
auto engine = CpuEngine::Get()->get_engine();
try {
- // MKL-DNN introduced padded formats since 0.15 which require more memory
- // compared to the actual size of the tensor. Currently, MKL-DNN operators
- // still reuse memory from memory planning, so here we need to select a
- // suboptimal kernel for computation that has the expected memory size requirements
+ // MKL-DNN introduced padded formats since 0.15 which require more
+ // memory compared to the actual size of the tensor. Currently,
+ // MKL-DNN operators still reuse memory from memory planning, so here
+ // we need to select a suboptimal kernel for computation that has the
+ // expected memory size requirements
auto conv_pd = std::make_shared<mkldnn::convolution_backward_weights::primitive_desc>(
desc, engine, fwd_pd);
while (conv_pd->diff_dst_desc().get_size() != GetArraySize(output) ||
conv_pd->src_desc().get_size() != GetArraySize(data) ||
conv_pd->diff_weights_desc().get_size() != GetArraySize(weight)) {
- // next_impl() will visit desc and engine, please make sure they are still alive here.
+ // next_impl() will visit desc and engine, please make sure they are
+ // still alive here.
CHECK(conv_pd->next_impl()) << "No convolution backward implementation for this request.";
}
return conv_pd;
- } catch (mkldnn::error &e) {
+ } catch (mkldnn::error& e) {
LOG(ERROR) << e.message;
throw;
}
};
if (param.dilate.ndim() == 0 && bias == nullptr) {
- mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, data_md,
- weight_md, out_md, strides, padding, padding);
+ mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ padding,
+ padding);
return GetConvBwdWeightsPd(desc);
} else if (param.dilate.ndim() == 0) {
auto bias_md = GetMemDesc(*bias);
- mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, data_md,
- weight_md, bias_md, out_md, strides, padding,
+ mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
+ data_md,
+ weight_md,
+ bias_md,
+ out_md,
+ strides,
+ padding,
padding);
return GetConvBwdWeightsPd(desc);
} else {
@@ -348,85 +404,106 @@ static std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc> Get
dilates[1] = param.dilate[1] - 1;
dilates[2] = param.dilate[2] - 1;
} else {
- LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size "
- << param.dilate.ndim() << ", supporting only 1 or 2 or 3.";
+ LOG(FATAL) << "Unexpected MKL-DNN Conv dilate size " << param.dilate.ndim()
+ << ", supporting only 1 or 2 or 3.";
}
if (bias == nullptr) {
mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
- data_md, weight_md, out_md, strides, dilates,
- padding, padding);
+ data_md,
+ weight_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
return GetConvBwdWeightsPd(desc);
} else {
auto bias_md = GetMemDesc(*bias);
mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct,
- data_md, weight_md, bias_md, out_md, strides,
- dilates, padding, padding);
+ data_md,
+ weight_md,
+ bias_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
return GetConvBwdWeightsPd(desc);
}
}
}
-MKLDNNConvForward::MKLDNNConvForward(const MKLDNNConvFullParam ¶m, const bool is_train,
- const NDArray &data, const NDArray &weight,
- const NDArray *bias, const NDArray &output)
+MKLDNNConvForward::MKLDNNConvForward(const MKLDNNConvFullParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output)
: pd_(GetConvFwdImpl(param, is_train, data, weight, bias, output)) {
fwd_ = std::make_shared<mkldnn::convolution_forward>(GetPd());
}
-MKLDNNConvForward &GetConvFwd(const MKLDNNConvFullParam ¶m, const bool is_train,
- const NDArray &data, const NDArray &weight, const NDArray *bias,
- const NDArray &output) {
+MKLDNNConvForward& GetConvFwd(const MKLDNNConvFullParam& param,
+ const bool is_train,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output) {
using conv_fwd_map = std::unordered_map<MKLDNNConvSignature, MKLDNNConvForward, OpHash>;
#if DMLC_CXX11_THREAD_LOCAL
static thread_local conv_fwd_map fwds;
#else
static MX_THREAD_LOCAL conv_fwd_map fwds;
#endif
- // TODO(zhennan): Hash conv_param for now, need to hash full param if we want to enable cache for
- // fused conv
+ // TODO(zhennan): Hash conv_param for now, need to hash full param if we want
+ // to enable cache for fused conv
MKLDNNConvSignature key(param.conv_param);
key.AddSign(is_train);
- // Here we can sign the conv op with NDArray because conv primitive will decide the right layout
- // for the, so we only need to get the shape and the data type of the arrays.
+ // Here we can sign the conv op with NDArray because conv primitive will
+ // decide the right layout for the, so we only need to get the shape and the
+ // data type of the arrays.
key.AddSign(data);
key.AddSign(weight);
key.AddSign(output);
- if (bias) key.AddSign(*bias);
+ if (bias)
+ key.AddSign(*bias);
auto it = fwds.find(key);
if (it == fwds.end()) {
auto fwd = MKLDNNConvForward(param, is_train, data, weight, bias, output);
- it = AddToCache(&fwds, key, fwd);
+ it = AddToCache(&fwds, key, fwd);
}
return it->second;
}
-void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam ¶m, const OpContext &ctx,
- MKLDNNConvForward *fwd,
- const std::vector<NDArray> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &out_data) {
+void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam& param,
+ const OpContext& ctx,
+ MKLDNNConvForward* fwd,
+ const std::vector<NDArray>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& out_data) {
TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
- auto &data = in_data[conv::kData];
- auto &weight = in_data[conv::kWeight];
+ auto& data = in_data[conv::kData];
+ auto& weight = in_data[conv::kWeight];
bool no_bias = param.conv_param.no_bias && !param.mkldnn_param.with_bn;
auto data_mem = data.GetMKLDNNDataReorder(fwd->GetPd().src_desc());
- const mkldnn::memory *weight_mem;
+ const mkldnn::memory* weight_mem;
if (ctx.is_train) {
- // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it to the default format
- // for now.
+ // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it
+ // to the default format for now.
if (weight.IsMKLDNNData())
- // This asks the engine to change the layout of the weight array after it's used.
+ // This asks the engine to change the layout of the weight array after
+ // it's used.
weight.Reorder2DefaultAsync();
weight_mem = GetWeights(weight, fwd->GetPd().weights_desc(), param.conv_param.num_group);
} else {
- // For inference, we want to reorder the weight array so we don't need to reorder data every
- // time.
+ // For inference, we want to reorder the weight array so we don't need to
+ // reorder data every time.
if (weight.IsDefaultData()) {
- // We also need to modify the layout on the original weight array. The data conversion happens
- // after the weight array is used.
+ // We also need to modify the layout on the original weight array. The
+ // data conversion happens after the weight array is used.
weight.MKLDNNDataReorderAsync(fwd->GetPd().weights_desc());
weight_mem = GetWeights(weight, fwd->GetPd().weights_desc(), param.conv_param.num_group);
} else {
@@ -436,14 +513,14 @@ void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam &param, const
mkldnn_output_t out_mem;
if (param.mkldnn_param.with_sum) {
out_mem = mkldnn_output_t(OutDataOp::Noop,
- const_cast<mkldnn::memory *>(out_data[conv::kOut].GetMKLDNNData()));
+ const_cast<mkldnn::memory*>(out_data[conv::kOut].GetMKLDNNData()));
} else {
out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd->GetPd().dst_desc(), req[conv::kOut]);
}
mkldnn_args_map_t net_args;
if (!no_bias) {
- const mkldnn::memory *bias_mem = in_data[conv::kBias].GetMKLDNNData();
+ const mkldnn::memory* bias_mem = in_data[conv::kBias].GetMKLDNNData();
net_args.insert({MKLDNN_ARG_BIAS, *bias_mem});
}
@@ -455,80 +532,91 @@ void MKLDNNConvolutionForwardFullFeature(const MKLDNNConvFullParam &param, const
MKLDNNStream::Get()->Submit();
}
-void MKLDNNConvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &in_data,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &out_data) {
+void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& in_data,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& out_data) {
MKLDNNConvFullParam param;
param.conv_param = nnvm::get<ConvolutionParam>(attrs.parsed);
param.mkldnn_param.Init(std::unordered_map<std::string, std::string>());
- auto &fwd =
- GetConvFwd(param, ctx.is_train, in_data[conv::kData], in_data[conv::kWeight],
- param.conv_param.no_bias ? nullptr : &in_data[conv::kBias], out_data[conv::kOut]);
+ auto& fwd = GetConvFwd(param,
+ ctx.is_train,
+ in_data[conv::kData],
+ in_data[conv::kWeight],
+ param.conv_param.no_bias ? nullptr : &in_data[conv::kBias],
+ out_data[conv::kOut]);
MKLDNNConvolutionForwardFullFeature(param, ctx, &fwd, in_data, req, out_data);
}
-MKLDNNConvBackward::MKLDNNConvBackward(const MKLDNNConvFullParam &param, const NDArray &data,
- const NDArray &weight, const NDArray *bias,
- const NDArray &output) {
+MKLDNNConvBackward::MKLDNNConvBackward(const MKLDNNConvFullParam& param,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output) {
const auto fwd_pd = GetConvFwdImpl(param, true, data, weight, bias, output);
- bwd_data_pd_ = GetConvBwdData(param.conv_param, data, weight, output, *fwd_pd);
- bwd_weight_pd_ = GetConvBwdWeights(param.conv_param, data, weight, bias, output, *fwd_pd);
- bwd_data_ = std::make_shared<mkldnn::convolution_backward_data>(GetDataPd());
- bwd_weight_ = std::make_shared<mkldnn::convolution_backward_weights>(GetWeightsPd());
+ bwd_data_pd_ = GetConvBwdData(param.conv_param, data, weight, output, *fwd_pd);
+ bwd_weight_pd_ = GetConvBwdWeights(param.conv_param, data, weight, bias, output, *fwd_pd);
+ bwd_data_ = std::make_shared<mkldnn::convolution_backward_data>(GetDataPd());
+ bwd_weight_ = std::make_shared<mkldnn::convolution_backward_weights>(GetWeightsPd());
}
-static inline MKLDNNConvBackward &GetConvBwd(const MKLDNNConvFullParam &param, const NDArray &data,
- const NDArray &weight, const NDArray *bias,
- const NDArray &output) {
+static inline MKLDNNConvBackward& GetConvBwd(const MKLDNNConvFullParam& param,
+ const NDArray& data,
+ const NDArray& weight,
+ const NDArray* bias,
+ const NDArray& output) {
using mkldnn_conv_bwd_map = std::unordered_map<MKLDNNConvSignature, MKLDNNConvBackward, OpHash>;
#if DMLC_CXX11_THREAD_LOCAL
static thread_local mkldnn_conv_bwd_map bwds;
#else
static MX_THREAD_LOCAL mkldnn_conv_bwd_map bwds;
#endif
- // TODO(zhennan): Hash conv_param for now, need to hash full param if we want to enable cache for
- // fused conv
+ // TODO(zhennan): Hash conv_param for now, need to hash full param if we want
+ // to enable cache for fused conv
MKLDNNConvSignature key(param.conv_param);
- // Here we can sign the conv op with NDArray because conv primitive will decide the right layout
- // for the, so we only need to get the shape and the data type of the arrays.
+ // Here we can sign the conv op with NDArray because the conv primitive will
+ // decide the right layout for them, so we only need to get the shape and the
+ // data type of the arrays.
key.AddSign(data);
key.AddSign(weight);
key.AddSign(output);
- if (bias) key.AddSign(*bias);
+ if (bias)
+ key.AddSign(*bias);
auto it = bwds.find(key);
if (it == bwds.end()) {
auto bwd = MKLDNNConvBackward(param, data, weight, bias, output);
- it = AddToCache(&bwds, key, bwd);
+ it = AddToCache(&bwds, key, bwd);
}
return it->second;
}
-void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
const std::vector<NDArray>& inputs,
const std::vector<OpReqType>& req,
const std::vector<NDArray>& outputs) {
TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]);
- const std::vector<NDArray> &in_grad = outputs;
+ const std::vector<NDArray>& in_grad = outputs;
MKLDNNConvFullParam full_param;
full_param.conv_param = nnvm::get<ConvolutionParam>(attrs.parsed);
full_param.mkldnn_param.Init(std::unordered_map<std::string, std::string>());
- auto &data = inputs[conv::kData + 1];
- auto &weight = inputs[conv::kWeight + 1];
- const auto *bias = full_param.conv_param.no_bias ? nullptr : &inputs[conv::kBias + 1];
- auto &out_grad = inputs[conv::kOut];
+ auto& data = inputs[conv::kData + 1];
+ auto& weight = inputs[conv::kWeight + 1];
+ const auto* bias = full_param.conv_param.no_bias ? nullptr : &inputs[conv::kBias + 1];
+ auto& out_grad = inputs[conv::kOut];
- const ConvolutionParam &param = full_param.conv_param;
+ const ConvolutionParam& param = full_param.conv_param;
CHECK_NE(req[conv::kWeight], kWriteInplace) << "cannot write weight inplace";
- MKLDNNConvBackward &convBwd = GetConvBwd(full_param, data, weight, bias, out_grad);
- auto out_grad_mem = out_grad.GetMKLDNNDataReorder(convBwd.GetDataPd().diff_dst_desc());
+ MKLDNNConvBackward& convBwd = GetConvBwd(full_param, data, weight, bias, out_grad);
+ auto out_grad_mem = out_grad.GetMKLDNNDataReorder(convBwd.GetDataPd().diff_dst_desc());
if (req[conv::kData]) {
- auto weight_mem = GetWeights(weight, convBwd.GetDataPd().weights_desc(), param.num_group);
- auto in_grad_mem = CreateMKLDNNMem(in_grad[conv::kData], convBwd.GetDataPd().diff_src_desc(),
- req[conv::kData]);
+ auto weight_mem = GetWeights(weight, convBwd.GetDataPd().weights_desc(), param.num_group);
+ auto in_grad_mem = CreateMKLDNNMem(
+ in_grad[conv::kData], convBwd.GetDataPd().diff_src_desc(), req[conv::kData]);
MKLDNNStream::Get()->RegisterPrimArgs(convBwd.GetBwdData(),
{{MKLDNN_ARG_DIFF_DST, *out_grad_mem},
{MKLDNN_ARG_WEIGHTS, *weight_mem},
@@ -538,7 +626,7 @@ void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ct
if (req[conv::kWeight] || req[conv::kBias]) {
if (convBwd.GetDataPd().diff_dst_desc() != convBwd.GetWeightsPd().diff_dst_desc())
out_grad_mem = out_grad.GetMKLDNNDataReorder(convBwd.GetWeightsPd().diff_dst_desc());
- auto data_mem = data.GetMKLDNNDataReorder(convBwd.GetWeightsPd().src_desc());
+ auto data_mem = data.GetMKLDNNDataReorder(convBwd.GetWeightsPd().src_desc());
auto in_grad_weight = CreateMKLDNNWeightGrad(
in_grad[conv::kWeight], convBwd.GetWeightsPd().diff_weights_desc(), req[conv::kWeight]);
@@ -547,9 +635,8 @@ void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ct
{MKLDNN_ARG_DIFF_WEIGHTS, *in_grad_weight.second}};
mkldnn_output_t in_grad_bias;
if (!param.no_bias) {
- in_grad_bias = CreateMKLDNNMem(in_grad[conv::kBias],
- convBwd.GetWeightsPd().diff_bias_desc(),
- req[conv::kBias]);
+ in_grad_bias = CreateMKLDNNMem(
+ in_grad[conv::kBias], convBwd.GetWeightsPd().diff_bias_desc(), req[conv::kBias]);
net_args.insert({MKLDNN_ARG_DIFF_BIAS, *in_grad_bias.second});
}
MKLDNNStream::Get()->RegisterPrimArgs(convBwd.GetBwdWeights(), net_args);
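
GetConvFwd and GetConvBwd above share one pattern: a signature built from the parameters, shapes, and dtypes keys a thread-local map of already-constructed primitives, so each thread builds a primitive once and reuses it for identical inputs. A condensed sketch of that pattern follows; ToySignature, ToySignatureHash, ToyPrimitive and GetCachedPrimitive are illustrative stand-ins, not the actual MKLDNNConvSignature/OpHash types.

#include <cstddef>
#include <cstdint>
#include <functional>
#include <unordered_map>
#include <vector>

struct ToySignature {
  std::vector<int64_t> values;  // stand-in for the AddSign(shape/dtype/flag) calls
  void AddSign(int64_t v) { values.push_back(v); }
  bool operator==(const ToySignature& other) const { return values == other.values; }
};

struct ToySignatureHash {
  std::size_t operator()(const ToySignature& sig) const {
    std::size_t hash = 0;
    for (int64_t v : sig.values)
      hash = hash * 31 + std::hash<int64_t>()(v);  // simple combine, enough for a sketch
    return hash;
  }
};

struct ToyPrimitive {};  // would wrap a oneDNN primitive in the real code

ToyPrimitive& GetCachedPrimitive(const ToySignature& key) {
  static thread_local std::unordered_map<ToySignature, ToyPrimitive, ToySignatureHash> cache;
  auto it = cache.find(key);
  if (it == cache.end())
    it = cache.emplace(key, ToyPrimitive{}).first;  // construct once, reuse on later calls
  return it->second;
}
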
diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/mkldnn/mkldnn_copy.cc
index a67847f..601df3c 100644
--- a/src/operator/nn/mkldnn/mkldnn_copy.cc
+++ b/src/operator/nn/mkldnn/mkldnn_copy.cc
@@ -21,19 +21,22 @@
* \file mkldnn_copy.cc
* \brief
* \author
-*/
+ */
-#include "./mkldnn_ops-inl.h"
#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
#if MXNET_USE_MKLDNN == 1
namespace mxnet {
namespace op {
-void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
- const NDArray &in_data, const OpReqType &req,
- const NDArray &out_data) {
- if (req == kNullOp || req == kWriteInplace) return;
+void MKLDNNCopy(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const NDArray& in_data,
+ const OpReqType& req,
+ const NDArray& out_data) {
+ if (req == kNullOp || req == kWriteInplace)
+ return;
TmpMemMgr::Get()->Init(ctx.requested[0]);
auto in_mem = in_data.GetMKLDNNData();
if (req == kAddTo) {
@@ -41,16 +44,16 @@ void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
// We should try to force the input memory to have the same format
// as the output. If not, we'll have to reorder memory.
auto out_mem = out_data.GetMKLDNNData();
- in_mem = in_data.GetMKLDNNData(out_mem ->get_desc());
+ in_mem = in_data.GetMKLDNNData(out_mem->get_desc());
if (in_mem == nullptr)
in_mem = in_data.GetMKLDNNDataReorder(out_mem->get_desc());
MKLDNNSum(*out_mem, *in_mem, *out_mem);
} else {
- const_cast<NDArray &>(out_data).CopyFrom(*in_mem);
+ const_cast<NDArray&>(out_data).CopyFrom(*in_mem);
}
MKLDNNStream::Get()->Submit();
}
-} // namespace op
-} // namespace mxnet
+} // namespace op
+} // namespace mxnet
#endif
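
For reference, MKLDNNCopy above dispatches on the request type: kNullOp and kWriteInplace are no-ops, kAddTo accumulates into the existing output, and anything else overwrites it. A tiny sketch of that contract on plain float buffers (CopyLike and the OpReq enum are illustrative, not MXNet types):

#include <cstddef>

enum OpReq { kNullOp, kWriteTo, kWriteInplace, kAddTo };  // loosely mirrors mxnet::OpReqType

void CopyLike(OpReq req, const float* in, float* out, std::size_t n) {
  if (req == kNullOp || req == kWriteInplace)
    return;  // output is unused or already aliases the input
  for (std::size_t i = 0; i < n; ++i)
    out[i] = (req == kAddTo) ? out[i] + in[i]  // accumulate into the existing output
                             : in[i];          // plain overwrite
}
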
diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h
index b51ec2a..b048c13 100644
--- a/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_deconvolution-inl.h
@@ -30,16 +30,17 @@
* (diff_bias) bias_grad <---| |<--- weight
* |______|<--- bias
*
- * "out" in this (and .cc) file will always refer to the output of Deconv FWD and
- * "out_grad" to its gradient. The corresponding MKLDNN names are in parentheses.
+ * "out" in this (and .cc) file will always refer to the output of Deconv FWD
+ * and "out_grad" to its gradient. The corresponding MKLDNN names are in
+ * parentheses.
*/
#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_
#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_DECONVOLUTION_INL_H_
#if MXNET_USE_MKLDNN == 1
+#include <numeric>
#include <utility>
#include <vector>
-#include <numeric>
#include "../deconvolution-inl.h"
#include "./mkldnn_base-inl.h"
@@ -48,20 +49,19 @@
namespace mxnet {
namespace op {
-using deconv_fwd_t = mkldnn::deconvolution_forward;
+using deconv_fwd_t = mkldnn::deconvolution_forward;
using deconv_fwd_pd_t = mkldnn::deconvolution_forward::primitive_desc;
-using deconv_bwd_data_t = mkldnn::deconvolution_backward_data;
+using deconv_bwd_data_t = mkldnn::deconvolution_backward_data;
using deconv_bwd_data_pd_t = mkldnn::deconvolution_backward_data::primitive_desc;
-using deconv_bwd_weights_t = mkldnn::deconvolution_backward_weights;
+using deconv_bwd_weights_t = mkldnn::deconvolution_backward_weights;
using deconv_bwd_weights_pd_t = mkldnn::deconvolution_backward_weights::primitive_desc;
-
-
-// Swaps the logical order of dimensions that in plain format would correspond to input and output
-// channels (for example: oihw => iohw, iohw => oihw, goihw => giohw).
-inline mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc &desc,
+// Swaps the logical order of dimensions that in plain format would correspond
+// to input and output channels (for example: oihw => iohw, iohw => oihw, goihw
+// => giohw).
+inline mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc& desc,
const uint32_t num_group) {
std::vector<int> order(desc.data.ndims);
std::iota(std::begin(order), std::end(order), 0);
@@ -71,158 +71,172 @@ inline mkldnn::memory::desc IOLogicalSwapDesc(const mkldnn::memory::desc &desc,
}
// Applies IOLogicalSwapDesc to MKLDNN memory of arr
-inline void IOLogicalSwapMKLDNNMem(const NDArray &arr, const uint32_t num_group) {
+inline void IOLogicalSwapMKLDNNMem(const NDArray& arr, const uint32_t num_group) {
mkldnn::memory::desc desc;
if (arr.IsMKLDNNData()) {
desc = arr.GetMKLDNNData()->get_desc();
} else {
- // GetMKLDNNData won't take groups into account when creating mkldnn::memory, we need to use
- // descriptor from GetWeightDesc but with default format
- const auto &temp = GetWeightDesc(arr, num_group);
- desc = mkldnn::memory::desc(
- temp.dims(), temp.data_type(),
+ // GetMKLDNNData won't take groups into account when creating
+ // mkldnn::memory, we need to use descriptor from GetWeightDesc but with
+ // default format
+ const auto& temp = GetWeightDesc(arr, num_group);
+ desc = mkldnn::memory::desc(
+ temp.dims(),
+ temp.data_type(),
static_cast<mkldnn::memory::format_tag>(GetDefaultFormat(temp.data.ndims)));
}
- const_cast<NDArray &>(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_group));
+ const_cast<NDArray&>(arr).UpdateMKLDNNMemDesc(IOLogicalSwapDesc(desc, num_group));
}
// Version of GetWeightsDesc for deconvolution (with swap)
-inline mkldnn::memory::desc GetDeconvWeightsDesc(const NDArray &weights, const uint32_t num_group) {
+inline mkldnn::memory::desc GetDeconvWeightsDesc(const NDArray& weights, const uint32_t num_group) {
return IOLogicalSwapDesc(GetWeightDesc(weights, num_group), num_group);
}
-
-
class MKLDNNDeconvFwd {
public:
struct Tensors {
- Tensors(const NDArray &data, const NDArray &weights, const NDArray *const bias,
- const NDArray &out);
- Tensors(const bool no_bias, const std::vector<NDArray> &inputs,
- const std::vector<NDArray> &outputs);
-
- const NDArray &data;
- const NDArray &weights;
- const NDArray *const bias;
- const NDArray &out;
+ Tensors(const NDArray& data,
+ const NDArray& weights,
+ const NDArray* const bias,
+ const NDArray& out);
+ Tensors(const bool no_bias,
+ const std::vector<NDArray>& inputs,
+ const std::vector<NDArray>& outputs);
+
+ const NDArray& data;
+ const NDArray& weights;
+ const NDArray* const bias;
+ const NDArray& out;
};
- static MKLDNNDeconvFwd &GetCached(const DeconvolutionParam &param, const Tensors &tensors);
- static std::shared_ptr<deconv_fwd_pd_t> CreatePrimitiveDesc(const DeconvolutionParam &param,
- const Tensors &tensors);
+ static MKLDNNDeconvFwd& GetCached(const DeconvolutionParam& param, const Tensors& tensors);
+ static std::shared_ptr<deconv_fwd_pd_t> CreatePrimitiveDesc(const DeconvolutionParam& param,
+ const Tensors& tensors);
- MKLDNNDeconvFwd(const DeconvolutionParam &param, const Tensors &tensors);
- void ControlWeightsFormat(const uint32_t num_group, const bool is_train,
- const NDArray &weights) const;
- void Execute(const uint32_t num_group, const OpReqType req, const Tensors &tensors) const;
+ MKLDNNDeconvFwd(const DeconvolutionParam& param, const Tensors& tensors);
+ void ControlWeightsFormat(const uint32_t num_group,
+ const bool is_train,
+ const NDArray& weights) const;
+ void Execute(const uint32_t num_group, const OpReqType req, const Tensors& tensors) const;
private:
- const mkldnn::memory *DataMem(const NDArray &data) const;
- const mkldnn::memory *WeightsMem(const uint32_t num_group, const NDArray &weights) const;
- const mkldnn::memory *BiasMem(const NDArray &bias) const;
+ const mkldnn::memory* DataMem(const NDArray& data) const;
+ const mkldnn::memory* WeightsMem(const uint32_t num_group, const NDArray& weights) const;
+ const mkldnn::memory* BiasMem(const NDArray& bias) const;
- mkldnn_output_t OutMem(const OpReqType req, const NDArray &out) const;
+ mkldnn_output_t OutMem(const OpReqType req, const NDArray& out) const;
private:
std::shared_ptr<deconv_fwd_t> fwd;
std::shared_ptr<deconv_fwd_pd_t> fwd_pd;
};
-
-MKLDNNDeconvFwd::Tensors::Tensors(const bool no_bias, const std::vector<NDArray> &inputs,
- const std::vector<NDArray> &outputs)
+MKLDNNDeconvFwd::Tensors::Tensors(const bool no_bias,
+ const std::vector<NDArray>& inputs,
+ const std::vector<NDArray>& outputs)
: data(inputs[deconv::kData]),
weights(inputs[deconv::kWeight]),
bias(no_bias ? nullptr : &inputs[deconv::kBias]),
out(outputs[deconv::kOut]) {}
-MKLDNNDeconvFwd::Tensors::Tensors(const NDArray &data, const NDArray &weights,
- const NDArray *const bias, const NDArray &out)
+MKLDNNDeconvFwd::Tensors::Tensors(const NDArray& data,
+ const NDArray& weights,
+ const NDArray* const bias,
+ const NDArray& out)
: data(data), weights(weights), bias(bias), out(out) {}
-MKLDNNDeconvFwd::MKLDNNDeconvFwd(const DeconvolutionParam &param, const Tensors &tensors)
+MKLDNNDeconvFwd::MKLDNNDeconvFwd(const DeconvolutionParam& param, const Tensors& tensors)
: fwd_pd(CreatePrimitiveDesc(param, tensors)) {
fwd = std::make_shared<deconv_fwd_t>(*fwd_pd);
}
-inline const mkldnn::memory *MKLDNNDeconvFwd::DataMem(const NDArray &data) const {
+inline const mkldnn::memory* MKLDNNDeconvFwd::DataMem(const NDArray& data) const {
return data.GetMKLDNNDataReorder(fwd_pd->src_desc());
}
-inline const mkldnn::memory *MKLDNNDeconvFwd::WeightsMem(const uint32_t num_group,
- const NDArray &weights) const {
+inline const mkldnn::memory* MKLDNNDeconvFwd::WeightsMem(const uint32_t num_group,
+ const NDArray& weights) const {
return GetWeights(weights, fwd_pd->weights_desc(), num_group);
}
-inline const mkldnn::memory *MKLDNNDeconvFwd::BiasMem(const NDArray &bias) const {
+inline const mkldnn::memory* MKLDNNDeconvFwd::BiasMem(const NDArray& bias) const {
return bias.GetMKLDNNData();
}
-inline mkldnn_output_t MKLDNNDeconvFwd::OutMem(const OpReqType req, const NDArray &out) const {
+inline mkldnn_output_t MKLDNNDeconvFwd::OutMem(const OpReqType req, const NDArray& out) const {
return CreateMKLDNNMem(out, fwd_pd->dst_desc(), req);
}
-
-
class MKLDNNDeconvBwd {
public:
struct ReadTensors {
- ReadTensors(const bool no_bias, const std::vector<NDArray> &inputs);
- const NDArray &data;
- const NDArray &weights;
- const NDArray *const bias;
- const NDArray &out_grad;
+ ReadTensors(const bool no_bias, const std::vector<NDArray>& inputs);
+ const NDArray& data;
+ const NDArray& weights;
+ const NDArray* const bias;
+ const NDArray& out_grad;
};
struct WriteTensors {
- WriteTensors(const bool no_bias, const std::vector<NDArray> &outputs);
- const NDArray &data_grad;
- const NDArray &weights_grad;
- const NDArray *const bias_grad;
+ WriteTensors(const bool no_bias, const std::vector<NDArray>& outputs);
+ const NDArray& data_grad;
+ const NDArray& weights_grad;
+ const NDArray* const bias_grad;
};
- static MKLDNNDeconvBwd &GetCached(const DeconvolutionParam &param,
- const ReadTensors &read_tensors);
+ static MKLDNNDeconvBwd& GetCached(const DeconvolutionParam& param,
+ const ReadTensors& read_tensors);
static std::shared_ptr<deconv_bwd_data_pd_t> CreateDataPrimitiveDesc(
- const DeconvolutionParam &param, const ReadTensors &read_tensors,
- const deconv_fwd_pd_t &fwd_pd);
+ const DeconvolutionParam& param,
+ const ReadTensors& read_tensors,
+ const deconv_fwd_pd_t& fwd_pd);
static std::shared_ptr<deconv_bwd_weights_pd_t> CreateWeightsPrimitiveDesc(
- const DeconvolutionParam &param, const ReadTensors &read_tensors,
- const deconv_fwd_pd_t &fwd_pd);
+ const DeconvolutionParam& param,
+ const ReadTensors& read_tensors,
+ const deconv_fwd_pd_t& fwd_pd);
- MKLDNNDeconvBwd(const DeconvolutionParam &param, const ReadTensors &read_tensors);
+ MKLDNNDeconvBwd(const DeconvolutionParam& param, const ReadTensors& read_tensors);
- void Execute(const uint32_t num_group, const std::vector<OpReqType> &req,
- const ReadTensors &read_tensors, const WriteTensors &write_tensors) const;
+ void Execute(const uint32_t num_group,
+ const std::vector<OpReqType>& req,
+ const ReadTensors& read_tensors,
+ const WriteTensors& write_tensors) const;
private:
- void IOSwapWeightsTensors(const uint32_t num_group, const std::vector<OpReqType> &req,
- const NDArray &weights, const NDArray &weights_grad) const;
-
- // returns the output gradient memory used to calculate the data (input) gradient,
- // which might be reused when calculating the gradient of weights
- const mkldnn::memory *ScheduleBwdData(const uint32_t num_group, const OpReqType req,
- const ReadTensors &read_tensors,
- const WriteTensors &write_tensors) const;
-
- void ScheduleBwdWeights(const uint32_t num_group, const std::vector<OpReqType> &req,
- const ReadTensors &read_tensors, const WriteTensors &write_tensors,
- const mkldnn::memory *const out_grad_mem) const;
-
- const mkldnn::memory *DataMem(const NDArray &data) const;
- const mkldnn::memory *WeightsMem(const uint32_t num_group, const NDArray &weights) const;
+ void IOSwapWeightsTensors(const uint32_t num_group,
+ const std::vector<OpReqType>& req,
+ const NDArray& weights,
+ const NDArray& weights_grad) const;
+
+ // returns the output gradient memory used to calculate the data (input)
+ // gradient, which might be reused when calculating the gradient of weights
+ const mkldnn::memory* ScheduleBwdData(const uint32_t num_group,
+ const OpReqType req,
+ const ReadTensors& read_tensors,
+ const WriteTensors& write_tensors) const;
+
+ void ScheduleBwdWeights(const uint32_t num_group,
+ const std::vector<OpReqType>& req,
+ const ReadTensors& read_tensors,
+ const WriteTensors& write_tensors,
+ const mkldnn::memory* const out_grad_mem) const;
+
+ const mkldnn::memory* DataMem(const NDArray& data) const;
+ const mkldnn::memory* WeightsMem(const uint32_t num_group, const NDArray& weights) const;
// for calculating the gradient of data (input)
- const mkldnn::memory *OutGradMem(const NDArray &out_grad) const;
+ const mkldnn::memory* OutGradMem(const NDArray& out_grad) const;
// for calculating the gradient of weights
- const mkldnn::memory *OutGradMem(const NDArray &out_grad,
- const mkldnn::memory *const out_grad_mem) const;
+ const mkldnn::memory* OutGradMem(const NDArray& out_grad,
+ const mkldnn::memory* const out_grad_mem) const;
- mkldnn_output_t DataGradMem(const OpReqType req, const NDArray &data_grad) const;
- mkldnn_output_t WeightsGradMem(const uint32_t num_group, const OpReqType req,
- const NDArray &weights_grad) const;
- mkldnn_output_t BiasGradMem(const OpReqType req, const NDArray *const bias) const;
+ mkldnn_output_t DataGradMem(const OpReqType req, const NDArray& data_grad) const;
+ mkldnn_output_t WeightsGradMem(const uint32_t num_group,
+ const OpReqType req,
+ const NDArray& weights_grad) const;
+ mkldnn_output_t BiasGradMem(const OpReqType req, const NDArray* const bias) const;
std::shared_ptr<deconv_bwd_data_pd_t> bwd_data_pd;
std::shared_ptr<deconv_bwd_weights_pd_t> bwd_weights_pd;
@@ -230,32 +244,32 @@ class MKLDNNDeconvBwd {
std::shared_ptr<deconv_bwd_weights_t> bwd_weights;
};
-
-MKLDNNDeconvBwd::ReadTensors::ReadTensors(const bool no_bias, const std::vector<NDArray> &inputs)
+MKLDNNDeconvBwd::ReadTensors::ReadTensors(const bool no_bias, const std::vector<NDArray>& inputs)
: data(inputs[deconv::kData + 1]),
weights(inputs[deconv::kWeight + 1]),
bias(no_bias ? nullptr : &inputs[deconv::kBias + 1]),
out_grad(inputs[deconv::kOut]) {}
-MKLDNNDeconvBwd::WriteTensors::WriteTensors(const bool no_bias, const std::vector<NDArray> &outputs)
+MKLDNNDeconvBwd::WriteTensors::WriteTensors(const bool no_bias, const std::vector<NDArray>& outputs)
: data_grad(outputs[deconv::kData]),
weights_grad(outputs[deconv::kWeight]),
bias_grad(no_bias ? nullptr : &outputs[deconv::kBias]) {}
-MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam &param, const ReadTensors &read_tensors) {
- const auto &fwd_pd = MKLDNNDeconvFwd::CreatePrimitiveDesc(
- param, MKLDNNDeconvFwd::Tensors(read_tensors.data, read_tensors.weights, read_tensors.bias,
- read_tensors.out_grad));
- bwd_data_pd = CreateDataPrimitiveDesc(param, read_tensors, *fwd_pd);
+MKLDNNDeconvBwd::MKLDNNDeconvBwd(const DeconvolutionParam& param, const ReadTensors& read_tensors) {
+ const auto& fwd_pd = MKLDNNDeconvFwd::CreatePrimitiveDesc(
+ param,
+ MKLDNNDeconvFwd::Tensors(
+ read_tensors.data, read_tensors.weights, read_tensors.bias, read_tensors.out_grad));
+ bwd_data_pd = CreateDataPrimitiveDesc(param, read_tensors, *fwd_pd);
bwd_weights_pd = CreateWeightsPrimitiveDesc(param, read_tensors, *fwd_pd);
- bwd_data = std::make_shared<deconv_bwd_data_t>(*bwd_data_pd);
- bwd_weights = std::make_shared<deconv_bwd_weights_t>(*bwd_weights_pd);
+ bwd_data = std::make_shared<deconv_bwd_data_t>(*bwd_data_pd);
+ bwd_weights = std::make_shared<deconv_bwd_weights_t>(*bwd_weights_pd);
}
inline void MKLDNNDeconvBwd::IOSwapWeightsTensors(const uint32_t num_group,
- const std::vector<OpReqType> &req,
- const NDArray &weights,
- const NDArray &weights_grad) const {
+ const std::vector<OpReqType>& req,
+ const NDArray& weights,
+ const NDArray& weights_grad) const {
if (req[deconv::kData]) {
IOLogicalSwapMKLDNNMem(weights, num_group);
}
@@ -264,69 +278,74 @@ inline void MKLDNNDeconvBwd::IOSwapWeightsTensors(const uint32_t num_group,
}
}
-inline const mkldnn::memory *MKLDNNDeconvBwd::DataMem(const NDArray &data) const {
+inline const mkldnn::memory* MKLDNNDeconvBwd::DataMem(const NDArray& data) const {
return data.GetMKLDNNDataReorder(bwd_weights_pd->src_desc());
}
-inline const mkldnn::memory *MKLDNNDeconvBwd::WeightsMem(const uint32_t num_group,
- const NDArray &weights) const {
+inline const mkldnn::memory* MKLDNNDeconvBwd::WeightsMem(const uint32_t num_group,
+ const NDArray& weights) const {
return GetWeights(weights, bwd_data_pd->weights_desc(), num_group);
}
-inline const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(const NDArray &out_grad) const {
+inline const mkldnn::memory* MKLDNNDeconvBwd::OutGradMem(const NDArray& out_grad) const {
return out_grad.GetMKLDNNDataReorder(bwd_data_pd->diff_dst_desc());
}
-inline const mkldnn::memory *MKLDNNDeconvBwd::OutGradMem(
- const NDArray &out_grad, const mkldnn::memory *const out_grad_mem) const {
+inline const mkldnn::memory* MKLDNNDeconvBwd::OutGradMem(
+ const NDArray& out_grad,
+ const mkldnn::memory* const out_grad_mem) const {
return (out_grad_mem && out_grad_mem->get_desc() == bwd_weights_pd->diff_dst_desc())
? out_grad_mem
: out_grad.GetMKLDNNDataReorder(bwd_weights_pd->diff_dst_desc());
}
inline mkldnn_output_t MKLDNNDeconvBwd::DataGradMem(const OpReqType req,
- const NDArray &data_grad) const {
+ const NDArray& data_grad) const {
return CreateMKLDNNMem(data_grad, bwd_data_pd->diff_src_desc(), req);
}
inline mkldnn_output_t MKLDNNDeconvBwd::WeightsGradMem(const uint32_t num_group,
const OpReqType req,
- const NDArray &weights_grad) const {
- // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat always fails (because
- // of the logical swap - explained in MKLDNNDeconvFwd::Execute). We try to reuse weights_grad
- // memory (which, when not swapped, is always in default format), so here we check if after a
+ const NDArray& weights_grad) const {
+ // CreateMKLDNNWeightGrad always creates a new tensor as IsDefaultFormat
+ // always fails (because of the logical swap - explained in
+ // MKLDNNDeconvFwd::Execute). We try to reuse weights_grad memory (which, when
+ // not swapped, is always in default format), so here we check if after a
// swap, weights_md will have a default format
- const auto &weights_md = bwd_weights_pd->diff_weights_desc();
+ const auto& weights_md = bwd_weights_pd->diff_weights_desc();
if (req == OpReqType::kWriteTo && IsDefaultFormat(IOLogicalSwapDesc(weights_md, num_group))) {
- return {OutDataOp::Noop, const_cast<NDArray &>(weights_grad).CreateMKLDNNData(weights_md)};
+ return {OutDataOp::Noop, const_cast<NDArray&>(weights_grad).CreateMKLDNNData(weights_md)};
}
return CreateMKLDNNWeightGrad(weights_grad, weights_md, req);
}
inline mkldnn_output_t MKLDNNDeconvBwd::BiasGradMem(const OpReqType req,
- const NDArray *const bias) const {
+ const NDArray* const bias) const {
return bias ? CreateMKLDNNMem(*bias, bwd_weights_pd->diff_bias_desc(), req)
: mkldnn_output_t(OutDataOp::Noop, nullptr);
}
-
-
// Utility class for creating operation descriptors of deconvolution primitives
class DeconvDescCreator {
public:
- DeconvDescCreator(const DeconvolutionParam &param, const NDArray &data, const NDArray &weights,
- const NDArray *const bias, const NDArray &out);
-
- // Imposes plain formats on memory descriptors with padding (so the next selected implementation
- // will pass CheckImplSizeReq). After calling this method, new primitive descriptor (with new
- // operator descriptor) should be created, which should select an implementation with matching
- // size requirements.
- // data_size, weights_size, out_size - size requirements of current implementation
- // Returns whether successfully imposed a plain format on any of the data, weights, and output
- // memory descriptors.
- bool ImposePlainWherePadding(const size_t data_size, const size_t weights_size,
+ DeconvDescCreator(const DeconvolutionParam& param,
+ const NDArray& data,
+ const NDArray& weights,
+ const NDArray* const bias,
+ const NDArray& out);
+
+ // Imposes plain formats on memory descriptors with padding (so the next
+ // selected implementation will pass CheckImplSizeReq). After calling this
+ // method, a new primitive descriptor (with a new operator descriptor) should
+ // be created, which should select an implementation with matching size
+ // requirements.
+ // data_size, weights_size, out_size - size requirements of the current
+ // implementation.
+ // Returns whether a plain format was successfully imposed on any of the
+ // data, weights, and output memory descriptors.
+ bool ImposePlainWherePadding(const size_t data_size,
+ const size_t weights_size,
const size_t out_size);
- bool CheckImplSizeReq(const size_t data_size, const size_t weights_size,
+ bool CheckImplSizeReq(const size_t data_size,
+ const size_t weights_size,
const size_t out_size) const;
deconv_fwd_t::desc CreateFwdDesc() const;
@@ -344,8 +363,8 @@ class DeconvDescCreator {
mkldnn::memory::dims dilates;
};
-
-inline bool DeconvDescCreator::CheckImplSizeReq(const size_t data_size, const size_t weights_size,
+inline bool DeconvDescCreator::CheckImplSizeReq(const size_t data_size,
+ const size_t weights_size,
const size_t out_size) const {
// MKLDNN introduced padded formats since 0.15 which require more memory
// compared to the actual size of the tensor. Currently, MKLDNN operators
@@ -357,18 +376,38 @@ inline bool DeconvDescCreator::CheckImplSizeReq(const size_t data_size, const si
inline deconv_fwd_t::desc DeconvDescCreator::CreateFwdDesc() const {
return deconv_fwd_t::desc(mkldnn::prop_kind::forward_training,
- mkldnn::algorithm::deconvolution_direct, data_md, weights_md, bias_md,
- out_md, strides, dilates, padding, padding);
+ mkldnn::algorithm::deconvolution_direct,
+ data_md,
+ weights_md,
+ bias_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
}
inline deconv_bwd_data_t::desc DeconvDescCreator::CreateBwdDataDesc() const {
- return deconv_bwd_data_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weights_md,
- out_md, strides, dilates, padding, padding);
+ return deconv_bwd_data_t::desc(mkldnn::algorithm::deconvolution_direct,
+ data_md,
+ weights_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
}
inline deconv_bwd_weights_t::desc DeconvDescCreator::CreateBwdWeightsDesc() const {
- return deconv_bwd_weights_t::desc(mkldnn::algorithm::deconvolution_direct, data_md, weights_md,
- bias_md, out_md, strides, dilates, padding, padding);
+ return deconv_bwd_weights_t::desc(mkldnn::algorithm::deconvolution_direct,
+ data_md,
+ weights_md,
+ bias_md,
+ out_md,
+ strides,
+ dilates,
+ padding,
+ padding);
}
} // namespace op
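
IOLogicalSwapDesc above only permutes the logical order of the input- and output-channel dimensions (oihw <-> iohw, goihw <-> giohw); the underlying data is untouched. A small self-contained sketch of that reordering on a plain dims vector (IOLogicalSwapDims is an illustrative helper, not the MXNet function, which operates on mkldnn::memory::desc):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

std::vector<int64_t> IOLogicalSwapDims(std::vector<int64_t> dims, uint32_t num_group) {
  const std::size_t offset = (num_group > 1) ? 1 : 0;  // skip the leading group axis, if any
  if (dims.size() >= offset + 2)
    std::swap(dims[offset], dims[offset + 1]);  // swap output- and input-channel dims
  return dims;
}

int main() {
  for (int64_t d : IOLogicalSwapDims({8, 16, 3, 3}, 1))  // oihw -> iohw: 16 8 3 3
    std::cout << d << ' ';
  std::cout << '\n';
  for (int64_t d : IOLogicalSwapDims({2, 8, 16, 3, 3}, 2))  // goihw -> giohw: 2 16 8 3 3
    std::cout << d << ' ';
  std::cout << '\n';
  return 0;
}
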
diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
index 2160815..211ccd6 100644
--- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
+++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc
@@ -28,29 +28,28 @@
namespace mxnet {
namespace op {
-bool SupportMKLDNNDeconv(const DeconvolutionParam &params, const NDArray &input) {
+bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray& input) {
return params.kernel.ndim() >= 1 && params.kernel.ndim() <= 3 &&
input.shape().ndim() == (params.kernel.ndim() + 2) &&
(input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16);
}
-
-
-void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]);
- const auto &param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+ const auto& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
const auto tensors = MKLDNNDeconvFwd::Tensors(param.no_bias, inputs, outputs);
- const auto &fwd = MKLDNNDeconvFwd::GetCached(param, tensors);
+ const auto& fwd = MKLDNNDeconvFwd::GetCached(param, tensors);
fwd.ControlWeightsFormat(param.num_group, ctx.is_train, tensors.weights);
fwd.Execute(param.num_group, req[deconv::kOut], tensors);
}
-MKLDNNDeconvFwd &MKLDNNDeconvFwd::GetCached(const DeconvolutionParam &param,
- const Tensors &tensors) {
+MKLDNNDeconvFwd& MKLDNNDeconvFwd::GetCached(const DeconvolutionParam& param,
+ const Tensors& tensors) {
using deconv_fwd_map = std::unordered_map<DeconvSignature, MKLDNNDeconvFwd, OpHash>;
#if DMLC_CXX11_THREAD_LOCAL
static thread_local deconv_fwd_map fwds;
@@ -74,18 +73,20 @@ MKLDNNDeconvFwd &MKLDNNDeconvFwd::GetCached(const DeconvolutionParam &param,
}
std::shared_ptr<deconv_fwd_pd_t> MKLDNNDeconvFwd::CreatePrimitiveDesc(
- const DeconvolutionParam &param, const Tensors &tensors) {
+ const DeconvolutionParam& param,
+ const Tensors& tensors) {
DeconvDescCreator ddc(param, tensors.data, tensors.weights, tensors.bias, tensors.out);
- const auto &engine = CpuEngine::Get()->get_engine();
- const auto pd = std::make_shared<deconv_fwd_pd_t>(ddc.CreateFwdDesc(), engine);
- const auto get_data_size = [&pd]() { return pd->src_desc().get_size(); };
+ const auto& engine = CpuEngine::Get()->get_engine();
+ const auto pd = std::make_shared<deconv_fwd_pd_t>(ddc.CreateFwdDesc(), engine);
+ const auto get_data_size = [&pd]() { return pd->src_desc().get_size(); };
const auto get_weights_size = [&pd]() { return pd->weights_desc().get_size(); };
- const auto get_out_size = [&pd]() { return pd->dst_desc().get_size(); };
+ const auto get_out_size = [&pd]() { return pd->dst_desc().get_size(); };
while (!ddc.CheckImplSizeReq(get_data_size(), get_weights_size(), get_out_size())) {
if (!pd->next_impl()) {
- // ImposePlainWherePadding fails when all memory descriptors already have plain formats
- // imposed, meaning there is no implementation with plain formats
+ // ImposePlainWherePadding fails when all memory descriptors already have
+ // plain formats imposed, meaning there is no implementation with plain
+ // formats
CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size()))
<< "No implementation of deconvolution forward propagation";
*pd = deconv_fwd_pd_t(ddc.CreateFwdDesc(), engine);
@@ -94,13 +95,15 @@ std::shared_ptr<deconv_fwd_pd_t> MKLDNNDeconvFwd::CreatePrimitiveDesc(
return pd;
}
-void MKLDNNDeconvFwd::ControlWeightsFormat(const uint32_t num_group, const bool is_train,
- const NDArray &weights) const {
+void MKLDNNDeconvFwd::ControlWeightsFormat(const uint32_t num_group,
+ const bool is_train,
+ const NDArray& weights) const {
if (is_train) {
// TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it
// to the default format for now.
if (weights.IsMKLDNNData()) {
- // This asks the engine to change the layout of the weights array after it's used.
+ // This asks the engine to change the layout of the weights array after
+ // it's used.
weights.Reorder2DefaultAsync();
}
} else {
@@ -117,32 +120,38 @@ void MKLDNNDeconvFwd::ControlWeightsFormat(const uint32_t num_group, const bool
}
}
-void MKLDNNDeconvFwd::Execute(const uint32_t num_group, const OpReqType req,
- const Tensors &tensors) const {
- // MXNet (correctly) assumes that deconvolution is implemented using convolution primitives.
- // For that, we would pass input tensor in place of output and output tensor in place of input
- // (for appropriate convolution primitives: deconvolution forward = convolution backward data,
+void MKLDNNDeconvFwd::Execute(const uint32_t num_group,
+ const OpReqType req,
+ const Tensors& tensors) const {
+ // MXNet (correctly) assumes that deconvolution is implemented using
+ // convolution primitives. For that, we would pass input tensor in place of
+ // output and output tensor in place of input (for appropriate convolution
+ // primitives: deconvolution forward = convolution backward data,
// deconvolution backward data = convolution forward).
// The convolution primitive expects weights tensor with the shape of
- // (primitive_out_channels, primitive_in_channels, h, w), but with swapped input and output:
- // primitive_out_channels = deconv_in_channels, primitive_in_channels = deconv_out_channels,
- // so it becomes (deconv_in_channels, deconv_out_channels, h, w) and MXNet provides such tensor.
+ // (primitive_out_channels, primitive_in_channels, h, w), but with swapped
+ // input and output: primitive_out_channels = deconv_in_channels,
+ // primitive_in_channels = deconv_out_channels, so it becomes
+ // (deconv_in_channels, deconv_out_channels, h, w) and MXNet provides such
+ // tensor.
//
- // MKLDNN deconvolution primitive also (as convolution) expects weights tensor with the shape of
- // (primitive_out_channels, primitive_in_channels, h, w), but this time we don't swap input and
- // output tensors, so:
- // primitive_out_channels = deconv_out_channels, primitive_in_channels = deconv_in_channels,
- // thus the current weights tensor won't fit (when deconv_out_channels != deconv_in_channels).
- // However, underneath deconvolution MKLDNN also uses convolution, so even though it expects the
- // weights tensor with the logical order of oihw, it wants its physical representation to
- // match the order of iohw, which is the same as current weights tensor.
+ // MKLDNN deconvolution primitive also (as convolution) expects weights tensor
+ // with the shape of (primitive_out_channels, primitive_in_channels, h, w),
+ // but this time we don't swap input and output tensors, so:
+ // primitive_out_channels = deconv_out_channels, primitive_in_channels =
+ // deconv_in_channels, thus the current weights tensor won't fit (when
+ // deconv_out_channels != deconv_in_channels). However, underneath
+ // deconvolution MKLDNN also uses convolution, so even though it expects the
+ // weights tensor with the logical order of oihw, it wants its physical
+ // representation to match the order of iohw, which is the same as current
+ // weights tensor.
//
- // So here we swap logical order of input and output dimensions for weights tensor just for
- // MKLDNN operations.
+ // So here we swap logical order of input and output dimensions for weights
+ // tensor just for MKLDNN operations.
IOLogicalSwapMKLDNNMem(tensors.weights, num_group);
{
mkldnn_args_map_t net_args;
- const auto &out_mem = OutMem(req, tensors.out);
+ const auto& out_mem = OutMem(req, tensors.out);
net_args.insert({MKLDNN_ARG_SRC, *DataMem(tensors.data)});
net_args.insert({MKLDNN_ARG_WEIGHTS, *WeightsMem(num_group, tensors.weights)});
@@ -156,28 +165,28 @@ void MKLDNNDeconvFwd::Execute(const uint32_t num_group, const OpReqType req,
CommitOutput(tensors.out, out_mem);
MKLDNNStream::Get()->Submit();
}
- IOLogicalSwapMKLDNNMem(tensors.weights, num_group); // swap back from oihw to iohw
+ IOLogicalSwapMKLDNNMem(tensors.weights,
+ num_group); // swap back from oihw to iohw
}
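
To make the channel bookkeeping in the comment above concrete, here is a hedged, worked example with made-up numbers. For a deconvolution with 8 input and 16 output channels, MXNet stores the weights as (8, 16, kH, kW). The MKLDNN deconvolution primitive wants logical dims (16, 8, kH, kW) but an iohw physical layout, which is exactly the existing buffer, so only the logical dims (and strides) are swapped and no element moves:

#include <array>
#include <cstdint>
#include <iostream>

int main() {
  const std::array<int64_t, 4> dims   = {8, 16, 3, 3};              // (deconv_in, deconv_out, kH, kW)
  const std::array<int64_t, 4> stride = {16 * 3 * 3, 3 * 3, 3, 1};  // row-major strides of that buffer
  // Logical swap: dims become (16, 8, 3, 3) and the first two strides swap with them.
  const std::array<int64_t, 4> sdims   = {dims[1], dims[0], dims[2], dims[3]};
  const std::array<int64_t, 4> sstride = {stride[1], stride[0], stride[2], stride[3]};
  // Element (o=5, i=2, h=1, w=1) after the swap lands on the same address as
  // element (i=2, o=5, h=1, w=1) before it -- the data never moves.
  const int64_t before = 2 * stride[0] + 5 * stride[1] + 1 * stride[2] + 1 * stride[3];
  const int64_t after  = 5 * sstride[0] + 2 * sstride[1] + 1 * sstride[2] + 1 * sstride[3];
  std::cout << before << " == " << after << '\n';  // prints "337 == 337"
  (void)sdims;  // swapped dims shown only for illustration
  return 0;
}
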
-
-
-void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
- const std::vector<NDArray> &inputs,
- const std::vector<OpReqType> &req,
- const std::vector<NDArray> &outputs) {
+void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs,
+ const OpContext& ctx,
+ const std::vector<NDArray>& inputs,
+ const std::vector<OpReqType>& req,
+ const std::vector<NDArray>& outputs) {
CHECK_NE(req[deconv::kWeight], kWriteInplace) << "Cannot write weights inplace";
TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]);
- const auto &param = nnvm::get<DeconvolutionParam>(attrs.parsed);
- const auto read_tensors = MKLDNNDeconvBwd::ReadTensors(param.no_bias, inputs);
+ const auto& param = nnvm::get<DeconvolutionParam>(attrs.parsed);
+ const auto read_tensors = MKLDNNDeconvBwd::ReadTensors(param.no_bias, inputs);
const auto write_tensors = MKLDNNDeconvBwd::WriteTensors(param.no_bias, outputs);
- MKLDNNDeconvBwd &bwd = MKLDNNDeconvBwd::GetCached(param, read_tensors);
+ MKLDNNDeconvBwd& bwd = MKLDNNDeconvBwd::GetCached(param, read_tensors);
bwd.Execute(param.num_group, req, read_tensors, write_tensors);
}
-MKLDNNDeconvBwd &MKLDNNDeconvBwd::GetCached(const DeconvolutionParam &param,
- const ReadTensors &read_tensors) {
+MKLDNNDeconvBwd& MKLDNNDeconvBwd::GetCached(const DeconvolutionParam& param,
+ const ReadTensors& read_tensors) {
using deconv_bwd_map = std::unordered_map<DeconvSignature, MKLDNNDeconvBwd, OpHash>;
#if DMLC_CXX11_THREAD_LOCAL
static thread_local deconv_bwd_map bwds;
@@ -201,20 +210,22 @@ MKLDNNDeconvBwd &MKLDNNDeconvBwd::GetCached(const DeconvolutionParam &param,
}
std::shared_ptr<deconv_bwd_data_pd_t> MKLDNNDeconvBwd::CreateDataPrimitiveDesc(
- const DeconvolutionParam &param, const ReadTensors &read_tensors,
- const deconv_fwd_pd_t &fwd_pd) {
- DeconvDescCreator ddc(param, read_tensors.data, read_tensors.weights, nullptr,
- read_tensors.out_grad);
- const auto &engine = CpuEngine::Get()->get_engine();
+ const DeconvolutionParam& param,
+ const ReadTensors& read_tensors,
+ const deconv_fwd_pd_t& fwd_pd) {
+ DeconvDescCreator ddc(
+ param, read_tensors.data, read_tensors.weights, nullptr, read_tensors.out_grad);
+ const auto& engine = CpuEngine::Get()->get_engine();
const auto pd = std::make_shared<deconv_bwd_data_pd_t>(ddc.CreateBwdDataDesc(), engine, fwd_pd);
- const auto get_data_size = [&pd]() { return pd->diff_src_desc().get_size(); };
+ const auto get_data_size = [&pd]() { return pd->diff_src_desc().get_size(); };
const auto get_weights_size = [&pd]() { return pd->weights_desc().get_size(); };
- const auto get_out_size = [&pd]() { return pd->diff_dst_desc().get_size(); };
+ const auto get_out_size = [&pd]() { return pd->diff_dst_desc().get_size(); };
while (!ddc.CheckImplSizeReq(get_data_size(), get_weights_size(), get_out_size())) {
if (!pd->next_impl()) {
- // ImposePlainWherePadding fails when all memory descriptors already have plain formats
- // imposed, meaning there is no implementation with plain formats
+ // ImposePlainWherePadding fails when all memory descriptors already have
+ // plain formats imposed, meaning there is no implementation with plain
+ // formats
CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size()))
<< "No implementation of deconvolution backward propagation";
*pd = deconv_bwd_data_pd_t(ddc.CreateBwdDataDesc(), engine, fwd_pd);
@@ -224,21 +235,23 @@ std::shared_ptr<deconv_bwd_data_pd_t> MKLDNNDeconvBwd::CreateDataPrimitiveDesc(
}
std::shared_ptr<deconv_bwd_weights_pd_t> MKLDNNDeconvBwd::CreateWeightsPrimitiveDesc(
- const DeconvolutionParam &param, const ReadTensors &read_tensors,
- const deconv_fwd_pd_t &fwd_pd) {
- DeconvDescCreator ddc(param, read_tensors.data, read_tensors.weights, read_tensors.bias,
- read_tensors.out_grad);
- const auto &engine = CpuEngine::Get()->get_engine();
+ const DeconvolutionParam& param,
+ const ReadTensors& read_tensors,
+ const deconv_fwd_pd_t& fwd_pd) {
+ DeconvDescCreator ddc(
+ param, read_tensors.data, read_tensors.weights, read_tensors.bias, read_tensors.out_grad);
+ const auto& engine = CpuEngine::Get()->get_engine();
const auto pd =
std::make_shared<deconv_bwd_weights_pd_t>(ddc.CreateBwdWeightsDesc(), engine, fwd_pd);
- const auto get_data_size = [&pd]() { return pd->src_desc().get_size(); };
+ const auto get_data_size = [&pd]() { return pd->src_desc().get_size(); };
const auto get_weights_size = [&pd]() { return pd->diff_weights_desc().get_size(); };
- const auto get_out_size = [&pd]() { return pd->diff_dst_desc().get_size(); };
+ const auto get_out_size = [&pd]() { return pd->diff_dst_desc().get_size(); };
while (!ddc.CheckImplSizeReq(get_data_size(), get_weights_size(), get_out_size())) {
if (!pd->next_impl()) {
- // ImposePlainWherePadding fails when all memory descriptors already have plain formats
- // imposed, meaning there is no implementation with plain formats
+ // ImposePlainWherePadding fails when all memory descriptors already have
+ // plain formats imposed, meaning there is no implementation with plain
+ // formats
CHECK(ddc.ImposePlainWherePadding(get_data_size(), get_weights_size(), get_out_size()))
<< "No implementation of calculating deconvolution weights gradient";
*pd = deconv_bwd_weights_pd_t(ddc.CreateBwdWeightsDesc(), engine, fwd_pd);
@@ -247,13 +260,14 @@ std::shared_ptr<deconv_bwd_weights_pd_t> MKLDNNDeconvBwd::CreateWeightsPrimitive
return pd;
}
... 14800 lines suppressed ...