You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2018/05/13 15:26:33 UTC
[06/10] incubator-singa git commit: Streamlining of the tensor.h file by
moving the respective member functions into the cpp or cuda files. Removal of
the shape_multipliers_ attribute from tensor.h. Changed read-in tensors to be
passed by reference instead of by pointer.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c52e2aa3/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 3e36877..6e86ca7 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -32,13 +32,88 @@
namespace singa {
-cudnnTensorDescriptor_t generate_tensorND_desc(const Tensor* x){
+// ===================== Helper Functions =============================
+
+ /*
+ cudnn requires tensor dimensions to fulfill 1 requirement:
+ 1.) Dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors
+ if input tensor is 5d, cudnn will take a 5d tensor as input. Beyond 5d, certain operations are not supported.
+ (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
+
+ e.g. Tensor A has shape {3,3}, cudnn requires shape of {1,1,3,3} to be the input
+ Tensor B has shape {2,3,4}, cudnn requires shape of {1,2,3,4} to be the input
+ */
+ vector<int> generate_shape_cuda(const Tensor& x) {
+ Shape shape_ = x.shape();
+ vector<int> shape_arr;
+ if(shape_.size() <= 4){
+ for (size_t n=0; n<4-shape_.size(); ++n) {
+ shape_arr.push_back(1);
+ }
+ for (size_t n=0; n<shape_.size(); ++n) {
+ shape_arr.push_back(shape_.at(n));
+ }
+ return shape_arr;
+ } else if(shape_.size() == 5){
+ for (size_t n=0; n<shape_.size(); ++n) {
+ shape_arr.push_back(shape_.at(n));
+ }
+ return shape_arr;
+ } else {
+ LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
+ }
+ }
+
+ int generate_dim_cuda(const Tensor& x) {
+ if(x.shape().size() <= 4){return 4;}
+ else if(x.shape().size() == 5){return 5;}
+ else{
+ LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
+ }
+ }
+
+/*
+ cudnn requires stride dimensions to conform to the format of the shape input as well
+ 1.) Stride dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors
+ If input tensor is 5d, cudnn will take a 5d tensor as input. Beyond 5d, certain operations are not supported.
+ (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
+
+ for e.g. Tensor A has shape {3,3}, stride {3,1}, cudnn requires shape {1,1,3,3}
+ and stride {9, 9, 3, 1} or {9, 9, 1, 3} to be the inputs
+ */
+ vector<int> generate_strides_cuda(const Tensor& x) {
+ Shape shape_ = x.shape();
+ vector<int> strides_ = x.strides();
+ vector<int> strides_arr;
+ int product = 1;
+ for (size_t n=0; n<(shape_.size()); ++n) {
+ product *= shape_[n];
+ }
+ if(shape_.size() <= 4){
+ for (size_t n=0; n<4-shape_.size(); ++n) {
+ strides_arr.push_back(product);
+ }
+ for (size_t n=0; n<strides_.size(); ++n) {
+ strides_arr.push_back(strides_[n]);
+ }
+ return strides_arr;
+ } else if(shape_.size() == 5){
+ for (size_t n=0; n<strides_.size(); ++n) {
+ strides_arr.push_back(strides_[n]);
+ }
+ return strides_arr;
+ } else {
+ LOG(FATAL) << "Dimensions (strides) beyond 5 are currently not supported" ;
+ }
+ }
+
+cudnnTensorDescriptor_t generate_tensorND_desc(const Tensor& x){
cudnnTensorDescriptor_t x_desc;
cudnnCreateTensorDescriptor(&x_desc);
cudnnSetTensorNdDescriptor(x_desc, CUDNN_DATA_FLOAT,
- x->generate_dim_cuda(),
- x->generate_shape_cuda().data(),
- x->generate_strides_cuda().data()
+ generate_dim_cuda(x),
+ generate_shape_cuda(x).data(),
+ generate_strides_cuda(x).data()
);
return x_desc;
@@ -55,12 +130,13 @@ cudnnOpTensorDescriptor_t generate_Op_desc(cudnnOpTensorOp_t op){
return op_desc;
}
+// ===================== CUDA Functions =============================
/// out[i] = |in[i]|
template <>
-void Abs<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Abs<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha1 = 1.0;
@@ -70,7 +146,7 @@ void Abs<float, lang::Cuda>(const Tensor* in, Tensor* out,
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_MAX),
(void*)(&alpha1), in_desc, inPtr,
(void*)(&alpha2), in_desc, inPtr,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
cudnnDestroyTensorDescriptor(in_desc);
}
@@ -80,74 +156,74 @@ void Set<float, lang::Cuda>(const float x, Tensor* out,
Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- cudnnSetTensor(ctx->cudnn_handle, generate_tensorND_desc(out),
+ cudnnSetTensor(ctx->cudnn_handle, generate_tensorND_desc(*out),
outPtr, (void*)(&x));
}
template <>
-void Add<float, lang::Cuda>(const Tensor* in, const float x,
+void Add<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
Set<float, lang::Cuda>(x, out, ctx);
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha = 1.0, beta = 1.0;
cudnnAddTensor(ctx->cudnn_handle,
(void*)(&alpha), generate_tensorND_desc(in), inPtr,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
}
/// out = in1 + in2
template <>
-void Add<float, lang::Cuda>(const Tensor* in1,
- const Tensor* in2, Tensor* out, Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Add<float, lang::Cuda>(const Tensor& in1,
+ const Tensor& in2, Tensor* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha1 = 1.0;
float alpha2 = 1.0;
float beta = 0.0;
- if((in1->nDim() == in2->nDim()) || (in2->nDim() == 1)){
+ if((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)){
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
(void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
(void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
} else {
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
(void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
(void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
}
}
/// out = in1 - in2
template <>
-void Sub<float, lang::Cuda>(const Tensor* in1,
- const Tensor* in2, Tensor* out, Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Sub<float, lang::Cuda>(const Tensor& in1,
+ const Tensor& in2, Tensor* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha1 = 1.0;
float alpha2 = -1.0;
float beta = 0.0;
- if((in1->nDim() == in2->nDim()) || (in2->nDim() == 1)){
+ if((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)){
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
(void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
(void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
} else {
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
(void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
(void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
}
}
@@ -156,35 +232,35 @@ void Sub<float, lang::Cuda>(const Tensor* in1,
/// if x>high, then x=high; if x<low, then x=low.
template <>
void Clamp<float, lang::Cuda>(const float low,
- const float high, const Tensor* in, Tensor* out,
+ const float high, const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::clamp(num, low, high, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
/// out = in1 / in2
template <>
-void Div<float, lang::Cuda>(const Tensor* in1,
- const Tensor* in2, Tensor* out, Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Div<float, lang::Cuda>(const Tensor& in1,
+ const Tensor& in2, Tensor* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in1->Size();
+ const size_t num = in1.Size();
//if both in1 and in2 strides are the same, we proceed to normal cuda::div
- if(in1->strides() == in2->strides()){
+ if(in1.strides() == in2.strides()){
cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
- out->Set_Strides(in1->strides());
+ out->set_strides(in1.strides());
} else { //else we transform in1 to out to store first
float alpha = 1.0;
float beta = 0.0;
- out->Set_Strides(in2->strides());
+ out->set_strides(in2.strides());
cudnnTransformTensor(ctx->cudnn_handle,
(void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
cuda::div(num, outPtr, inPtr2, outPtr, ctx->stream);
@@ -192,51 +268,51 @@ void Div<float, lang::Cuda>(const Tensor* in1,
}
template <>
-void Div<float, lang::Cuda>(const float x, const Tensor* in,
+void Div<float, lang::Cuda>(const float x, const Tensor& in,
Tensor* out, Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::div(num, x, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
/// out = in * x
template <>
-void EltwiseMult<float, lang::Cuda>(const Tensor* in,
+void EltwiseMult<float, lang::Cuda>(const Tensor& in,
const float x, Tensor* out, Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha = x, beta = 0.0;
cudnnAddTensor(ctx->cudnn_handle,
(void*)(&alpha), generate_tensorND_desc(in), inPtr,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
}
/// out = in1 * in2
template <>
-void EltwiseMult<float, lang::Cuda>(const Tensor* in1,
- const Tensor* in2, Tensor* out,
+void EltwiseMult<float, lang::Cuda>(const Tensor& in1,
+ const Tensor& in2, Tensor* out,
Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+ const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in1->Size();
+ const size_t num = in1.Size();
//if both in1 and in2 strides are the same, we proceed to normal cuda::mult
- if(in1->strides() == in2->strides()){
+ if(in1.strides() == in2.strides()){
cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
- out->Set_Strides(in1->strides());
+ out->set_strides(in1.strides());
} else { //else we transform in1 to out to store first
float alpha = 1.0;
float beta = 0.0;
- out->Set_Strides(in2->strides());
+ out->set_strides(in2.strides());
cudnnTransformTensor(ctx->cudnn_handle,
(void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
cuda::mult(num, outPtr, inPtr2, outPtr, ctx->stream);
@@ -246,138 +322,138 @@ void EltwiseMult<float, lang::Cuda>(const Tensor* in1,
/// Base is e. out[i]=e^in[i]
template <>
-void Exp<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Exp<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::exp(num, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
template <>
-void GE<float, lang::Cuda>(const Tensor* in, const float x,
+void GE<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const float* inPtr = static_cast<const float*>(in->block()->data());
- const size_t num = in->Size();
+ const float* inPtr = static_cast<const float*>(in.block()->data());
+ const size_t num = in.Size();
cuda::ge(num, inPtr, x, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
template <>
-void GE<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+void GE<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
Tensor* out, Context* ctx) {
Sub<float, lang::Cuda>(in1, in2, out, ctx);
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
- const size_t num = in1->Size();
+ // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+ const size_t num = in1.Size();
//cuda::ge(num, inPtr1, inPtr2, outPtr, ctx->stream);
cuda::ge(num, outPtr, 0.0, outPtr, ctx->stream);
}
template <>
-void GT<float, lang::Cuda>(const Tensor* in, const float x,
+void GT<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const float* inPtr = static_cast<const float*>(in->block()->data());
- const size_t num = in->Size();
+ const float* inPtr = static_cast<const float*>(in.block()->data());
+ const size_t num = in.Size();
cuda::gt(num, inPtr, x, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
template <>
-void GT<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+void GT<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
Tensor* out, Context* ctx) {
Sub<float, lang::Cuda>(in1, in2, out, ctx);
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
- const size_t num = in1->Size();
+ // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+ const size_t num = in1.Size();
//cuda::gt(num, inPtr1, inPtr2, outPtr, ctx->stream);
cuda::gt(num, outPtr, 0.0, outPtr, ctx->stream);
}
template <>
-void LE<float, lang::Cuda>(const Tensor* in, const float x,
+void LE<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const float* inPtr = static_cast<const float*>(in->block()->data());
- const size_t num = in->Size();
+ const float* inPtr = static_cast<const float*>(in.block()->data());
+ const size_t num = in.Size();
cuda::le(num, inPtr, x, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
template <>
-void LE<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+void LE<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
Tensor* out, Context* ctx) {
Sub<float, lang::Cuda>(in1, in2, out, ctx);
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
- const size_t num = in1->Size();
+ // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+ const size_t num = in1.Size();
//cuda::le(num, inPtr1, inPtr2, outPtr, ctx->stream);
cuda::le(num, outPtr, 0.0, outPtr, ctx->stream);
}
/// Natural logarithm, the base is e (Napier's constant): out[i]=ln(in[i]).
template <>
-void Log<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Log<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::log(num, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
template <>
-void LT<float, lang::Cuda>(const Tensor* in, const float x,
+void LT<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const float* inPtr = static_cast<const float*>(in->block()->data());
- const size_t num = in->Size();
+ const float* inPtr = static_cast<const float*>(in.block()->data());
+ const size_t num = in.Size();
cuda::lt(num, inPtr, x, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
template <>
-void LT<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+void LT<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
Tensor* out, Context* ctx) {
Sub<float, lang::Cuda>(in1, in2, out, ctx);
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
- const size_t num = in1->Size();
+ // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+ const size_t num = in1.Size();
//cuda::lt(num, inPtr1, inPtr2, outPtr, ctx->stream);
cuda::lt(num, outPtr, 0.0, outPtr, ctx->stream);
}
/// Element-wise operation, out[i] = in[i]^x
template <>
-void Pow<float, lang::Cuda>(const Tensor* in, const float x,
+void Pow<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::pow(num, inPtr, x, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
/// Element-wise operation, out[i] = in1[i]^in2[i]
template <>
-void Pow<float, lang::Cuda>(const Tensor* in1,
- const Tensor* in2, Tensor* out, Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Pow<float, lang::Cuda>(const Tensor& in1,
+ const Tensor& in2, Tensor* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in1->Size();
+ const size_t num = in1.Size();
- if(in1->strides() == in2->strides()){
+ if(in1.strides() == in2.strides()){
cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
- out->Set_Strides(in1->strides());
+ out->set_strides(in1.strides());
} else { //else we transform in1 to out to store first
float alpha = 1.0;
float beta = 0.0;
- out->Set_Strides(in2->strides());
+ out->set_strides(in2.strides());
cudnnTransformTensor(ctx->cudnn_handle,
(void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
cuda::pow(num, outPtr, inPtr2, outPtr, ctx->stream);
@@ -386,9 +462,9 @@ void Pow<float, lang::Cuda>(const Tensor* in1,
/// Element-wise operation, out[i]=max(0, in[i])
// template <>
-// void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// void ReLU<float, lang::Cuda>(const Tensor& in, Tensor* out,
// Context* ctx) {
-// const float* inPtr = static_cast<const float*>(in->block()->data());
+// const float* inPtr = static_cast<const float*>(in.block()->data());
// float* outPtr = static_cast<float*>(out->block()->mutable_data());
// cudnnActivationDescriptor_t act_desc;
@@ -404,8 +480,10 @@ void Pow<float, lang::Cuda>(const Tensor* in1,
// cudnnTensorDescriptor_t in_desc, out_desc;
// cudnnCreateTensorDescriptor(&in_desc);
// cudnnCreateTensorDescriptor(&out_desc);
-// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
+// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
+// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
// (void*)(&beta), out_desc, outPtr);
@@ -415,20 +493,20 @@ void Pow<float, lang::Cuda>(const Tensor* in1,
// }
template <>
-void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void ReLU<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::relu(num, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
// /// Element-wise operation, out[i]=sigmoid([in[i])
// template <>
-// void Sigmoid<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// void Sigmoid<float, lang::Cuda>(const Tensor& in, Tensor* out,
// Context* ctx) {
-// const float* inPtr = static_cast<const float*>(in->block()->data());
+// const float* inPtr = static_cast<const float*>(in.block()->data());
// float* outPtr = static_cast<float*>(out->block()->mutable_data());
// cudnnActivationDescriptor_t act_desc;
@@ -444,8 +522,10 @@ void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
// cudnnTensorDescriptor_t in_desc, out_desc;
// cudnnCreateTensorDescriptor(&in_desc);
// cudnnCreateTensorDescriptor(&out_desc);
-// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
+// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
+// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
// (void*)(&beta), out_desc, outPtr);
@@ -456,31 +536,31 @@ void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
/// Element-wise operation, out[i]=sigmoid([in[i])
template <>
-void Sigmoid<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Sigmoid<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::sigmoid(num, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
// out[i] = sign(in[i])
template <>
-void Sign<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Sign<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::sign(num, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
// Element-wise operation, out[i]=sqrt([in[i])
template <>
-void Sqrt<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Sqrt<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha1 = 1.0;
@@ -490,39 +570,39 @@ void Sqrt<float, lang::Cuda>(const Tensor* in, Tensor* out,
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_SQRT),
(void*)(&alpha1), in_desc, inPtr,
(void*)(&alpha2), in_desc, inPtr,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
}
/// Element-wise operation, out[i]=in[i]^2
template <>
-void Square<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Square<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::square(num, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
// template <>
// void Sum<float, lang::Cuda>(const size_t num, const Block* in, float* out,
// Context* ctx) {
// LOG(FATAL) << "Cuda Sum is not implemented!";
-// // const float* inPtr = static_cast<const float*>(in->data());
+// // const float* inPtr = static_cast<const float*>(in.data());
// // cuda::sum(num, inPtr, out, ctx->stream);
// }
template <>
-void Sum<float, lang::Cuda>(const Tensor* in, float* out,
+void Sum<float, lang::Cuda>(const Tensor& in, float* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
//reduce all axes to 1 for cudnnReduce, e.g. Tensor A with shape (2,4) will be reduced to (1)
Shape reduced_shape = {1};
- Tensor t(reduced_shape, in->device(), in->data_type());
+ Tensor t(reduced_shape, in.device(), in.data_type());
float* tPtr = static_cast<float*>(t.block()->mutable_data());
- vector<int> reduce_all_axes = in->generate_shape_cuda();
+ vector<int> reduce_all_axes = generate_shape_cuda(in);
for (size_t n=0; n<reduce_all_axes.size(); ++n) {
reduce_all_axes[n] = 1;
}
@@ -539,10 +619,10 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
cudnn_propagation, cudnn_indices, cudnn_indices_type);
//instantiate 2 new tensors to use new blocks as memory instead of cudaMalloc
- size_t reduction_size_int = Product(in->shape());
+ size_t reduction_size_int = Product(in.shape());
Shape reduction_size = {reduction_size_int*100};
- Tensor indices(reduction_size, in->device(), in->data_type());
- Tensor workspace(reduction_size, in->device(), in->data_type());
+ Tensor indices(reduction_size, in.device(), in.data_type());
+ Tensor workspace(reduction_size, in.device(), in.data_type());
size_t indices_bytes = indices.block()->size()*100;
size_t workspace_bytes = workspace.block()->size()*100;
size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
@@ -555,7 +635,7 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
(void*)(&alpha), generate_tensorND_desc(in), inPtr,
- (void*)(&beta), generate_tensorND_desc(&t), tPtr
+ (void*)(&beta), generate_tensorND_desc(t), tPtr
);
*out = tPtr[0];
@@ -564,9 +644,9 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
/// Element-wise operation, out[i]=tanh([in[i])
// template <>
-// void Tanh<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// void Tanh<float, lang::Cuda>(const Tensor& in, Tensor* out,
// Context* ctx) {
-// const float* inPtr = static_cast<const float*>(in->block()->data());
+// const float* inPtr = static_cast<const float*>(in.block()->data());
// float* outPtr = static_cast<float*>(out->block()->mutable_data());
// cudnnActivationDescriptor_t act_desc;
@@ -582,8 +662,10 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
// cudnnTensorDescriptor_t in_desc, out_desc;
// cudnnCreateTensorDescriptor(&in_desc);
// cudnnCreateTensorDescriptor(&out_desc);
-// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
+// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
+// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
// (void*)(&beta), out_desc, outPtr);
@@ -593,13 +675,13 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
// }
template <>
-void Tanh<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Tanh<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::tanh(num, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
// ================Random functions===========================================
@@ -643,65 +725,65 @@ void Gaussian<float, lang::Cuda>(const float mean,
// =========================Blas operations==================================
// ref to http://docs.nvidia.com/cuda/cublas
template <>
-void Amax<float, lang::Cuda>(const Tensor* in, size_t* out,
+void Amax<float, lang::Cuda>(const Tensor& in, size_t* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
int idx = 1;
- const size_t num = in->Size();
+ const size_t num = in.Size();
CUBLAS_CHECK(cublasIsamax(handle, num, inPtr, 1, &idx));
*out = idx - 1; // cublas index starts from 1
}
/// return the index of the element with the min value.
template <>
-void Amin<float, lang::Cuda>(const Tensor* in, size_t* out,
+void Amin<float, lang::Cuda>(const Tensor& in, size_t* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
int idx = 1;
- const size_t num = in->Size();
+ const size_t num = in.Size();
CUBLAS_CHECK(cublasIsamin(handle, num, inPtr, 1, &idx));
*out = idx - 1;
}
/// out = sum |x| for all x in in
template <>
-void Asum<float, lang::Cuda>(const Tensor* in, float* out,
+void Asum<float, lang::Cuda>(const Tensor& in, float* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- const size_t num = in->Size();
+ const size_t num = in.Size();
CUBLAS_CHECK(cublasSasum(handle, num, inPtr, 1, out));
}
/// out = alpha * in + out
template <>
void Axpy<float, lang::Cuda>(const float alpha,
- const Tensor* in, Tensor* out, Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const Tensor& in, Tensor* out, Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- const size_t num = in->Size();
+ const size_t num = in.Size();
CUBLAS_CHECK(cublasSaxpy(handle, num, &alpha, inPtr, 1, outPtr, 1));
}
/// out = \sum_i in1[i] * in2[i]
template <>
-void Dot<float, lang::Cuda>(const Tensor* in1,
- const Tensor* in2, float* out, Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Dot<float, lang::Cuda>(const Tensor& in1,
+ const Tensor& in2, float* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2.block()->data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- const size_t num = in1->Size();
+ const size_t num = in1.Size();
CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, out));
}
template <>
-void Nrm2<float, lang::Cuda>(const Tensor* in, float* out,
+void Nrm2<float, lang::Cuda>(const Tensor& in, float* out,
Context* ctx) {
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- const float* inPtr = static_cast<const float*>(in->block()->data());
- const size_t num = in->Size();
+ const float* inPtr = static_cast<const float*>(in.block()->data());
+ const size_t num = in.Size();
cublasSnrm2(handle, num, inPtr, 1, out);
}
template <>
@@ -715,14 +797,14 @@ void Scale<float, lang::Cuda>(const float x, Tensor* out,
// NOTE: cublas uses column major order.
// http://peterwittek.com/cublas-matrix-c-style.html
template <>
-void DGMM<float, lang::Cuda>(const bool side_right, const Tensor* M, const Tensor* v,
+void DGMM<float, lang::Cuda>(const bool side_right, const Tensor& M, const Tensor& v,
Tensor* out, Context* ctx) {
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- const float* MPtr = static_cast<const float*>(M->block()->data());
- const float* vPtr = static_cast<const float*>(v->block()->data());
+ const float* MPtr = static_cast<const float*>(M.block()->data());
+ const float* vPtr = static_cast<const float*>(v.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t nrow = M->shape(0);
- const size_t ncol = M->shape(1);
+ const size_t nrow = M.shape(0);
+ const size_t ncol = M.shape(1);
if (side_right) {
CUBLAS_CHECK(cublasSdgmm(handle, CUBLAS_SIDE_LEFT, ncol, nrow, MPtr, ncol,
vPtr, 1, outPtr, ncol));
@@ -732,16 +814,16 @@ void DGMM<float, lang::Cuda>(const bool side_right, const Tensor* M, const Tenso
}
}
template <>
-void GEMV<float, lang::Cuda>(const float alpha, const Tensor* A, const Tensor* v,
+void GEMV<float, lang::Cuda>(const float alpha, const Tensor& A, const Tensor& v,
const float beta, Tensor* out, Context* ctx) {
- const float* APtr = static_cast<const float*>(A->block()->data());
- const float* vPtr = static_cast<const float*>(v->block()->data());
+ const float* APtr = static_cast<const float*>(A.block()->data());
+ const float* vPtr = static_cast<const float*>(v.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t m = A->shape()[0];
- const size_t n = A->shape()[1];
+ const size_t m = A.shape()[0];
+ const size_t n = A.shape()[1];
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- if (!(A->transpose()))
+ if (!(A.transpose()))
CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr,
1, &beta, outPtr, 1));
else
@@ -752,20 +834,20 @@ void GEMV<float, lang::Cuda>(const float alpha, const Tensor* A, const Tensor* v
// http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm
template <>
void GEMM<float, lang::Cuda>(const float alpha,
- const Tensor* A, const Tensor* B, const float beta,
+ const Tensor& A, const Tensor& B, const float beta,
Tensor* C, Context* ctx) {
- auto transA = A->transpose();
+ auto transA = A.transpose();
auto transa = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
- auto transB = B->transpose();
+ auto transB = B.transpose();
auto transb = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
- const size_t nrowA = A->shape()[0];
- const size_t ncolA = A->shape()[1];
- const size_t ncolB = B->shape()[1];
+ const size_t nrowA = A.shape()[0];
+ const size_t ncolA = A.shape()[1];
+ const size_t ncolB = B.shape()[1];
int lda = transA ? nrowA : ncolA;
int ldb = transB ? ncolA : ncolB;
int ldc = ncolB;
- const float* APtr = static_cast<const float*>(A->block()->data());
- const float* BPtr = static_cast<const float*>(B->block()->data());
+ const float* APtr = static_cast<const float*>(A.block()->data());
+ const float* BPtr = static_cast<const float*>(B.block()->data());
float* CPtr = static_cast<float*>(C->block()->mutable_data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
@@ -799,15 +881,15 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
}
// template <>
-// void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out,
// Context* ctx) {
-// const float* inPtr = static_cast<const float*>(in->block()->data());
+// const float* inPtr = static_cast<const float*>(in.block()->data());
// float* outPtr = static_cast<float*>(out->block()->mutable_data());
-// // const size_t nrow = in->shape()[0];
-// // const size_t ncol = in->shape()[1];
+// // const size_t nrow = in.shape()[0];
+// // const size_t ncol = in.shape()[1];
// // cuda::RowMax(nrow, ncol, inPtr, outPtr, ctx->stream);
-// //vector<int> reduce_row_axes_shape = in->generate_shape_cuda();
+// //vector<int> reduce_row_axes_shape = in.generate_shape_cuda();
// //reduce_row_axes_shape.back() = 1; //reduce axis 1, so we set last element d in shape {a,b,c,d} to 1
// vector<int> reduce_row_axes_shape = {1,1,1,1};
@@ -828,8 +910,8 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
// //instantiate new tensor to use new blocks as memory instead of cudaMalloc
// //create 2 tensors of same size as input tensor
// Shape reduction_size = {1000};
-// Tensor indices(reduction_size, in->device(), in->data_type());
-// Tensor workspace(reduction_size, in->device(), in->data_type());
+// Tensor indices(reduction_size, in.device(), in.data_type());
+// Tensor workspace(reduction_size, in.device(), in.data_type());
// size_t indices_bytes = indices.block()->size()*1000;
// size_t workspace_bytes = workspace.block()->size()*1000;
// size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
@@ -842,9 +924,12 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
// cudnnTensorDescriptor_t in_desc, out_desc;
// cudnnCreateTensorDescriptor(&in_desc);
// cudnnCreateTensorDescriptor(&out_desc);
-// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-// //cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), reduce_row_axes_shape.data(), reduced_strides.data());
+// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
+// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
+// //cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
+// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
+// reduce_row_axes_shape.data(), reduced_strides.data());
// cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
// indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
// (void*)(&alpha), in_desc, inPtr, (void*)(&beta), out_desc, outPtr);
@@ -854,15 +939,15 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
// }
template <>
-void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t nrow = in->shape()[0];
- const size_t ncol = in->shape()[1];
+ const size_t nrow = in.shape()[0];
+ const size_t ncol = in.shape()[1];
- if(in->transpose()){
- Tensor t(in->shape(), in->device(), in->data_type());
+ if(in.transpose()){
+ Tensor t(in.shape(), in.device(), in.data_type());
float* tPtr = static_cast<float*>(t.block()->mutable_data());
float alpha = 1.0;
@@ -870,7 +955,7 @@ void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
cudnnTransformTensor(ctx->cudnn_handle,
(void*)(&alpha), generate_tensorND_desc(in), inPtr,
- (void*)(&beta), generate_tensorND_desc(&t), tPtr
+ (void*)(&beta), generate_tensorND_desc(t), tPtr
);
const float* tPtr_const = static_cast<const float*>(t.block()->data());