You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by wa...@apache.org on 2018/05/13 15:26:33 UTC

[06/10] incubator-singa git commit: Streamlining of tensor.h file by moving respective member functions to cpp or cuda file. Removal of shape_multipliers_ attribute in tensor.h. Changed read-in tensors to be passed as reference instead of pointer

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c52e2aa3/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 3e36877..6e86ca7 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -32,13 +32,88 @@
 
 namespace singa {
 
-cudnnTensorDescriptor_t generate_tensorND_desc(const Tensor* x){
+// ===================== Helper Functions =============================
+
+  /*  
+  cudnn requires tensor dimensions to fulfill 1 requirement:
+    1.) Dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors 
+        if input tensor is 5d, cudnn will take a 5d tensor as input. Beyond 5d, certain operations are not supported.
+        (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
+
+    for e.g. Tensor A has shape {3,3}, cudnn requires shape of {1,1,3,3} to be the input
+             Tensor B has shape {2,3,4}, cudnn requires shape of {1,2,3,4} to be the input
+  */
+  vector<int> generate_shape_cuda(const Tensor& x) {
+    Shape shape_ = x.shape();
+    vector<int> shape_arr;
+    if(shape_.size() <= 4){
+      for (size_t n=0; n<4-shape_.size(); ++n) {
+        shape_arr.push_back(1);
+      } 
+      for (size_t n=0; n<shape_.size(); ++n) {
+        shape_arr.push_back(shape_.at(n));
+      } 
+      return shape_arr;
+    } else if(shape_.size() == 5){
+      for (size_t n=0; n<shape_.size(); ++n) {
+        shape_arr.push_back(shape_.at(n));
+      } 
+      return shape_arr;
+    } else {
+      LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
+    }
+  }
+
+  int generate_dim_cuda(const Tensor& x) {
+    if(x.shape().size() <= 4){return 4;}
+    else if(x.shape().size() == 5){return 5;}
+    else{
+      LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
+    } 
+  }
+
+/*  
+  cudnn requires stride dimensions to conform to the format of the shape input as well
+    1.) Stride dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors
+        If input tensor is 5d, cudnn will take a 5d tensor as input. Beyond 5d, certain operations are not supported.
+        (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
+
+    for e.g. Tensor A has shape {3,3}, stride {3,1}, cudnn requires shape {1,1,3,3}
+    and stride {9, 9, 3, 1} or {9, 9, 1, 3} to be the inputs
+  */
+  vector<int> generate_strides_cuda(const Tensor& x) {
+    Shape shape_ = x.shape();
+    vector<int> strides_ = x.strides();
+    vector<int> strides_arr;
+    int product = 1;
+    for (size_t n=0; n<(shape_.size()); ++n) {
+      product *= shape_[n];
+    }
+    if(shape_.size() <= 4){
+      for (size_t n=0; n<4-shape_.size(); ++n) {
+        strides_arr.push_back(product);
+      } 
+      for (size_t n=0; n<strides_.size(); ++n) {
+          strides_arr.push_back(strides_[n]);
+        }
+      return strides_arr;
+    } else if(shape_.size() == 5){
+      for (size_t n=0; n<strides_.size(); ++n) {
+          strides_arr.push_back(strides_[n]);
+        }
+      return strides_arr;
+    } else {
+      LOG(FATAL) << "Dimensions (strides) beyond 5 are currently not supported" ;
+    }
+  }
+
+cudnnTensorDescriptor_t generate_tensorND_desc(const Tensor& x){
   cudnnTensorDescriptor_t x_desc;
   cudnnCreateTensorDescriptor(&x_desc);
   cudnnSetTensorNdDescriptor(x_desc, CUDNN_DATA_FLOAT,
-                             x->generate_dim_cuda(),
-                             x->generate_shape_cuda().data(),
-                             x->generate_strides_cuda().data()
+                             generate_dim_cuda(x),
+                             generate_shape_cuda(x).data(),
+                             generate_strides_cuda(x).data()
                              );
 
   return x_desc;
@@ -55,12 +130,13 @@ cudnnOpTensorDescriptor_t generate_Op_desc(cudnnOpTensorOp_t op){
   return op_desc;
 }
 
+// ===================== CUDA Functions =============================
 
 /// out[i] = |in[i]|
 template <>
-void Abs<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Abs<float, lang::Cuda>(const Tensor& in, Tensor* out,
                             Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
   float alpha1 = 1.0;
@@ -70,7 +146,7 @@ void Abs<float, lang::Cuda>(const Tensor* in, Tensor* out,
   cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_MAX),
                 (void*)(&alpha1), in_desc, inPtr, 
                 (void*)(&alpha2), in_desc, inPtr,
-                (void*)(&beta), generate_tensorND_desc(out), outPtr
+                (void*)(&beta), generate_tensorND_desc(*out), outPtr
                 );
   cudnnDestroyTensorDescriptor(in_desc);
 }
@@ -80,74 +156,74 @@ void Set<float, lang::Cuda>(const float x, Tensor* out,
                             Context* ctx) {
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
-  cudnnSetTensor(ctx->cudnn_handle, generate_tensorND_desc(out), 
+  cudnnSetTensor(ctx->cudnn_handle, generate_tensorND_desc(*out), 
                   outPtr, (void*)(&x));
 }
 
 template <>
-void Add<float, lang::Cuda>(const Tensor* in, const float x,
+void Add<float, lang::Cuda>(const Tensor& in, const float x,
                             Tensor* out, Context* ctx) {
   Set<float, lang::Cuda>(x, out, ctx);
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
   float alpha = 1.0, beta = 1.0;
   cudnnAddTensor(ctx->cudnn_handle,
                  (void*)(&alpha), generate_tensorND_desc(in), inPtr,
-                 (void*)(&beta), generate_tensorND_desc(out), outPtr
+                 (void*)(&beta), generate_tensorND_desc(*out), outPtr
                  );
 }
 
 /// out = in1 + in2
 template <>
-void Add<float, lang::Cuda>(const Tensor* in1,
-                            const Tensor* in2, Tensor* out, Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Add<float, lang::Cuda>(const Tensor& in1,
+                            const Tensor& in2, Tensor* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
   float alpha1 = 1.0;
   float alpha2 = 1.0;
   float beta = 0.0;
 
-  if((in1->nDim() == in2->nDim()) || (in2->nDim() == 1)){
+  if((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)){
     cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
               (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
               (void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
-              (void*)(&beta), generate_tensorND_desc(out), outPtr
+              (void*)(&beta), generate_tensorND_desc(*out), outPtr
               );
   } else {
     cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
           (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
           (void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
-          (void*)(&beta), generate_tensorND_desc(out), outPtr
+          (void*)(&beta), generate_tensorND_desc(*out), outPtr
           );
   }
 }
 
 /// out = in1 - in2
 template <>
-void Sub<float, lang::Cuda>(const Tensor* in1,
-                            const Tensor* in2, Tensor* out, Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Sub<float, lang::Cuda>(const Tensor& in1,
+                            const Tensor& in2, Tensor* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
   float alpha1 = 1.0;
   float alpha2 = -1.0;
   float beta = 0.0;
 
-  if((in1->nDim() == in2->nDim()) || (in2->nDim() == 1)){
+  if((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)){
     cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
               (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
               (void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
-              (void*)(&beta), generate_tensorND_desc(out), outPtr
+              (void*)(&beta), generate_tensorND_desc(*out), outPtr
               );
   } else {
     cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
           (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
           (void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
-          (void*)(&beta), generate_tensorND_desc(out), outPtr
+          (void*)(&beta), generate_tensorND_desc(*out), outPtr
           );
   }
 }
@@ -156,35 +232,35 @@ void Sub<float, lang::Cuda>(const Tensor* in1,
 /// if x>high, then x=high; if x<low, then x=low.
 template <>
 void Clamp<float, lang::Cuda>(const float low,
-                              const float high, const Tensor* in, Tensor* out,
+                              const float high, const Tensor& in, Tensor* out,
                               Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::clamp(num, low, high, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 /// out = in1 / in2
 template <>
-void Div<float, lang::Cuda>(const Tensor* in1,
-                            const Tensor* in2, Tensor* out, Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Div<float, lang::Cuda>(const Tensor& in1,
+                            const Tensor& in2, Tensor* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in1->Size();
+  const size_t num = in1.Size();
 
   //if both in1 and in2 strides are the same, we proceed to normal cuda::div
-  if(in1->strides() == in2->strides()){
+  if(in1.strides() == in2.strides()){
         cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
-        out->Set_Strides(in1->strides());
+        out->set_strides(in1.strides());
   } else { //else we transform in1 to out to store first
     float alpha = 1.0;
     float beta = 0.0;
 
-    out->Set_Strides(in2->strides());
+    out->set_strides(in2.strides());
     cudnnTransformTensor(ctx->cudnn_handle,
                         (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
-                        (void*)(&beta), generate_tensorND_desc(out), outPtr
+                        (void*)(&beta), generate_tensorND_desc(*out), outPtr
                         );
 
     cuda::div(num, outPtr, inPtr2, outPtr, ctx->stream);
@@ -192,51 +268,51 @@ void Div<float, lang::Cuda>(const Tensor* in1,
 }
 
 template <>
-void Div<float, lang::Cuda>(const float x, const Tensor* in,
+void Div<float, lang::Cuda>(const float x, const Tensor& in,
                             Tensor* out, Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::div(num, x, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 
 /// out = in * x
 template <>
-void EltwiseMult<float, lang::Cuda>(const Tensor* in,
+void EltwiseMult<float, lang::Cuda>(const Tensor& in,
                                     const float x, Tensor* out, Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
   float alpha = x, beta = 0.0;
   cudnnAddTensor(ctx->cudnn_handle,
                 (void*)(&alpha), generate_tensorND_desc(in), inPtr,
-                (void*)(&beta), generate_tensorND_desc(out), outPtr
+                (void*)(&beta), generate_tensorND_desc(*out), outPtr
                 );
 }
 
 /// out = in1 * in2
 template <>
-void EltwiseMult<float, lang::Cuda>(const Tensor* in1,
-                                    const Tensor* in2, Tensor* out,
+void EltwiseMult<float, lang::Cuda>(const Tensor& in1,
+                                    const Tensor& in2, Tensor* out,
                                     Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+  const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in1->Size();
+  const size_t num = in1.Size();
 
   //if both in1 and in2 strides are the same, we proceed to normal cuda::mult
-  if(in1->strides() == in2->strides()){ 
+  if(in1.strides() == in2.strides()){ 
         cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
-        out->Set_Strides(in1->strides());
+        out->set_strides(in1.strides());
   } else { //else we transform in1 to out to store first
     float alpha = 1.0;
     float beta = 0.0;
 
-    out->Set_Strides(in2->strides());
+    out->set_strides(in2.strides());
     cudnnTransformTensor(ctx->cudnn_handle,
                         (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
-                        (void*)(&beta), generate_tensorND_desc(out), outPtr
+                        (void*)(&beta), generate_tensorND_desc(*out), outPtr
                         );
 
     cuda::mult(num, outPtr, inPtr2, outPtr, ctx->stream);
@@ -246,138 +322,138 @@ void EltwiseMult<float, lang::Cuda>(const Tensor* in1,
 
 /// Base is e. out[i]=e^in[i]
 template <>
-void Exp<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Exp<float, lang::Cuda>(const Tensor& in, Tensor* out,
                             Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::exp(num, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 
 template <>
-void GE<float, lang::Cuda>(const Tensor* in, const float x,
+void GE<float, lang::Cuda>(const Tensor& in, const float x,
                            Tensor* out, Context* ctx) {
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->block()->data());
-  const size_t num = in->Size();
+  const float* inPtr = static_cast<const float*>(in.block()->data());
+  const size_t num = in.Size();
   cuda::ge(num, inPtr, x, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 template <>
-void GE<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+void GE<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context* ctx) {
   Sub<float, lang::Cuda>(in1, in2, out, ctx);
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
-  const size_t num = in1->Size();
+  // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+  const size_t num = in1.Size();
   //cuda::ge(num, inPtr1, inPtr2, outPtr, ctx->stream);
   cuda::ge(num, outPtr, 0.0, outPtr, ctx->stream);
 }
 
 
 template <>
-void GT<float, lang::Cuda>(const Tensor* in, const float x,
+void GT<float, lang::Cuda>(const Tensor& in, const float x,
                            Tensor* out, Context* ctx) {
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->block()->data());
-  const size_t num = in->Size();
+  const float* inPtr = static_cast<const float*>(in.block()->data());
+  const size_t num = in.Size();
   cuda::gt(num, inPtr, x, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 template <>
-void GT<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+void GT<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context* ctx) {
   Sub<float, lang::Cuda>(in1, in2, out, ctx);
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
-  const size_t num = in1->Size();
+  // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+  const size_t num = in1.Size();
   //cuda::gt(num, inPtr1, inPtr2, outPtr, ctx->stream);
   cuda::gt(num, outPtr, 0.0, outPtr, ctx->stream);
 }
 template <>
-void LE<float, lang::Cuda>(const Tensor* in, const float x,
+void LE<float, lang::Cuda>(const Tensor& in, const float x,
                            Tensor* out, Context* ctx) {
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->block()->data());
-  const size_t num = in->Size();
+  const float* inPtr = static_cast<const float*>(in.block()->data());
+  const size_t num = in.Size();
   cuda::le(num, inPtr, x, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 template <>
-void LE<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+void LE<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context* ctx) {
   Sub<float, lang::Cuda>(in1, in2, out, ctx);
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
-  const size_t num = in1->Size();
+  // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+  const size_t num = in1.Size();
   //cuda::le(num, inPtr1, inPtr2, outPtr, ctx->stream);
   cuda::le(num, outPtr, 0.0, outPtr, ctx->stream);
 }
 
 /// Natural logarithm, the base is e (Napier's number): out[i]=ln(in[i]).
 template <>
-void Log<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Log<float, lang::Cuda>(const Tensor& in, Tensor* out,
                             Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::log(num, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 template <>
-void LT<float, lang::Cuda>(const Tensor* in, const float x,
+void LT<float, lang::Cuda>(const Tensor& in, const float x,
                            Tensor* out, Context* ctx) {
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->block()->data());
-  const size_t num = in->Size();
+  const float* inPtr = static_cast<const float*>(in.block()->data());
+  const size_t num = in.Size();
   cuda::lt(num, inPtr, x, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 template <>
-void LT<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+void LT<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context* ctx) {
   Sub<float, lang::Cuda>(in1, in2, out, ctx);
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
-  const size_t num = in1->Size();
+  // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+  const size_t num = in1.Size();
   //cuda::lt(num, inPtr1, inPtr2, outPtr, ctx->stream);
   cuda::lt(num, outPtr, 0.0, outPtr, ctx->stream);
 }
 /// Element-wise operation, out[i] = in[i]^x
 template <>
-void Pow<float, lang::Cuda>(const Tensor* in, const float x,
+void Pow<float, lang::Cuda>(const Tensor& in, const float x,
                             Tensor* out, Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::pow(num, inPtr, x, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 /// Element-wise operation, out[i] = in1[i]^in2[i]
 template <>
-void Pow<float, lang::Cuda>(const Tensor* in1,
-                            const Tensor* in2, Tensor* out, Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Pow<float, lang::Cuda>(const Tensor& in1,
+                            const Tensor& in2, Tensor* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in1->Size();
+  const size_t num = in1.Size();
 
-  if(in1->strides() == in2->strides()){
+  if(in1.strides() == in2.strides()){
         cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
-        out->Set_Strides(in1->strides());
+        out->set_strides(in1.strides());
   } else { //else we transform in1 to out to store first
     float alpha = 1.0;
     float beta = 0.0;
 
-    out->Set_Strides(in2->strides());
+    out->set_strides(in2.strides());
     cudnnTransformTensor(ctx->cudnn_handle,
                         (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
-                        (void*)(&beta), generate_tensorND_desc(out), outPtr
+                        (void*)(&beta), generate_tensorND_desc(*out), outPtr
                         );
 
     cuda::pow(num, outPtr, inPtr2, outPtr, ctx->stream);
@@ -386,9 +462,9 @@ void Pow<float, lang::Cuda>(const Tensor* in1,
 
 /// Element-wise operation, out[i]=max(0, in[i])
 // template <>
-// void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// void ReLU<float, lang::Cuda>(const Tensor& in, Tensor* out,
 //                              Context* ctx) {
-//   const float* inPtr = static_cast<const float*>(in->block()->data());
+//   const float* inPtr = static_cast<const float*>(in.block()->data());
 //   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
 //   cudnnActivationDescriptor_t act_desc;
@@ -404,8 +480,10 @@ void Pow<float, lang::Cuda>(const Tensor* in1,
 //   cudnnTensorDescriptor_t in_desc, out_desc;
 //   cudnnCreateTensorDescriptor(&in_desc);
 //   cudnnCreateTensorDescriptor(&out_desc);
-//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(), 
+// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), 
+// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
 //   cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr, 
 //                         (void*)(&beta), out_desc, outPtr);
 
@@ -415,20 +493,20 @@ void Pow<float, lang::Cuda>(const Tensor* in1,
 // }
 
 template <>
-void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void ReLU<float, lang::Cuda>(const Tensor& in, Tensor* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::relu(num, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 
 // /// Element-wise operation, out[i]=sigmoid(in[i])
 // template <>
-// void Sigmoid<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// void Sigmoid<float, lang::Cuda>(const Tensor& in, Tensor* out,
 //                                 Context* ctx) {
-//   const float* inPtr = static_cast<const float*>(in->block()->data());
+//   const float* inPtr = static_cast<const float*>(in.block()->data());
 //   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
 //   cudnnActivationDescriptor_t act_desc;
@@ -444,8 +522,10 @@ void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
 //   cudnnTensorDescriptor_t in_desc, out_desc;
 //   cudnnCreateTensorDescriptor(&in_desc);
 //   cudnnCreateTensorDescriptor(&out_desc);
-//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(), 
+// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), 
+// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
 //   cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr, 
 //                         (void*)(&beta), out_desc, outPtr);
 
@@ -456,31 +536,31 @@ void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
 
 /// Element-wise operation, out[i]=sigmoid(in[i])
 template <>
-void Sigmoid<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Sigmoid<float, lang::Cuda>(const Tensor& in, Tensor* out,
                                 Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::sigmoid(num, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 
 // out[i] = sign(in[i])
 template <>
-void Sign<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Sign<float, lang::Cuda>(const Tensor& in, Tensor* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::sign(num, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 
 // Element-wise operation, out[i]=sqrt(in[i])
 template <>
-void Sqrt<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Sqrt<float, lang::Cuda>(const Tensor& in, Tensor* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   
   float alpha1 = 1.0;
@@ -490,39 +570,39 @@ void Sqrt<float, lang::Cuda>(const Tensor* in, Tensor* out,
   cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_SQRT),
                 (void*)(&alpha1), in_desc, inPtr, 
                 (void*)(&alpha2), in_desc, inPtr,
-                (void*)(&beta), generate_tensorND_desc(out), outPtr
+                (void*)(&beta), generate_tensorND_desc(*out), outPtr
                 );
 }
 
 /// Element-wise operation, out[i]=in[i]^2
 template <>
-void Square<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Square<float, lang::Cuda>(const Tensor& in, Tensor* out,
                                Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::square(num, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 
 // template <>
 // void Sum<float, lang::Cuda>(const size_t num, const Block* in, float* out,
 //                             Context* ctx) {
 //   LOG(FATAL) << "Cuda Sum is not implemented!";
-//   // const float* inPtr = static_cast<const float*>(in->data());
+//   // const float* inPtr = static_cast<const float*>(in.data());
 //   // cuda::sum(num, inPtr, out, ctx->stream);
 // }
 
 template <>
-void Sum<float, lang::Cuda>(const Tensor* in, float* out,
+void Sum<float, lang::Cuda>(const Tensor& in, float* out,
                             Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
 
    //reduce all axes to 1 for cudnnReduce, e.g. Tensor A with shape (2,4) will be reduced to (1)
    Shape reduced_shape = {1};
-   Tensor t(reduced_shape, in->device(), in->data_type());
+   Tensor t(reduced_shape, in.device(), in.data_type());
    float* tPtr = static_cast<float*>(t.block()->mutable_data());
-   vector<int> reduce_all_axes = in->generate_shape_cuda();
+   vector<int> reduce_all_axes = generate_shape_cuda(in);
    for (size_t n=0; n<reduce_all_axes.size(); ++n) {
     reduce_all_axes[n] = 1;
    }
@@ -539,10 +619,10 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
                                  cudnn_propagation, cudnn_indices, cudnn_indices_type);
 
   //instantiate 2 new tensors to use new blocks as memory instead of cudaMalloc
-  size_t reduction_size_int = Product(in->shape());
+  size_t reduction_size_int = Product(in.shape());
   Shape reduction_size = {reduction_size_int*100};
-  Tensor indices(reduction_size, in->device(), in->data_type());
-  Tensor workspace(reduction_size, in->device(), in->data_type());
+  Tensor indices(reduction_size, in.device(), in.data_type());
+  Tensor workspace(reduction_size, in.device(), in.data_type());
   size_t indices_bytes = indices.block()->size()*100;
   size_t workspace_bytes = workspace.block()->size()*100;
   size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
@@ -555,7 +635,7 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
   cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
                     indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
                     (void*)(&alpha), generate_tensorND_desc(in), inPtr,
-                    (void*)(&beta), generate_tensorND_desc(&t), tPtr
+                    (void*)(&beta), generate_tensorND_desc(t), tPtr
                     );
 
   *out = tPtr[0];
@@ -564,9 +644,9 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
 
 /// Element-wise operation, out[i]=tanh(in[i])
 // template <>
-// void Tanh<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// void Tanh<float, lang::Cuda>(const Tensor& in, Tensor* out,
 //                              Context* ctx) {
-//   const float* inPtr = static_cast<const float*>(in->block()->data());
+//   const float* inPtr = static_cast<const float*>(in.block()->data());
 //   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
 //   cudnnActivationDescriptor_t act_desc;
@@ -582,8 +662,10 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
 //   cudnnTensorDescriptor_t in_desc, out_desc;
 //   cudnnCreateTensorDescriptor(&in_desc);
 //   cudnnCreateTensorDescriptor(&out_desc);
-//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(), 
+// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), 
+// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
 //   cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr, 
 //                         (void*)(&beta), out_desc, outPtr);
 
@@ -593,13 +675,13 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
 // }
 
 template <>
-void Tanh<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Tanh<float, lang::Cuda>(const Tensor& in, Tensor* out,
                                 Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::tanh(num, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 
 // ================Random functions===========================================
@@ -643,65 +725,65 @@ void Gaussian<float, lang::Cuda>(const float mean,
 // =========================Blas operations==================================
 // ref to http://docs.nvidia.com/cuda/cublas
 template <>
-void Amax<float, lang::Cuda>(const Tensor* in, size_t* out,
+void Amax<float, lang::Cuda>(const Tensor& in, size_t* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   int idx = 1;
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   CUBLAS_CHECK(cublasIsamax(handle, num, inPtr, 1, &idx));
   *out = idx - 1;  // cublas index starts from 1
 }
 
 /// return the index of the element with the min value.
 template <>
-void Amin<float, lang::Cuda>(const Tensor* in, size_t* out,
+void Amin<float, lang::Cuda>(const Tensor& in, size_t* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   int idx = 1;
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   CUBLAS_CHECK(cublasIsamin(handle, num, inPtr, 1, &idx));
   *out = idx - 1;
 }
 
 /// out = sum |x| for all x in in
 template <>
-void Asum<float, lang::Cuda>(const Tensor* in, float* out,
+void Asum<float, lang::Cuda>(const Tensor& in, float* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   CUBLAS_CHECK(cublasSasum(handle, num, inPtr, 1, out));
 }
 
 /// out = alpha * in + out
 template <>
 void Axpy<float, lang::Cuda>(const float alpha,
-                             const Tensor* in, Tensor* out, Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+                             const Tensor& in, Tensor* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   CUBLAS_CHECK(cublasSaxpy(handle, num, &alpha, inPtr, 1, outPtr, 1));
 }
 
 /// out = \sum_i in1[i] * in2[i]
 template <>
-void Dot<float, lang::Cuda>(const Tensor* in1,
-                            const Tensor* in2, float* out, Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Dot<float, lang::Cuda>(const Tensor& in1,
+                            const Tensor& in2, float* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2.block()->data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  const size_t num = in1->Size();
+  const size_t num = in1.Size();
   CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, out));
 }
 template <>
-void Nrm2<float, lang::Cuda>(const Tensor* in, float* out,
+void Nrm2<float, lang::Cuda>(const Tensor& in, float* out,
                              Context* ctx) {
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  const float* inPtr = static_cast<const float*>(in->block()->data());
-  const size_t num = in->Size();
+  const float* inPtr = static_cast<const float*>(in.block()->data());
+  const size_t num = in.Size();
   cublasSnrm2(handle, num, inPtr, 1, out);
 }
 template <>
@@ -715,14 +797,14 @@ void Scale<float, lang::Cuda>(const float x, Tensor* out,
 // NOTE: cublas uses column major order.
 // http://peterwittek.com/cublas-matrix-c-style.html
 template <>
-void DGMM<float, lang::Cuda>(const bool side_right, const Tensor* M, const Tensor* v,
+void DGMM<float, lang::Cuda>(const bool side_right, const Tensor& M, const Tensor& v,
                              Tensor* out, Context* ctx) {
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  const float* MPtr = static_cast<const float*>(M->block()->data());
-  const float* vPtr = static_cast<const float*>(v->block()->data());
+  const float* MPtr = static_cast<const float*>(M.block()->data());
+  const float* vPtr = static_cast<const float*>(v.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t nrow = M->shape(0);
-  const size_t ncol = M->shape(1);
+  const size_t nrow = M.shape(0);
+  const size_t ncol = M.shape(1);
   if (side_right) {
     CUBLAS_CHECK(cublasSdgmm(handle, CUBLAS_SIDE_LEFT, ncol, nrow, MPtr, ncol,
                              vPtr, 1, outPtr, ncol));
@@ -732,16 +814,16 @@ void DGMM<float, lang::Cuda>(const bool side_right, const Tensor* M, const Tenso
   }
 }
 template <>
-void GEMV<float, lang::Cuda>(const float alpha, const Tensor* A, const Tensor* v,
+void GEMV<float, lang::Cuda>(const float alpha, const Tensor& A, const Tensor& v,
                              const float beta, Tensor* out, Context* ctx) {
-  const float* APtr = static_cast<const float*>(A->block()->data());
-  const float* vPtr = static_cast<const float*>(v->block()->data());
+  const float* APtr = static_cast<const float*>(A.block()->data());
+  const float* vPtr = static_cast<const float*>(v.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t m = A->shape()[0];
-  const size_t n = A->shape()[1];
+  const size_t m = A.shape()[0];
+  const size_t n = A.shape()[1];
 
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  if (!(A->transpose()))
+  if (!(A.transpose()))
     CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr,
                              1, &beta, outPtr, 1));
   else
@@ -752,20 +834,20 @@ void GEMV<float, lang::Cuda>(const float alpha, const Tensor* A, const Tensor* v
 // http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm
 template <>
 void GEMM<float, lang::Cuda>(const float alpha,
-                             const Tensor* A, const Tensor* B, const float beta,
+                             const Tensor& A, const Tensor& B, const float beta,
                              Tensor* C, Context* ctx) {
-  auto transA = A->transpose();
+  auto transA = A.transpose();
   auto transa = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
-  auto transB = B->transpose();
+  auto transB = B.transpose();
   auto transb = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
-  const size_t nrowA = A->shape()[0];
-  const size_t ncolA = A->shape()[1];
-  const size_t ncolB = B->shape()[1];
+  const size_t nrowA = A.shape()[0];
+  const size_t ncolA = A.shape()[1];
+  const size_t ncolB = B.shape()[1];
   int lda = transA ? nrowA : ncolA;
   int ldb = transB ? ncolA : ncolB;
   int ldc = ncolB;
-  const float* APtr = static_cast<const float*>(A->block()->data());
-  const float* BPtr = static_cast<const float*>(B->block()->data());
+  const float* APtr = static_cast<const float*>(A.block()->data());
+  const float* BPtr = static_cast<const float*>(B.block()->data());
   float* CPtr = static_cast<float*>(C->block()->mutable_data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
@@ -799,15 +881,15 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
 }
 
 // template <>
-// void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out,
 //                                Context* ctx) {
-//   const float* inPtr = static_cast<const float*>(in->block()->data());
+//   const float* inPtr = static_cast<const float*>(in.block()->data());
 //   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-//   // const size_t nrow = in->shape()[0];
-//   // const size_t ncol = in->shape()[1];
+//   // const size_t nrow = in.shape()[0];
+//   // const size_t ncol = in.shape()[1];
 //   // cuda::RowMax(nrow, ncol, inPtr, outPtr, ctx->stream);
 
-//   //vector<int> reduce_row_axes_shape = in->generate_shape_cuda();
+//   //vector<int> reduce_row_axes_shape = in.generate_shape_cuda();
 //   //reduce_row_axes_shape.back() = 1; //reduce axis 1, so we set last element d in shape {a,b,c,d} to 1
 
 //   vector<int> reduce_row_axes_shape = {1,1,1,1};
@@ -828,8 +910,8 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
 //   //instantiate new tensor to use new blocks as memory instead of cudaMalloc
 //   //create 2 tensors of same size as input tensor
 //   Shape reduction_size = {1000};
-//   Tensor indices(reduction_size, in->device(), in->data_type());
-//   Tensor workspace(reduction_size, in->device(), in->data_type());
+//   Tensor indices(reduction_size, in.device(), in.data_type());
+//   Tensor workspace(reduction_size, in.device(), in.data_type());
 //   size_t indices_bytes = indices.block()->size()*1000;
 //   size_t workspace_bytes = workspace.block()->size()*1000;
 //   size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
@@ -842,9 +924,12 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
 //   cudnnTensorDescriptor_t in_desc, out_desc;
 //   cudnnCreateTensorDescriptor(&in_desc);
 //   cudnnCreateTensorDescriptor(&out_desc);
-//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-//   //cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), reduce_row_axes_shape.data(), reduced_strides.data());
+//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(), 
+// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
+//   //cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), 
+// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), 
+// reduce_row_axes_shape.data(), reduced_strides.data());
 //   cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
 //                     indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
 //                     (void*)(&alpha), in_desc, inPtr, (void*)(&beta),  out_desc, outPtr);
@@ -854,15 +939,15 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
 // }
 
 template <>
-void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out,
                                Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t nrow = in->shape()[0];
-  const size_t ncol = in->shape()[1];
+  const size_t nrow = in.shape()[0];
+  const size_t ncol = in.shape()[1];
 
-  if(in->transpose()){
-    Tensor t(in->shape(), in->device(), in->data_type());
+  if(in.transpose()){
+    Tensor t(in.shape(), in.device(), in.data_type());
     float* tPtr = static_cast<float*>(t.block()->mutable_data());
 
     float alpha = 1.0;
@@ -870,7 +955,7 @@ void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
 
     cudnnTransformTensor(ctx->cudnn_handle,
                         (void*)(&alpha), generate_tensorND_desc(in), inPtr,
-                        (void*)(&beta), generate_tensorND_desc(&t), tPtr
+                        (void*)(&beta), generate_tensorND_desc(t), tPtr
                         );
 
     const float* tPtr_const = static_cast<const float*>(t.block()->data());