Posted to commits@singa.apache.org by wa...@apache.org on 2018/05/13 15:26:34 UTC

[07/10] incubator-singa git commit: Streamline the tensor.h file by moving the respective member functions into the cpp or cuda files. Remove the shape_multipliers_ attribute from tensor.h. Input tensors are now passed by reference instead of by pointer

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c52e2aa3/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index d4cd5da..1ca312a 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -23,7 +23,6 @@
 #include "singa/core/common.h"
 #include "singa/core/tensor.h"
 #include <math.h>
-#include <vector>
 
 #ifdef USE_CBLAS
 #include <cblas.h>
@@ -31,80 +30,134 @@
 
 namespace singa {
 
-// template <>
-// void Abs<float, lang::Cpp>(const Tensor* in, Tensor* out,
-//                            Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = fabs(inPtr[i]);
-//   }
-// }
+// ===================== Helper Functions =============================
+
+//generate the traversal_info vector (of length x.shape().size()+2, initialised to zeros) that the traverse_next function below requires
+vector<int> generate_traversal_info(const Tensor& x) {
+    vector<int> traversal_info = {};
+    for(size_t n=0; n<(x.shape().size()+2); ++n) {
+      traversal_info.push_back(0);
+    }
+    return traversal_info;
+};
+
+//generate shape multipliers
+//e.g. a tensor of shape (3,3) with strides (1,3) will have shape multipliers of (3,1)
+//e.g. a tensor of shape (3,3) with strides (3,1) will also have shape multipliers of (3,1)
+//the multipliers depend only on the shape: whenever the traversal counter is a multiple of 3,
+//a full row has just been traversed, so the next element is the starting element of the next row
+//and is reached via the outer stride; every other step uses the innermost stride
+vector<int> generate_shape_multipliers(const Tensor& x) {
+    Shape y_shape = x.shape();
+    if(y_shape.size()==0){
+      return {1};
+    }
+    vector<int> shape_multipliers = {1};
+    int cumulative_product = 1;
+
+    for (size_t n=0; n<(y_shape.size()-1); ++n) {
+        cumulative_product = cumulative_product*y_shape[y_shape.size()-1-n];
+        shape_multipliers.insert(shape_multipliers.begin(), cumulative_product);
+    }
+    return shape_multipliers;
+};
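+// a minimal worked example (values derived from the function above, for illustration only):
+// for x.shape() == (3,2) the loop runs once with cumulative_product = 2, giving {2,1};
+// counter values that are multiples of 2 therefore mark the start of a new row,
+// regardless of whether x is stored transposed or not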
+
+// ******************************************************************************************
+// CPP traversal operations (work on const tensor references without modifying the tensor variables)
+// ******************************************************************************************
+
+//this function checks whether the next counter value is a multiple of one of the outer shape multipliers,
+//so the algorithm knows when to jump over/back to a starting element of the outer shape
+//e.g. in [[1,4,7], [2,5,8], [3,6,9]], the elements 1, 2 and 3 are the starting elements of their respective rows
+//for a 2d matrix this additional check is only a single loop iteration per step,
+//but runtime performance might degrade towards O(nlog(n)) for higher-dimensional tensors
+int determine_order(vector<int>& shape_multipliers, int counter) {
+    for (size_t n=0; n<(shape_multipliers.size()-1); ++n) {
+        if((counter%shape_multipliers[n])==0){
+            return ((shape_multipliers.size()) - 1 - n);
+        }
+    }
+    return 0;
+};
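+// a small illustration (counter values chosen for this sketch): with shape_multipliers {3,1},
+// determine_order returns 1 for counter 6 (6 % 3 == 0, so the outer stride is used next)
+// and 0 for counter 7 (the innermost stride is used next)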
+
+//this function updates the base indexes with the current index after every traversal step;
+//it can be generalized beyond the 2d case
+void update_base_index(const Tensor& x, vector<int>& traversal_info) {
+    for (int n=0; n<(traversal_info[x.shape().size()+1]+1); ++n) {
+        traversal_info[n] = traversal_info[x.shape().size()];
+    }
+};
+
+//function to traverse a const strided tensor object
+//it requires an additional vector, traversal_info {0,0,0,0 ...}, comprising (x.shape().size()+2) elements of 0
+//e.g. for a 2d matrix:
+//indexes 0 and 1 store the base row and column index respectively
+//index 2 stores the current index of the traversal
+//index 3 stores the order of the traversal; e.g. if the order is 0,
+//the next element can be navigated to using the innermost stride
+void traverse_next(const Tensor& x,
+                   vector<int>& shape_multipliers, 
+                   vector<int>& traversal_info,
+                   int counter) {
+
+    update_base_index(x, traversal_info);
+    traversal_info[x.shape().size()+1] = determine_order(shape_multipliers, counter);
+    traversal_info[x.shape().size()] = traversal_info[traversal_info[x.shape().size()+1]] + 
+                                                   x.strides()[x.strides().size()-traversal_info[x.shape().size()+1]-1];
+};
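+// a hedged end-to-end trace (assuming strides() holds per-dimension element strides, as in the
+// examples above): for a tensor of shape (3,2) with strides (1,3), i.e. the transposed view of a
+// contiguous 2x3 block, calling traverse_next with counter = 1..5 moves traversal_info[2] through
+// the memory offsets 3, 1, 4, 2, 5 after the initial 0, which is exactly the transposed logical order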
+
+template <typename DType>
+void TraverseUnary(const Tensor &in, Tensor* out, std::function<DType(DType)> func){
+  DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
+  const DType *inPtr = static_cast<const DType *>(in.block()->data());
+  vector<int> traversal_info = generate_traversal_info(in);
+  vector<int> shape_multipliers = generate_shape_multipliers(in);
+
+  for (size_t i = 0; i < in.Size(); i++) { 
+    outPtr[i] = func(inPtr[traversal_info[in.shape().size()]]);
+    traverse_next(in, shape_multipliers, traversal_info, i+1);
+  }
+}
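+// usage sketch (this mirrors how the specializations below call it): apply an elementwise lambda,
+// reading `in` in its logical (stride-aware) order and writing `out` contiguously:
+//   TraverseUnary<float>(in, out, [](float x) { return fabs(x); });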
+
+template <typename DType>
+void TraverseBinary(const Tensor &in1, const Tensor &in2, Tensor* out, 
+                    std::function<DType(DType, DType)> func){
+  DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
+  const DType *in1Ptr = static_cast<const DType *>(in1.block()->data());
+  const DType *in2Ptr = static_cast<const DType *>(in2.block()->data());
+  vector<int> traversal_info_in1 = generate_traversal_info(in1);
+  vector<int> traversal_info_in2 = generate_traversal_info(in2);
+  vector<int> shape_multipliers_in1 = generate_shape_multipliers(in1);
+  vector<int> shape_multipliers_in2 = generate_shape_multipliers(in2);
+
+  for (size_t i = 0; i < in1.Size(); i++) {
+    outPtr[i] = func(in1Ptr[traversal_info_in1[in1.shape().size()]],
+                     in2Ptr[traversal_info_in2[in2.shape().size()]]);
+    traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i+1);
+    traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i+1);
+  }
+}
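+// usage sketch (mirrors the binary specializations below), assuming in1 and in2 share the same
+// logical shape so that a single counter drives both traversals:
+//   TraverseBinary<float>(in1, in2, out, [](float a, float b) { return a + b; });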
+
+// ******************************************************************************************
+// traversal operations end
+// ******************************************************************************************
+
+// ===================== CPP Functions =============================
 
 template <>
-void Abs<float, lang::Cpp>(const Tensor* in, Tensor* out, Context *ctx) {
+void Abs<float, lang::Cpp>(const Tensor& in, Tensor* out, Context *ctx) {
   TraverseUnary<float>(in, out, [](float x) {return fabs(x);});
 }
 
-// template <>
-// void Add<float, lang::Cpp>(const Tensor* in, const float x,
-//                            Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = inPtr[i] + x;
-//   }
-// }
-
-// template <>
-// void Add<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->block()->data());
-//   vector<int> traversal_info = in->generate_traversal_info();
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = inPtr[traversal_info[in->shape().size()]] + x;
-//     in->traverse_next(traversal_info, i+1);
-//   }
-// }
-
 template <>
-void Add<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out, Context *ctx) {
+void Add<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out, Context *ctx) {
   auto add_lambda = [&x](float a) {
     return (a+x);
   };
   TraverseUnary<float>(in, out, add_lambda);
 }
 
-// template <>
-// void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                            Tensor* out, Context *ctx) {
-//   // CHECK_EQ(ctx->stream, nullptr);
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *in1Ptr = static_cast<const float *>(in1->data());
-//   const float *in2Ptr = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = in1Ptr[i] + in2Ptr[i];
-//   }
-// }
-
-// template <>
-// void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, Context *ctx) {
-//   // CHECK_EQ(ctx->stream, nullptr);
-//   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-//   const float *in1Ptr = static_cast<const float *>(in1->block()->data());
-//   const float *in2Ptr = static_cast<const float *>(in2->block()->data());
-//   //call axpy if both strides are 1?
-//   vector<int> traversal_info_in1 = in1->generate_traversal_info();
-//   vector<int> traversal_info_in2 = in2->generate_traversal_info();
-//   for (size_t i = 0; i < in1->Size(); i++) {
-//     outPtr[i] = in1Ptr[traversal_info_in1[in1->shape().size()]] + in2Ptr[traversal_info_in2[in2->shape().size()]];
-//     in1->traverse_next(traversal_info_in1, i+1);
-//     in2->traverse_next(traversal_info_in2, i+1);
-//   }
-// }
-
 template <>
-void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, Context *ctx) {
+void Add<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out, Context *ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
   auto add_lambda_binary = [](float a, float b) {
     return (a+b);
@@ -113,46 +166,9 @@ void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, Co
   
 }
 
-// template <>
-// void Clamp<float, lang::Cpp>(const float low,
-//                              const float high, const Tensor* in, Tensor* out,
-//                              Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     if (inPtr[i] > high) {
-//       outPtr[i] = high;
-//     } else if (inPtr[i] < low) {
-//       outPtr[i] = low;
-//     } else {
-//       outPtr[i] = inPtr[i];
-//     }
-//   }
-// }
-
-// template <>
-// void Clamp<float, lang::Cpp>(const Tensor* in, const float low,
-//                              const float high, Tensor* out,
-//                              Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->block()->data());
-//   vector<int> traversal_info = in->generate_traversal_info();
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     int traversed_index = traversal_info[in->shape().size()];
-//     if (inPtr[traversed_index] > high) {
-//       outPtr[i] = high;
-//     } else if (inPtr[traversed_index] < low) {
-//       outPtr[i] = low;
-//     } else {
-//       outPtr[i] = inPtr[traversed_index];
-//     }
-//     in->traverse_next(traversal_info, i+1);
-//   }
-// }
-
 template <>
 void Clamp<float, lang::Cpp>(const float low, const float high,
-                             const Tensor* in, Tensor* out,
+                             const Tensor& in, Tensor* out,
                              Context *ctx) {
   auto clamp_lambda = [&low, &high](float a) {
     if(a < low){return low;}
@@ -162,73 +178,42 @@ void Clamp<float, lang::Cpp>(const float low, const float high,
   TraverseUnary<float>(in, out, clamp_lambda);
 }
 
-
-// template <>
-// void Div<float, lang::Cpp>(const float x, const Tensor* in,
-//                            Tensor* out, Context *ctx) {
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     CHECK_NE(inPtr[i], 0.f);
-//     outPtr[i] = x / inPtr[i];
-//   }
-// }
-
 template <>
-void Div<float, lang::Cpp>(const float x, const Tensor* in, Tensor* out,
+void Div<float, lang::Cpp>(const float x, const Tensor& in, Tensor* out,
                            Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->block()->data());
+  const float *inPtr = static_cast<const float *>(in.block()->data());
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  vector<int> traversal_info = in->generate_traversal_info();
-  for (size_t i = 0; i < in->Size(); i++) {
-    CHECK_NE(inPtr[traversal_info[in->shape().size()]], 0.f);
-    outPtr[i] = x / inPtr[traversal_info[in->shape().size()]];
-    in->traverse_next(traversal_info, i+1);
+  vector<int> traversal_info = generate_traversal_info(in);
+  vector<int> shape_multipliers = generate_shape_multipliers(in);
+
+  for (size_t i = 0; i < in.Size(); i++) {
+    CHECK_NE(inPtr[traversal_info[in.shape().size()]], 0.f);
+    outPtr[i] = x / inPtr[traversal_info[in.shape().size()]];
+    traverse_next(in, shape_multipliers, traversal_info, i+1);
   }
 }
 
-
-// template <>
-// void Div<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                            Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *in1Ptr = static_cast<const float *>(in1->data());
-//   const float *in2Ptr = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     CHECK_NE(in2Ptr[i], 0.f);
-//     outPtr[i] = in1Ptr[i] / in2Ptr[i];
-//   }
-// }
-
 template <>
-void Div<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+void Div<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->block()->data());
-  const float *in2Ptr = static_cast<const float *>(in2->block()->data());
-  vector<int> traversal_info_in1 = in1->generate_traversal_info();
-  vector<int> traversal_info_in2 = in2->generate_traversal_info();
-  for (size_t i = 0; i < in1->Size(); i++) {
-    CHECK_NE(in2Ptr[traversal_info_in2[in2->shape().size()]], 0.f);
-    outPtr[i] = in1Ptr[traversal_info_in1[in1->shape().size()]] / in2Ptr[traversal_info_in2[in2->shape().size()]];
-    in1->traverse_next(traversal_info_in1, i+1);
-    in2->traverse_next(traversal_info_in2, i+1);
+  const float *in1Ptr = static_cast<const float *>(in1.block()->data());
+  const float *in2Ptr = static_cast<const float *>(in2.block()->data());
+  vector<int> traversal_info_in1 = generate_traversal_info(in1);
+  vector<int> traversal_info_in2 = generate_traversal_info(in2);
+  vector<int> shape_multipliers_in1 = generate_shape_multipliers(in1);
+  vector<int> shape_multipliers_in2 = generate_shape_multipliers(in2);
+
+  for (size_t i = 0; i < in1.Size(); i++) {
+    CHECK_NE(in2Ptr[traversal_info_in2[in2.shape().size()]], 0.f);
+    outPtr[i] = in1Ptr[traversal_info_in1[in1.shape().size()]] / in2Ptr[traversal_info_in2[in2.shape().size()]];
+    traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i+1);
+    traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i+1);
   }
 }
 
-
-// template <>
-// void EltwiseMult<float, lang::Cpp>(const Tensor* in,
-//                                    const float x, Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = inPtr[i] * x;
-//   }
-// }
-
 template <>
-void EltwiseMult<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+void EltwiseMult<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
                                    Context *ctx) {
   auto eltwisemult_lambda = [&x](float a) {
     return (a*x);
@@ -236,19 +221,8 @@ void EltwiseMult<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
   TraverseUnary<float>(in, out, eltwisemult_lambda);
 }
 
-// template <>
-// void EltwiseMult<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, 
-//                                    Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *in1Ptr = static_cast<const float *>(in1->data());
-//   const float *in2Ptr = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = in1Ptr[i] * in2Ptr[i];
-//   }
-// }
-
 template <>
-void EltwiseMult<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, 
+void EltwiseMult<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out, 
                                    Context *ctx) {
   auto eltwisemult_lambda_binary = [](float a, float b) {
     return (a*b);
@@ -256,33 +230,13 @@ void EltwiseMult<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor*
   TraverseBinary<float>(in1, in2, out, eltwisemult_lambda_binary);
 }
 
-// template <>
-// void Exp<float, lang::Cpp>(const Tensor* in, Tensor* out,
-//                            Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = exp(inPtr[i]);
-//   }
-// }
-
 template <>
-void Exp<float, lang::Cpp>(const Tensor* in, Tensor *out, Context *ctx) {
+void Exp<float, lang::Cpp>(const Tensor& in, Tensor *out, Context *ctx) {
   TraverseUnary<float>(in, out, [](float x) {return exp(x);});
 }
 
-// template <>
-// void GE<float, lang::Cpp>(const Tensor* in, const float x,
-//                           Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
-//   }
-// }
-
 template <>
-void GE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+void GE<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
                           Context *ctx) {
   auto ge_lambda = [&x](float a) {
     return (a >= x) ? 1.f : 0.f;
@@ -290,19 +244,8 @@ void GE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
   TraverseUnary<float>(in, out, ge_lambda);
 }
 
-// template <>
-// void GE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                           Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr1 = static_cast<const float *>(in1->data());
-//   const float *inPtr2 = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr1[i] >= inPtr2[i]) ? 1.f : 0.f;
-//   }
-// }
-
 template <>
-void GE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+void GE<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
                           Context *ctx) {
   auto ge_lambda_binary = [](float a, float b) {
     return (a >= b) ? 1.f : 0.f;
@@ -310,18 +253,8 @@ void GE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
   TraverseBinary<float>(in1, in2, out, ge_lambda_binary);
 }
 
-// template <>
-// void GT<float, lang::Cpp>(const Tensor* in, const float x,
-//                           Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
-//   }
-// }
-
 template <>
-void GT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+void GT<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
                           Context *ctx) {
   auto gt_lambda = [&x](float a) {
     return (a > x) ? 1.f : 0.f;
@@ -329,19 +262,8 @@ void GT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
   TraverseUnary<float>(in, out, gt_lambda);
 }
 
-// template <>
-// void GT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                           Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr1 = static_cast<const float *>(in1->data());
-//   const float *inPtr2 = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr1[i] > inPtr2[i]) ? 1.f : 0.f;
-//   }
-// }
-
 template <>
-void GT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+void GT<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
                           Context *ctx) {
   auto gt_lambda_binary = [](float a, float b) {
     return (a > b) ? 1.f : 0.f;
@@ -349,18 +271,8 @@ void GT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
   TraverseBinary<float>(in1, in2, out, gt_lambda_binary);
 }
 
-// template <>
-// void LE<float, lang::Cpp>(const Tensor* in, const float x,
-//                           Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
-//   }
-// }
-
 template <>
-void LE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+void LE<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
                           Context *ctx) {
   auto le_lambda = [&x](float a) {
     return (a <= x) ? 1.f : 0.f;
@@ -368,19 +280,8 @@ void LE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
   TraverseUnary<float>(in, out, le_lambda);
 }
 
-// template <>
-// void LE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                           Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr1 = static_cast<const float *>(in1->data());
-//   const float *inPtr2 = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr1[i] <= inPtr2[i]) ? 1.f : 0.f;
-//   }
-// }
-
 template <>
-void LE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+void LE<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
                           Context *ctx) {
   auto le_lambda_binary = [](float a, float b) {
     return (a <= b) ? 1.f : 0.f;
@@ -388,42 +289,23 @@ void LE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
   TraverseBinary<float>(in1, in2, out, le_lambda_binary);
 }
 
-// template <>
-// void Log<float, lang::Cpp>(const Tensor* in, Tensor* out,
-//                            Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     CHECK_GT(inPtr[i], 0.f);
-//     outPtr[i] = log(inPtr[i]);
-//   }
-// }
-
 template <>
-void Log<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void Log<float, lang::Cpp>(const Tensor& in, Tensor* out,
                            Context *ctx) {
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  vector<int> traversal_info = in->generate_traversal_info();
-  for (size_t i = 0; i < in->Size(); i++) {
-    CHECK_GT(inPtr[traversal_info[in->shape().size()]], 0.f);
-    outPtr[i] = log(inPtr[traversal_info[in->shape().size()]]);
-    in->traverse_next(traversal_info, i+1);
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  vector<int> traversal_info = generate_traversal_info(in);
+  vector<int> shape_multipliers = generate_shape_multipliers(in);
+
+  for (size_t i = 0; i < in.Size(); i++) {
+    CHECK_GT(inPtr[traversal_info[in.shape().size()]], 0.f);
+    outPtr[i] = log(inPtr[traversal_info[in.shape().size()]]);
+    traverse_next(in, shape_multipliers, traversal_info, i+1);
   }
 }
 
-// template <>
-// void LT<float, lang::Cpp>(const Tensor* in, const float x,
-//                           Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
-//   }
-// }
-
 template <>
-void LT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+void LT<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
                           Context *ctx) {
   auto lt_lambda = [&x](float a) {
     return (a < x) ? 1.f : 0.f;
@@ -431,19 +313,9 @@ void LT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
   TraverseUnary<float>(in, out, lt_lambda);
 }
 
-// template <>
-// void LT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                           Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr1 = static_cast<const float *>(in1->data());
-//   const float *inPtr2 = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr1[i] < inPtr2[i]) ? 1.f : 0.f;
-//   }
-// }
 
 template <>
-void LT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+void LT<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
                           Context *ctx) {
   auto lt_lambda_binary = [](float a, float b) {
     return (a < b) ? 1.f : 0.f;
@@ -451,34 +323,13 @@ void LT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
   TraverseBinary<float>(in1, in2, out, lt_lambda_binary);
 }
 
-// template <>
-// void Pow<float, lang::Cpp>(const Tensor* in, const float x,
-//                            Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = pow(inPtr[i], x);
-//   }
-// }
-
 template <>
-void Pow<float, lang::Cpp>(const Tensor* in, const float x, Tensor *out, Context *ctx) {
+void Pow<float, lang::Cpp>(const Tensor& in, const float x, Tensor *out, Context *ctx) {
   TraverseUnary<float>(in, out, [x](float y) {return pow(y,x);});
 }
 
-// template <>
-// void Pow<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                            Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *in1Ptr = static_cast<const float *>(in1->data());
-//   const float *in2Ptr = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
-//   }
-// }
-
 template <>
-void Pow<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+void Pow<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
                            Context *ctx) {
   auto pow_lambda_binary = [](float a, float b) {
     return pow(a,b);
@@ -486,18 +337,8 @@ void Pow<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
   TraverseBinary<float>(in1, in2, out, pow_lambda_binary);
 }
 
-// template <>
-// void ReLU<float, lang::Cpp>(const Tensor* in, Tensor* out,
-//                             Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
-//   }
-// }
-
 template <>
-void ReLU<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void ReLU<float, lang::Cpp>(const Tensor& in, Tensor* out,
                           Context *ctx) {
   auto relu_lambda = [](float a) {
     return (a >= 0.f) ? a : 0.f;
@@ -505,13 +346,6 @@ void ReLU<float, lang::Cpp>(const Tensor* in, Tensor* out,
   TraverseUnary<float>(in, out, relu_lambda);
 }
 
-// template <>
-// void Set<float, lang::Cpp>(const float x, Tensor* out,
-//                            Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   for (size_t i = 0; i < in->Size(); i++) outPtr[i] = x;
-// }
-
 template <>
 void Set<float, lang::Cpp>(const float x, Tensor* out,
                            Context *ctx) {
@@ -519,13 +353,6 @@ void Set<float, lang::Cpp>(const float x, Tensor* out,
   for (size_t i = 0; i < out->Size(); i++) outPtr[i] = x;
 }
 
-// template <>
-// void Set<int, lang::Cpp>(const int x, Tensor* out,
-//                            Context *ctx) {
-//   int *outPtr = static_cast<int *>(out->mutable_data());
-//   for (size_t i = 0; i < in->Size(); i++) outPtr[i] = x;
-// }
-
 template <>
 void Set<int, lang::Cpp>(const int x, Tensor* out,
                            Context *ctx) {
@@ -533,18 +360,8 @@ void Set<int, lang::Cpp>(const int x, Tensor* out,
   for (size_t i = 0; i < out->Size(); i++) outPtr[i] = x;
 }
 
-// template <>
-// void Sigmoid<float, lang::Cpp>(const Tensor* in, Tensor* out,
-//                                Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
-//   }
-// }
-
 template <>
-void Sigmoid<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void Sigmoid<float, lang::Cpp>(const Tensor& in, Tensor* out,
                           Context *ctx) {
   auto sigmoid_lambda = [](float a) {
     return 1.f / (1.f + exp(-a));
@@ -552,18 +369,8 @@ void Sigmoid<float, lang::Cpp>(const Tensor* in, Tensor* out,
   TraverseUnary<float>(in, out, sigmoid_lambda);
 }
 
-// template <>
-// void Sign<float, lang::Cpp>(const Tensor* in, Tensor* out,
-//                             Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr[i] > 0) - (inPtr[i] < 0);
-//   }
-// }
-
 template <>
-void Sign<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void Sign<float, lang::Cpp>(const Tensor& in, Tensor* out,
                           Context *ctx) {
   auto sign_lambda = [](float a) {
     return (a > 0) - (a < 0);
@@ -571,56 +378,23 @@ void Sign<float, lang::Cpp>(const Tensor* in, Tensor* out,
   TraverseUnary<float>(in, out, sign_lambda);
 }
 
-// template <>
-// void Sqrt<float, lang::Cpp>(const Tensor* in, Tensor* out,
-//                             Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     CHECK_GE(inPtr[i], 0.f);
-//     outPtr[i] = sqrt(inPtr[i]);
-//   }
-// }
-
 template <>
-void Sqrt<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void Sqrt<float, lang::Cpp>(const Tensor& in, Tensor* out,
                             Context *ctx) {
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  vector<int> traversal_info = in->generate_traversal_info();
-  for (size_t i = 0; i < in->Size(); i++) {
-    CHECK_GE(inPtr[traversal_info[in->shape().size()]], 0.f);
-    outPtr[i] = sqrt(inPtr[traversal_info[in->shape().size()]]);
-    in->traverse_next(traversal_info, i+1);
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  vector<int> traversal_info = generate_traversal_info(in);
+  vector<int> shape_multipliers = generate_shape_multipliers(in);
+
+  for (size_t i = 0; i < in.Size(); i++) {
+    CHECK_GE(inPtr[traversal_info[in.shape().size()]], 0.f);
+    outPtr[i] = sqrt(inPtr[traversal_info[in.shape().size()]]);
+    traverse_next(in, shape_multipliers, traversal_info, i+1);
   }
 }
 
-/*
 template <>
-void Square<float, lang::Cpp>(const Tensor* in, Tensor* out,
-                              Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < in->Size(); i++) {
-    outPtr[i] = inPtr[i] * inPtr[i];
-  }
-}
-*/
-
-// template <>
-// void Sub<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                            Tensor* out, Context *ctx) {
-//   // CHECK_EQ(ctx->stream, nullptr);
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *in1Ptr = static_cast<const float *>(in1->data());
-//   const float *in2Ptr = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = in1Ptr[i] - in2Ptr[i];
-//   }
-// }
-
-template <>
-void Sub<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+void Sub<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context *ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
   auto sub_lambda_binary = [](float a, float b) {
@@ -632,28 +406,18 @@ void Sub<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
 // sum all elements of input into out
 // TODO(wangwei) optimize using omp
 template <>
-void Sum<float, lang::Cpp>(const Tensor* in, float *out,
+void Sum<float, lang::Cpp>(const Tensor& in, float *out,
                            Context *ctx) {
   float s = 0.f;
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  for (size_t i = 0; i < in->Size(); i++) {
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  for (size_t i = 0; i < in.Size(); i++) {
     s += inPtr[i];
   }
   *out = s;
 }
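+// a possible OpenMP variant for the TODO above (a sketch only, assuming OpenMP is enabled at build time):
+//   float s = 0.f;
+//   #pragma omp parallel for reduction(+ : s)
+//   for (size_t i = 0; i < in.Size(); i++) s += inPtr[i];
+//   *out = s;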
 
-// template <>
-// void Tanh<float, lang::Cpp>(const Tensor* in, Tensor* out,
-//                             Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = tanh(inPtr[i]);
-//   }
-// }
-
 template <>
-void Tanh<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void Tanh<float, lang::Cpp>(const Tensor& in, Tensor* out,
                           Context *ctx) {
   auto tanh_lambda = [](float a) {
     return tanh(a);
@@ -661,17 +425,6 @@ void Tanh<float, lang::Cpp>(const Tensor* in, Tensor* out,
   TraverseUnary<float>(in, out, tanh_lambda);
 }
 
-// ===============Random operations==========================================
-// template <>
-// void Bernoulli<float, lang::Cpp>(const float p, Tensor* out,
-//                                  Context *ctx) {
-//   std::bernoulli_distribution distribution(p);
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
-//   }
-// }
-
 template <>
 void Bernoulli<float, lang::Cpp>(const float p, Tensor* out,
                                  Context *ctx) {
@@ -682,16 +435,6 @@ void Bernoulli<float, lang::Cpp>(const float p, Tensor* out,
   }
 }
 
-// template <>
-// void Gaussian<float, lang::Cpp>(const float mean,
-//                                 const float std, Tensor* out, Context *ctx) {
-//   std::normal_distribution<float> distribution(mean, std);
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
-//   }
-// }
-
 template <>
 void Gaussian<float, lang::Cpp>(const float mean,
                                 const float std, Tensor* out, Context *ctx) {
@@ -702,16 +445,6 @@ void Gaussian<float, lang::Cpp>(const float mean,
   }
 }
 
-// template <>
-// void Uniform<float, lang::Cpp>(const float low,
-//                                const float high, Tensor* out, Context *ctx) {
-//   std::uniform_real_distribution<float> distribution(low, high);
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
-//   }
-// }
-
 template <>
 void Uniform<float, lang::Cpp>(const float low,
                                const float high, Tensor* out, Context *ctx) {
@@ -727,113 +460,72 @@ void Uniform<float, lang::Cpp>(const float low,
 //warning, this function overwrites block M onto block M itself
 template <>
 void DGMM<float, lang::Cpp>(const bool side_right,
-                            const Tensor* M, const Tensor* v,
+                            const Tensor& M, const Tensor& v,
                             Tensor* out, Context *ctx) {
-  const float *MPtr = static_cast<const float *>(M->block()->data());
-  const float *vPtr = static_cast<const float *>(v->block()->data());
+  const float *MPtr = static_cast<const float *>(M.block()->data());
+  const float *vPtr = static_cast<const float *>(v.block()->data());
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const size_t nrow = M->shape(0);
-  const size_t ncol = M->shape(1);
-  vector<int> traversal_info = M->generate_traversal_info();
+  const size_t nrow = M.shape(0);
+  const size_t ncol = M.shape(1);
+  vector<int> traversal_info = generate_traversal_info(M);
+  vector<int> shape_multipliers = generate_shape_multipliers(M);
 
   if (side_right) {
     for (size_t r = 0; r < nrow; r++) {
       size_t offset = r * ncol;
       for (size_t c = 0; c < ncol; c++) {
-        outPtr[traversal_info[M->shape().size()]] = MPtr[traversal_info[M->shape().size()]] * vPtr[c];
-        M->traverse_next(traversal_info, offset+c+1);
+        outPtr[traversal_info[M.shape().size()]] = MPtr[traversal_info[M.shape().size()]] * vPtr[c];
+        traverse_next(M, shape_multipliers, traversal_info, offset+c+1);
       }
     }
   } else {
     for (size_t r = 0; r < nrow; r++) {
       size_t offset = r * ncol;
       for (size_t c = 0; c < ncol; c++) {
-        outPtr[traversal_info[M->shape().size()]] = MPtr[traversal_info[M->shape().size()]] * vPtr[r];
-        M->traverse_next(traversal_info, offset+c+1);
+        outPtr[traversal_info[M.shape().size()]] = MPtr[traversal_info[M.shape().size()]] * vPtr[r];
+        traverse_next(M, shape_multipliers, traversal_info, offset+c+1);
       }
     }
   }
 }
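+// in other words (a summary of the two branches above): with side_right == true each column c of M
+// is scaled by v[c] (out = M * diag(v)); otherwise each row r is scaled by v[r] (out = diag(v) * M)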
 
-// #ifdef USE_CBLAS
-// template <>
-// void Amax<float, lang::Cpp>(const Tensor* in, size_t *out,
-//                             Context *ctx) {
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   *out = cblas_isamax(in->Size(), inPtr, 1);
-// }
-
-// template <>
-// void Asum<float, lang::Cpp>(const Tensor* in, float *out,
-//                             Context *ctx) {
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   *out = cblas_sasum(in->Size(), inPtr, 1);
-// }
-
-// template <>
-// void Axpy<float, lang::Cpp>(const float alpha,
-//                             const Tensor* in, Tensor* out, Context *ctx) {
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   cblas_saxpy(in->Size(), alpha, inPtr, 1, outPtr, 1);
-// }
-
-// template <>
-// void Dot<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                            float *out, Context *ctx) {
-//   const float *in1Ptr = static_cast<const float *>(in1->data());
-//   const float *in2Ptr = static_cast<const float *>(in2->data());
-//   *out = cblas_sdot(in->Size(), in1Ptr, 1, in2Ptr, 1);
-// }
-// template <>
-// void Scale<float, lang::Cpp>(const float x, Tensor* out,
-//                              Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   cblas_sscal(in->Size(), x, outPtr, 1);
-// }
-// template <>
-// void Nrm2<float, lang::Cpp>(const Tensor* in, float *out,
-//                             Context *ctx) {
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   *out = cblas_snrm2(in->Size(), inPtr, 1);
-// }
 
 #ifdef USE_CBLAS
 template <>
-void Amax<float, lang::Cpp>(const Tensor *in, size_t *out,
+void Amax<float, lang::Cpp>(const Tensor& in, size_t *out,
                             Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  *out = cblas_isamax(in->Size(), inPtr, 1); //not using strided traversal
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  *out = cblas_isamax(in.Size(), inPtr, 1); //not using strided traversal
 }
 
 template <>
-void Asum<float, lang::Cpp>(const Tensor *in, float *out,
+void Asum<float, lang::Cpp>(const Tensor& in, float *out,
                             Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  *out = cblas_sasum(in->Size(), inPtr, 1); //not using strided traversal
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  *out = cblas_sasum(in.Size(), inPtr, 1); //not using strided traversal
 }
 
 template <>
 void Axpy<float, lang::Cpp>(const float alpha,
-                            const Tensor *in, Tensor *out, Context *ctx) {
+                            const Tensor& in, Tensor *out, Context *ctx) {
   //check input tensor for strides first
-  if(in->strides() != out->strides()){
-    const float *inPtr = static_cast<const float *>(in->block()->data());
+  if(in.strides() == out->strides()){
+    const float *inPtr = static_cast<const float *>(in.block()->data());
     float *outPtr = static_cast<float *>(out->block()->mutable_data());
-    cblas_saxpy(in->Size(), alpha, inPtr, 1, outPtr, 1);
+    cblas_saxpy(in.Size(), alpha, inPtr, 1, outPtr, 1);
   } else {
     LOG(FATAL) << "Axpy, input and output strides do not match." ;
   }
 }
 
 template <>
-void Dot<float, lang::Cpp>(const Tensor *in1, const Tensor *in2,
+void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
                            float *out, Context *ctx) {
   //check input tensor for strides first
-  if(!(in1->transpose()) && !(in2->transpose())){
-    const float *in1Ptr = static_cast<const float *>(in1->block()->data());
-    const float *in2Ptr = static_cast<const float *>(in2->block()->data());
-    *out = cblas_sdot(in1->Size(), in1Ptr, 1, in2Ptr, 1);
+  if(!(in1.transpose()) && !(in2.transpose())){
+    const float *in1Ptr = static_cast<const float *>(in1.block()->data());
+    const float *in2Ptr = static_cast<const float *>(in2.block()->data());
+    *out = cblas_sdot(in1.Size(), in1Ptr, 1, in2Ptr, 1);
   } else {
     LOG(FATAL) << "Dot, one of the input is tranposed. Not implemented yet." ;
   }
@@ -847,40 +539,21 @@ void Scale<float, lang::Cpp>(const float x, Tensor *out,
 }
 
 template <>
-void Nrm2<float, lang::Cpp>(const Tensor *in, float *out,
+void Nrm2<float, lang::Cpp>(const Tensor& in, float *out,
                             Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  *out = cblas_snrm2(in->Size(), inPtr, 1); //not using strided traversal
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  *out = cblas_snrm2(in.Size(), inPtr, 1); //not using strided traversal
 }
 
-// template <>
-// void GEMV<float, lang::Cpp>(//bool trans,
-//                             const std::vector<int> stridesA,
-//                             const size_t m, const size_t n,
-//                             const float alpha, const Tensor* A, const Tensor* v,
-//                             const float beta, Tensor* out, Context *ctx) {
-//   const float *APtr = static_cast<const float *>(A->data());
-//   const float *vPtr = static_cast<const float *>(v->data());
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   auto trans = (stridesA.back() == 1) ? true : false;
-//   if (!trans) {
-//     cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
-//                 beta, outPtr, 1);
-//   } else {
-//     cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta,
-//                 outPtr, 1);
-//   }
-// }
-
 template <>
-void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
+void GEMV<float, lang::Cpp>(const float alpha, const Tensor& A, const Tensor& v,
                             const float beta, Tensor *out, Context *ctx) {
-  const float *APtr = static_cast<const float *>(A->block()->data());
-  const float *vPtr = static_cast<const float *>(v->block()->data());
+  const float *APtr = static_cast<const float *>(A.block()->data());
+  const float *vPtr = static_cast<const float *>(v.block()->data());
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const size_t m = A->shape()[0];
-  const size_t n = A->shape()[1];
-  if (A->transpose()) {
+  const size_t m = A.shape()[0];
+  const size_t n = A.shape()[1];
+  if (A.transpose()) {
     cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta,
                 outPtr, 1);
   } else {
@@ -889,147 +562,36 @@ void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
   }
 }
 
-// template <>
-// void GEMM<float, lang::Cpp>(//const bool transA, const bool transB,
-//                             const std::vector<int> stridesA, const std::vector<int> stridesB,
-//                             const size_t nrowA, const size_t ncolB,
-//                             const size_t ncolA, const float alpha,
-//                             const Tensor* A, const Tensor* B, const float beta,
-//                             Tensor* C, Context *ctx) {
-//   auto transA = (stridesA.back() == 1) ? true : false;
-//   auto transa = transA ? CblasTrans : CblasNoTrans;
-//   auto transB = (stridesB.back() == 1) ? true : false;
-//   auto transb = transB ? CblasTrans : CblasNoTrans;
-//   auto lda = transA ? nrowA : ncolA;
-//   auto ldb = transB ? ncolA : ncolB;
-//   auto ldc = ncolB;
-//   const float *APtr = static_cast<const float *>(A->data());
-//   const float *BPtr = static_cast<const float *>(B->data());
-//   float *CPtr = static_cast<float *>(C->mutable_data());
-//   cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
-//    lda, BPtr, ldb, beta, CPtr, ldc);
-// }
-
 template <>
 void GEMM<float, lang::Cpp>(const float alpha,
-                            const Tensor *A, const Tensor *B, const float beta,
+                            const Tensor& A, const Tensor& B, const float beta,
                             Tensor *C, Context *ctx) {
-  auto transA = A->transpose();
+  auto transA = A.transpose();
   auto transa = transA ? CblasTrans : CblasNoTrans;
-  auto transB = B->transpose();
+  auto transB = B.transpose();
   auto transb = transB ? CblasTrans : CblasNoTrans;
-  const size_t nrowA = A->shape()[0];
-  const size_t ncolA = A->shape()[1];
-  const size_t ncolB = B->shape()[1];
+  const size_t nrowA = A.shape()[0];
+  const size_t ncolA = A.shape()[1];
+  const size_t ncolB = B.shape()[1];
   auto lda = transA ? nrowA : ncolA;
   auto ldb = transB ? ncolA : ncolB;
   auto ldc = ncolB;
-  const float *APtr = static_cast<const float *>(A->block()->data());
-  const float *BPtr = static_cast<const float *>(B->block()->data());
+  const float *APtr = static_cast<const float *>(A.block()->data());
+  const float *BPtr = static_cast<const float *>(B.block()->data());
   float *CPtr = static_cast<float *>(C->block()->mutable_data());
   cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
     lda, BPtr, ldb, beta, CPtr, ldc);
 }
 
-#else
-
-// template <>
-// void Amax<float, lang::Cpp>(const Tensor* in, size_t *out,
-//                             Context *ctx) {
-//   size_t maxPos = 0;
-//   float maxVal = 0;
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     if (i == 0) {
-//       maxVal = inPtr[i];
-//     } else if (inPtr[i] > maxVal) {
-//       maxVal = inPtr[i];
-//       maxPos = i;
-//     }
-//   }
-//   *out = maxPos;
-// }
-// template <>
-// void Amin<float, lang::Cpp>(const Tensor* in, size_t *out,
-//                             Context *ctx) {
-//   size_t minPos = 0;
-//   float minVal = 0;
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     if (i == 0) {
-//       minVal = inPtr[i];
-//     } else if (inPtr[i] > minVal) {
-//       minVal = inPtr[i];
-//       minPos = i;
-//     }
-//   }
-//   *out = minPos;
-// }
-
-// template <>
-// void Asum<float, lang::Cpp>(const Tensor* in, float *out,
-//                             Context *ctx) {
-//   float sum = 0;
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     sum += fabs(inPtr[i]);
-//   }
-// }
-
-// template <>
-// void Axpy<float, lang::Cpp>(const float alpha,
-//                             const Tensor* in, Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] += alpha * inPtr[i];
-//   }
-// }
-
-// template <>
-// void Scale<float, lang::Cpp>(const float x, Tensor* out,
-//                              Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] *= x;
-//   }
-// }
-
-// template <>
-// void Dot<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                            float *out, Context *ctx) {
-//   float sum = 0;
-//   const float *in1Ptr = static_cast<const float *>(in1->data());
-//   const float *in2Ptr = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     sum += in1Ptr[i] * in2Ptr[i];
-//   }
-// }
-
-// template <>
-// void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
-//                             const float alpha, const Tensor* A, const Tensor* v,
-//                             const float beta, Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *APtr = static_cast<const float *>(A->data());
-//   const float *vPtr = static_cast<const float *>(v->data());
-//   for (size_t r = 0; r < m; r++) {
-//     float sum = 0;
-//     for (size_t c = 0; c < n; c++) {
-//       size_t idx = trans ? c * m + r : r * n + c;
-//       sum += APtr[idx] * vPtr[c];
-//     }
-//     outPtr[r] = alpha * sum + beta * outPtr[r];
-//   }
-// }
+#else    
 
 template <>
-void Amax<float, lang::Cpp>(const Tensor *in, size_t *out,
+void Amax<float, lang::Cpp>(const Tensor& in, size_t *out,
                             Context *ctx) {
   size_t maxPos = 0;
   float maxVal = 0;
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  for (size_t i = 0; i < in->Size(); i++) { //not using strided traversal
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  for (size_t i = 0; i < in.Size(); i++) { //not using strided traversal
     if (i == 0) {
       maxVal = inPtr[i];
     } else if (inPtr[i] > maxVal) {
@@ -1040,12 +602,12 @@ void Amax<float, lang::Cpp>(const Tensor *in, size_t *out,
   *out = maxPos;
 }
 template <>
-void Amin<float, lang::Cpp>(const Tensor *in, size_t *out,
+void Amin<float, lang::Cpp>(const Tensor& in, size_t *out,
                             Context *ctx) {
   size_t minPos = 0;
   float minVal = 0;
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  for (size_t i = 0; i < in->Size(); i++) { //not using strided traversal
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  for (size_t i = 0; i < in.Size(); i++) { //not using strided traversal
     if (i == 0) {
       minVal = inPtr[i];
     } else if (inPtr[i] > minVal) {
@@ -1057,24 +619,26 @@ void Amin<float, lang::Cpp>(const Tensor *in, size_t *out,
 }
 
 template <>
-void Asum<float, lang::Cpp>(const Tensor *in, float *out,
+void Asum<float, lang::Cpp>(const Tensor& in, float *out,
                             Context *ctx) {
   float sum = 0;
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  for (size_t i = 0; i < in->Size(); i++) {
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  for (size_t i = 0; i < in.Size(); i++) {
     sum += fabs(inPtr[i]); //not using strided traversal
   }
 }
 
 template <>
 void Axpy<float, lang::Cpp>(const float alpha,
-                            const Tensor *in, Tensor *out, Context *ctx) {
+                            const Tensor& in, Tensor *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  vector<int> traversal_info = in->generate_traversal_info();
-  for (size_t i = 0; i < in->Size(); i++) { 
-    outPtr[i] += alpha * inPtr[traversal_info[in->shape().size()]];
-    in->traverse_next(traversal_info, i+1);
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  vector<int> traversal_info = generate_traversal_info(in);
+  vector<int> shape_multipliers = generate_shape_multipliers(in);
+
+  for (size_t i = 0; i < in.Size(); i++) { 
+    outPtr[i] += alpha * inPtr[traversal_info[in.shape().size()]];
+    traverse_next(in, shape_multipliers, traversal_info, i+1);
   }
 }
 
@@ -1088,35 +652,38 @@ void Scale<float, lang::Cpp>(const float x, Tensor *out,
 }
 
 template <>
-void Dot<float, lang::Cpp>(const Tensor *in1, const Tensor *in2,
+void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
                            float *out, Context *ctx) {
   float sum = 0;
-  // const float *in1Ptr = static_cast<const float *>(in1->data());
-  // const float *in2Ptr = static_cast<const float *>(in2->data());
-  // for (size_t i = 0; i < in->Size(); i++) {
+  // const float *in1Ptr = static_cast<const float *>(in1.data());
+  // const float *in2Ptr = static_cast<const float *>(in2.data());
+  // for (size_t i = 0; i < in.Size(); i++) {
   //   sum += in1Ptr[i] * in2Ptr[i]; 
   // }
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->block()->data());
-  const float *in2Ptr = static_cast<const float *>(in2->block()->data());
-  vector<int> traversal_info_in1 = in1->generate_traversal_info();
-  vector<int> traversal_info_in2 = in2->generate_traversal_info();
-  for (size_t i = 0; i < in1->Size(); i++) {
-    sum += in1Ptr[traversal_info_in1[in1->shape().size()]] * in2Ptr[traversal_info_in2[in2->shape().size()]];
-    in1->traverse_next(traversal_info_in1, i+1);
-    in2->traverse_next(traversal_info_in2, i+1);
+  const float *in1Ptr = static_cast<const float *>(in1.block()->data());
+  const float *in2Ptr = static_cast<const float *>(in2.block()->data());
+  vector<int> traversal_info_in1 = generate_traversal_info(in1);
+  vector<int> traversal_info_in2 = generate_traversal_info(in2);
+  vector<int> shape_multipliers_in1 = generate_shape_multipliers(in1);
+  vector<int> shape_multipliers_in2 = generate_shape_multipliers(in2);
+
+  for (size_t i = 0; i < in1.Size(); i++) {
+    sum += in1Ptr[traversal_info_in1[in1.shape().size()]] * in2Ptr[traversal_info_in2[in2.shape().size()]];
+    traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i+1);
+    traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i+1);
   }
 }
 
 template <>
-void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
+void GEMV<float, lang::Cpp>(const float alpha, const Tensor& A, const Tensor& v,
                             const float beta, Tensor *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const float *APtr = static_cast<const float *>(A->block()->data());
-  const float *vPtr = static_cast<const float *>(v->block()->data());
-  bool trans = A->transpose();
-  const size_t m = A->shape(0);
-  const size_t n = A->shape(1);
+  const float *APtr = static_cast<const float *>(A.block()->data());
+  const float *vPtr = static_cast<const float *>(v.block()->data());
+  bool trans = A.transpose();
+  const size_t m = A.shape(0);
+  const size_t n = A.shape(1);
   for (size_t r = 0; r < m; r++) {
     float sum = 0;
     for (size_t c = 0; c < n; c++) {
@@ -1189,34 +756,21 @@ void SoftmaxCrossEntropyBwd<float, lang::Cpp>(bool int_target,
   }
 }
 
-// template <>
-// void RowMax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-//                               const Tensor* in, Tensor* out, Context *ctx) {
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   for (size_t r = 0; r < nrow; r++) {
-//     int offset = (int)(r * ncol);
-//     float maxval = inPtr[offset];
-//     for (size_t c = 1; c < ncol; c++)
-//       maxval = (std::max)(maxval, inPtr[offset + c]);
-//     outPtr[r] = maxval;
-//   }
-// }
-
 template <>
-void RowMax<float, lang::Cpp>(const Tensor *in, Tensor *out, Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->block()->data());
+void RowMax<float, lang::Cpp>(const Tensor& in, Tensor *out, Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in.block()->data());
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const size_t nrow = in->shape()[0];
-  const size_t ncol = in->shape()[1];
-  vector<int> traversal_info = in->generate_traversal_info();
+  const size_t nrow = in.shape()[0];
+  const size_t ncol = in.shape()[1];
+  vector<int> traversal_info = generate_traversal_info(in);
+  vector<int> shape_multipliers = generate_shape_multipliers(in);
     
   for (size_t r = 0; r < nrow; r++) {
     int counter_offset = (r * ncol);
     float maxval = 0;
     for (size_t c = 0; c < ncol; c++){
-      maxval = (std::max)(maxval, inPtr[traversal_info[in->shape().size()]]);
-      in->traverse_next(traversal_info, counter_offset+c+1);
+      maxval = (std::max)(maxval, inPtr[traversal_info[in.shape().size()]]);
+      traverse_next(in, shape_multipliers, traversal_info, counter_offset+c+1);
     }
     outPtr[r] = maxval;
   }
@@ -1226,11 +780,11 @@ void RowMax<float, lang::Cpp>(const Tensor *in, Tensor *out, Context *ctx) {
 /*
 template <>
 void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Tensor* A, const Tensor* v, Tensor* out,
+                              const Tensor& A, const Tensor& v, Tensor* out,
                               Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
+  const float *APtr = static_cast<const float *>(A.data());
+  const float *vPtr = static_cast<const float *>(v.data());
   for (size_t r = 0; r < nrow; r++) {
     size_t offset = r * ncol;
     for (size_t c = 0; c < ncol; c++) {
@@ -1241,11 +795,11 @@ void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 
 template <>
 void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Tensor* A, const Tensor* v, Tensor* out,
+                              const Tensor& A, const Tensor& v, Tensor* out,
                               Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
+  const float *APtr = static_cast<const float *>(A.data());
+  const float *vPtr = static_cast<const float *>(v.data());
   for (size_t r = 0; r < nrow; r++) {
     size_t offset = r * ncol;
     for (size_t c = 0; c < ncol; c++) {
@@ -1254,11 +808,11 @@ void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
   }
 }
 template <>
-void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Tensor* in1,
-                             const Tensor* in2, Tensor* out, Context *ctx) {
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Tensor& in1,
+                             const Tensor& in2, Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
+  const float *in1Ptr = static_cast<const float *>(in1.data());
+  const float *in2Ptr = static_cast<const float *>(in2.data());
   for (size_t r = 0; r < m; r++) {
     size_t offset = r * n;
     for (size_t c = 0; c < n; c++) {
@@ -1268,9 +822,9 @@ void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Tensor* in1,
 }
 template <>
 void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Tensor* in, Tensor* out, Context *ctx) {
+                               const Tensor& in, Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
+  const float *inPtr = static_cast<const float *>(in.data());
   float *bPtr = new float[ncol];
   for (size_t r = 0; r < nrow; r++) {
     size_t offset = r * ncol;
@@ -1289,9 +843,9 @@ void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 
 template <>
 void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                                  const Tensor* in, Tensor* out, Context *ctx) {
+                                  const Tensor& in, Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
+  const float *inPtr = static_cast<const float *>(in.data());
   for (size_t c = 0; c < ncol; c++) {
     outPtr[c] = 0.f;
   }
@@ -1305,9 +859,9 @@ void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 
 template <>
 void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Tensor* in, Tensor* out, Context *ctx) {
+                               const Tensor& in, Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
+  const float *inPtr = static_cast<const float *>(in.data());
   for (size_t r = 0; r < nrow; r++) {
     size_t offset = r * ncol;
     outPtr[r] = 0.f;