Posted to commits@singa.apache.org by wa...@apache.org on 2018/05/13 15:26:28 UTC

[01/10] incubator-singa git commit: Singa-341 Added stride functionality to tensors for CPP

Repository: incubator-singa
Updated Branches:
  refs/heads/master 394d78d00 -> 600f27ede


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a88efa00/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 4f510ed..01d9fe3 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -21,7 +21,9 @@
 #include "./tensor_math.h"
 #include <cfloat>
 #include "singa/core/common.h"
+#include "singa/core/tensor.h"
 #include <math.h>
+#include <vector>
 
 #ifdef USE_CBLAS
 #include <cblas.h>
@@ -29,422 +31,856 @@
 
 namespace singa {
 
+// template <>
+// void Abs<float, lang::Cpp>(const Tensor* in, Tensor* out,
+//                            Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = fabs(inPtr[i]);
+//   }
+// }
+
 template <>
-void Abs<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
-                           Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = fabs(inPtr[i]);
-  }
+void Abs<float, lang::Cpp>(const Tensor* in, Tensor* out, Context *ctx) {
+  TraverseUnary<float>(in, out, [](float x) {return fabs(x);});
 }
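
For context: the TraverseUnary/TraverseBinary helpers used throughout this file are defined in singa/core/tensor.h and are not shown in this diff. As a rough sketch of the idea (a toy view type, not singa's actual Tensor API): the logical index is advanced like an odometer and mapped through the view's strides, so transposed or sliced views are read correctly while the output is written contiguously.

    #include <cstddef>
    #include <vector>

    // Hypothetical stand-in for a strided view; singa's Tensor carries the
    // same information (shape, strides, backing block).
    struct ToyView {
      const float* data;
      std::vector<size_t> shape;    // logical extents, e.g. {2, 3}
      std::vector<size_t> strides;  // element strides per axis
    };

    template <typename Op>
    void ToyTraverseUnary(const ToyView& in, float* out, Op op) {
      std::vector<size_t> idx(in.shape.size(), 0);
      size_t n = 1;
      for (size_t d : in.shape) n *= d;
      for (size_t i = 0; i < n; i++) {
        size_t off = 0;  // map the logical index to a storage offset
        for (size_t d = 0; d < idx.size(); d++) off += idx[d] * in.strides[d];
        out[i] = op(in.data[off]);  // output is written contiguously
        for (size_t d = idx.size(); d-- > 0;) {  // odometer step with carry
          if (++idx[d] < in.shape[d]) break;
          idx[d] = 0;
        }
      }
    }

The binary variant walks two views in lockstep over a shared logical shape; the generate_traversal_info/traverse_next calls in the loops below play the same odometer role incrementally.
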
 
+// template <>
+// void Add<float, lang::Cpp>(const Tensor* in, const float x,
+//                            Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = inPtr[i] + x;
+//   }
+// }
+
+// template <>
+// void Add<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->block()->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->block()->data());
+//   vector<int> traversal_info = in->generate_traversal_info();
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = inPtr[traversal_info[in->shape().size()]] + x;
+//     in->traverse_next(traversal_info, i+1);
+//   }
+// }
+
 template <>
-void Add<float, lang::Cpp>(const size_t num, const Block *in, const float x,
-                           Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = inPtr[i] + x;
-  }
+void Add<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out, Context *ctx) {
+  auto add_lambda = [&x](float a) {
+    return (a+x);
+  };
+  TraverseUnary<float>(in, out, add_lambda);
 }
 
+// template <>
+// void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                            Tensor* out, Context *ctx) {
+//   // CHECK_EQ(ctx->stream, nullptr);
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *in1Ptr = static_cast<const float *>(in1->data());
+//   const float *in2Ptr = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = in1Ptr[i] + in2Ptr[i];
+//   }
+// }
+
+// template <>
+// void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, Context *ctx) {
+//   // CHECK_EQ(ctx->stream, nullptr);
+//   float *outPtr = static_cast<float *>(out->block()->mutable_data());
+//   const float *in1Ptr = static_cast<const float *>(in1->block()->data());
+//   const float *in2Ptr = static_cast<const float *>(in2->block()->data());
+//   //call axpy if both strides are 1?
+//   vector<int> traversal_info_in1 = in1->generate_traversal_info();
+//   vector<int> traversal_info_in2 = in2->generate_traversal_info();
+//   for (size_t i = 0; i < in1->Size(); i++) {
+//     outPtr[i] = in1Ptr[traversal_info_in1[in1->shape().size()]] + in2Ptr[traversal_info_in2[in2->shape().size()]];
+//     in1->traverse_next(traversal_info_in1, i+1);
+//     in2->traverse_next(traversal_info_in2, i+1);
+//   }
+// }
+
 template <>
-void Add<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
-                           Block *out, Context *ctx) {
+void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, Context *ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = in1Ptr[i] + in2Ptr[i];
-  }
+  auto add_lambda_binary = [](float a, float b) {
+    return (a+b);
+  };
+  TraverseBinary<float>(in1, in2, out, add_lambda_binary);
 }
 
+// template <>
+// void Clamp<float, lang::Cpp>(const float low,
+//                              const float high, const Tensor* in, Tensor* out,
+//                              Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     if (inPtr[i] > high) {
+//       outPtr[i] = high;
+//     } else if (inPtr[i] < low) {
+//       outPtr[i] = low;
+//     } else {
+//       outPtr[i] = inPtr[i];
+//     }
+//   }
+// }
+
+// template <>
+// void Clamp<float, lang::Cpp>(const Tensor* in, const float low,
+//                              const float high, Tensor* out,
+//                              Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->block()->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->block()->data());
+//   vector<int> traversal_info = in->generate_traversal_info();
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     int traversed_index = traversal_info[in->shape().size()];
+//     if (inPtr[traversed_index] > high) {
+//       outPtr[i] = high;
+//     } else if (inPtr[traversed_index] < low) {
+//       outPtr[i] = low;
+//     } else {
+//       outPtr[i] = inPtr[traversed_index];
+//     }
+//     in->traverse_next(traversal_info, i+1);
+//   }
+// }
+
 template <>
-void Clamp<float, lang::Cpp>(const size_t num, const float low,
-                             const float high, const Block *in, Block *out,
+void Clamp<float, lang::Cpp>(const float low, const float high,
+                             const Tensor* in, Tensor* out,
                              Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    if (inPtr[i] > high) {
-      outPtr[i] = high;
-    } else if (inPtr[i] < low) {
-      outPtr[i] = low;
-    } else {
-      outPtr[i] = inPtr[i];
-    }
-  }
+  auto clamp_lambda = [&low, &high](float a) {
+    if (a < low) return low;
+    if (a > high) return high;
+    return a;
+  };
+  TraverseUnary<float>(in, out, clamp_lambda);
 }
 
+
+// template <>
+// void Div<float, lang::Cpp>(const float x, const Tensor* in,
+//                            Tensor* out, Context *ctx) {
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     CHECK_NE(inPtr[i], 0.f);
+//     outPtr[i] = x / inPtr[i];
+//   }
+// }
+
 template <>
-void Div<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
-                           Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    CHECK_NE(in2Ptr[i], 0.f);
-    outPtr[i] = in1Ptr[i] / in2Ptr[i];
+void Div<float, lang::Cpp>(const float x, const Tensor* in, Tensor* out,
+                           Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  vector<int> traversal_info = in->generate_traversal_info();
+  for (size_t i = 0; i < in->Size(); i++) {
+    CHECK_NE(inPtr[traversal_info[in->shape().size()]], 0.f);
+    outPtr[i] = x / inPtr[traversal_info[in->shape().size()]];
+    in->traverse_next(traversal_info, i+1);
   }
 }
 
+
+// template <>
+// void Div<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                            Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *in1Ptr = static_cast<const float *>(in1->data());
+//   const float *in2Ptr = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     CHECK_NE(in2Ptr[i], 0.f);
+//     outPtr[i] = in1Ptr[i] / in2Ptr[i];
+//   }
+// }
+
 template <>
-void Div<float, lang::Cpp>(const size_t num, const float x, const Block *in,
-                           Block *out, Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
-    CHECK_NE(inPtr[i], 0.f);
-    outPtr[i] = x / inPtr[i];
+void Div<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+                           Tensor* out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  const float *in1Ptr = static_cast<const float *>(in1->block()->data());
+  const float *in2Ptr = static_cast<const float *>(in2->block()->data());
+  vector<int> traversal_info_in1 = in1->generate_traversal_info();
+  vector<int> traversal_info_in2 = in2->generate_traversal_info();
+  for (size_t i = 0; i < in1->Size(); i++) {
+    CHECK_NE(in2Ptr[traversal_info_in2[in2->shape().size()]], 0.f);
+    outPtr[i] = in1Ptr[traversal_info_in1[in1->shape().size()]] / in2Ptr[traversal_info_in2[in2->shape().size()]];
+    in1->traverse_next(traversal_info_in1, i+1);
+    in2->traverse_next(traversal_info_in2, i+1);
   }
 }
 
+
+// template <>
+// void EltwiseMult<float, lang::Cpp>(const Tensor* in,
+//                                    const float x, Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = inPtr[i] * x;
+//   }
+// }
+
 template <>
-void EltwiseMult<float, lang::Cpp>(const size_t num, const Block *in,
-                                   const float x, Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = inPtr[i] * x;
-  }
+void EltwiseMult<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+                                   Context *ctx) {
+  auto eltwisemult_lambda = [&x](float a) {
+    return (a*x);
+  };
+  TraverseUnary<float>(in, out, eltwisemult_lambda);
 }
 
+// template <>
+// void EltwiseMult<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, 
+//                                    Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *in1Ptr = static_cast<const float *>(in1->data());
+//   const float *in2Ptr = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = in1Ptr[i] * in2Ptr[i];
+//   }
+// }
+
 template <>
-void EltwiseMult<float, lang::Cpp>(const size_t num, const Block *in1,
-                                   const Block *in2, Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = in1Ptr[i] * in2Ptr[i];
-  }
+void EltwiseMult<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, 
+                                   Context *ctx) {
+  auto eltwisemult_lambda_binary = [](float a, float b) {
+    return (a*b);
+  };
+  TraverseBinary<float>(in1, in2, out, eltwisemult_lambda_binary);
 }
+
+// template <>
+// void Exp<float, lang::Cpp>(const Tensor* in, Tensor* out,
+//                            Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = exp(inPtr[i]);
+//   }
+// }
+
 template <>
-void Exp<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
-                           Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = exp(inPtr[i]);
-  }
+void Exp<float, lang::Cpp>(const Tensor* in, Tensor *out, Context *ctx) {
+  TraverseUnary<float>(in, out, [](float x) {return exp(x);});
 }
 
+// template <>
+// void GE<float, lang::Cpp>(const Tensor* in, const float x,
+//                           Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+//   }
+// }
+
 template <>
-void GE<float, lang::Cpp>(const size_t num, const Block *in, const float x,
-                          Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
-  }
+void GE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+                          Context *ctx) {
+  auto ge_lambda = [&x](float a) {
+    return (a >= x) ? 1.f : 0.f;
+  };
+  TraverseUnary<float>(in, out, ge_lambda);
 }
 
+// template <>
+// void GE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                           Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr1 = static_cast<const float *>(in1->data());
+//   const float *inPtr2 = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr1[i] >= inPtr2[i]) ? 1.f : 0.f;
+//   }
+// }
+
 template <>
-void GE<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
-                          Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr1 = static_cast<const float *>(in1->data());
-  const float *inPtr2 = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr1[i] >= inPtr2[i]) ? 1.f : 0.f;
-  }
+void GE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+                          Context *ctx) {
+  auto ge_lambda_binary = [](float a, float b) {
+    return (a >= b) ? 1.f : 0.f;
+  };
+  TraverseBinary<float>(in1, in2, out, ge_lambda_binary);
 }
+
+// template <>
+// void GT<float, lang::Cpp>(const Tensor* in, const float x,
+//                           Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+//   }
+// }
+
 template <>
-void GT<float, lang::Cpp>(const size_t num, const Block *in, const float x,
-                          Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
-  }
+void GT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+                          Context *ctx) {
+  auto gt_lambda = [&x](float a) {
+    return (a > x) ? 1.f : 0.f;
+  };
+  TraverseUnary<float>(in, out, gt_lambda);
 }
+
+// template <>
+// void GT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                           Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr1 = static_cast<const float *>(in1->data());
+//   const float *inPtr2 = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr1[i] > inPtr2[i]) ? 1.f : 0.f;
+//   }
+// }
+
 template <>
-void GT<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
-                          Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr1 = static_cast<const float *>(in1->data());
-  const float *inPtr2 = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr1[i] > inPtr2[i]) ? 1.f : 0.f;
-  }
+void GT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+                          Context *ctx) {
+  auto gt_lambda_binary = [](float a, float b) {
+    return (a > b) ? 1.f : 0.f;
+  };
+  TraverseBinary<float>(in1, in2, out, gt_lambda_binary);
 }
 
+// template <>
+// void LE<float, lang::Cpp>(const Tensor* in, const float x,
+//                           Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+//   }
+// }
+
 template <>
-void LE<float, lang::Cpp>(const size_t num, const Block *in, const float x,
-                          Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
-  }
+void LE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+                          Context *ctx) {
+  auto le_lambda = [&x](float a) {
+    return (a <= x) ? 1.f : 0.f;
+  };
+  TraverseUnary<float>(in, out, le_lambda);
 }
+
+// template <>
+// void LE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                           Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr1 = static_cast<const float *>(in1->data());
+//   const float *inPtr2 = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr1[i] <= inPtr2[i]) ? 1.f : 0.f;
+//   }
+// }
+
 template <>
-void LE<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
-                          Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr1 = static_cast<const float *>(in1->data());
-  const float *inPtr2 = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr1[i] <= inPtr2[i]) ? 1.f : 0.f;
-  }
+void LE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+                          Context *ctx) {
+  auto le_lambda_binary = [](float a, float b) {
+    return (a <= b) ? 1.f : 0.f;
+  };
+  TraverseBinary<float>(in1, in2, out, le_lambda_binary);
 }
+
+// template <>
+// void Log<float, lang::Cpp>(const Tensor* in, Tensor* out,
+//                            Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     CHECK_GT(inPtr[i], 0.f);
+//     outPtr[i] = log(inPtr[i]);
+//   }
+// }
+
 template <>
-void Log<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+void Log<float, lang::Cpp>(const Tensor* in, Tensor* out,
                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    CHECK_GT(inPtr[i], 0.f);
-    outPtr[i] = log(inPtr[i]);
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  vector<int> traversal_info = in->generate_traversal_info();
+  for (size_t i = 0; i < in->Size(); i++) {
+    CHECK_GT(inPtr[traversal_info[in->shape().size()]], 0.f);
+    outPtr[i] = log(inPtr[traversal_info[in->shape().size()]]);
+    in->traverse_next(traversal_info, i+1);
   }
 }
+
+// template <>
+// void LT<float, lang::Cpp>(const Tensor* in, const float x,
+//                           Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
+//   }
+// }
+
 template <>
-void LT<float, lang::Cpp>(const size_t num, const Block *in, const float x,
-                          Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
-  }
+void LT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+                          Context *ctx) {
+  auto lt_lambda = [&x](float a) {
+    return (a < x) ? 1.f : 0.f;
+  };
+  TraverseUnary<float>(in, out, lt_lambda);
 }
+
+// template <>
+// void LT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                           Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr1 = static_cast<const float *>(in1->data());
+//   const float *inPtr2 = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr1[i] < inPtr2[i]) ? 1.f : 0.f;
+//   }
+// }
+
 template <>
-void LT<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
-                          Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr1 = static_cast<const float *>(in1->data());
-  const float *inPtr2 = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr1[i] < inPtr2[i]) ? 1.f : 0.f;
-  }
+void LT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+                          Context *ctx) {
+  auto lt_lambda_binary = [](float a, float b) {
+    return (a < b) ? 1.f : 0.f;
+  };
+  TraverseBinary<float>(in1, in2, out, lt_lambda_binary);
 }
 
+// template <>
+// void Pow<float, lang::Cpp>(const Tensor* in, const float x,
+//                            Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = pow(inPtr[i], x);
+//   }
+// }
+
 template <>
-void Pow<float, lang::Cpp>(const size_t num, const Block *in, const float x,
-                           Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = pow(inPtr[i], x);
-  }
+void Pow<float, lang::Cpp>(const Tensor* in, const float x, Tensor *out, Context *ctx) {
+  TraverseUnary<float>(in, out, [x](float y) {return pow(y,x);});
 }
 
+// template <>
+// void Pow<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                            Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *in1Ptr = static_cast<const float *>(in1->data());
+//   const float *in2Ptr = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
+//   }
+// }
+
 template <>
-void Pow<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
-                           Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
-  }
+void Pow<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+                           Context *ctx) {
+  auto pow_lambda_binary = [](float a, float b) {
+    return pow(a,b);
+  };
+  TraverseBinary<float>(in1, in2, out, pow_lambda_binary);
 }
+
+// template <>
+// void ReLU<float, lang::Cpp>(const Tensor* in, Tensor* out,
+//                             Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
+//   }
+// }
+
 template <>
-void ReLU<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
-                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
-  }
+void ReLU<float, lang::Cpp>(const Tensor* in, Tensor* out,
+                          Context *ctx) {
+  auto relu_lambda = [](float a) {
+    return (a >= 0.f) ? a : 0.f;
+  };
+  TraverseUnary<float>(in, out, relu_lambda);
 }
+
+// template <>
+// void Set<float, lang::Cpp>(const float x, Tensor* out,
+//                            Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   for (size_t i = 0; i < in->Size(); i++) outPtr[i] = x;
+// }
+
 template <>
-void Set<float, lang::Cpp>(const size_t num, const float x, Block *out,
+void Set<float, lang::Cpp>(const float x, Tensor* out,
                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  for (size_t i = 0; i < out->Size(); i++) outPtr[i] = x;
 }
+
+// template <>
+// void Set<int, lang::Cpp>(const int x, Tensor* out,
+//                            Context *ctx) {
+//   int *outPtr = static_cast<int *>(out->mutable_data());
+//   for (size_t i = 0; i < in->Size(); i++) outPtr[i] = x;
+// }
+
 template <>
-void Set<int, lang::Cpp>(const size_t num, const int x, Block *out,
+void Set<int, lang::Cpp>(const int x, Tensor* out,
                            Context *ctx) {
-  int *outPtr = static_cast<int *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) outPtr[i] = x;
+  int *outPtr = static_cast<int *>(out->block()->mutable_data());
+  for (size_t i = 0; i < out->Size(); i++) outPtr[i] = x;
 }
 
+// template <>
+// void Sigmoid<float, lang::Cpp>(const Tensor* in, Tensor* out,
+//                                Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
+//   }
+// }
+
 template <>
-void Sigmoid<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
-                               Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
-  }
+void Sigmoid<float, lang::Cpp>(const Tensor* in, Tensor* out,
+                          Context *ctx) {
+  auto sigmoid_lambda = [](float a) {
+    return 1.f / (1.f + exp(-a));
+  };
+  TraverseUnary<float>(in, out, sigmoid_lambda);
 }
 
+// template <>
+// void Sign<float, lang::Cpp>(const Tensor* in, Tensor* out,
+//                             Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = (inPtr[i] > 0) - (inPtr[i] < 0);
+//   }
+// }
+
 template <>
-void Sign<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
-                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = (inPtr[i] > 0) - (inPtr[i] < 0);
-  }
+void Sign<float, lang::Cpp>(const Tensor* in, Tensor* out,
+                          Context *ctx) {
+  auto sign_lambda = [](float a) {
+    return (a > 0) - (a < 0);
+  };
+  TraverseUnary<float>(in, out, sign_lambda);
 }
 
+// template <>
+// void Sqrt<float, lang::Cpp>(const Tensor* in, Tensor* out,
+//                             Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     CHECK_GE(inPtr[i], 0.f);
+//     outPtr[i] = sqrt(inPtr[i]);
+//   }
+// }
+
 template <>
-void Sqrt<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+void Sqrt<float, lang::Cpp>(const Tensor* in, Tensor* out,
                             Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    CHECK_GE(inPtr[i], 0.f);
-    outPtr[i] = sqrt(inPtr[i]);
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  vector<int> traversal_info = in->generate_traversal_info();
+  for (size_t i = 0; i < in->Size(); i++) {
+    CHECK_GE(inPtr[traversal_info[in->shape().size()]], 0.f);
+    outPtr[i] = sqrt(inPtr[traversal_info[in->shape().size()]]);
+    in->traverse_next(traversal_info, i+1);
   }
 }
+
 /*
 template <>
-void Square<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+void Square<float, lang::Cpp>(const Tensor* in, Tensor* out,
                               Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
+  for (size_t i = 0; i < in->Size(); i++) {
     outPtr[i] = inPtr[i] * inPtr[i];
   }
 }
 */
 
+// template <>
+// void Sub<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                            Tensor* out, Context *ctx) {
+//   // CHECK_EQ(ctx->stream, nullptr);
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *in1Ptr = static_cast<const float *>(in1->data());
+//   const float *in2Ptr = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = in1Ptr[i] - in2Ptr[i];
+//   }
+// }
+
 template <>
-void Sub<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
-                           Block *out, Context *ctx) {
+void Sub<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+                           Tensor* out, Context *ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = in1Ptr[i] - in2Ptr[i];
-  }
+  auto sub_lambda_binary = [](float a, float b) {
+    return (a-b);
+  };
+  TraverseBinary<float>(in1, in2, out, sub_lambda_binary);
 }
 
 // sum all elements of input into out
 // TODO(wangwei) optimize using omp
 template <>
-void Sum<float, lang::Cpp>(const size_t num, const Block *in, float *out,
+void Sum<float, lang::Cpp>(const Tensor* in, float *out,
                            Context *ctx) {
   float s = 0.f;
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  for (size_t i = 0; i < in->Size(); i++) {
     s += inPtr[i];
   }
   *out = s;
 }
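
A minimal sketch of the omp optimization the TODO above refers to, assuming a contiguous input just as the current Sum does (hypothetical helper, not part of this commit):

    #include <cstddef>

    float SumOmp(const float* in, size_t n) {
      float s = 0.f;
      // each thread accumulates a private partial sum; OpenMP combines them
      #pragma omp parallel for reduction(+ : s)
      for (long i = 0; i < (long)n; i++) s += in[i];
      return s;
    }
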
 
+// template <>
+// void Tanh<float, lang::Cpp>(const Tensor* in, Tensor* out,
+//                             Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = tanh(inPtr[i]);
+//   }
+// }
+
 template <>
-void Tanh<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
-                            Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] = tanh(inPtr[i]);
-  }
+void Tanh<float, lang::Cpp>(const Tensor* in, Tensor* out,
+                          Context *ctx) {
+  auto tanh_lambda = [](float a) {
+    return tanh(a);
+  };
+  TraverseUnary<float>(in, out, tanh_lambda);
 }
 
 // ===============Random operations==========================================
+// template <>
+// void Bernoulli<float, lang::Cpp>(const float p, Tensor* out,
+//                                  Context *ctx) {
+//   std::bernoulli_distribution distribution(p);
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
+//   }
+// }
+
 template <>
-void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Block *out,
+void Bernoulli<float, lang::Cpp>(const float p, Tensor* out,
                                  Context *ctx) {
   std::bernoulli_distribution distribution(p);
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  for (size_t i = 0; i < out->Size(); i++) {
     outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
   }
 }
 
+// template <>
+// void Gaussian<float, lang::Cpp>(const float mean,
+//                                 const float std, Tensor* out, Context *ctx) {
+//   std::normal_distribution<float> distribution(mean, std);
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
+//   }
+// }
+
 template <>
-void Gaussian<float, lang::Cpp>(const size_t num, const float mean,
-                                const float std, Block *out, Context *ctx) {
+void Gaussian<float, lang::Cpp>(const float mean,
+                                const float std, Tensor* out, Context *ctx) {
   std::normal_distribution<float> distribution(mean, std);
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  for (size_t i = 0; i < out->Size(); i++) {
     outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
   }
 }
+
+// template <>
+// void Uniform<float, lang::Cpp>(const float low,
+//                                const float high, Tensor* out, Context *ctx) {
+//   std::uniform_real_distribution<float> distribution(low, high);
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
+//   }
+// }
+
 template <>
-void Uniform<float, lang::Cpp>(const size_t num, const float low,
-                               const float high, Block *out, Context *ctx) {
+void Uniform<float, lang::Cpp>(const float low,
+                               const float high, Tensor* out, Context *ctx) {
   std::uniform_real_distribution<float> distribution(low, high);
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  for (size_t i = 0; i < out->Size(); i++) {
     outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
   }
 }
 
 // ====================Blas operations======================================
 
+// yisen todo: this function can end up overwriting block M in place (out may share M's block)
 template <>
-void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
-                            const size_t ncol, const Block *M, const Block *v,
-                            Block *out, Context *ctx) {
-  const float *MPtr = static_cast<const float *>(M->data());
-  const float *vPtr = static_cast<const float *>(v->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
+void DGMM<float, lang::Cpp>(const bool side_right,
+                            const Tensor* M, const Tensor* v,
+                            Tensor* out, Context *ctx) {
+  const float *MPtr = static_cast<const float *>(M->block()->data());
+  const float *vPtr = static_cast<const float *>(v->block()->data());
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  const size_t nrow = M->shape(0);
+  const size_t ncol = M->shape(1);
+  vector<int> traversal_info = M->generate_traversal_info();
+
   if (side_right) {
     for (size_t r = 0; r < nrow; r++) {
       size_t offset = r * ncol;
       for (size_t c = 0; c < ncol; c++) {
-        outPtr[offset + c] = MPtr[offset + c] * vPtr[c];
+        outPtr[traversal_info[M->shape().size()]] = MPtr[traversal_info[M->shape().size()]] * vPtr[c];
+        M->traverse_next(traversal_info, offset+c+1);
       }
     }
   } else {
     for (size_t r = 0; r < nrow; r++) {
       size_t offset = r * ncol;
       for (size_t c = 0; c < ncol; c++) {
-        outPtr[offset + c] = MPtr[offset + c] * vPtr[r];
+        outPtr[traversal_info[M->shape().size()]] = MPtr[traversal_info[M->shape().size()]] * vPtr[r];
+        M->traverse_next(traversal_info, offset+c+1);
       }
     }
   }
 }
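
On the todo above: if out shares M's block, writes through outPtr can clobber elements of M that have not been read yet. A standalone sketch of one way to make the row scaling alias-safe (illustrative only, not the fix applied in this commit):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Computes out[r][c] = M[r][c] * v[c], staging into a scratch buffer so
    // the result is correct even when out aliases M.
    void DgmmRowScaleSafe(const float* M, const float* v, float* out,
                          size_t nrow, size_t ncol) {
      std::vector<float> tmp(nrow * ncol);
      for (size_t r = 0; r < nrow; r++)
        for (size_t c = 0; c < ncol; c++)
          tmp[r * ncol + c] = M[r * ncol + c] * v[c];
      std::copy(tmp.begin(), tmp.end(), out);  // safe even when out == M
    }
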
 
+// #ifdef USE_CBLAS
+// template <>
+// void Amax<float, lang::Cpp>(const Tensor* in, size_t *out,
+//                             Context *ctx) {
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   *out = cblas_isamax(in->Size(), inPtr, 1);
+// }
+
+// template <>
+// void Asum<float, lang::Cpp>(const Tensor* in, float *out,
+//                             Context *ctx) {
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   *out = cblas_sasum(in->Size(), inPtr, 1);
+// }
+
+// template <>
+// void Axpy<float, lang::Cpp>(const float alpha,
+//                             const Tensor* in, Tensor* out, Context *ctx) {
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   cblas_saxpy(in->Size(), alpha, inPtr, 1, outPtr, 1);
+// }
+
+// template <>
+// void Dot<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                            float *out, Context *ctx) {
+//   const float *in1Ptr = static_cast<const float *>(in1->data());
+//   const float *in2Ptr = static_cast<const float *>(in2->data());
+//   *out = cblas_sdot(in->Size(), in1Ptr, 1, in2Ptr, 1);
+// }
+// template <>
+// void Scale<float, lang::Cpp>(const float x, Tensor* out,
+//                              Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   cblas_sscal(in->Size(), x, outPtr, 1);
+// }
+// template <>
+// void Nrm2<float, lang::Cpp>(const Tensor* in, float *out,
+//                             Context *ctx) {
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   *out = cblas_snrm2(in->Size(), inPtr, 1);
+// }
+
 #ifdef USE_CBLAS
 template <>
-void Amax<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
+void Amax<float, lang::Cpp>(const Tensor *in, size_t *out,
                             Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->data());
-  *out = cblas_isamax(num, inPtr, 1);
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  *out = cblas_isamax(in->Size(), inPtr, 1); //not using strided traversal
 }
 
 template <>
-void Asum<float, lang::Cpp>(const size_t num, const Block *in, float *out,
+void Asum<float, lang::Cpp>(const Tensor *in, float *out,
                             Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->data());
-  *out = cblas_sasum(num, inPtr, 1);
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  *out = cblas_sasum(in->Size(), inPtr, 1); //not using strided traversal
 }
 
 template <>
-void Axpy<float, lang::Cpp>(const size_t num, const float alpha,
-                            const Block *in, Block *out, Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  cblas_saxpy(num, alpha, inPtr, 1, outPtr, 1);
+void Axpy<float, lang::Cpp>(const float alpha,
+                            const Tensor *in, Tensor *out, Context *ctx) {
+  // check the input tensor's strides first
+  if ((in->strides())[0] == 1) {
+    const float *inPtr = static_cast<const float *>(in->block()->data());
+    float *outPtr = static_cast<float *>(out->block()->mutable_data());
+    cblas_saxpy(in->Size(), alpha, inPtr, 1, outPtr, 1);
+  }
+  // yisen todo: otherwise raise an error for the strided case
 }
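
For the strided branch the todo leaves open: when the elements of a view sit a single fixed stride apart, standard BLAS can consume the view directly through the incx argument, so no gather is needed. A sketch of that usage (plain cblas, not what this commit implements):

    #include <cblas.h>

    // out[i] += alpha * in[i * incx] for i in [0, n)
    void AxpyStrided(int n, float alpha, const float* in, int incx, float* out) {
      cblas_saxpy(n, alpha, in, incx, out, 1);
    }
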
 
 template <>
-void Dot<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+void Dot<float, lang::Cpp>(const Tensor *in1, const Tensor *in2,
                            float *out, Context *ctx) {
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  *out = cblas_sdot(num, in1Ptr, 1, in2Ptr, 1);
+  // check the input tensors' strides first
+  if (((in1->strides())[0] == 1) && ((in2->strides())[0] == 1)) {
+    const float *in1Ptr = static_cast<const float *>(in1->block()->data());
+    const float *in2Ptr = static_cast<const float *>(in2->block()->data());
+    *out = cblas_sdot(in1->Size(), in1Ptr, 1, in2Ptr, 1);
+  }
+  // yisen todo: otherwise raise an error for the strided case
 }
+
 template <>
-void Scale<float, lang::Cpp>(const size_t num, const float x, Block *out,
+void Scale<float, lang::Cpp>(const float x, Tensor *out,
                              Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  cblas_sscal(num, x, outPtr, 1);
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  cblas_sscal(out->Size(), x, outPtr, 1); //not using strided traversal
 }
+
 template <>
-void Nrm2<float, lang::Cpp>(const size_t num, const Block *in, float *out,
+void Nrm2<float, lang::Cpp>(const Tensor *in, float *out,
                             Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->data());
-  *out = cblas_snrm2(num, inPtr, 1);
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  *out = cblas_snrm2(in->Size(), inPtr, 1); //not using strided traversal
 }
 
+// template <>
+// void GEMV<float, lang::Cpp>(//bool trans,
+//                             const std::vector<int> stridesA,
+//                             const size_t m, const size_t n,
+//                             const float alpha, const Tensor* A, const Tensor* v,
+//                             const float beta, Tensor* out, Context *ctx) {
+//   const float *APtr = static_cast<const float *>(A->data());
+//   const float *vPtr = static_cast<const float *>(v->data());
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   auto trans = (stridesA.back() == 1) ? true : false;
+//   if (!trans) {
+//     cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
+//                 beta, outPtr, 1);
+//   } else {
+//     cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta,
+//                 outPtr, 1);
+//   }
+// }
+
 template <>
-void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
-                            const float alpha, const Block *A, const Block *v,
-                            const float beta, Block *out, Context *ctx) {
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
+void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
+                            const float beta, Tensor *out, Context *ctx) {
+  const float *APtr = static_cast<const float *>(A->block()->data());
+  const float *vPtr = static_cast<const float *>(v->block()->data());
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  auto trans = (A->strides())[0] != 1;  // infer transposition from the leading stride
+  const size_t m = A->shape()[0];
+  const size_t n = A->shape()[1];
   if (!trans) {
     cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
                 beta, outPtr, 1);
@@ -454,33 +890,147 @@ void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
   }
 }
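
The trans tests above infer transposition from strides alone: a logical transpose swaps strides and moves no data. Under the common row-major convention (this commit's Tensor may order its strides differently, e.g. fastest-varying axis first, which would flip the test, so treat this purely as an illustration):

    #include <utility>
    #include <vector>

    // Row-major strides for shape {m, n} are {n, 1}: step n elements to move
    // one row, 1 element to move one column.
    std::vector<int> RowMajorStrides(const std::vector<int>& shape) {
      std::vector<int> s(shape.size(), 1);
      for (int d = (int)shape.size() - 2; d >= 0; d--)
        s[d] = s[d + 1] * shape[d + 1];
      return s;
    }

    // A 2-D transpose just swaps the two strides; the data stays put.
    std::vector<int> TransposeStrides(std::vector<int> s) {
      std::swap(s[0], s[1]);
      return s;
    }
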
 
+// template <>
+// void GEMM<float, lang::Cpp>(//const bool transA, const bool transB,
+//                             const std::vector<int> stridesA, const std::vector<int> stridesB,
+//                             const size_t nrowA, const size_t ncolB,
+//                             const size_t ncolA, const float alpha,
+//                             const Tensor* A, const Tensor* B, const float beta,
+//                             Tensor* C, Context *ctx) {
+//   auto transA = (stridesA.back() == 1) ? true : false;
+//   auto transa = transA ? CblasTrans : CblasNoTrans;
+//   auto transB = (stridesB.back() == 1) ? true : false;
+//   auto transb = transB ? CblasTrans : CblasNoTrans;
+//   auto lda = transA ? nrowA : ncolA;
+//   auto ldb = transB ? ncolA : ncolB;
+//   auto ldc = ncolB;
+//   const float *APtr = static_cast<const float *>(A->data());
+//   const float *BPtr = static_cast<const float *>(B->data());
+//   float *CPtr = static_cast<float *>(C->mutable_data());
+//   cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
+//    lda, BPtr, ldb, beta, CPtr, ldc);
+// }
+
 template <>
-void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
-                            const size_t nrowA, const size_t ncolB,
-                            const size_t ncolA, const float alpha,
-                            const Block *A, const Block *B, const float beta,
-                            Block *C, Context *ctx) {
+void GEMM<float, lang::Cpp>(const float alpha,
+                            const Tensor *A, const Tensor *B, const float beta,
+                            Tensor *C, Context *ctx) {
+  auto transA = (A->strides())[0] != 1;
   auto transa = transA ? CblasTrans : CblasNoTrans;
+  auto transB = (B->strides())[0] != 1;
   auto transb = transB ? CblasTrans : CblasNoTrans;
+  const size_t nrowA = A->shape()[0];
+  const size_t ncolA = A->shape()[1];
+  const size_t ncolB = B->shape()[1];
   auto lda = transA ? nrowA : ncolA;
   auto ldb = transB ? ncolA : ncolB;
   auto ldc = ncolB;
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *BPtr = static_cast<const float *>(B->data());
-  float *CPtr = static_cast<float *>(C->mutable_data());
+  const float *APtr = static_cast<const float *>(A->block()->data());
+  const float *BPtr = static_cast<const float *>(B->block()->data());
+  float *CPtr = static_cast<float *>(C->block()->mutable_data());
   cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
-	  lda, BPtr, ldb, beta, CPtr, ldc);
+    lda, BPtr, ldb, beta, CPtr, ldc);
 }
 
 #else
 
+// template <>
+// void Amax<float, lang::Cpp>(const Tensor* in, size_t *out,
+//                             Context *ctx) {
+//   size_t maxPos = 0;
+//   float maxVal = 0;
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     if (i == 0) {
+//       maxVal = inPtr[i];
+//     } else if (inPtr[i] > maxVal) {
+//       maxVal = inPtr[i];
+//       maxPos = i;
+//     }
+//   }
+//   *out = maxPos;
+// }
+// template <>
+// void Amin<float, lang::Cpp>(const Tensor* in, size_t *out,
+//                             Context *ctx) {
+//   size_t minPos = 0;
+//   float minVal = 0;
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     if (i == 0) {
+//       minVal = inPtr[i];
+//     } else if (inPtr[i] > minVal) {
+//       minVal = inPtr[i];
+//       minPos = i;
+//     }
+//   }
+//   *out = minPos;
+// }
+
+// template <>
+// void Asum<float, lang::Cpp>(const Tensor* in, float *out,
+//                             Context *ctx) {
+//   float sum = 0;
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     sum += fabs(inPtr[i]);
+//   }
+// }
+
+// template <>
+// void Axpy<float, lang::Cpp>(const float alpha,
+//                             const Tensor* in, Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] += alpha * inPtr[i];
+//   }
+// }
+
+// template <>
+// void Scale<float, lang::Cpp>(const float x, Tensor* out,
+//                              Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     outPtr[i] *= x;
+//   }
+// }
+
+// template <>
+// void Dot<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+//                            float *out, Context *ctx) {
+//   float sum = 0;
+//   const float *in1Ptr = static_cast<const float *>(in1->data());
+//   const float *in2Ptr = static_cast<const float *>(in2->data());
+//   for (size_t i = 0; i < in->Size(); i++) {
+//     sum += in1Ptr[i] * in2Ptr[i];
+//   }
+// }
+
+// template <>
+// void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+//                             const float alpha, const Tensor* A, const Tensor* v,
+//                             const float beta, Tensor* out, Context *ctx) {
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   const float *APtr = static_cast<const float *>(A->data());
+//   const float *vPtr = static_cast<const float *>(v->data());
+//   for (size_t r = 0; r < m; r++) {
+//     float sum = 0;
+//     for (size_t c = 0; c < n; c++) {
+//       size_t idx = trans ? c * m + r : r * n + c;
+//       sum += APtr[idx] * vPtr[c];
+//     }
+//     outPtr[r] = alpha * sum + beta * outPtr[r];
+//   }
+// }
+
 template <>
-void Amax<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
+void Amax<float, lang::Cpp>(const Tensor *in, size_t *out,
                             Context *ctx) {
   size_t maxPos = 0;
   float maxVal = 0;
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  for (size_t i = 0; i < in->Size(); i++) { //not using strided traversal
     if (i == 0) {
       maxVal = inPtr[i];
     } else if (inPtr[i] > maxVal) {
@@ -491,12 +1041,12 @@ void Amax<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
   *out = maxPos;
 }
 template <>
-void Amin<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
+void Amin<float, lang::Cpp>(const Tensor *in, size_t *out,
                             Context *ctx) {
   size_t minPos = 0;
   float minVal = 0;
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  for (size_t i = 0; i < in->Size(); i++) { //not using strided traversal
     if (i == 0) {
       minVal = inPtr[i];
    } else if (inPtr[i] < minVal) {
@@ -508,52 +1058,67 @@ void Amin<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
 }
 
 template <>
-void Asum<float, lang::Cpp>(const size_t num, const Block *in, float *out,
+void Asum<float, lang::Cpp>(const Tensor *in, float *out,
                             Context *ctx) {
   float sum = 0;
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    sum += fabs(inPtr[i]);
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  for (size_t i = 0; i < in->Size(); i++) {
+    sum += fabs(inPtr[i]); //not using strided traversal
   }
+  *out = sum;
 }
 
 template <>
-void Axpy<float, lang::Cpp>(const size_t num, const float alpha,
-                            const Block *in, Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] += alpha * inPtr[i];
+void Axpy<float, lang::Cpp>(const float alpha,
+                            const Tensor *in, Tensor *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  vector<int> traversal_info = in->generate_traversal_info();
+  for (size_t i = 0; i < in->Size(); i++) { 
+    outPtr[i] += alpha * inPtr[traversal_info[in->shape().size()]];
+    in->traverse_next(traversal_info, i+1);
   }
 }
 
 template <>
-void Scale<float, lang::Cpp>(const size_t num, const float x, Block *out,
+void Scale<float, lang::Cpp>(const float x, Tensor *out,
                              Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  for (size_t i = 0; i < num; i++) {
-    outPtr[i] *= x;
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  for (size_t i = 0; i < out->Size(); i++) {
+    outPtr[i] *= x; //not using strided traversal
   }
 }
 
+// yisen todo: check the purpose of sum in this function
 template <>
-void Dot<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+void Dot<float, lang::Cpp>(const Tensor *in1, const Tensor *in2,
                            float *out, Context *ctx) {
   float sum = 0;
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
-  for (size_t i = 0; i < num; i++) {
-    sum += in1Ptr[i] * in2Ptr[i];
+  // const float *in1Ptr = static_cast<const float *>(in1->data());
+  // const float *in2Ptr = static_cast<const float *>(in2->data());
+  // for (size_t i = 0; i < in->Size(); i++) {
+  //   sum += in1Ptr[i] * in2Ptr[i]; 
+  // }
+  const float *in1Ptr = static_cast<const float *>(in1->block()->data());
+  const float *in2Ptr = static_cast<const float *>(in2->block()->data());
+  vector<int> traversal_info_in1 = in1->generate_traversal_info();
+  vector<int> traversal_info_in2 = in2->generate_traversal_info();
+  for (size_t i = 0; i < in1->Size(); i++) {
+    sum += in1Ptr[traversal_info_in1[in1->shape().size()]] * in2Ptr[traversal_info_in2[in2->shape().size()]];
+    in1->traverse_next(traversal_info_in1, i+1);
+    in2->traverse_next(traversal_info_in2, i+1);
   }
+  *out = sum;
 }
 
 template <>
-void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
-                            const float alpha, const Block *A, const Block *v,
-                            const float beta, Block *out, Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
+void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
+                            const float beta, Tensor *out, Context *ctx) {
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  const float *APtr = static_cast<const float *>(A->block()->data());
+  const float *vPtr = static_cast<const float *>(v->block()->data());
+  bool trans = ((A->strides())[0] != 1);
+  const size_t m = A->shape(0);
+  const size_t n = A->shape(1);
   for (size_t r = 0; r < m; r++) {
     float sum = 0;
     for (size_t c = 0; c < n; c++) {
@@ -564,6 +1129,7 @@ void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
   }
 }
 
+//yisen todo
 #endif  // USE_CBLAS
 template <>
 void ComputeCrossEntropy<float, lang::Cpp>(bool int_target,
@@ -626,16 +1192,35 @@ void SoftmaxCrossEntropyBwd<float, lang::Cpp>(bool int_target,
   }
 }
 
+// template <>
+// void RowMax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+//                               const Tensor* in, Tensor* out, Context *ctx) {
+//   const float *inPtr = static_cast<const float *>(in->data());
+//   float *outPtr = static_cast<float *>(out->mutable_data());
+//   for (size_t r = 0; r < nrow; r++) {
+//     int offset = (int)(r * ncol);
+//     float maxval = inPtr[offset];
+//     for (size_t c = 1; c < ncol; c++)
+//       maxval = (std::max)(maxval, inPtr[offset + c]);
+//     outPtr[r] = maxval;
+//   }
+// }
+
 template <>
-void RowMax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Block *in, Block *out, Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->data());
-  float *outPtr = static_cast<float *>(out->mutable_data());
+void RowMax<float, lang::Cpp>(const Tensor *in, Tensor *out, Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in->block()->data());
+  float *outPtr = static_cast<float *>(out->block()->mutable_data());
+  const size_t nrow = in->shape()[0];
+  const size_t ncol = in->shape()[1];
+  vector<int> traversal_info = in->generate_traversal_info();
+
   for (size_t r = 0; r < nrow; r++) {
-    int offset = (int)(r * ncol);
-    float maxval = inPtr[offset];
-    for (size_t c = 1; c < ncol; c++)
-      maxval = (std::max)(maxval, inPtr[offset + c]);
+    int counter_offset = (r * ncol);
+    float maxval = -FLT_MAX; //initialise below any real value so all-negative rows are handled correctly
+    for (size_t c = 0; c < ncol; c++){
+      maxval = (std::max)(maxval, inPtr[traversal_info[in->shape().size()]]);
+      in->traverse_next(traversal_info, counter_offset+c+1);
+    }
     outPtr[r] = maxval;
   }
 }
@@ -644,7 +1229,7 @@ void RowMax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 /*
 template <>
 void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Block *A, const Block *v, Block *out,
+                              const Tensor* A, const Tensor* v, Tensor* out,
                               Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *APtr = static_cast<const float *>(A->data());
@@ -659,7 +1244,7 @@ void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 
 template <>
 void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Block *A, const Block *v, Block *out,
+                              const Tensor* A, const Tensor* v, Tensor* out,
                               Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *APtr = static_cast<const float *>(A->data());
@@ -672,8 +1257,8 @@ void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
   }
 }
 template <>
-void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Block *in1,
-                             const Block *in2, Block *out, Context *ctx) {
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Tensor* in1,
+                             const Tensor* in2, Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *in1Ptr = static_cast<const float *>(in1->data());
   const float *in2Ptr = static_cast<const float *>(in2->data());
@@ -686,7 +1271,7 @@ void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Block *in1,
 }
 template <>
 void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Block *in, Block *out, Context *ctx) {
+                               const Tensor* in, Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   float *bPtr = new float[ncol];
@@ -707,7 +1292,7 @@ void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 
 template <>
 void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                                  const Block *in, Block *out, Context *ctx) {
+                                  const Tensor* in, Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t c = 0; c < ncol; c++) {
@@ -723,7 +1308,7 @@ void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 
 template <>
 void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Block *in, Block *out, Context *ctx) {
+                               const Tensor* in, Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
   const float *inPtr = static_cast<const float *>(in->data());
   for (size_t r = 0; r < nrow; r++) {

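For intuition, here is a minimal standalone sketch (plain C++, not the SINGA
API; the 2x3 view, the strides and the values are illustrative) of the strided
row-max that the reworked RowMax above computes:

#include <algorithm>
#include <cfloat>
#include <cstdio>

int main() {
  // Memory holds a 3x2 row-major matrix; we view it transposed as 2x3,
  // so element (r,c) of the view lives at offset r*1 + c*2.
  float data[6] = {1, 4, 2, 5, 3, 6};
  const int nrow = 2, ncol = 3;
  const int row_stride = 1, col_stride = 2;
  for (int r = 0; r < nrow; r++) {
    float maxval = -FLT_MAX;  // safe even if a row is all-negative
    for (int c = 0; c < ncol; c++)
      maxval = std::max(maxval, data[r * row_stride + c * col_stride]);
    std::printf("row %d max = %g\n", r, maxval);  // prints 3, then 6
  }
  return 0;
}
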
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a88efa00/src/proto/core.proto
----------------------------------------------------------------------
diff --git a/src/proto/core.proto b/src/proto/core.proto
index 9264e55..fd25607 100644
--- a/src/proto/core.proto
+++ b/src/proto/core.proto
@@ -50,19 +50,19 @@ enum CopyDirection {
 
 // configuration for device memory pool
 message MemPoolConf {
-	optional string type = 1 [default = "cnmem"];
-	// allocation size for each device, default is 256 MB
-	optional uint32 init_size = 2 [default = 256];
+  optional string type = 1 [default = "cnmem"];
+  // allocation size for each device, default is 256 MB
+  optional uint32 init_size = 2 [default = 256];
   // size limit in MB; report error/warning if this limit is reached.
   // 0 for unlimited memory, i.e., use as much memory as the device has
   // not used currently.
-	optional uint32 max_size = 3 [default = 0];
+  optional uint32 max_size = 3 [default = 0];
 
-	// memory manager flag for cnmem
-	// flag = 0: default flag
-	// flag = 1: prevent the manager from growing its memory consumption
-	// flag = 2: prevent the manager from stealing memory
-	optional uint32 flag = 11 [default = 0];
+  // memory manager flag for cnmem
+  // flag = 0: default flag
+  // flag = 1: prevent the manager from growing its memory consumption
+  // flag = 2: prevent the manager from stealing memory
+  optional uint32 flag = 11 [default = 0];
   repeated uint32 device = 12;
 }
 
@@ -70,7 +70,8 @@ message MemPoolConf {
 message TensorProto {
   repeated uint32 shape = 1;
   optional DataType data_type = 2;
-  optional bool transpose = 3;
+  //optional bool transpose = 3;
+  repeated int32 strides = 3;
   repeated float float_data = 4 [packed = true];
   repeated double double_data = 5 [packed = true];
   repeated int32 int_data = 6 [packed = true];
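
The strides field above replaces the single transpose flag, so a serialized
tensor can describe any strided view. A minimal sketch of filling it through
the generated protobuf C++ API (the header path and the 2x3 example are
assumptions, not part of this commit):

#include "singa/proto/core.pb.h"  // assumed location of the generated header

// Describe a 2x3 transposed view over row-major 3x2 storage:
// element (r,c) lives at offset r*1 + c*2.
singa::TensorProto MakeTransposedProto() {
  singa::TensorProto proto;
  proto.add_shape(2);
  proto.add_shape(3);
  proto.add_strides(1);  // stride along axis 0
  proto.add_strides(2);  // stride along axis 1
  return proto;
}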


[10/10] incubator-singa git commit: Merge branch 'pr367' into latest

Posted by wa...@apache.org.
Merge branch 'pr367' into latest


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/600f27ed
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/600f27ed
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/600f27ed

Branch: refs/heads/master
Commit: 600f27ede2bdf6cb6c1e502ec46bbf79f5bed243
Parents: 394d78d 3e2b75c
Author: Wang Wei <dc...@nus.edu.sg>
Authored: Sun May 13 23:26:07 2018 +0800
Committer: Wang Wei <dc...@nus.edu.sg>
Committed: Sun May 13 23:26:07 2018 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h        |  65 ++-
 src/core/tensor/tensor.cc          | 410 +++++++++------
 src/core/tensor/tensor_math.h      | 143 ++---
 src/core/tensor/tensor_math_cpp.h  | 817 +++++++++++++++++------------
 src/core/tensor/tensor_math_cuda.h | 898 +++++++++++++++++++++++++-------
 src/proto/core.proto               |  21 +-
 6 files changed, 1567 insertions(+), 787 deletions(-)
----------------------------------------------------------------------



[07/10] incubator-singa git commit: Streamline the tensor.h file by moving the respective member functions to the cpp or cuda files. Remove the shape_multipliers_ attribute from tensor.h. Change input tensors to be passed by reference instead of by pointer

Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c52e2aa3/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index d4cd5da..1ca312a 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -23,7 +23,6 @@
 #include "singa/core/common.h"
 #include "singa/core/tensor.h"
 #include <math.h>
-#include <vector>
 
 #ifdef USE_CBLAS
 #include <cblas.h>
@@ -31,80 +30,134 @@
 
 namespace singa {
 
-// template <>
-// void Abs<float, lang::Cpp>(const Tensor* in, Tensor* out,
-//                            Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = fabs(inPtr[i]);
-//   }
-// }
+// ===================== Helper Functions =============================
+
+//generate a traversal_info vector based on the tensor's shape for the traverse_next function to work
+vector<int> generate_traversal_info(const Tensor& x) {
+    vector<int> traversal_info = {};
+    for(size_t n=0; n<(x.shape().size()+2); ++n) {
+      traversal_info.push_back(0);
+    }
+    return traversal_info;
+};
+
+//generate shape multipliers
+//e.g. a tensor of shape (3,3) with strides (1,3) will have shape multipliers of (3,1)
+//e.g. a tensor of shape (3,3) with strides (3,1) will also have shape multipliers of (3,1)
+//this means that after every 3rd element traversed, the traversal jumps to the starting element of the next row
+//so we need to use the inner stride when stepping within a row, and the outer stride when jumping to the next row
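+//e.g. for shape (2,3,4) the cumulative products give multipliers (12,4,1):
+//every 4th element traversed starts a new row of the logical view, and every 12th a new 3x4 slice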
+vector<int> generate_shape_multipliers(const Tensor& x) {
+    Shape y_shape = x.shape();
+    if(y_shape.size()==0){
+      return {1};
+    }
+    vector<int> shape_multipliers = {1};
+    int cumulative_product = 1;
+
+    for (size_t n=0; n<(y_shape.size()-1); ++n) {
+        cumulative_product = cumulative_product*y_shape[y_shape.size()-1-n];
+        shape_multipliers.insert(shape_multipliers.begin(), cumulative_product);
+    }
+    return shape_multipliers;
+};
+
+// ******************************************************************************************
+// CPP traversal operations (works on const declarations without modifying tensor variables)
+// ******************************************************************************************
+
+//this function checks whether the next index falls on a special multiplier of the outer shape
+//so the algorithm knows when to jump over/back to a starting element of the outer shape
+//for e.g. in [[1,4,7], [2,5,8], [3,6,9]], elements 1,2,3 are the starting elements of their respective rows
+//this additional check only has 1 loop for 2d matrix
+//but runtime performance might degrade to O(nlog(n)) for higher dimensional tensors
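+//e.g. with shape multipliers {3,1}: counters 3, 6 and 9 give order 1 (jump using the outer stride),
+//while all other counters give order 0 (step using the innermost stride)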
+int determine_order(vector<int>& shape_multipliers, int counter) {
+    for (size_t n=0; n<(shape_multipliers.size()-1); ++n) {
+        if((counter%shape_multipliers[n])==0){
+            return ((shape_multipliers.size()) - 1 - n);
+        }
+    }
+    return 0;
+};
+
+//this function updates the base indexes with the current index after every single traversal step,
+//can be generalized beyond 2d cases
+void update_base_index(const Tensor& x, vector<int>& traversal_info) {
+    for (int n=0; n<(traversal_info[x.shape().size()+1]+1); ++n) {
+        traversal_info[n] = traversal_info[x.shape().size()];
+    }
+};
+
+//function to traverse a const strided tensor object
+//it requires an additional vector, traversal_info {0,0,0,0 ...}, comprising (x.shape().size()+2) elements of 0
+//for e.g. 2d matrix:
+//index 0 and 1 store the base row and column index respectively
+//index 2 stores the current index of the traversal
+//index 3 stores the order of the traversal for e.g. if the order is 0,
+//it means the next element can be navigated to using the innermost stride
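+//e.g. traversing a transposed 3x3 tensor (strides {1,3}) visits the underlying
+//data offsets 0,3,6, 1,4,7, 2,5,8, i.e. the logical view in row-major order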
+void traverse_next(const Tensor& x,
+                   vector<int>& shape_multipliers, 
+                   vector<int>& traversal_info,
+                   int counter) {
+
+    update_base_index(x, traversal_info);
+    traversal_info[x.shape().size()+1] = determine_order(shape_multipliers, counter);
+    traversal_info[x.shape().size()] = traversal_info[traversal_info[x.shape().size()+1]] + 
+                                                   x.strides()[x.strides().size()-traversal_info[x.shape().size()+1]-1];
+};
+
+template <typename DType>
+void TraverseUnary(const Tensor &in, Tensor* out, std::function<DType(DType)> func){
+  DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
+  const DType *inPtr = static_cast<const DType *>(in.block()->data());
+  vector<int> traversal_info = generate_traversal_info(in);
+  vector<int> shape_multipliers = generate_shape_multipliers(in);
+
+  for (size_t i = 0; i < in.Size(); i++) { 
+    outPtr[i] = func(inPtr[traversal_info[in.shape().size()]]);
+    traverse_next(in, shape_multipliers, traversal_info, i+1);
+  }
+}
+
+template <typename DType>
+void TraverseBinary(const Tensor &in1, const Tensor &in2, Tensor* out, 
+                    std::function<DType(DType, DType)> func){
+  DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
+  const DType *in1Ptr = static_cast<const DType *>(in1.block()->data());
+  const DType *in2Ptr = static_cast<const DType *>(in2.block()->data());
+  vector<int> traversal_info_in1 = generate_traversal_info(in1);
+  vector<int> traversal_info_in2 = generate_traversal_info(in2);
+  vector<int> shape_multipliers_in1 = generate_shape_multipliers(in1);
+  vector<int> shape_multipliers_in2 = generate_shape_multipliers(in2);
+
+  for (size_t i = 0; i < in1.Size(); i++) {
+    outPtr[i] = func(in1Ptr[traversal_info_in1[in1.shape().size()]],
+                     in2Ptr[traversal_info_in2[in2.shape().size()]]);
+    traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i+1);
+    traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i+1);
+  }
+}
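+
+// e.g. (illustrative, not part of this commit) a new unary op now reduces to a single lambda:
+//   template <>
+//   void Negate<float, lang::Cpp>(const Tensor& in, Tensor* out, Context* ctx) {
+//     TraverseUnary<float>(in, out, [](float x) { return -x; });
+//   }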
+
+// ******************************************************************************************
+// traversal operations end
+// ******************************************************************************************
+
+// ===================== CPP Functions =============================
 
 template <>
-void Abs<float, lang::Cpp>(const Tensor* in, Tensor* out, Context *ctx) {
+void Abs<float, lang::Cpp>(const Tensor& in, Tensor* out, Context *ctx) {
   TraverseUnary<float>(in, out, [](float x) {return fabs(x);});
 }
 
-// template <>
-// void Add<float, lang::Cpp>(const Tensor* in, const float x,
-//                            Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = inPtr[i] + x;
-//   }
-// }
-
-// template <>
-// void Add<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->block()->data());
-//   vector<int> traversal_info = in->generate_traversal_info();
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = inPtr[traversal_info[in->shape().size()]] + x;
-//     in->traverse_next(traversal_info, i+1);
-//   }
-// }
-
 template <>
-void Add<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out, Context *ctx) {
+void Add<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out, Context *ctx) {
   auto add_lambda = [&x](float a) {
     return (a+x);
   };
   TraverseUnary<float>(in, out, add_lambda);
 }
 
-// template <>
-// void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                            Tensor* out, Context *ctx) {
-//   // CHECK_EQ(ctx->stream, nullptr);
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *in1Ptr = static_cast<const float *>(in1->data());
-//   const float *in2Ptr = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = in1Ptr[i] + in2Ptr[i];
-//   }
-// }
-
-// template <>
-// void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, Context *ctx) {
-//   // CHECK_EQ(ctx->stream, nullptr);
-//   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-//   const float *in1Ptr = static_cast<const float *>(in1->block()->data());
-//   const float *in2Ptr = static_cast<const float *>(in2->block()->data());
-//   //call axpy if both strides are 1?
-//   vector<int> traversal_info_in1 = in1->generate_traversal_info();
-//   vector<int> traversal_info_in2 = in2->generate_traversal_info();
-//   for (size_t i = 0; i < in1->Size(); i++) {
-//     outPtr[i] = in1Ptr[traversal_info_in1[in1->shape().size()]] + in2Ptr[traversal_info_in2[in2->shape().size()]];
-//     in1->traverse_next(traversal_info_in1, i+1);
-//     in2->traverse_next(traversal_info_in2, i+1);
-//   }
-// }
-
 template <>
-void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, Context *ctx) {
+void Add<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out, Context *ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
   auto add_lambda_binary = [](float a, float b) {
     return (a+b);
@@ -113,46 +166,9 @@ void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, Co
   
 }
 
-// template <>
-// void Clamp<float, lang::Cpp>(const float low,
-//                              const float high, const Tensor* in, Tensor* out,
-//                              Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     if (inPtr[i] > high) {
-//       outPtr[i] = high;
-//     } else if (inPtr[i] < low) {
-//       outPtr[i] = low;
-//     } else {
-//       outPtr[i] = inPtr[i];
-//     }
-//   }
-// }
-
-// template <>
-// void Clamp<float, lang::Cpp>(const Tensor* in, const float low,
-//                              const float high, Tensor* out,
-//                              Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->block()->data());
-//   vector<int> traversal_info = in->generate_traversal_info();
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     int traversed_index = traversal_info[in->shape().size()];
-//     if (inPtr[traversed_index] > high) {
-//       outPtr[i] = high;
-//     } else if (inPtr[traversed_index] < low) {
-//       outPtr[i] = low;
-//     } else {
-//       outPtr[i] = inPtr[traversed_index];
-//     }
-//     in->traverse_next(traversal_info, i+1);
-//   }
-// }
-
 template <>
 void Clamp<float, lang::Cpp>(const float low, const float high,
-                             const Tensor* in, Tensor* out,
+                             const Tensor& in, Tensor* out,
                              Context *ctx) {
   auto clamp_lambda = [&low, &high](float a) {
     if(a < low){return low;}
@@ -162,73 +178,42 @@ void Clamp<float, lang::Cpp>(const float low, const float high,
   TraverseUnary<float>(in, out, clamp_lambda);
 }
 
-
-// template <>
-// void Div<float, lang::Cpp>(const float x, const Tensor* in,
-//                            Tensor* out, Context *ctx) {
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     CHECK_NE(inPtr[i], 0.f);
-//     outPtr[i] = x / inPtr[i];
-//   }
-// }
-
 template <>
-void Div<float, lang::Cpp>(const float x, const Tensor* in, Tensor* out,
+void Div<float, lang::Cpp>(const float x, const Tensor& in, Tensor* out,
                            Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->block()->data());
+  const float *inPtr = static_cast<const float *>(in.block()->data());
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  vector<int> traversal_info = in->generate_traversal_info();
-  for (size_t i = 0; i < in->Size(); i++) {
-    CHECK_NE(inPtr[traversal_info[in->shape().size()]], 0.f);
-    outPtr[i] = x / inPtr[traversal_info[in->shape().size()]];
-    in->traverse_next(traversal_info, i+1);
+  vector<int> traversal_info = generate_traversal_info(in);
+  vector<int> shape_multipliers = generate_shape_multipliers(in);
+
+  for (size_t i = 0; i < in.Size(); i++) {
+    CHECK_NE(inPtr[traversal_info[in.shape().size()]], 0.f);
+    outPtr[i] = x / inPtr[traversal_info[in.shape().size()]];
+    traverse_next(in, shape_multipliers, traversal_info, i+1);
   }
 }
 
-
-// template <>
-// void Div<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                            Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *in1Ptr = static_cast<const float *>(in1->data());
-//   const float *in2Ptr = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     CHECK_NE(in2Ptr[i], 0.f);
-//     outPtr[i] = in1Ptr[i] / in2Ptr[i];
-//   }
-// }
-
 template <>
-void Div<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+void Div<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->block()->data());
-  const float *in2Ptr = static_cast<const float *>(in2->block()->data());
-  vector<int> traversal_info_in1 = in1->generate_traversal_info();
-  vector<int> traversal_info_in2 = in2->generate_traversal_info();
-  for (size_t i = 0; i < in1->Size(); i++) {
-    CHECK_NE(in2Ptr[traversal_info_in2[in2->shape().size()]], 0.f);
-    outPtr[i] = in1Ptr[traversal_info_in1[in1->shape().size()]] / in2Ptr[traversal_info_in2[in2->shape().size()]];
-    in1->traverse_next(traversal_info_in1, i+1);
-    in2->traverse_next(traversal_info_in2, i+1);
+  const float *in1Ptr = static_cast<const float *>(in1.block()->data());
+  const float *in2Ptr = static_cast<const float *>(in2.block()->data());
+  vector<int> traversal_info_in1 = generate_traversal_info(in1);
+  vector<int> traversal_info_in2 = generate_traversal_info(in2);
+  vector<int> shape_multipliers_in1 = generate_shape_multipliers(in1);
+  vector<int> shape_multipliers_in2 = generate_shape_multipliers(in2);
+
+  for (size_t i = 0; i < in1.Size(); i++) {
+    CHECK_NE(in2Ptr[traversal_info_in2[in2.shape().size()]], 0.f);
+    outPtr[i] = in1Ptr[traversal_info_in1[in1.shape().size()]] / in2Ptr[traversal_info_in2[in2.shape().size()]];
+    traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i+1);
+    traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i+1);
   }
 }
 
-
-// template <>
-// void EltwiseMult<float, lang::Cpp>(const Tensor* in,
-//                                    const float x, Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = inPtr[i] * x;
-//   }
-// }
-
 template <>
-void EltwiseMult<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+void EltwiseMult<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
                                    Context *ctx) {
   auto eltwisemult_lambda = [&x](float a) {
     return (a*x);
@@ -236,19 +221,8 @@ void EltwiseMult<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
   TraverseUnary<float>(in, out, eltwisemult_lambda);
 }
 
-// template <>
-// void EltwiseMult<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, 
-//                                    Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *in1Ptr = static_cast<const float *>(in1->data());
-//   const float *in2Ptr = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = in1Ptr[i] * in2Ptr[i];
-//   }
-// }
-
 template <>
-void EltwiseMult<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, 
+void EltwiseMult<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out, 
                                    Context *ctx) {
   auto eltwisemult_lambda_binary = [](float a, float b) {
     return (a*b);
@@ -256,33 +230,13 @@ void EltwiseMult<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor*
   TraverseBinary<float>(in1, in2, out, eltwisemult_lambda_binary);
 }
 
-// template <>
-// void Exp<float, lang::Cpp>(const Tensor* in, Tensor* out,
-//                            Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = exp(inPtr[i]);
-//   }
-// }
-
 template <>
-void Exp<float, lang::Cpp>(const Tensor* in, Tensor *out, Context *ctx) {
+void Exp<float, lang::Cpp>(const Tensor& in, Tensor *out, Context *ctx) {
   TraverseUnary<float>(in, out, [](float x) {return exp(x);});
 }
 
-// template <>
-// void GE<float, lang::Cpp>(const Tensor* in, const float x,
-//                           Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
-//   }
-// }
-
 template <>
-void GE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+void GE<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
                           Context *ctx) {
   auto ge_lambda = [&x](float a) {
     return (a >= x) ? 1.f : 0.f;
@@ -290,19 +244,8 @@ void GE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
   TraverseUnary<float>(in, out, ge_lambda);
 }
 
-// template <>
-// void GE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                           Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr1 = static_cast<const float *>(in1->data());
-//   const float *inPtr2 = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr1[i] >= inPtr2[i]) ? 1.f : 0.f;
-//   }
-// }
-
 template <>
-void GE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+void GE<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
                           Context *ctx) {
   auto ge_lambda_binary = [](float a, float b) {
     return (a >= b) ? 1.f : 0.f;
@@ -310,18 +253,8 @@ void GE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
   TraverseBinary<float>(in1, in2, out, ge_lambda_binary);
 }
 
-// template <>
-// void GT<float, lang::Cpp>(const Tensor* in, const float x,
-//                           Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
-//   }
-// }
-
 template <>
-void GT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+void GT<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
                           Context *ctx) {
   auto gt_lambda = [&x](float a) {
     return (a > x) ? 1.f : 0.f;
@@ -329,19 +262,8 @@ void GT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
   TraverseUnary<float>(in, out, gt_lambda);
 }
 
-// template <>
-// void GT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                           Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr1 = static_cast<const float *>(in1->data());
-//   const float *inPtr2 = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr1[i] > inPtr2[i]) ? 1.f : 0.f;
-//   }
-// }
-
 template <>
-void GT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+void GT<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
                           Context *ctx) {
   auto gt_lambda_binary = [](float a, float b) {
     return (a > b) ? 1.f : 0.f;
@@ -349,18 +271,8 @@ void GT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
   TraverseBinary<float>(in1, in2, out, gt_lambda_binary);
 }
 
-// template <>
-// void LE<float, lang::Cpp>(const Tensor* in, const float x,
-//                           Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
-//   }
-// }
-
 template <>
-void LE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+void LE<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
                           Context *ctx) {
   auto le_lambda = [&x](float a) {
     return (a <= x) ? 1.f : 0.f;
@@ -368,19 +280,8 @@ void LE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
   TraverseUnary<float>(in, out, le_lambda);
 }
 
-// template <>
-// void LE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                           Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr1 = static_cast<const float *>(in1->data());
-//   const float *inPtr2 = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr1[i] <= inPtr2[i]) ? 1.f : 0.f;
-//   }
-// }
-
 template <>
-void LE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+void LE<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
                           Context *ctx) {
   auto le_lambda_binary = [](float a, float b) {
     return (a <= b) ? 1.f : 0.f;
@@ -388,42 +289,23 @@ void LE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
   TraverseBinary<float>(in1, in2, out, le_lambda_binary);
 }
 
-// template <>
-// void Log<float, lang::Cpp>(const Tensor* in, Tensor* out,
-//                            Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     CHECK_GT(inPtr[i], 0.f);
-//     outPtr[i] = log(inPtr[i]);
-//   }
-// }
-
 template <>
-void Log<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void Log<float, lang::Cpp>(const Tensor& in, Tensor* out,
                            Context *ctx) {
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  vector<int> traversal_info = in->generate_traversal_info();
-  for (size_t i = 0; i < in->Size(); i++) {
-    CHECK_GT(inPtr[traversal_info[in->shape().size()]], 0.f);
-    outPtr[i] = log(inPtr[traversal_info[in->shape().size()]]);
-    in->traverse_next(traversal_info, i+1);
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  vector<int> traversal_info = generate_traversal_info(in);
+  vector<int> shape_multipliers = generate_shape_multipliers(in);
+
+  for (size_t i = 0; i < in.Size(); i++) {
+    CHECK_GT(inPtr[traversal_info[in.shape().size()]], 0.f);
+    outPtr[i] = log(inPtr[traversal_info[in.shape().size()]]);
+    traverse_next(in, shape_multipliers, traversal_info, i+1);
   }
 }
 
-// template <>
-// void LT<float, lang::Cpp>(const Tensor* in, const float x,
-//                           Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
-//   }
-// }
-
 template <>
-void LT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+void LT<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
                           Context *ctx) {
   auto lt_lambda = [&x](float a) {
     return (a < x) ? 1.f : 0.f;
@@ -431,19 +313,9 @@ void LT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
   TraverseUnary<float>(in, out, lt_lambda);
 }
 
-// template <>
-// void LT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                           Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr1 = static_cast<const float *>(in1->data());
-//   const float *inPtr2 = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr1[i] < inPtr2[i]) ? 1.f : 0.f;
-//   }
-// }
 
 template <>
-void LT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+void LT<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
                           Context *ctx) {
   auto lt_lambda_binary = [](float a, float b) {
     return (a < b) ? 1.f : 0.f;
@@ -451,34 +323,13 @@ void LT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
   TraverseBinary<float>(in1, in2, out, lt_lambda_binary);
 }
 
-// template <>
-// void Pow<float, lang::Cpp>(const Tensor* in, const float x,
-//                            Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = pow(inPtr[i], x);
-//   }
-// }
-
 template <>
-void Pow<float, lang::Cpp>(const Tensor* in, const float x, Tensor *out, Context *ctx) {
+void Pow<float, lang::Cpp>(const Tensor& in, const float x, Tensor *out, Context *ctx) {
   TraverseUnary<float>(in, out, [x](float y) {return pow(y,x);});
 }
 
-// template <>
-// void Pow<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                            Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *in1Ptr = static_cast<const float *>(in1->data());
-//   const float *in2Ptr = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
-//   }
-// }
-
 template <>
-void Pow<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+void Pow<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
                            Context *ctx) {
   auto pow_lambda_binary = [](float a, float b) {
     return pow(a,b);
@@ -486,18 +337,8 @@ void Pow<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
   TraverseBinary<float>(in1, in2, out, pow_lambda_binary);
 }
 
-// template <>
-// void ReLU<float, lang::Cpp>(const Tensor* in, Tensor* out,
-//                             Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
-//   }
-// }
-
 template <>
-void ReLU<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void ReLU<float, lang::Cpp>(const Tensor& in, Tensor* out,
                           Context *ctx) {
   auto relu_lambda = [](float a) {
     return (a >= 0.f) ? a : 0.f;
@@ -505,13 +346,6 @@ void ReLU<float, lang::Cpp>(const Tensor* in, Tensor* out,
   TraverseUnary<float>(in, out, relu_lambda);
 }
 
-// template <>
-// void Set<float, lang::Cpp>(const float x, Tensor* out,
-//                            Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   for (size_t i = 0; i < in->Size(); i++) outPtr[i] = x;
-// }
-
 template <>
 void Set<float, lang::Cpp>(const float x, Tensor* out,
                            Context *ctx) {
@@ -519,13 +353,6 @@ void Set<float, lang::Cpp>(const float x, Tensor* out,
   for (size_t i = 0; i < out->Size(); i++) outPtr[i] = x;
 }
 
-// template <>
-// void Set<int, lang::Cpp>(const int x, Tensor* out,
-//                            Context *ctx) {
-//   int *outPtr = static_cast<int *>(out->mutable_data());
-//   for (size_t i = 0; i < in->Size(); i++) outPtr[i] = x;
-// }
-
 template <>
 void Set<int, lang::Cpp>(const int x, Tensor* out,
                            Context *ctx) {
@@ -533,18 +360,8 @@ void Set<int, lang::Cpp>(const int x, Tensor* out,
   for (size_t i = 0; i < out->Size(); i++) outPtr[i] = x;
 }
 
-// template <>
-// void Sigmoid<float, lang::Cpp>(const Tensor* in, Tensor* out,
-//                                Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
-//   }
-// }
-
 template <>
-void Sigmoid<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void Sigmoid<float, lang::Cpp>(const Tensor& in, Tensor* out,
                           Context *ctx) {
   auto sigmoid_lambda = [](float a) {
     return 1.f / (1.f + exp(-a));
@@ -552,18 +369,8 @@ void Sigmoid<float, lang::Cpp>(const Tensor* in, Tensor* out,
   TraverseUnary<float>(in, out, sigmoid_lambda);
 }
 
-// template <>
-// void Sign<float, lang::Cpp>(const Tensor* in, Tensor* out,
-//                             Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = (inPtr[i] > 0) - (inPtr[i] < 0);
-//   }
-// }
-
 template <>
-void Sign<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void Sign<float, lang::Cpp>(const Tensor& in, Tensor* out,
                           Context *ctx) {
   auto sign_lambda = [](float a) {
     return (a > 0) - (a < 0);
@@ -571,56 +378,23 @@ void Sign<float, lang::Cpp>(const Tensor* in, Tensor* out,
   TraverseUnary<float>(in, out, sign_lambda);
 }
 
-// template <>
-// void Sqrt<float, lang::Cpp>(const Tensor* in, Tensor* out,
-//                             Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     CHECK_GE(inPtr[i], 0.f);
-//     outPtr[i] = sqrt(inPtr[i]);
-//   }
-// }
-
 template <>
-void Sqrt<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void Sqrt<float, lang::Cpp>(const Tensor& in, Tensor* out,
                             Context *ctx) {
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  vector<int> traversal_info = in->generate_traversal_info();
-  for (size_t i = 0; i < in->Size(); i++) {
-    CHECK_GE(inPtr[traversal_info[in->shape().size()]], 0.f);
-    outPtr[i] = sqrt(inPtr[traversal_info[in->shape().size()]]);
-    in->traverse_next(traversal_info, i+1);
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  vector<int> traversal_info = generate_traversal_info(in);
+  vector<int> shape_multipliers = generate_shape_multipliers(in);
+
+  for (size_t i = 0; i < in.Size(); i++) {
+    CHECK_GE(inPtr[traversal_info[in.shape().size()]], 0.f);
+    outPtr[i] = sqrt(inPtr[traversal_info[in.shape().size()]]);
+    traverse_next(in, shape_multipliers, traversal_info, i+1);
   }
 }
 
-/*
 template <>
-void Square<float, lang::Cpp>(const Tensor* in, Tensor* out,
-                              Context *ctx) {
-  float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
-  for (size_t i = 0; i < in->Size(); i++) {
-    outPtr[i] = inPtr[i] * inPtr[i];
-  }
-}
-*/
-
-// template <>
-// void Sub<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                            Tensor* out, Context *ctx) {
-//   // CHECK_EQ(ctx->stream, nullptr);
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *in1Ptr = static_cast<const float *>(in1->data());
-//   const float *in2Ptr = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = in1Ptr[i] - in2Ptr[i];
-//   }
-// }
-
-template <>
-void Sub<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+void Sub<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context *ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
   auto sub_lambda_binary = [](float a, float b) {
@@ -632,28 +406,18 @@ void Sub<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
 // sum all elements of input into out
 // TODO(wangwei) optimize using omp
 template <>
-void Sum<float, lang::Cpp>(const Tensor* in, float *out,
+void Sum<float, lang::Cpp>(const Tensor& in, float *out,
                            Context *ctx) {
   float s = 0.f;
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  for (size_t i = 0; i < in->Size(); i++) {
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  for (size_t i = 0; i < in.Size(); i++) {
     s += inPtr[i];
   }
   *out = s;
 }
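+//note: Sum reads the block in raw memory order with no strided traversal;
+//the result matches a strided traversal since addition is order-independent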
 
-// template <>
-// void Tanh<float, lang::Cpp>(const Tensor* in, Tensor* out,
-//                             Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = tanh(inPtr[i]);
-//   }
-// }
-
 template <>
-void Tanh<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void Tanh<float, lang::Cpp>(const Tensor& in, Tensor* out,
                           Context *ctx) {
   auto tanh_lambda = [](float a) {
     return tanh(a);
@@ -661,17 +425,6 @@ void Tanh<float, lang::Cpp>(const Tensor* in, Tensor* out,
   TraverseUnary<float>(in, out, tanh_lambda);
 }
 
-// ===============Random operations==========================================
-// template <>
-// void Bernoulli<float, lang::Cpp>(const float p, Tensor* out,
-//                                  Context *ctx) {
-//   std::bernoulli_distribution distribution(p);
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
-//   }
-// }
-
 template <>
 void Bernoulli<float, lang::Cpp>(const float p, Tensor* out,
                                  Context *ctx) {
@@ -682,16 +435,6 @@ void Bernoulli<float, lang::Cpp>(const float p, Tensor* out,
   }
 }
 
-// template <>
-// void Gaussian<float, lang::Cpp>(const float mean,
-//                                 const float std, Tensor* out, Context *ctx) {
-//   std::normal_distribution<float> distribution(mean, std);
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
-//   }
-// }
-
 template <>
 void Gaussian<float, lang::Cpp>(const float mean,
                                 const float std, Tensor* out, Context *ctx) {
@@ -702,16 +445,6 @@ void Gaussian<float, lang::Cpp>(const float mean,
   }
 }
 
-// template <>
-// void Uniform<float, lang::Cpp>(const float low,
-//                                const float high, Tensor* out, Context *ctx) {
-//   std::uniform_real_distribution<float> distribution(low, high);
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
-//   }
-// }
-
 template <>
 void Uniform<float, lang::Cpp>(const float low,
                                const float high, Tensor* out, Context *ctx) {
@@ -727,113 +460,72 @@ void Uniform<float, lang::Cpp>(const float low,
 //warning, this function may overwrite block M in place (out can share M's block)
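+//side_right == true computes M * diag(v), scaling column c of M by vPtr[c];
+//side_right == false computes diag(v) * M, scaling row r of M by vPtr[r]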
 template <>
 void DGMM<float, lang::Cpp>(const bool side_right,
-                            const Tensor* M, const Tensor* v,
+                            const Tensor& M, const Tensor& v,
                             Tensor* out, Context *ctx) {
-  const float *MPtr = static_cast<const float *>(M->block()->data());
-  const float *vPtr = static_cast<const float *>(v->block()->data());
+  const float *MPtr = static_cast<const float *>(M.block()->data());
+  const float *vPtr = static_cast<const float *>(v.block()->data());
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const size_t nrow = M->shape(0);
-  const size_t ncol = M->shape(1);
-  vector<int> traversal_info = M->generate_traversal_info();
+  const size_t nrow = M.shape(0);
+  const size_t ncol = M.shape(1);
+  vector<int> traversal_info = generate_traversal_info(M);
+  vector<int> shape_multipliers = generate_shape_multipliers(M);
 
   if (side_right) {
     for (size_t r = 0; r < nrow; r++) {
       size_t offset = r * ncol;
       for (size_t c = 0; c < ncol; c++) {
-        outPtr[traversal_info[M->shape().size()]] = MPtr[traversal_info[M->shape().size()]] * vPtr[c];
-        M->traverse_next(traversal_info, offset+c+1);
+        outPtr[traversal_info[M.shape().size()]] = MPtr[traversal_info[M.shape().size()]] * vPtr[c];
+        traverse_next(M, shape_multipliers, traversal_info, offset+c+1);
       }
     }
   } else {
     for (size_t r = 0; r < nrow; r++) {
       size_t offset = r * ncol;
       for (size_t c = 0; c < ncol; c++) {
-        outPtr[traversal_info[M->shape().size()]] = MPtr[traversal_info[M->shape().size()]] * vPtr[r];
-        M->traverse_next(traversal_info, offset+c+1);
+        outPtr[traversal_info[M.shape().size()]] = MPtr[traversal_info[M.shape().size()]] * vPtr[r];
+        traverse_next(M, shape_multipliers, traversal_info, offset+c+1);
       }
     }
   }
 }
 
-// #ifdef USE_CBLAS
-// template <>
-// void Amax<float, lang::Cpp>(const Tensor* in, size_t *out,
-//                             Context *ctx) {
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   *out = cblas_isamax(in->Size(), inPtr, 1);
-// }
-
-// template <>
-// void Asum<float, lang::Cpp>(const Tensor* in, float *out,
-//                             Context *ctx) {
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   *out = cblas_sasum(in->Size(), inPtr, 1);
-// }
-
-// template <>
-// void Axpy<float, lang::Cpp>(const float alpha,
-//                             const Tensor* in, Tensor* out, Context *ctx) {
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   cblas_saxpy(in->Size(), alpha, inPtr, 1, outPtr, 1);
-// }
-
-// template <>
-// void Dot<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                            float *out, Context *ctx) {
-//   const float *in1Ptr = static_cast<const float *>(in1->data());
-//   const float *in2Ptr = static_cast<const float *>(in2->data());
-//   *out = cblas_sdot(in->Size(), in1Ptr, 1, in2Ptr, 1);
-// }
-// template <>
-// void Scale<float, lang::Cpp>(const float x, Tensor* out,
-//                              Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   cblas_sscal(in->Size(), x, outPtr, 1);
-// }
-// template <>
-// void Nrm2<float, lang::Cpp>(const Tensor* in, float *out,
-//                             Context *ctx) {
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   *out = cblas_snrm2(in->Size(), inPtr, 1);
-// }
 
 #ifdef USE_CBLAS
 template <>
-void Amax<float, lang::Cpp>(const Tensor *in, size_t *out,
+void Amax<float, lang::Cpp>(const Tensor& in, size_t *out,
                             Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  *out = cblas_isamax(in->Size(), inPtr, 1); //not using strided traversal
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  *out = cblas_isamax(in.Size(), inPtr, 1); //not using strided traversal
 }
 
 template <>
-void Asum<float, lang::Cpp>(const Tensor *in, float *out,
+void Asum<float, lang::Cpp>(const Tensor& in, float *out,
                             Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  *out = cblas_sasum(in->Size(), inPtr, 1); //not using strided traversal
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  *out = cblas_sasum(in.Size(), inPtr, 1); //not using strided traversal
 }
 
 template <>
 void Axpy<float, lang::Cpp>(const float alpha,
-                            const Tensor *in, Tensor *out, Context *ctx) {
+                            const Tensor& in, Tensor *out, Context *ctx) {
   //check input tensor for strides first
-  if(in->strides() != out->strides()){
-    const float *inPtr = static_cast<const float *>(in->block()->data());
+  if(in.strides() == out->strides()){
+    const float *inPtr = static_cast<const float *>(in.block()->data());
     float *outPtr = static_cast<float *>(out->block()->mutable_data());
-    cblas_saxpy(in->Size(), alpha, inPtr, 1, outPtr, 1);
+    cblas_saxpy(in.Size(), alpha, inPtr, 1, outPtr, 1);
   } else {
     LOG(FATAL) << "Axpy, input and output strides do not match." ;
   }
 }
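+//the fast path requires matching strides because cblas_saxpy walks both
+//buffers with unit increments; mismatched layouts fall through to LOG(FATAL)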
 
 template <>
-void Dot<float, lang::Cpp>(const Tensor *in1, const Tensor *in2,
+void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
                            float *out, Context *ctx) {
   //check input tensor for strides first
-  if(!(in1->transpose()) && !(in2->transpose())){
-    const float *in1Ptr = static_cast<const float *>(in1->block()->data());
-    const float *in2Ptr = static_cast<const float *>(in2->block()->data());
-    *out = cblas_sdot(in1->Size(), in1Ptr, 1, in2Ptr, 1);
+  if(!(in1.transpose()) && !(in2.transpose())){
+    const float *in1Ptr = static_cast<const float *>(in1.block()->data());
+    const float *in2Ptr = static_cast<const float *>(in2.block()->data());
+    *out = cblas_sdot(in1.Size(), in1Ptr, 1, in2Ptr, 1);
   } else {
     LOG(FATAL) << "Dot, one of the input is tranposed. Not implemented yet." ;
   }
@@ -847,40 +539,21 @@ void Scale<float, lang::Cpp>(const float x, Tensor *out,
 }
 
 template <>
-void Nrm2<float, lang::Cpp>(const Tensor *in, float *out,
+void Nrm2<float, lang::Cpp>(const Tensor& in, float *out,
                             Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  *out = cblas_snrm2(in->Size(), inPtr, 1); //not using strided traversal
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  *out = cblas_snrm2(in.Size(), inPtr, 1); //not using strided traversal
 }
 
-// template <>
-// void GEMV<float, lang::Cpp>(//bool trans,
-//                             const std::vector<int> stridesA,
-//                             const size_t m, const size_t n,
-//                             const float alpha, const Tensor* A, const Tensor* v,
-//                             const float beta, Tensor* out, Context *ctx) {
-//   const float *APtr = static_cast<const float *>(A->data());
-//   const float *vPtr = static_cast<const float *>(v->data());
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   auto trans = (stridesA.back() == 1) ? true : false;
-//   if (!trans) {
-//     cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
-//                 beta, outPtr, 1);
-//   } else {
-//     cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta,
-//                 outPtr, 1);
-//   }
-// }
-
 template <>
-void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
+void GEMV<float, lang::Cpp>(const float alpha, const Tensor& A, const Tensor& v,
                             const float beta, Tensor *out, Context *ctx) {
-  const float *APtr = static_cast<const float *>(A->block()->data());
-  const float *vPtr = static_cast<const float *>(v->block()->data());
+  const float *APtr = static_cast<const float *>(A.block()->data());
+  const float *vPtr = static_cast<const float *>(v.block()->data());
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const size_t m = A->shape()[0];
-  const size_t n = A->shape()[1];
-  if (A->transpose()) {
+  const size_t m = A.shape()[0];
+  const size_t n = A.shape()[1];
+  if (A.transpose()) {
     cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta,
                 outPtr, 1);
   } else {
@@ -889,147 +562,36 @@ void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
   }
 }
 
-// template <>
-// void GEMM<float, lang::Cpp>(//const bool transA, const bool transB,
-//                             const std::vector<int> stridesA, const std::vector<int> stridesB,
-//                             const size_t nrowA, const size_t ncolB,
-//                             const size_t ncolA, const float alpha,
-//                             const Tensor* A, const Tensor* B, const float beta,
-//                             Tensor* C, Context *ctx) {
-//   auto transA = (stridesA.back() == 1) ? true : false;
-//   auto transa = transA ? CblasTrans : CblasNoTrans;
-//   auto transB = (stridesB.back() == 1) ? true : false;
-//   auto transb = transB ? CblasTrans : CblasNoTrans;
-//   auto lda = transA ? nrowA : ncolA;
-//   auto ldb = transB ? ncolA : ncolB;
-//   auto ldc = ncolB;
-//   const float *APtr = static_cast<const float *>(A->data());
-//   const float *BPtr = static_cast<const float *>(B->data());
-//   float *CPtr = static_cast<float *>(C->mutable_data());
-//   cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
-//    lda, BPtr, ldb, beta, CPtr, ldc);
-// }
-
 template <>
 void GEMM<float, lang::Cpp>(const float alpha,
-                            const Tensor *A, const Tensor *B, const float beta,
+                            const Tensor& A, const Tensor& B, const float beta,
                             Tensor *C, Context *ctx) {
-  auto transA = A->transpose();
+  auto transA = A.transpose();
   auto transa = transA ? CblasTrans : CblasNoTrans;
-  auto transB = B->transpose();
+  auto transB = B.transpose();
   auto transb = transB ? CblasTrans : CblasNoTrans;
-  const size_t nrowA = A->shape()[0];
-  const size_t ncolA = A->shape()[1];
-  const size_t ncolB = B->shape()[1];
+  const size_t nrowA = A.shape()[0];
+  const size_t ncolA = A.shape()[1];
+  const size_t ncolB = B.shape()[1];
   auto lda = transA ? nrowA : ncolA;
   auto ldb = transB ? ncolA : ncolB;
   auto ldc = ncolB;
-  const float *APtr = static_cast<const float *>(A->block()->data());
-  const float *BPtr = static_cast<const float *>(B->block()->data());
+  const float *APtr = static_cast<const float *>(A.block()->data());
+  const float *BPtr = static_cast<const float *>(B.block()->data());
   float *CPtr = static_cast<float *>(C->block()->mutable_data());
   cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
     lda, BPtr, ldb, beta, CPtr, ldc);
 }
 
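A note on the leading-dimension bookkeeping in the GEMV and GEMM branches above: a transposed Tensor only swaps its logical shape and strides, so the underlying buffer keeps the row length of the matrix as stored, and lda must come from the stored layout (hence lda = nrowA when transA is set). A minimal standalone sketch of the same mapping, assuming a CBLAS installation; the matrices and values are illustrative only and not part of the patch:

    // Computes C = A^T * B with A stored row-major as 2x3, so the
    // logical A^T is 3x2 while lda stays 3, the stored row length.
    #include <cblas.h>
    #include <cstdio>

    int main() {
      const float A[6] = {1, 2, 3,
                          4, 5, 6};   // stored 2x3
      const float B[4] = {1, 0,
                          0, 1};      // 2x2 identity, so C == A^T
      float C[6] = {0};
      // M=3, N=2, K=2; lda=3 matches "lda = transA ? nrowA : ncolA".
      cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                  3, 2, 2, 1.0f, A, 3, B, 2, 0.0f, C, 2);
      for (int r = 0; r < 3; r++)
        printf("%g %g\n", C[r * 2], C[r * 2 + 1]);  // 1 4 / 2 5 / 3 6
      return 0;
    }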
-#else
-
-// template <>
-// void Amax<float, lang::Cpp>(const Tensor* in, size_t *out,
-//                             Context *ctx) {
-//   size_t maxPos = 0;
-//   float maxVal = 0;
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     if (i == 0) {
-//       maxVal = inPtr[i];
-//     } else if (inPtr[i] > maxVal) {
-//       maxVal = inPtr[i];
-//       maxPos = i;
-//     }
-//   }
-//   *out = maxPos;
-// }
-// template <>
-// void Amin<float, lang::Cpp>(const Tensor* in, size_t *out,
-//                             Context *ctx) {
-//   size_t minPos = 0;
-//   float minVal = 0;
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     if (i == 0) {
-//       minVal = inPtr[i];
-//     } else if (inPtr[i] > minVal) {
-//       minVal = inPtr[i];
-//       minPos = i;
-//     }
-//   }
-//   *out = minPos;
-// }
-
-// template <>
-// void Asum<float, lang::Cpp>(const Tensor* in, float *out,
-//                             Context *ctx) {
-//   float sum = 0;
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     sum += fabs(inPtr[i]);
-//   }
-// }
-
-// template <>
-// void Axpy<float, lang::Cpp>(const float alpha,
-//                             const Tensor* in, Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] += alpha * inPtr[i];
-//   }
-// }
-
-// template <>
-// void Scale<float, lang::Cpp>(const float x, Tensor* out,
-//                              Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     outPtr[i] *= x;
-//   }
-// }
-
-// template <>
-// void Dot<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-//                            float *out, Context *ctx) {
-//   float sum = 0;
-//   const float *in1Ptr = static_cast<const float *>(in1->data());
-//   const float *in2Ptr = static_cast<const float *>(in2->data());
-//   for (size_t i = 0; i < in->Size(); i++) {
-//     sum += in1Ptr[i] * in2Ptr[i];
-//   }
-// }
-
-// template <>
-// void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
-//                             const float alpha, const Tensor* A, const Tensor* v,
-//                             const float beta, Tensor* out, Context *ctx) {
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   const float *APtr = static_cast<const float *>(A->data());
-//   const float *vPtr = static_cast<const float *>(v->data());
-//   for (size_t r = 0; r < m; r++) {
-//     float sum = 0;
-//     for (size_t c = 0; c < n; c++) {
-//       size_t idx = trans ? c * m + r : r * n + c;
-//       sum += APtr[idx] * vPtr[c];
-//     }
-//     outPtr[r] = alpha * sum + beta * outPtr[r];
-//   }
-// }
+#else
 
 template <>
-void Amax<float, lang::Cpp>(const Tensor *in, size_t *out,
+void Amax<float, lang::Cpp>(const Tensor& in, size_t *out,
                             Context *ctx) {
   size_t maxPos = 0;
   float maxVal = 0;
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  for (size_t i = 0; i < in->Size(); i++) { //not using strided traversal
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  for (size_t i = 0; i < in.Size(); i++) { //not using strided traversal
     if (i == 0) {
       maxVal = inPtr[i];
     } else if (inPtr[i] > maxVal) {
@@ -1040,12 +602,12 @@ void Amax<float, lang::Cpp>(const Tensor *in, size_t *out,
   *out = maxPos;
 }
 template <>
-void Amin<float, lang::Cpp>(const Tensor *in, size_t *out,
+void Amin<float, lang::Cpp>(const Tensor& in, size_t *out,
                             Context *ctx) {
   size_t minPos = 0;
   float minVal = 0;
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  for (size_t i = 0; i < in->Size(); i++) { //not using strided traversal
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  for (size_t i = 0; i < in.Size(); i++) { //not using strided traversal
     if (i == 0) {
       minVal = inPtr[i];
    } else if (inPtr[i] < minVal) {
@@ -1057,24 +619,26 @@ void Amin<float, lang::Cpp>(const Tensor *in, size_t *out,
 }
 
 template <>
-void Asum<float, lang::Cpp>(const Tensor *in, float *out,
+void Asum<float, lang::Cpp>(const Tensor& in, float *out,
                             Context *ctx) {
   float sum = 0;
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  for (size_t i = 0; i < in->Size(); i++) {
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  for (size_t i = 0; i < in.Size(); i++) {
     sum += fabs(inPtr[i]); //not using strided traversal
  }
  *out = sum;
}
 
 template <>
 void Axpy<float, lang::Cpp>(const float alpha,
-                            const Tensor *in, Tensor *out, Context *ctx) {
+                            const Tensor& in, Tensor *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->block()->data());
-  vector<int> traversal_info = in->generate_traversal_info();
-  for (size_t i = 0; i < in->Size(); i++) { 
-    outPtr[i] += alpha * inPtr[traversal_info[in->shape().size()]];
-    in->traverse_next(traversal_info, i+1);
+  const float *inPtr = static_cast<const float *>(in.block()->data());
+  vector<int> traversal_info = generate_traversal_info(in);
+  vector<int> shape_multipliers = generate_shape_multipliers(in);
+
+  for (size_t i = 0; i < in.Size(); i++) { 
+    outPtr[i] += alpha * inPtr[traversal_info[in.shape().size()]];
+    traverse_next(in, shape_multipliers, traversal_info, i+1);
   }
 }
 
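The generate_traversal_info, generate_shape_multipliers and traverse_next helpers used in Axpy above (and in Dot below) are free-function versions of the traversal members removed from tensor.h in this commit. What the walk computes is simply the logical-order offsets of a strided view; a minimal standalone sketch of that idea for a transposed 2-d view, with the shape and strides hard-coded for illustration:

    #include <cstdio>
    #include <vector>

    int main() {
      // Buffer holds a 2x3 matrix row-major: {1,2,3,4,5,6}.
      // Its transposed view has shape {3,2}; reading element (r,c)
      // at offset r*1 + c*3 yields the transpose without any copy.
      const float data[6] = {1, 2, 3, 4, 5, 6};
      const std::vector<int> shape = {3, 2}, strides = {1, 3};
      for (int r = 0; r < shape[0]; r++)
        for (int c = 0; c < shape[1]; c++)
          printf("%g ", data[r * strides[0] + c * strides[1]]);
      printf("\n");  // prints 1 4 2 5 3 6
      return 0;
    }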
@@ -1088,35 +652,38 @@ void Scale<float, lang::Cpp>(const float x, Tensor *out,
 }
 
 template <>
-void Dot<float, lang::Cpp>(const Tensor *in1, const Tensor *in2,
+void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
                            float *out, Context *ctx) {
   float sum = 0;
-  // const float *in1Ptr = static_cast<const float *>(in1->data());
-  // const float *in2Ptr = static_cast<const float *>(in2->data());
-  // for (size_t i = 0; i < in->Size(); i++) {
+  // const float *in1Ptr = static_cast<const float *>(in1.data());
+  // const float *in2Ptr = static_cast<const float *>(in2.data());
+  // for (size_t i = 0; i < in.Size(); i++) {
   //   sum += in1Ptr[i] * in2Ptr[i]; 
   // }
-  const float *in1Ptr = static_cast<const float *>(in1->block()->data());
-  const float *in2Ptr = static_cast<const float *>(in2->block()->data());
-  vector<int> traversal_info_in1 = in1->generate_traversal_info();
-  vector<int> traversal_info_in2 = in2->generate_traversal_info();
-  for (size_t i = 0; i < in1->Size(); i++) {
-    sum += in1Ptr[traversal_info_in1[in1->shape().size()]] * in2Ptr[traversal_info_in2[in2->shape().size()]];
-    in1->traverse_next(traversal_info_in1, i+1);
-    in2->traverse_next(traversal_info_in2, i+1);
+  const float *in1Ptr = static_cast<const float *>(in1.block()->data());
+  const float *in2Ptr = static_cast<const float *>(in2.block()->data());
+  vector<int> traversal_info_in1 = generate_traversal_info(in1);
+  vector<int> traversal_info_in2 = generate_traversal_info(in2);
+  vector<int> shape_multipliers_in1 = generate_shape_multipliers(in1);
+  vector<int> shape_multipliers_in2 = generate_shape_multipliers(in2);
+
+  for (size_t i = 0; i < in1.Size(); i++) {
+    sum += in1Ptr[traversal_info_in1[in1.shape().size()]] * in2Ptr[traversal_info_in2[in2.shape().size()]];
+    traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i+1);
+    traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i+1);
  }
  *out = sum;
}
 
 template <>
-void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
+void GEMV<float, lang::Cpp>(const float alpha, const Tensor& A, const Tensor& v,
                             const float beta, Tensor *out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const float *APtr = static_cast<const float *>(A->block()->data());
-  const float *vPtr = static_cast<const float *>(v->block()->data());
-  bool trans = A->transpose();
-  const size_t m = A->shape(0);
-  const size_t n = A->shape(1);
+  const float *APtr = static_cast<const float *>(A.block()->data());
+  const float *vPtr = static_cast<const float *>(v.block()->data());
+  bool trans = A.transpose();
+  const size_t m = A.shape(0);
+  const size_t n = A.shape(1);
   for (size_t r = 0; r < m; r++) {
     float sum = 0;
     for (size_t c = 0; c < n; c++) {
@@ -1189,34 +756,21 @@ void SoftmaxCrossEntropyBwd<float, lang::Cpp>(bool int_target,
   }
 }
 
-// template <>
-// void RowMax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-//                               const Tensor* in, Tensor* out, Context *ctx) {
-//   const float *inPtr = static_cast<const float *>(in->data());
-//   float *outPtr = static_cast<float *>(out->mutable_data());
-//   for (size_t r = 0; r < nrow; r++) {
-//     int offset = (int)(r * ncol);
-//     float maxval = inPtr[offset];
-//     for (size_t c = 1; c < ncol; c++)
-//       maxval = (std::max)(maxval, inPtr[offset + c]);
-//     outPtr[r] = maxval;
-//   }
-// }
-
 template <>
-void RowMax<float, lang::Cpp>(const Tensor *in, Tensor *out, Context *ctx) {
-  const float *inPtr = static_cast<const float *>(in->block()->data());
+void RowMax<float, lang::Cpp>(const Tensor& in, Tensor *out, Context *ctx) {
+  const float *inPtr = static_cast<const float *>(in.block()->data());
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  const size_t nrow = in->shape()[0];
-  const size_t ncol = in->shape()[1];
-  vector<int> traversal_info = in->generate_traversal_info();
+  const size_t nrow = in.shape()[0];
+  const size_t ncol = in.shape()[1];
+  vector<int> traversal_info = generate_traversal_info(in);
+  vector<int> shape_multipliers = generate_shape_multipliers(in);
     
   for (size_t r = 0; r < nrow; r++) {
     int counter_offset = (r * ncol);
    float maxval = -FLT_MAX;
     for (size_t c = 0; c < ncol; c++){
-      maxval = (std::max)(maxval, inPtr[traversal_info[in->shape().size()]]);
-      in->traverse_next(traversal_info, counter_offset+c+1);
+      maxval = (std::max)(maxval, inPtr[traversal_info[in.shape().size()]]);
+      traverse_next(in, shape_multipliers, traversal_info, counter_offset+c+1);
     }
     outPtr[r] = maxval;
   }
@@ -1226,11 +780,11 @@ void RowMax<float, lang::Cpp>(const Tensor *in, Tensor *out, Context *ctx) {
 /*
 template <>
 void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Tensor* A, const Tensor* v, Tensor* out,
+                              const Tensor& A, const Tensor& v, Tensor* out,
                               Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
+  const float *APtr = static_cast<const float *>(A.data());
+  const float *vPtr = static_cast<const float *>(v.data());
   for (size_t r = 0; r < nrow; r++) {
     size_t offset = r * ncol;
     for (size_t c = 0; c < ncol; c++) {
@@ -1241,11 +795,11 @@ void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 
 template <>
 void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                              const Tensor* A, const Tensor* v, Tensor* out,
+                              const Tensor& A, const Tensor& v, Tensor* out,
                               Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *APtr = static_cast<const float *>(A->data());
-  const float *vPtr = static_cast<const float *>(v->data());
+  const float *APtr = static_cast<const float *>(A.data());
+  const float *vPtr = static_cast<const float *>(v.data());
   for (size_t r = 0; r < nrow; r++) {
     size_t offset = r * ncol;
     for (size_t c = 0; c < ncol; c++) {
@@ -1254,11 +808,11 @@ void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
   }
 }
 template <>
-void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Tensor* in1,
-                             const Tensor* in2, Tensor* out, Context *ctx) {
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Tensor& in1,
+                             const Tensor& in2, Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *in1Ptr = static_cast<const float *>(in1->data());
-  const float *in2Ptr = static_cast<const float *>(in2->data());
+  const float *in1Ptr = static_cast<const float *>(in1.data());
+  const float *in2Ptr = static_cast<const float *>(in2.data());
   for (size_t r = 0; r < m; r++) {
     size_t offset = r * n;
     for (size_t c = 0; c < n; c++) {
@@ -1268,9 +822,9 @@ void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Tensor* in1,
 }
 template <>
 void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Tensor* in, Tensor* out, Context *ctx) {
+                               const Tensor& in, Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
+  const float *inPtr = static_cast<const float *>(in.data());
   float *bPtr = new float[ncol];
   for (size_t r = 0; r < nrow; r++) {
     size_t offset = r * ncol;
@@ -1289,9 +843,9 @@ void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 
 template <>
 void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                                  const Tensor* in, Tensor* out, Context *ctx) {
+                                  const Tensor& in, Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
+  const float *inPtr = static_cast<const float *>(in.data());
   for (size_t c = 0; c < ncol; c++) {
     outPtr[c] = 0.f;
   }
@@ -1305,9 +859,9 @@ void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
 
 template <>
 void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-                               const Tensor* in, Tensor* out, Context *ctx) {
+                               const Tensor& in, Tensor* out, Context *ctx) {
   float *outPtr = static_cast<float *>(out->mutable_data());
-  const float *inPtr = static_cast<const float *>(in->data());
+  const float *inPtr = static_cast<const float *>(in.data());
   for (size_t r = 0; r < nrow; r++) {
     size_t offset = r * ncol;
     outPtr[r] = 0.f;


[08/10] incubator-singa git commit: Streamlining of the tensor.h file by moving the respective member functions to the cpp or cuda files. Removal of the shape_multipliers_ attribute from tensor.h. Changed input tensors to be passed by reference instead of by pointer

Posted by wa...@apache.org.
Streamlining of the tensor.h file by moving the respective member functions to the cpp or cuda files. Removal of the shape_multipliers_ attribute from tensor.h. Changed input tensors to be passed by reference instead of by pointer


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/c52e2aa3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/c52e2aa3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/c52e2aa3

Branch: refs/heads/master
Commit: c52e2aa3b5272750960ce6d3ae9f14bad1cee397
Parents: a44d2e7
Author: Vaan Ng <cm...@gmail.com>
Authored: Sun May 13 00:24:40 2018 +0800
Committer: Vaan Ng <cm...@gmail.com>
Committed: Sun May 13 00:24:40 2018 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h        |  152 +----
 src/core/tensor/tensor.cc          |   60 +-
 src/core/tensor/tensor_math.h      |  124 ++--
 src/core/tensor/tensor_math_cpp.h  | 1012 +++++++++----------------------
 src/core/tensor/tensor_math_cuda.h |  499 ++++++++-------
 5 files changed, 647 insertions(+), 1200 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c52e2aa3/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index b94a982..e25aafd 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -22,7 +22,6 @@
 #include <vector>
 #include <tuple>
 #include <memory>
-#include <algorithm>
 
 #include "singa/core/common.h"
 #include "singa/core/device.h"
@@ -31,7 +30,6 @@
 
 using std::vector;
 using std::tuple;
-using std::reverse;
 namespace singa {
 
 typedef vector<size_t> Shape;
@@ -104,43 +102,6 @@ class Tensor {
     return shape_.at(idx);
   }
 
-  /*  
-  cudnn requires tensor dimensions to fulfill 1 requirement:
-    1.) Dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors 
-        if input tensor is 5d, cudnn will take a 5d tensor as input. Beyond 5d, certain operations are not supported.
-        (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
-
-    for e.g. Tensor A has shape {3,3}, cudnn requires shape of {1,1,3,3} to be the input
-             Tensor B has shape (2,3,4), cudnn requires shape of {1,2,3,4} to be the input
-  */
-  vector<int> generate_shape_cuda() const {
-    vector<int> shape_arr;
-    if(shape_.size() <= 4){
-      for (size_t n=0; n<4-shape_.size(); ++n) {
-        shape_arr.push_back(1);
-      } 
-      for (size_t n=0; n<shape_.size(); ++n) {
-        shape_arr.push_back(shape_.at(n));
-      } 
-      return shape_arr;
-    } else if(shape_.size() == 5){
-      for (size_t n=0; n<shape_.size(); ++n) {
-        shape_arr.push_back(shape_.at(n));
-      } 
-      return shape_arr;
-    } else {
-      LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
-    }
-  }
-
-  int generate_dim_cuda() const {
-    if(shape_.size() <= 4){return 4;}
-    else if(shape_.size() == 5){return 5;}
-    else{
-      LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
-    } 
-  }
-
   size_t nDim() const { return shape_.size(); }
 
   bool empty() const { return nDim() == 0; }
@@ -150,40 +111,6 @@ class Tensor {
 
   const vector<int>& strides() const { return strides_; }
 
-  /*  
-  cudnn requires stride dimensions to conform to the format of the shape input as well
-    1.) Stride dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors
-        If input tensor is 5d, cudnn will take a 5d tensor as input. Beyond 5d, certain operations are not supported.
-        (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
-
-    for e.g. Tensor A has shape {3,3}, stride {3,1}, cudnn requires shape {1,1,3,3} and stride {9, 9, 3, 1} or {9, 9, 1, 3} to be the inputs
-  */
-  vector<int> generate_strides_cuda() const {
-    vector<int> strides_arr;
-    int product = 1;
-    for (size_t n=0; n<(shape_.size()); ++n) {
-      product *= shape_[n];
-    }
-    if(shape_.size() <= 4){
-      for (size_t n=0; n<4-shape_.size(); ++n) {
-        strides_arr.push_back(product);
-      } 
-      for (size_t n=0; n<strides_.size(); ++n) {
-          strides_arr.push_back(strides_[n]);
-        }
-      return strides_arr;
-    } else if(shape_.size() == 5){
-      for (size_t n=0; n<strides_.size(); ++n) {
-          strides_arr.push_back(strides_[n]);
-        }
-      return strides_arr;
-    } else {
-      LOG(FATAL) << "Dimensions (strides) beyond 5 are currently not supported" ;
-    }
-  }
-
-  const vector<int>& shape_multipliers() const { return shape_multipliers_; }
-
   /// return true if the content of the tensor is initialized
   bool initailized() const {
     return block_ != nullptr && block_->initialized();
@@ -292,7 +219,7 @@ class Tensor {
   float L2() const;
 
   //generate strides automatically if stride field is not passed
-void Generate_Strides(){
+void generate_strides(){
     if(shape_.size()==0){
       strides_ = {1};
       return void();
@@ -306,84 +233,11 @@ void Generate_Strides(){
     }
 };
 
-void Set_Strides(const vector<int> new_strides){
+void set_strides(const vector<int> new_strides){
   strides_ = new_strides;
 }
 
-//generate shape multipliers
-//for e.g. tensor of shape (3,3), stride (1,3) will have shape multipliers of (3,1)
-//for e.g. tensor of shape (3,3), stride (3,1) will also have shape multipliers of (3,1)
-//this means that the 3rd, 6th, and 9th indices of the array will always be the starting elements of their respective rows
-//so we need to use the inner stride when jumping from 1st->2nd element, and outer stride when jumping from 2nd->3rd
-vector<int> Generate_Shape_Multipliers(Shape y_shape) const {
-    if(y_shape.size()==0){
-      return {1};
-    }
-    reverse(y_shape.begin(), y_shape.end());
-    vector<int> shape_multipliers = {};
-    int cumulative_product = 1;
-
-    shape_multipliers.push_back(1);
-    for (size_t n=0; n<(y_shape.size()-1); ++n) {
-        cumulative_product = cumulative_product*y_shape[n];
-        shape_multipliers.push_back(cumulative_product);
-    }
-    reverse(shape_multipliers.begin(), shape_multipliers.end());
-    return shape_multipliers;
-};
-
-// ******************************************************************************************
-// Some traversal operations (works on const declarations without modifying tensor variables)
-// ******************************************************************************************
-
-//generate a traversal_info vector based on the tensor's shape for the traverse_next function to work
-vector<int> generate_traversal_info() const {
-    vector<int> traversal_info = {};
-    for(size_t n=0; n<(shape_.size()+2); ++n) {
-      traversal_info.push_back(0);
-    }
-    return traversal_info;
-};
-
-//this function checks whether the next index falls on a special multiplier of the outer shape
-//so the algorithm knows when to jump over/back to a starting element of the outer shape
-//for e.g. in [[1,4,7], [2,5,8], [3,6,9]], elements 1,2,3 are the starting elements of their respective rows
-//this additional check only has 1 loop for 2d matrix
-//but runtime performance might degrade to O(nlog(n)) for higher dimensional tensors
-int determine_order(int counter) const {
-    for (size_t n=0; n<(shape_multipliers_.size()-1); ++n) {
-        if((counter%shape_multipliers_[n])==0){
-            return ((shape_multipliers_.size()) - 1 - n);
-        }
-    }
-    return 0;
-};
-
-//this function updates the base indexes with the current index after every single traversal step, can be generalized beyond 2d cases
-void update_base_index(std::vector<int>& traversal_info) const {
-    for (int n=0; n<(traversal_info[shape_.size()+1]+1); ++n) {
-        traversal_info[n] = traversal_info[shape_.size()];
-    }
-};
-
-//function to traverse a const strided tensor object
-//it requires an additional vector, traversal_info {0,0,0,0 ...}, comprising (shape_.size()+2) elements of 0
-//for e.g. 2d matrix:
-//index 0 and 1 store the base row and column index respectively
-//index 2 stores the current index of the traversal
-//index 3 stores the order of the traversal for e.g. if the order is 0, it means the next element can be navigated to using the innermost stride
-void traverse_next(std::vector<int>& traversal_info, int counter) const {
-    update_base_index(traversal_info);
-    traversal_info[shape_.size()+1] = determine_order(counter);
-    traversal_info[shape_.size()] = traversal_info[traversal_info[shape_.size()+1]]+strides_[strides_.size()-traversal_info[shape_.size()+1]-1];
-};
-
-// ******************************************************************************************
-// traversal operations end
-// ******************************************************************************************
-
  protected:
-  //bool transpose_ = false;
   DataType data_type_ = kFloat32;
   std::shared_ptr<Device> device_ = nullptr;
   /// Note: block_ is allocated in lazy manner to avoid frequent malloc/free.
@@ -391,8 +245,6 @@ void traverse_next(std::vector<int>& traversal_info, int counter) const {
   Block *block_ = nullptr;
   Shape shape_ = {};
   vector<int> strides_ = {};
-  vector<int> shape_multipliers_ = {};
-
 }; //end of tensor class
 
 typedef Shape::iterator ShapeIter;

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c52e2aa3/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 9067242..a4efd64 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -34,7 +34,6 @@ Tensor::~Tensor() {
 Tensor::Tensor() { 
   device_ = defaultDevice;
   strides_ = {1};
-  shape_multipliers_ = {1};
 }
 
 //non-strided constructors 
@@ -43,16 +42,14 @@ Tensor::Tensor(const Shape &shape, DataType dtype)
   size_t size = Product(shape_) * SizeOf(data_type_);
   if (size)
     block_ = device_->NewBlock((int)size);
-  Generate_Strides();
-  shape_multipliers_ = Generate_Shape_Multipliers(shape_);
+  generate_strides();
 }
 Tensor::Tensor(Shape &&shape, DataType dtype)
     : data_type_(dtype), device_(defaultDevice), shape_(shape) {
   size_t size = Product(shape_) * SizeOf(data_type_);
   if (size)
     block_ = device_->NewBlock((int)size);
-  Generate_Strides();
-  shape_multipliers_ = Generate_Shape_Multipliers(shape_);
+  generate_strides();
 }
 
 //non-strided constructors with device
@@ -62,16 +59,14 @@ Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device,
   size_t size = Product(shape_) * SizeOf(data_type_);
   if (size)
     block_ = device_->NewBlock((int)size);
-  Generate_Strides();
-  shape_multipliers_ = Generate_Shape_Multipliers(shape_);
+  generate_strides();
 }
 Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
   size_t size = Product(shape_) * SizeOf(data_type_);
   if (size)
     block_ = device_->NewBlock((int)size);
-  Generate_Strides();
-  shape_multipliers_ = Generate_Shape_Multipliers(shape_);
+  generate_strides();
 }
 
 
@@ -81,8 +76,7 @@ Tensor::Tensor(const Tensor &in)
       device_(in.device_),
       block_(in.block()),
       shape_(in.shape_),
-      strides_(in.strides_),
-      shape_multipliers_(in.shape_multipliers_) {
+      strides_(in.strides_) {
   if (block_ != nullptr)
     block_->IncRefCount();
 }
@@ -95,7 +89,6 @@ Tensor::Tensor(const Tensor &in, Shape &new_shape, vector<int> &new_strides)
       block_(in.block()),
       shape_(new_shape),
       strides_(new_strides) {
-  shape_multipliers_ = Generate_Shape_Multipliers(shape_);
   if (block_ != nullptr)
     block_->IncRefCount();
 }
@@ -105,8 +98,7 @@ Tensor::Tensor(Tensor &&in)
       data_type_(in.data_type_),
       device_(in.device_),
       shape_(std::move(in.shape_)),
-      strides_(in.strides_),
-      shape_multipliers_(in.shape_multipliers_) {
+      strides_(in.strides_) {
   block_ = in.block_;
   in.block_ = nullptr;
 }
@@ -129,7 +121,6 @@ void Tensor::ResetLike(const Tensor &in) {
   }
   shape_ = in.shape_;
   strides_ = in.strides_;
-  shape_multipliers_ = in.shape_multipliers_;
 }
 
 //if tensor is not transposed yet i.e strides == 1, then we simply change the shape and generate new default strides
@@ -146,8 +137,7 @@ void Tensor::Reshape(const Shape &shape) {
    LOG(FATAL) << "Reshape Error: Reshape called on transposed tensor. Not implemented yet.";
   }
   shape_ = shape;
-  Generate_Strides();
-  shape_multipliers_ = Generate_Shape_Multipliers(shape_);
+  generate_strides();
 }
 
 void Tensor::Reshape(Shape &&shape) {
@@ -162,8 +152,7 @@ void Tensor::Reshape(Shape &&shape) {
    LOG(FATAL) << "Reshape Error: Reshape called on transposed tensor. Not implemented yet.";
   }
   shape_ = std::move(shape);
-  Generate_Strides();
-  shape_multipliers_ = Generate_Shape_Multipliers(shape_);
+  generate_strides();
 }
 
 void Tensor::AsType(const DataType type) {
@@ -350,7 +339,6 @@ Tensor Tensor::T() const {
   t.strides_.clear();
   t.strides_.push_back(strides_[1]);
   t.strides_.push_back(strides_[0]);
-  t.shape_multipliers_ = Generate_Shape_Multipliers(t.shape_);
   t.block_ = block_;
   block_->IncRefCount();
   return t;
@@ -359,7 +347,7 @@ Tensor Tensor::T() const {
 //normal transpose without axes
 Tensor Tensor::Transpose() const {
   // if(shape_.size() != strides_.size())
-  //   Generate_Strides();
+  //   generate_strides();
 
   Tensor t;
   t.device_ = device_;
@@ -369,7 +357,6 @@ Tensor Tensor::Transpose() const {
     t.shape_.push_back(shape_[shape_.size()-n-1]);
     t.strides_.push_back(strides_[shape_.size()-n-1]);
   }
-  t.shape_multipliers_ = Generate_Shape_Multipliers(t.shape_);
   t.block_ = block_;
   block_->IncRefCount();
   return t;
@@ -382,7 +369,7 @@ Tensor Tensor::Transpose(Shape axes) const {
   //   return void();
   // }
   // if(shape_.size() != strides_.size())
-  //   Generate_Strides();
+  //   generate_strides();
 
   Tensor t;
   t.device_ = device_;
@@ -392,7 +379,6 @@ Tensor Tensor::Transpose(Shape axes) const {
     t.shape_.push_back(shape_[axes[n]]);
     t.strides_.push_back(strides_[axes[n]]);
   }
-  t.shape_multipliers_ = Generate_Shape_Multipliers(t.shape_);
   t.block_ = block_;
   block_->IncRefCount();
   return t;
@@ -564,7 +550,7 @@ float Tensor::L1() const {
   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
     device_->Exec([&nrm, this](Context *ctx) {
       DType ret = DType(0);
-      Asum<DType, Lang>(this, &ret, ctx);
+      Asum<DType, Lang>(*this, &ret, ctx);
       nrm = TypeCast<DType, float>(ret);
     }, {this->block()}, {});
   });
@@ -577,7 +563,7 @@ float Tensor::L2() const {
   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
     device_->Exec([&nrm, this](Context *ctx) {
       DType ret = DType(0);
-      Nrm2<DType, Lang>(this, &ret, ctx);
+      Nrm2<DType, Lang>(*this, &ret, ctx);
       nrm = TypeCast<DType, float>(ret);
     }, {this->block()}, {});
   });
@@ -603,7 +589,7 @@ template void Tensor::SetValue<int>(const int x);
   do {                                                                 \
     TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
       ret->device()->Exec([t, ret](Context * ctx) {                    \
-        fn<DType, Lang>(&t, ret, ctx);       \
+        fn<DType, Lang>(t, ret, ctx);       \
       }, {t.block()}, {ret->block()});                                 \
     });                                                                \
   } while (0)
@@ -632,7 +618,7 @@ GenUnaryTensorFn(Tanh);
     TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, {  \
       CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                     \
       ret->device()->Exec([lhs, rhs, ret](Context * ctx) {                  \
-        fn<DType, Lang>(&lhs, &rhs, ret, \
+        fn<DType, Lang>(lhs, rhs, ret, \
                         ctx);                                               \
       }, {lhs.block(), rhs.block()}, {ret->block()});                       \
     });                                                                     \
@@ -663,7 +649,7 @@ GenBinaryTensorFn(operator>=, GE);
       static_assert(std::is_same<SType, DType>::value,                  \
                     "The Scalar type must match the Tensor data type"); \
       ret->device()->Exec([t, x, ret](Context * ctx) {                  \
-        fn<DType, Lang>(&t, x, ret, ctx);     \
+        fn<DType, Lang>(t, x, ret, ctx);     \
       }, {t.block()}, {ret->block()});                                  \
     });                                                                 \
   } while (0)
@@ -706,7 +692,7 @@ void Div(const SType alpha, const Tensor &in, Tensor *out) {
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
     // TODO(wangwei) type cast SType to DType;
     in.device()->Exec([alpha, in, out](Context *ctx) {
-      Div<DType, Lang>(alpha, &in, out, ctx);
+      Div<DType, Lang>(alpha, in, out, ctx);
     }, {in.block()}, {out->block()});
   });
 }
@@ -743,7 +729,7 @@ float Sum<float>(const Tensor &in) {
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
     one.device()->Exec([in, one, &s](Context *ctx) {
       DType ret = DType(0);
-      Dot<DType, Lang>(&in, &one, &ret, ctx);
+      Dot<DType, Lang>(in, one, &ret, ctx);
       s = ret;
     }, {in.block(), one.block()}, {});
   });
@@ -776,7 +762,7 @@ Tensor RowMax(const Tensor &in) {
       //size_t nrow = 1;
       //if (in.nDim() > 1) nrow = in.shape(0);
       //size_t ncol = in.Size() / nrow;
-      RowMax<DType, Lang>(&in, &ret, ctx);
+      RowMax<DType, Lang>(in, &ret, ctx);
     }, {in.block()}, {ret.block()});
   });
   return ret;
@@ -1012,7 +998,7 @@ void MultColumn(const Tensor &v, Tensor *M) {
   CheckDataTypeAndLang(*M, v);
   TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
     v.device()->Exec([M, v](Context *ctx) {
-      DGMM<DType, Lang>(false, M, &v,
+      DGMM<DType, Lang>(false, *M, v,
                         M, ctx);
     }, {M->block(), v.block()}, {M->block()});
   });
@@ -1027,7 +1013,7 @@ void MultRow(const Tensor &v, Tensor *M) {
   CheckDataTypeAndLang(*M, v);
   TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
     v.device()->Exec([M, v](Context *ctx) {
-      DGMM<DType, Lang>(true, M, &v,
+      DGMM<DType, Lang>(true, *M, v,
                         M, ctx);
     }, {M->block(), v.block()}, {M->block()});
   });
@@ -1113,7 +1099,7 @@ void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
     auto a = TypeCast<SType, DType>(alpha);
     out->device()->Exec([a, in, out](Context *ctx) {
-      Axpy<DType, Lang>(a, &in, out, ctx);
+      Axpy<DType, Lang>(a, in, out, ctx);
     }, {in.block(), out->block()}, {out->block()});
   });
 }
@@ -1143,7 +1129,7 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
       auto a = TypeCast<SType, DType>(alpha);
       auto b = TypeCast<SType, DType>(beta);
       C->device()->Exec([a, A, b, B, C](Context *ctx) {
-        GEMV<DType, Lang>(a, &A, &B, b, C, ctx);
+        GEMV<DType, Lang>(a, A, B, b, C, ctx);
       }, {A.block(), B.block()}, {C->block()});
     });
   } else {
@@ -1152,7 +1138,7 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
       auto a = TypeCast<SType, DType>(alpha);
       auto b = TypeCast<SType, DType>(beta);
       C->device()->Exec([a, A, b, B, C](Context *ctx) {
-        GEMM<DType, Lang>(a, &A, &B, b, C,
+        GEMM<DType, Lang>(a, A, B, b, C,
                           ctx);
       }, {A.block(), B.block()}, {C->block()});
     });

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c52e2aa3/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index c403f30..c7fdfe5 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -40,7 +40,7 @@ namespace singa {
 /// 4. Function argument names, use 'num' for total number of elements in
 ///    elementwise operations; use 'in1' 'in2' for input Tensors; use 'out' for
 ///    output Tensor or value. With exceptions for some functions, e.g.,
-///      Scale(const float alpha, const Tensor* in, Tensor* out);
+///      Scale(const float alpha, const Tensor &in, Tensor* out);
 ///    For such cases, use x, v, alpha, etc for scalar types.
 ///    For blas functions, follow the blas style for argument names.
 ///    Use 'M' and 'v' for matrix and vector tensors in functions involving both
@@ -50,37 +50,6 @@ namespace singa {
 /// 7. Use size_t for the number of elements, rows or columns.
 /// 8. Use the same name for the Tensor and Tensor level math functions.
 
-// template <typename DType>
-// void TraverseUnary(const Tensor* in, Tensor* out, std::function<DType(DType)> func){}
-
-// template <typename DType>
-// void TraverseBinary(const Tensor* in1, const Tensor* in2, Tensor* out, std::function<DType(DType, DType)> func){}
-
-template <typename DType>
-void TraverseUnary(const Tensor* in, Tensor* out, std::function<DType(DType)> func){
-  DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
-  const DType *inPtr = static_cast<const DType *>(in->block()->data());
-  vector<int> traversal_info = in->generate_traversal_info();
-  for (size_t i = 0; i < in->Size(); i++) { 
-    outPtr[i] = func(inPtr[traversal_info[in->shape().size()]]);
-    in->traverse_next(traversal_info, i+1);
-  }
-}
-
-template <typename DType>
-void TraverseBinary(const Tensor* in1, const Tensor* in2, Tensor* out, std::function<DType(DType, DType)> func){
-  DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
-  const DType *in1Ptr = static_cast<const DType *>(in1->block()->data());
-  const DType *in2Ptr = static_cast<const DType *>(in2->block()->data());
-  vector<int> traversal_info_in1 = in1->generate_traversal_info();
-  vector<int> traversal_info_in2 = in2->generate_traversal_info();
-  for (size_t i = 0; i < in1->Size(); i++) {
-    outPtr[i] = func(in1Ptr[traversal_info_in1[in1->shape().size()]], in2Ptr[traversal_info_in2[in2->shape().size()]]);
-    in1->traverse_next(traversal_info_in1, i+1);
-    in2->traverse_next(traversal_info_in2, i+1);
-  }
-}
-
 
 // **************************************
 // Element-wise functions
@@ -88,41 +57,41 @@ void TraverseBinary(const Tensor* in1, const Tensor* in2, Tensor* out, std::func
 
 /// out[i] = |in[i]|
 template <typename DType, typename Lang>
-void Abs(const Tensor *in, Tensor *out, Context *ctx) {
+void Abs(const Tensor &in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Abs Not Implemented";
 }
 
 /// out[i] = in[i] + x
 template <typename DType, typename Lang>
-void Add(const Tensor *in, const DType x, Tensor *out,
+void Add(const Tensor &in, const DType x, Tensor *out,
          Context *ctx) {
   LOG(FATAL) << "Add Not Implemented";
 }
 
 /// out[i] = in1[i] + in2[i]
 template <typename DType, typename Lang>
-void Add(const Tensor *in1, const Tensor *in2, Tensor *out,
+void Add(const Tensor &in1, const Tensor &in2, Tensor *out,
          Context *ctx) {
   LOG(FATAL) << "Add-Pair Not Implemented";
 }
 /// Clamp every element into [low, high]
 /// if in[i]>high, then out[i]=high; if in[i]<low, then out[i]=low.
 template <typename DType, typename Lang>
-void Clamp(const DType low, const DType high, const Tensor *in,
+void Clamp(const DType low, const DType high, const Tensor &in,
            Tensor *out, Context *ctx) {
   LOG(FATAL) << "Clamp Not Implemented";
 }
 
 /// out[i] = x / in[i]
 template <typename DType, typename Lang>
-void Div(const DType x, const Tensor *in, Tensor *out,
+void Div(const DType x, const Tensor &in, Tensor *out,
          Context *ctx) {
   LOG(FATAL) << "Div Not Implemented";
 }
 
 /// out[i] = in[i] / x
 template <typename DType, typename Lang>
-void Div(const Tensor *in, const DType x, Tensor *out,
+void Div(const Tensor &in, const DType x, Tensor *out,
          Context *ctx) {
   CHECK_NE(x, 0.f);
   EltwiseMult<DType, Lang>(in, DType(1) / x, out, ctx);
@@ -130,101 +99,101 @@ void Div(const Tensor *in, const DType x, Tensor *out,
 
 /// out[i] = in1[i] / in2[i]
 template <typename DType, typename Lang>
-void Div(const Tensor *in1, const Tensor *in2, Tensor *out,
+void Div(const Tensor &in1, const Tensor &in2, Tensor *out,
          Context *ctx) {
   LOG(FATAL) << "Div-Pair Not Implemented";
 }
 
 /// out[i] = in[i] * x
 template <typename DType, typename Lang>
-void EltwiseMult(const Tensor *in, const DType x, Tensor *out,
+void EltwiseMult(const Tensor &in, const DType x, Tensor *out,
                  Context *ctx) {
   LOG(FATAL) << "EltwiseMult Not Implemented";
 }
 
 /// out[i] = in1[i] * in2[i]
 template <typename DType, typename Lang>
-void EltwiseMult(const Tensor *in1, const Tensor *in2, Tensor *out,
+void EltwiseMult(const Tensor &in1, const Tensor &in2, Tensor *out,
                  Context *ctx) {
   LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
 }
 
 /// Base is e (Napier's constant). out[i]=exp(in[i])
 template <typename DType, typename Lang>
-void Exp(const Tensor *in, Tensor *out, Context *ctx) {
+void Exp(const Tensor &in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Exp Not Implemented";
 }
 
 /// out[i]=(in[i]<=x)?1.f:0.f
 template <typename DType, typename Lang>
-void LE(const Tensor *in, const DType x, Tensor *out,
+void LE(const Tensor &in, const DType x, Tensor *out,
         Context *ctx) {
   LOG(FATAL) << "LE Not Implemented";
 }
 /// out[i]=(in1[i]<=in2[i])?1.f:0.f
 template <typename DType, typename Lang>
-void LE(const Tensor *in1, const Tensor *in2, Tensor *out,
+void LE(const Tensor &in1, const Tensor &in2, Tensor *out,
         Context *ctx) {
   LOG(FATAL) << "Tensor-Tensor LE Not Implemented";
 }
 /// Natural logarithm, base e (Napier's constant): out[i]=log(in[i]).
 template <typename DType, typename Lang>
-void Log(const Tensor *in, Tensor *out, Context *ctx) {
+void Log(const Tensor &in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Log Not Implemented";
 }
 /// out[i]=(in[i]<x)?1.f:0.f
 template <typename DType, typename Lang>
-void LT(const Tensor *in, const DType x, Tensor *out,
+void LT(const Tensor &in, const DType x, Tensor *out,
         Context *ctx) {
   LOG(FATAL) << "LT Not Implemented";
 }
 /// out[i]=(in1[i]<in2[i])?1.f:0.f
 template <typename DType, typename Lang>
-void LT(const Tensor *in1, const Tensor *in2, Tensor *out,
+void LT(const Tensor &in1, const Tensor &in2, Tensor *out,
         Context *ctx) {
   LOG(FATAL) << "Tensor-Tensor LT Not Implemented";
 }
 /// out[i]=(in[i]>=x)?1.f:0.f
 template <typename DType, typename Lang>
-void GE(const Tensor *in, const DType x, Tensor *out,
+void GE(const Tensor &in, const DType x, Tensor *out,
         Context *ctx) {
   LOG(FATAL) << "GE Not Implemented";
 }
 /// out[i]=(in1[i]>=in2[i])?1.f:0.f
 template <typename DType, typename Lang>
-void GE(const Tensor *in1, const Tensor *in2, Tensor *out,
+void GE(const Tensor &in1, const Tensor &in2, Tensor *out,
         Context *ctx) {
   LOG(FATAL) << "Tensor-Tensor GE Not Implemented";
 }
 /// out[i]=(in[i]>x)?1.f:0.f
 template <typename DType, typename Lang>
-void GT(const Tensor *in, const DType x, Tensor *out,
+void GT(const Tensor &in, const DType x, Tensor *out,
         Context *ctx) {
   LOG(FATAL) << "GT Not Implemented";
 }
 /// out[i]=(in[i]>in2[i])?1.f:0.f
 template <typename DType, typename Lang>
-void GT(const Tensor *in, const Tensor *in2, Tensor *out,
+void GT(const Tensor &in, const Tensor &in2, Tensor *out,
         Context *ctx) {
   LOG(FATAL) << "Tensor-Tensor GT Not Implemented";
 }
 /// out[i] = pow(in[i], x)
 template <typename DType, typename Lang>
-void Pow(const Tensor *in, const DType x, Tensor *out,
+void Pow(const Tensor &in, const DType x, Tensor *out,
          Context *ctx) {
   LOG(FATAL) << "Pow Not Implemented";
 }
 
 /// out[i]=pow(in1[i], in2[i])
 template <typename DType, typename Lang>
-void Pow(const Tensor *in1, const Tensor *in2, Tensor *out,
+void Pow(const Tensor &in1, const Tensor &in2, Tensor *out,
          Context *ctx) {
   LOG(FATAL) << "Pow-Pair Not Implemented";
 }
 
 /// out[i]=max(0, in[i])
 template <typename DType, typename Lang>
-void ReLU(const Tensor *in, Tensor *out, Context *ctx) {
+void ReLU(const Tensor &in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "ReLU Not Implemented";
 }
 
@@ -235,50 +204,50 @@ void Set(const DType x, Tensor *out, Context *ctx) {
 }
 /// out[i]=sigmoid(in[i])
 template <typename DType, typename Lang>
-void Sigmoid(const Tensor *in, Tensor *out, Context *ctx) {
+void Sigmoid(const Tensor &in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Sigmoid Not Implemented";
 }
 
 /// out[i] = sign(in[i])
 template <typename DType, typename Lang>
-void Sign(const Tensor *in, Tensor *out, Context *ctx) {
+void Sign(const Tensor &in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Sign Not Implemented";
 }
 /// out[i]=sqrt(in[i])
 template <typename DType, typename Lang>
-void Sqrt(const Tensor *in, Tensor *out, Context *ctx) {
+void Sqrt(const Tensor &in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Sqrt Not Implemented";
 }
 
 /// out[i]=square(in[i])
 template <typename DType, typename Lang>
-void Square(const Tensor *in, Tensor *out, Context *ctx) {
+void Square(const Tensor &in, Tensor *out, Context *ctx) {
   EltwiseMult<DType, Lang>(in, in, out, ctx);
 }
 
 /// out[i] =  in[i] - x
 template <typename DType, typename Lang>
-void Sub(const Tensor *in, const DType x, Tensor *out,
+void Sub(const Tensor &in, const DType x, Tensor *out,
          Context *ctx) {
   Add<DType, Lang>(in, -x, out, ctx);
 }
 
 /// out[i] = in1[i] - in2[i]
 template <typename DType, typename Lang>
-void Sub(const Tensor *in1, const Tensor *in2, Tensor *out,
+void Sub(const Tensor &in1, const Tensor &in2, Tensor *out,
          Context *ctx) {
   LOG(FATAL) << "Sub-Pair Not Implemented";
 }
 
 /// sum all elements of in into out
 template <typename DType, typename Lang>
-void Sum(const Tensor *in, DType *out, Context *ctx) {
+void Sum(const Tensor &in, DType *out, Context *ctx) {
   LOG(FATAL) << "Sum Not Implemented";
 }
 
 /// out[i]=tanh(in[i])
 template <typename DType, typename Lang>
-void Tanh(const Tensor *in, Tensor *out, Context *ctx) {
+void Tanh(const Tensor &in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Tanh Not Implemented";
 }
 
@@ -313,31 +282,31 @@ void Uniform(const float low, const float high, Tensor *out,
 
 /// return the index of the element with the max value.
 template <typename DType, typename Lang>
-void Amax(const Tensor *in, size_t *out, Context *ctx) {
+void Amax(const Tensor &in, size_t *out, Context *ctx) {
   LOG(FATAL) << "Amax Not Implemented";
 }
 
 /// return the index of the element with the min value.
 template <typename DType, typename Lang>
-void Amin(const Tensor *in, size_t *out, Context *ctx) {
+void Amin(const Tensor &in, size_t *out, Context *ctx) {
   LOG(FATAL) << "Amin Not Implemented";
 }
 /// out = sum |x| for all x in in
 template <typename DType, typename Lang>
-void Asum(const Tensor *in, DType *out, Context *ctx) {
+void Asum(const Tensor &in, DType *out, Context *ctx) {
   LOG(FATAL) << "Asum Not Implemented";
 }
 
 /// out = alpha * in + out
 template <typename DType, typename Lang>
-void Axpy(const DType alpha, const Tensor *in, Tensor *out,
+void Axpy(const DType alpha, const Tensor &in, Tensor *out,
           Context *ctx) {
   LOG(FATAL) << "Axpy Not Implemented";
 }
 
 /// out = ||in||_2^2, i.e, L2 norm.
 template <typename DType, typename Lang>
-void Nrm2(const Tensor *in, float *out, Context *ctx) {
+void Nrm2(const Tensor &in, float *out, Context *ctx) {
   LOG(FATAL) << "Nrm2 Not Implemented";
 }
 
@@ -349,7 +318,7 @@ void Scale(const DType x, Tensor *out, Context *ctx) {
 
 /// inner product of array in1 and in2
 template <typename DType, typename Lang>
-void Dot(const Tensor *in1, const Tensor *in2, DType *out,
+void Dot(const Tensor &in1, const Tensor &in2, DType *out,
          Context *ctx) {
   LOG(FATAL) << "Dot Not Implemented";
 }
@@ -358,7 +327,7 @@ void Dot(const Tensor *in1, const Tensor *in2, DType *out,
 /// transA indicates if the internal data layout of A is transposed
 template <typename DType, typename Lang>
 void GEMV(const DType alpha,
-          const Tensor *A, const Tensor *v, const DType beta, Tensor *out,
+          const Tensor &A, const Tensor &v, const DType beta, Tensor *out,
           Context *ctx) {
   LOG(FATAL) << "GEMV Not Implemented";
 }
@@ -367,7 +336,7 @@ void GEMV(const DType alpha,
 /// if matrix_left_side is true, do M*v; else do v*M
 template <typename DType, typename Lang>
 void DGMM(const bool side_right,
-  const Tensor *M, const Tensor *v, Tensor *out, Context *ctx) {
+  const Tensor &M, const Tensor &v, Tensor *out, Context *ctx) {
   LOG(FATAL) << "DGMM Not Implemented";
 }
 
@@ -375,7 +344,7 @@ void DGMM(const bool side_right,
 /// transA indicates if the internal data layout of A is transposed
 template <typename DType, typename Lang>
 void GEMM(const DType alpha,
-          const Tensor *A, const Tensor *B, const DType beta, Tensor *C,
+          const Tensor &A, const Tensor &B, const DType beta, Tensor *C,
           Context *ctx) {
   LOG(FATAL) << "GEMM Not Implemented";
 }
@@ -396,7 +365,7 @@ void SoftmaxCrossEntropyBwd(bool int_target, const size_t batchsize,
 }
 
 template <typename DType, typename Lang>
-void RowMax(const Tensor *in, Tensor *out, Context* ctx) {
+void RowMax(const Tensor &in, Tensor *out, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 // **************************************
@@ -405,28 +374,28 @@ void RowMax(const Tensor *in, Tensor *out, Context* ctx) {
 /*
 /// Add the vector v to every column of A as the column of out
 template <typename DType, typename Lang>
-void AddCol(const size_t nrow, const size_t ncol, const Tensor *A, const Tensor *v,
+void AddCol(const size_t nrow, const size_t ncol, const Tensor &A, const Tensor &v,
             Tensor *out, Context *ctx) {
   LOG(FATAL) << "AddCol Not Implemented";
 }
 // TODO(wangwei) unify AddRow and AddCol.
 /// Add the vector v to every row of A as the row of out
 template <typename DType, typename Lang>
-void AddRow(const size_t nrow, const size_t ncol, const Tensor *A, const Tensor *v,
+void AddRow(const size_t nrow, const size_t ncol, const Tensor &A, const Tensor &v,
             Tensor *out, Context *ctx) {
   LOG(FATAL) << "AddRow Not Implemented";
 }
 /// outer-product.
 /// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
 template <typename DType, typename Lang>
-void Outer(const size_t m, const size_t n, const Tensor *in1, const Tensor *in2,
+void Outer(const size_t m, const size_t n, const Tensor &in1, const Tensor &in2,
            Tensor *out, Context *ctx) {
   LOG(FATAL) << "Outer Not Implemented";
 }
 
 /// Sum the columns of the in matrix into a vector
 template <typename DType, typename Lang>
-void SumColumns(const size_t nrow, const size_t ncol, const Tensor *in, Tensor *out,
+void SumColumns(const size_t nrow, const size_t ncol, const Tensor &in, Tensor *out,
                 Context *ctx) {
   LOG(FATAL) << "SumColumns Not Implemented";
 }
@@ -438,10 +407,11 @@ void Set(const DType x, Tensor *out, Context *ctx) {
 // TODO(wangwei) unify SumRow and SumCol.
 /// Sum the rows of the in matrix into a vector
 template <typename DType, typename Lang>
-void SumRows(const size_t nrow, const size_t ncol, const Tensor *in, Tensor *out,
+void SumRows(const size_t nrow, const size_t ncol, const Tensor &in, Tensor *out,
              Context *ctx) {
   LOG(FATAL) << "SumRows Not Implemented";
 }
 */
+
 }  // namespace singa
 #endif  // SINGA_CORE_MATH_H_


[02/10] incubator-singa git commit: Singa-341 Added stride functionality to tensors for CPP

Posted by wa...@apache.org.
Singa-341 Added stride functionality to tensors for CPP


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/a88efa00
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/a88efa00
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/a88efa00

Branch: refs/heads/master
Commit: a88efa00c425f610c54a359e597ecaa82d41ff25
Parents: 060e7df
Author: Vaan Ng <cm...@gmail.com>
Authored: Tue Apr 17 20:09:19 2018 +0800
Committer: Vaan Ng <cm...@gmail.com>
Committed: Tue Apr 17 20:09:19 2018 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h       |  118 +++-
 src/core/tensor/tensor.cc         |  199 ++++--
 src/core/tensor/tensor_math.h     |  173 +++--
 src/core/tensor/tensor_math_cpp.h | 1199 ++++++++++++++++++++++++--------
 src/proto/core.proto              |   21 +-
 5 files changed, 1275 insertions(+), 435 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a88efa00/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 6621fa0..6eafbdf 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -22,6 +22,7 @@
 #include <vector>
 #include <tuple>
 #include <memory>
+#include <algorithm>
 
 #include "singa/core/common.h"
 #include "singa/core/device.h"
@@ -30,6 +31,7 @@
 
 using std::vector;
 using std::tuple;
+using std::reverse;
 namespace singa {
 
 typedef vector<size_t> Shape;
@@ -58,12 +60,14 @@ class Tensor {
   Tensor();
   explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
   explicit Tensor(const Shape &shape, DataType dtype = kFloat32);
+
   Tensor(Shape &&shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
-  Tensor(const Shape &shape, std::shared_ptr<Device> dev,
-         DataType dtype = kFloat32);
+  Tensor(const Shape &shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
 
   /// Copy Tensor to share the internal data.  No deep copy.
   Tensor(const Tensor &from);
+  /// Copy Tensor to share the internal data.  No deep copy. For 2 tensors sharing the same block but with different strides.
+  Tensor(const Tensor &from, Shape &new_shape, vector<int> &new_strides);
   /// Copy Tensor to share the internal data.  No deep copy.
   Tensor(Tensor &&from);
 
@@ -104,7 +108,12 @@ class Tensor {
 
   bool empty() const { return nDim() == 0; }
 
-  bool transpose() const { return transpose_; }
+  //bool transpose() const { return transpose_; }
+  bool transpose() const { return (strides_[0] != 1); }
+
+  const vector<int>& strides() const { return strides_; }
+
+  const vector<int>& shape_multipliers() const { return shape_multipliers_; }
 
   /// return true if the content of the tensor is initialized
   bool initailized() const {
@@ -171,6 +180,10 @@ class Tensor {
   /// No data copy, just set the transpose_ filed of the returned tensor.
   Tensor T() const;
 
+  Tensor Transpose() const;
+
+  Tensor Transpose(Shape axes) const;
+
   /// Copy the meta info with data block shared.
   Tensor &operator=(const Tensor &in);
 
@@ -209,15 +222,106 @@ class Tensor {
   /// Return average L2 norm
   float L2() const;
 
+  //generate strides automatically if stride field is not passed
+void Generate_Strides(){
+    if(shape_.size()==0){
+      strides_ = {1};
+      return void();
+    }
+    strides_.clear();
+    size_t dim = Size();
+    int cumulative_product = 1;
+    for (size_t n=0; n<shape_.size(); ++n) {
+        cumulative_product = cumulative_product*shape_[n];
+        strides_.push_back(dim/cumulative_product);
+    }
+    reverse(strides_.begin(), strides_.end());
+};
+
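Note that Generate_Strides stores strides innermost-first in this commit, so a contiguous tensor always has strides_[0] == 1, which is exactly what the transpose() check above relies on. A quick standalone check of the rule for shape {2,3,4} (illustrative only, not part of the patch):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> shape = {2, 3, 4}, strides;
      int dim = 24, cum = 1;              // dim == Size()
      for (size_t n = 0; n < shape.size(); n++) {
        cum *= shape[n];
        strides.push_back(dim / cum);     // pushes 12, 4, 1
      }
      std::reverse(strides.begin(), strides.end());
      for (int s : strides) printf("%d ", s);  // prints 1 4 12
      return 0;
    }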
+//generate shape multipliers
+//for e.g. tensor of shape (3,3), stride (1,3) will have shape multipliers of (3,1)
+//for e.g. tensor of shape (3,3), stride (3,1) will also have shape multipliers of (3,1)
+//this means that the 3rd, 6th, and 9th indices of the array will always be the starting elements of their respective rows
+//so we need to use the inner stride when jumping from 1st->2nd element, and outer stride when jumping from 2nd->3rd
+vector<int> Generate_Shape_Multipliers(Shape y_shape) const {
+    if(y_shape.size()==0){
+      return {1};
+    }
+    reverse(y_shape.begin(), y_shape.end());
+    vector<int> shape_multipliers = {};
+    int cumulative_product = 1;
+
+    shape_multipliers.push_back(1);
+    for (size_t n=0; n<(y_shape.size()-1); ++n) {
+        cumulative_product = cumulative_product*y_shape[n];
+        shape_multipliers.push_back(cumulative_product);
+    }
+    reverse(shape_multipliers.begin(), shape_multipliers.end());
+    return shape_multipliers;
+};
+
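Concretely, the shape multipliers are the row-major strides of the shape itself, independent of how the data is actually strided: for shape {2,3,4} they are {12,4,1}, so counters 4 and 8 mark the start of a new row and counter 12 the start of a new plane. A standalone check of the rule (illustrative only):

    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> shape = {2, 3, 4}, mult(shape.size(), 1);
      // multiplier of axis n is the product of all more-inner extents
      for (int n = (int)shape.size() - 2; n >= 0; n--)
        mult[n] = mult[n + 1] * shape[n + 1];
      printf("%d %d %d\n", mult[0], mult[1], mult[2]);  // 12 4 1
      return 0;
    }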
+// ******************************************************************************************
+// Some traversal operations (works on const declarations without modifying tensor variables)
+// ******************************************************************************************
+
+//generate a traversal_info vector based on the tensor's shape for the traverse_next function to work
+vector<int> generate_traversal_info() const {
+    vector<int> traversal_info = {};
+    for(size_t n=0; n<(shape_.size()+2); ++n) {
+      traversal_info.push_back(0);
+    }
+    return traversal_info;
+};
+
+//this function checks whether the next index falls on a multiple of an outer-shape multiplier,
+//so the algorithm knows when to jump back to the starting element of the next outer slice
+//e.g., in [[1,4,7], [2,5,8], [3,6,9]], elements 1,2,3 are the starting elements of their respective rows
+//this additional check is a single loop iteration for a 2d matrix,
+//but runtime may degrade to O(n log n) for higher dimensional tensors
+int determine_order(int counter) const {
+    for (size_t n=0; n<(shape_multipliers_.size()-1); ++n) {
+        if((counter%shape_multipliers_[n])==0){
+            return ((shape_multipliers_.size()) - 1 - n);
+        }
+    }
+    return 0;
+};
+
+//this function updates the base indexes with the current index after every traversal step; it can be generalized beyond 2d cases
+void update_base_index(std::vector<int>& traversal_info) const {
+    for (int n=0; n<(traversal_info[shape_.size()+1]+1); ++n) {
+        traversal_info[n] = traversal_info[shape_.size()];
+    }
+};
+
+//function to traverse a const strided tensor object
+//it requires an additional vector, traversal_info {0,0,0,0 ...}, comprising (shape_.size()+2) elements of 0
+//e.g., for a 2d matrix:
+//index 0 and 1 store the base row and column index respectively
+//index 2 stores the current index of the traversal
+//index 3 stores the order of the traversal; e.g., if the order is 0, the next element can be reached using the innermost stride
+//(a standalone walkthrough of this traversal follows this file's diff)
+void traverse_next(std::vector<int>& traversal_info, int counter) const {
+    update_base_index(traversal_info);
+    traversal_info[shape_.size()+1] = determine_order(counter);
+    traversal_info[shape_.size()] = traversal_info[traversal_info[shape_.size()+1]]+strides_[traversal_info[shape_.size()+1]];
+};
+
+// ******************************************************************************************
+// traversal operations end
+// ******************************************************************************************
+
  protected:
-  bool transpose_ = false;
+  //bool transpose_ = false;
   DataType data_type_ = kFloat32;
   std::shared_ptr<Device> device_ = nullptr;
   /// Note: block_ is allocated in lazy manner to avoid frequent malloc/free.
   /// If you want to get an allocated Block, use block() instead of block_.
   Block *block_ = nullptr;
   Shape shape_ = {};
-};
+  vector<int> strides_ = {};
+  vector<int> shape_multipliers_ = {};
+
+}; //end of tensor class
 
 typedef Shape::iterator ShapeIter;
 inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
@@ -452,12 +556,16 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
 /// each instance, t[i] could be 2 or [0, 0, 1]. If one instance could have
 /// multiple labels, then t[i] could be [1, 0, 1].
 /// The loss is computed into p.
+
 void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss);
+
 /// Compute the dx, given prediction probability 'p' (p=softmax(x)) and
 /// the target (ground truth) labels 't'. 'p' and 't' are either 1-d vector
 /// or 2-d matrix. 'grad' has the same shape as 'p'. dx is computed into p.
+
 void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p);
 
+
 /// Return a tensor consisting of rows ([start, end)) from 'in'. It copies the
 /// values from 'in'. 'in' is a 2D Tensor.
 Tensor CopyRows(const Tensor &in, const size_t start, const size_t end);
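
The stride and traversal machinery added above is easiest to check end-to-end
in isolation. The following standalone sketch (illustrative only, not part of
the commit) hard-codes the values Generate_Strides() and
Generate_Shape_Multipliers() produce for the transpose of a 2x3 row-major
tensor, then replays the traverse_next() logic to recover the flat offsets in
logical order. Note that this commit stores strides innermost-first
(strides_[0] is the innermost stride), which the sketch follows.

#include <iostream>
#include <vector>

int main() {
  // Transposed view of a 2x3 row-major buffer: shape {3,2},
  // innermost-first strides {3,1}, shape multipliers {2,1}
  // (a new row starts whenever the flat counter is a multiple of 2).
  std::vector<int> shape = {3, 2};
  std::vector<int> strides = {3, 1};
  std::vector<int> multipliers = {2, 1};

  // traversal_info for a 2-d tensor: [base0, base1, current_index, order]
  std::vector<int> ti(shape.size() + 2, 0);

  auto determine_order = [&](int counter) {
    for (size_t n = 0; n + 1 < multipliers.size(); ++n)
      if (counter % multipliers[n] == 0)
        return static_cast<int>(multipliers.size() - 1 - n);
    return 0;
  };

  for (int i = 0; i < shape[0] * shape[1]; ++i) {
    std::cout << ti[shape.size()] << " ";            // flat offset of element i
    for (int n = 0; n <= ti[shape.size() + 1]; ++n)  // update_base_index
      ti[n] = ti[shape.size()];
    ti[shape.size() + 1] = determine_order(i + 1);   // which stride to use next
    ti[shape.size()] = ti[ti[shape.size() + 1]] + strides[ti[shape.size() + 1]];
  }
  std::cout << std::endl;                            // prints: 0 3 1 4 2 5
}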

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a88efa00/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index ed4da96..48751ef 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -21,6 +21,7 @@
 #include "./tensor_math_cuda.h"
 #include "./tensor_math_opencl.h"
 #include <utility>
+#include <iostream>
 
 namespace singa {
 
@@ -30,52 +31,87 @@ Tensor::~Tensor() {
   block_ = nullptr;
 }
 
-Tensor::Tensor() { device_ = defaultDevice; }
+Tensor::Tensor() { 
+  device_ = defaultDevice;
+  strides_ = {1};
+  shape_multipliers_ = {1};
+}
 
+//non-strided constructors 
 Tensor::Tensor(const Shape &shape, DataType dtype)
     : data_type_(dtype), device_(defaultDevice), shape_(shape) {
   size_t size = Product(shape_) * SizeOf(data_type_);
   if (size)
     block_ = device_->NewBlock((int)size);
+  Generate_Strides();
+  shape_multipliers_ = Generate_Shape_Multipliers(shape_);
 }
 Tensor::Tensor(Shape &&shape, DataType dtype)
     : data_type_(dtype), device_(defaultDevice), shape_(shape) {
   size_t size = Product(shape_) * SizeOf(data_type_);
   if (size)
     block_ = device_->NewBlock((int)size);
+  Generate_Strides();
+  shape_multipliers_ = Generate_Shape_Multipliers(shape_);
 }
+
+//non-strided constructors with device
 Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device,
                DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
   size_t size = Product(shape_) * SizeOf(data_type_);
   if (size)
     block_ = device_->NewBlock((int)size);
+  Generate_Strides();
+  shape_multipliers_ = Generate_Shape_Multipliers(shape_);
 }
 Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype)
     : data_type_(dtype), device_(device), shape_(shape) {
   size_t size = Product(shape_) * SizeOf(data_type_);
   if (size)
     block_ = device_->NewBlock((int)size);
+  Generate_Strides();
+  shape_multipliers_ = Generate_Shape_Multipliers(shape_);
 }
+
+
 Tensor::Tensor(const Tensor &in)
-    : transpose_(in.transpose_),
+    : //transpose_(in.transpose_),
+      data_type_(in.data_type_),
+      device_(in.device_),
+      block_(in.block()),
+      shape_(in.shape_),
+      strides_(in.strides_),
+      shape_multipliers_(in.shape_multipliers_) {
+  if (block_ != nullptr)
+    block_->IncRefCount();
+}
+
+//strided constructor taking in a tensor, shape and strides
+Tensor::Tensor(const Tensor &in, Shape &new_shape, vector<int> &new_strides)
+    : //transpose_(in.transpose_),
       data_type_(in.data_type_),
       device_(in.device_),
       block_(in.block()),
-      shape_(in.shape_) {
+      shape_(new_shape),
+      strides_(new_strides) {
+  shape_multipliers_ = Generate_Shape_Multipliers(shape_);
   if (block_ != nullptr)
     block_->IncRefCount();
 }
 
 Tensor::Tensor(Tensor &&in)
-    : transpose_(in.transpose_),
+    : //transpose_(in.transpose_),
       data_type_(in.data_type_),
       device_(in.device_),
-      shape_(std::move(in.shape_)) {
+      shape_(std::move(in.shape_)),
+      strides_(in.strides_),
+      shape_multipliers_(in.shape_multipliers_) {
   block_ = in.block_;
   in.block_ = nullptr;
 }
 
+
 void Tensor::SetBlock(Block *block) {
   LOG(WARNING) << "Pls avoid using this function, which may have side-effect.";
   if (block_ != nullptr)
@@ -92,24 +128,46 @@ void Tensor::ResetLike(const Tensor &in) {
     block_ = device_->NewBlock((int)in.MemSize());
   }
   shape_ = in.shape_;
+  strides_ = in.strides_;
+  shape_multipliers_ = in.shape_multipliers_;
 }
 
+//yisen todo
+//if the tensor is not transposed yet, i.e. strides == 1, we simply change the shape and generate new default strides
+//if the tensor is already transposed, i.e. strides != 1, it should be copied to a new tensor with newly generated default strides
+
 void Tensor::Reshape(const Shape &shape) {
+  if(strides_.size()==0)
+    strides_.push_back(1);
+
   if (Product(shape_) != Product(shape)) {
     if (block_ != nullptr && block_->DecRefCount() == 0)
       device_->FreeBlock(block_);
     block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
+  } else if (strides_[0] != 1) {
+    std::cout << "Reshape Error: Transposed tensor must return a new tensor. Not implemented yet." << std::endl;
+    return;
   }
   shape_ = shape;
+  Generate_Strides();
+  shape_multipliers_ = Generate_Shape_Multipliers(shape_);
 }
 
 void Tensor::Reshape(Shape &&shape) {
+  if(strides_.size()==0)
+    strides_.push_back(1);
+
   if (Product(shape_) != Product(shape)) {
     if (block_ != nullptr && block_->DecRefCount() == 0)
       device_->FreeBlock(block_);
     block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
+  } else if (strides_[0] != 1) {
+    std::cout << "Reshape Error: Transposed tensor must return a new tensor. Not implemented yet." << std::endl;
+    return;
   }
   shape_ = std::move(shape);
+  Generate_Strides();
+  shape_multipliers_ = Generate_Shape_Multipliers(shape_);
 }
 
 void Tensor::AsType(const DataType type) {
@@ -177,7 +235,9 @@ void Tensor::FromProto(const singa::TensorProto &proto) {
   for (uint32_t s : proto.shape()) shape.push_back(s);
   data_type_ = proto.data_type();
   Reshape(shape);
-  transpose_ = proto.transpose();
+  //transpose_ = proto.transpose();
+  strides_.clear();
+  for (int32_t s : proto.strides()) strides_.push_back(s);
   switch (data_type_) {
     case kFloat32: {
       std::unique_ptr<float[]> data_ptr(new float[Product(shape_)]);
@@ -226,7 +286,11 @@ void Tensor::ToProto(singa::TensorProto *proto) const {
     proto->add_shape(s);
   }
   proto->set_data_type(data_type_);
-  proto->set_transpose(transpose_);
+  //proto->set_transpose(transpose_);
+  proto->clear_strides();
+  for (auto s : strides_) {
+    proto->add_strides(s);
+  }
   switch (data_type_) {
     case kFloat32: {
       proto->clear_float_data();
@@ -272,19 +336,67 @@ void Tensor::ToProto(singa::TensorProto *proto) const {
 Tensor Tensor::Clone(std::shared_ptr<Device> device) const {
   if (device == nullptr) device = device_;
   Tensor t(shape_, device_, data_type_);
-  t.transpose_ = transpose_;
+  //t.transpose_ = transpose_;
+  t.strides_ = strides_;
   t.CopyData(*this);
   return t;
 }
 
+//yisen todo
 Tensor Tensor::T() const {
+  // this function only works for 2d tensors
   CHECK_EQ(shape_.size(), 2u);
   Tensor t;
   t.device_ = device_;
   t.data_type_ = data_type_;
-  t.transpose_ = !transpose_;
   t.shape_.push_back(shape_[1]);
   t.shape_.push_back(shape_[0]);
+  t.strides_.clear();
+  t.strides_.push_back(strides_[1]);
+  t.strides_.push_back(strides_[0]);
+  t.shape_multipliers_ = Generate_Shape_Multipliers(t.shape_);
+  t.block_ = block_;
+  block_->IncRefCount();
+  return t;
+}
+
+//normal transpose without axes
+Tensor Tensor::Transpose() const {
+  // if(shape_.size() != strides_.size())
+  //   Generate_Strides();
+
+  Tensor t;
+  t.device_ = device_;
+  t.data_type_ = data_type_;
+  t.strides_.clear();
+  for(size_t n=0; n<shape_.size(); ++n){
+    t.shape_.push_back(shape_[shape_.size()-n-1]);
+    t.strides_.push_back(strides_[shape_.size()-n-1]);
+  }
+  t.shape_multipliers_ = Generate_Shape_Multipliers(t.shape_);
+  t.block_ = block_;
+  block_->IncRefCount();
+  return t;
+}
+
+//transpose with axes
+Tensor Tensor::Transpose(Shape axes) const {
+  // if(axes.size() != shape_.size()){
+  //   std::cout << "Warning: Size of input axes doesn't match size of shape" << std::endl;
+  //   return void();
+  // }
+  // if(shape_.size() != strides_.size())
+  //   Generate_Strides();
+
+  Tensor t;
+  t.device_ = device_;
+  t.data_type_ = data_type_;
+  t.strides_.clear();
+  for(size_t n=0; n<axes.size(); ++n){
+    t.shape_.push_back(shape_[axes[n]]);
+    t.strides_.push_back(strides_[axes[n]]);
+  }
+  t.shape_multipliers_ = Generate_Shape_Multipliers(t.shape_);
   t.block_ = block_;
   block_->IncRefCount();
   return t;
@@ -294,7 +406,8 @@ Tensor &Tensor::operator=(const Tensor &in) {
   // LOG(ERROR) << "= const &";
   if (block_ != nullptr && block_->DecRefCount() == 0)
     device_->FreeBlock(block_);
-  transpose_ = in.transpose_;
+  //transpose_ = in.transpose_;
+  strides_ = in.strides_;
   data_type_ = in.data_type_;
   shape_ = in.shape_;
   device_ = in.device_;
@@ -308,7 +421,8 @@ Tensor &Tensor::operator=(Tensor &&in) {
   // LOG(ERROR) << "= &&";
   if (block_ != nullptr && block_->DecRefCount() == 0)
     device_->FreeBlock(block_);
-  transpose_ = in.transpose_;
+  //transpose_ = in.transpose_;
+  strides_ = in.strides_;
   data_type_ = in.data_type_;
   shape_ = std::move(in.shape_);
   device_ = in.device_;
@@ -317,6 +431,7 @@ Tensor &Tensor::operator=(Tensor &&in) {
   return *this;
 }
 
+//yisen todo
 Tensor Reshape(const Tensor &in, const Shape &s) {
   Tensor out(in);
   out.Reshape(s);
@@ -373,7 +488,7 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
                               (int)s_offset);
     } else if (src_dev->lang() == kCpp) {
       dst_dev->CopyDataToFrom(to, from, nBytes, kHostToDevice, (int)d_offset,
-							  (int)s_offset);
+                (int)s_offset);
     } else {
      LOG(FATAL) << "Memory copy between Cuda and OpenCL devices is not supported";
     }
@@ -453,7 +568,7 @@ float Tensor::L1() const {
   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
     device_->Exec([&nrm, this](Context *ctx) {
       DType ret = DType(0);
-      Asum<DType, Lang>(this->Size(), this->block(), &ret, ctx);
+      Asum<DType, Lang>(this, &ret, ctx);
       nrm = TypeCast<DType, float>(ret);
     }, {this->block()}, {});
   });
@@ -466,7 +581,7 @@ float Tensor::L2() const {
   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
     device_->Exec([&nrm, this](Context *ctx) {
       DType ret = DType(0);
-      Nrm2<DType, Lang>(this->Size(), this->block(), &ret, ctx);
+      Nrm2<DType, Lang>(this, &ret, ctx);
       nrm = TypeCast<DType, float>(ret);
     }, {this->block()}, {});
   });
@@ -476,12 +591,12 @@ float Tensor::L2() const {
 template <typename SType>
 void Tensor::SetValue(const SType x) {
   CHECK_EQ(sizeof(SType), SizeOf(data_type_));
-  auto size = Size();
+  //auto size = Size();
   auto ptr = block_;
   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
     // TODO(wangwei) cast x to DType
-    device_->Exec([size, x, ptr](Context *ctx) {
-      Set<DType, Lang>(size, x, ptr, ctx);
+    device_->Exec([this, x, ptr](Context *ctx) {
+      Set<DType, Lang>(x, this, ctx);
     }, {}, {ptr});
   });
 }
@@ -492,7 +607,7 @@ template void Tensor::SetValue<int>(const int x);
   do {                                                                 \
     TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
       ret->device()->Exec([t, ret](Context * ctx) {                    \
-        fn<DType, Lang>(t.Size(), t.block(), ret->block(), ctx);       \
+        fn<DType, Lang>(&t, ret, ctx);       \
       }, {t.block()}, {ret->block()});                                 \
     });                                                                \
   } while (0)
@@ -521,7 +636,7 @@ GenUnaryTensorFn(Tanh);
     TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, {  \
       CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type()));                     \
       ret->device()->Exec([lhs, rhs, ret](Context * ctx) {                  \
-        fn<DType, Lang>(lhs.Size(), lhs.block(), rhs.block(), ret->block(), \
+        fn<DType, Lang>(&lhs, &rhs, ret, \
                         ctx);                                               \
       }, {lhs.block(), rhs.block()}, {ret->block()});                       \
     });                                                                     \
@@ -552,7 +667,7 @@ GenBinaryTensorFn(operator>=, GE);
       static_assert(std::is_same<SType, DType>::value,                  \
                     "The Scalar type must match the Tensor data type"); \
       ret->device()->Exec([t, x, ret](Context * ctx) {                  \
-        fn<DType, Lang>(t.Size(), t.block(), x, ret->block(), ctx);     \
+        fn<DType, Lang>(&t, x, ret, ctx);     \
       }, {t.block()}, {ret->block()});                                  \
     });                                                                 \
   } while (0)
@@ -595,7 +710,7 @@ void Div(const SType alpha, const Tensor &in, Tensor *out) {
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
     // TODO(wangwei) type cast SType to DType;
     in.device()->Exec([alpha, in, out](Context *ctx) {
-      Div<DType, Lang>(in.Size(), alpha, in.block(), out->block(), ctx);
+      Div<DType, Lang>(alpha, &in, out, ctx);
     }, {in.block()}, {out->block()});
   });
 }
@@ -632,7 +747,7 @@ float Sum<float>(const Tensor &in) {
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
     one.device()->Exec([in, one, &s](Context *ctx) {
       DType ret = DType(0);
-      Dot<DType, Lang>(in.Size(), in.block(), one.block(), &ret, ctx);
+      Dot<DType, Lang>(&in, &one, &ret, ctx);
       s = ret;
     }, {in.block(), one.block()}, {});
   });
@@ -661,11 +776,11 @@ Tensor SoftMax(const Tensor &in) {
 Tensor RowMax(const Tensor &in) {
   Tensor ret({in.shape(0)}, in.device(), in.data_type());
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
-    in.device()->Exec([in, ret](Context *ctx) {
-      size_t nrow = 1;
-      if (in.nDim() > 1) nrow = in.shape(0);
-      size_t ncol = in.Size() / nrow;
-      RowMax<DType, Lang>(nrow, ncol, in.block(), ret.block(), ctx);
+    in.device()->Exec([&in, &ret](Context *ctx) {
+      //size_t nrow = 1;
+      //if (in.nDim() > 1) nrow = in.shape(0);
+      //size_t ncol = in.Size() / nrow;
+      RowMax<DType, Lang>(&in, &ret, ctx);
     }, {in.block()}, {ret.block()});
   });
   return ret;
@@ -708,13 +823,13 @@ void AddColumn(const SType alpha, const SType beta, const Tensor &v,
     Tensor vmat = Reshape(v, Shape{nb_row, 1});
     Mult(alpha, vmat, one, beta, M);
   }
 }
 template
 void AddColumn(const float alpha, const float beta, const Tensor &v, Tensor *M);
 
 void AddRow(const Tensor &v, Tensor *M) { AddRow(1, 1, v, M); }
 
-/// Sub column 'v' by each column of matrix M; write results into 'out'
+/// Add row 'v' to each row of matrix M; write results into 'out'
 template <typename SType>
 void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M) {
   if (M->transpose()) {
@@ -894,30 +1009,30 @@ void DivRow(const Tensor &v, Tensor *M) {
 
 /// Multiply column 'v' and each column of matrix M; write results into 'out'
 void MultColumn(const Tensor &v, Tensor *M) {
-  CHECK(!M->transpose()) << "Not supported yet";
+  //CHECK(!M->transpose()) << "Not supported yet";
   CHECK_EQ(M->nDim(), 2u);
   // CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple
   CHECK_EQ(v.Size(), M->shape(0));
   CheckDataTypeAndLang(*M, v);
   TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
     v.device()->Exec([M, v](Context *ctx) {
-      DGMM<DType, Lang>(false, M->shape(0), M->shape(1), M->block(), v.block(),
-                        M->block(), ctx);
+      DGMM<DType, Lang>(false, M, &v,
+                        M, ctx);
     }, {M->block(), v.block()}, {M->block()});
   });
 }
 
 /// Multiply row 'v' with each row of matrix M; write results into 'out'
 void MultRow(const Tensor &v, Tensor *M) {
-  CHECK(!M->transpose()) << "Not supported yet";
+  //CHECK(!M->transpose()) << "Not supported yet";
   CHECK_EQ(M->nDim(), 2u);
   // CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple
   CHECK_EQ(v.Size(), M->shape(1));
   CheckDataTypeAndLang(*M, v);
   TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
     v.device()->Exec([M, v](Context *ctx) {
-      DGMM<DType, Lang>(true, M->shape(0), M->shape(1), M->block(), v.block(),
-                        M->block(), ctx);
+      DGMM<DType, Lang>(true, M, &v,
+                        M, ctx);
     }, {M->block(), v.block()}, {M->block()});
   });
 }
@@ -963,7 +1078,7 @@ void Bernoulli(const SType p, Tensor *out) {
   TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
     auto prob = TypeCast<SType, DType>(p);
     out->device()->Exec([prob, out](Context *ctx) {
-      Bernoulli<DType, Lang>(out->Size(), prob, out->block(), ctx);
+      Bernoulli<DType, Lang>(prob, out, ctx);
     }, {}, {out->block()}, true);
   });
 }
@@ -976,7 +1091,7 @@ void Uniform(const SType low, const SType high, Tensor *out) {
     auto l = TypeCast<SType, DType>(low);
     auto h = TypeCast<SType, DType>(high);
     out->device()->Exec([l, h, out](Context *ctx) {
-      Uniform<DType, Lang>(out->Size(), l, h, out->block(), ctx);
+      Uniform<DType, Lang>(l, h, out, ctx);
     }, {}, {out->block()}, true);
   });
 }
@@ -989,7 +1104,7 @@ void Gaussian(const SType mean, const SType std, Tensor *out) {
     auto m = TypeCast<SType, DType>(mean);
     auto s = TypeCast<SType, DType>(std);
     out->device()->Exec([m, s, out](Context *ctx) {
-      Gaussian<DType, Lang>(out->Size(), m, s, out->block(), ctx);
+      Gaussian<DType, Lang>(m, s, out, ctx);
     }, {}, {out->block()}, true);
   });
 }
@@ -1002,7 +1117,7 @@ void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
     auto a = TypeCast<SType, DType>(alpha);
     out->device()->Exec([a, in, out](Context *ctx) {
-      Axpy<DType, Lang>(in.Size(), a, in.block(), out->block(), ctx);
+      Axpy<DType, Lang>(a, &in, out, ctx);
     }, {in.block(), out->block()}, {out->block()});
   });
 }
@@ -1032,8 +1147,7 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
       auto a = TypeCast<SType, DType>(alpha);
       auto b = TypeCast<SType, DType>(beta);
       C->device()->Exec([a, A, b, B, C](Context *ctx) {
-        GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), a, A.block(),
-                          B.block(), b, C->block(), ctx);
+        GEMV<DType, Lang>(a, &A, &B, b, C, ctx);
       }, {A.block(), B.block()}, {C->block()});
     });
   } else {
@@ -1042,8 +1156,7 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
       auto a = TypeCast<SType, DType>(alpha);
       auto b = TypeCast<SType, DType>(beta);
       C->device()->Exec([a, A, b, B, C](Context *ctx) {
-        GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0), B.shape(1),
-                          A.shape(1), a, A.block(), B.block(), b, C->block(),
+        GEMM<DType, Lang>(a, &A, &B, b, C,
                           ctx);
       }, {A.block(), B.block()}, {C->block()});
     });
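
Since Transpose(Shape axes) above is new, a worked example of its permutation
rule may help. This is a standalone sketch (illustrative only, not SINGA
code), written with conventional outermost-first strides for readability:
output dimension n takes the shape and stride of input dimension axes[n], and
the data block is shared rather than copied.

#include <iostream>
#include <vector>

int main() {
  std::vector<int> shape   = {2, 3, 4};
  std::vector<int> strides = {12, 4, 1};  // row-major strides of {2,3,4}
  std::vector<int> axes    = {2, 0, 1};   // requested axis permutation

  std::vector<int> new_shape, new_strides;
  for (size_t n = 0; n < axes.size(); ++n) {
    new_shape.push_back(shape[axes[n]]);      // same rule as Transpose(axes)
    new_strides.push_back(strides[axes[n]]);
  }

  // new_shape = {4,2,3}, new_strides = {1,12,4}: the same 24 elements,
  // reinterpreted in place without touching the underlying block.
  for (int s : new_shape)   std::cout << s << " ";
  std::cout << "| ";
  for (int s : new_strides) std::cout << s << " ";
  std::cout << std::endl;
}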

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a88efa00/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index 6d42211..c403f30 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -19,7 +19,9 @@
 #define SINGA_CORE_MATH_H_
 #include <type_traits>
 #include "singa/core/common.h"
+#include "singa/core/tensor.h"
 #include "singa/utils/logging.h"
+#include <vector>
+#include <functional>
 
 namespace singa {
 
@@ -33,20 +35,52 @@ namespace singa {
 /// first
 ///    letter.
 /// 2. Order functions based on function name in alphabetical order.
-/// 3. Function arguments order is [const basic type] [const Block] [mutable
-/// Block].
+/// 3. Function arguments order is [const basic type] [const Tensor] [mutable
+/// Tensor].
 /// 4. Function argument names, use 'num' for total number of elements in
-///    elementwise operations; use 'in1' 'in2' for in blocks; use 'out' for
-///    output block or value. With exceptions for some functions, e.g.,
-///      Scale(const float alpha, const Block* in, Block* out);
+///    elementwise operations; use 'in1' 'in2' for input Tensors; use 'out' for
+///    output Tensor or value. With exceptions for some functions, e.g.,
+///      Scale(const float alpha, const Tensor* in, Tensor* out);
 ///    For such cases, use x, v, alpha, etc for scalar types.
 ///    For blas functions, follow the blas style for argument names.
 ///    Use 'M' and 'v' for matrix and vector tensors in functions involving both
 ///    matrix and vectors.
-/// 5. For Block argument xxx, name its raw pointer as xxxPtr.
+/// 5. For Tensor argument xxx, name its raw pointer as xxxPtr.
 /// 6. Pass the 'cudaStream_t s' to every function in math_kernel.h
 /// 7. Use size_t for the number of elements, rows or columns.
-/// 8. Use the same name for the Tensor and Block level math functions.
+/// 8. Use the same name for the Tensor-level and backend-level math functions.
+
+// template <typename DType>
+// void TraverseUnary(const Tensor* in, Tensor* out, std::function<DType(DType)> func){}
+
+// template <typename DType>
+// void TraverseBinary(const Tensor* in1, const Tensor* in2, Tensor* out, std::function<DType(DType, DType)> func){}
+
+template <typename DType>
+void TraverseUnary(const Tensor* in, Tensor* out, std::function<DType(DType)> func){
+  DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
+  const DType *inPtr = static_cast<const DType *>(in->block()->data());
+  vector<int> traversal_info = in->generate_traversal_info();
+  for (size_t i = 0; i < in->Size(); i++) { 
+    outPtr[i] = func(inPtr[traversal_info[in->shape().size()]]);
+    in->traverse_next(traversal_info, i+1);
+  }
+}
+
+template <typename DType>
+void TraverseBinary(const Tensor* in1, const Tensor* in2, Tensor* out, std::function<DType(DType, DType)> func){
+  DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
+  const DType *in1Ptr = static_cast<const DType *>(in1->block()->data());
+  const DType *in2Ptr = static_cast<const DType *>(in2->block()->data());
+  vector<int> traversal_info_in1 = in1->generate_traversal_info();
+  vector<int> traversal_info_in2 = in2->generate_traversal_info();
+  for (size_t i = 0; i < in1->Size(); i++) {
+    outPtr[i] = func(in1Ptr[traversal_info_in1[in1->shape().size()]], in2Ptr[traversal_info_in2[in2->shape().size()]]);
+    in1->traverse_next(traversal_info_in1, i+1);
+    in2->traverse_next(traversal_info_in2, i+1);
+  }
+}
+
 
 // **************************************
 // Element-wise functions
@@ -54,197 +88,197 @@ namespace singa {
 
 /// out[i] = |in[i]|
 template <typename DType, typename Lang>
-void Abs(const size_t num, const Block *in, Block *out, Context *ctx) {
+void Abs(const Tensor *in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Abs Not Implemented";
 }
 
 /// out[i] = in[i] + x
 template <typename DType, typename Lang>
-void Add(const size_t num, const Block *in, const DType x, Block *out,
+void Add(const Tensor *in, const DType x, Tensor *out,
          Context *ctx) {
   LOG(FATAL) << "Add Not Implemented";
 }
 
 /// out[i] = in1[i] + in2[i]
 template <typename DType, typename Lang>
-void Add(const size_t num, const Block *in1, const Block *in2, Block *out,
+void Add(const Tensor *in1, const Tensor *in2, Tensor *out,
          Context *ctx) {
   LOG(FATAL) << "Add-Pair Not Implemented";
 }
 /// Clamp every element into [low, high]
 /// if in[i]>high, then out[i]=high; if in[i]<low, then out[i]=low.
 template <typename DType, typename Lang>
-void Clamp(const size_t num, const DType low, const DType high, const Block *in,
-           Block *out, Context *ctx) {
+void Clamp(const DType low, const DType high, const Tensor *in,
+           Tensor *out, Context *ctx) {
   LOG(FATAL) << "Clamp Not Implemented";
 }
 
 /// out[i] = x / in[i]
 template <typename DType, typename Lang>
-void Div(const size_t num, const DType x, const Block *in, Block *out,
+void Div(const DType x, const Tensor *in, Tensor *out,
          Context *ctx) {
   LOG(FATAL) << "Div Not Implemented";
 }
 
 /// out[i] = in[i] / x
 template <typename DType, typename Lang>
-void Div(const size_t num, const Block *in, const DType x, Block *out,
+void Div(const Tensor *in, const DType x, Tensor *out,
          Context *ctx) {
   CHECK_NE(x, 0.f);
-  EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
+  EltwiseMult<DType, Lang>(in, DType(1) / x, out, ctx);
 }
 
 /// out[i] = in1[i] / in2[i]
 template <typename DType, typename Lang>
-void Div(const size_t num, const Block *in1, const Block *in2, Block *out,
+void Div(const Tensor *in1, const Tensor *in2, Tensor *out,
          Context *ctx) {
   LOG(FATAL) << "Div-Pair Not Implemented";
 }
 
 /// out[i] = in[i] * x
 template <typename DType, typename Lang>
-void EltwiseMult(const size_t num, const Block *in, const DType x, Block *out,
+void EltwiseMult(const Tensor *in, const DType x, Tensor *out,
                  Context *ctx) {
   LOG(FATAL) << "EltwiseMult Not Implemented";
 }
 
 /// out[i] = in1[i] * in2[i]
 template <typename DType, typename Lang>
-void EltwiseMult(const size_t num, const Block *in1, const Block *in2, Block *out,
+void EltwiseMult(const Tensor *in1, const Tensor *in2, Tensor *out,
                  Context *ctx) {
   LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
 }
 
 /// Base is e (Euler's number). out[i]=exp(in[i])
 template <typename DType, typename Lang>
-void Exp(const size_t num, const Block *in, Block *out, Context *ctx) {
+void Exp(const Tensor *in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Exp Not Implemented";
 }
 
 /// out[i]=(in[i]<=x)?1.f:0.f
 template <typename DType, typename Lang>
-void LE(const size_t num, const Block *in, const DType x, Block *out,
+void LE(const Tensor *in, const DType x, Tensor *out,
         Context *ctx) {
   LOG(FATAL) << "LE Not Implemented";
 }
 /// out[i]=(in1[i]<=in2[i])?1.f:0.f
 template <typename DType, typename Lang>
-void LE(const size_t num, const Block *in1, const Block *in2, Block *out,
+void LE(const Tensor *in1, const Tensor *in2, Tensor *out,
         Context *ctx) {
   LOG(FATAL) << "Tensor-Tensor LE Not Implemented";
 }
 /// Natural logarithm with base e (Euler's number): out[i]=log(in[i]).
 template <typename DType, typename Lang>
-void Log(const size_t num, const Block *in, Block *out, Context *ctx) {
+void Log(const Tensor *in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Log Not Implemented";
 }
 /// out[i]=(in[i]<x)?1.f:0.f
 template <typename DType, typename Lang>
-void LT(const size_t num, const Block *in, const DType x, Block *out,
+void LT(const Tensor *in, const DType x, Tensor *out,
         Context *ctx) {
   LOG(FATAL) << "LT Not Implemented";
 }
 /// out[i]=(in1[i]<in2[i])?1.f:0.f
 template <typename DType, typename Lang>
-void LT(const size_t num, const Block *in1, const Block *in2, Block *out,
+void LT(const Tensor *in1, const Tensor *in2, Tensor *out,
         Context *ctx) {
   LOG(FATAL) << "Tensor-Tensor LT Not Implemented";
 }
 /// out[i]=(in[i]>=x)?1.f:0.f
 template <typename DType, typename Lang>
-void GE(const size_t num, const Block *in, const DType x, Block *out,
+void GE(const Tensor *in, const DType x, Tensor *out,
         Context *ctx) {
   LOG(FATAL) << "GE Not Implemented";
 }
 /// out[i]=(in1[i]>=in2[i])?1.f:0.f
 template <typename DType, typename Lang>
-void GE(const size_t num, const Block *in1, const Block *in2, Block *out,
+void GE(const Tensor *in1, const Tensor *in2, Tensor *out,
         Context *ctx) {
   LOG(FATAL) << "Tensor-Tensor GE Not Implemented";
 }
 /// out[i]=(in[i]>x)?1.f:0.f
 template <typename DType, typename Lang>
-void GT(const size_t num, const Block *in, const DType x, Block *out,
+void GT(const Tensor *in, const DType x, Tensor *out,
         Context *ctx) {
   LOG(FATAL) << "GT Not Implemented";
 }
 /// out[i]=(in[i]>in2[i])?1.f:0.f
 template <typename DType, typename Lang>
-void GT(const size_t num, const Block *in, const Block *in2, Block *out,
+void GT(const Tensor *in, const Tensor *in2, Tensor *out,
         Context *ctx) {
   LOG(FATAL) << "Tensor-Tensor GT Not Implemented";
 }
 /// out[i] = pow(in[i], x)
 template <typename DType, typename Lang>
-void Pow(const size_t num, const Block *in, const DType x, Block *out,
+void Pow(const Tensor *in, const DType x, Tensor *out,
          Context *ctx) {
   LOG(FATAL) << "Pow Not Implemented";
 }
 
 /// out[i]=pow(in1[i], in2[i])
 template <typename DType, typename Lang>
-void Pow(const size_t num, const Block *in1, const Block *in2, Block *out,
+void Pow(const Tensor *in1, const Tensor *in2, Tensor *out,
          Context *ctx) {
   LOG(FATAL) << "Pow-Pair Not Implemented";
 }
 
 /// out[i]=max(0, in[i])
 template <typename DType, typename Lang>
-void ReLU(const size_t num, const Block *in, Block *out, Context *ctx) {
+void ReLU(const Tensor *in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "ReLU Not Implemented";
 }
 
 /// out[i] = x
 template <typename DType, typename Lang>
-void Set(const size_t num, const DType x, Block *out, Context *ctx) {
+void Set(const DType x, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Set Not Implemented";
 }
 /// out[i]=sigmoid(in[i])
 template <typename DType, typename Lang>
-void Sigmoid(const size_t num, const Block *in, Block *out, Context *ctx) {
+void Sigmoid(const Tensor *in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Sigmoid Not Implemented";
 }
 
 /// out[i] = sign(in[i])
 template <typename DType, typename Lang>
-void Sign(const size_t num, const Block *in, Block *out, Context *ctx) {
+void Sign(const Tensor *in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Sign Not Implemented";
 }
 /// out[i]=sqrt(in[i])
 template <typename DType, typename Lang>
-void Sqrt(const size_t num, const Block *in, Block *out, Context *ctx) {
+void Sqrt(const Tensor *in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Sqrt Not Implemented";
 }
 
 /// out[i]=square(in[i])
 template <typename DType, typename Lang>
-void Square(const size_t num, const Block *in, Block *out, Context *ctx) {
-  EltwiseMult<DType, Lang>(num, in, in, out, ctx);
+void Square(const Tensor *in, Tensor *out, Context *ctx) {
+  EltwiseMult<DType, Lang>(in, in, out, ctx);
 }
 
 /// out[i] =  in[i] - x
 template <typename DType, typename Lang>
-void Sub(const size_t num, const Block *in, const DType x, Block *out,
+void Sub(const Tensor *in, const DType x, Tensor *out,
          Context *ctx) {
-  Add<DType, Lang>(num, in, -x, out, ctx);
+  Add<DType, Lang>(in, -x, out, ctx);
 }
 
 /// out[i] = in1[i] - in2[i]
 template <typename DType, typename Lang>
-void Sub(const size_t num, const Block *in1, const Block *in2, Block *out,
+void Sub(const Tensor *in1, const Tensor *in2, Tensor *out,
          Context *ctx) {
   LOG(FATAL) << "Sub-Pair Not Implemented";
 }
 
 /// sum all elements of in into out
 template <typename DType, typename Lang>
-void Sum(const size_t num, const Block *in, DType *out, Context *ctx) {
+void Sum(const Tensor *in, DType *out, Context *ctx) {
   LOG(FATAL) << "Sum Not Implemented";
 }
 
 /// out[i]=tanh(in[i])
 template <typename DType, typename Lang>
-void Tanh(const size_t num, const Block *in, Block *out, Context *ctx) {
+void Tanh(const Tensor *in, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Tanh Not Implemented";
 }
 
@@ -255,20 +289,20 @@ void Tanh(const size_t num, const Block *in, Block *out, Context *ctx) {
 // Get the random generator from 'ctx'
 // If DType is not float, then convert the threshold to DType
 template <typename DType, typename Lang>
-void Bernoulli(const size_t num, const float p, Block *out, Context *ctx) {
+void Bernoulli(const float p, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Bernoulli Not Implemented";
 }
 // The random generator should be extracted from ctx.
 // If DType is not float, then convert the mean and std to DType
 template <typename DType, typename Lang>
-void Gaussian(const size_t num, const float mean, const float std, Block *out,
+void Gaussian(const float mean, const float std, Tensor *out,
               Context *ctx) {
   LOG(FATAL) << "Gaussian Not Implemented";
 }
 // The random generator should be extracted from ctx.
 // If DType is not float, then convert the low and high to DType
 template <typename DType, typename Lang>
-void Uniform(const size_t num, const float low, const float high, Block *out,
+void Uniform(const float low, const float high, Tensor *out,
              Context *ctx) {
   LOG(FATAL) << "Uniform Not Implemented";
 }
@@ -279,43 +313,43 @@ void Uniform(const size_t num, const float low, const float high, Block *out,
 
 /// Return the index of the element with the max value.
 template <typename DType, typename Lang>
-void Amax(const size_t num, const Block *in, size_t *out, Context *ctx) {
+void Amax(const Tensor *in, size_t *out, Context *ctx) {
   LOG(FATAL) << "Amax Not Implemented";
 }
 
 /// Return the index of the element with the min value.
 template <typename DType, typename Lang>
-void Amin(const size_t num, const Block *in, size_t *out, Context *ctx) {
+void Amin(const Tensor *in, size_t *out, Context *ctx) {
   LOG(FATAL) << "Amin Not Implemented";
 }
 /// out = sum |x| over all x in 'in'
 template <typename DType, typename Lang>
-void Asum(const size_t num, const Block *in, DType *out, Context *ctx) {
+void Asum(const Tensor *in, DType *out, Context *ctx) {
   LOG(FATAL) << "Asum Not Implemented";
 }
 
 /// out = alpha * in + out
 template <typename DType, typename Lang>
-void Axpy(const size_t num, const DType alpha, const Block *in, Block *out,
+void Axpy(const DType alpha, const Tensor *in, Tensor *out,
           Context *ctx) {
   LOG(FATAL) << "Axpy Not Implemented";
 }
 
 /// out = ||in||_2^2, i.e, L2 norm.
 template <typename DType, typename Lang>
-void Nrm2(const size_t num, const Block *in, float *out, Context *ctx) {
+void Nrm2(const Tensor *in, float *out, Context *ctx) {
   LOG(FATAL) << "Nrm2 Not Implemented";
 }
 
 /// out *= x
 template <typename DType, typename Lang>
-void Scale(const size_t num, const DType x, Block *out, Context *ctx) {
+void Scale(const DType x, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Scale Not Implemented";
 }
 
 /// inner product of array in1 and in2
 template <typename DType, typename Lang>
-void Dot(const size_t num, const Block *in1, const Block *in2, DType *out,
+void Dot(const Tensor *in1, const Tensor *in2, DType *out,
          Context *ctx) {
   LOG(FATAL) << "Dot Not Implemented";
 }
@@ -323,8 +357,8 @@ void Dot(const size_t num, const Block *in1, const Block *in2, DType *out,
 /// out = alpha * A * v + beta * out.
 /// transA indicates whether the internal data layout of A is transposed
 template <typename DType, typename Lang>
-void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
-          const Block *A, const Block *v, const DType beta, Block *out,
+void GEMV(const DType alpha,
+          const Tensor *A, const Tensor *v, const DType beta, Tensor *out,
           Context *ctx) {
   LOG(FATAL) << "GEMV Not Implemented";
 }
@@ -332,21 +366,21 @@ void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
 /// multiply a matrix with a diagonal matrix constructed using values from 'v'.
 /// if matrix_left_side is true, do M*v; else do v*M
 template <typename DType, typename Lang>
-void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
-          const Block *M, const Block *v, Block *out, Context *ctx) {
+void DGMM(const bool side_right,
+  const Tensor *M, const Tensor *v, Tensor *out, Context *ctx) {
   LOG(FATAL) << "DGMM Not Implemented";
 }
 
 /// C = alpha * A * B + beta * C.
 /// transA indicates whether the internal data layout of A is transposed
 template <typename DType, typename Lang>
-void GEMM(const bool transA, const bool transB, const size_t nrowA,
-          const size_t ncolB, const size_t ncolA, const DType alpha,
-          const Block *A, const Block *B, const DType beta, Block *C,
+void GEMM(const DType alpha,
+          const Tensor *A, const Tensor *B, const DType beta, Tensor *C,
           Context *ctx) {
   LOG(FATAL) << "GEMM Not Implemented";
 }
 
+//yisen todo
 template <typename DType, typename Lang>
 void ComputeCrossEntropy(bool int_target, const size_t batchsize,
                          const size_t dim, const Block *p, const Block *t,
@@ -362,8 +396,7 @@ void SoftmaxCrossEntropyBwd(bool int_target, const size_t batchsize,
 }
 
 template <typename DType, typename Lang>
-void RowMax(const size_t nrow, const size_t ncol, const Block *in,
-    Block *ret, Context* ctx) {
+void RowMax(const Tensor *in, Tensor *out, Context* ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 // **************************************
@@ -372,40 +405,40 @@ void RowMax(const size_t nrow, const size_t ncol, const Block *in,
 /*
 /// Add the vector v to every column of A as the column of out
 template <typename DType, typename Lang>
-void AddCol(const size_t nrow, const size_t ncol, const Block *A, const Block *v,
-            Block *out, Context *ctx) {
+void AddCol(const size_t nrow, const size_t ncol, const Tensor *A, const Tensor *v,
+            Tensor *out, Context *ctx) {
   LOG(FATAL) << "AddCol Not Implemented";
 }
 // TODO(wangwei) unify AddRow and AddCol.
 /// Add the vector v to every row of A as the row of out
 template <typename DType, typename Lang>
-void AddRow(const size_t nrow, const size_t ncol, const Block *A, const Block *v,
-            Block *out, Context *ctx) {
+void AddRow(const size_t nrow, const size_t ncol, const Tensor *A, const Tensor *v,
+            Tensor *out, Context *ctx) {
   LOG(FATAL) << "AddRow Not Implemented";
 }
 /// outer-product.
 /// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
 template <typename DType, typename Lang>
-void Outer(const size_t m, const size_t n, const Block *in1, const Block *in2,
-           Block *out, Context *ctx) {
+void Outer(const size_t m, const size_t n, const Tensor *in1, const Tensor *in2,
+           Tensor *out, Context *ctx) {
   LOG(FATAL) << "Outer Not Implemented";
 }
 
 /// Sum the columns of the in matrix into a vector
 template <typename DType, typename Lang>
-void SumColumns(const size_t nrow, const size_t ncol, const Block *in, Block *out,
+void SumColumns(const size_t nrow, const size_t ncol, const Tensor *in, Tensor *out,
                 Context *ctx) {
   LOG(FATAL) << "SumColumns Not Implemented";
 }
 template <typename DType, typename Lang>
-void Set(const size_t num, const DType x, Block *out, Context *ctx) {
+void Set(const DType x, Tensor *out, Context *ctx) {
   LOG(FATAL) << "Not Implemented";
 }
 
 // TODO(wangwei) unify SumRow and SumCol.
 /// Sum the rows of the in matrix into a vector
 template <typename DType, typename Lang>
-void SumRows(const size_t nrow, const size_t ncol, const Block *in, Block *out,
+void SumRows(const size_t nrow, const size_t ncol, const Tensor *in, Tensor *out,
              Context *ctx) {
   LOG(FATAL) << "SumRows Not Implemented";
 }
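
To make the migration concrete: with the Tensor-based signatures and the
TraverseUnary/TraverseBinary helpers above, a CPP backend kernel reduces to a
lambda plus a traversal. The following is a sketch of what such a
specialization could look like (it assumes the declarations from this header;
the kernels actually committed in tensor_math_cpp.h may differ in detail):

template <>
void EltwiseMult<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
                                   Tensor* out, Context* ctx) {
  // Element-wise product; TraverseBinary walks both inputs through their
  // strides and writes the results contiguously into 'out'.
  auto mult_lambda = [](float a, float b) { return a * b; };
  TraverseBinary<float>(in1, in2, out, mult_lambda);
}

The same shape applies to the scalar variants, where the scalar is captured
by the lambda instead of being threaded through the traversal.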



[03/10] incubator-singa git commit: Singa-351 Added stride support and cudnn codes to cuda

Posted by wa...@apache.org.
Singa-351 Added stride support and cudnn codes to cuda


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/26101eee
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/26101eee
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/26101eee

Branch: refs/heads/master
Commit: 26101eee95db67316d31bf96956b10a28c37b0e1
Parents: a88efa0
Author: Vaan Ng <cm...@gmail.com>
Authored: Sun May 6 23:24:35 2018 +0800
Committer: Vaan Ng <cm...@gmail.com>
Committed: Thu May 10 14:39:26 2018 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h        |  79 ++-
 src/core/tensor/tensor_math_cuda.h | 860 +++++++++++++++++++++++++-------
 2 files changed, 745 insertions(+), 194 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/26101eee/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 6eafbdf..2c28e0f 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -104,15 +104,83 @@ class Tensor {
     return shape_.at(idx);
   }
 
+  /*  
+  cudnn requires tensor dimensions to fulfill 2 requirements:
+    1.) the number of dimensions must be padded to a minimum of 4 for 4d and lower-dimensional tensors (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
+    2.) dimensions have to be set to multiples of 8
+
+    e.g., Tensor A has shape {3,3}; cudnn requires shape {1,1,24,24} as the input
+          Tensor B has shape {2,3,4}; cudnn requires shape {1,16,24,32} as the input
+  */
+  vector<int> generate_shape_cuda() const {
+    vector<int> shape_arr;
+    if(shape_.size() <= 4){
+      for (size_t n=0; n<4-shape_.size(); ++n) {
+        shape_arr.push_back(1);
+      } 
+      for (size_t n=0; n<shape_.size(); ++n) {
+        shape_arr.push_back(shape_.at(n));
+      } 
+      return shape_arr;
+    } else if(shape_.size() == 5){
+      for (size_t n=0; n<shape_.size(); ++n) {
+        shape_arr.push_back(shape_.at(n));
+      } 
+      return shape_arr;
+    } else {
+      LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported";
+    }
+  }
+
+  int generate_dim_cuda() const {
+    if(shape_.size() <= 4){return 4;}
+    else if(shape_.size() == 5){return 5;}
+    else{
+      LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported";
+    } 
+  }
+
   size_t nDim() const { return shape_.size(); }
 
   bool empty() const { return nDim() == 0; }
 
   //bool transpose() const { return transpose_; }
-  bool transpose() const { return (strides_[0] != 1); }
+  bool transpose() const { return (strides_.back() != 1); }
 
   const vector<int>& strides() const { return strides_; }
 
+  /*  
+  cudnn requires the stride dimensions to conform to the format of the shape input as well
+    1.) the number of stride dimensions must be padded to a minimum of 4 for 4d and lower-dimensional tensors (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
+    2.) stride dimensions have to be set to powers of 8, depending on the stride order (outer stride = higher power)
+
+    e.g., Tensor A has shape {3,3} and stride {3,1}; cudnn requires shape {1,1,24,24} and stride {576, 576, 24, 1} as the inputs;
+          if A is transposed with stride {1,3}, the new cudnn stride becomes {576, 576, 8, 3}
+  */
+  vector<int> generate_strides_cuda() const {
+    vector<int> strides_arr;
+    int product = 1;
+    for (size_t n=0; n<(shape_.size()); ++n) {
+      product *= shape_[n];
+    }
+    if(shape_.size() <= 4){
+      for (size_t n=0; n<4-shape_.size(); ++n) {
+        strides_arr.push_back(product);
+      } 
+      for (size_t n=0; n<strides_.size(); ++n) {
+          strides_arr.push_back(strides_[n]);
+        }
+      return strides_arr;
+    } else if(shape_.size() == 5){
+      for (size_t n=0; n<strides_.size(); ++n) {
+          strides_arr.push_back(strides_[n]);
+        }
+      return strides_arr;
+    } else {
+      LOG(FATAL) << "Dimensions (strides) beyond 5 are currently not supported";
+    }
+  }
+
   const vector<int>& shape_multipliers() const { return shape_multipliers_; }
 
   /// return true if the content of the tensor is initialized
@@ -235,9 +303,12 @@ void Generate_Strides(){
         cumulative_product = cumulative_product*shape_[n];
         strides_.push_back(dim/cumulative_product);
     }
-    reverse(strides_.begin(), strides_.end());
 };
 
+void Set_Strides(const vector<int>& new_strides){
+  strides_ = new_strides;
+}
+
 //generate shape multipliers
 //for e.g. tensor of shape (3,3), stride (1,3) will have shape multipliers of (3,1)
 //for e.g. tensor of shape (3,3), stride (3,1) will also have shape multipliers of (3,1)
@@ -303,7 +374,7 @@ void update_base_index(std::vector<int>& traversal_info) const {
 void traverse_next(std::vector<int>& traversal_info, int counter) const {
     update_base_index(traversal_info);
     traversal_info[shape_.size()+1] = determine_order(counter);
-    traversal_info[shape_.size()] = traversal_info[traversal_info[shape_.size()+1]]+strides_[traversal_info[shape_.size()+1]];
+    traversal_info[shape_.size()] = traversal_info[traversal_info[shape_.size()+1]]+strides_[strides_.size()-traversal_info[shape_.size()+1]-1];
 };
 
 // ******************************************************************************************
@@ -498,6 +569,8 @@ void MultColumn(const Tensor &v, Tensor *M);
 void MultRow(const Tensor &v, Tensor *M);
 /// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
 Tensor SoftMax(const Tensor &in);
+
+Tensor RowMax(const Tensor &in);
 /// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
 void SoftMax(const Tensor &in, Tensor *out);
 /// Sub column 'v' by each column of matrix M
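
The padding rule in generate_shape_cuda() and generate_strides_cuda() above
can be verified by hand. This standalone sketch (illustrative only, not SINGA
code) reproduces what the committed code builds for a {3,3} tensor with
strides {3,1}: the shape is left-padded with 1s to four dimensions and the
strides are left-padded with the total element count.

#include <iostream>
#include <vector>

int main() {
  std::vector<int> shape   = {3, 3};
  std::vector<int> strides = {3, 1};

  int product = 1;
  for (int d : shape) product *= d;  // 9 elements in total

  std::vector<int> shape4(4 - shape.size(), 1);             // {1,1}
  shape4.insert(shape4.end(), shape.begin(), shape.end());  // {1,1,3,3}

  std::vector<int> strides4(4 - shape.size(), product);     // {9,9}
  strides4.insert(strides4.end(), strides.begin(), strides.end());  // {9,9,3,1}

  for (int v : shape4)   std::cout << v << " ";  // 1 1 3 3
  std::cout << "| ";
  for (int v : strides4) std::cout << v << " ";  // 9 9 3 1
  std::cout << std::endl;
}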

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/26101eee/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 8a9e47a..f4839e3 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -20,6 +20,7 @@
 #define  SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
 #include "singa/singa_config.h"
 #ifdef USE_CUDA
+#include "singa/core/tensor.h"
 #include "./tensor_math.h"
 #include "./math_kernel.h"
 #include "singa/utils/cuda_utils.h"
@@ -27,254 +28,636 @@
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include "singa/utils/cuda_utils.h"
+#include <cudnn.h>
 
 namespace singa {
 
 /// out[i] = |in[i]|
 template <>
-void Abs<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+void Abs<float, lang::Cuda>(const Tensor* in, Tensor* out,
                             Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  cuda::abs(num, inPtr, outPtr, ctx->stream);
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
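+  // |x| is computed as max(1*x, -1*x): cudnnOpTensor with CUDNN_OP_TENSOR_MAX, alpha1=1, alpha2=-1.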
+  cudnnOpTensorOp_t op = CUDNN_OP_TENSOR_MAX;
+  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+  cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+  cudnnOpTensorDescriptor_t op_desc;
+  cudnnCreateOpTensorDescriptor(&op_desc);
+  cudnnSetOpTensorDescriptor(op_desc, op, cudnn_dtype, cudnn_propagation);
+  
+  float alpha1[1] = {1.0};
+  float alpha2[1] = {-1.0};
+  float beta[1] = {0.0};
+  cudnnTensorDescriptor_t in_desc, out_desc;
+  cudnnCreateTensorDescriptor(&in_desc);
+  cudnnCreateTensorDescriptor(&out_desc);
+  cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+  cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+  cudnnOpTensor(ctx->cudnn_handle, op_desc, (void*)(&alpha1), in_desc, inPtr, 
+                (void*)(&alpha2), in_desc, inPtr, (void*)(&beta), out_desc, outPtr);
+
+  cudnnDestroyTensorDescriptor(in_desc);
+  cudnnDestroyTensorDescriptor(out_desc);
 }
-/// out = in + x
+
 template <>
-void Add<float, lang::Cuda>(const size_t num, const Block* in, const float x,
-                            Block* out, Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  cuda::add(num, inPtr, x, outPtr, ctx->stream);
+void Set<float, lang::Cuda>(const float x, Tensor* out,
+                            Context* ctx) {
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  //float valuePtr[1] = {x};
+
+  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+  cudnnTensorDescriptor_t out_desc;
+  cudnnCreateTensorDescriptor(&out_desc);
+  cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+  cudnnSetTensor(ctx->cudnn_handle, out_desc, outPtr, (void*)(&x));
+
+  cudnnDestroyTensorDescriptor(out_desc);
+}
+
+template <>
+void Add<float, lang::Cuda>(const Tensor* in, const float x,
+                            Tensor* out, Context* ctx) {
+  Set<float, lang::Cuda>(x, out, ctx);
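+  // 'out' is pre-filled with the scalar x; cudnnAddTensor below then computes out = 1*in + 1*out = in + x.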
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+  float alpha = 1.0, beta=1.0;
+  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+  cudnnTensorDescriptor_t in_desc, out_desc;
+  cudnnCreateTensorDescriptor(&in_desc);
+  cudnnCreateTensorDescriptor(&out_desc);
+  cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+  cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+  cudnnAddTensor(ctx->cudnn_handle, (void*)(&alpha), in_desc, inPtr,  (void*)(&beta), out_desc, outPtr);
+
+  cudnnDestroyTensorDescriptor(in_desc);
+  cudnnDestroyTensorDescriptor(out_desc);
 }
+
 /// out = in1 + in2
 template <>
-void Add<float, lang::Cuda>(const size_t num, const Block* in1,
-                            const Block* in2, Block* out, Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->data());
-  const float* inPtr2 = static_cast<const float*>(in2->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  cuda::add(num, inPtr1, inPtr2, outPtr, ctx->stream);
+void Add<float, lang::Cuda>(const Tensor* in1,
+                            const Tensor* in2, Tensor* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+  cudnnOpTensorOp_t op = CUDNN_OP_TENSOR_ADD;
+  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+  cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+  cudnnOpTensorDescriptor_t op_desc;
+  cudnnCreateOpTensorDescriptor(&op_desc);
+  cudnnSetOpTensorDescriptor(op_desc, op, cudnn_dtype, cudnn_propagation);
+
+  float alpha1[1] = {1.0};
+  float alpha2[1] = {1.0};
+  float beta[1] = {0.0};
+  cudnnTensorDescriptor_t in1_desc, in2_desc, out_desc;
+  cudnnCreateTensorDescriptor(&in1_desc);
+  cudnnCreateTensorDescriptor(&in2_desc);
+  cudnnCreateTensorDescriptor(&out_desc);
+  cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+  if((in1->nDim() == in2->nDim()) || (in2->nDim() == 1)){
+    cudnnSetTensorNdDescriptor(in2_desc, cudnn_dtype, in2->generate_dim_cuda(), in2->generate_shape_cuda().data(), in2->generate_strides_cuda().data());
+  } else {
+    cudnnSetTensorNdDescriptor(in2_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+  }
+
+  cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+  cudnnOpTensor(ctx->cudnn_handle, op_desc, (void*)(alpha1), in1_desc, inPtr1,
+                (void*)(alpha2), in2_desc, inPtr2, (void*)(beta), out_desc, outPtr);
+
+  cudnnDestroyTensorDescriptor(in1_desc);
+  cudnnDestroyTensorDescriptor(in2_desc);
+  cudnnDestroyTensorDescriptor(out_desc);
+}
+
+/// out = in1 - in2
+template <>
+void Sub<float, lang::Cuda>(const Tensor* in1,
+                            const Tensor* in2, Tensor* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+  cudnnOpTensorOp_t op = CUDNN_OP_TENSOR_ADD;
+  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+  cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+  cudnnOpTensorDescriptor_t op_desc;
+  cudnnCreateOpTensorDescriptor(&op_desc);
+  cudnnSetOpTensorDescriptor(op_desc, op, cudnn_dtype, cudnn_propagation);
+
+  float alpha1[1] = {1.0};
+  float alpha2[1] = {-1.0};
+  float beta[1] = {0.0};
+  cudnnTensorDescriptor_t in1_desc, in2_desc, out_desc;
+  cudnnCreateTensorDescriptor(&in1_desc);
+  cudnnCreateTensorDescriptor(&in2_desc);
+  cudnnCreateTensorDescriptor(&out_desc);
+  cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+  if((in1->nDim() == in2->nDim()) || (in2->nDim() == 1)){
+    cudnnSetTensorNdDescriptor(in2_desc, cudnn_dtype, in2->generate_dim_cuda(), in2->generate_shape_cuda().data(), in2->generate_strides_cuda().data());
+  } else {
+    cudnnSetTensorNdDescriptor(in2_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+  }
+
+  cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+  cudnnOpTensor(ctx->cudnn_handle, op_desc, (void*)(alpha1), in1_desc, inPtr1,
+                (void*)(alpha2), in2_desc, inPtr2, (void*)(beta),  out_desc, outPtr);
+
+  cudnnDestroyTensorDescriptor(in1_desc);
+  cudnnDestroyTensorDescriptor(in2_desc);
+  cudnnDestroyTensorDescriptor(out_desc);
 }
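
Add and Sub above share a single cudnnOpTensor pattern: out = alpha1*in1 +
alpha2*in2 under CUDNN_OP_TENSOR_ADD, with alpha2 = -1.0 turning addition into
subtraction. A minimal standalone sketch of that pattern, assuming plain
contiguous NCHW device buffers rather than singa Tensors:

    // Sketch: out = alpha1*a + alpha2*b for contiguous float tensors of shape
    // {n,c,h,w}; alpha2 = -1.0f yields subtraction, as in Sub above.
    void op_tensor_add(cudnnHandle_t handle, int n, int c, int h, int w,
                       float alpha1, float alpha2,
                       const float* a, const float* b, float* out) {
      cudnnOpTensorDescriptor_t op_desc;
      cudnnCreateOpTensorDescriptor(&op_desc);
      cudnnSetOpTensorDescriptor(op_desc, CUDNN_OP_TENSOR_ADD, CUDNN_DATA_FLOAT,
                                 CUDNN_PROPAGATE_NAN);
      cudnnTensorDescriptor_t desc;  // same shape/strides for all three tensors
      cudnnCreateTensorDescriptor(&desc);
      cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
                                 n, c, h, w);
      float beta = 0.0f;
      cudnnOpTensor(handle, op_desc, &alpha1, desc, a, &alpha2, desc, b,
                    &beta, desc, out);
      cudnnDestroyTensorDescriptor(desc);
      cudnnDestroyOpTensorDescriptor(op_desc);
    }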
+
 /// Element-wise operation, clamp every element into [low, high]
 /// if x>high, then x=high; if x<low, then x=low.
 template <>
-void Clamp<float, lang::Cuda>(const size_t num, const float low,
-                              const float high, const Block* in, Block* out,
+void Clamp<float, lang::Cuda>(const float low,
+                              const float high, const Tensor* in, Tensor* out,
                               Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = in->Size();
   cuda::clamp(num, low, high, inPtr, outPtr, ctx->stream);
+  out->Set_Strides(in->strides());
 }
 /// out = in1 / in2
 template <>
-void Div<float, lang::Cuda>(const size_t num, const Block* in1,
-                            const Block* in2, Block* out, Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->data());
-  const float* inPtr2 = static_cast<const float*>(in2->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
+void Div<float, lang::Cuda>(const Tensor* in1,
+                            const Tensor* in2, Tensor* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = in1->Size();
+
+  if(in1->strides() == in2->strides()){ // if in1 and in2 have identical strides, the plain cuda::div kernel applies directly
+        cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
+        out->Set_Strides(in1->strides());
+  } else { // otherwise, first copy in1 into out with in2's layout, then divide in place
+    float alpha[1] = {1.0};
+    float beta[1] = {0.0};
+
+    cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+    cudnnTensorDescriptor_t in1_desc, out_desc;
+    cudnnCreateTensorDescriptor(&in1_desc);
+    cudnnCreateTensorDescriptor(&out_desc);
+    cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+    out->Set_Strides(in2->strides());
+    cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+    cudnnTransformTensor(ctx->cudnn_handle, (void*)(alpha), in1_desc, inPtr1,
+                         (void*)(beta), out_desc, outPtr);
+
+    cuda::div(num, outPtr, inPtr2, outPtr, ctx->stream);
+    cudnnDestroyTensorDescriptor(in1_desc);
+    cudnnDestroyTensorDescriptor(out_desc);
+  }
 }
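
When the operand strides differ (e.g. in1 is a transposed view), the else
branch above first materialises in1 in in2's layout via cudnnTransformTensor,
after which a plain elementwise kernel is safe to run. A reduced sketch of just
that normalisation step, assuming a hypothetical rows x cols view whose data is
stored transposed:

    // Sketch: copy a strided (transposed) 2d view `src` into a contiguous
    // buffer `dst` so plain elementwise kernels can be applied afterwards.
    void make_contiguous(cudnnHandle_t handle, int rows, int cols,
                         const float* src, float* dst) {
      cudnnTensorDescriptor_t src_desc, dst_desc;
      cudnnCreateTensorDescriptor(&src_desc);
      cudnnCreateTensorDescriptor(&dst_desc);
      // shapes padded to 4d as cudnn requires; in the transposed view the
      // fastest-moving index has stride `rows` instead of 1
      int shape[4] = {1, 1, rows, cols};
      int src_strides[4] = {rows * cols, rows * cols, 1, rows};
      int dst_strides[4] = {rows * cols, rows * cols, cols, 1};
      cudnnSetTensorNdDescriptor(src_desc, CUDNN_DATA_FLOAT, 4, shape, src_strides);
      cudnnSetTensorNdDescriptor(dst_desc, CUDNN_DATA_FLOAT, 4, shape, dst_strides);
      float alpha = 1.0f, beta = 0.0f;
      cudnnTransformTensor(handle, &alpha, src_desc, src, &beta, dst_desc, dst);
      cudnnDestroyTensorDescriptor(src_desc);
      cudnnDestroyTensorDescriptor(dst_desc);
    }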
 
 template <>
-void Div<float, lang::Cuda>(const size_t num, const float x, const Block* in,
-                            Block* out, Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
+void Div<float, lang::Cuda>(const float x, const Tensor* in,
+                            Tensor* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = in->Size();
   cuda::div(num, x, inPtr, outPtr, ctx->stream);
+  out->Set_Strides(in->strides());
 }
 
 /// out = in * x
 template <>
-void EltwiseMult<float, lang::Cuda>(const size_t num, const Block* in,
-                                    const float x, Block* out, Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  cuda::mult(num, inPtr, x, outPtr, ctx->stream);
+void EltwiseMult<float, lang::Cuda>(const Tensor* in,
+                                    const float x, Tensor* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+  float alpha = x, beta = 0.0;
+  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+  cudnnTensorDescriptor_t in_desc, out_desc;
+  cudnnCreateTensorDescriptor(&in_desc);
+  cudnnCreateTensorDescriptor(&out_desc);
+  cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+  cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+  cudnnAddTensor(ctx->cudnn_handle, (void*)(&alpha), in_desc, inPtr,  (void*)(&beta), out_desc, outPtr);
+
+  cudnnDestroyTensorDescriptor(in_desc);
+  cudnnDestroyTensorDescriptor(out_desc);
 }
+
 /// out = in1 * in2
 template <>
-void EltwiseMult<float, lang::Cuda>(const size_t num, const Block* in1,
-                                    const Block* in2, Block* out,
+void EltwiseMult<float, lang::Cuda>(const Tensor* in1,
+                                    const Tensor* in2, Tensor* out,
                                     Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->data());
-  const float* inPtr2 = static_cast<const float*>(in2->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
+  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = in1->Size();
+
+  if(in1->strides() == in2->strides()){ // if in1 and in2 have identical strides, the plain cuda::mult kernel applies directly
+        cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
+        out->Set_Strides(in1->strides());
+  } else { // otherwise, first copy in1 into out with in2's layout, then multiply in place
+    float alpha[1] = {1.0};
+    float beta[1] = {0.0};
+
+
+    cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+    cudnnTensorDescriptor_t in1_desc, out_desc;
+    cudnnCreateTensorDescriptor(&in1_desc);
+    cudnnCreateTensorDescriptor(&out_desc);
+    cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+    out->Set_Strides(in2->strides());
+    cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+    cudnnTransformTensor(ctx->cudnn_handle, (void*)(alpha), in1_desc, inPtr1,
+                         (void*)(beta), out_desc, outPtr);
+
+    cuda::mult(num, outPtr, inPtr2, outPtr, ctx->stream);
+    cudnnDestroyTensorDescriptor(in1_desc);
+    cudnnDestroyTensorDescriptor(out_desc);
+  }
 }
+
+
 /// Base is e. out[i]=e^in[i]
 template <>
-void Exp<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+void Exp<float, lang::Cuda>(const Tensor* in, Tensor* out,
                             Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = in->Size();
   cuda::exp(num, inPtr, outPtr, ctx->stream);
+  out->Set_Strides(in->strides());
 }
 
 template <>
-void GE<float, lang::Cuda>(const size_t num, const Block* in, const float x,
-                           Block* out, Context* ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->data());
+void GE<float, lang::Cuda>(const Tensor* in, const float x,
+                           Tensor* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const size_t num = in->Size();
   cuda::ge(num, inPtr, x, outPtr, ctx->stream);
+  out->Set_Strides(in->strides());
 }
 template <>
-void GE<float, lang::Cuda>(const size_t num, const Block* in1, const Block* in2,
-                           Block* out, Context* ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr1 = static_cast<const float*>(in1->data());
-  const float* inPtr2 = static_cast<const float*>(in2->data());
-  cuda::ge(num, inPtr1, inPtr2, outPtr, ctx->stream);
+void GE<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+                           Tensor* out, Context* ctx) {
+  Sub<float, lang::Cuda>(in1, in2, out, ctx);
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+  // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+  const size_t num = in1->Size();
+  //cuda::ge(num, inPtr1, inPtr2, outPtr, ctx->stream);
+  cuda::ge(num, outPtr, 0.0, outPtr, ctx->stream);
 }
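
GE here, and GT/LE/LT below, no longer launch a dedicated two-tensor comparison
kernel; they reuse the stride-aware Sub above and then compare the difference
against zero, since in1 >= in2 holds exactly when in1 - in2 >= 0. On host data
the rewrite amounts to:

    // Host-side sketch of the identity used above: GE(a, b) == GE(a - b, 0).
    void ge_via_sub(const float* a, const float* b, float* out, size_t n) {
      for (size_t i = 0; i < n; i++)
        out[i] = ((a[i] - b[i]) >= 0.0f) ? 1.0f : 0.0f;
    }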
 
 
 template <>
-void GT<float, lang::Cuda>(const size_t num, const Block* in, const float x,
-                           Block* out, Context* ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->data());
+void GT<float, lang::Cuda>(const Tensor* in, const float x,
+                           Tensor* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const size_t num = in->Size();
   cuda::gt(num, inPtr, x, outPtr, ctx->stream);
+  out->Set_Strides(in->strides());
 }
 template <>
-void GT<float, lang::Cuda>(const size_t num, const Block* in1, const Block* in2,
-                           Block* out, Context* ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr1 = static_cast<const float*>(in1->data());
-  const float* inPtr2 = static_cast<const float*>(in2->data());
-  cuda::gt(num, inPtr1, inPtr2, outPtr, ctx->stream);
+void GT<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+                           Tensor* out, Context* ctx) {
+  Sub<float, lang::Cuda>(in1, in2, out, ctx);
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+  // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+  const size_t num = in1->Size();
+  //cuda::gt(num, inPtr1, inPtr2, outPtr, ctx->stream);
+  cuda::gt(num, outPtr, 0.0, outPtr, ctx->stream);
 }
 template <>
-void LE<float, lang::Cuda>(const size_t num, const Block* in, const float x,
-                           Block* out, Context* ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->data());
+void LE<float, lang::Cuda>(const Tensor* in, const float x,
+                           Tensor* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const size_t num = in->Size();
   cuda::le(num, inPtr, x, outPtr, ctx->stream);
+  out->Set_Strides(in->strides());
 }
 template <>
-void LE<float, lang::Cuda>(const size_t num, const Block* in1, const Block* in2,
-                           Block* out, Context* ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr1 = static_cast<const float*>(in1->data());
-  const float* inPtr2 = static_cast<const float*>(in2->data());
-  cuda::le(num, inPtr1, inPtr2, outPtr, ctx->stream);
+void LE<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+                           Tensor* out, Context* ctx) {
+  Sub<float, lang::Cuda>(in1, in2, out, ctx);
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+  // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+  const size_t num = in1->Size();
+  //cuda::le(num, inPtr1, inPtr2, outPtr, ctx->stream);
+  cuda::le(num, outPtr, 0.0, outPtr, ctx->stream);
 }
 
 /// Natural logarithm, base e (Napier's constant): out[i]=ln(in[i]).
 template <>
-void Log<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+void Log<float, lang::Cuda>(const Tensor* in, Tensor* out,
                             Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = in->Size();
   cuda::log(num, inPtr, outPtr, ctx->stream);
+  out->Set_Strides(in->strides());
 }
 template <>
-void LT<float, lang::Cuda>(const size_t num, const Block* in, const float x,
-                           Block* out, Context* ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->data());
+void LT<float, lang::Cuda>(const Tensor* in, const float x,
+                           Tensor* out, Context* ctx) {
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const size_t num = in->Size();
   cuda::lt(num, inPtr, x, outPtr, ctx->stream);
+  out->Set_Strides(in->strides());
 }
 template <>
-void LT<float, lang::Cuda>(const size_t num, const Block* in1, const Block* in2,
-                           Block* out, Context* ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  const float* inPtr1 = static_cast<const float*>(in1->data());
-  const float* inPtr2 = static_cast<const float*>(in2->data());
-  cuda::lt(num, inPtr1, inPtr2, outPtr, ctx->stream);
+void LT<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+                           Tensor* out, Context* ctx) {
+  Sub<float, lang::Cuda>(in1, in2, out, ctx);
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+  // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+  const size_t num = in1->Size();
+  //cuda::lt(num, inPtr1, inPtr2, outPtr, ctx->stream);
+  cuda::lt(num, outPtr, 0.0, outPtr, ctx->stream);
 }
 /// Element-wise operation, out[i] = in[i]^x
 template <>
-void Pow<float, lang::Cuda>(const size_t num, const Block* in, const float x,
-                            Block* out, Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
+void Pow<float, lang::Cuda>(const Tensor* in, const float x,
+                            Tensor* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = in->Size();
   cuda::pow(num, inPtr, x, outPtr, ctx->stream);
+  out->Set_Strides(in->strides());
 }
 /// Element-wise operation, out[i] = in1[i]^in2[i]
 template <>
-void Pow<float, lang::Cuda>(const size_t num, const Block* in1,
-                            const Block* in2, Block* out, Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->data());
-  const float* inPtr2 = static_cast<const float*>(in2->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
+void Pow<float, lang::Cuda>(const Tensor* in1,
+                            const Tensor* in2, Tensor* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = in1->Size();
+
+  if(in1->strides() == in2->strides()){ // if in1 and in2 have identical strides, the plain cuda::pow kernel applies directly
+        cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
+        out->Set_Strides(in1->strides());
+  } else { // otherwise, first copy in1 into out with in2's layout, then apply pow in place
+    float alpha[1] = {1.0};
+    float beta[1] = {0.0};
+
+    cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+    cudnnTensorDescriptor_t in1_desc, out_desc;
+    cudnnCreateTensorDescriptor(&in1_desc);
+    cudnnCreateTensorDescriptor(&out_desc);
+    cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+    out->Set_Strides(in2->strides());
+    cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+    cudnnTransformTensor(ctx->cudnn_handle, (void*)(alpha), in1_desc, inPtr1,
+                         (void*)(beta), out_desc, outPtr);
+
+    cuda::pow(num, outPtr, inPtr2, outPtr, ctx->stream);
+    cudnnDestroyTensorDescriptor(in1_desc);
+    cudnnDestroyTensorDescriptor(out_desc);
+  }
 }
 
 /// Element-wise operation, out[i]=max(0, in[i])
+// template <>
+// void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
+//                              Context* ctx) {
+//   const float* inPtr = static_cast<const float*>(in->block()->data());
+//   float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+//   cudnnActivationDescriptor_t act_desc;
+//   cudnnActivationMode_t mode = CUDNN_ACTIVATION_RELU;
+//   cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+//   double coef = 0.0; //only used for CLIPPED_RELU or ELU
+//   cudnnCreateActivationDescriptor(&act_desc);
+//   cudnnSetActivationDescriptor(act_desc, mode, cudnn_propagation, coef);
+  
+//   float alpha[1] = {1.0};
+//   float beta[1] = {0.0};
+//   cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+//   cudnnTensorDescriptor_t in_desc, out_desc;
+//   cudnnCreateTensorDescriptor(&in_desc);
+//   cudnnCreateTensorDescriptor(&out_desc);
+//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+//   cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr, 
+//                         (void*)(&beta), out_desc, outPtr);
+
+//   cudnnDestroyTensorDescriptor(in_desc);
+//   cudnnDestroyTensorDescriptor(out_desc);
+//   cudnnDestroyActivationDescriptor(act_desc);
+// }
+
 template <>
-void ReLU<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = in->Size();
   cuda::relu(num, inPtr, outPtr, ctx->stream);
+  out->Set_Strides(in->strides());
 }
 
-/// out[i] = x
-template <>
-void Set<float, lang::Cuda>(const size_t num, const float x, Block* out,
-                            Context* ctx) {
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  cuda::set(num, x, outPtr, ctx->stream);
-}
+// /// Element-wise operation, out[i]=sigmoid(in[i])
+// template <>
+// void Sigmoid<float, lang::Cuda>(const Tensor* in, Tensor* out,
+//                                 Context* ctx) {
+//   const float* inPtr = static_cast<const float*>(in->block()->data());
+//   float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+//   cudnnActivationDescriptor_t act_desc;
+//   cudnnActivationMode_t mode = CUDNN_ACTIVATION_SIGMOID;
+//   cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+//   double coef = 0.0; //only used for CLIPPED_RELU or ELU
+//   cudnnCreateActivationDescriptor(&act_desc);
+//   cudnnSetActivationDescriptor(act_desc, mode, cudnn_propagation, coef);
+  
+//   float alpha[1] = {1.0};
+//   float beta[1] = {0.0};
+//   cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+//   cudnnTensorDescriptor_t in_desc, out_desc;
+//   cudnnCreateTensorDescriptor(&in_desc);
+//   cudnnCreateTensorDescriptor(&out_desc);
+//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+//   cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr, 
+//                         (void*)(&beta), out_desc, outPtr);
+
+//   cudnnDestroyTensorDescriptor(in_desc);
+//   cudnnDestroyTensorDescriptor(out_desc);
+//   cudnnDestroyActivationDescriptor(act_desc);
+// }
+
 /// Element-wise operation, out[i]=sigmoid(in[i])
 template <>
-void Sigmoid<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+void Sigmoid<float, lang::Cuda>(const Tensor* in, Tensor* out,
                                 Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = in->Size();
   cuda::sigmoid(num, inPtr, outPtr, ctx->stream);
+  out->Set_Strides(in->strides());
 }
+
 // out[i] = sign(in[i])
 template <>
-void Sign<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+void Sign<float, lang::Cuda>(const Tensor* in, Tensor* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = in->Size();
   cuda::sign(num, inPtr, outPtr, ctx->stream);
+  out->Set_Strides(in->strides());
 }
 
-/// Element-wise operation, out[i]=sqrt([in[i])
+// Element-wise operation, out[i]=sqrt(in[i])
 template <>
-void Sqrt<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+void Sqrt<float, lang::Cuda>(const Tensor* in, Tensor* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  cuda::sqrt(num, inPtr, outPtr, ctx->stream);
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+  cudnnOpTensorOp_t op = CUDNN_OP_TENSOR_SQRT;
+  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+  cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+  cudnnOpTensorDescriptor_t op_desc;
+  cudnnCreateOpTensorDescriptor(&op_desc);
+  cudnnSetOpTensorDescriptor(op_desc, op, cudnn_dtype, cudnn_propagation);
+  
+  float alpha1[1] = {1.0};
+  float alpha2[1] = {0.0};
+  float beta[1] = {0.0};
+  cudnnTensorDescriptor_t in_desc, out_desc;
+  cudnnCreateTensorDescriptor(&in_desc);
+  cudnnCreateTensorDescriptor(&out_desc);
+  cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+  cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+  cudnnOpTensor(ctx->cudnn_handle, op_desc, (void*)(&alpha1), in_desc, inPtr, 
+                (void*)(&alpha2), in_desc, inPtr, (void*)(&beta), out_desc, outPtr);
+
+  cudnnDestroyTensorDescriptor(in_desc);
+  cudnnDestroyTensorDescriptor(out_desc);
 }
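
Per the cudnn documentation, CUDNN_OP_TENSOR_SQRT is applied to the A operand
only, so the second operand here is a dummy (the same pointer with alpha2 = 0)
and the call reduces to out[i] = sqrt(in[i]). The equivalent host-side
computation is simply:

    #include <math.h>
    #include <stddef.h>

    // Host sketch of what the cudnnOpTensor call above computes elementwise.
    void sqrt_elementwise(const float* in, float* out, size_t n) {
      for (size_t i = 0; i < n; i++) out[i] = sqrtf(in[i]);
    }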
 
 /// Element-wise operation, out[i]=in[i]^2
 template <>
-void Square<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+void Square<float, lang::Cuda>(const Tensor* in, Tensor* out,
                                Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = in->Size();
   cuda::square(num, inPtr, outPtr, ctx->stream);
+  out->Set_Strides(in->strides());
 }
-/// out = in1 - in2
-template <>
-void Sub<float, lang::Cuda>(const size_t num, const Block* in1,
-                            const Block* in2, Block* out, Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->data());
-  const float* inPtr2 = static_cast<const float*>(in2->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  cuda::sub(num, inPtr1, inPtr2, outPtr, ctx->stream);
-}
 
-/// sum all elements of input into out
+// template <>
+// void Sum<float, lang::Cuda>(const size_t num, const Block* in, float* out,
+//                             Context* ctx) {
+//   LOG(FATAL) << "Cuda Sum is not implemented!";
+//   // const float* inPtr = static_cast<const float*>(in->data());
+//   // cuda::sum(num, inPtr, out, ctx->stream);
+// }
+
 template <>
-void Sum<float, lang::Cuda>(const size_t num, const Block* in, float* out,
+void Sum<float, lang::Cuda>(const Tensor* in, float* out,
                             Context* ctx) {
-  LOG(FATAL) << "Cuda Sum is not implemented!";
-  // const float* inPtr = static_cast<const float*>(in->data());
-  // cuda::sum(num, inPtr, out, ctx->stream);
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+
+   // reduce all axes to 1 for cudnnReduceTensor, e.g. a Tensor with shape (2,4) is reduced to shape (1)
+   Shape reduced_shape = {1};
+   Tensor t(reduced_shape, in->device(), in->data_type());
+   float* tPtr = static_cast<float*>(t.block()->mutable_data());
+   vector<int> reduce_all_axes = in->generate_shape_cuda();
+   for (size_t n=0; n<reduce_all_axes.size(); ++n) {
+    reduce_all_axes[n] = 1;
+   }
+   
+  //reduce_desc
+  cudnnReduceTensorDescriptor_t reduce_desc;
+  cudnnReduceTensorOp_t reduce_op = CUDNN_REDUCE_TENSOR_ADD;
+  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+  cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+  cudnnReduceTensorIndices_t cudnn_indices = CUDNN_REDUCE_TENSOR_NO_INDICES;
+  cudnnIndicesType_t cudnn_indices_type = CUDNN_32BIT_INDICES;
+  cudnnCreateReduceTensorDescriptor(&reduce_desc);
+  cudnnSetReduceTensorDescriptor(reduce_desc, reduce_op, cudnn_dtype,
+                                 cudnn_propagation, cudnn_indices, cudnn_indices_type);
+
+  // instantiate 2 new tensors to use their blocks as scratch memory instead of raw cudaMalloc
+  Shape reduction_size = {1000};
+  Tensor indices(reduction_size, in->device(), in->data_type());
+  Tensor workspace(reduction_size, in->device(), in->data_type());
+  size_t indices_bytes = indices.block()->size()*1000;
+  size_t workspace_bytes = workspace.block()->size()*1000;
+  size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
+  float* workspacePtr = static_cast<float*>(workspace.block()->mutable_data());
+  //void* indicesPtr{nullptr}; void* workspacePtr{nullptr};
+  //cudaMalloc(&indicesPtr, indices_bytes); cudaMalloc(&workspacePtr, workspace_bytes);
+
+  float alpha[1] = {1.0};
+  float beta[1] = {0.0};
+  cudnnTensorDescriptor_t in_desc, t_desc;
+  cudnnCreateTensorDescriptor(&in_desc);
+  cudnnCreateTensorDescriptor(&t_desc);
+  cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+  cudnnSetTensorNdDescriptor(t_desc, cudnn_dtype, t.generate_dim_cuda(), reduce_all_axes.data(), reduce_all_axes.data());
+  cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
+                    indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
+                    (void*)(&alpha), in_desc, inPtr, (void*)(&beta), t_desc, tPtr);
+
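+  // note: tPtr points at device memory; reading tPtr[0] directly on the host
+  // below assumes the block is host-accessible, otherwise the scalar should
+  // be copied back first (e.g. via cudaMemcpy)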
+  *out = tPtr[0];
+  cudnnDestroyTensorDescriptor(in_desc);
+  cudnnDestroyTensorDescriptor(t_desc);
 }
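
The fixed 1000-element indices/workspace tensors above are a stop-gap sizing;
cudnn can report the exact scratch sizes a reduction needs. A self-contained
sketch of the same all-axes sum with queried sizes and an explicit
device-to-host copy of the result, using plain CUDA buffers rather than singa
Tensors:

    #include <cudnn.h>
    #include <cuda_runtime.h>

    // Sketch: sum all elements of a contiguous device buffer `in` of shape
    // {n,c,h,w} into the host float *out, sizing the scratch buffers exactly.
    void sum_all(cudnnHandle_t handle, int n, int c, int h, int w,
                 const float* in, float* out) {
      cudnnTensorDescriptor_t in_desc, out_desc;
      cudnnCreateTensorDescriptor(&in_desc);
      cudnnCreateTensorDescriptor(&out_desc);
      cudnnSetTensor4dDescriptor(in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
                                 n, c, h, w);
      cudnnSetTensor4dDescriptor(out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
                                 1, 1, 1, 1);  // every axis reduced to 1

      cudnnReduceTensorDescriptor_t reduce_desc;
      cudnnCreateReduceTensorDescriptor(&reduce_desc);
      cudnnSetReduceTensorDescriptor(reduce_desc, CUDNN_REDUCE_TENSOR_ADD,
                                     CUDNN_DATA_FLOAT, CUDNN_PROPAGATE_NAN,
                                     CUDNN_REDUCE_TENSOR_NO_INDICES,
                                     CUDNN_32BIT_INDICES);

      size_t ws_bytes = 0;
      cudnnGetReductionWorkspaceSize(handle, reduce_desc, in_desc, out_desc,
                                     &ws_bytes);
      void* ws = nullptr;
      cudaMalloc(&ws, ws_bytes);
      float* dev_out = nullptr;
      cudaMalloc(&dev_out, sizeof(float));

      float alpha = 1.0f, beta = 0.0f;
      cudnnReduceTensor(handle, reduce_desc, nullptr, 0, ws, ws_bytes,
                        &alpha, in_desc, in, &beta, out_desc, dev_out);
      cudaMemcpy(out, dev_out, sizeof(float), cudaMemcpyDeviceToHost);

      cudaFree(ws);
      cudaFree(dev_out);
      cudnnDestroyReduceTensorDescriptor(reduce_desc);
      cudnnDestroyTensorDescriptor(in_desc);
      cudnnDestroyTensorDescriptor(out_desc);
    }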
 
+
 /// Element-wise operation, out[i]=tanh(in[i])
+// template <>
+// void Tanh<float, lang::Cuda>(const Tensor* in, Tensor* out,
+//                              Context* ctx) {
+//   const float* inPtr = static_cast<const float*>(in->block()->data());
+//   float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+//   cudnnActivationDescriptor_t act_desc;
+//   cudnnActivationMode_t mode = CUDNN_ACTIVATION_TANH;
+//   cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+//   double coef = 0.0; //only used for CLIPPED_RELU or ELU
+//   cudnnCreateActivationDescriptor(&act_desc);
+//   cudnnSetActivationDescriptor(act_desc, mode, cudnn_propagation, coef);
+  
+//   float alpha[1] = {1.0};
+//   float beta[1] = {0.0};
+//   cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+//   cudnnTensorDescriptor_t in_desc, out_desc;
+//   cudnnCreateTensorDescriptor(&in_desc);
+//   cudnnCreateTensorDescriptor(&out_desc);
+//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+//   cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr, 
+//                         (void*)(&beta), out_desc, outPtr);
+
+//   cudnnDestroyTensorDescriptor(in_desc);
+//   cudnnDestroyTensorDescriptor(out_desc);
+//   cudnnDestroyActivationDescriptor(act_desc);
+// }
+
 template <>
-void Tanh<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
-                             Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
+void Tanh<float, lang::Cuda>(const Tensor* in, Tensor* out,
+                                Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = in->Size();
   cuda::tanh(num, inPtr, outPtr, ctx->stream);
+  out->Set_Strides(in->strides());
 }
 
 // ================Random functions===========================================
@@ -282,10 +665,11 @@ void Tanh<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
 // Get the random generator from 'ctx'
 // If DType is not float, then convert the threshold to DType
 template <>
-void Bernoulli<float, lang::Cuda>(const size_t num, const float p, Block* out,
+void Bernoulli<float, lang::Cuda>(const float p, Tensor* out,
                                   Context* ctx) {
   auto rgen = ctx->curand_generator;
-  float* outPtr = static_cast<float*>(out->mutable_data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = out->Size();
   CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
   cuda::threshold(num, p, outPtr, outPtr, ctx->stream);
 }
@@ -293,10 +677,11 @@ void Bernoulli<float, lang::Cuda>(const size_t num, const float p, Block* out,
 // The random generator should be extracted from ctx.
 // If DType is not float, then convert the low and high to DType
 template <>
-void Uniform<float, lang::Cuda>(const size_t num, const float low,
-                                const float high, Block* out, Context* ctx) {
+void Uniform<float, lang::Cuda>(const float low,
+                                const float high, Tensor* out, Context* ctx) {
   auto rgen = ctx->curand_generator;
-  float* outPtr = static_cast<float*>(out->mutable_data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = out->Size();
   CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
   cuda::mult(num, outPtr, high - low, outPtr, ctx->stream);
   cuda::add(num, outPtr, low, outPtr, ctx->stream);
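
curandGenerateUniform fills the buffer with samples in (0, 1]; the two kernel
calls above then apply the affine map u -> low + u*(high - low) in place, which
yields U(low, high). The same transform on the host:

    // Host sketch of the rescaling done by cuda::mult + cuda::add above.
    void rescale_uniform(float* u, size_t n, float low, float high) {
      for (size_t i = 0; i < n; i++) u[i] = low + u[i] * (high - low);
    }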
@@ -305,88 +690,97 @@ void Uniform<float, lang::Cuda>(const size_t num, const float low,
 // The random generator should be extracted from ctx.
 // If DType is not float, then convert the mean and delta to DType
 template <>
-void Gaussian<float, lang::Cuda>(const size_t num, const float mean,
-                                 const float std, Block* out, Context* ctx) {
+void Gaussian<float, lang::Cuda>(const float mean,
+                                 const float std, Tensor* out, Context* ctx) {
   auto rgen = ctx->curand_generator;
-  float* outPtr = static_cast<float*>(out->mutable_data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = out->Size();
   CURAND_CHECK(curandGenerateNormal(rgen, outPtr, num, mean, std));
 }
 
 // =========================Blas operations==================================
 // ref to http://docs.nvidia.com/cuda/cublas
 template <>
-void Amax<float, lang::Cuda>(const size_t num, const Block* in, size_t* out,
+void Amax<float, lang::Cuda>(const Tensor* in, size_t* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
+  const float* inPtr = static_cast<const float*>(in->block()->data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   int idx = 1;
+  const size_t num = in->Size();
   CUBLAS_CHECK(cublasIsamax(handle, num, inPtr, 1, &idx));
   *out = idx - 1;  // cublas index starts from 1
 }
 
 /// return the index of the element with the min value.
 template <>
-void Amin<float, lang::Cuda>(const size_t num, const Block* in, size_t* out,
+void Amin<float, lang::Cuda>(const Tensor* in, size_t* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
+  const float* inPtr = static_cast<const float*>(in->block()->data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   int idx = 1;
+  const size_t num = in->Size();
   CUBLAS_CHECK(cublasIsamin(handle, num, inPtr, 1, &idx));
   *out = idx - 1;
 }
 
 /// out = sum |x| for all x in in
 template <>
-void Asum<float, lang::Cuda>(const size_t num, const Block* in, float* out,
+void Asum<float, lang::Cuda>(const Tensor* in, float* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
+  const float* inPtr = static_cast<const float*>(in->block()->data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  const size_t num = in->Size();
   CUBLAS_CHECK(cublasSasum(handle, num, inPtr, 1, out));
 }
 
 /// out = alpha * in + out
 template <>
-void Axpy<float, lang::Cuda>(const size_t num, const float alpha,
-                             const Block* in, Block* out, Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
+void Axpy<float, lang::Cuda>(const float alpha,
+                             const Tensor* in, Tensor* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  const size_t num = in->Size();
   CUBLAS_CHECK(cublasSaxpy(handle, num, &alpha, inPtr, 1, outPtr, 1));
 }
 
 /// out = \sum_i in1[i] * in2[i]
 template <>
-void Dot<float, lang::Cuda>(const size_t num, const Block* in1,
-                            const Block* in2, float* out, Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->data());
-  const float* inPtr2 = static_cast<const float*>(in2->data());
+void Dot<float, lang::Cuda>(const Tensor* in1,
+                            const Tensor* in2, float* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
+  const size_t num = in1->Size();
   CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, out));
 }
 template <>
-void Nrm2<float, lang::Cuda>(const size_t num, const Block* in, float* out,
+void Nrm2<float, lang::Cuda>(const Tensor* in, float* out,
                              Context* ctx) {
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  const float* inPtr = static_cast<const float*>(in->data());
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const size_t num = in->Size();
   cublasSnrm2(handle, num, inPtr, 1, out);
 }
 template <>
-void Scale<float, lang::Cuda>(const size_t num, const float x, Block* out,
+void Scale<float, lang::Cuda>(const float x, Tensor* out,
                               Context* ctx) {
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  float* outPtr = static_cast<float*>(out->mutable_data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t num = out->Size();
   CUBLAS_CHECK(cublasSscal(handle, num, &x, outPtr, 1));
 }
 // NOTE: cublas uses column major order.
 // http://peterwittek.com/cublas-matrix-c-style.html
 template <>
-void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
-                             const size_t ncol, const Block* M, const Block* v,
-                             Block* out, Context* ctx) {
+void DGMM<float, lang::Cuda>(const bool side_right, const Tensor* M, const Tensor* v,
+                             Tensor* out, Context* ctx) {
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  const float* MPtr = static_cast<const float*>(M->data());
-  const float* vPtr = static_cast<const float*>(v->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
+  const float* MPtr = static_cast<const float*>(M->block()->data());
+  const float* vPtr = static_cast<const float*>(v->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t nrow = M->shape(0);
+  const size_t ncol = M->shape(1);
   if (side_right) {
     CUBLAS_CHECK(cublasSdgmm(handle, CUBLAS_SIDE_LEFT, ncol, nrow, MPtr, ncol,
                              vPtr, 1, outPtr, ncol));
@@ -396,14 +790,16 @@ void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
   }
 }
 template <>
-void GEMV<float, lang::Cuda>(bool trans, const size_t m, const size_t n,
-                             const float alpha, const Block* A, const Block* v,
-                             const float beta, Block* out, Context* ctx) {
-  const float* APtr = static_cast<const float*>(A->data());
-  const float* vPtr = static_cast<const float*>(v->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
+void GEMV<float, lang::Cuda>(const float alpha, const Tensor* A, const Tensor* v,
+                             const float beta, Tensor* out, Context* ctx) {
+  const float* APtr = static_cast<const float*>(A->block()->data());
+  const float* vPtr = static_cast<const float*>(v->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t m = A->shape()[0];
+  const size_t n = A->shape()[1];
+
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  if (!trans)
+  if (!(A->transpose()))
     CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr,
                              1, &beta, outPtr, 1));
   else
@@ -413,19 +809,22 @@ void GEMV<float, lang::Cuda>(bool trans, const size_t m, const size_t n,
 
 // http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm
 template <>
-void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
-                             const size_t nrowA, const size_t ncolB,
-                             const size_t ncolA, const float alpha,
-                             const Block* A, const Block* B, const float beta,
-                             Block* C, Context* ctx) {
+void GEMM<float, lang::Cuda>(const float alpha,
+                             const Tensor* A, const Tensor* B, const float beta,
+                             Tensor* C, Context* ctx) {
+  auto transA = A->transpose();
   auto transa = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
+  auto transB = B->transpose();
   auto transb = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
+  const size_t nrowA = A->shape()[0];
+  const size_t ncolA = A->shape()[1];
+  const size_t ncolB = B->shape()[1];
   int lda = transA ? nrowA : ncolA;
   int ldb = transB ? ncolA : ncolB;
   int ldc = ncolB;
-  const float* APtr = static_cast<const float*>(A->data());
-  const float* BPtr = static_cast<const float*>(B->data());
-  float* CPtr = static_cast<float*>(C->mutable_data());
+  const float* APtr = static_cast<const float*>(A->block()->data());
+  const float* BPtr = static_cast<const float*>(B->block()->data());
+  float* CPtr = static_cast<float*>(C->block()->mutable_data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
                            BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
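
The operand swap in the call above is the usual trick for driving column-major
cublas with row-major data: a row-major C = A*B (A is m x k, B is k x n) is
byte-for-byte a column-major C^T = B^T * A^T, so B is passed first and the
matrix widths serve as leading dimensions. A minimal sketch for the
no-transpose case:

    #include <cublas_v2.h>

    // Sketch: row-major C(m x n) = A(m x k) * B(k x n) via column-major cublas,
    // by computing C^T = B^T * A^T (no explicit transposes needed).
    void gemm_row_major(cublasHandle_t handle, int m, int n, int k,
                        const float* A, const float* B, float* C) {
      const float alpha = 1.0f, beta = 0.0f;
      cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                  n, m, k,        // dimensions of the transposed problem
                  &alpha,
                  B, n,           // B^T is n x k, leading dimension n
                  A, k,           // A^T is k x m, leading dimension k
                  &beta,
                  C, n);          // C^T is n x m, leading dimension n
    }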
@@ -457,14 +856,93 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
                                ctx->stream);
 }
 
+// template <>
+// void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
+//                                Context* ctx) {
+//   const float* inPtr = static_cast<const float*>(in->block()->data());
+//   float* outPtr = static_cast<float*>(out->block()->mutable_data());
+//   // const size_t nrow = in->shape()[0];
+//   // const size_t ncol = in->shape()[1];
+//   // cuda::RowMax(nrow, ncol, inPtr, outPtr, ctx->stream);
+
+//   //vector<int> reduce_row_axes_shape = in->generate_shape_cuda();
+//   //reduce_row_axes_shape.back() = 1; //reduce axis 1, so we set last element d in shape {a,b,c,d} to 1
+
+//   vector<int> reduce_row_axes_shape = {1,1,1,1};
+//   vector<int> reduced_strides = {1,1,1,1};
+
+//   //reduce_desc
+//   cudnnReduceTensorDescriptor_t reduce_desc;
+//   cudnnReduceTensorOp_t reduce_op = CUDNN_REDUCE_TENSOR_ADD;
+//   cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+//   cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+//   cudnnReduceTensorIndices_t cudnn_indices = CUDNN_REDUCE_TENSOR_NO_INDICES;
+//   //cudnnReduceTensorIndices_t cudnn_indices = CUDNN_REDUCE_TENSOR_FLATTENED_INDICES;
+//   cudnnIndicesType_t cudnn_indices_type = CUDNN_32BIT_INDICES;
+//   cudnnCreateReduceTensorDescriptor(&reduce_desc);
+//   cudnnSetReduceTensorDescriptor(reduce_desc, reduce_op, cudnn_dtype,
+//                                  cudnn_propagation, cudnn_indices, cudnn_indices_type);
+
+//   //instantiate new tensor to use new blocks as memory instead of cudaMalloc
+//   //create 2 tensors of same size as input tensor
+//   Shape reduction_size = {1000};
+//   Tensor indices(reduction_size, in->device(), in->data_type());
+//   Tensor workspace(reduction_size, in->device(), in->data_type());
+//   size_t indices_bytes = indices.block()->size()*1000;
+//   size_t workspace_bytes = workspace.block()->size()*1000;
+//   size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
+//   float* workspacePtr = static_cast<float*>(workspace.block()->mutable_data());
+//   //void* indicesPtr{nullptr}; void* workspacePtr{nullptr};
+//   //cudaMalloc(&indicesPtr, indices_bytes); cudaMalloc(&workspacePtr, workspace_bytes);
+
+//   float alpha[1] = {1.0};
+//   float beta[1] = {0.0};
+//   cudnnTensorDescriptor_t in_desc, out_desc;
+//   cudnnCreateTensorDescriptor(&in_desc);
+//   cudnnCreateTensorDescriptor(&out_desc);
+//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+//   //cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), reduce_row_axes_shape.data(), reduced_strides.data());
+//   cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
+//                     indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
+//                     (void*)(&alpha), in_desc, inPtr, (void*)(&beta),  out_desc, outPtr);
+
+//   cudnnDestroyTensorDescriptor(in_desc);
+//   cudnnDestroyTensorDescriptor(out_desc);
+// }
+
 template <>
-void RowMax<float, lang::Cuda>(const size_t nrow, const size_t ncol,
-                               const Block* in, Block* out,
+void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
                                Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->data());
-  float* outPtr = static_cast<float*>(out->mutable_data());
-  cuda::RowMax(nrow, ncol, inPtr, outPtr, ctx->stream);
+  const float* inPtr = static_cast<const float*>(in->block()->data());
+  float* outPtr = static_cast<float*>(out->block()->mutable_data());
+  const size_t nrow = in->shape()[0];
+  const size_t ncol = in->shape()[1];
+
+  if(in->transpose()){
+    Tensor t(in->shape(), in->device(), in->data_type());
+    float* tPtr = static_cast<float*>(t.block()->mutable_data());
+    float alpha[1] = {1.0};
+    float beta[1] = {0.0};
+
+    cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+    cudnnTensorDescriptor_t in_desc, t_desc;
+    cudnnCreateTensorDescriptor(&in_desc);
+    cudnnCreateTensorDescriptor(&t_desc);
+    cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+    cudnnSetTensorNdDescriptor(t_desc, cudnn_dtype, t.generate_dim_cuda(), t.generate_shape_cuda().data(), t.generate_strides_cuda().data());
+    cudnnTransformTensor(ctx->cudnn_handle, (void*)(alpha), in_desc, inPtr,
+                         (void*)(beta), t_desc, tPtr);
+
+    const float* tPtr_const = static_cast<const float*>(t.block()->data());
+    cuda::RowMax(nrow, ncol, tPtr_const, outPtr, ctx->stream);
+    cudnnDestroyTensorDescriptor(in_desc);
+    cudnnDestroyTensorDescriptor(t_desc);
+  } else {
+    cuda::RowMax(nrow, ncol, inPtr, outPtr, ctx->stream);
+  }
 }
+
 }  // namespace singa
 
 #endif  // USE_CUDA


[06/10] incubator-singa git commit: Streamlining of tensor.h file by moving respective member functions to cpp or cuda file. Removal of shape_multipliers_ attribute in tensor.h. Changed read-in tensors to be passed as reference instead of pointer

Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c52e2aa3/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 3e36877..6e86ca7 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -32,13 +32,88 @@
 
 namespace singa {
 
-cudnnTensorDescriptor_t generate_tensorND_desc(const Tensor* x){
+// ===================== Helper Functions =============================
+
+  /*
+  cudnn requires tensor dimensions to be padded to a minimum of 4:
+    1.) Tensors of 4 or fewer dimensions are described with leading 1s up to 4d;
+        a 5d input tensor is taken as-is. Beyond 5d, certain operations are not
+        supported (cudnnOpTensor supports up to 5d, cudnnReduceTensor up to 8d).
+
+    e.g. Tensor A with shape {3,3} must be described as {1,1,3,3};
+         Tensor B with shape {2,3,4} must be described as {1,2,3,4}.
+  */
+  vector<int> generate_shape_cuda(const Tensor& x) {
+    Shape shape_ = x.shape();
+    vector<int> shape_arr;
+    if(shape_.size() <= 4){
+      for (size_t n=0; n<4-shape_.size(); ++n) {
+        shape_arr.push_back(1);
+      } 
+      for (size_t n=0; n<shape_.size(); ++n) {
+        shape_arr.push_back(shape_.at(n));
+      } 
+      return shape_arr;
+    } else if(shape_.size() == 5){
+      for (size_t n=0; n<shape_.size(); ++n) {
+        shape_arr.push_back(shape_.at(n));
+      } 
+      return shape_arr;
+    } else {
+      LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported";
+    }
+  }
+
+  int generate_dim_cuda(const Tensor& x) {
+    if(x.shape().size() <= 4){return 4;}
+    else if(x.shape().size() == 5){return 5;}
+    else{
+      LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported";
+    } 
+  }
+
+/*
+  cudnn requires the strides to conform to the padded shape described above:
+    1.) Strides are likewise padded to a minimum of 4 entries for 4d and lower
+        dimensional tensors; a 5d input tensor is taken as-is. Beyond 5d, certain
+        operations are not supported (cudnnOpTensor supports up to 5d,
+        cudnnReduceTensor up to 8d).
+
+    e.g. Tensor A with shape {3,3} and strides {3,1} must be described with
+    shape {1,1,3,3} and strides {9,9,3,1} (or {9,9,1,3}).
+  */
+  vector<int> generate_strides_cuda(const Tensor& x) {
+    Shape shape_ = x.shape();
+    vector<int> strides_ = x.strides();
+    vector<int> strides_arr;
+    int product = 1;
+    for (size_t n=0; n<(shape_.size()); ++n) {
+      product *= shape_[n];
+    }
+    if(shape_.size() <= 4){
+      for (size_t n=0; n<4-shape_.size(); ++n) {
+        strides_arr.push_back(product);
+      } 
+      for (size_t n=0; n<strides_.size(); ++n) {
+          strides_arr.push_back(strides_[n]);
+        }
+      return strides_arr;
+    } else if(shape_.size() == 5){
+      for (size_t n=0; n<strides_.size(); ++n) {
+          strides_arr.push_back(strides_[n]);
+        }
+      return strides_arr;
+    } else {
+      LOG(FATAL) << "Dimensions (strides) beyond 5 are currently not supported";
+    }
+  }
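
These helpers are pure functions of the shape and strides, so the padding rules
are easy to pin down on the host. A small sketch, assuming a hypothetical
contiguous 2d Tensor x with shape {3,3} and strides {3,1}:

    #include <cassert>

    // Expected padding for a contiguous {3,3} tensor x:
    //   generate_shape_cuda(x)   -> {1, 1, 3, 3}
    //   generate_dim_cuda(x)     -> 4
    //   generate_strides_cuda(x) -> {9, 9, 3, 1}
    vector<int> shape = generate_shape_cuda(x);
    vector<int> strides = generate_strides_cuda(x);
    assert(generate_dim_cuda(x) == 4);
    assert(shape[0] == 1 && shape[1] == 1 && shape[2] == 3 && shape[3] == 3);
    assert(strides[0] == 9 && strides[1] == 9 && strides[2] == 3 && strides[3] == 1);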
+
+cudnnTensorDescriptor_t generate_tensorND_desc(const Tensor& x){
   cudnnTensorDescriptor_t x_desc;
   cudnnCreateTensorDescriptor(&x_desc);
   cudnnSetTensorNdDescriptor(x_desc, CUDNN_DATA_FLOAT,
-                             x->generate_dim_cuda(),
-                             x->generate_shape_cuda().data(),
-                             x->generate_strides_cuda().data()
+                             generate_dim_cuda(x),
+                             generate_shape_cuda(x).data(),
+                             generate_strides_cuda(x).data()
                              );
 
   return x_desc;
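
generate_tensorND_desc creates a fresh descriptor on every call and hands
ownership to the caller, so each use must be paired with
cudnnDestroyTensorDescriptor; a descriptor created inline as a call argument
(as with generate_tensorND_desc(*out) in Abs below) leaves no handle to destroy
and will leak. Typical usage:

    cudnnTensorDescriptor_t x_desc = generate_tensorND_desc(x);
    // ... use x_desc in a cudnnOpTensor / cudnnTransformTensor call ...
    cudnnDestroyTensorDescriptor(x_desc);  // caller owns the descriptor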
@@ -55,12 +130,13 @@ cudnnOpTensorDescriptor_t generate_Op_desc(cudnnOpTensorOp_t op){
   return op_desc;
 }
 
+// ===================== CUDA Functions =============================
 
 /// out[i] = |in[i]|
 template <>
-void Abs<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Abs<float, lang::Cuda>(const Tensor& in, Tensor* out,
                             Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
   float alpha1 = 1.0;
@@ -70,7 +146,7 @@ void Abs<float, lang::Cuda>(const Tensor* in, Tensor* out,
   cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_MAX),
                 (void*)(&alpha1), in_desc, inPtr, 
                 (void*)(&alpha2), in_desc, inPtr,
-                (void*)(&beta), generate_tensorND_desc(out), outPtr
+                (void*)(&beta), generate_tensorND_desc(*out), outPtr
                 );
   cudnnDestroyTensorDescriptor(in_desc);
 }
@@ -80,74 +156,74 @@ void Set<float, lang::Cuda>(const float x, Tensor* out,
                             Context* ctx) {
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
-  cudnnSetTensor(ctx->cudnn_handle, generate_tensorND_desc(out), 
+  cudnnSetTensor(ctx->cudnn_handle, generate_tensorND_desc(*out), 
                   outPtr, (void*)(&x));
 }
 
 template <>
-void Add<float, lang::Cuda>(const Tensor* in, const float x,
+void Add<float, lang::Cuda>(const Tensor& in, const float x,
                             Tensor* out, Context* ctx) {
   Set<float, lang::Cuda>(x, out, ctx);
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
   float alpha = 1.0, beta = 1.0;
   cudnnAddTensor(ctx->cudnn_handle,
                  (void*)(&alpha), generate_tensorND_desc(in), inPtr,
-                 (void*)(&beta), generate_tensorND_desc(out), outPtr
+                 (void*)(&beta), generate_tensorND_desc(*out), outPtr
                  );
 }
 
 /// out = in1 + in2
 template <>
-void Add<float, lang::Cuda>(const Tensor* in1,
-                            const Tensor* in2, Tensor* out, Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Add<float, lang::Cuda>(const Tensor& in1,
+                            const Tensor& in2, Tensor* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
   float alpha1 = 1.0;
   float alpha2 = 1.0;
   float beta = 0.0;
 
-  if((in1->nDim() == in2->nDim()) || (in2->nDim() == 1)){
+  if((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)){
     cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
               (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
               (void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
-              (void*)(&beta), generate_tensorND_desc(out), outPtr
+              (void*)(&beta), generate_tensorND_desc(*out), outPtr
               );
   } else {
     cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
           (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
           (void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
-          (void*)(&beta), generate_tensorND_desc(out), outPtr
+          (void*)(&beta), generate_tensorND_desc(*out), outPtr
           );
   }
 }
 
 /// out = in1 - in2
 template <>
-void Sub<float, lang::Cuda>(const Tensor* in1,
-                            const Tensor* in2, Tensor* out, Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Sub<float, lang::Cuda>(const Tensor& in1,
+                            const Tensor& in2, Tensor* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
   float alpha1 = 1.0;
   float alpha2 = -1.0;
   float beta = 0.0;
 
-  if((in1->nDim() == in2->nDim()) || (in2->nDim() == 1)){
+  if((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)){
     cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
               (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
               (void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
-              (void*)(&beta), generate_tensorND_desc(out), outPtr
+              (void*)(&beta), generate_tensorND_desc(*out), outPtr
               );
   } else {
     cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
           (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
           (void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
-          (void*)(&beta), generate_tensorND_desc(out), outPtr
+          (void*)(&beta), generate_tensorND_desc(*out), outPtr
           );
   }
 }
@@ -156,35 +232,35 @@ void Sub<float, lang::Cuda>(const Tensor* in1,
 /// if x>high, then x=high; if x<low, then x=low.
 template <>
 void Clamp<float, lang::Cuda>(const float low,
-                              const float high, const Tensor* in, Tensor* out,
+                              const float high, const Tensor& in, Tensor* out,
                               Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::clamp(num, low, high, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
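
The clamp semantics spelled out in the comment above reduce, per element, to the following host-side sketch (illustrative only):

    // return x clamped into the closed interval [low, high]
    float clamp_scalar(float low, float high, float x) {
      return x < low ? low : (x > high ? high : x);
    }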
 /// out = in1 / in2
 template <>
-void Div<float, lang::Cuda>(const Tensor* in1,
-                            const Tensor* in2, Tensor* out, Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Div<float, lang::Cuda>(const Tensor& in1,
+                            const Tensor& in2, Tensor* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in1->Size();
+  const size_t num = in1.Size();
 
   //if both in1 and in2 strides are the same, we proceed to normal cuda::div
-  if(in1->strides() == in2->strides()){
+  if(in1.strides() == in2.strides()){
         cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
-        out->Set_Strides(in1->strides());
+        out->set_strides(in1.strides());
  } else { // otherwise, first transform in1 into in2's layout, storing it in out
     float alpha = 1.0;
     float beta = 0.0;
 
-    out->Set_Strides(in2->strides());
+    out->set_strides(in2.strides());
     cudnnTransformTensor(ctx->cudnn_handle,
                         (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
-                        (void*)(&beta), generate_tensorND_desc(out), outPtr
+                        (void*)(&beta), generate_tensorND_desc(*out), outPtr
                         );
 
     cuda::div(num, outPtr, inPtr2, outPtr, ctx->stream);
@@ -192,51 +268,51 @@ void Div<float, lang::Cuda>(const Tensor* in1,
 }
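
The branch above is the stride-normalisation pattern this file applies to all binary flat kernels: cuda::div indexes both operands linearly, which is only valid when the two tensors share the same strides; otherwise in1 is first rewritten into in2's layout (cudnnTransformTensor, with out as the destination) and the flat kernel then runs over matching layouts. A sketch of the control flow, where same_layout, to_layout_of and flat_div are hypothetical helper names:

    if (same_layout(in1, in2)) {
      flat_div(in1, in2, out);        // linear indexing is valid for both
    } else {
      to_layout_of(in1, in2, out);    // out now holds in1 in in2's layout
      flat_div(out, in2, out);        // layouts match; flat kernel is safe
    }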
 
 template <>
-void Div<float, lang::Cuda>(const float x, const Tensor* in,
+void Div<float, lang::Cuda>(const float x, const Tensor& in,
                             Tensor* out, Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::div(num, x, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 
 /// out = in * x
 template <>
-void EltwiseMult<float, lang::Cuda>(const Tensor* in,
+void EltwiseMult<float, lang::Cuda>(const Tensor& in,
                                     const float x, Tensor* out, Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
   float alpha = x, beta = 0.0;
   cudnnAddTensor(ctx->cudnn_handle,
                 (void*)(&alpha), generate_tensorND_desc(in), inPtr,
-                (void*)(&beta), generate_tensorND_desc(out), outPtr
+                (void*)(&beta), generate_tensorND_desc(*out), outPtr
                 );
 }
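
Scaling by x is expressed here through cudnnAddTensor, which computes C = alpha*A + beta*C; with alpha = x and beta = 0 this collapses to a pure scale (host-side sketch, illustrative only):

    // out[i] = x * in[i] + 0.0f * out[i]  ==  in[i] * x
    for (size_t i = 0; i < in.Size(); ++i)
      outPtr[i] = x * inPtr[i];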
 
 /// out = in1 * in2
 template <>
-void EltwiseMult<float, lang::Cuda>(const Tensor* in1,
-                                    const Tensor* in2, Tensor* out,
+void EltwiseMult<float, lang::Cuda>(const Tensor& in1,
+                                    const Tensor& in2, Tensor* out,
                                     Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+  const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in1->Size();
+  const size_t num = in1.Size();
 
   //if both in1 and in2 strides are the same, we proceed to normal cuda::mult
-  if(in1->strides() == in2->strides()){ 
+  if(in1.strides() == in2.strides()){ 
         cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
-        out->Set_Strides(in1->strides());
+        out->set_strides(in1.strides());
  } else { // otherwise, first transform in1 into in2's layout, storing it in out
     float alpha = 1.0;
     float beta = 0.0;
 
-    out->Set_Strides(in2->strides());
+    out->set_strides(in2.strides());
     cudnnTransformTensor(ctx->cudnn_handle,
                         (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
-                        (void*)(&beta), generate_tensorND_desc(out), outPtr
+                        (void*)(&beta), generate_tensorND_desc(*out), outPtr
                         );
 
     cuda::mult(num, outPtr, inPtr2, outPtr, ctx->stream);
@@ -246,138 +322,138 @@ void EltwiseMult<float, lang::Cuda>(const Tensor* in1,
 
 /// Base is e. out[i]=e^in[i]
 template <>
-void Exp<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Exp<float, lang::Cuda>(const Tensor& in, Tensor* out,
                             Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::exp(num, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 
 template <>
-void GE<float, lang::Cuda>(const Tensor* in, const float x,
+void GE<float, lang::Cuda>(const Tensor& in, const float x,
                            Tensor* out, Context* ctx) {
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->block()->data());
-  const size_t num = in->Size();
+  const float* inPtr = static_cast<const float*>(in.block()->data());
+  const size_t num = in.Size();
   cuda::ge(num, inPtr, x, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 template <>
-void GE<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+void GE<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context* ctx) {
   Sub<float, lang::Cuda>(in1, in2, out, ctx);
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
-  const size_t num = in1->Size();
+  // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+  const size_t num = in1.Size();
   //cuda::ge(num, inPtr1, inPtr2, outPtr, ctx->stream);
   cuda::ge(num, outPtr, 0.0, outPtr, ctx->stream);
 }
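
All four tensor-tensor comparisons here (GE, GT, LE, LT) use the same subtraction trick: compute in1 - in2 into out via Sub, then compare against zero in place, since in1[i] >= in2[i] exactly when in1[i] - in2[i] >= 0. Per element this amounts to (sketch):

    // out[i] = (in1[i] - in2[i] >= 0.0f) ? 1.0f : 0.0f;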
 
 
 template <>
-void GT<float, lang::Cuda>(const Tensor* in, const float x,
+void GT<float, lang::Cuda>(const Tensor& in, const float x,
                            Tensor* out, Context* ctx) {
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->block()->data());
-  const size_t num = in->Size();
+  const float* inPtr = static_cast<const float*>(in.block()->data());
+  const size_t num = in.Size();
   cuda::gt(num, inPtr, x, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 template <>
-void GT<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+void GT<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context* ctx) {
   Sub<float, lang::Cuda>(in1, in2, out, ctx);
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
-  const size_t num = in1->Size();
+  // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+  const size_t num = in1.Size();
   //cuda::gt(num, inPtr1, inPtr2, outPtr, ctx->stream);
   cuda::gt(num, outPtr, 0.0, outPtr, ctx->stream);
 }
 template <>
-void LE<float, lang::Cuda>(const Tensor* in, const float x,
+void LE<float, lang::Cuda>(const Tensor& in, const float x,
                            Tensor* out, Context* ctx) {
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->block()->data());
-  const size_t num = in->Size();
+  const float* inPtr = static_cast<const float*>(in.block()->data());
+  const size_t num = in.Size();
   cuda::le(num, inPtr, x, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 template <>
-void LE<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+void LE<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context* ctx) {
   Sub<float, lang::Cuda>(in1, in2, out, ctx);
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
-  const size_t num = in1->Size();
+  // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+  const size_t num = in1.Size();
   //cuda::le(num, inPtr1, inPtr2, outPtr, ctx->stream);
   cuda::le(num, outPtr, 0.0, outPtr, ctx->stream);
 }
 
 /// Natural logarithm, base e (Napier's constant): out[i] = ln(in[i]).
 template <>
-void Log<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Log<float, lang::Cuda>(const Tensor& in, Tensor* out,
                             Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::log(num, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 template <>
-void LT<float, lang::Cuda>(const Tensor* in, const float x,
+void LT<float, lang::Cuda>(const Tensor& in, const float x,
                            Tensor* out, Context* ctx) {
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const float* inPtr = static_cast<const float*>(in->block()->data());
-  const size_t num = in->Size();
+  const float* inPtr = static_cast<const float*>(in.block()->data());
+  const size_t num = in.Size();
   cuda::lt(num, inPtr, x, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 template <>
-void LT<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+void LT<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context* ctx) {
   Sub<float, lang::Cuda>(in1, in2, out, ctx);
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
-  const size_t num = in1->Size();
+  // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+  const size_t num = in1.Size();
   //cuda::lt(num, inPtr1, inPtr2, outPtr, ctx->stream);
   cuda::lt(num, outPtr, 0.0, outPtr, ctx->stream);
 }
 /// Element-wise operation, out[i] = in[i]^x
 template <>
-void Pow<float, lang::Cuda>(const Tensor* in, const float x,
+void Pow<float, lang::Cuda>(const Tensor& in, const float x,
                             Tensor* out, Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::pow(num, inPtr, x, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 /// Element-wise operation, out[i] = in1[i]^in2[i]
 template <>
-void Pow<float, lang::Cuda>(const Tensor* in1,
-                            const Tensor* in2, Tensor* out, Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Pow<float, lang::Cuda>(const Tensor& in1,
+                            const Tensor& in2, Tensor* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in1->Size();
+  const size_t num = in1.Size();
 
-  if(in1->strides() == in2->strides()){
+  if(in1.strides() == in2.strides()){
         cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
-        out->Set_Strides(in1->strides());
+        out->set_strides(in1.strides());
  } else { // otherwise, first transform in1 into in2's layout, storing it in out
     float alpha = 1.0;
     float beta = 0.0;
 
-    out->Set_Strides(in2->strides());
+    out->set_strides(in2.strides());
     cudnnTransformTensor(ctx->cudnn_handle,
                         (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
-                        (void*)(&beta), generate_tensorND_desc(out), outPtr
+                        (void*)(&beta), generate_tensorND_desc(*out), outPtr
                         );
 
     cuda::pow(num, outPtr, inPtr2, outPtr, ctx->stream);
@@ -386,9 +462,9 @@ void Pow<float, lang::Cuda>(const Tensor* in1,
 
 /// Element-wise operation, out[i]=max(0, in[i])
 // template <>
-// void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// void ReLU<float, lang::Cuda>(const Tensor& in, Tensor* out,
 //                              Context* ctx) {
-//   const float* inPtr = static_cast<const float*>(in->block()->data());
+//   const float* inPtr = static_cast<const float*>(in.block()->data());
 //   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
 //   cudnnActivationDescriptor_t act_desc;
@@ -404,8 +480,10 @@ void Pow<float, lang::Cuda>(const Tensor* in1,
 //   cudnnTensorDescriptor_t in_desc, out_desc;
 //   cudnnCreateTensorDescriptor(&in_desc);
 //   cudnnCreateTensorDescriptor(&out_desc);
-//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(), 
+// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), 
+// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
 //   cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr, 
 //                         (void*)(&beta), out_desc, outPtr);
 
@@ -415,20 +493,20 @@ void Pow<float, lang::Cuda>(const Tensor* in1,
 // }
 
 template <>
-void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void ReLU<float, lang::Cuda>(const Tensor& in, Tensor* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::relu(num, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 
 // /// Element-wise operation, out[i] = sigmoid(in[i])
 // template <>
-// void Sigmoid<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// void Sigmoid<float, lang::Cuda>(const Tensor& in, Tensor* out,
 //                                 Context* ctx) {
-//   const float* inPtr = static_cast<const float*>(in->block()->data());
+//   const float* inPtr = static_cast<const float*>(in.block()->data());
 //   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
 //   cudnnActivationDescriptor_t act_desc;
@@ -444,8 +522,10 @@ void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
 //   cudnnTensorDescriptor_t in_desc, out_desc;
 //   cudnnCreateTensorDescriptor(&in_desc);
 //   cudnnCreateTensorDescriptor(&out_desc);
-//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(), 
+// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), 
+// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
 //   cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr, 
 //                         (void*)(&beta), out_desc, outPtr);
 
@@ -456,31 +536,31 @@ void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
 
 /// Element-wise operation, out[i] = sigmoid(in[i])
 template <>
-void Sigmoid<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Sigmoid<float, lang::Cuda>(const Tensor& in, Tensor* out,
                                 Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::sigmoid(num, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 
 // out[i] = sign(in[i])
 template <>
-void Sign<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Sign<float, lang::Cuda>(const Tensor& in, Tensor* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::sign(num, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 
 // Element-wise operation, out[i] = sqrt(in[i])
 template <>
-void Sqrt<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Sqrt<float, lang::Cuda>(const Tensor& in, Tensor* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   
   float alpha1 = 1.0;
@@ -490,39 +570,39 @@ void Sqrt<float, lang::Cuda>(const Tensor* in, Tensor* out,
   cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_SQRT),
                 (void*)(&alpha1), in_desc, inPtr, 
                 (void*)(&alpha2), in_desc, inPtr,
-                (void*)(&beta), generate_tensorND_desc(out), outPtr
+                (void*)(&beta), generate_tensorND_desc(*out), outPtr
                 );
 }
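
To my reading of the cuDNN op semantics, CUDNN_OP_TENSOR_SQRT applies the square root to the first operand only; the second operand is required by the cudnnOpTensor signature but ignored by this op, which is why inPtr is simply passed twice. Effective per-element result (sketch):

    // out[i] = sqrtf(1.0f * in[i])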
 
 /// Element-wise operation, out[i]=in[i]^2
 template <>
-void Square<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Square<float, lang::Cuda>(const Tensor& in, Tensor* out,
                                Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::square(num, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 
 // template <>
 // void Sum<float, lang::Cuda>(const size_t num, const Block* in, float* out,
 //                             Context* ctx) {
 //   LOG(FATAL) << "Cuda Sum is not implemented!";
-//   // const float* inPtr = static_cast<const float*>(in->data());
+//   // const float* inPtr = static_cast<const float*>(in.data());
 //   // cuda::sum(num, inPtr, out, ctx->stream);
 // }
 
 template <>
-void Sum<float, lang::Cuda>(const Tensor* in, float* out,
+void Sum<float, lang::Cuda>(const Tensor& in, float* out,
                             Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
 
    //reduce all axes to 1 for cudnnReduce, e.g. Tensor A with shape (2,4) will be reduced to (1)
    Shape reduced_shape = {1};
-   Tensor t(reduced_shape, in->device(), in->data_type());
+   Tensor t(reduced_shape, in.device(), in.data_type());
    float* tPtr = static_cast<float*>(t.block()->mutable_data());
-   vector<int> reduce_all_axes = in->generate_shape_cuda();
+   vector<int> reduce_all_axes = generate_shape_cuda(in);
    for (size_t n=0; n<reduce_all_axes.size(); ++n) {
     reduce_all_axes[n] = 1;
    }
@@ -539,10 +619,10 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
                                  cudnn_propagation, cudnn_indices, cudnn_indices_type);
 
   //instantiate 2 new tensors to use new blocks as memory instead of cudaMalloc
-  size_t reduction_size_int = Product(in->shape());
+  size_t reduction_size_int = Product(in.shape());
   Shape reduction_size = {reduction_size_int*100};
-  Tensor indices(reduction_size, in->device(), in->data_type());
-  Tensor workspace(reduction_size, in->device(), in->data_type());
+  Tensor indices(reduction_size, in.device(), in.data_type());
+  Tensor workspace(reduction_size, in.device(), in.data_type());
   size_t indices_bytes = indices.block()->size()*100;
   size_t workspace_bytes = workspace.block()->size()*100;
   size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
@@ -555,7 +635,7 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
   cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
                     indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
                     (void*)(&alpha), generate_tensorND_desc(in), inPtr,
-                    (void*)(&beta), generate_tensorND_desc(&t), tPtr
+                    (void*)(&beta), generate_tensorND_desc(t), tPtr
                     );
 
   *out = tPtr[0];
@@ -564,9 +644,9 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
 
 /// Element-wise operation, out[i] = tanh(in[i])
 // template <>
-// void Tanh<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// void Tanh<float, lang::Cuda>(const Tensor& in, Tensor* out,
 //                              Context* ctx) {
-//   const float* inPtr = static_cast<const float*>(in->block()->data());
+//   const float* inPtr = static_cast<const float*>(in.block()->data());
 //   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
 //   cudnnActivationDescriptor_t act_desc;
@@ -582,8 +662,10 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
 //   cudnnTensorDescriptor_t in_desc, out_desc;
 //   cudnnCreateTensorDescriptor(&in_desc);
 //   cudnnCreateTensorDescriptor(&out_desc);
-//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(), 
+// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), 
+// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
 //   cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr, 
 //                         (void*)(&beta), out_desc, outPtr);
 
@@ -593,13 +675,13 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
 // }
 
 template <>
-void Tanh<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Tanh<float, lang::Cuda>(const Tensor& in, Tensor* out,
                                 Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   cuda::tanh(num, inPtr, outPtr, ctx->stream);
-  out->Set_Strides(in->strides());
+  out->set_strides(in.strides());
 }
 
 // ================Random functions===========================================
@@ -643,65 +725,65 @@ void Gaussian<float, lang::Cuda>(const float mean,
 // =========================Blas operations==================================
 // ref to http://docs.nvidia.com/cuda/cublas
 template <>
-void Amax<float, lang::Cuda>(const Tensor* in, size_t* out,
+void Amax<float, lang::Cuda>(const Tensor& in, size_t* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   int idx = 1;
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   CUBLAS_CHECK(cublasIsamax(handle, num, inPtr, 1, &idx));
   *out = idx - 1;  // cublas index starts from 1
 }
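
cublasIsamax returns the 1-based index (BLAS/Fortran convention) of the first element with the largest absolute value, hence the idx - 1 adjustment. An equivalent host-side computation (illustrative sketch only):

    size_t argmax_abs(const float* v, size_t n) {
      size_t best = 0;
      for (size_t i = 1; i < n; ++i)
        if (fabsf(v[i]) > fabsf(v[best])) best = i;  // first maximum wins ties
      return best;
    }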
 
 /// return the index of the element with the min value.
 template <>
-void Amin<float, lang::Cuda>(const Tensor* in, size_t* out,
+void Amin<float, lang::Cuda>(const Tensor& in, size_t* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   int idx = 1;
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   CUBLAS_CHECK(cublasIsamin(handle, num, inPtr, 1, &idx));
   *out = idx - 1;
 }
 
 /// out = the sum of |x| over all elements x of in
 template <>
-void Asum<float, lang::Cuda>(const Tensor* in, float* out,
+void Asum<float, lang::Cuda>(const Tensor& in, float* out,
                              Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   CUBLAS_CHECK(cublasSasum(handle, num, inPtr, 1, out));
 }
 
 /// out = alpha * in + out
 template <>
 void Axpy<float, lang::Cuda>(const float alpha,
-                             const Tensor* in, Tensor* out, Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+                             const Tensor& in, Tensor* out, Context* ctx) {
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  const size_t num = in->Size();
+  const size_t num = in.Size();
   CUBLAS_CHECK(cublasSaxpy(handle, num, &alpha, inPtr, 1, outPtr, 1));
 }
 
 /// out = \sum_i in1[i] * in2[i]
 template <>
-void Dot<float, lang::Cuda>(const Tensor* in1,
-                            const Tensor* in2, float* out, Context* ctx) {
-  const float* inPtr1 = static_cast<const float*>(in1->block()->data());
-  const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Dot<float, lang::Cuda>(const Tensor& in1,
+                            const Tensor& in2, float* out, Context* ctx) {
+  const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+  const float* inPtr2 = static_cast<const float*>(in2.block()->data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  const size_t num = in1->Size();
+  const size_t num = in1.Size();
   CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, out));
 }
 template <>
-void Nrm2<float, lang::Cuda>(const Tensor* in, float* out,
+void Nrm2<float, lang::Cuda>(const Tensor& in, float* out,
                              Context* ctx) {
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  const float* inPtr = static_cast<const float*>(in->block()->data());
-  const size_t num = in->Size();
+  const float* inPtr = static_cast<const float*>(in.block()->data());
+  const size_t num = in.Size();
   cublasSnrm2(handle, num, inPtr, 1, out);
 }
 template <>
@@ -715,14 +797,14 @@ void Scale<float, lang::Cuda>(const float x, Tensor* out,
 // NOTE: cublas uses column major order.
 // http://peterwittek.com/cublas-matrix-c-style.html
 template <>
-void DGMM<float, lang::Cuda>(const bool side_right, const Tensor* M, const Tensor* v,
+void DGMM<float, lang::Cuda>(const bool side_right, const Tensor& M, const Tensor& v,
                              Tensor* out, Context* ctx) {
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  const float* MPtr = static_cast<const float*>(M->block()->data());
-  const float* vPtr = static_cast<const float*>(v->block()->data());
+  const float* MPtr = static_cast<const float*>(M.block()->data());
+  const float* vPtr = static_cast<const float*>(v.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t nrow = M->shape(0);
-  const size_t ncol = M->shape(1);
+  const size_t nrow = M.shape(0);
+  const size_t ncol = M.shape(1);
   if (side_right) {
     CUBLAS_CHECK(cublasSdgmm(handle, CUBLAS_SIDE_LEFT, ncol, nrow, MPtr, ncol,
                              vPtr, 1, outPtr, ncol));
@@ -732,16 +814,16 @@ void DGMM<float, lang::Cuda>(const bool side_right, const Tensor* M, const Tenso
   }
 }
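
Because cublas assumes column-major storage, a row-major M passed unchanged is read as M^T, so the SIDE flag is the opposite of the row-major intent (the linked note above explains the trick). In row-major terms the function computes (host-side sketch, illustrative only):

    for (size_t r = 0; r < nrow; ++r)
      for (size_t c = 0; c < ncol; ++c)
        outPtr[r * ncol + c] =
            MPtr[r * ncol + c] * (side_right ? vPtr[c] : vPtr[r]);
    // side_right scales column c by v[c]; otherwise row r is scaled by v[r]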
 template <>
-void GEMV<float, lang::Cuda>(const float alpha, const Tensor* A, const Tensor* v,
+void GEMV<float, lang::Cuda>(const float alpha, const Tensor& A, const Tensor& v,
                              const float beta, Tensor* out, Context* ctx) {
-  const float* APtr = static_cast<const float*>(A->block()->data());
-  const float* vPtr = static_cast<const float*>(v->block()->data());
+  const float* APtr = static_cast<const float*>(A.block()->data());
+  const float* vPtr = static_cast<const float*>(v.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t m = A->shape()[0];
-  const size_t n = A->shape()[1];
+  const size_t m = A.shape()[0];
+  const size_t n = A.shape()[1];
 
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
-  if (!(A->transpose()))
+  if (!(A.transpose()))
     CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr,
                              1, &beta, outPtr, 1));
   else
@@ -752,20 +834,20 @@ void GEMV<float, lang::Cuda>(const float alpha, const Tensor* A, const Tensor* v
 // http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm
 template <>
 void GEMM<float, lang::Cuda>(const float alpha,
-                             const Tensor* A, const Tensor* B, const float beta,
+                             const Tensor& A, const Tensor& B, const float beta,
                              Tensor* C, Context* ctx) {
-  auto transA = A->transpose();
+  auto transA = A.transpose();
   auto transa = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
-  auto transB = B->transpose();
+  auto transB = B.transpose();
   auto transb = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
-  const size_t nrowA = A->shape()[0];
-  const size_t ncolA = A->shape()[1];
-  const size_t ncolB = B->shape()[1];
+  const size_t nrowA = A.shape()[0];
+  const size_t ncolA = A.shape()[1];
+  const size_t ncolB = B.shape()[1];
   int lda = transA ? nrowA : ncolA;
   int ldb = transB ? ncolA : ncolB;
   int ldc = ncolB;
-  const float* APtr = static_cast<const float*>(A->block()->data());
-  const float* BPtr = static_cast<const float*>(B->block()->data());
+  const float* APtr = static_cast<const float*>(A.block()->data());
+  const float* BPtr = static_cast<const float*>(B.block()->data());
   float* CPtr = static_cast<float*>(C->block()->mutable_data());
   auto handle = ctx->cublas_handle;  // TODO(wangwei) set cudastream
   CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
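
The operand swap in this Sgemm call is the standard row-major trick: the buffers of row-major A (nrowA x ncolA) and B (ncolA x ncolB), read column-major, are A^T and B^T, and C = A*B is equivalent to C^T = B^T * A^T, so passing B first and A second (with the transb/transa flags and the row-major widths as leading dimensions) makes cublas write C^T column-major, i.e. C row-major, directly into CPtr. In sketch form:

    // C = A*B (row-major)  <=>  C^T = B^T * A^T  (column-major view)
    // hence: Sgemm(opB, opA, n = ncolB, m = nrowA, k = ncolA, B, A, C)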
@@ -799,15 +881,15 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
 }
 
 // template <>
-// void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out,
 //                                Context* ctx) {
-//   const float* inPtr = static_cast<const float*>(in->block()->data());
+//   const float* inPtr = static_cast<const float*>(in.block()->data());
 //   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-//   // const size_t nrow = in->shape()[0];
-//   // const size_t ncol = in->shape()[1];
+//   // const size_t nrow = in.shape()[0];
+//   // const size_t ncol = in.shape()[1];
 //   // cuda::RowMax(nrow, ncol, inPtr, outPtr, ctx->stream);
 
-//   //vector<int> reduce_row_axes_shape = in->generate_shape_cuda();
+//   //vector<int> reduce_row_axes_shape = in.generate_shape_cuda();
 //   //reduce_row_axes_shape.back() = 1; //reduce axis 1, so we set last element d in shape {a,b,c,d} to 1
 
 //   vector<int> reduce_row_axes_shape = {1,1,1,1};
@@ -828,8 +910,8 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
 //   //instantiate new tensor to use new blocks as memory instead of cudaMalloc
 //   //create 2 tensors of same size as input tensor
 //   Shape reduction_size = {1000};
-//   Tensor indices(reduction_size, in->device(), in->data_type());
-//   Tensor workspace(reduction_size, in->device(), in->data_type());
+//   Tensor indices(reduction_size, in.device(), in.data_type());
+//   Tensor workspace(reduction_size, in.device(), in.data_type());
 //   size_t indices_bytes = indices.block()->size()*1000;
 //   size_t workspace_bytes = workspace.block()->size()*1000;
 //   size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
@@ -842,9 +924,12 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
 //   cudnnTensorDescriptor_t in_desc, out_desc;
 //   cudnnCreateTensorDescriptor(&in_desc);
 //   cudnnCreateTensorDescriptor(&out_desc);
-//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-//   //cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), reduce_row_axes_shape.data(), reduced_strides.data());
+//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(), 
+// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
+//   //cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), 
+// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), 
+// reduce_row_axes_shape.data(), reduced_strides.data());
 //   cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
 //                     indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
 //                     (void*)(&alpha), in_desc, inPtr, (void*)(&beta),  out_desc, outPtr);
@@ -854,15 +939,15 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
 // }
 
 template <>
-void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out,
                                Context* ctx) {
-  const float* inPtr = static_cast<const float*>(in->block()->data());
+  const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  const size_t nrow = in->shape()[0];
-  const size_t ncol = in->shape()[1];
+  const size_t nrow = in.shape()[0];
+  const size_t ncol = in.shape()[1];
 
-  if(in->transpose()){
-    Tensor t(in->shape(), in->device(), in->data_type());
+  if(in.transpose()){
+    Tensor t(in.shape(), in.device(), in.data_type());
     float* tPtr = static_cast<float*>(t.block()->mutable_data());
 
     float alpha = 1.0;
@@ -870,7 +955,7 @@ void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
 
     cudnnTransformTensor(ctx->cudnn_handle,
                         (void*)(&alpha), generate_tensorND_desc(in), inPtr,
-                        (void*)(&beta), generate_tensorND_desc(&t), tPtr
+                        (void*)(&beta), generate_tensorND_desc(t), tPtr
                         );
 
     const float* tPtr_const = static_cast<const float*>(t.block()->data());



[09/10] incubator-singa git commit: reformat the code

Posted by wa...@apache.org.
reformat the code


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/3e2b75cb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/3e2b75cb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/3e2b75cb

Branch: refs/heads/master
Commit: 3e2b75cbe86908f551ac3f492a8aba07008b227b
Parents: c52e2aa
Author: Wang Wei <dc...@nus.edu.sg>
Authored: Sun May 13 20:42:52 2018 +0800
Committer: Wang Wei <dc...@nus.edu.sg>
Committed: Sun May 13 20:42:52 2018 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h        |  55 +++---
 src/core/tensor/tensor.cc          | 291 ++++++++++++++++----------------
 src/core/tensor/tensor_math_cpp.h  | 163 +++++++++---------
 src/core/tensor/tensor_math_cuda.h | 286 +++++++++++++++----------------
 4 files changed, 403 insertions(+), 392 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2b75cb/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index e25aafd..3cc28ff 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -36,7 +36,8 @@ typedef vector<size_t> Shape;
 /// hardcode the width of types defined in DataType
 const size_t kDataWidth[] = {sizeof(float),  sizeof(float) / 2,
                              sizeof(int),    sizeof(char),
-                             sizeof(double), sizeof(unsigned char)};
+                             sizeof(double), sizeof(unsigned char)
+                            };
 inline size_t SizeOf(DataType t) {
   static_assert(kNumDataType == sizeof(kDataWidth) / sizeof(size_t),
                 "Num of data types not match num of data width");
@@ -51,7 +52,7 @@ inline size_t SizeOf(DataType t) {
 /// Tensor.
 /// For all operations, if the result tensor is passed as an argument,
 /// then it must be set up correctly (shape, device). Otherwise, runtime error
-/// like SegmentFault would happen. Simply type/device check would be conducted.
+/// like segmentation fault would happen. Simple type/device checks would be conducted.
 class Tensor {
  public:
   ~Tensor();
@@ -59,12 +60,17 @@ class Tensor {
   explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
   explicit Tensor(const Shape &shape, DataType dtype = kFloat32);
 
-  Tensor(Shape &&shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
-  Tensor(const Shape &shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
+  Tensor(Shape &&shape,
+         std::shared_ptr<Device> dev,
+         DataType dtype = kFloat32);
+  Tensor(const Shape &shape,
+         std::shared_ptr<Device> dev,
+         DataType dtype = kFloat32);
 
   /// Copy Tensor to share the internal data.  No deep copy.
   Tensor(const Tensor &from);
-  /// Copy Tensor to share the internal data.  No deep copy. For 2 tensors sharing same block but different strides.
+  /// Copy Tensor to share the internal data.  No deep copy.
+  /// For 2 tensors sharing same block but different strides.
   Tensor(const Tensor &from, Shape &new_shape, vector<int> &new_strides);
   /// Copy Tensor to share the internal data.  No deep copy.
   Tensor(Tensor &&from);
@@ -89,7 +95,7 @@ class Tensor {
   void GetValue(SType *value, const size_t num) {
     CHECK(device_ == defaultDevice);
     const SType* ptr = data<SType>();
-    for(size_t i = 0; i < num; i++) value[i] = ptr[i];
+    for (size_t i = 0; i < num; i++) value[i] = ptr[i];
   }
 
   /// data type, including kFloat16, kFloat32, kInt
@@ -106,7 +112,7 @@ class Tensor {
 
   bool empty() const { return nDim() == 0; }
 
-  //bool transpose() const { return transpose_; }
+  /// Return true if the tensor is transposed, i.e. its last stride != 1
   bool transpose() const { return (strides_.back() != 1); }
 
   const vector<int>& strides() const { return strides_; }
@@ -131,9 +137,8 @@ class Tensor {
   void Reshape(Shape &&shape);
 
   /// Reset the shape, device, and data type as given tensor.
-  /// If block size changes, then reallocate a new block. The previous block
-  /// would
-  /// be deleted.
+  /// If block size changes, then reallocate a new block.
+  /// The previous block would be deleted.
   void ResetLike(const Tensor &t);
 
   /// Reset the data type, it would reallocate block if type changes.
@@ -176,9 +181,11 @@ class Tensor {
   /// No data copy, just set the transpose_ filed of the returned tensor.
   Tensor T() const;
 
+  /// Reverse the shape vector
   Tensor Transpose() const;
 
-  Tensor Transpose(Shape axes) const;
+  /// Permute the axes into the given order
+  Tensor Transpose(const vector<size_t>& axes) const;
 
   /// Copy the meta info with data block shared.
   Tensor &operator=(const Tensor &in);
@@ -219,23 +226,24 @@ class Tensor {
   float L2() const;
 
   //generate strides automatically if stride field is not passed
-void generate_strides(){
-    if(shape_.size()==0){
-      strides_ = {1};
-      return void();
-    }
+  void generate_strides() {
     strides_.clear();
+    if (shape_.size() == 0) {
+      strides_.push_back(1);
+      return;
+    }
+
     size_t dim = Size();
     int cumulative_product = 1;
-    for (size_t n=0; n<shape_.size(); ++n) {
-        cumulative_product = cumulative_product*shape_[n];
-        strides_.push_back(dim/cumulative_product);
+    for (size_t n = 0; n < shape_.size(); ++n) {
+      cumulative_product = cumulative_product * shape_[n];
+      strides_.push_back(dim / cumulative_product);
     }
-};
+  }
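
A worked example of the loop above (my own illustration, not part of the patch): for shape_ = {2, 3, 4} we have Size() = 24, and the iterations give

    // n=0: cumulative_product = 2   -> strides_[0] = 24/2  = 12
    // n=1: cumulative_product = 6   -> strides_[1] = 24/6  = 4
    // n=2: cumulative_product = 24  -> strides_[2] = 24/24 = 1
    // i.e. strides_ = {12, 4, 1}: row-major strides in units of elements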
 
-void set_strides(const vector<int> new_strides){
-  strides_ = new_strides;
-}
+  void set_strides(const vector<int> new_strides) {
+    strides_ = new_strides;
+  }
 
  protected:
   DataType data_type_ = kFloat32;
@@ -247,7 +255,6 @@ void set_strides(const vector<int> new_strides){
   vector<int> strides_ = {};
 }; //end of tensor class
 
-typedef Shape::iterator ShapeIter;
 inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
   if (len == 0) len = shape.size();
   if (len == 0)

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2b75cb/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index a4efd64..d98e6a6 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -21,7 +21,6 @@
 #include "./tensor_math_cuda.h"
 #include "./tensor_math_opencl.h"
 #include <utility>
-#include <iostream>
 
 namespace singa {
 
@@ -31,21 +30,21 @@ Tensor::~Tensor() {
   block_ = nullptr;
 }
 
-Tensor::Tensor() { 
+Tensor::Tensor() {
   device_ = defaultDevice;
   strides_ = {1};
 }
 
-//non-strided constructors 
+//non-strided constructors
 Tensor::Tensor(const Shape &shape, DataType dtype)
-    : data_type_(dtype), device_(defaultDevice), shape_(shape) {
+  : data_type_(dtype), device_(defaultDevice), shape_(shape) {
   size_t size = Product(shape_) * SizeOf(data_type_);
   if (size)
     block_ = device_->NewBlock((int)size);
   generate_strides();
 }
 Tensor::Tensor(Shape &&shape, DataType dtype)
-    : data_type_(dtype), device_(defaultDevice), shape_(shape) {
+  : data_type_(dtype), device_(defaultDevice), shape_(shape) {
   size_t size = Product(shape_) * SizeOf(data_type_);
   if (size)
     block_ = device_->NewBlock((int)size);
@@ -55,14 +54,14 @@ Tensor::Tensor(Shape &&shape, DataType dtype)
 //non-strided constructors with device
 Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device,
                DataType dtype)
-    : data_type_(dtype), device_(device), shape_(shape) {
+  : data_type_(dtype), device_(device), shape_(shape) {
   size_t size = Product(shape_) * SizeOf(data_type_);
   if (size)
     block_ = device_->NewBlock((int)size);
   generate_strides();
 }
 Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype)
-    : data_type_(dtype), device_(device), shape_(shape) {
+  : data_type_(dtype), device_(device), shape_(shape) {
   size_t size = Product(shape_) * SizeOf(data_type_);
   if (size)
     block_ = device_->NewBlock((int)size);
@@ -71,34 +70,34 @@ Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype)
 
 
 Tensor::Tensor(const Tensor &in)
-    : //transpose_(in.transpose_),
-      data_type_(in.data_type_),
-      device_(in.device_),
-      block_(in.block()),
-      shape_(in.shape_),
-      strides_(in.strides_) {
+  : //transpose_(in.transpose_),
+    data_type_(in.data_type_),
+    device_(in.device_),
+    block_(in.block()),
+    shape_(in.shape_),
+    strides_(in.strides_) {
   if (block_ != nullptr)
     block_->IncRefCount();
 }
 
 //strided constructor taking in a tensor, shape and strides
 Tensor::Tensor(const Tensor &in, Shape &new_shape, vector<int> &new_strides)
-    : //transpose_(in.transpose_),
-      data_type_(in.data_type_),
-      device_(in.device_),
-      block_(in.block()),
-      shape_(new_shape),
-      strides_(new_strides) {
+  : //transpose_(in.transpose_),
+    data_type_(in.data_type_),
+    device_(in.device_),
+    block_(in.block()),
+    shape_(new_shape),
+    strides_(new_strides) {
   if (block_ != nullptr)
     block_->IncRefCount();
 }
 
 Tensor::Tensor(Tensor &&in)
-    : //transpose_(in.transpose_),
-      data_type_(in.data_type_),
-      device_(in.device_),
-      shape_(std::move(in.shape_)),
-      strides_(in.strides_) {
+  : //transpose_(in.transpose_),
+    data_type_(in.data_type_),
+    device_(in.device_),
+    shape_(std::move(in.shape_)),
+    strides_(in.strides_) {
   block_ = in.block_;
   in.block_ = nullptr;
 }
@@ -123,10 +122,13 @@ void Tensor::ResetLike(const Tensor &in) {
   strides_ = in.strides_;
 }
 
-//if tensor is not transposed yet i.e strides == 1, then we simply change the shape and generate new default strides
-//if tensor is already transposed i.e strides != 1, it should be copied to a new tensor with newly generated default strides 
+// if the tensor is not transposed yet (i.e. its last stride == 1),
+// then we simply change the shape and generate new default strides;
+// if the tensor is already transposed (i.e. its last stride != 1),
+// it should be copied to a new tensor with newly generated default strides
+// TODO(wangwei) raise error if the shape not match
 void Tensor::Reshape(const Shape &shape) {
-  if(strides_.size()==0)
+  if (strides_.size() == 0)
     strides_.push_back(1);
 
   if (Product(shape_) != Product(shape)) {
@@ -141,7 +143,7 @@ void Tensor::Reshape(const Shape &shape) {
 }
 
 void Tensor::Reshape(Shape &&shape) {
-  if(strides_.size()==0)
+  if (strides_.size() == 0)
     strides_.push_back(1);
 
   if (Product(shape_) != Product(shape)) {
@@ -196,12 +198,12 @@ void Tensor::CopyDataFromHostPtr(const DType *src, const size_t num,
   }
 }
 template void Tensor::CopyDataFromHostPtr(const unsigned char *src,
-                                          const size_t num,
-                                          const size_t offset);
+    const size_t num,
+    const size_t offset);
 template void Tensor::CopyDataFromHostPtr(const float *src, const size_t num,
-                                          const size_t offset);
+    const size_t offset);
 template void Tensor::CopyDataFromHostPtr(const int *src, const size_t num,
-                                          const size_t offset);
+    const size_t offset);
 
 void Tensor::CopyData(const Tensor &src) {
   CHECK_EQ(Size(), src.Size());
@@ -224,44 +226,44 @@ void Tensor::FromProto(const singa::TensorProto &proto) {
   strides_.clear();
   for (int32_t s : proto.strides()) strides_.push_back(s);
   switch (data_type_) {
-    case kFloat32: {
-      std::unique_ptr<float[]> data_ptr(new float[Product(shape_)]);
-      for (size_t i = 0; i < Product(shape_); ++i)
-        data_ptr[i] = static_cast<float>(proto.float_data((int)i));
-      CopyDataFromHostPtr<float>(data_ptr.get(), Product(shape_));
-      break;
-    }
-    case kDouble: {
-      std::unique_ptr<double[]> data(new double[Product(shape_)]);
-      for (size_t i = 0; i < Product(shape_); ++i)
-        data[i] = proto.double_data((int)i);
-      CopyDataFromHostPtr<double>(data.get(), Product(shape_));
-      break;
-    }
-    case kInt: {
-      std::unique_ptr<int[]> data(new int[Product(shape_)]);
-      for (size_t i = 0; i < Product(shape_); ++i) data[i] = proto.int_data((int)i);
-      CopyDataFromHostPtr<int>(data.get(), Product(shape_));
-      break;
-    }
-    ///TODO(wangji): Implement to support C++ type char using bytes type in protobuf
-    /// which is equivalent to string type is different from the other cases. The kchar
-    /// and kUChar case is to be implemented.
-    /*
-    case kChar: {
-      std::unique_ptr<char[]> data(new char[Product(shape_)]);
-      for (size_t i = 0; i < Product(shape_); ++i)
-        data[i] = static_cast<char>(proto.bytes_data(i));
-      break;
-    }
-    case kUChar: {
-      std::unique_ptr<unsigned char[]> data(new unsigned char[Product(shape_)]);
-      for (size_t i = 0; i < Product(shape_); ++i)
-        data[i] = static_cast<unsigned char>(proto.bytes_data(i));
-      break;
-    }
-    */
-    default: { LOG(FATAL) << "Unsupported Type" << DataType_Name(data_type_); }
+  case kFloat32: {
+    std::unique_ptr<float[]> data_ptr(new float[Product(shape_)]);
+    for (size_t i = 0; i < Product(shape_); ++i)
+      data_ptr[i] = static_cast<float>(proto.float_data((int)i));
+    CopyDataFromHostPtr<float>(data_ptr.get(), Product(shape_));
+    break;
+  }
+  case kDouble: {
+    std::unique_ptr<double[]> data(new double[Product(shape_)]);
+    for (size_t i = 0; i < Product(shape_); ++i)
+      data[i] = proto.double_data((int)i);
+    CopyDataFromHostPtr<double>(data.get(), Product(shape_));
+    break;
+  }
+  case kInt: {
+    std::unique_ptr<int[]> data(new int[Product(shape_)]);
+    for (size_t i = 0; i < Product(shape_); ++i) data[i] = proto.int_data((int)i);
+    CopyDataFromHostPtr<int>(data.get(), Product(shape_));
+    break;
+  }
+  /// TODO(wangji): Implement support for the C++ char type using the protobuf
+  /// bytes type, which, unlike the other cases, is equivalent to the string
+  /// type. The kChar and kUChar cases are to be implemented.
+  /*
+  case kChar: {
+    std::unique_ptr<char[]> data(new char[Product(shape_)]);
+    for (size_t i = 0; i < Product(shape_); ++i)
+      data[i] = static_cast<char>(proto.bytes_data(i));
+    break;
+  }
+  case kUChar: {
+    std::unique_ptr<unsigned char[]> data(new unsigned char[Product(shape_)]);
+    for (size_t i = 0; i < Product(shape_); ++i)
+      data[i] = static_cast<unsigned char>(proto.bytes_data(i));
+    break;
+  }
+  */
+  default: { LOG(FATAL) << "Unsupported Type" << DataType_Name(data_type_); }
   }
 }
 
@@ -277,44 +279,44 @@ void Tensor::ToProto(singa::TensorProto *proto) const {
     proto->add_strides(s);
   }
   switch (data_type_) {
-    case kFloat32: {
-      proto->clear_float_data();
-      const float *data_ptr = data<float>();
-      for (size_t i = 0; i < Product(shape_); ++i)
-        proto->add_float_data(data_ptr[i]);
-      break;
-    }
-    case kDouble: {
-      proto->clear_double_data();
-      const double *data_ptr = data<double>();
-      for (size_t i = 0; i < Product(shape_); ++i)
-        proto->add_double_data(data_ptr[i]);
-      break;
-    }
-    case kInt: {
-      proto->clear_int_data();
-      const int *data_ptr = data<int>();
-      for (size_t i = 0; i < Product(shape_); ++i)
-        proto->add_int_data(data_ptr[i]);
-      break;
-    }
-    /*
-    case kChar: {
-      proto->clear_bytes_data();
-      const char *data = data<char>();
-      for (size_t i = 0; i < Product(shape_); ++i)
-        proto->add_bytes_data(static_cast<unsigned char>(data[i]));
-      break;
-    }
-    case kUChar: {
-      proto->clear_bytes_data();
-      const unsigned char *data = data<unsigned char>();
-      for (size_t i = 0; i < Product(shape_); ++i)
-        proto->add_bytes_data(static_cast<unsigned char>(data[i]));
-      break;
-    }
-    */
-    default: { LOG(FATAL) << "Unsupported Type" << DataType_Name(data_type_); }
+  case kFloat32: {
+    proto->clear_float_data();
+    const float *data_ptr = data<float>();
+    for (size_t i = 0; i < Product(shape_); ++i)
+      proto->add_float_data(data_ptr[i]);
+    break;
+  }
+  case kDouble: {
+    proto->clear_double_data();
+    const double *data_ptr = data<double>();
+    for (size_t i = 0; i < Product(shape_); ++i)
+      proto->add_double_data(data_ptr[i]);
+    break;
+  }
+  case kInt: {
+    proto->clear_int_data();
+    const int *data_ptr = data<int>();
+    for (size_t i = 0; i < Product(shape_); ++i)
+      proto->add_int_data(data_ptr[i]);
+    break;
+  }
+  /*
+  case kChar: {
+    proto->clear_bytes_data();
+    const char *data = data<char>();
+    for (size_t i = 0; i < Product(shape_); ++i)
+      proto->add_bytes_data(static_cast<unsigned char>(data[i]));
+    break;
+  }
+  case kUChar: {
+    proto->clear_bytes_data();
+    const unsigned char *data = data<unsigned char>();
+    for (size_t i = 0; i < Product(shape_); ++i)
+      proto->add_bytes_data(static_cast<unsigned char>(data[i]));
+    break;
+  }
+  */
+  default: { LOG(FATAL) << "Unsupported Type" << DataType_Name(data_type_); }
   }
 }
 
@@ -353,9 +355,9 @@ Tensor Tensor::Transpose() const {
   t.device_ = device_;
   t.data_type_ = data_type_;
   t.strides_.clear();
-  for(size_t n=0; n<shape_.size(); ++n){
-    t.shape_.push_back(shape_[shape_.size()-n-1]);
-    t.strides_.push_back(strides_[shape_.size()-n-1]);
+  for (size_t n = 0; n < shape_.size(); ++n) {
+    t.shape_.push_back(shape_[shape_.size() - n - 1]);
+    t.strides_.push_back(strides_[shape_.size() - n - 1]);
   }
   t.block_ = block_;
   block_->IncRefCount();
@@ -363,6 +365,7 @@ Tensor Tensor::Transpose() const {
 }
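
The reversal changes only the view, never the data: for example (illustration, not from the patch), a tensor of shape {2, 3} with strides {3, 1} transposes to shape {3, 2} with strides {1, 3}, sharing the same ref-counted block. In sketch form:

    // shape {2,3}, strides {3,1}  --Transpose()-->  shape {3,2}, strides {1,3}
    // view element (i,j) -> offset i*strides[0] + j*strides[1] = i*1 + j*3
    // the last stride is no longer 1, exactly the condition transpose() tests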
 
 //transpose with axes
+// TODO(wangwei) the shape and axes should match
 Tensor Tensor::Transpose(Shape axes) const {
   // if(axes.size() != shape_.size()){
   //   std::cout << "Warning: Size of input axes doesn't match size of shape" << std::endl;
@@ -375,7 +378,7 @@ Tensor Tensor::Transpose(Shape axes) const {
   t.device_ = device_;
   t.data_type_ = data_type_;
   t.strides_.clear();
-  for(size_t n=0; n<axes.size(); ++n){
+  for (size_t n = 0; n < axes.size(); ++n) {
     t.shape_.push_back(shape_[axes[n]]);
     t.strides_.push_back(strides_[axes[n]]);
   }
@@ -404,7 +407,7 @@ Tensor &Tensor::operator=(Tensor &&in) {
   if (block_ != nullptr && block_->DecRefCount() == 0)
     device_->FreeBlock(block_);
   //transpose_ = in.transpose_;
-  strides_ = in.strides_;
+  strides_ = std::move(in.strides_);
   data_type_ = in.data_type_;
   shape_ = std::move(in.shape_);
   device_ = in.device_;
@@ -470,7 +473,7 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
                               (int)s_offset);
     } else if (src_dev->lang() == kCpp) {
       dst_dev->CopyDataToFrom(to, from, nBytes, kHostToDevice, (int)d_offset,
-                (int)s_offset);
+                              (int)s_offset);
     } else {
       LOG(FATAL) << "Not support mem copy betwee Cuda and OpenCL device";
     }
@@ -548,7 +551,7 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
 float Tensor::L1() const {
   float nrm = 0.0f;
   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
-    device_->Exec([&nrm, this](Context *ctx) {
+    device_->Exec([&nrm, this](Context * ctx) {
       DType ret = DType(0);
       Asum<DType, Lang>(*this, &ret, ctx);
       nrm = TypeCast<DType, float>(ret);
@@ -561,7 +564,7 @@ float Tensor::L1() const {
 float Tensor::L2() const {
   float nrm = 0.0f;
   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
-    device_->Exec([&nrm, this](Context *ctx) {
+    device_->Exec([&nrm, this](Context * ctx) {
       DType ret = DType(0);
       Nrm2<DType, Lang>(*this, &ret, ctx);
       nrm = TypeCast<DType, float>(ret);
@@ -577,7 +580,7 @@ void Tensor::SetValue(const SType x) {
   auto ptr = block_;
   TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
     // TODO(wangwei) cast x to DType
-    device_->Exec([this, x, ptr](Context *ctx) {
+    device_->Exec([this, x, ptr](Context * ctx) {
       Set<DType, Lang>(x, this, ctx);
     }, {}, {ptr});
   });
@@ -691,7 +694,7 @@ void Div(const SType alpha, const Tensor &in, Tensor *out) {
   CHECK(in.shape() == out->shape());
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
     // TODO(wangwei) type cast SType to DType;
-    in.device()->Exec([alpha, in, out](Context *ctx) {
+    in.device()->Exec([alpha, in, out](Context * ctx) {
       Div<DType, Lang>(alpha, in, out, ctx);
     }, {in.block()}, {out->block()});
   });
@@ -727,7 +730,7 @@ float Sum<float>(const Tensor &in) {
   Tensor one(in.shape(), in.device(), in.data_type());
   one.SetValue(1.0f);
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
-    one.device()->Exec([in, one, &s](Context *ctx) {
+    one.device()->Exec([in, one, &s](Context * ctx) {
       DType ret = DType(0);
       Dot<DType, Lang>(in, one, &ret, ctx);
       s = ret;
@@ -758,7 +761,7 @@ Tensor SoftMax(const Tensor &in) {
 Tensor RowMax(const Tensor &in) {
   Tensor ret({in.shape(0)}, in.device(), in.data_type());
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
-    in.device()->Exec([&in, &ret](Context *ctx) {
+    in.device()->Exec([&in, &ret](Context * ctx) {
       //size_t nrow = 1;
       //if (in.nDim() > 1) nrow = in.shape(0);
       //size_t ncol = in.Size() / nrow;
@@ -805,7 +808,7 @@ void AddColumn(const SType alpha, const SType beta, const Tensor &v,
     Tensor vmat = Reshape(v, Shape{nb_row, 1});
     Mult(alpha, vmat, one, beta, M);
   }
-} 
+}
 template
 void AddColumn(const float alpha, const float beta, const Tensor &v, Tensor *M);
 
@@ -846,16 +849,16 @@ Tensor ConcatOn(const vector<Tensor> &in, int axis) {
   CHECK_GE(dim, 2u) << " Only work for tensor of dim >=2 ";
   size_t size = in[0].Size() / in[0].shape(axis);
   size_t new_size = 0u;
-  for (const auto& t: in) {
+  for (const auto& t : in) {
     CHECK_EQ(dim, t.shape().size()) << "All tensors should have the same dim";
     CHECK_EQ(size, t.Size() / t.shape(axis)) << "The size of all axes should "
-      <<" be the same except the concatenated axis";
+        << " be the same except the concatenated axis";
     new_size += t.shape(axis);
   }
   out_shape[axis] = new_size;
   if (axis == 0) {
     size_t nrow = 0;
-    for (const auto& t: in) {
+    for (const auto& t : in) {
       nrow += t.shape(0);
       tmp.push_back(Reshape(t, {t.shape(0), t.Size() / t.shape(0)}));
     }
@@ -863,7 +866,7 @@ Tensor ConcatOn(const vector<Tensor> &in, int axis) {
     ret.Reshape(out_shape);
     return ret;
   } else {
-    for (const auto& t: in) {
+    for (const auto& t : in) {
       size_t nrow = 1;
       for (int i = 0; i < axis; i++)
         nrow *= t.shape(i);
@@ -944,7 +947,7 @@ Tensor SliceOn(const Tensor&in, const size_t start, const size_t end, int axis)
   out_shape[axis] = end - start;
   if (axis == 0) {
     auto ret = SliceRows(Reshape(in, {in.shape(0), in.Size() / in.shape(0)}),
-        start, end);
+                         start, end);
     ret.Reshape(out_shape);
     return ret;
   } else {
@@ -953,7 +956,7 @@ Tensor SliceOn(const Tensor&in, const size_t start, const size_t end, int axis)
       nrow *= in.shape(i);
     auto suffix = in.Size() / nrow / in.shape(axis);
     auto ret = SliceColumns(Reshape(in, {nrow, in.Size() / nrow}),
-        start * suffix, end * suffix);
+                            start * suffix, end * suffix);
     ret.Reshape(out_shape);
     return ret;
   }
@@ -997,9 +1000,9 @@ void MultColumn(const Tensor &v, Tensor *M) {
   CHECK_EQ(v.Size(), M->shape(0));
   CheckDataTypeAndLang(*M, v);
   TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
-    v.device()->Exec([M, v](Context *ctx) {
+    v.device()->Exec([M, v](Context * ctx) {
       DGMM<DType, Lang>(false, *M, v,
-                        M, ctx);
+      M, ctx);
     }, {M->block(), v.block()}, {M->block()});
   });
 }
@@ -1012,9 +1015,9 @@ void MultRow(const Tensor &v, Tensor *M) {
   CHECK_EQ(v.Size(), M->shape(1));
   CheckDataTypeAndLang(*M, v);
   TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
-    v.device()->Exec([M, v](Context *ctx) {
+    v.device()->Exec([M, v](Context * ctx) {
       DGMM<DType, Lang>(true, *M, v,
-                        M, ctx);
+      M, ctx);
     }, {M->block(), v.block()}, {M->block()});
   });
 }
@@ -1059,7 +1062,7 @@ template <typename SType>
 void Bernoulli(const SType p, Tensor *out) {
   TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
     auto prob = TypeCast<SType, DType>(p);
-    out->device()->Exec([prob, out](Context *ctx) {
+    out->device()->Exec([prob, out](Context * ctx) {
       Bernoulli<DType, Lang>(prob, out, ctx);
     }, {}, {out->block()}, true);
   });
@@ -1072,7 +1075,7 @@ void Uniform(const SType low, const SType high, Tensor *out) {
   TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
     auto l = TypeCast<SType, DType>(low);
     auto h = TypeCast<SType, DType>(high);
-    out->device()->Exec([l, h, out](Context *ctx) {
+    out->device()->Exec([l, h, out](Context * ctx) {
       Uniform<DType, Lang>(l, h, out, ctx);
     }, {}, {out->block()}, true);
   });
@@ -1085,7 +1088,7 @@ void Gaussian(const SType mean, const SType std, Tensor *out) {
   TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
     auto m = TypeCast<SType, DType>(mean);
     auto s = TypeCast<SType, DType>(std);
-    out->device()->Exec([m, s, out](Context *ctx) {
+    out->device()->Exec([m, s, out](Context * ctx) {
       Gaussian<DType, Lang>(m, s, out, ctx);
     }, {}, {out->block()}, true);
   });
@@ -1098,7 +1101,7 @@ template <typename SType>
 void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
   TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
     auto a = TypeCast<SType, DType>(alpha);
-    out->device()->Exec([a, in, out](Context *ctx) {
+    out->device()->Exec([a, in, out](Context * ctx) {
       Axpy<DType, Lang>(a, in, out, ctx);
     }, {in.block(), out->block()}, {out->block()});
   });
@@ -1128,7 +1131,7 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
     TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
       auto a = TypeCast<SType, DType>(alpha);
       auto b = TypeCast<SType, DType>(beta);
-      C->device()->Exec([a, A, b, B, C](Context *ctx) {
+      C->device()->Exec([a, A, b, B, C](Context * ctx) {
         GEMV<DType, Lang>(a, A, B, b, C, ctx);
       }, {A.block(), B.block()}, {C->block()});
     });
@@ -1137,9 +1140,9 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
     TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
       auto a = TypeCast<SType, DType>(alpha);
       auto b = TypeCast<SType, DType>(beta);
-      C->device()->Exec([a, A, b, B, C](Context *ctx) {
+      C->device()->Exec([a, A, b, B, C](Context * ctx) {
         GEMM<DType, Lang>(a, A, B, b, C,
-                          ctx);
+        ctx);
       }, {A.block(), B.block()}, {C->block()});
     });
   }
@@ -1155,10 +1158,10 @@ void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss) {
   if (p.nDim() == 2u) batchsize = p.shape(0);
   size_t dim = p.Size() / batchsize;
   TYPE_LANG_SWITCH(p.data_type(), DType, p.device()->lang(), Lang, {
-    p.device()->Exec([batchsize, dim, t, p, loss](Context *ctx) {
-        bool int_target = t.Size() == batchsize;
-        ComputeCrossEntropy<DType, Lang>(int_target, batchsize, dim, p.block(),
-            t.block(), loss->block(), ctx);
+    p.device()->Exec([batchsize, dim, t, p, loss](Context * ctx) {
+      bool int_target = t.Size() == batchsize;
+      ComputeCrossEntropy<DType, Lang>(int_target, batchsize, dim, p.block(),
+      t.block(), loss->block(), ctx);
     }, {p.block(), t.block()}, {loss->block()});
   });
 }
@@ -1170,10 +1173,10 @@ void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) {
   if (p->nDim() == 2u) batchsize = p->shape(0);
   size_t dim = p->Size() / batchsize;
   TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
-    p->device()->Exec([batchsize, dim, t, p](Context *ctx) {
+    p->device()->Exec([batchsize, dim, t, p](Context * ctx) {
       bool int_target = t.Size() == batchsize;
       SoftmaxCrossEntropyBwd<DType, Lang>(int_target, batchsize, dim,
-          p->block(), t.block(), p->block(), ctx);
+      p->block(), t.block(), p->block(), ctx);
     }, {p->block(), t.block()}, {p->block()});
   });
 }
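
The Transpose() changes in this file never copy data: a transposed view simply reverses the shape and stride vectors and shares the underlying block. A minimal standalone sketch of that idea, using plain arrays and hypothetical names rather than the SINGA API:

#include <cstddef>
#include <vector>

// Read element (i, j) of a 2-D view described by strides, without copying.
float at(const float* data, const std::vector<std::size_t>& strides,
         std::size_t i, std::size_t j) {
  return data[i * strides[0] + j * strides[1]];
}

int main() {
  // 2x3 row-major tensor: shape {2,3}, strides {3,1}.
  float data[] = {1, 2, 3, 4, 5, 6};
  std::vector<std::size_t> strides = {3, 1};
  // Transposed view: shape {3,2}, strides {1,3}; same buffer, no copy.
  std::vector<std::size_t> t_strides = {1, 3};
  // at(data, strides, 0, 2) and at(data, t_strides, 2, 0) both read 3.
  return at(data, strides, 0, 2) == at(data, t_strides, 2, 0) ? 0 : 1;
}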

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2b75cb/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 1ca312a..bfdd026 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -32,13 +32,14 @@ namespace singa {
 
 // ===================== Helper Functions =============================
 
-//generate a traversal_info vector based on the tensor's shape for the traverse_next function to work
+// generate a traversal_info vector based on the tensor's shape for the
+// traverse_next function to work
 vector<int> generate_traversal_info(const Tensor& x) {
-    vector<int> traversal_info = {};
-    for(size_t n=0; n<(x.shape().size()+2); ++n) {
-      traversal_info.push_back(0);
-    }
-    return traversal_info;
+  vector<int> traversal_info = {};
+  for (size_t n = 0; n < (x.shape().size() + 2); ++n) {
+    traversal_info.push_back(0);
+  }
+  return traversal_info;
 };
 
 //generate shape multipliers
@@ -47,18 +48,18 @@ vector<int> generate_traversal_info(const Tensor& x) {
 //this means that the 3rd, 6th, and 9th index of the array will always be the starting element of their respective rows
 //so we need to need use the inner stride when jumping from 1st->2nd element, and outer stride when jumping from 2nd->3rd
 vector<int> generate_shape_multipliers(const Tensor& x) {
-    Shape y_shape = x.shape();
-    if(y_shape.size()==0){
-      return {1};
-    }
-    vector<int> shape_multipliers = {1};
-    int cumulative_product = 1;
+  Shape y_shape = x.shape();
+  if (y_shape.size() == 0) {
+    return {1};
+  }
+  vector<int> shape_multipliers = {1};
+  int cumulative_product = 1;
 
-    for (size_t n=0; n<(y_shape.size()-1); ++n) {
-        cumulative_product = cumulative_product*y_shape[y_shape.size()-1-n];
-        shape_multipliers.insert(shape_multipliers.begin(), cumulative_product);
-    }
-    return shape_multipliers;
+  for (size_t n = 0; n < (y_shape.size() - 1); ++n) {
+    cumulative_product = cumulative_product * y_shape[y_shape.size() - 1 - n];
+    shape_multipliers.insert(shape_multipliers.begin(), cumulative_product);
+  }
+  return shape_multipliers;
 };
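
generate_shape_multipliers computes, for each axis, how many logical elements one step along that axis spans: shape {3,3} gives {3,1}, so every third value of the traversal counter marks the start of a new row, as the comment above describes. A quick standalone sketch of the same cumulative product over plain vectors (hypothetical helper, not the SINGA function):

#include <cstddef>
#include <vector>

std::vector<int> shape_multipliers(const std::vector<int>& shape) {
  std::vector<int> m = {1};
  int prod = 1;
  for (std::size_t n = 0; n + 1 < shape.size(); ++n) {
    prod *= shape[shape.size() - 1 - n];
    m.insert(m.begin(), prod);  // {3,3} -> {3,1}; {2,3,4} -> {12,4,1}
  }
  return m;
}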
 
 // ******************************************************************************************
@@ -71,20 +72,20 @@ vector<int> generate_shape_multipliers(const Tensor& x) {
 //this additional check only has 1 loop for 2d matrix
 //but runtime performance might degrade to O(n log n) for higher dimensional tensors
 int determine_order(vector<int>& shape_multipliers, int counter) {
-    for (size_t n=0; n<(shape_multipliers.size()-1); ++n) {
-        if((counter%shape_multipliers[n])==0){
-            return ((shape_multipliers.size()) - 1 - n);
-        }
+  for (size_t n = 0; n < (shape_multipliers.size() - 1); ++n) {
+    if ((counter % shape_multipliers[n]) == 0) {
+      return ((shape_multipliers.size()) - 1 - n);
     }
-    return 0;
+  }
+  return 0;
 };
 
 //this function updates the base indexes with the current index after every single traversal step,
 //can be generalized beyond 2d cases
 void update_base_index(const Tensor& x, vector<int>& traversal_info) {
-    for (int n=0; n<(traversal_info[x.shape().size()+1]+1); ++n) {
-        traversal_info[n] = traversal_info[x.shape().size()];
-    }
+  for (int n = 0; n < (traversal_info[x.shape().size() + 1] + 1); ++n) {
+    traversal_info[n] = traversal_info[x.shape().size()];
+  }
 };
 
 //function to traverse a const strided tensor object
@@ -95,32 +96,32 @@ void update_base_index(const Tensor& x, vector<int>& traversal_info) {
 //index 3 stores the order of the traversal for e.g. if the order is 0,
 //it means the next element can be navigated to using the innermost stride
 void traverse_next(const Tensor& x,
-                   vector<int>& shape_multipliers, 
+                   vector<int>& shape_multipliers,
                    vector<int>& traversal_info,
                    int counter) {
 
-    update_base_index(x, traversal_info);
-    traversal_info[x.shape().size()+1] = determine_order(shape_multipliers, counter);
-    traversal_info[x.shape().size()] = traversal_info[traversal_info[x.shape().size()+1]] + 
-                                                   x.strides()[x.strides().size()-traversal_info[x.shape().size()+1]-1];
+  update_base_index(x, traversal_info);
+  traversal_info[x.shape().size() + 1] = determine_order(shape_multipliers, counter);
+  traversal_info[x.shape().size()] = traversal_info[traversal_info[x.shape().size() + 1]] +
+                                     x.strides()[x.strides().size() - traversal_info[x.shape().size() + 1] - 1];
 };
 
 template <typename DType>
-void TraverseUnary(const Tensor &in, Tensor* out, std::function<DType(DType)> func){
+void TraverseUnary(const Tensor &in, Tensor* out, std::function<DType(DType)> func) {
   DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
   const DType *inPtr = static_cast<const DType *>(in.block()->data());
   vector<int> traversal_info = generate_traversal_info(in);
   vector<int> shape_multipliers = generate_shape_multipliers(in);
 
-  for (size_t i = 0; i < in.Size(); i++) { 
+  for (size_t i = 0; i < in.Size(); i++) {
     outPtr[i] = func(inPtr[traversal_info[in.shape().size()]]);
-    traverse_next(in, shape_multipliers, traversal_info, i+1);
+    traverse_next(in, shape_multipliers, traversal_info, i + 1);
   }
 }
 
 template <typename DType>
-void TraverseBinary(const Tensor &in1, const Tensor &in2, Tensor* out, 
-                    std::function<DType(DType, DType)> func){
+void TraverseBinary(const Tensor &in1, const Tensor &in2, Tensor* out,
+                    std::function<DType(DType, DType)> func) {
   DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
   const DType *in1Ptr = static_cast<const DType *>(in1.block()->data());
   const DType *in2Ptr = static_cast<const DType *>(in2.block()->data());
@@ -132,8 +133,8 @@ void TraverseBinary(const Tensor &in1, const Tensor &in2, Tensor* out,
   for (size_t i = 0; i < in1.Size(); i++) {
     outPtr[i] = func(in1Ptr[traversal_info_in1[in1.shape().size()]],
                      in2Ptr[traversal_info_in2[in2.shape().size()]]);
-    traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i+1);
-    traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i+1);
+    traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i + 1);
+    traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i + 1);
   }
 }
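
TraverseUnary and TraverseBinary are the workhorses for the strided CPU kernels that follow: they walk the (possibly transposed) inputs in logical order while writing the output contiguously. A sketch of how a new element-wise op would plug into this pattern; Neg here is hypothetical and assumes a matching primary template is declared in tensor_math.h, like the ops below:

// Hypothetical element-wise negation built on TraverseUnary; it mirrors
// how Abs/Add/EltwiseMult below are specialized for lang::Cpp.
template <>
void Neg<float, lang::Cpp>(const Tensor& in, Tensor* out, Context *ctx) {
  TraverseUnary<float>(in, out, [](float x) { return -x; });
}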
 
@@ -151,7 +152,7 @@ void Abs<float, lang::Cpp>(const Tensor& in, Tensor* out, Context *ctx) {
 template <>
 void Add<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out, Context *ctx) {
   auto add_lambda = [&x](float a) {
-    return (a+x);
+    return (a + x);
   };
   TraverseUnary<float>(in, out, add_lambda);
 }
@@ -160,10 +161,10 @@ template <>
 void Add<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out, Context *ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
   auto add_lambda_binary = [](float a, float b) {
-    return (a+b);
+    return (a + b);
   };
   TraverseBinary<float>(in1, in2, out, add_lambda_binary);
-  
+
 }
 
 template <>
@@ -171,8 +172,8 @@ void Clamp<float, lang::Cpp>(const float low, const float high,
                              const Tensor& in, Tensor* out,
                              Context *ctx) {
   auto clamp_lambda = [&low, &high](float a) {
-    if(a < low){return low;}
-    else if(a > high){return high;}
+    if (a < low) {return low;}
+    else if (a > high) {return high;}
     else {return a;}
   };
   TraverseUnary<float>(in, out, clamp_lambda);
@@ -189,7 +190,7 @@ void Div<float, lang::Cpp>(const float x, const Tensor& in, Tensor* out,
   for (size_t i = 0; i < in.Size(); i++) {
     CHECK_NE(inPtr[traversal_info[in.shape().size()]], 0.f);
     outPtr[i] = x / inPtr[traversal_info[in.shape().size()]];
-    traverse_next(in, shape_multipliers, traversal_info, i+1);
+    traverse_next(in, shape_multipliers, traversal_info, i + 1);
   }
 }
 
@@ -207,8 +208,8 @@ void Div<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
   for (size_t i = 0; i < in1.Size(); i++) {
     CHECK_NE(in2Ptr[traversal_info_in2[in2.shape().size()]], 0.f);
     outPtr[i] = in1Ptr[traversal_info_in1[in1.shape().size()]] / in2Ptr[traversal_info_in2[in2.shape().size()]];
-    traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i+1);
-    traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i+1);
+    traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i + 1);
+    traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i + 1);
   }
 }
 
@@ -216,16 +217,16 @@ template <>
 void EltwiseMult<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
                                    Context *ctx) {
   auto eltwisemult_lambda = [&x](float a) {
-    return (a*x);
+    return (a * x);
   };
   TraverseUnary<float>(in, out, eltwisemult_lambda);
 }
 
 template <>
-void EltwiseMult<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out, 
+void EltwiseMult<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
                                    Context *ctx) {
   auto eltwisemult_lambda_binary = [](float a, float b) {
-    return (a*b);
+    return (a * b);
   };
   TraverseBinary<float>(in1, in2, out, eltwisemult_lambda_binary);
 }
@@ -300,7 +301,7 @@ void Log<float, lang::Cpp>(const Tensor& in, Tensor* out,
   for (size_t i = 0; i < in.Size(); i++) {
     CHECK_GT(inPtr[traversal_info[in.shape().size()]], 0.f);
     outPtr[i] = log(inPtr[traversal_info[in.shape().size()]]);
-    traverse_next(in, shape_multipliers, traversal_info, i+1);
+    traverse_next(in, shape_multipliers, traversal_info, i + 1);
   }
 }
 
@@ -325,21 +326,21 @@ void LT<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
 
 template <>
 void Pow<float, lang::Cpp>(const Tensor& in, const float x, Tensor *out, Context *ctx) {
-  TraverseUnary<float>(in, out, [x](float y) {return pow(y,x);});
+  TraverseUnary<float>(in, out, [x](float y) {return pow(y, x);});
 }
 
 template <>
 void Pow<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
                            Context *ctx) {
   auto pow_lambda_binary = [](float a, float b) {
-    return pow(a,b);
+    return pow(a, b);
   };
   TraverseBinary<float>(in1, in2, out, pow_lambda_binary);
 }
 
 template <>
 void ReLU<float, lang::Cpp>(const Tensor& in, Tensor* out,
-                          Context *ctx) {
+                            Context *ctx) {
   auto relu_lambda = [](float a) {
     return (a >= 0.f) ? a : 0.f;
   };
@@ -355,14 +356,14 @@ void Set<float, lang::Cpp>(const float x, Tensor* out,
 
 template <>
 void Set<int, lang::Cpp>(const int x, Tensor* out,
-                           Context *ctx) {
+                         Context *ctx) {
   int *outPtr = static_cast<int *>(out->block()->mutable_data());
   for (size_t i = 0; i < out->Size(); i++) outPtr[i] = x;
 }
 
 template <>
 void Sigmoid<float, lang::Cpp>(const Tensor& in, Tensor* out,
-                          Context *ctx) {
+                               Context *ctx) {
   auto sigmoid_lambda = [](float a) {
     return 1.f / (1.f + exp(-a));
   };
@@ -371,7 +372,7 @@ void Sigmoid<float, lang::Cpp>(const Tensor& in, Tensor* out,
 
 template <>
 void Sign<float, lang::Cpp>(const Tensor& in, Tensor* out,
-                          Context *ctx) {
+                            Context *ctx) {
   auto sign_lambda = [](float a) {
     return (a > 0) - (a < 0);
   };
@@ -389,7 +390,7 @@ void Sqrt<float, lang::Cpp>(const Tensor& in, Tensor* out,
   for (size_t i = 0; i < in.Size(); i++) {
     CHECK_GE(inPtr[traversal_info[in.shape().size()]], 0.f);
     outPtr[i] = sqrt(inPtr[traversal_info[in.shape().size()]]);
-    traverse_next(in, shape_multipliers, traversal_info, i+1);
+    traverse_next(in, shape_multipliers, traversal_info, i + 1);
   }
 }
 
@@ -398,7 +399,7 @@ void Sub<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
                            Tensor* out, Context *ctx) {
   // CHECK_EQ(ctx->stream, nullptr);
   auto sub_lambda_binary = [](float a, float b) {
-    return (a-b);
+    return (a - b);
   };
   TraverseBinary<float>(in1, in2, out, sub_lambda_binary);
 }
@@ -418,7 +419,7 @@ void Sum<float, lang::Cpp>(const Tensor& in, float *out,
 
 template <>
 void Tanh<float, lang::Cpp>(const Tensor& in, Tensor* out,
-                          Context *ctx) {
+                            Context *ctx) {
   auto tanh_lambda = [](float a) {
     return tanh(a);
   };
@@ -475,7 +476,7 @@ void DGMM<float, lang::Cpp>(const bool side_right,
       size_t offset = r * ncol;
       for (size_t c = 0; c < ncol; c++) {
         outPtr[traversal_info[M.shape().size()]] = MPtr[traversal_info[M.shape().size()]] * vPtr[c];
-        traverse_next(M, shape_multipliers, traversal_info, offset+c+1);
+        traverse_next(M, shape_multipliers, traversal_info, offset + c + 1);
       }
     }
   } else {
@@ -483,7 +484,7 @@ void DGMM<float, lang::Cpp>(const bool side_right,
       size_t offset = r * ncol;
       for (size_t c = 0; c < ncol; c++) {
         outPtr[traversal_info[M.shape().size()]] = MPtr[traversal_info[M.shape().size()]] * vPtr[r];
-        traverse_next(M, shape_multipliers, traversal_info, offset+c+1);
+        traverse_next(M, shape_multipliers, traversal_info, offset + c + 1);
       }
     }
   }
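
For reference, DGMM scales M by v along one dimension: with side_right it multiplies element (r, c) by v[c], otherwise by v[r], using the strided traversal to cope with transposed layouts. A dense, stride-free sketch of the same arithmetic over plain row-major arrays (hypothetical helper, not the SINGA kernel):

#include <cstddef>

// out = M * diag(v) when side_right, else out = diag(v) * M,
// for row-major nrow x ncol matrices; mirrors the two loops above.
void dgmm(bool side_right, const float* M, const float* v, float* out,
          std::size_t nrow, std::size_t ncol) {
  for (std::size_t r = 0; r < nrow; ++r)
    for (std::size_t c = 0; c < ncol; ++c)
      out[r * ncol + c] = M[r * ncol + c] * (side_right ? v[c] : v[r]);
}
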
@@ -509,7 +510,7 @@ template <>
 void Axpy<float, lang::Cpp>(const float alpha,
                             const Tensor& in, Tensor *out, Context *ctx) {
   //check input tensor for strides first
-  if(in.strides() == out->strides()){
+  if (in.strides() == out->strides()) {
     const float *inPtr = static_cast<const float *>(in.block()->data());
     float *outPtr = static_cast<float *>(out->block()->mutable_data());
     cblas_saxpy(in.Size(), alpha, inPtr, 1, outPtr, 1);
@@ -522,7 +523,7 @@ template <>
 void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
                            float *out, Context *ctx) {
   //check input tensor for strides first
-  if(!(in1.transpose()) && !(in2.transpose())){
+  if (!(in1.transpose()) && !(in2.transpose())) {
     const float *in1Ptr = static_cast<const float *>(in1.block()->data());
     const float *in2Ptr = static_cast<const float *>(in2.block()->data());
     *out = cblas_sdot(in1.Size(), in1Ptr, 1, in2Ptr, 1);
@@ -580,10 +581,10 @@ void GEMM<float, lang::Cpp>(const float alpha,
   const float *BPtr = static_cast<const float *>(B.block()->data());
   float *CPtr = static_cast<float *>(C->block()->mutable_data());
   cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
-    lda, BPtr, ldb, beta, CPtr, ldc);
+              lda, BPtr, ldb, beta, CPtr, ldc);
 }
 
-#else    
+#else
 
 template <>
 void Amax<float, lang::Cpp>(const Tensor& in, size_t *out,
@@ -636,9 +637,9 @@ void Axpy<float, lang::Cpp>(const float alpha,
   vector<int> traversal_info = generate_traversal_info(in);
   vector<int> shape_multipliers = generate_shape_multipliers(in);
 
-  for (size_t i = 0; i < in.Size(); i++) { 
+  for (size_t i = 0; i < in.Size(); i++) {
     outPtr[i] += alpha * inPtr[traversal_info[in.shape().size()]];
-    traverse_next(in, shape_multipliers, traversal_info, i+1);
+    traverse_next(in, shape_multipliers, traversal_info, i + 1);
   }
 }
 
@@ -658,7 +659,7 @@ void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
   // const float *in1Ptr = static_cast<const float *>(in1.data());
   // const float *in2Ptr = static_cast<const float *>(in2.data());
   // for (size_t i = 0; i < in.Size(); i++) {
-  //   sum += in1Ptr[i] * in2Ptr[i]; 
+  //   sum += in1Ptr[i] * in2Ptr[i];
   // }
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
   const float *in1Ptr = static_cast<const float *>(in1.block()->data());
@@ -670,8 +671,8 @@ void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
 
   for (size_t i = 0; i < in1.Size(); i++) {
     sum += in1Ptr[traversal_info_in1[in1.shape().size()]] * in2Ptr[traversal_info_in2[in2.shape().size()]];
-    traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i+1);
-    traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i+1);
+    traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i + 1);
+    traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i + 1);
   }
 }
 
@@ -697,10 +698,10 @@ void GEMV<float, lang::Cpp>(const float alpha, const Tensor& A, const Tensor& v,
 #endif  // USE_CBLAS
 template <>
 void ComputeCrossEntropy<float, lang::Cpp>(bool int_target,
-                                           const size_t batchsize,
-                                           const size_t dim, const Block *p,
-                                           const Block *t, Block *loss,
-                                           Context *ctx) {
+    const size_t batchsize,
+    const size_t dim, const Block *p,
+    const Block *t, Block *loss,
+    Context *ctx) {
   const float *pPtr = static_cast<const float *>(p->data());
   const int *tPtr = static_cast<const int *>(t->data());
   float *lossPtr = static_cast<float *>(loss->mutable_data());
@@ -712,7 +713,7 @@ void ComputeCrossEntropy<float, lang::Cpp>(bool int_target,
       lossPtr[i] = -std::log((std::max)(prob_of_truth, FLT_MIN));
     }
   } else {
-    for (size_t i = 0;i < batchsize; i++) {
+    for (size_t i = 0; i < batchsize; i++) {
       float sum = 0.f;
       for (size_t j = 0; j < dim; j++) {
         sum += tPtr[i * dim + j];
@@ -728,10 +729,10 @@ void ComputeCrossEntropy<float, lang::Cpp>(bool int_target,
 
 template <>
 void SoftmaxCrossEntropyBwd<float, lang::Cpp>(bool int_target,
-                                              const size_t batchsize,
-                                              const size_t dim, const Block *p,
-                                              const Block *t, Block *grad,
-                                              Context *ctx) {
+    const size_t batchsize,
+    const size_t dim, const Block *p,
+    const Block *t, Block *grad,
+    Context *ctx) {
   CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
   // const float* pPtr = static_cast<const float*>(p->data());
   const int *tPtr = static_cast<const int *>(t->data());
@@ -764,13 +765,13 @@ void RowMax<float, lang::Cpp>(const Tensor& in, Tensor *out, Context *ctx) {
   const size_t ncol = in.shape()[1];
   vector<int> traversal_info = generate_traversal_info(in);
   vector<int> shape_multipliers = generate_shape_multipliers(in);
-    
+
   for (size_t r = 0; r < nrow; r++) {
     int counter_offset = (r * ncol);
     float maxval = -FLT_MAX;
-    for (size_t c = 0; c < ncol; c++){
+    for (size_t c = 0; c < ncol; c++) {
       maxval = (std::max)(maxval, inPtr[traversal_info[in.shape().size()]]);
-      traverse_next(in, shape_multipliers, traversal_info, counter_offset+c+1);
+      traverse_next(in, shape_multipliers, traversal_info, counter_offset + c + 1);
     }
     outPtr[r] = maxval;
   }
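
One detail worth noting from the CPU ComputeCrossEntropy above: the probability of the true class is clamped with FLT_MIN before the log, so an exactly-zero prediction produces a large finite loss rather than infinity. A minimal standalone illustration of that clamping (hypothetical values, not SINGA code):

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstdio>

int main() {
  float probs[] = {0.7f, 0.0f};  // predicted probability of the true class
  for (float p : probs) {
    // Same clamp as the kernel above: avoids -log(0) = inf.
    float loss = -std::log((std::max)(p, FLT_MIN));
    std::printf("p = %g -> loss = %g\n", p, loss);
  }
  return 0;
}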

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2b75cb/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 6e86ca7..55d6a1b 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -34,45 +34,45 @@ namespace singa {
 
 // ===================== Helper Functions =============================
 
-  /*  
-  cudnn requires tensor dimensions to fulfill 1 requirement:
-    1.) Dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors 
-        if input tensor is 5d, cudnn will take a 5d tensor as input. Beyond 5d, certain operations are not supported.
-        (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
-
-    for e.g. Tensor A has shape {3,3}, cudnn requires shape of {1,1,3,3} to be the input
-             Tensor B has shape (2,3,4), cudnn requires shape of {1,2,3,4} to be the input
-  */
-  vector<int> generate_shape_cuda(const Tensor& x) {
-    Shape shape_ = x.shape();
-    vector<int> shape_arr;
-    if(shape_.size() <= 4){
-      for (size_t n=0; n<4-shape_.size(); ++n) {
-        shape_arr.push_back(1);
-      } 
-      for (size_t n=0; n<shape_.size(); ++n) {
-        shape_arr.push_back(shape_.at(n));
-      } 
-      return shape_arr;
-    } else if(shape_.size() == 5){
-      for (size_t n=0; n<shape_.size(); ++n) {
-        shape_arr.push_back(shape_.at(n));
-      } 
-      return shape_arr;
-    } else {
-      LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
+/*
+cudnn requires tensor dimensions to fulfill 1 requirement:
+  1.) Dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors
+      if input tensor is 5d, cudnn will take a 5d tensor as input. Beyond 5d, certain operations are not supported.
+      (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
+
+  for e.g. Tensor A has shape {3,3}, cudnn requires shape of {1,1,3,3} to be the input
+           Tensor B has shape (2,3,4), cudnn requires shape of {1,2,3,4} to be the input
+*/
+vector<int> generate_shape_cuda(const Tensor& x) {
+  Shape shape_ = x.shape();
+  vector<int> shape_arr;
+  if (shape_.size() <= 4) {
+    for (size_t n = 0; n < 4 - shape_.size(); ++n) {
+      shape_arr.push_back(1);
+    }
+    for (size_t n = 0; n < shape_.size(); ++n) {
+      shape_arr.push_back(shape_.at(n));
     }
+    return shape_arr;
+  } else if (shape_.size() == 5) {
+    for (size_t n = 0; n < shape_.size(); ++n) {
+      shape_arr.push_back(shape_.at(n));
+    }
+    return shape_arr;
+  } else {
+    LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported";
   }
+}
 
-  int generate_dim_cuda(const Tensor& x) {
-    if(x.shape().size() <= 4){return 4;}
-    else if(x.shape().size() == 5){return 5;}
-    else{
-      LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
-    } 
+int generate_dim_cuda(const Tensor& x) {
+  if (x.shape().size() <= 4) {return 4;}
+  else if (x.shape().size() == 5) {return 5;}
+  else {
+    LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported";
   }
+}
 
-/*  
+/*
   cudnn requires stride dimensions to conform to the format of the shape input as well
     1.) Stride dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors
         If input tensor is 5d, cudnn will take a 5d tensor as input. Beyond 5d, certain operations are not supported.
@@ -81,51 +81,51 @@ namespace singa {
     for e.g. Tensor A has shape {3,3}, stride {3,1}, cudnn requires shape {1,1,3,3}
     and stride {9, 9, 3, 1} or {9, 9, 1, 3} to be the inputs
   */
-  vector<int> generate_strides_cuda(const Tensor& x) {
-    Shape shape_ = x.shape();
-    vector<int> strides_ = x.strides();
-    vector<int> strides_arr;
-    int product = 1;
-    for (size_t n=0; n<(shape_.size()); ++n) {
-      product *= shape_[n];
+vector<int> generate_strides_cuda(const Tensor& x) {
+  Shape shape_ = x.shape();
+  vector<int> strides_ = x.strides();
+  vector<int> strides_arr;
+  int product = 1;
+  for (size_t n = 0; n < (shape_.size()); ++n) {
+    product *= shape_[n];
+  }
+  if (shape_.size() <= 4) {
+    for (size_t n = 0; n < 4 - shape_.size(); ++n) {
+      strides_arr.push_back(product);
+    }
+    for (size_t n = 0; n < strides_.size(); ++n) {
+      strides_arr.push_back(strides_[n]);
     }
-    if(shape_.size() <= 4){
-      for (size_t n=0; n<4-shape_.size(); ++n) {
-        strides_arr.push_back(product);
-      } 
-      for (size_t n=0; n<strides_.size(); ++n) {
-          strides_arr.push_back(strides_[n]);
-        }
-      return strides_arr;
-    } else if(shape_.size() == 5){
-      for (size_t n=0; n<strides_.size(); ++n) {
-          strides_arr.push_back(strides_[n]);
-        }
-      return strides_arr;
-    } else {
-      LOG(FATAL) << "Dimensions (strides) beyond 5 are currently not supported" ;
+    return strides_arr;
+  } else if (shape_.size() == 5) {
+    for (size_t n = 0; n < strides_.size(); ++n) {
+      strides_arr.push_back(strides_[n]);
     }
+    return strides_arr;
+  } else {
+    LOG(FATAL) << "Dimensions (strides) beyond 5 are currently not supported";
   }
+}
 
-cudnnTensorDescriptor_t generate_tensorND_desc(const Tensor& x){
+cudnnTensorDescriptor_t generate_tensorND_desc(const Tensor& x) {
   cudnnTensorDescriptor_t x_desc;
   cudnnCreateTensorDescriptor(&x_desc);
   cudnnSetTensorNdDescriptor(x_desc, CUDNN_DATA_FLOAT,
                              generate_dim_cuda(x),
                              generate_shape_cuda(x).data(),
                              generate_strides_cuda(x).data()
-                             );
+                            );
 
   return x_desc;
 }
 
-cudnnOpTensorDescriptor_t generate_Op_desc(cudnnOpTensorOp_t op){
+cudnnOpTensorDescriptor_t generate_Op_desc(cudnnOpTensorOp_t op) {
   cudnnOpTensorDescriptor_t op_desc;
   cudnnCreateOpTensorDescriptor(&op_desc);
   cudnnSetOpTensorDescriptor(op_desc, op,
                              CUDNN_DATA_FLOAT,
                              CUDNN_PROPAGATE_NAN
-                             );
+                            );
 
   return op_desc;
 }
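
The descriptor helpers above pad every shape to at least four dimensions by prepending 1s (and pad the strides with the total element count), because cudnnSetTensorNdDescriptor does not accept descriptors below 4-D. A small host-side sketch of just the shape padding, independent of cuDNN (hypothetical helper name):

#include <cstdio>
#include <vector>

// Prepend 1s until the shape has at least 4 dims, as generate_shape_cuda
// above does for tensors of dim <= 4; e.g. {3,3} -> {1,1,3,3}.
std::vector<int> pad_shape_to_4d(const std::vector<int>& shape) {
  std::vector<int> padded(shape.size() < 4 ? 4 - shape.size() : 0, 1);
  padded.insert(padded.end(), shape.begin(), shape.end());
  return padded;
}

int main() {
  for (int d : pad_shape_to_4d({3, 3})) std::printf("%d ", d);  // 1 1 3 3
  std::printf("\n");
  return 0;
}
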
@@ -144,10 +144,10 @@ void Abs<float, lang::Cuda>(const Tensor& in, Tensor* out,
   float beta = 0.0;
   cudnnTensorDescriptor_t in_desc = generate_tensorND_desc(in);
   cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_MAX),
-                (void*)(&alpha1), in_desc, inPtr, 
+                (void*)(&alpha1), in_desc, inPtr,
                 (void*)(&alpha2), in_desc, inPtr,
                 (void*)(&beta), generate_tensorND_desc(*out), outPtr
-                );
+               );
   cudnnDestroyTensorDescriptor(in_desc);
 }
 
@@ -156,8 +156,8 @@ void Set<float, lang::Cuda>(const float x, Tensor* out,
                             Context* ctx) {
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
-  cudnnSetTensor(ctx->cudnn_handle, generate_tensorND_desc(*out), 
-                  outPtr, (void*)(&x));
+  cudnnSetTensor(ctx->cudnn_handle, generate_tensorND_desc(*out),
+                 outPtr, (void*)(&x));
 }
 
 template <>
@@ -171,7 +171,7 @@ void Add<float, lang::Cuda>(const Tensor& in, const float x,
   cudnnAddTensor(ctx->cudnn_handle,
                  (void*)(&alpha), generate_tensorND_desc(in), inPtr,
                  (void*)(&beta), generate_tensorND_desc(*out), outPtr
-                 );
+                );
 }
 
 /// out = in1 + in2
@@ -186,18 +186,18 @@ void Add<float, lang::Cuda>(const Tensor& in1,
   float alpha2 = 1.0;
   float beta = 0.0;
 
-  if((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)){
+  if ((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)) {
     cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
-              (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
-              (void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
-              (void*)(&beta), generate_tensorND_desc(*out), outPtr
-              );
+                  (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
+                  (void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
+                  (void*)(&beta), generate_tensorND_desc(*out), outPtr
+                 );
   } else {
     cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
-          (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
-          (void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
-          (void*)(&beta), generate_tensorND_desc(*out), outPtr
-          );
+                  (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
+                  (void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
+                  (void*)(&beta), generate_tensorND_desc(*out), outPtr
+                 );
   }
 }
 
@@ -213,18 +213,18 @@ void Sub<float, lang::Cuda>(const Tensor& in1,
   float alpha2 = -1.0;
   float beta = 0.0;
 
-  if((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)){
+  if ((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)) {
     cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
-              (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
-              (void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
-              (void*)(&beta), generate_tensorND_desc(*out), outPtr
-              );
+                  (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
+                  (void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
+                  (void*)(&beta), generate_tensorND_desc(*out), outPtr
+                 );
   } else {
     cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
-          (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
-          (void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
-          (void*)(&beta), generate_tensorND_desc(*out), outPtr
-          );
+                  (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
+                  (void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
+                  (void*)(&beta), generate_tensorND_desc(*out), outPtr
+                 );
   }
 }
 
@@ -250,17 +250,17 @@ void Div<float, lang::Cuda>(const Tensor& in1,
   const size_t num = in1.Size();
 
   //if both in1 and in2 strides are the same, we proceed to normal cuda::div
-  if(in1.strides() == in2.strides()){
-        cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
-        out->set_strides(in1.strides());
+  if (in1.strides() == in2.strides()) {
+    cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
+    out->set_strides(in1.strides());
   } else { //else we transform in1 to out to store first
     float alpha = 1.0;
     float beta = 0.0;
 
     out->set_strides(in2.strides());
     cudnnTransformTensor(ctx->cudnn_handle,
-                        (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
-                        (void*)(&beta), generate_tensorND_desc(*out), outPtr
+                         (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
+                         (void*)(&beta), generate_tensorND_desc(*out), outPtr
                         );
 
     cuda::div(num, outPtr, inPtr2, outPtr, ctx->stream);
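
The dispatch in Div above, and again in EltwiseMult and Pow below, follows one pattern: if the two inputs share a stride layout, launch the plain element-wise kernel directly; otherwise first rewrite in1 into out with cudnnTransformTensor (adopting in2's layout), then run the kernel in place on out and in2. A layout-free sketch of that control flow, with hypothetical callables standing in for the kernel and the transform:

// Hypothetical skeleton of the strided binary-op dispatch used above.
template <typename Kernel, typename Transform>
void binary_with_strides(bool same_strides, Kernel kernel,
                         Transform transform_in1_to_out) {
  if (same_strides) {
    kernel();                // operate directly: (in1, in2) -> out
  } else {
    transform_in1_to_out();  // copy in1 into out using in2's layout
    kernel();                // then operate in place: (out, in2) -> out
  }
}
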
@@ -286,8 +286,8 @@ void EltwiseMult<float, lang::Cuda>(const Tensor& in,
 
   float alpha = x, beta = 0.0;
   cudnnAddTensor(ctx->cudnn_handle,
-                (void*)(&alpha), generate_tensorND_desc(in), inPtr,
-                (void*)(&beta), generate_tensorND_desc(*out), outPtr
+                 (void*)(&alpha), generate_tensorND_desc(in), inPtr,
+                 (void*)(&beta), generate_tensorND_desc(*out), outPtr
                 );
 }
 
@@ -302,17 +302,17 @@ void EltwiseMult<float, lang::Cuda>(const Tensor& in1,
   const size_t num = in1.Size();
 
   //if both in1 and in2 strides are the same, we proceed to normal cuda::mult
-  if(in1.strides() == in2.strides()){ 
-        cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
-        out->set_strides(in1.strides());
+  if (in1.strides() == in2.strides()) {
+    cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
+    out->set_strides(in1.strides());
   } else { //else we transform in1 to out to store first
     float alpha = 1.0;
     float beta = 0.0;
 
     out->set_strides(in2.strides());
     cudnnTransformTensor(ctx->cudnn_handle,
-                        (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
-                        (void*)(&beta), generate_tensorND_desc(*out), outPtr
+                         (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
+                         (void*)(&beta), generate_tensorND_desc(*out), outPtr
                         );
 
     cuda::mult(num, outPtr, inPtr2, outPtr, ctx->stream);
@@ -443,17 +443,17 @@ void Pow<float, lang::Cuda>(const Tensor& in1,
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   const size_t num = in1.Size();
 
-  if(in1.strides() == in2.strides()){
-        cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
-        out->set_strides(in1.strides());
+  if (in1.strides() == in2.strides()) {
+    cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
+    out->set_strides(in1.strides());
   } else { //else we transform in1 to out to store first
     float alpha = 1.0;
     float beta = 0.0;
 
     out->set_strides(in2.strides());
     cudnnTransformTensor(ctx->cudnn_handle,
-                        (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
-                        (void*)(&beta), generate_tensorND_desc(*out), outPtr
+                         (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
+                         (void*)(&beta), generate_tensorND_desc(*out), outPtr
                         );
 
     cuda::pow(num, outPtr, inPtr2, outPtr, ctx->stream);
@@ -473,18 +473,18 @@ void Pow<float, lang::Cuda>(const Tensor& in1,
 //   double coef = 0.0; //only used for CLIPPED_RELU or ELU
 //   cudnnCreateActivationDescriptor(&act_desc);
 //   cudnnSetActivationDescriptor(act_desc, mode, cudnn_propagation, coef);
-  
+
 //   float alpha[1] = {1.0};
 //   float beta[1] = {0.0};
 //   cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
 //   cudnnTensorDescriptor_t in_desc, out_desc;
 //   cudnnCreateTensorDescriptor(&in_desc);
 //   cudnnCreateTensorDescriptor(&out_desc);
-//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(), 
+//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
 // in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
-//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), 
+//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
 // out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-//   cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr, 
+//   cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
 //                         (void*)(&beta), out_desc, outPtr);
 
 //   cudnnDestroyTensorDescriptor(in_desc);
@@ -515,18 +515,18 @@ void ReLU<float, lang::Cuda>(const Tensor& in, Tensor* out,
 //   double coef = 0.0; //only used for CLIPPED_RELU or ELU
 //   cudnnCreateActivationDescriptor(&act_desc);
 //   cudnnSetActivationDescriptor(act_desc, mode, cudnn_propagation, coef);
-  
+
 //   float alpha[1] = {1.0};
 //   float beta[1] = {0.0};
 //   cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
 //   cudnnTensorDescriptor_t in_desc, out_desc;
 //   cudnnCreateTensorDescriptor(&in_desc);
 //   cudnnCreateTensorDescriptor(&out_desc);
-//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(), 
+//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
 // in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
-//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), 
+//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
 // out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-//   cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr, 
+//   cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
 //                         (void*)(&beta), out_desc, outPtr);
 
 //   cudnnDestroyTensorDescriptor(in_desc);
@@ -562,16 +562,16 @@ void Sqrt<float, lang::Cuda>(const Tensor& in, Tensor* out,
                              Context* ctx) {
   const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  
+
   float alpha1 = 1.0;
   float alpha2 = 0.0;
   float beta = 0.0;
   cudnnTensorDescriptor_t in_desc = generate_tensorND_desc(in);
   cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_SQRT),
-                (void*)(&alpha1), in_desc, inPtr, 
+                (void*)(&alpha1), in_desc, inPtr,
                 (void*)(&alpha2), in_desc, inPtr,
                 (void*)(&beta), generate_tensorND_desc(*out), outPtr
-                );
+               );
 }
 
 /// Element-wise operation, out[i]=in[i]^2
@@ -598,15 +598,15 @@ void Sum<float, lang::Cuda>(const Tensor& in, float* out,
                             Context* ctx) {
   const float* inPtr = static_cast<const float*>(in.block()->data());
 
-   //reduce all axes to 1 for cudnnReduce, e.g. Tensor A with shape (2,4) will be reduced to (1)
-   Shape reduced_shape = {1};
-   Tensor t(reduced_shape, in.device(), in.data_type());
-   float* tPtr = static_cast<float*>(t.block()->mutable_data());
-   vector<int> reduce_all_axes = generate_shape_cuda(in);
-   for (size_t n=0; n<reduce_all_axes.size(); ++n) {
+  //reduce all axes to 1 for cudnnReduce, e.g. Tensor A with shape (2,4) will be reduced to (1)
+  Shape reduced_shape = {1};
+  Tensor t(reduced_shape, in.device(), in.data_type());
+  float* tPtr = static_cast<float*>(t.block()->mutable_data());
+  vector<int> reduce_all_axes = generate_shape_cuda(in);
+  for (size_t n = 0; n < reduce_all_axes.size(); ++n) {
     reduce_all_axes[n] = 1;
-   }
-   
+  }
+
   //reduce_desc
   cudnnReduceTensorDescriptor_t reduce_desc;
   cudnnReduceTensorOp_t reduce_op = CUDNN_REDUCE_TENSOR_ADD;
@@ -620,11 +620,11 @@ void Sum<float, lang::Cuda>(const Tensor& in, float* out,
 
   //instantiate 2 new tensors to use new blocks as memory instead of cudaMalloc
   size_t reduction_size_int = Product(in.shape());
-  Shape reduction_size = {reduction_size_int*100};
+  Shape reduction_size = {reduction_size_int * 100};
   Tensor indices(reduction_size, in.device(), in.data_type());
   Tensor workspace(reduction_size, in.device(), in.data_type());
-  size_t indices_bytes = indices.block()->size()*100;
-  size_t workspace_bytes = workspace.block()->size()*100;
+  size_t indices_bytes = indices.block()->size() * 100;
+  size_t workspace_bytes = workspace.block()->size() * 100;
   size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
   float* workspacePtr = static_cast<float*>(workspace.block()->mutable_data());
   //void* indicesPtr{nullptr}; void* workspacePtr{nullptr};
@@ -636,7 +636,7 @@ void Sum<float, lang::Cuda>(const Tensor& in, float* out,
                     indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
                     (void*)(&alpha), generate_tensorND_desc(in), inPtr,
                     (void*)(&beta), generate_tensorND_desc(t), tPtr
-                    );
+                   );
 
   *out = tPtr[0];
 }
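
Sum above performs a full reduction by handing cudnnReduceTensor an output descriptor whose every dimension is 1, which is what the reduce_all_axes loop prepares. A trivial sketch of that shape collapse (hypothetical helper, no cuDNN required):

#include <vector>

// Collapse every axis to 1, as the reduce_all_axes loop in Sum does
// before building the output descriptor; e.g. {1,1,2,4} -> {1,1,1,1}.
std::vector<int> reduce_all_axes_shape(std::vector<int> shape) {
  for (int& d : shape) d = 1;
  return shape;
}
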
@@ -655,18 +655,18 @@ void Sum<float, lang::Cuda>(const Tensor& in, float* out,
 //   double coef = 0.0; //only used for CLIPPED_RELU or ELU
 //   cudnnCreateActivationDescriptor(&act_desc);
 //   cudnnSetActivationDescriptor(act_desc, mode, cudnn_propagation, coef);
-  
+
 //   float alpha[1] = {1.0};
 //   float beta[1] = {0.0};
 //   cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
 //   cudnnTensorDescriptor_t in_desc, out_desc;
 //   cudnnCreateTensorDescriptor(&in_desc);
 //   cudnnCreateTensorDescriptor(&out_desc);
-//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(), 
+//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
 // in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
-//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), 
+//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
 // out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-//   cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr, 
+//   cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
 //                         (void*)(&beta), out_desc, outPtr);
 
 //   cudnnDestroyTensorDescriptor(in_desc);
@@ -676,7 +676,7 @@ void Sum<float, lang::Cuda>(const Tensor& in, float* out,
 
 template <>
 void Tanh<float, lang::Cuda>(const Tensor& in, Tensor* out,
-                                Context* ctx) {
+                             Context* ctx) {
   const float* inPtr = static_cast<const float*>(in.block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   const size_t num = in.Size();
@@ -856,22 +856,22 @@ void GEMM<float, lang::Cuda>(const float alpha,
 
 template <>
 void ComputeCrossEntropy<float, lang::Cuda>(bool int_target,
-                                            const size_t batchsize,
-                                            const size_t dim, const Block* p,
-                                            const Block* t, Block* loss,
-                                            Context* ctx) {
+    const size_t batchsize,
+    const size_t dim, const Block* p,
+    const Block* t, Block* loss,
+    Context* ctx) {
   const float* pPtr = static_cast<const float*>(p->data());
   const int* tPtr = static_cast<const int*>(t->data());
   float* lossPtr = static_cast<float*>(loss->mutable_data());
   cuda::ComputeCrossEntropy(int_target, batchsize, dim, pPtr, tPtr, lossPtr,
-      ctx->stream);
+                            ctx->stream);
 }
 template <>
 void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
-                                               const size_t batchsize,
-                                               const size_t dim, const Block* p,
-                                               const Block* t, Block* grad,
-                                               Context* ctx) {
+    const size_t batchsize,
+    const size_t dim, const Block* p,
+    const Block* t, Block* grad,
+    Context* ctx) {
   CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
   const float* pPtr = static_cast<const float*>(p->data());
   const int* tPtr = static_cast<const int*>(t->data());
@@ -924,11 +924,11 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
 //   cudnnTensorDescriptor_t in_desc, out_desc;
 //   cudnnCreateTensorDescriptor(&in_desc);
 //   cudnnCreateTensorDescriptor(&out_desc);
-//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(), 
+//   cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
 // in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
-//   //cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), 
+//   //cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
 // out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), 
+//   cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
 // reduce_row_axes_shape.data(), reduced_strides.data());
 //   cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
 //                     indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
@@ -946,7 +946,7 @@ void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out,
   const size_t nrow = in.shape()[0];
   const size_t ncol = in.shape()[1];
 
-  if(in.transpose()){
+  if (in.transpose()) {
     Tensor t(in.shape(), in.device(), in.data_type());
     float* tPtr = static_cast<float*>(t.block()->mutable_data());
 
@@ -954,8 +954,8 @@ void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out,
     float beta = 0.0;
 
     cudnnTransformTensor(ctx->cudnn_handle,
-                        (void*)(&alpha), generate_tensorND_desc(in), inPtr,
-                        (void*)(&beta), generate_tensorND_desc(t), tPtr
+                         (void*)(&alpha), generate_tensorND_desc(in), inPtr,
+                         (void*)(&beta), generate_tensorND_desc(t), tPtr
                         );
 
     const float* tPtr_const = static_cast<const float*>(t.block()->data());


[05/10] incubator-singa git commit: Merge branch 'master' of github.com:apache/incubator-singa into SINGA-341-351

Posted by wa...@apache.org.
Merge branch 'master' of github.com:apache/incubator-singa into SINGA-341-351


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/a44d2e76
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/a44d2e76
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/a44d2e76

Branch: refs/heads/master
Commit: a44d2e76b70a54e6ad1e063c0b8e895c43018b63
Parents: 75f9a0e b5600d3
Author: Vaan Ng <cm...@gmail.com>
Authored: Fri May 11 15:29:56 2018 +0800
Committer: Vaan Ng <cm...@gmail.com>
Committed: Fri May 11 15:29:56 2018 +0800

----------------------------------------------------------------------
 .gitignore                                      |   1 +
 .travis.yml                                     |   3 +-
 CMakeLists.txt                                  |  14 +-
 cmake/Dependencies.cmake                        |  14 +-
 doc/_static/style.css                           |   3 +
 doc/_templates/layout.html                      |   2 +-
 doc/conf.py                                     |   8 +-
 doc/en/community/team-list.rst                  |   2 +-
 doc/en/docs/install_macos1013.rst               |  18 ++
 doc/en/docs/install_win.rst                     | 178 +++++++++++++++++++
 doc/en/docs/installation.md                     |  36 +---
 doc/zh/community/issue-tracking.md              |   9 +
 doc/zh/community/mail-lists.rst                 |  28 +++
 doc/zh/community/source-repository.md           |  22 +++
 doc/zh/community/team-list.rst                  |  84 +++++++++
 doc/zh/develop/contribute-code.md               |  48 +++++
 doc/zh/develop/how-contribute.md                |   9 +
 doc/zh/develop/schedule.rst                     |  66 +++++++
 doc/zh/docs.rst                                 |  23 +++
 doc/zh/downloads.md                             | 109 ++++++++++++
 doc/zh/index.rst                                |  42 ++++-
 python/CMakeLists.txt                           |  10 +-
 python/setup.py.in                              |   2 +-
 python/singa/layer.py                           |   6 +-
 python/singa/net.py                             |  24 +--
 python/singa/tensor.py                          | 135 ++++++++++++++
 src/api/model_layer.i                           |  21 +++
 src/core/device/opencl_func.h                   |   6 +-
 src/model/layer/cudnn_activation.cc             |  13 --
 src/model/layer/cudnn_convolution.cc            |  13 +-
 src/model/layer/cudnn_pooling.cc                |   8 -
 src/model/layer/cudnn_rnn.cc                    |   4 +-
 test/python/run.py                              |  11 +-
 test/python/test_tensor.py                      |  16 ++
 tool/conda/README.md                            |  33 ++++
 tool/conda/build.sh                             |  29 +--
 tool/conda/meta.yaml                            |   9 +-
 tool/docker/README.md                           |  33 ++--
 tool/docker/build.sh                            |  19 +-
 tool/docker/devel/Dockerfile                    |  36 ----
 tool/docker/devel/conda/cuda/Dockerfile         |  52 ++++++
 tool/docker/devel/cuda/Dockerfile               |  39 ----
 tool/docker/devel/native/centos6/Dockerfile     |  48 +++++
 .../devel/native/ubuntu/cuda/py2/Dockerfile     |  53 ++++++
 .../devel/native/ubuntu/cuda/py3/Dockerfile     |  54 ++++++
 tool/docker/runtime/Dockerfile                  |  31 +++-
 tool/docker/runtime/cuda/Dockerfile             |  31 ----
 tool/jenkins/README.md                          |  97 +++++-----
 tool/jenkins/docker/devel/centos6/Dockerfile    |  64 -------
 tool/jenkins/docker/devel/ubuntu/Dockerfile     |  70 --------
 tool/jenkins/docker/runtime/Dockerfile          |  51 ------
 tool/jenkins/gen_doc.sh                         |  39 ++++
 tool/jenkins/jenkins_doc.sh                     |  37 ----
 tool/jenkins/jenkins_test.sh                    |  57 ------
 tool/jenkins/test.sh                            |  63 +++++++
 tool/opencl/clsrc_to_str.py                     |   4 +-
 tool/travis/build.sh                            |  40 +++--
 tool/travis/conda.sh                            |  38 ----
 tool/travis/depends.sh                          |  41 ++---
 59 files changed, 1369 insertions(+), 687 deletions(-)
----------------------------------------------------------------------



[04/10] incubator-singa git commit: misc. changes and further abstraction of some cudnn codes

Posted by wa...@apache.org.
misc. changes and further abstraction of some cudnn codes


Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/75f9a0e3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/75f9a0e3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/75f9a0e3

Branch: refs/heads/master
Commit: 75f9a0e39520fe86f6e774f5295d65830bd274ab
Parents: 26101ee
Author: Vaan Ng <cm...@gmail.com>
Authored: Thu May 10 18:34:44 2018 +0800
Committer: Vaan Ng <cm...@gmail.com>
Committed: Thu May 10 18:34:44 2018 +0800

----------------------------------------------------------------------
 include/singa/core/tensor.h        |  21 +--
 src/core/tensor/tensor.cc          |  12 +-
 src/core/tensor/tensor_math_cpp.h  |  31 ++--
 src/core/tensor/tensor_math_cuda.h | 309 +++++++++++++-------------------
 4 files changed, 152 insertions(+), 221 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/75f9a0e3/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 2c28e0f..b94a982 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -105,12 +105,13 @@ class Tensor {
   }
 
   /*  
-  cudnn requires tensor dimensions to fulfill 2 requirements:
-    1.) dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
-    2.) dimensions have to be set to multiples of 8
+  cudnn requires tensor dimensions to fulfill 1 requirement:
+    1.) Dimensions must be padded to a minimum of 4 for 4d and lower dimensional tensors.
+        If the input tensor is 5d, cudnn takes the 5d tensor as input directly. Beyond 5d, certain operations are not supported.
+        (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
 
-    for e.g. Tensor A has shape {3,3}, cudnn requires shape of {1,1,24,24} to be the input
-             Tensor B has shape (2,3,4), cudnn requires shape of {1,16,24,32} to be the input
+    e.g. Tensor A has shape {3,3}, cudnn requires shape of {1,1,3,3} to be the input
+         Tensor B has shape {2,3,4}, cudnn requires shape of {1,2,3,4} to be the input
   */
   vector<int> generate_shape_cuda() const {
     vector<int> shape_arr;
@@ -151,11 +152,11 @@ class Tensor {
 
   /*  
   cudnn requires stride dimensions to conform to the format of the shape input as well
-    1.) stride dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
-    2.) stride dimensions have to be set to powers of 8, depending on the stride order (outer stride = higher power)
+    1.) Stride dimensions must be padded to a minimum of 4 for 4d and lower dimensional tensors.
+        If the input tensor is 5d, cudnn takes the 5d tensor as input directly. Beyond 5d, certain operations are not supported.
+        (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
 
-    for e.g. Tensor A has shape {3,3}, stride {3,1}, cudnn requires shape {1,1,24,24} and stride {576, 576, 24, 1} to be the inputs,
-             if A is transposed with stride {1,3}, then the new cudnn stride becomes {576, 576, 8, 3}
+    e.g. Tensor A has shape {3,3}, stride {3,1}: cudnn requires shape {1,1,3,3} and stride {9, 9, 3, 1} or {9, 9, 1, 3} to be the inputs
   */
   vector<int> generate_strides_cuda() const {
     vector<int> strides_arr;
@@ -177,7 +178,7 @@ class Tensor {
         }
       return strides_arr;
     } else {
-      LOG(FATAL) << "Dimensions (strides) beyond 3 are currently not supported" ;
+      LOG(FATAL) << "Dimensions (strides) beyond 5 are currently not supported";
     }
   }
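
For reference, a minimal sketch of the padding rule the comment above describes, written against plain std::vector (the helper names here are illustrative, not part of the SINGA API):

  #include <vector>

  // Pad a shape to at least 4 dims by prepending 1s: {3,3} -> {1,1,3,3}.
  std::vector<int> pad_shape_for_cudnn(const std::vector<int>& shape) {
    std::vector<int> out(shape.size() < 4 ? 4 - shape.size() : 0, 1);
    out.insert(out.end(), shape.begin(), shape.end());
    return out;
  }

  // Pad strides to match; the prepended dims take the total element count,
  // so shape {3,3} with strides {3,1} becomes {9,9,3,1}.
  std::vector<int> pad_strides_for_cudnn(const std::vector<int>& shape,
                                         const std::vector<int>& strides) {
    int total = 1;
    for (int d : shape) total *= d;
    std::vector<int> out(shape.size() < 4 ? 4 - shape.size() : 0, total);
    out.insert(out.end(), strides.begin(), strides.end());
    return out;
  }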
 

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/75f9a0e3/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 48751ef..9067242 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -132,10 +132,8 @@ void Tensor::ResetLike(const Tensor &in) {
   shape_multipliers_ = in.shape_multipliers_;
 }
 
-//yisen todo
 //if tensor is not transposed yet, i.e. strides == 1, then we simply change the shape and generate new default strides
 //if tensor is already transposed, i.e. strides != 1, it should be copied to a new tensor with newly generated default strides
-
 void Tensor::Reshape(const Shape &shape) {
   if(strides_.size()==0)
     strides_.push_back(1);
@@ -144,9 +142,8 @@ void Tensor::Reshape(const Shape &shape) {
     if (block_ != nullptr && block_->DecRefCount() == 0)
       device_->FreeBlock(block_);
     block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
-  } else if (strides_[0] != 1) {
-    std::cout << "Reshape Error: Tranposed tensor must return new tensor. Not implemented yet." << std::endl;
-    return void();
+  } else if (transpose()) {
+    LOG(FATAL) << "Reshape Error: Reshape called on tranposed tensor. Not implemented yet." ;
   }
   shape_ = shape;
   Generate_Strides();
@@ -161,9 +158,8 @@ void Tensor::Reshape(Shape &&shape) {
     if (block_ != nullptr && block_->DecRefCount() == 0)
       device_->FreeBlock(block_);
     block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
-  } else if (strides_[0] != 1) {
-    std::cout << "Reshape Error: Tranposed tensor must return new tensor. Not implemented yet." << std::endl;
-    return void();
+  } else if (transpose()) {
+    LOG(FATAL) << "Reshape Error: Reshape called on tranposed tensor. Not implemented yet." ;
   }
   shape_ = std::move(shape);
   Generate_Strides();
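
In usage terms, the two hunks above encode: reshaping a contiguous tensor only rewrites metadata, while reshaping a transposed view now fails fast instead of silently returning. A minimal sketch, assuming the Tensor API of this commit with T() producing a transposed view:

  Tensor a(Shape{2, 3});
  a.Reshape(Shape{3, 2});     // ok: default strides, only metadata changes

  Tensor b = a.T();           // transposed view, strides no longer default
  // b.Reshape(Shape{2, 3});  // would hit LOG(FATAL): not implemented yet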

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/75f9a0e3/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 01d9fe3..d4cd5da 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -724,7 +724,7 @@ void Uniform<float, lang::Cpp>(const float low,
 
 // ====================Blas operations======================================
 
-//yisen todo, this function has block M overwritting to block M itself
+//warning, this function overwrites block M in place (the output aliases input M)
 template <>
 void DGMM<float, lang::Cpp>(const bool side_right,
                             const Tensor* M, const Tensor* v,
@@ -817,26 +817,26 @@ template <>
 void Axpy<float, lang::Cpp>(const float alpha,
                             const Tensor *in, Tensor *out, Context *ctx) {
   //check input tensor for strides first
-  if((in->strides())[0] == 1){
+  if(in->strides() == out->strides()){
     const float *inPtr = static_cast<const float *>(in->block()->data());
     float *outPtr = static_cast<float *>(out->block()->mutable_data());
     cblas_saxpy(in->Size(), alpha, inPtr, 1, outPtr, 1);
+  } else {
+    LOG(FATAL) << "Axpy, input and output strides do not match." ;
   }
-  //yisen todo
-  //else throw error
 }
 
 template <>
 void Dot<float, lang::Cpp>(const Tensor *in1, const Tensor *in2,
                            float *out, Context *ctx) {
   //check input tensor for strides first
-  if(((in1->strides())[0] == 1) && ((in2->strides())[0] == 1)){
+  if(!(in1->transpose()) && !(in2->transpose())){
     const float *in1Ptr = static_cast<const float *>(in1->block()->data());
     const float *in2Ptr = static_cast<const float *>(in2->block()->data());
     *out = cblas_sdot(in1->Size(), in1Ptr, 1, in2Ptr, 1);
+  } else {
+    LOG(FATAL) << "Dot, one of the input is tranposed. Not implemented yet." ;
   }
-  //yisen todo
-  //else throw error
 }
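
The guarded fast path above hands contiguous, unit-stride buffers straight to BLAS. For clarity, what cblas_sdot computes in that case is just this (hand-rolled equivalent, not the committed code):

  #include <cstddef>

  float dot(const float* x, const float* y, size_t n) {
    float s = 0.f;
    for (size_t i = 0; i < n; ++i) s += x[i] * y[i];
    return s;
  }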
 
 template <>
@@ -878,15 +878,14 @@ void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
   const float *APtr = static_cast<const float *>(A->block()->data());
   const float *vPtr = static_cast<const float *>(v->block()->data());
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
-  auto trans = ((A->strides())[0] != 1) ? true : false;
   const size_t m = A->shape()[0];
   const size_t n = A->shape()[1];
-  if (!trans) {
-    cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
-                beta, outPtr, 1);
-  } else {
+  if (A->transpose()) {
     cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta,
                 outPtr, 1);
+  } else {
+    cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
+                beta, outPtr, 1);
   }
 }
 
@@ -915,9 +914,9 @@ template <>
 void GEMM<float, lang::Cpp>(const float alpha,
                             const Tensor *A, const Tensor *B, const float beta,
                             Tensor *C, Context *ctx) {
-  auto transA = ((A->strides())[0] != 1) ? true : false;
+  auto transA = A->transpose();
   auto transa = transA ? CblasTrans : CblasNoTrans;
-  auto transB = ((B->strides())[0] != 1) ? true : false;
+  auto transB = B->transpose();
   auto transb = transB ? CblasTrans : CblasNoTrans;
   const size_t nrowA = A->shape()[0];
   const size_t ncolA = A->shape()[1];
@@ -1088,7 +1087,6 @@ void Scale<float, lang::Cpp>(const float x, Tensor *out,
   }
 }
 
-//yisen todo check purpose of sum in this function
 template <>
 void Dot<float, lang::Cpp>(const Tensor *in1, const Tensor *in2,
                            float *out, Context *ctx) {
@@ -1116,7 +1114,7 @@ void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
   float *outPtr = static_cast<float *>(out->block()->mutable_data());
   const float *APtr = static_cast<const float *>(A->block()->data());
   const float *vPtr = static_cast<const float *>(v->block()->data());
-  bool trans = ((A->strides())[0] != 1) ? true : false;
+  bool trans = A->transpose();
   const size_t m = A->shape(0);
   const size_t n = A->shape(1);
   for (size_t r = 0; r < m; r++) {
@@ -1129,7 +1127,6 @@ void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
   }
 }
 
-//yisen todo
 #endif  // USE_CBLAS
 template <>
 void ComputeCrossEntropy<float, lang::Cpp>(bool int_target,

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/75f9a0e3/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index f4839e3..3e36877 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -32,6 +32,30 @@
 
 namespace singa {
 
+cudnnTensorDescriptor_t generate_tensorND_desc(const Tensor* x){
+  cudnnTensorDescriptor_t x_desc;
+  cudnnCreateTensorDescriptor(&x_desc);
+  cudnnSetTensorNdDescriptor(x_desc, CUDNN_DATA_FLOAT,
+                             x->generate_dim_cuda(),
+                             x->generate_shape_cuda().data(),
+                             x->generate_strides_cuda().data()
+                             );
+
+  return x_desc;
+}
+
+cudnnOpTensorDescriptor_t generate_Op_desc(cudnnOpTensorOp_t op){
+  cudnnOpTensorDescriptor_t op_desc;
+  cudnnCreateOpTensorDescriptor(&op_desc);
+  cudnnSetOpTensorDescriptor(op_desc, op,
+                             CUDNN_DATA_FLOAT,
+                             CUDNN_PROPAGATE_NAN
+                             );
+
+  return op_desc;
+}
+
+
 /// out[i] = |in[i]|
 template <>
 void Abs<float, lang::Cuda>(const Tensor* in, Tensor* out,
@@ -39,41 +63,25 @@ void Abs<float, lang::Cuda>(const Tensor* in, Tensor* out,
   const float* inPtr = static_cast<const float*>(in->block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
-  cudnnOpTensorOp_t op = CUDNN_OP_TENSOR_MAX;
-  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
-  cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
-  cudnnOpTensorDescriptor_t op_desc;
-  cudnnCreateOpTensorDescriptor(&op_desc);
-  cudnnSetOpTensorDescriptor(op_desc, op, cudnn_dtype, cudnn_propagation);
-  
-  float alpha1[1] = {1.0};
-  float alpha2[1] = {-1.0};
-  float beta[1] = {0.0};
-  cudnnTensorDescriptor_t in_desc, out_desc;
-  cudnnCreateTensorDescriptor(&in_desc);
-  cudnnCreateTensorDescriptor(&out_desc);
-  cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-  cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-  cudnnOpTensor(ctx->cudnn_handle, op_desc, (void*)(&alpha1), in_desc, inPtr, 
-                (void*)(&alpha2), in_desc, inPtr, (void*)(&beta), out_desc, outPtr);
-
+  float alpha1 = 1.0;
+  float alpha2 = -1.0;
+  float beta = 0.0;
+  cudnnTensorDescriptor_t in_desc = generate_tensorND_desc(in);
+  cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_MAX),
+                (void*)(&alpha1), in_desc, inPtr, 
+                (void*)(&alpha2), in_desc, inPtr,
+                (void*)(&beta), generate_tensorND_desc(out), outPtr
+                );
   cudnnDestroyTensorDescriptor(in_desc);
-  cudnnDestroyTensorDescriptor(out_desc);
 }
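
Two notes on the rewrite above. First, the trick: with CUDNN_OP_TENSOR_MAX, cudnnOpTensor computes out = max(alpha1*in, alpha2*in), so alpha1 = 1 and alpha2 = -1 yield max(x, -x) = |x|. Second, generate_tensorND_desc() and generate_Op_desc() return freshly created descriptors that the caller owns; Abs destroys in_desc, but the descriptors created inline for out and for the op are not destroyed. A small RAII guard would make the cleanup automatic (a sketch, not part of this commit):

  struct TensorDescGuard {
    cudnnTensorDescriptor_t desc;
    explicit TensorDescGuard(const Tensor* x) : desc(generate_tensorND_desc(x)) {}
    ~TensorDescGuard() { cudnnDestroyTensorDescriptor(desc); }
    operator cudnnTensorDescriptor_t() const { return desc; }
  };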
 
 template <>
 void Set<float, lang::Cuda>(const float x, Tensor* out,
                             Context* ctx) {
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-  //float valuePtr[1] = {x};
-
-  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
-  cudnnTensorDescriptor_t out_desc;
-  cudnnCreateTensorDescriptor(&out_desc);
-  cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-  cudnnSetTensor(ctx->cudnn_handle, out_desc, outPtr, (void*)(&x));
 
-  cudnnDestroyTensorDescriptor(out_desc);
+  cudnnSetTensor(ctx->cudnn_handle, generate_tensorND_desc(out), 
+                  outPtr, (void*)(&x));
 }
 
 template <>
@@ -83,17 +91,11 @@ void Add<float, lang::Cuda>(const Tensor* in, const float x,
   const float* inPtr = static_cast<const float*>(in->block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
-  float alpha = 1.0, beta=1.0;
-  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
-  cudnnTensorDescriptor_t in_desc, out_desc;
-  cudnnCreateTensorDescriptor(&in_desc);
-  cudnnCreateTensorDescriptor(&out_desc);
-  cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-  cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-  cudnnAddTensor(ctx->cudnn_handle, (void*)(&alpha), in_desc, inPtr,  (void*)(&beta), out_desc, outPtr);
-
-  cudnnDestroyTensorDescriptor(in_desc);
-  cudnnDestroyTensorDescriptor(out_desc);
+  float alpha = 1.0, beta = 1.0;
+  cudnnAddTensor(ctx->cudnn_handle,
+                 (void*)(&alpha), generate_tensorND_desc(in), inPtr,
+                 (void*)(&beta), generate_tensorND_desc(out), outPtr
+                 );
 }
 
 /// out = in1 + in2
@@ -104,34 +106,23 @@ void Add<float, lang::Cuda>(const Tensor* in1,
   const float* inPtr2 = static_cast<const float*>(in2->block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
-  cudnnOpTensorOp_t op = CUDNN_OP_TENSOR_ADD;
-  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
-  cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
-  cudnnOpTensorDescriptor_t op_desc;
-  cudnnCreateOpTensorDescriptor(&op_desc);
-  cudnnSetOpTensorDescriptor(op_desc, op, cudnn_dtype, cudnn_propagation);
-
-  float alpha1[1] = {1.0};
-  float alpha2[1] = {1.0};
-  float beta[1] = {0.0};
-  cudnnTensorDescriptor_t in1_desc, in2_desc, out_desc;
-  cudnnCreateTensorDescriptor(&in1_desc);
-  cudnnCreateTensorDescriptor(&in2_desc);
-  cudnnCreateTensorDescriptor(&out_desc);
-  cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+  float alpha1 = 1.0;
+  float alpha2 = 1.0;
+  float beta = 0.0;
+
   if((in1->nDim() == in2->nDim()) || (in2->nDim() == 1)){
-    cudnnSetTensorNdDescriptor(in2_desc, cudnn_dtype, in2->generate_dim_cuda(), in2->generate_shape_cuda().data(), in2->generate_strides_cuda().data());
+    cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
+              (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
+              (void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
+              (void*)(&beta), generate_tensorND_desc(out), outPtr
+              );
   } else {
-    cudnnSetTensorNdDescriptor(in2_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+    cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
+          (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
+          (void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
+          (void*)(&beta), generate_tensorND_desc(out), outPtr
+          );
   }
-
-  cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-  cudnnOpTensor(ctx->cudnn_handle, op_desc, (void*)(alpha1), in1_desc, inPtr1,
-                (void*)(alpha2), in2_desc, inPtr2, (void*)(beta), out_desc, outPtr);
-
-  cudnnDestroyTensorDescriptor(in1_desc);
-  cudnnDestroyTensorDescriptor(in2_desc);
-  cudnnDestroyTensorDescriptor(out_desc);
 }
 
 /// out = in1 - in2
@@ -142,34 +133,23 @@ void Sub<float, lang::Cuda>(const Tensor* in1,
   const float* inPtr2 = static_cast<const float*>(in2->block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
-  cudnnOpTensorOp_t op = CUDNN_OP_TENSOR_ADD;
-  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
-  cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
-  cudnnOpTensorDescriptor_t op_desc;
-  cudnnCreateOpTensorDescriptor(&op_desc);
-  cudnnSetOpTensorDescriptor(op_desc, op, cudnn_dtype, cudnn_propagation);
-
-  float alpha1[1] = {1.0};
-  float alpha2[1] = {-1.0};
-  float beta[1] = {0.0};
-  cudnnTensorDescriptor_t in1_desc, in2_desc, out_desc;
-  cudnnCreateTensorDescriptor(&in1_desc);
-  cudnnCreateTensorDescriptor(&in2_desc);
-  cudnnCreateTensorDescriptor(&out_desc);
-  cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+  float alpha1 = 1.0;
+  float alpha2 = -1.0;
+  float beta = 0.0;
+
   if((in1->nDim() == in2->nDim()) || (in2->nDim() == 1)){
-    cudnnSetTensorNdDescriptor(in2_desc, cudnn_dtype, in2->generate_dim_cuda(), in2->generate_shape_cuda().data(), in2->generate_strides_cuda().data());
+    cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
+              (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
+              (void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
+              (void*)(&beta), generate_tensorND_desc(out), outPtr
+              );
   } else {
-    cudnnSetTensorNdDescriptor(in2_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+    cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
+          (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
+          (void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
+          (void*)(&beta), generate_tensorND_desc(out), outPtr
+          );
   }
-
-  cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-  cudnnOpTensor(ctx->cudnn_handle, op_desc, (void*)(alpha1), in1_desc, inPtr1,
-                (void*)(alpha2), in2_desc, inPtr2, (void*)(beta),  out_desc, outPtr);
-
-  cudnnDestroyTensorDescriptor(in1_desc);
-  cudnnDestroyTensorDescriptor(in2_desc);
-  cudnnDestroyTensorDescriptor(out_desc);
 }
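
As with Abs, Sub reuses CUDNN_OP_TENSOR_ADD: cudnnOpTensor evaluates out = alpha1*in1 + alpha2*in2 + beta*out, so alpha2 = -1 turns the add into a subtraction. Elementwise, the call above amounts to:

  #include <cstddef>

  void sub_kernel(size_t n, const float* in1, const float* in2, float* out) {
    for (size_t i = 0; i < n; ++i) out[i] = in1[i] - in2[i];
  }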
 
 /// Element-wise operation, clamp every element into [low, high]
@@ -193,26 +173,21 @@ void Div<float, lang::Cuda>(const Tensor* in1,
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   const size_t num = in1->Size();
 
-  if(in1->strides() == in2->strides()){ //if both in1 and in2 strides are the same, we proceed to normal cuda::div
+  //if both in1 and in2 strides are the same, we proceed to normal cuda::div
+  if(in1->strides() == in2->strides()){
         cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
         out->Set_Strides(in1->strides());
   } else { //else we transform in1 to out to store first
-    float alpha[1] = {1.0};
-    float beta[1] = {0.0};
-
-    cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
-    cudnnTensorDescriptor_t in1_desc, out_desc;
-    cudnnCreateTensorDescriptor(&in1_desc);
-    cudnnCreateTensorDescriptor(&out_desc);
-    cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+    float alpha = 1.0;
+    float beta = 0.0;
+
     out->Set_Strides(in2->strides());
-    cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-    cudnnTransformTensor(ctx->cudnn_handle, (void*)(alpha), in1_desc, inPtr1,
-                         (void*)(beta), out_desc, outPtr);
+    cudnnTransformTensor(ctx->cudnn_handle,
+                        (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
+                        (void*)(&beta), generate_tensorND_desc(out), outPtr
+                        );
 
     cuda::div(num, outPtr, inPtr2, outPtr, ctx->stream);
-    cudnnDestroyTensorDescriptor(in1_desc);
-    cudnnDestroyTensorDescriptor(out_desc);
   }
 }
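
The layout-reconciliation pattern above (repeated below for EltwiseMult and Pow) is: if both inputs share strides, run the CUDA kernel directly; otherwise adopt in2's strides for out, relayout in1 into out's buffer via cudnnTransformTensor, then run the kernel on operands that now agree on layout. A CPU analogue of the two steps, shown for the 2-D case (helper names are illustrative only):

  #include <cstddef>

  // Copy a 2-D view with arbitrary strides into a contiguous row-major buffer
  // (the CPU analogue of the cudnnTransformTensor step above).
  void relayout2d(const float* src, size_t rows, size_t cols,
                  size_t row_stride, size_t col_stride, float* dst) {
    for (size_t r = 0; r < rows; ++r)
      for (size_t c = 0; c < cols; ++c)
        dst[r * cols + c] = src[r * row_stride + c * col_stride];
  }

  // After relayout, both operands are contiguous and the kernel is trivial.
  void div_kernel(size_t n, const float* a, const float* b, float* out) {
    for (size_t i = 0; i < n; ++i) out[i] = a[i] / b[i];
  }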
 
@@ -234,16 +209,10 @@ void EltwiseMult<float, lang::Cuda>(const Tensor* in,
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
 
   float alpha = x, beta = 0.0;
-  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
-  cudnnTensorDescriptor_t in_desc, out_desc;
-  cudnnCreateTensorDescriptor(&in_desc);
-  cudnnCreateTensorDescriptor(&out_desc);
-  cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-  cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-  cudnnAddTensor(ctx->cudnn_handle, (void*)(&alpha), in_desc, inPtr,  (void*)(&beta), out_desc, outPtr);
-
-  cudnnDestroyTensorDescriptor(in_desc);
-  cudnnDestroyTensorDescriptor(out_desc);
+  cudnnAddTensor(ctx->cudnn_handle,
+                (void*)(&alpha), generate_tensorND_desc(in), inPtr,
+                (void*)(&beta), generate_tensorND_desc(out), outPtr
+                );
 }
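
Here cudnnAddTensor computes out = alpha*in + beta*out, so alpha = x with beta = 0 realizes the scalar multiply as a scaled copy; elementwise it is equivalent to:

  #include <cstddef>

  void scale_kernel(size_t n, float x, const float* in, float* out) {
    for (size_t i = 0; i < n; ++i) out[i] = x * in[i];
  }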
 
 /// out = in1 * in2
@@ -256,27 +225,21 @@ void EltwiseMult<float, lang::Cuda>(const Tensor* in1,
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   const size_t num = in1->Size();
 
-  if(in1->strides() == in2->strides()){ //if both in1 and in2 strides are the same, we proceed to normal cuda::mult
+  //if both in1 and in2 strides are the same, we proceed to normal cuda::mult
+  if(in1->strides() == in2->strides()){ 
         cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
         out->Set_Strides(in1->strides());
   } else { //else we transform in1 to out to store first
-    float alpha[1] = {1.0};
-    float beta[1] = {0.0};
+    float alpha = 1.0;
+    float beta = 0.0;
 
-
-    cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
-    cudnnTensorDescriptor_t in1_desc, out_desc;
-    cudnnCreateTensorDescriptor(&in1_desc);
-    cudnnCreateTensorDescriptor(&out_desc);
-    cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
     out->Set_Strides(in2->strides());
-    cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-    cudnnTransformTensor(ctx->cudnn_handle, (void*)(alpha), in1_desc, inPtr1,
-                         (void*)(beta), out_desc, outPtr);
+    cudnnTransformTensor(ctx->cudnn_handle,
+                        (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
+                        (void*)(&beta), generate_tensorND_desc(out), outPtr
+                        );
 
     cuda::mult(num, outPtr, inPtr2, outPtr, ctx->stream);
-    cudnnDestroyTensorDescriptor(in1_desc);
-    cudnnDestroyTensorDescriptor(out_desc);
   }
 }
 
@@ -404,26 +367,20 @@ void Pow<float, lang::Cuda>(const Tensor* in1,
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
   const size_t num = in1->Size();
 
-  if(in1->strides() == in2->strides()){ //if both in1 and in2 strides are the same, we proceed to normal cuda::pow
+  if(in1->strides() == in2->strides()){
         cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
         out->Set_Strides(in1->strides());
   } else { //else we transform in1 to out to store first
-    float alpha[1] = {1.0};
-    float beta[1] = {0.0};
-
-    cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
-    cudnnTensorDescriptor_t in1_desc, out_desc;
-    cudnnCreateTensorDescriptor(&in1_desc);
-    cudnnCreateTensorDescriptor(&out_desc);
-    cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+    float alpha = 1.0;
+    float beta = 0.0;
+
     out->Set_Strides(in2->strides());
-    cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-    cudnnTransformTensor(ctx->cudnn_handle, (void*)(alpha), in1_desc, inPtr1,
-                         (void*)(beta), out_desc, outPtr);
+    cudnnTransformTensor(ctx->cudnn_handle,
+                        (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
+                        (void*)(&beta), generate_tensorND_desc(out), outPtr
+                        );
 
     cuda::pow(num, outPtr, inPtr2, outPtr, ctx->stream);
-    cudnnDestroyTensorDescriptor(in1_desc);
-    cudnnDestroyTensorDescriptor(out_desc);
   }
 }
 
@@ -525,27 +482,16 @@ void Sqrt<float, lang::Cuda>(const Tensor* in, Tensor* out,
                              Context* ctx) {
   const float* inPtr = static_cast<const float*>(in->block()->data());
   float* outPtr = static_cast<float*>(out->block()->mutable_data());
-
-  cudnnOpTensorOp_t op = CUDNN_OP_TENSOR_SQRT;
-  cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
-  cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
-  cudnnOpTensorDescriptor_t op_desc;
-  cudnnCreateOpTensorDescriptor(&op_desc);
-  cudnnSetOpTensorDescriptor(op_desc, op, cudnn_dtype, cudnn_propagation);
   
-  float alpha1[1] = {1.0};
-  float alpha2[1] = {0.0};
-  float beta[1] = {0.0};
-  cudnnTensorDescriptor_t in_desc, out_desc;
-  cudnnCreateTensorDescriptor(&in_desc);
-  cudnnCreateTensorDescriptor(&out_desc);
-  cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-  cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-  cudnnOpTensor(ctx->cudnn_handle, op_desc, (void*)(&alpha1), in_desc, inPtr, 
-                (void*)(&alpha2), in_desc, inPtr, (void*)(&beta), out_desc, outPtr);
-
-  cudnnDestroyTensorDescriptor(in_desc);
-  cudnnDestroyTensorDescriptor(out_desc);
+  float alpha1 = 1.0;
+  float alpha2 = 0.0;
+  float beta = 0.0;
+  cudnnTensorDescriptor_t in_desc = generate_tensorND_desc(in);
+  cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_SQRT),
+                (void*)(&alpha1), in_desc, inPtr, 
+                (void*)(&alpha2), in_desc, inPtr,
+                (void*)(&beta), generate_tensorND_desc(out), outPtr
+                );
 }
 
 /// Element-wise operation, out[i]=in[i]^2
@@ -593,30 +539,26 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
                                  cudnn_propagation, cudnn_indices, cudnn_indices_type);
 
   //instantiate 2 new tensors to use new blocks as memory instead of cudaMalloc
-  Shape reduction_size = {1000};
+  size_t reduction_size_int = Product(in->shape());
+  Shape reduction_size = {reduction_size_int*100};
   Tensor indices(reduction_size, in->device(), in->data_type());
   Tensor workspace(reduction_size, in->device(), in->data_type());
-  size_t indices_bytes = indices.block()->size()*1000;
-  size_t workspace_bytes = workspace.block()->size()*1000;
+  size_t indices_bytes = indices.block()->size()*100;
+  size_t workspace_bytes = workspace.block()->size()*100;
   size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
   float* workspacePtr = static_cast<float*>(workspace.block()->mutable_data());
   //void* indicesPtr{nullptr}; void* workspacePtr{nullptr};
   //cudaMalloc(&indicesPtr, indices_bytes); cudaMalloc(&workspacePtr, workspace_bytes);
 
-  float alpha[1] = {1.0};
-  float beta[1] = {0.0};
-  cudnnTensorDescriptor_t in_desc, t_desc;
-  cudnnCreateTensorDescriptor(&in_desc);
-  cudnnCreateTensorDescriptor(&t_desc);
-  cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-  cudnnSetTensorNdDescriptor(t_desc, cudnn_dtype, t.generate_dim_cuda(), reduce_all_axes.data(), reduce_all_axes.data());
+  float alpha = 1.0;
+  float beta = 0.0;
   cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
                     indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
-                    (void*)(&alpha), in_desc, inPtr, (void*)(&beta), t_desc, tPtr);
+                    (void*)(&alpha), generate_tensorND_desc(in), inPtr,
+                    (void*)(&beta), generate_tensorND_desc(&t), tPtr
+                    );
 
   *out = tPtr[0];
-  cudnnDestroyTensorDescriptor(in_desc);
-  cudnnDestroyTensorDescriptor(t_desc);
 }
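
The 100x factors above are headroom guesses for the reduction scratch space. cuDNN can report the exact sizes instead; a sketch using the standard query calls (shown with this commit's descriptor helpers, whose return values are leaked here for brevity, so illustrative only):

  size_t indices_bytes = 0, workspace_bytes = 0;
  cudnnTensorDescriptor_t in_desc = generate_tensorND_desc(in);
  cudnnTensorDescriptor_t t_desc  = generate_tensorND_desc(&t);
  cudnnGetReductionIndicesSize(ctx->cudnn_handle, reduce_desc,
                               in_desc, t_desc, &indices_bytes);
  cudnnGetReductionWorkspaceSize(ctx->cudnn_handle, reduce_desc,
                                 in_desc, t_desc, &workspace_bytes);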
 
 
@@ -922,22 +864,17 @@ void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
   if(in->transpose()){
     Tensor t(in->shape(), in->device(), in->data_type());
     float* tPtr = static_cast<float*>(t.block()->mutable_data());
-    float alpha[1] = {1.0};
-    float beta[1] = {0.0};
-
-    cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
-    cudnnTensorDescriptor_t in_desc, t_desc;
-    cudnnCreateTensorDescriptor(&in_desc);
-    cudnnCreateTensorDescriptor(&t_desc);
-    cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-    cudnnSetTensorNdDescriptor(t_desc, cudnn_dtype, t.generate_dim_cuda(), t.generate_shape_cuda().data(), t.generate_strides_cuda().data());
-    cudnnTransformTensor(ctx->cudnn_handle, (void*)(alpha), in_desc, inPtr,
-                         (void*)(beta), t_desc, tPtr);
+
+    float alpha = 1.0;
+    float beta = 0.0;
+
+    cudnnTransformTensor(ctx->cudnn_handle,
+                        (void*)(&alpha), generate_tensorND_desc(in), inPtr,
+                        (void*)(&beta), generate_tensorND_desc(&t), tPtr
+                        );
 
     const float* tPtr_const = static_cast<const float*>(t.block()->data());
     cuda::RowMax(nrow, ncol, tPtr_const, outPtr, ctx->stream);
-    cudnnDestroyTensorDescriptor(in_desc);
-    cudnnDestroyTensorDescriptor(t_desc);
   } else {
     cuda::RowMax(nrow, ncol, inPtr, outPtr, ctx->stream);
   }