Posted to commits@singa.apache.org by wa...@apache.org on 2018/05/13 15:26:28 UTC
[01/10] incubator-singa git commit: Singa-341 Added stride functionality to tensors for CPP
Repository: incubator-singa
Updated Branches:
refs/heads/master 394d78d00 -> 600f27ede
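
This commit rewrites the CPP math kernels to take Tensor* arguments instead of
raw Block* buffers, so each kernel can consult the tensor's shape and strides
and operate on non-contiguous (e.g. transposed) views. As a conceptual
illustration (standalone C++, not part of this commit): with per-dimension
strides, a transpose is just a stride swap, and no data is copied.

#include <cstddef>

// element (r, c) of a 2-D view over a flat buffer
float At(const float* data, std::size_t r, std::size_t c,
         std::size_t row_stride, std::size_t col_stride) {
  return data[r * row_stride + c * col_stride];
}
// A row-major 2x3 buffer is viewed with row_stride=3, col_stride=1; its
// transposed 3x2 view reuses the same buffer with row_stride=1, col_stride=3.
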
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a88efa00/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 4f510ed..01d9fe3 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -21,7 +21,9 @@
#include "./tensor_math.h"
#include <cfloat>
#include "singa/core/common.h"
+#include "singa/core/tensor.h"
#include <math.h>
+#include <vector>
#ifdef USE_CBLAS
#include <cblas.h>
@@ -29,422 +31,856 @@
namespace singa {
+// template <>
+// void Abs<float, lang::Cpp>(const Tensor* in, Tensor* out,
+// Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// outPtr[i] = fabs(inPtr[i]);
+// }
+// }
+
template <>
-void Abs<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
- Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = fabs(inPtr[i]);
- }
+void Abs<float, lang::Cpp>(const Tensor* in, Tensor* out, Context *ctx) {
+ TraverseUnary<float>(in, out, [](float x) {return fabs(x);});
}
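
TraverseUnary is not defined in this file; it presumably comes with the newly
included singa/core/tensor.h. A sketch consistent with how it is called here
(the helper itself is an assumption; generate_traversal_info and traverse_next
are the names used by the code below):

template <typename DType, typename Op>
void TraverseUnary(const Tensor* in, Tensor* out, Op op) {
  DType* outPtr = static_cast<DType*>(out->block()->mutable_data());
  const DType* inPtr = static_cast<const DType*>(in->block()->data());
  vector<int> traversal_info = in->generate_traversal_info();
  for (size_t i = 0; i < in->Size(); i++) {
    // the last entry of traversal_info holds the strided offset of element i
    outPtr[i] = op(inPtr[traversal_info[in->shape().size()]]);
    in->traverse_next(traversal_info, i + 1);
  }
}
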
+// template <>
+// void Add<float, lang::Cpp>(const Tensor* in, const float x,
+// Tensor* out, Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// outPtr[i] = inPtr[i] + x;
+// }
+// }
+
+// template <>
+// void Add<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out, Context *ctx) {
+// float *outPtr = static_cast<float *>(out->block()->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->block()->data());
+// vector<int> traversal_info = in->generate_traversal_info();
+// for (size_t i = 0; i < in->Size(); i++) {
+// outPtr[i] = inPtr[traversal_info[in->shape().size()]] + x;
+// in->traverse_next(traversal_info, i+1);
+// }
+// }
+
template <>
-void Add<float, lang::Cpp>(const size_t num, const Block *in, const float x,
- Block *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = inPtr[i] + x;
- }
+void Add<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out, Context *ctx) {
+ auto add_lambda = [&x](float a) {
+ return (a+x);
+ };
+ TraverseUnary<float>(in, out, add_lambda);
}
+// template <>
+// void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+// Tensor* out, Context *ctx) {
+// // CHECK_EQ(ctx->stream, nullptr);
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *in1Ptr = static_cast<const float *>(in1->data());
+// const float *in2Ptr = static_cast<const float *>(in2->data());
+// for (size_t i = 0; i < in1->Size(); i++) {
+// outPtr[i] = in1Ptr[i] + in2Ptr[i];
+// }
+// }
+
+// template <>
+// void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, Context *ctx) {
+// // CHECK_EQ(ctx->stream, nullptr);
+// float *outPtr = static_cast<float *>(out->block()->mutable_data());
+// const float *in1Ptr = static_cast<const float *>(in1->block()->data());
+// const float *in2Ptr = static_cast<const float *>(in2->block()->data());
+// //call axpy if both strides are 1?
+// vector<int> traversal_info_in1 = in1->generate_traversal_info();
+// vector<int> traversal_info_in2 = in2->generate_traversal_info();
+// for (size_t i = 0; i < in1->Size(); i++) {
+// outPtr[i] = in1Ptr[traversal_info_in1[in1->shape().size()]] + in2Ptr[traversal_info_in2[in2->shape().size()]];
+// in1->traverse_next(traversal_info_in1, i+1);
+// in2->traverse_next(traversal_info_in2, i+1);
+// }
+// }
+
template <>
-void Add<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
- Block *out, Context *ctx) {
+void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, Context *ctx) {
// CHECK_EQ(ctx->stream, nullptr);
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *in1Ptr = static_cast<const float *>(in1->data());
- const float *in2Ptr = static_cast<const float *>(in2->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = in1Ptr[i] + in2Ptr[i];
- }
+ auto add_lambda_binary = [](float a, float b) {
+ return (a+b);
+ };
+ TraverseBinary<float>(in1, in2, out, add_lambda_binary);
+
}
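
TraverseBinary, likewise assumed to live in tensor.h, would walk both inputs
in lockstep, each with its own traversal state (this mirrors the commented-out
strided Add above):

template <typename DType, typename Op>
void TraverseBinary(const Tensor* in1, const Tensor* in2, Tensor* out, Op op) {
  DType* outPtr = static_cast<DType*>(out->block()->mutable_data());
  const DType* in1Ptr = static_cast<const DType*>(in1->block()->data());
  const DType* in2Ptr = static_cast<const DType*>(in2->block()->data());
  vector<int> t1 = in1->generate_traversal_info();
  vector<int> t2 = in2->generate_traversal_info();
  for (size_t i = 0; i < in1->Size(); i++) {
    outPtr[i] = op(in1Ptr[t1[in1->shape().size()]],
                   in2Ptr[t2[in2->shape().size()]]);
    in1->traverse_next(t1, i + 1);
    in2->traverse_next(t2, i + 1);
  }
}
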
+// template <>
+// void Clamp<float, lang::Cpp>(const float low,
+// const float high, const Tensor* in, Tensor* out,
+// Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// if (inPtr[i] > high) {
+// outPtr[i] = high;
+// } else if (inPtr[i] < low) {
+// outPtr[i] = low;
+// } else {
+// outPtr[i] = inPtr[i];
+// }
+// }
+// }
+
+// template <>
+// void Clamp<float, lang::Cpp>(const Tensor* in, const float low,
+// const float high, Tensor* out,
+// Context *ctx) {
+// float *outPtr = static_cast<float *>(out->block()->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->block()->data());
+// vector<int> traversal_info = in->generate_traversal_info();
+// for (size_t i = 0; i < in->Size(); i++) {
+// int traversed_index = traversal_info[in->shape().size()];
+// if (inPtr[traversed_index] > high) {
+// outPtr[i] = high;
+// } else if (inPtr[traversed_index] < low) {
+// outPtr[i] = low;
+// } else {
+// outPtr[i] = inPtr[traversed_index];
+// }
+// in->traverse_next(traversal_info, i+1);
+// }
+// }
+
template <>
-void Clamp<float, lang::Cpp>(const size_t num, const float low,
- const float high, const Block *in, Block *out,
+void Clamp<float, lang::Cpp>(const float low, const float high,
+ const Tensor* in, Tensor* out,
Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- if (inPtr[i] > high) {
- outPtr[i] = high;
- } else if (inPtr[i] < low) {
- outPtr[i] = low;
- } else {
- outPtr[i] = inPtr[i];
- }
- }
+ auto clamp_lambda = [&low, &high](float a) {
+ if(a < low){return low;}
+ else if(a > high){return high;}
+ else {return a;}
+ };
+ TraverseUnary<float>(in, out, clamp_lambda);
}
+
+// template <>
+// void Div<float, lang::Cpp>(const float x, const Tensor* in,
+// Tensor* out, Context *ctx) {
+// const float *inPtr = static_cast<const float *>(in->data());
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// CHECK_NE(inPtr[i], 0.f);
+// outPtr[i] = x / inPtr[i];
+// }
+// }
+
template <>
-void Div<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
- Block *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *in1Ptr = static_cast<const float *>(in1->data());
- const float *in2Ptr = static_cast<const float *>(in2->data());
- for (size_t i = 0; i < num; i++) {
- CHECK_NE(in2Ptr[i], 0.f);
- outPtr[i] = in1Ptr[i] / in2Ptr[i];
+void Div<float, lang::Cpp>(const float x, const Tensor* in, Tensor* out,
+ Context *ctx) {
+ const float *inPtr = static_cast<const float *>(in->block()->data());
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
+ vector<int> traversal_info = in->generate_traversal_info();
+ for (size_t i = 0; i < in->Size(); i++) {
+ CHECK_NE(inPtr[traversal_info[in->shape().size()]], 0.f);
+ outPtr[i] = x / inPtr[traversal_info[in->shape().size()]];
+ in->traverse_next(traversal_info, i+1);
}
}
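
The idiom above recurs throughout this file: generate_traversal_info returns a
vector whose first shape().size() entries are per-dimension counters and whose
last entry is the strided offset of the current element, and traverse_next
advances that state to logical element i+1. The effect is a logical row-major
walk over a possibly non-contiguous buffer. A self-contained illustration of
the visiting order (hypothetical, not SINGA code):

#include <cstdio>

int main() {
  const int shape[2] = {2, 3};
  const int strides[2] = {1, 2};  // e.g. a transposed 3x2 buffer
  // the logical (r, c) order visits block offsets 0, 2, 4, 1, 3, 5
  for (int r = 0; r < shape[0]; r++)
    for (int c = 0; c < shape[1]; c++)
      std::printf("(%d,%d) -> offset %d\n", r, c,
                  r * strides[0] + c * strides[1]);
  return 0;
}
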
+
+// template <>
+// void Div<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+// Tensor* out, Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *in1Ptr = static_cast<const float *>(in1->data());
+// const float *in2Ptr = static_cast<const float *>(in2->data());
+// for (size_t i = 0; i < in1->Size(); i++) {
+// CHECK_NE(in2Ptr[i], 0.f);
+// outPtr[i] = in1Ptr[i] / in2Ptr[i];
+// }
+// }
+
template <>
-void Div<float, lang::Cpp>(const size_t num, const float x, const Block *in,
- Block *out, Context *ctx) {
- const float *inPtr = static_cast<const float *>(in->data());
- float *outPtr = static_cast<float *>(out->mutable_data());
- for (size_t i = 0; i < num; i++) {
- CHECK_NE(inPtr[i], 0.f);
- outPtr[i] = x / inPtr[i];
+void Div<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+ Tensor* out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
+ const float *in1Ptr = static_cast<const float *>(in1->block()->data());
+ const float *in2Ptr = static_cast<const float *>(in2->block()->data());
+ vector<int> traversal_info_in1 = in1->generate_traversal_info();
+ vector<int> traversal_info_in2 = in2->generate_traversal_info();
+ for (size_t i = 0; i < in1->Size(); i++) {
+ CHECK_NE(in2Ptr[traversal_info_in2[in2->shape().size()]], 0.f);
+ outPtr[i] = in1Ptr[traversal_info_in1[in1->shape().size()]] / in2Ptr[traversal_info_in2[in2->shape().size()]];
+ in1->traverse_next(traversal_info_in1, i+1);
+ in2->traverse_next(traversal_info_in2, i+1);
}
}
+
+// template <>
+// void EltwiseMult<float, lang::Cpp>(const Tensor* in,
+// const float x, Tensor* out, Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// outPtr[i] = inPtr[i] * x;
+// }
+// }
+
template <>
-void EltwiseMult<float, lang::Cpp>(const size_t num, const Block *in,
- const float x, Block *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = inPtr[i] * x;
- }
+void EltwiseMult<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+ Context *ctx) {
+ auto eltwisemult_lambda = [&x](float a) {
+ return (a*x);
+ };
+ TraverseUnary<float>(in, out, eltwisemult_lambda);
}
+// template <>
+// void EltwiseMult<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+// Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *in1Ptr = static_cast<const float *>(in1->data());
+// const float *in2Ptr = static_cast<const float *>(in2->data());
+// for (size_t i = 0; i < in1->Size(); i++) {
+// outPtr[i] = in1Ptr[i] * in2Ptr[i];
+// }
+// }
+
template <>
-void EltwiseMult<float, lang::Cpp>(const size_t num, const Block *in1,
- const Block *in2, Block *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *in1Ptr = static_cast<const float *>(in1->data());
- const float *in2Ptr = static_cast<const float *>(in2->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = in1Ptr[i] * in2Ptr[i];
- }
+void EltwiseMult<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+ Context *ctx) {
+ auto eltwisemult_lambda_binary = [](float a, float b) {
+ return (a*b);
+ };
+ TraverseBinary<float>(in1, in2, out, eltwisemult_lambda_binary);
}
+
+// template <>
+// void Exp<float, lang::Cpp>(const Tensor* in, Tensor* out,
+// Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// outPtr[i] = exp(inPtr[i]);
+// }
+// }
+
template <>
-void Exp<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
- Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = exp(inPtr[i]);
- }
+void Exp<float, lang::Cpp>(const Tensor* in, Tensor *out, Context *ctx) {
+ TraverseUnary<float>(in, out, [](float x) {return exp(x);});
}
+// template <>
+// void GE<float, lang::Cpp>(const Tensor* in, const float x,
+// Tensor* out, Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+// }
+// }
+
template <>
-void GE<float, lang::Cpp>(const size_t num, const Block *in, const float x,
- Block *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
- }
+void GE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+ Context *ctx) {
+ auto ge_lambda = [&x](float a) {
+ return (a >= x) ? 1.f : 0.f;
+ };
+ TraverseUnary<float>(in, out, ge_lambda);
}
+// template <>
+// void GE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+// Tensor* out, Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr1 = static_cast<const float *>(in1->data());
+// const float *inPtr2 = static_cast<const float *>(in2->data());
+// for (size_t i = 0; i < in1->Size(); i++) {
+// outPtr[i] = (inPtr1[i] >= inPtr2[i]) ? 1.f : 0.f;
+// }
+// }
+
template <>
-void GE<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
- Block *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr1 = static_cast<const float *>(in1->data());
- const float *inPtr2 = static_cast<const float *>(in2->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr1[i] >= inPtr2[i]) ? 1.f : 0.f;
- }
+void GE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+ Context *ctx) {
+ auto ge_lambda_binary = [](float a, float b) {
+ return (a >= b) ? 1.f : 0.f;
+ };
+ TraverseBinary<float>(in1, in2, out, ge_lambda_binary);
}
+
+// template <>
+// void GT<float, lang::Cpp>(const Tensor* in, const float x,
+// Tensor* out, Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+// }
+// }
+
template <>
-void GT<float, lang::Cpp>(const size_t num, const Block *in, const float x,
- Block *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
- }
+void GT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+ Context *ctx) {
+ auto gt_lambda = [&x](float a) {
+ return (a > x) ? 1.f : 0.f;
+ };
+ TraverseUnary<float>(in, out, gt_lambda);
}
+
+// template <>
+// void GT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+// Tensor* out, Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr1 = static_cast<const float *>(in1->data());
+// const float *inPtr2 = static_cast<const float *>(in2->data());
+// for (size_t i = 0; i < in1->Size(); i++) {
+// outPtr[i] = (inPtr1[i] > inPtr2[i]) ? 1.f : 0.f;
+// }
+// }
+
template <>
-void GT<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
- Block *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr1 = static_cast<const float *>(in1->data());
- const float *inPtr2 = static_cast<const float *>(in2->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr1[i] > inPtr2[i]) ? 1.f : 0.f;
- }
+void GT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+ Context *ctx) {
+ auto gt_lambda_binary = [](float a, float b) {
+ return (a > b) ? 1.f : 0.f;
+ };
+ TraverseBinary<float>(in1, in2, out, gt_lambda_binary);
}
+// template <>
+// void LE<float, lang::Cpp>(const Tensor* in, const float x,
+// Tensor* out, Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+// }
+// }
+
template <>
-void LE<float, lang::Cpp>(const size_t num, const Block *in, const float x,
- Block *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
- }
+void LE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+ Context *ctx) {
+ auto le_lambda = [&x](float a) {
+ return (a <= x) ? 1.f : 0.f;
+ };
+ TraverseUnary<float>(in, out, le_lambda);
}
+
+// template <>
+// void LE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+// Tensor* out, Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr1 = static_cast<const float *>(in1->data());
+// const float *inPtr2 = static_cast<const float *>(in2->data());
+// for (size_t i = 0; i < in1->Size(); i++) {
+// outPtr[i] = (inPtr1[i] <= inPtr2[i]) ? 1.f : 0.f;
+// }
+// }
+
template <>
-void LE<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
- Block *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr1 = static_cast<const float *>(in1->data());
- const float *inPtr2 = static_cast<const float *>(in2->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr1[i] <= inPtr2[i]) ? 1.f : 0.f;
- }
+void LE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+ Context *ctx) {
+ auto le_lambda_binary = [](float a, float b) {
+ return (a <= b) ? 1.f : 0.f;
+ };
+ TraverseBinary<float>(in1, in2, out, le_lambda_binary);
}
+
+// template <>
+// void Log<float, lang::Cpp>(const Tensor* in, Tensor* out,
+// Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// CHECK_GT(inPtr[i], 0.f);
+// outPtr[i] = log(inPtr[i]);
+// }
+// }
+
template <>
-void Log<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+void Log<float, lang::Cpp>(const Tensor* in, Tensor* out,
Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- CHECK_GT(inPtr[i], 0.f);
- outPtr[i] = log(inPtr[i]);
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->block()->data());
+ vector<int> traversal_info = in->generate_traversal_info();
+ for (size_t i = 0; i < in->Size(); i++) {
+ CHECK_GT(inPtr[traversal_info[in->shape().size()]], 0.f);
+ outPtr[i] = log(inPtr[traversal_info[in->shape().size()]]);
+ in->traverse_next(traversal_info, i+1);
}
}
+
+// template <>
+// void LT<float, lang::Cpp>(const Tensor* in, const float x,
+// Tensor* out, Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
+// }
+// }
+
template <>
-void LT<float, lang::Cpp>(const size_t num, const Block *in, const float x,
- Block *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
- }
+void LT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+ Context *ctx) {
+ auto lt_lambda = [&x](float a) {
+ return (a < x) ? 1.f : 0.f;
+ };
+ TraverseUnary<float>(in, out, lt_lambda);
}
+
+// template <>
+// void LT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+// Tensor* out, Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr1 = static_cast<const float *>(in1->data());
+// const float *inPtr2 = static_cast<const float *>(in2->data());
+// for (size_t i = 0; i < in1->Size(); i++) {
+// outPtr[i] = (inPtr1[i] < inPtr2[i]) ? 1.f : 0.f;
+// }
+// }
+
template <>
-void LT<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
- Block *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr1 = static_cast<const float *>(in1->data());
- const float *inPtr2 = static_cast<const float *>(in2->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr1[i] < inPtr2[i]) ? 1.f : 0.f;
- }
+void LT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+ Context *ctx) {
+ auto lt_lambda_binary = [](float a, float b) {
+ return (a < b) ? 1.f : 0.f;
+ };
+ TraverseBinary<float>(in1, in2, out, lt_lambda_binary);
}
+// template <>
+// void Pow<float, lang::Cpp>(const Tensor* in, const float x,
+// Tensor* out, Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// outPtr[i] = pow(inPtr[i], x);
+// }
+// }
+
template <>
-void Pow<float, lang::Cpp>(const size_t num, const Block *in, const float x,
- Block *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = pow(inPtr[i], x);
- }
+void Pow<float, lang::Cpp>(const Tensor* in, const float x, Tensor *out, Context *ctx) {
+ TraverseUnary<float>(in, out, [x](float y) {return pow(y,x);});
}
+// template <>
+// void Pow<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+// Tensor* out, Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *in1Ptr = static_cast<const float *>(in1->data());
+// const float *in2Ptr = static_cast<const float *>(in2->data());
+// for (size_t i = 0; i < in1->Size(); i++) {
+// outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
+// }
+// }
+
template <>
-void Pow<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
- Block *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *in1Ptr = static_cast<const float *>(in1->data());
- const float *in2Ptr = static_cast<const float *>(in2->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
- }
+void Pow<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+ Context *ctx) {
+ auto pow_lambda_binary = [](float a, float b) {
+ return pow(a,b);
+ };
+ TraverseBinary<float>(in1, in2, out, pow_lambda_binary);
}
+
+// template <>
+// void ReLU<float, lang::Cpp>(const Tensor* in, Tensor* out,
+// Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
+// }
+// }
+
template <>
-void ReLU<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
- Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
- }
+void ReLU<float, lang::Cpp>(const Tensor* in, Tensor* out,
+ Context *ctx) {
+ auto relu_lambda = [](float a) {
+ return (a >= 0.f) ? a : 0.f;
+ };
+ TraverseUnary<float>(in, out, relu_lambda);
}
+
+// template <>
+// void Set<float, lang::Cpp>(const float x, Tensor* out,
+// Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// for (size_t i = 0; i < out->Size(); i++) outPtr[i] = x;
+// }
+
template <>
-void Set<float, lang::Cpp>(const size_t num, const float x, Block *out,
+void Set<float, lang::Cpp>(const float x, Tensor* out,
Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- for (size_t i = 0; i < num; i++) outPtr[i] = x;
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
+ for (size_t i = 0; i < out->Size(); i++) outPtr[i] = x;
}
+
+// template <>
+// void Set<int, lang::Cpp>(const int x, Tensor* out,
+// Context *ctx) {
+// int *outPtr = static_cast<int *>(out->mutable_data());
+// for (size_t i = 0; i < out->Size(); i++) outPtr[i] = x;
+// }
+
template <>
-void Set<int, lang::Cpp>(const size_t num, const int x, Block *out,
+void Set<int, lang::Cpp>(const int x, Tensor* out,
Context *ctx) {
- int *outPtr = static_cast<int *>(out->mutable_data());
- for (size_t i = 0; i < num; i++) outPtr[i] = x;
+ int *outPtr = static_cast<int *>(out->block()->mutable_data());
+ for (size_t i = 0; i < out->Size(); i++) outPtr[i] = x;
}
+// template <>
+// void Sigmoid<float, lang::Cpp>(const Tensor* in, Tensor* out,
+// Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
+// }
+// }
+
template <>
-void Sigmoid<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
- Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
- }
+void Sigmoid<float, lang::Cpp>(const Tensor* in, Tensor* out,
+ Context *ctx) {
+ auto sigmoid_lambda = [](float a) {
+ return 1.f / (1.f + exp(-a));
+ };
+ TraverseUnary<float>(in, out, sigmoid_lambda);
}
+// template <>
+// void Sign<float, lang::Cpp>(const Tensor* in, Tensor* out,
+// Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// outPtr[i] = (inPtr[i] > 0) - (inPtr[i] < 0);
+// }
+// }
+
template <>
-void Sign<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
- Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr[i] > 0) - (inPtr[i] < 0);
- }
+void Sign<float, lang::Cpp>(const Tensor* in, Tensor* out,
+ Context *ctx) {
+ auto sign_lambda = [](float a) {
+ return (a > 0) - (a < 0);
+ };
+ TraverseUnary<float>(in, out, sign_lambda);
}
+// template <>
+// void Sqrt<float, lang::Cpp>(const Tensor* in, Tensor* out,
+// Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// CHECK_GE(inPtr[i], 0.f);
+// outPtr[i] = sqrt(inPtr[i]);
+// }
+// }
+
template <>
-void Sqrt<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+void Sqrt<float, lang::Cpp>(const Tensor* in, Tensor* out,
Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- CHECK_GE(inPtr[i], 0.f);
- outPtr[i] = sqrt(inPtr[i]);
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->block()->data());
+ vector<int> traversal_info = in->generate_traversal_info();
+ for (size_t i = 0; i < in->Size(); i++) {
+ CHECK_GE(inPtr[traversal_info[in->shape().size()]], 0.f);
+ outPtr[i] = sqrt(inPtr[traversal_info[in->shape().size()]]);
+ in->traverse_next(traversal_info, i+1);
}
}
+
/*
template <>
-void Square<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
+void Square<float, lang::Cpp>(const Tensor* in, Tensor* out,
Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
+ for (size_t i = 0; i < in->Size(); i++) {
outPtr[i] = inPtr[i] * inPtr[i];
}
}
*/
+// template <>
+// void Sub<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+// Tensor* out, Context *ctx) {
+// // CHECK_EQ(ctx->stream, nullptr);
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *in1Ptr = static_cast<const float *>(in1->data());
+// const float *in2Ptr = static_cast<const float *>(in2->data());
+// for (size_t i = 0; i < in1->Size(); i++) {
+// outPtr[i] = in1Ptr[i] - in2Ptr[i];
+// }
+// }
+
template <>
-void Sub<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
- Block *out, Context *ctx) {
+void Sub<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+ Tensor* out, Context *ctx) {
// CHECK_EQ(ctx->stream, nullptr);
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *in1Ptr = static_cast<const float *>(in1->data());
- const float *in2Ptr = static_cast<const float *>(in2->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = in1Ptr[i] - in2Ptr[i];
- }
+ auto sub_lambda_binary = [](float a, float b) {
+ return (a-b);
+ };
+ TraverseBinary<float>(in1, in2, out, sub_lambda_binary);
}
// sum all elements of input into out
// TODO(wangwei) optimize using omp
template <>
-void Sum<float, lang::Cpp>(const size_t num, const Block *in, float *out,
+void Sum<float, lang::Cpp>(const Tensor* in, float *out,
Context *ctx) {
float s = 0.f;
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
+ const float *inPtr = static_cast<const float *>(in->block()->data());
+ for (size_t i = 0; i < in->Size(); i++) {
s += inPtr[i];
}
*out = s;
}
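
Note that Sum still reads the block linearly, so it is only correct for
contiguous tensors. A strided variant (a sketch reusing the traversal helpers
above, not part of this commit) would be:

  float s = 0.f;
  const float *inPtr = static_cast<const float *>(in->block()->data());
  vector<int> traversal_info = in->generate_traversal_info();
  for (size_t i = 0; i < in->Size(); i++) {
    s += inPtr[traversal_info[in->shape().size()]];
    in->traverse_next(traversal_info, i + 1);
  }
  *out = s;
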
+// template <>
+// void Tanh<float, lang::Cpp>(const Tensor* in, Tensor* out,
+// Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// outPtr[i] = tanh(inPtr[i]);
+// }
+// }
+
template <>
-void Tanh<float, lang::Cpp>(const size_t num, const Block *in, Block *out,
- Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = tanh(inPtr[i]);
- }
+void Tanh<float, lang::Cpp>(const Tensor* in, Tensor* out,
+ Context *ctx) {
+ auto tanh_lambda = [](float a) {
+ return tanh(a);
+ };
+ TraverseUnary<float>(in, out, tanh_lambda);
}
// ===============Random operations==========================================
+// template <>
+// void Bernoulli<float, lang::Cpp>(const float p, Tensor* out,
+// Context *ctx) {
+// std::bernoulli_distribution distribution(p);
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// for (size_t i = 0; i < out->Size(); i++) {
+// outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
+// }
+// }
+
template <>
-void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Block *out,
+void Bernoulli<float, lang::Cpp>(const float p, Tensor* out,
Context *ctx) {
std::bernoulli_distribution distribution(p);
- float *outPtr = static_cast<float *>(out->mutable_data());
- for (size_t i = 0; i < num; i++) {
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
+ for (size_t i = 0; i < out->Size(); i++) {
outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
}
}
+// template <>
+// void Gaussian<float, lang::Cpp>(const float mean,
+// const float std, Tensor* out, Context *ctx) {
+// std::normal_distribution<float> distribution(mean, std);
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// for (size_t i = 0; i < out->Size(); i++) {
+// outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
+// }
+// }
+
template <>
-void Gaussian<float, lang::Cpp>(const size_t num, const float mean,
- const float std, Block *out, Context *ctx) {
+void Gaussian<float, lang::Cpp>(const float mean,
+ const float std, Tensor* out, Context *ctx) {
std::normal_distribution<float> distribution(mean, std);
- float *outPtr = static_cast<float *>(out->mutable_data());
- for (size_t i = 0; i < num; i++) {
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
+ for (size_t i = 0; i < out->Size(); i++) {
outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
}
}
+
+// template <>
+// void Uniform<float, lang::Cpp>(const float low,
+// const float high, Tensor* out, Context *ctx) {
+// std::uniform_real_distribution<float> distribution(low, high);
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// for (size_t i = 0; i < out->Size(); i++) {
+// outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
+// }
+// }
+
template <>
-void Uniform<float, lang::Cpp>(const size_t num, const float low,
- const float high, Block *out, Context *ctx) {
+void Uniform<float, lang::Cpp>(const float low,
+ const float high, Tensor* out, Context *ctx) {
std::uniform_real_distribution<float> distribution(low, high);
- float *outPtr = static_cast<float *>(out->mutable_data());
- for (size_t i = 0; i < num; i++) {
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
+ for (size_t i = 0; i < out->Size(); i++) {
outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
}
}
// ====================Blas operations======================================
+// yisen todo: this kernel can overwrite block M in place while still reading from it
template <>
-void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
- const size_t ncol, const Block *M, const Block *v,
- Block *out, Context *ctx) {
- const float *MPtr = static_cast<const float *>(M->data());
- const float *vPtr = static_cast<const float *>(v->data());
- float *outPtr = static_cast<float *>(out->mutable_data());
+void DGMM<float, lang::Cpp>(const bool side_right,
+ const Tensor* M, const Tensor* v,
+ Tensor* out, Context *ctx) {
+ const float *MPtr = static_cast<const float *>(M->block()->data());
+ const float *vPtr = static_cast<const float *>(v->block()->data());
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
+ const size_t nrow = M->shape(0);
+ const size_t ncol = M->shape(1);
+ vector<int> traversal_info = M->generate_traversal_info();
+
if (side_right) {
for (size_t r = 0; r < nrow; r++) {
size_t offset = r * ncol;
for (size_t c = 0; c < ncol; c++) {
- outPtr[offset + c] = MPtr[offset + c] * vPtr[c];
+ outPtr[traversal_info[M->shape().size()]] = MPtr[traversal_info[M->shape().size()]] * vPtr[c];
+ M->traverse_next(traversal_info, offset+c+1);
}
}
} else {
for (size_t r = 0; r < nrow; r++) {
size_t offset = r * ncol;
for (size_t c = 0; c < ncol; c++) {
- outPtr[offset + c] = MPtr[offset + c] * vPtr[r];
+ outPtr[traversal_info[M->shape().size()]] = MPtr[traversal_info[M->shape().size()]] * vPtr[r];
+ M->traverse_next(traversal_info, offset+c+1);
}
}
}
}
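
The todo above flags that both the read and the write use M's strided offset,
so the output can be clobbered when out aliases M or uses a different layout.
One possible repair (an untested sketch, not part of this commit) keeps the
strided read but writes out at the linear index, as the other kernels here do:

  for (size_t c = 0; c < ncol; c++) {
    outPtr[offset + c] =
        MPtr[traversal_info[M->shape().size()]] * vPtr[side_right ? c : r];
    M->traverse_next(traversal_info, offset + c + 1);
  }
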
+// #ifdef USE_CBLAS
+// template <>
+// void Amax<float, lang::Cpp>(const Tensor* in, size_t *out,
+// Context *ctx) {
+// const float *inPtr = static_cast<const float *>(in->data());
+// *out = cblas_isamax(in->Size(), inPtr, 1);
+// }
+
+// template <>
+// void Asum<float, lang::Cpp>(const Tensor* in, float *out,
+// Context *ctx) {
+// const float *inPtr = static_cast<const float *>(in->data());
+// *out = cblas_sasum(in->Size(), inPtr, 1);
+// }
+
+// template <>
+// void Axpy<float, lang::Cpp>(const float alpha,
+// const Tensor* in, Tensor* out, Context *ctx) {
+// const float *inPtr = static_cast<const float *>(in->data());
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// cblas_saxpy(in->Size(), alpha, inPtr, 1, outPtr, 1);
+// }
+
+// template <>
+// void Dot<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+// float *out, Context *ctx) {
+// const float *in1Ptr = static_cast<const float *>(in1->data());
+// const float *in2Ptr = static_cast<const float *>(in2->data());
+// *out = cblas_sdot(in1->Size(), in1Ptr, 1, in2Ptr, 1);
+// }
+// template <>
+// void Scale<float, lang::Cpp>(const float x, Tensor* out,
+// Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// cblas_sscal(out->Size(), x, outPtr, 1);
+// }
+// template <>
+// void Nrm2<float, lang::Cpp>(const Tensor* in, float *out,
+// Context *ctx) {
+// const float *inPtr = static_cast<const float *>(in->data());
+// *out = cblas_snrm2(in->Size(), inPtr, 1);
+// }
+
#ifdef USE_CBLAS
template <>
-void Amax<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
+void Amax<float, lang::Cpp>(const Tensor *in, size_t *out,
Context *ctx) {
- const float *inPtr = static_cast<const float *>(in->data());
- *out = cblas_isamax(num, inPtr, 1);
+ const float *inPtr = static_cast<const float *>(in->block()->data());
+ *out = cblas_isamax(in->Size(), inPtr, 1); //not using strided traversal
}
template <>
-void Asum<float, lang::Cpp>(const size_t num, const Block *in, float *out,
+void Asum<float, lang::Cpp>(const Tensor *in, float *out,
Context *ctx) {
- const float *inPtr = static_cast<const float *>(in->data());
- *out = cblas_sasum(num, inPtr, 1);
+ const float *inPtr = static_cast<const float *>(in->block()->data());
+ *out = cblas_sasum(in->Size(), inPtr, 1); //not using strided traversal
}
template <>
-void Axpy<float, lang::Cpp>(const size_t num, const float alpha,
- const Block *in, Block *out, Context *ctx) {
- const float *inPtr = static_cast<const float *>(in->data());
- float *outPtr = static_cast<float *>(out->mutable_data());
- cblas_saxpy(num, alpha, inPtr, 1, outPtr, 1);
+void Axpy<float, lang::Cpp>(const float alpha,
+ const Tensor *in, Tensor *out, Context *ctx) {
+ //check input tensor for strides first
+ if((in->strides())[0] == 1){
+ const float *inPtr = static_cast<const float *>(in->block()->data());
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
+ cblas_saxpy(in->Size(), alpha, inPtr, 1, outPtr, 1);
+ }
+ //yisen todo
+ //else throw error
}
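
Until the todo above is resolved, a non-unit-stride fallback could mirror the
strided loop used in the non-CBLAS Axpy further below (a sketch, not part of
this commit):

  } else {
    float *outPtr = static_cast<float *>(out->block()->mutable_data());
    const float *inPtr = static_cast<const float *>(in->block()->data());
    vector<int> traversal_info = in->generate_traversal_info();
    for (size_t i = 0; i < in->Size(); i++) {
      outPtr[i] += alpha * inPtr[traversal_info[in->shape().size()]];
      in->traverse_next(traversal_info, i + 1);
    }
  }
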
template <>
-void Dot<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+void Dot<float, lang::Cpp>(const Tensor *in1, const Tensor *in2,
float *out, Context *ctx) {
- const float *in1Ptr = static_cast<const float *>(in1->data());
- const float *in2Ptr = static_cast<const float *>(in2->data());
- *out = cblas_sdot(num, in1Ptr, 1, in2Ptr, 1);
+ //check input tensor for strides first
+ if(((in1->strides())[0] == 1) && ((in2->strides())[0] == 1)){
+ const float *in1Ptr = static_cast<const float *>(in1->block()->data());
+ const float *in2Ptr = static_cast<const float *>(in2->block()->data());
+ *out = cblas_sdot(in1->Size(), in1Ptr, 1, in2Ptr, 1);
+ }
+ //yisen todo
+ //else throw error
}
+
template <>
-void Scale<float, lang::Cpp>(const size_t num, const float x, Block *out,
+void Scale<float, lang::Cpp>(const float x, Tensor *out,
Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- cblas_sscal(num, x, outPtr, 1);
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
+ cblas_sscal(out->Size(), x, outPtr, 1); //not using strided traversal
}
+
template <>
-void Nrm2<float, lang::Cpp>(const size_t num, const Block *in, float *out,
+void Nrm2<float, lang::Cpp>(const Tensor *in, float *out,
Context *ctx) {
- const float *inPtr = static_cast<const float *>(in->data());
- *out = cblas_snrm2(num, inPtr, 1);
+ const float *inPtr = static_cast<const float *>(in->block()->data());
+ *out = cblas_snrm2(in->Size(), inPtr, 1); //not using strided traversal
}
+// template <>
+// void GEMV<float, lang::Cpp>(//bool trans,
+// const std::vector<int> stridesA,
+// const size_t m, const size_t n,
+// const float alpha, const Tensor* A, const Tensor* v,
+// const float beta, Tensor* out, Context *ctx) {
+// const float *APtr = static_cast<const float *>(A->data());
+// const float *vPtr = static_cast<const float *>(v->data());
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// auto trans = (stridesA.back() == 1) ? true : false;
+// if (!trans) {
+// cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
+// beta, outPtr, 1);
+// } else {
+// cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta,
+// outPtr, 1);
+// }
+// }
+
template <>
-void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
- const float alpha, const Block *A, const Block *v,
- const float beta, Block *out, Context *ctx) {
- const float *APtr = static_cast<const float *>(A->data());
- const float *vPtr = static_cast<const float *>(v->data());
- float *outPtr = static_cast<float *>(out->mutable_data());
+void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
+ const float beta, Tensor *out, Context *ctx) {
+ const float *APtr = static_cast<const float *>(A->block()->data());
+ const float *vPtr = static_cast<const float *>(v->block()->data());
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
+ auto trans = ((A->strides())[0] != 1) ? true : false;
+ const size_t m = A->shape()[0];
+ const size_t n = A->shape()[1];
if (!trans) {
cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
beta, outPtr, 1);
@@ -454,33 +890,147 @@ void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
}
}
+// template <>
+// void GEMM<float, lang::Cpp>(//const bool transA, const bool transB,
+// const std::vector<int> stridesA, const std::vector<int> stridesB,
+// const size_t nrowA, const size_t ncolB,
+// const size_t ncolA, const float alpha,
+// const Tensor* A, const Tensor* B, const float beta,
+// Tensor* C, Context *ctx) {
+// auto transA = (stridesA.back() == 1) ? true : false;
+// auto transa = transA ? CblasTrans : CblasNoTrans;
+// auto transB = (stridesB.back() == 1) ? true : false;
+// auto transb = transB ? CblasTrans : CblasNoTrans;
+// auto lda = transA ? nrowA : ncolA;
+// auto ldb = transB ? ncolA : ncolB;
+// auto ldc = ncolB;
+// const float *APtr = static_cast<const float *>(A->data());
+// const float *BPtr = static_cast<const float *>(B->data());
+// float *CPtr = static_cast<float *>(C->mutable_data());
+// cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
+// lda, BPtr, ldb, beta, CPtr, ldc);
+// }
+
template <>
-void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
- const size_t nrowA, const size_t ncolB,
- const size_t ncolA, const float alpha,
- const Block *A, const Block *B, const float beta,
- Block *C, Context *ctx) {
+void GEMM<float, lang::Cpp>(const float alpha,
+ const Tensor *A, const Tensor *B, const float beta,
+ Tensor *C, Context *ctx) {
+ auto transA = ((A->strides())[0] != 1) ? true : false;
auto transa = transA ? CblasTrans : CblasNoTrans;
+ auto transB = ((B->strides())[0] != 1) ? true : false;
auto transb = transB ? CblasTrans : CblasNoTrans;
+ const size_t nrowA = A->shape()[0];
+ const size_t ncolA = A->shape()[1];
+ const size_t ncolB = B->shape()[1];
auto lda = transA ? nrowA : ncolA;
auto ldb = transB ? ncolA : ncolB;
auto ldc = ncolB;
- const float *APtr = static_cast<const float *>(A->data());
- const float *BPtr = static_cast<const float *>(B->data());
- float *CPtr = static_cast<float *>(C->mutable_data());
+ const float *APtr = static_cast<const float *>(A->block()->data());
+ const float *BPtr = static_cast<const float *>(B->block()->data());
+ float *CPtr = static_cast<float *>(C->block()->mutable_data());
cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
- lda, BPtr, ldb, beta, CPtr, ldc);
+ lda, BPtr, ldb, beta, CPtr, ldc);
}
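
Both GEMM and GEMV now infer transposition from strides instead of taking a
bool: judging from these tests, the convention is that a non-transposed matrix
has strides()[0] == 1 and a transposed view does not. A small sketch of the
test (the convention itself is inferred from this diff, not documented here):

#include <vector>

bool IsTransposed(const std::vector<int>& strides) {
  // strides[0] == 1 is treated as the non-transposed, CblasNoTrans case
  return strides[0] != 1;
}
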
#else
+// template <>
+// void Amax<float, lang::Cpp>(const Tensor* in, size_t *out,
+// Context *ctx) {
+// size_t maxPos = 0;
+// float maxVal = 0;
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// if (i == 0) {
+// maxVal = inPtr[i];
+// } else if (inPtr[i] > maxVal) {
+// maxVal = inPtr[i];
+// maxPos = i;
+// }
+// }
+// *out = maxPos;
+// }
+// template <>
+// void Amin<float, lang::Cpp>(const Tensor* in, size_t *out,
+// Context *ctx) {
+// size_t minPos = 0;
+// float minVal = 0;
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// if (i == 0) {
+// minVal = inPtr[i];
+// } else if (inPtr[i] < minVal) {
+// minVal = inPtr[i];
+// minPos = i;
+// }
+// }
+// *out = minPos;
+// }
+
+// template <>
+// void Asum<float, lang::Cpp>(const Tensor* in, float *out,
+// Context *ctx) {
+// float sum = 0;
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// sum += fabs(inPtr[i]);
+// }
+// *out = sum;
+// }
+
+// template <>
+// void Axpy<float, lang::Cpp>(const float alpha,
+// const Tensor* in, Tensor* out, Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *inPtr = static_cast<const float *>(in->data());
+// for (size_t i = 0; i < in->Size(); i++) {
+// outPtr[i] += alpha * inPtr[i];
+// }
+// }
+
+// template <>
+// void Scale<float, lang::Cpp>(const float x, Tensor* out,
+// Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// for (size_t i = 0; i < out->Size(); i++) {
+// outPtr[i] *= x;
+// }
+// }
+
+// template <>
+// void Dot<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+// float *out, Context *ctx) {
+// float sum = 0;
+// const float *in1Ptr = static_cast<const float *>(in1->data());
+// const float *in2Ptr = static_cast<const float *>(in2->data());
+// for (size_t i = 0; i < in1->Size(); i++) {
+// sum += in1Ptr[i] * in2Ptr[i];
+// }
+// *out = sum;
+// }
+
+// template <>
+// void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+// const float alpha, const Tensor* A, const Tensor* v,
+// const float beta, Tensor* out, Context *ctx) {
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// const float *APtr = static_cast<const float *>(A->data());
+// const float *vPtr = static_cast<const float *>(v->data());
+// for (size_t r = 0; r < m; r++) {
+// float sum = 0;
+// for (size_t c = 0; c < n; c++) {
+// size_t idx = trans ? c * m + r : r * n + c;
+// sum += APtr[idx] * vPtr[c];
+// }
+// outPtr[r] = alpha * sum + beta * outPtr[r];
+// }
+// }
+
template <>
-void Amax<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
+void Amax<float, lang::Cpp>(const Tensor *in, size_t *out,
Context *ctx) {
size_t maxPos = 0;
float maxVal = 0;
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
+ const float *inPtr = static_cast<const float *>(in->block()->data());
+ for (size_t i = 0; i < in->Size(); i++) { //not using strided traversal
if (i == 0) {
maxVal = inPtr[i];
} else if (inPtr[i] > maxVal) {
@@ -491,12 +1041,12 @@ void Amax<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
*out = maxPos;
}
template <>
-void Amin<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
+void Amin<float, lang::Cpp>(const Tensor *in, size_t *out,
Context *ctx) {
size_t minPos = 0;
float minVal = 0;
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
+ const float *inPtr = static_cast<const float *>(in->block()->data());
+ for (size_t i = 0; i < in->Size(); i++) { //not using strided traversal
if (i == 0) {
minVal = inPtr[i];
} else if (inPtr[i] < minVal) {
@@ -508,52 +1058,67 @@ void Amin<float, lang::Cpp>(const size_t num, const Block *in, size_t *out,
}
template <>
-void Asum<float, lang::Cpp>(const size_t num, const Block *in, float *out,
+void Asum<float, lang::Cpp>(const Tensor *in, float *out,
Context *ctx) {
float sum = 0;
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- sum += fabs(inPtr[i]);
+ const float *inPtr = static_cast<const float *>(in->block()->data());
+ for (size_t i = 0; i < in->Size(); i++) {
+ sum += fabs(inPtr[i]); //not using strided traversal
}
*out = sum;
}
template <>
-void Axpy<float, lang::Cpp>(const size_t num, const float alpha,
- const Block *in, Block *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] += alpha * inPtr[i];
+void Axpy<float, lang::Cpp>(const float alpha,
+ const Tensor *in, Tensor *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->block()->data());
+ vector<int> traversal_info = in->generate_traversal_info();
+ for (size_t i = 0; i < in->Size(); i++) {
+ outPtr[i] += alpha * inPtr[traversal_info[in->shape().size()]];
+ in->traverse_next(traversal_info, i+1);
}
}
template <>
-void Scale<float, lang::Cpp>(const size_t num, const float x, Block *out,
+void Scale<float, lang::Cpp>(const float x, Tensor *out,
Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] *= x;
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
+ for (size_t i = 0; i < out->Size(); i++) {
+ outPtr[i] *= x; //not using strided traversal
}
}
+//yisen todo check purpose of sum in this function
template <>
-void Dot<float, lang::Cpp>(const size_t num, const Block *in1, const Block *in2,
+void Dot<float, lang::Cpp>(const Tensor *in1, const Tensor *in2,
float *out, Context *ctx) {
float sum = 0;
- const float *in1Ptr = static_cast<const float *>(in1->data());
- const float *in2Ptr = static_cast<const float *>(in2->data());
- for (size_t i = 0; i < num; i++) {
- sum += in1Ptr[i] * in2Ptr[i];
+ // const float *in1Ptr = static_cast<const float *>(in1->data());
+ // const float *in2Ptr = static_cast<const float *>(in2->data());
+  // for (size_t i = 0; i < in1->Size(); i++) {
+ // sum += in1Ptr[i] * in2Ptr[i];
+ // }
+ const float *in1Ptr = static_cast<const float *>(in1->block()->data());
+ const float *in2Ptr = static_cast<const float *>(in2->block()->data());
+ vector<int> traversal_info_in1 = in1->generate_traversal_info();
+ vector<int> traversal_info_in2 = in2->generate_traversal_info();
+ for (size_t i = 0; i < in1->Size(); i++) {
+ sum += in1Ptr[traversal_info_in1[in1->shape().size()]] * in2Ptr[traversal_info_in2[in2->shape().size()]];
+ in1->traverse_next(traversal_info_in1, i+1);
+ in2->traverse_next(traversal_info_in2, i+1);
}
*out = sum;
}
template <>
-void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
- const float alpha, const Block *A, const Block *v,
- const float beta, Block *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *APtr = static_cast<const float *>(A->data());
- const float *vPtr = static_cast<const float *>(v->data());
+void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
+ const float beta, Tensor *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
+ const float *APtr = static_cast<const float *>(A->block()->data());
+ const float *vPtr = static_cast<const float *>(v->block()->data());
+ bool trans = ((A->strides())[0] != 1) ? true : false;
+ const size_t m = A->shape(0);
+ const size_t n = A->shape(1);
for (size_t r = 0; r < m; r++) {
float sum = 0;
for (size_t c = 0; c < n; c++) {
@@ -564,6 +1129,7 @@ void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
}
}
+//yisen todo
#endif // USE_CBLAS
template <>
void ComputeCrossEntropy<float, lang::Cpp>(bool int_target,
@@ -626,16 +1192,35 @@ void SoftmaxCrossEntropyBwd<float, lang::Cpp>(bool int_target,
}
}
+// template <>
+// void RowMax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+// const Tensor* in, Tensor* out, Context *ctx) {
+// const float *inPtr = static_cast<const float *>(in->data());
+// float *outPtr = static_cast<float *>(out->mutable_data());
+// for (size_t r = 0; r < nrow; r++) {
+// int offset = (int)(r * ncol);
+// float maxval = inPtr[offset];
+// for (size_t c = 1; c < ncol; c++)
+// maxval = (std::max)(maxval, inPtr[offset + c]);
+// outPtr[r] = maxval;
+// }
+// }
+
template <>
-void RowMax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
- const Block *in, Block *out, Context *ctx) {
- const float *inPtr = static_cast<const float *>(in->data());
- float *outPtr = static_cast<float *>(out->mutable_data());
+void RowMax<float, lang::Cpp>(const Tensor *in, Tensor *out, Context *ctx) {
+ const float *inPtr = static_cast<const float *>(in->block()->data());
+ float *outPtr = static_cast<float *>(out->block()->mutable_data());
+ const size_t nrow = in->shape()[0];
+ const size_t ncol = in->shape()[1];
+ vector<int> traversal_info = in->generate_traversal_info();
+
for (size_t r = 0; r < nrow; r++) {
- int offset = (int)(r * ncol);
- float maxval = inPtr[offset];
- for (size_t c = 1; c < ncol; c++)
- maxval = (std::max)(maxval, inPtr[offset + c]);
+ int counter_offset = (r * ncol);
+    float maxval = -FLT_MAX;
+ for (size_t c = 0; c < ncol; c++){
+ maxval = (std::max)(maxval, inPtr[traversal_info[in->shape().size()]]);
+ in->traverse_next(traversal_info, counter_offset+c+1);
+ }
outPtr[r] = maxval;
}
}
@@ -644,7 +1229,7 @@ void RowMax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
/*
template <>
void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
- const Block *A, const Block *v, Block *out,
+ const Tensor* A, const Tensor* v, Tensor* out,
Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *APtr = static_cast<const float *>(A->data());
@@ -659,7 +1244,7 @@ void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
template <>
void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
- const Block *A, const Block *v, Block *out,
+ const Tensor* A, const Tensor* v, Tensor* out,
Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *APtr = static_cast<const float *>(A->data());
@@ -672,8 +1257,8 @@ void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
}
}
template <>
-void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Block *in1,
- const Block *in2, Block *out, Context *ctx) {
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Tensor* in1,
+ const Tensor* in2, Tensor* out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *in1Ptr = static_cast<const float *>(in1->data());
const float *in2Ptr = static_cast<const float *>(in2->data());
@@ -686,7 +1271,7 @@ void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Block *in1,
}
template <>
void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
- const Block *in, Block *out, Context *ctx) {
+ const Tensor* in, Tensor* out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
float *bPtr = new float[ncol];
@@ -707,7 +1292,7 @@ void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
template <>
void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
- const Block *in, Block *out, Context *ctx) {
+ const Tensor* in, Tensor* out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
for (size_t c = 0; c < ncol; c++) {
@@ -723,7 +1308,7 @@ void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
template <>
void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
- const Block *in, Block *out, Context *ctx) {
+ const Tensor* in, Tensor* out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
for (size_t r = 0; r < nrow; r++) {
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a88efa00/src/proto/core.proto
----------------------------------------------------------------------
diff --git a/src/proto/core.proto b/src/proto/core.proto
index 9264e55..fd25607 100644
--- a/src/proto/core.proto
+++ b/src/proto/core.proto
@@ -50,19 +50,19 @@ enum CopyDirection {
// configuration for device memory pool
message MemPoolConf {
- optional string type = 1 [default = "cnmem"];
- // allocation size for each device, default is 256 MB
- optional uint32 init_size = 2 [default = 256];
+ optional string type = 1 [default = "cnmem"];
+ // allocation size for each device, default is 256 MB
+ optional uint32 init_size = 2 [default = 256];
// size limit in MB; report error/warning if this limit is reached.
// 0 for unlimited memory, i.e., use as much memory as the device has
// not used currently.
- optional uint32 max_size = 3 [default = 0];
+ optional uint32 max_size = 3 [default = 0];
- // memory manager flag for cnmem
- // flag = 0: default flag
- // flag = 1: prevent the manager from growing its memory consumption
- // flag = 2: prevent the manager from stealing memory
- optional uint32 flag = 11 [default = 0];
+ // memory manager flag for cnmem
+ // flag = 0: default flag
+ // flag = 1: prevent the manager from growing its memory consumption
+ // flag = 2: prevent the manager from stealing memory
+ optional uint32 flag = 11 [default = 0];
repeated uint32 device = 12;
}
@@ -70,7 +70,8 @@ message MemPoolConf {
message TensorProto {
repeated uint32 shape = 1;
optional DataType data_type = 2;
- optional bool transpose = 3;
+ //optional bool transpose = 3;
+ repeated int32 strides = 3;
repeated float float_data = 4 [packed = true];
repeated double double_data = 5 [packed = true];
repeated int32 int_data = 6 [packed = true];
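For illustration (not part of the commit), a 2x3 float tensor would serialize
with the new field roughly as follows in proto text format, assuming the
kFloat32 enum value and the default row-major layout where the strides are
(ncol, 1):
  shape: 2
  shape: 3
  data_type: kFloat32
  strides: 3
  strides: 1
  float_data: [1, 2, 3, 4, 5, 6]
A transposed view of the same buffer keeps float_data unchanged and swaps the
strides to (1, 3), which is the information the removed boolean transpose
flag could only encode for the 2-D case.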
[10/10] incubator-singa git commit: Merge branch 'pr367' into latest
Posted by wa...@apache.org.
Merge branch 'pr367' into latest
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/600f27ed
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/600f27ed
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/600f27ed
Branch: refs/heads/master
Commit: 600f27ede2bdf6cb6c1e502ec46bbf79f5bed243
Parents: 394d78d 3e2b75c
Author: Wang Wei <dc...@nus.edu.sg>
Authored: Sun May 13 23:26:07 2018 +0800
Committer: Wang Wei <dc...@nus.edu.sg>
Committed: Sun May 13 23:26:07 2018 +0800
----------------------------------------------------------------------
include/singa/core/tensor.h | 65 ++-
src/core/tensor/tensor.cc | 410 +++++++++------
src/core/tensor/tensor_math.h | 143 ++---
src/core/tensor/tensor_math_cpp.h | 817 +++++++++++++++++------------
src/core/tensor/tensor_math_cuda.h | 898 +++++++++++++++++++++++++-------
src/proto/core.proto | 21 +-
6 files changed, 1567 insertions(+), 787 deletions(-)
----------------------------------------------------------------------
[07/10] incubator-singa git commit: Streamlining of tensor.h file by
moving respective member functions to cpp or cuda file. Removal of
shape_multipliers_ attribute in tensor.h. Changed read-in tensors to be
passed as reference instead of pointer
Posted by wa...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c52e2aa3/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index d4cd5da..1ca312a 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -23,7 +23,6 @@
#include "singa/core/common.h"
#include "singa/core/tensor.h"
#include <math.h>
-#include <vector>
#ifdef USE_CBLAS
#include <cblas.h>
@@ -31,80 +30,134 @@
namespace singa {
-// template <>
-// void Abs<float, lang::Cpp>(const Tensor* in, Tensor* out,
-// Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = fabs(inPtr[i]);
-// }
-// }
+// ===================== Helper Functions =============================
+
+//generate a traversal_info vector (all zeros) based on the tensor's shape, for use by the traverse_next function
+vector<int> generate_traversal_info(const Tensor& x) {
+ vector<int> traversal_info = {};
+ for(size_t n=0; n<(x.shape().size()+2); ++n) {
+ traversal_info.push_back(0);
+ }
+ return traversal_info;
+};
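+// e.g. for a 2d tensor this returns {0,0,0,0}: one base index per
+// dimension, then the current flat index, then the traversal order
+// (see traverse_next below)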
+
+//generate shape multipliers
+//for e.g. tensor of shape (3,3), stride (1,3) will have shape multipliers of (3,1)
+//for e.g. tensor of shape (3,3), stride (3,1) will also have shape multipliers of (3,1)
+//this means that the 3rd, 6th, and 9th index of the array will always be the starting element of their respective rows
+//so we need to use the inner stride when jumping from the 1st->2nd element, and the outer stride when jumping from the 2nd->3rd
+vector<int> generate_shape_multipliers(const Tensor& x) {
+ Shape y_shape = x.shape();
+ if(y_shape.size()==0){
+ return {1};
+ }
+ vector<int> shape_multipliers = {1};
+ int cumulative_product = 1;
+
+ for (size_t n=0; n<(y_shape.size()-1); ++n) {
+ cumulative_product = cumulative_product*y_shape[y_shape.size()-1-n];
+ shape_multipliers.insert(shape_multipliers.begin(), cumulative_product);
+ }
+ return shape_multipliers;
+};
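+// illustrative walkthrough (not part of the original commit): for shape
+// (2,3,4) the loop gives cumulative products 4 and then 12, so the
+// multipliers are {12,4,1}; a counter divisible by 4 marks the start of a
+// new row, and one divisible by 12 marks the start of a new 3x4 slice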
+
+// ******************************************************************************************
+// CPP traversal operations (works on const declarations without modifying tensor variables)
+// ******************************************************************************************
+
+//this function checks whether the next index falls on a special multiplier of the outer shape
+//so the algorithm knows when to jump over/back to a starting element of the outer shape
+//for e.g. in [[1,4,7], [2,5,8], [3,6,9]], elements 1,2,3 are the starting elements of their respective rows
+//this additional check needs only one loop iteration for a 2d matrix,
+//but runtime performance might degrade to O(n log n) for higher-dimensional tensors
+int determine_order(vector<int>& shape_multipliers, int counter) {
+ for (size_t n=0; n<(shape_multipliers.size()-1); ++n) {
+ if((counter%shape_multipliers[n])==0){
+ return ((shape_multipliers.size()) - 1 - n);
+ }
+ }
+ return 0;
+};
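+// illustrative walkthrough (not part of the original commit): with
+// shape_multipliers {3,1}, counter 3 satisfies 3%3==0 at n=0 and the
+// function returns 2-1-0 = 1 (jump via the outer-dimension stride), while
+// counters 1 and 2 fall through and return order 0 (innermost stride)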
+
+//this function updates the base indexes with the current index after every single traversal step,
+//can be generalized beyond 2d cases
+void update_base_index(const Tensor& x, vector<int>& traversal_info) {
+ for (int n=0; n<(traversal_info[x.shape().size()+1]+1); ++n) {
+ traversal_info[n] = traversal_info[x.shape().size()];
+ }
+};
+
+//function to traverse a const strided tensor object
+//it requires an additional vector, traversal_info {0,0,0,0 ...}, comprising (x.shape().size()+2) elements of 0
+//for e.g. 2d matrix:
+//index 0 and 1 store the base row and column index respectively
+//index 2 stores the current index of the traversal
+//index 3 stores the order of the traversal for e.g. if the order is 0,
+//it means the next element can be navigated to using the innermost stride
+void traverse_next(const Tensor& x,
+ vector<int>& shape_multipliers,
+ vector<int>& traversal_info,
+ int counter) {
+
+ update_base_index(x, traversal_info);
+ traversal_info[x.shape().size()+1] = determine_order(shape_multipliers, counter);
+ traversal_info[x.shape().size()] = traversal_info[traversal_info[x.shape().size()+1]] +
+ x.strides()[x.strides().size()-traversal_info[x.shape().size()+1]-1];
+};
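+// illustrative trace (not part of the original commit): a 3x3 tensor
+// stored transposed has strides (1,3) and shape multipliers (3,1);
+// starting from traversal_info {0,0,0,0}, calls with counter = 1,2,3,...
+// yield memory indexes 0,3,6 (order 0, +3 per step); at counter 3 the
+// order becomes 1 and the index jumps to the saved column base plus the
+// outer stride, giving 1; the full visiting order is 0,3,6,1,4,7,2,5,8,
+// i.e. the row-major order of the transposed view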
+
+template <typename DType>
+void TraverseUnary(const Tensor &in, Tensor* out, std::function<DType(DType)> func){
+ DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
+ const DType *inPtr = static_cast<const DType *>(in.block()->data());
+ vector<int> traversal_info = generate_traversal_info(in);
+ vector<int> shape_multipliers = generate_shape_multipliers(in);
+
+ for (size_t i = 0; i < in.Size(); i++) {
+ outPtr[i] = func(inPtr[traversal_info[in.shape().size()]]);
+ traverse_next(in, shape_multipliers, traversal_info, i+1);
+ }
+}
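+// usage sketch (illustrative, not part of the original commit): whatever
+// the input strides, the output is written densely in row-major order,
+// e.g. TraverseUnary<float>(in, &out, [](float x) { return 2 * x; });
+// applied to a transposed `in` materializes the doubled transpose
+// contiguously in `out` without first copying `in` to a contiguous buffer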
+
+template <typename DType>
+void TraverseBinary(const Tensor &in1, const Tensor &in2, Tensor* out,
+ std::function<DType(DType, DType)> func){
+ DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
+ const DType *in1Ptr = static_cast<const DType *>(in1.block()->data());
+ const DType *in2Ptr = static_cast<const DType *>(in2.block()->data());
+ vector<int> traversal_info_in1 = generate_traversal_info(in1);
+ vector<int> traversal_info_in2 = generate_traversal_info(in2);
+ vector<int> shape_multipliers_in1 = generate_shape_multipliers(in1);
+ vector<int> shape_multipliers_in2 = generate_shape_multipliers(in2);
+
+ for (size_t i = 0; i < in1.Size(); i++) {
+ outPtr[i] = func(in1Ptr[traversal_info_in1[in1.shape().size()]],
+ in2Ptr[traversal_info_in2[in2.shape().size()]]);
+ traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i+1);
+ traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i+1);
+ }
+}
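+// note (not part of the original commit): the two inputs keep independent
+// traversal state, so in1 and in2 may have different strides (e.g. one
+// transposed, one contiguous) as long as they share the same logical shape
+// and hence the same Size()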
+
+// ******************************************************************************************
+// traversal operations end
+// ******************************************************************************************
+
+// ===================== CPP Functions =============================
template <>
-void Abs<float, lang::Cpp>(const Tensor* in, Tensor* out, Context *ctx) {
+void Abs<float, lang::Cpp>(const Tensor& in, Tensor* out, Context *ctx) {
TraverseUnary<float>(in, out, [](float x) {return fabs(x);});
}
-// template <>
-// void Add<float, lang::Cpp>(const Tensor* in, const float x,
-// Tensor* out, Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = inPtr[i] + x;
-// }
-// }
-
-// template <>
-// void Add<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out, Context *ctx) {
-// float *outPtr = static_cast<float *>(out->block()->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->block()->data());
-// vector<int> traversal_info = in->generate_traversal_info();
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = inPtr[traversal_info[in->shape().size()]] + x;
-// in->traverse_next(traversal_info, i+1);
-// }
-// }
-
template <>
-void Add<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out, Context *ctx) {
+void Add<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out, Context *ctx) {
auto add_lambda = [&x](float a) {
return (a+x);
};
TraverseUnary<float>(in, out, add_lambda);
}
-// template <>
-// void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-// Tensor* out, Context *ctx) {
-// // CHECK_EQ(ctx->stream, nullptr);
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *in1Ptr = static_cast<const float *>(in1->data());
-// const float *in2Ptr = static_cast<const float *>(in2->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = in1Ptr[i] + in2Ptr[i];
-// }
-// }
-
-// template <>
-// void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, Context *ctx) {
-// // CHECK_EQ(ctx->stream, nullptr);
-// float *outPtr = static_cast<float *>(out->block()->mutable_data());
-// const float *in1Ptr = static_cast<const float *>(in1->block()->data());
-// const float *in2Ptr = static_cast<const float *>(in2->block()->data());
-// //call axpy if both strides are 1?
-// vector<int> traversal_info_in1 = in1->generate_traversal_info();
-// vector<int> traversal_info_in2 = in2->generate_traversal_info();
-// for (size_t i = 0; i < in1->Size(); i++) {
-// outPtr[i] = in1Ptr[traversal_info_in1[in1->shape().size()]] + in2Ptr[traversal_info_in2[in2->shape().size()]];
-// in1->traverse_next(traversal_info_in1, i+1);
-// in2->traverse_next(traversal_info_in2, i+1);
-// }
-// }
-
template <>
-void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, Context *ctx) {
+void Add<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out, Context *ctx) {
// CHECK_EQ(ctx->stream, nullptr);
auto add_lambda_binary = [](float a, float b) {
return (a+b);
@@ -113,46 +166,9 @@ void Add<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out, Co
}
-// template <>
-// void Clamp<float, lang::Cpp>(const float low,
-// const float high, const Tensor* in, Tensor* out,
-// Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// if (inPtr[i] > high) {
-// outPtr[i] = high;
-// } else if (inPtr[i] < low) {
-// outPtr[i] = low;
-// } else {
-// outPtr[i] = inPtr[i];
-// }
-// }
-// }
-
-// template <>
-// void Clamp<float, lang::Cpp>(const Tensor* in, const float low,
-// const float high, Tensor* out,
-// Context *ctx) {
-// float *outPtr = static_cast<float *>(out->block()->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->block()->data());
-// vector<int> traversal_info = in->generate_traversal_info();
-// for (size_t i = 0; i < in->Size(); i++) {
-// int traversed_index = traversal_info[in->shape().size()];
-// if (inPtr[traversed_index] > high) {
-// outPtr[i] = high;
-// } else if (inPtr[traversed_index] < low) {
-// outPtr[i] = low;
-// } else {
-// outPtr[i] = inPtr[traversed_index];
-// }
-// in->traverse_next(traversal_info, i+1);
-// }
-// }
-
template <>
void Clamp<float, lang::Cpp>(const float low, const float high,
- const Tensor* in, Tensor* out,
+ const Tensor& in, Tensor* out,
Context *ctx) {
auto clamp_lambda = [&low, &high](float a) {
if(a < low){return low;}
@@ -162,73 +178,42 @@ void Clamp<float, lang::Cpp>(const float low, const float high,
TraverseUnary<float>(in, out, clamp_lambda);
}
-
-// template <>
-// void Div<float, lang::Cpp>(const float x, const Tensor* in,
-// Tensor* out, Context *ctx) {
-// const float *inPtr = static_cast<const float *>(in->data());
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// CHECK_NE(inPtr[i], 0.f);
-// outPtr[i] = x / inPtr[i];
-// }
-// }
-
template <>
-void Div<float, lang::Cpp>(const float x, const Tensor* in, Tensor* out,
+void Div<float, lang::Cpp>(const float x, const Tensor& in, Tensor* out,
Context *ctx) {
- const float *inPtr = static_cast<const float *>(in->block()->data());
+ const float *inPtr = static_cast<const float *>(in.block()->data());
float *outPtr = static_cast<float *>(out->block()->mutable_data());
- vector<int> traversal_info = in->generate_traversal_info();
- for (size_t i = 0; i < in->Size(); i++) {
- CHECK_NE(inPtr[traversal_info[in->shape().size()]], 0.f);
- outPtr[i] = x / inPtr[traversal_info[in->shape().size()]];
- in->traverse_next(traversal_info, i+1);
+ vector<int> traversal_info = generate_traversal_info(in);
+ vector<int> shape_multipliers = generate_shape_multipliers(in);
+
+ for (size_t i = 0; i < in.Size(); i++) {
+ CHECK_NE(inPtr[traversal_info[in.shape().size()]], 0.f);
+ outPtr[i] = x / inPtr[traversal_info[in.shape().size()]];
+ traverse_next(in, shape_multipliers, traversal_info, i+1);
}
}
-
-// template <>
-// void Div<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-// Tensor* out, Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *in1Ptr = static_cast<const float *>(in1->data());
-// const float *in2Ptr = static_cast<const float *>(in2->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// CHECK_NE(in2Ptr[i], 0.f);
-// outPtr[i] = in1Ptr[i] / in2Ptr[i];
-// }
-// }
-
template <>
-void Div<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+void Div<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
Tensor* out, Context *ctx) {
float *outPtr = static_cast<float *>(out->block()->mutable_data());
- const float *in1Ptr = static_cast<const float *>(in1->block()->data());
- const float *in2Ptr = static_cast<const float *>(in2->block()->data());
- vector<int> traversal_info_in1 = in1->generate_traversal_info();
- vector<int> traversal_info_in2 = in2->generate_traversal_info();
- for (size_t i = 0; i < in1->Size(); i++) {
- CHECK_NE(in2Ptr[traversal_info_in2[in2->shape().size()]], 0.f);
- outPtr[i] = in1Ptr[traversal_info_in1[in1->shape().size()]] / in2Ptr[traversal_info_in2[in2->shape().size()]];
- in1->traverse_next(traversal_info_in1, i+1);
- in2->traverse_next(traversal_info_in2, i+1);
+ const float *in1Ptr = static_cast<const float *>(in1.block()->data());
+ const float *in2Ptr = static_cast<const float *>(in2.block()->data());
+ vector<int> traversal_info_in1 = generate_traversal_info(in1);
+ vector<int> traversal_info_in2 = generate_traversal_info(in2);
+ vector<int> shape_multipliers_in1 = generate_shape_multipliers(in1);
+ vector<int> shape_multipliers_in2 = generate_shape_multipliers(in2);
+
+ for (size_t i = 0; i < in1.Size(); i++) {
+ CHECK_NE(in2Ptr[traversal_info_in2[in2.shape().size()]], 0.f);
+ outPtr[i] = in1Ptr[traversal_info_in1[in1.shape().size()]] / in2Ptr[traversal_info_in2[in2.shape().size()]];
+ traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i+1);
+ traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i+1);
}
}
-
-// template <>
-// void EltwiseMult<float, lang::Cpp>(const Tensor* in,
-// const float x, Tensor* out, Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = inPtr[i] * x;
-// }
-// }
-
template <>
-void EltwiseMult<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+void EltwiseMult<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
Context *ctx) {
auto eltwisemult_lambda = [&x](float a) {
return (a*x);
@@ -236,19 +221,8 @@ void EltwiseMult<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
TraverseUnary<float>(in, out, eltwisemult_lambda);
}
-// template <>
-// void EltwiseMult<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
-// Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *in1Ptr = static_cast<const float *>(in1->data());
-// const float *in2Ptr = static_cast<const float *>(in2->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = in1Ptr[i] * in2Ptr[i];
-// }
-// }
-
template <>
-void EltwiseMult<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+void EltwiseMult<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
Context *ctx) {
auto eltwisemult_lambda_binary = [](float a, float b) {
return (a*b);
@@ -256,33 +230,13 @@ void EltwiseMult<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor*
TraverseBinary<float>(in1, in2, out, eltwisemult_lambda_binary);
}
-// template <>
-// void Exp<float, lang::Cpp>(const Tensor* in, Tensor* out,
-// Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = exp(inPtr[i]);
-// }
-// }
-
template <>
-void Exp<float, lang::Cpp>(const Tensor* in, Tensor *out, Context *ctx) {
+void Exp<float, lang::Cpp>(const Tensor& in, Tensor *out, Context *ctx) {
TraverseUnary<float>(in, out, [](float x) {return exp(x);});
}
-// template <>
-// void GE<float, lang::Cpp>(const Tensor* in, const float x,
-// Tensor* out, Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
-// }
-// }
-
template <>
-void GE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+void GE<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
Context *ctx) {
auto ge_lambda = [&x](float a) {
return (a >= x) ? 1.f : 0.f;
@@ -290,19 +244,8 @@ void GE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
TraverseUnary<float>(in, out, ge_lambda);
}
-// template <>
-// void GE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-// Tensor* out, Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr1 = static_cast<const float *>(in1->data());
-// const float *inPtr2 = static_cast<const float *>(in2->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = (inPtr1[i] >= inPtr2[i]) ? 1.f : 0.f;
-// }
-// }
-
template <>
-void GE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+void GE<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
Context *ctx) {
auto ge_lambda_binary = [](float a, float b) {
return (a >= b) ? 1.f : 0.f;
@@ -310,18 +253,8 @@ void GE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
TraverseBinary<float>(in1, in2, out, ge_lambda_binary);
}
-// template <>
-// void GT<float, lang::Cpp>(const Tensor* in, const float x,
-// Tensor* out, Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
-// }
-// }
-
template <>
-void GT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+void GT<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
Context *ctx) {
auto gt_lambda = [&x](float a) {
return (a > x) ? 1.f : 0.f;
@@ -329,19 +262,8 @@ void GT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
TraverseUnary<float>(in, out, gt_lambda);
}
-// template <>
-// void GT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-// Tensor* out, Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr1 = static_cast<const float *>(in1->data());
-// const float *inPtr2 = static_cast<const float *>(in2->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = (inPtr1[i] > inPtr2[i]) ? 1.f : 0.f;
-// }
-// }
-
template <>
-void GT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+void GT<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
Context *ctx) {
auto gt_lambda_binary = [](float a, float b) {
return (a > b) ? 1.f : 0.f;
@@ -349,18 +271,8 @@ void GT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
TraverseBinary<float>(in1, in2, out, gt_lambda_binary);
}
-// template <>
-// void LE<float, lang::Cpp>(const Tensor* in, const float x,
-// Tensor* out, Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
-// }
-// }
-
template <>
-void LE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+void LE<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
Context *ctx) {
auto le_lambda = [&x](float a) {
return (a <= x) ? 1.f : 0.f;
@@ -368,19 +280,8 @@ void LE<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
TraverseUnary<float>(in, out, le_lambda);
}
-// template <>
-// void LE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-// Tensor* out, Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr1 = static_cast<const float *>(in1->data());
-// const float *inPtr2 = static_cast<const float *>(in2->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = (inPtr1[i] <= inPtr2[i]) ? 1.f : 0.f;
-// }
-// }
-
template <>
-void LE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+void LE<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
Context *ctx) {
auto le_lambda_binary = [](float a, float b) {
return (a <= b) ? 1.f : 0.f;
@@ -388,42 +289,23 @@ void LE<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
TraverseBinary<float>(in1, in2, out, le_lambda_binary);
}
-// template <>
-// void Log<float, lang::Cpp>(const Tensor* in, Tensor* out,
-// Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// CHECK_GT(inPtr[i], 0.f);
-// outPtr[i] = log(inPtr[i]);
-// }
-// }
-
template <>
-void Log<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void Log<float, lang::Cpp>(const Tensor& in, Tensor* out,
Context *ctx) {
float *outPtr = static_cast<float *>(out->block()->mutable_data());
- const float *inPtr = static_cast<const float *>(in->block()->data());
- vector<int> traversal_info = in->generate_traversal_info();
- for (size_t i = 0; i < in->Size(); i++) {
- CHECK_GT(inPtr[traversal_info[in->shape().size()]], 0.f);
- outPtr[i] = log(inPtr[traversal_info[in->shape().size()]]);
- in->traverse_next(traversal_info, i+1);
+ const float *inPtr = static_cast<const float *>(in.block()->data());
+ vector<int> traversal_info = generate_traversal_info(in);
+ vector<int> shape_multipliers = generate_shape_multipliers(in);
+
+ for (size_t i = 0; i < in.Size(); i++) {
+ CHECK_GT(inPtr[traversal_info[in.shape().size()]], 0.f);
+ outPtr[i] = log(inPtr[traversal_info[in.shape().size()]]);
+ traverse_next(in, shape_multipliers, traversal_info, i+1);
}
}
-// template <>
-// void LT<float, lang::Cpp>(const Tensor* in, const float x,
-// Tensor* out, Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
-// }
-// }
-
template <>
-void LT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
+void LT<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
Context *ctx) {
auto lt_lambda = [&x](float a) {
return (a < x) ? 1.f : 0.f;
@@ -431,19 +313,9 @@ void LT<float, lang::Cpp>(const Tensor* in, const float x, Tensor* out,
TraverseUnary<float>(in, out, lt_lambda);
}
-// template <>
-// void LT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-// Tensor* out, Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr1 = static_cast<const float *>(in1->data());
-// const float *inPtr2 = static_cast<const float *>(in2->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = (inPtr1[i] < inPtr2[i]) ? 1.f : 0.f;
-// }
-// }
template <>
-void LT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+void LT<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
Context *ctx) {
auto lt_lambda_binary = [](float a, float b) {
return (a < b) ? 1.f : 0.f;
@@ -451,34 +323,13 @@ void LT<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
TraverseBinary<float>(in1, in2, out, lt_lambda_binary);
}
-// template <>
-// void Pow<float, lang::Cpp>(const Tensor* in, const float x,
-// Tensor* out, Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = pow(inPtr[i], x);
-// }
-// }
-
template <>
-void Pow<float, lang::Cpp>(const Tensor* in, const float x, Tensor *out, Context *ctx) {
+void Pow<float, lang::Cpp>(const Tensor& in, const float x, Tensor *out, Context *ctx) {
TraverseUnary<float>(in, out, [x](float y) {return pow(y,x);});
}
-// template <>
-// void Pow<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-// Tensor* out, Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *in1Ptr = static_cast<const float *>(in1->data());
-// const float *in2Ptr = static_cast<const float *>(in2->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
-// }
-// }
-
template <>
-void Pow<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
+void Pow<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
Context *ctx) {
auto pow_lambda_binary = [](float a, float b) {
return pow(a,b);
@@ -486,18 +337,8 @@ void Pow<float, lang::Cpp>(const Tensor* in1, const Tensor* in2, Tensor* out,
TraverseBinary<float>(in1, in2, out, pow_lambda_binary);
}
-// template <>
-// void ReLU<float, lang::Cpp>(const Tensor* in, Tensor* out,
-// Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
-// }
-// }
-
template <>
-void ReLU<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void ReLU<float, lang::Cpp>(const Tensor& in, Tensor* out,
Context *ctx) {
auto relu_lambda = [](float a) {
return (a >= 0.f) ? a : 0.f;
@@ -505,13 +346,6 @@ void ReLU<float, lang::Cpp>(const Tensor* in, Tensor* out,
TraverseUnary<float>(in, out, relu_lambda);
}
-// template <>
-// void Set<float, lang::Cpp>(const float x, Tensor* out,
-// Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// for (size_t i = 0; i < in->Size(); i++) outPtr[i] = x;
-// }
-
template <>
void Set<float, lang::Cpp>(const float x, Tensor* out,
Context *ctx) {
@@ -519,13 +353,6 @@ void Set<float, lang::Cpp>(const float x, Tensor* out,
for (size_t i = 0; i < out->Size(); i++) outPtr[i] = x;
}
-// template <>
-// void Set<int, lang::Cpp>(const int x, Tensor* out,
-// Context *ctx) {
-// int *outPtr = static_cast<int *>(out->mutable_data());
-// for (size_t i = 0; i < in->Size(); i++) outPtr[i] = x;
-// }
-
template <>
void Set<int, lang::Cpp>(const int x, Tensor* out,
Context *ctx) {
@@ -533,18 +360,8 @@ void Set<int, lang::Cpp>(const int x, Tensor* out,
for (size_t i = 0; i < out->Size(); i++) outPtr[i] = x;
}
-// template <>
-// void Sigmoid<float, lang::Cpp>(const Tensor* in, Tensor* out,
-// Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
-// }
-// }
-
template <>
-void Sigmoid<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void Sigmoid<float, lang::Cpp>(const Tensor& in, Tensor* out,
Context *ctx) {
auto sigmoid_lambda = [](float a) {
return 1.f / (1.f + exp(-a));
@@ -552,18 +369,8 @@ void Sigmoid<float, lang::Cpp>(const Tensor* in, Tensor* out,
TraverseUnary<float>(in, out, sigmoid_lambda);
}
-// template <>
-// void Sign<float, lang::Cpp>(const Tensor* in, Tensor* out,
-// Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = (inPtr[i] > 0) - (inPtr[i] < 0);
-// }
-// }
-
template <>
-void Sign<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void Sign<float, lang::Cpp>(const Tensor& in, Tensor* out,
Context *ctx) {
auto sign_lambda = [](float a) {
return (a > 0) - (a < 0);
@@ -571,56 +378,23 @@ void Sign<float, lang::Cpp>(const Tensor* in, Tensor* out,
TraverseUnary<float>(in, out, sign_lambda);
}
-// template <>
-// void Sqrt<float, lang::Cpp>(const Tensor* in, Tensor* out,
-// Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// CHECK_GE(inPtr[i], 0.f);
-// outPtr[i] = sqrt(inPtr[i]);
-// }
-// }
-
template <>
-void Sqrt<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void Sqrt<float, lang::Cpp>(const Tensor& in, Tensor* out,
Context *ctx) {
float *outPtr = static_cast<float *>(out->block()->mutable_data());
- const float *inPtr = static_cast<const float *>(in->block()->data());
- vector<int> traversal_info = in->generate_traversal_info();
- for (size_t i = 0; i < in->Size(); i++) {
- CHECK_GE(inPtr[traversal_info[in->shape().size()]], 0.f);
- outPtr[i] = sqrt(inPtr[traversal_info[in->shape().size()]]);
- in->traverse_next(traversal_info, i+1);
+ const float *inPtr = static_cast<const float *>(in.block()->data());
+ vector<int> traversal_info = generate_traversal_info(in);
+ vector<int> shape_multipliers = generate_shape_multipliers(in);
+
+ for (size_t i = 0; i < in.Size(); i++) {
+ CHECK_GE(inPtr[traversal_info[in.shape().size()]], 0.f);
+ outPtr[i] = sqrt(inPtr[traversal_info[in.shape().size()]]);
+ traverse_next(in, shape_multipliers, traversal_info, i+1);
}
}
-/*
template <>
-void Square<float, lang::Cpp>(const Tensor* in, Tensor* out,
- Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < in->Size(); i++) {
- outPtr[i] = inPtr[i] * inPtr[i];
- }
-}
-*/
-
-// template <>
-// void Sub<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-// Tensor* out, Context *ctx) {
-// // CHECK_EQ(ctx->stream, nullptr);
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *in1Ptr = static_cast<const float *>(in1->data());
-// const float *in2Ptr = static_cast<const float *>(in2->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = in1Ptr[i] - in2Ptr[i];
-// }
-// }
-
-template <>
-void Sub<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
+void Sub<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
Tensor* out, Context *ctx) {
// CHECK_EQ(ctx->stream, nullptr);
auto sub_lambda_binary = [](float a, float b) {
@@ -632,28 +406,18 @@ void Sub<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
// sum all elements of input into out
// TODO(wangwei) optimize using omp
template <>
-void Sum<float, lang::Cpp>(const Tensor* in, float *out,
+void Sum<float, lang::Cpp>(const Tensor& in, float *out,
Context *ctx) {
float s = 0.f;
- const float *inPtr = static_cast<const float *>(in->block()->data());
- for (size_t i = 0; i < in->Size(); i++) {
+ const float *inPtr = static_cast<const float *>(in.block()->data());
+ for (size_t i = 0; i < in.Size(); i++) {
s += inPtr[i];
}
*out = s;
}
-// template <>
-// void Tanh<float, lang::Cpp>(const Tensor* in, Tensor* out,
-// Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = tanh(inPtr[i]);
-// }
-// }
-
template <>
-void Tanh<float, lang::Cpp>(const Tensor* in, Tensor* out,
+void Tanh<float, lang::Cpp>(const Tensor& in, Tensor* out,
Context *ctx) {
auto tanh_lambda = [](float a) {
return tanh(a);
@@ -661,17 +425,6 @@ void Tanh<float, lang::Cpp>(const Tensor* in, Tensor* out,
TraverseUnary<float>(in, out, tanh_lambda);
}
-// ===============Random operations==========================================
-// template <>
-// void Bernoulli<float, lang::Cpp>(const float p, Tensor* out,
-// Context *ctx) {
-// std::bernoulli_distribution distribution(p);
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
-// }
-// }
-
template <>
void Bernoulli<float, lang::Cpp>(const float p, Tensor* out,
Context *ctx) {
@@ -682,16 +435,6 @@ void Bernoulli<float, lang::Cpp>(const float p, Tensor* out,
}
}
-// template <>
-// void Gaussian<float, lang::Cpp>(const float mean,
-// const float std, Tensor* out, Context *ctx) {
-// std::normal_distribution<float> distribution(mean, std);
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
-// }
-// }
-
template <>
void Gaussian<float, lang::Cpp>(const float mean,
const float std, Tensor* out, Context *ctx) {
@@ -702,16 +445,6 @@ void Gaussian<float, lang::Cpp>(const float mean,
}
}
-// template <>
-// void Uniform<float, lang::Cpp>(const float low,
-// const float high, Tensor* out, Context *ctx) {
-// std::uniform_real_distribution<float> distribution(low, high);
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
-// }
-// }
-
template <>
void Uniform<float, lang::Cpp>(const float low,
const float high, Tensor* out, Context *ctx) {
@@ -727,113 +460,72 @@ void Uniform<float, lang::Cpp>(const float low,
//warning: this function may overwrite block M in place (out is indexed with M's traversal order)
template <>
void DGMM<float, lang::Cpp>(const bool side_right,
- const Tensor* M, const Tensor* v,
+ const Tensor& M, const Tensor& v,
Tensor* out, Context *ctx) {
- const float *MPtr = static_cast<const float *>(M->block()->data());
- const float *vPtr = static_cast<const float *>(v->block()->data());
+ const float *MPtr = static_cast<const float *>(M.block()->data());
+ const float *vPtr = static_cast<const float *>(v.block()->data());
float *outPtr = static_cast<float *>(out->block()->mutable_data());
- const size_t nrow = M->shape(0);
- const size_t ncol = M->shape(1);
- vector<int> traversal_info = M->generate_traversal_info();
+ const size_t nrow = M.shape(0);
+ const size_t ncol = M.shape(1);
+ vector<int> traversal_info = generate_traversal_info(M);
+ vector<int> shape_multipliers = generate_shape_multipliers(M);
if (side_right) {
for (size_t r = 0; r < nrow; r++) {
size_t offset = r * ncol;
for (size_t c = 0; c < ncol; c++) {
- outPtr[traversal_info[M->shape().size()]] = MPtr[traversal_info[M->shape().size()]] * vPtr[c];
- M->traverse_next(traversal_info, offset+c+1);
+ outPtr[traversal_info[M.shape().size()]] = MPtr[traversal_info[M.shape().size()]] * vPtr[c];
+ traverse_next(M, shape_multipliers, traversal_info, offset+c+1);
}
}
} else {
for (size_t r = 0; r < nrow; r++) {
size_t offset = r * ncol;
for (size_t c = 0; c < ncol; c++) {
- outPtr[traversal_info[M->shape().size()]] = MPtr[traversal_info[M->shape().size()]] * vPtr[r];
- M->traverse_next(traversal_info, offset+c+1);
+ outPtr[traversal_info[M.shape().size()]] = MPtr[traversal_info[M.shape().size()]] * vPtr[r];
+ traverse_next(M, shape_multipliers, traversal_info, offset+c+1);
}
}
}
}
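// in effect out = M * diag(v) when side_right is true (column c of M is
// scaled by v[c]) and out = diag(v) * M otherwise (row r is scaled by v[r]),
// with M walked via the strided traversal helpers above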
-// #ifdef USE_CBLAS
-// template <>
-// void Amax<float, lang::Cpp>(const Tensor* in, size_t *out,
-// Context *ctx) {
-// const float *inPtr = static_cast<const float *>(in->data());
-// *out = cblas_isamax(in->Size(), inPtr, 1);
-// }
-
-// template <>
-// void Asum<float, lang::Cpp>(const Tensor* in, float *out,
-// Context *ctx) {
-// const float *inPtr = static_cast<const float *>(in->data());
-// *out = cblas_sasum(in->Size(), inPtr, 1);
-// }
-
-// template <>
-// void Axpy<float, lang::Cpp>(const float alpha,
-// const Tensor* in, Tensor* out, Context *ctx) {
-// const float *inPtr = static_cast<const float *>(in->data());
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// cblas_saxpy(in->Size(), alpha, inPtr, 1, outPtr, 1);
-// }
-
-// template <>
-// void Dot<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-// float *out, Context *ctx) {
-// const float *in1Ptr = static_cast<const float *>(in1->data());
-// const float *in2Ptr = static_cast<const float *>(in2->data());
-// *out = cblas_sdot(in->Size(), in1Ptr, 1, in2Ptr, 1);
-// }
-// template <>
-// void Scale<float, lang::Cpp>(const float x, Tensor* out,
-// Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// cblas_sscal(in->Size(), x, outPtr, 1);
-// }
-// template <>
-// void Nrm2<float, lang::Cpp>(const Tensor* in, float *out,
-// Context *ctx) {
-// const float *inPtr = static_cast<const float *>(in->data());
-// *out = cblas_snrm2(in->Size(), inPtr, 1);
-// }
#ifdef USE_CBLAS
template <>
-void Amax<float, lang::Cpp>(const Tensor *in, size_t *out,
+void Amax<float, lang::Cpp>(const Tensor& in, size_t *out,
Context *ctx) {
- const float *inPtr = static_cast<const float *>(in->block()->data());
- *out = cblas_isamax(in->Size(), inPtr, 1); //not using strided traversal
+ const float *inPtr = static_cast<const float *>(in.block()->data());
+ *out = cblas_isamax(in.Size(), inPtr, 1); //not using strided traversal
}
template <>
-void Asum<float, lang::Cpp>(const Tensor *in, float *out,
+void Asum<float, lang::Cpp>(const Tensor& in, float *out,
Context *ctx) {
- const float *inPtr = static_cast<const float *>(in->block()->data());
- *out = cblas_sasum(in->Size(), inPtr, 1); //not using strided traversal
+ const float *inPtr = static_cast<const float *>(in.block()->data());
+ *out = cblas_sasum(in.Size(), inPtr, 1); //not using strided traversal
}
template <>
void Axpy<float, lang::Cpp>(const float alpha,
- const Tensor *in, Tensor *out, Context *ctx) {
+ const Tensor& in, Tensor *out, Context *ctx) {
//check input tensor for strides first
- if(in->strides() != out->strides()){
- const float *inPtr = static_cast<const float *>(in->block()->data());
+ if(in.strides() == out->strides()){
+ const float *inPtr = static_cast<const float *>(in.block()->data());
float *outPtr = static_cast<float *>(out->block()->mutable_data());
- cblas_saxpy(in->Size(), alpha, inPtr, 1, outPtr, 1);
+ cblas_saxpy(in.Size(), alpha, inPtr, 1, outPtr, 1);
} else {
LOG(FATAL) << "Axpy, input and output strides do not match." ;
}
}
template <>
-void Dot<float, lang::Cpp>(const Tensor *in1, const Tensor *in2,
+void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
float *out, Context *ctx) {
//check input tensor for strides first
- if(!(in1->transpose()) && !(in2->transpose())){
- const float *in1Ptr = static_cast<const float *>(in1->block()->data());
- const float *in2Ptr = static_cast<const float *>(in2->block()->data());
- *out = cblas_sdot(in1->Size(), in1Ptr, 1, in2Ptr, 1);
+ if(!(in1.transpose()) && !(in2.transpose())){
+ const float *in1Ptr = static_cast<const float *>(in1.block()->data());
+ const float *in2Ptr = static_cast<const float *>(in2.block()->data());
+ *out = cblas_sdot(in1.Size(), in1Ptr, 1, in2Ptr, 1);
} else {
LOG(FATAL) << "Dot, one of the input is tranposed. Not implemented yet." ;
}
@@ -847,40 +539,21 @@ void Scale<float, lang::Cpp>(const float x, Tensor *out,
}
template <>
-void Nrm2<float, lang::Cpp>(const Tensor *in, float *out,
+void Nrm2<float, lang::Cpp>(const Tensor& in, float *out,
Context *ctx) {
- const float *inPtr = static_cast<const float *>(in->block()->data());
- *out = cblas_snrm2(in->Size(), inPtr, 1); //not using strided traversal
+ const float *inPtr = static_cast<const float *>(in.block()->data());
+ *out = cblas_snrm2(in.Size(), inPtr, 1); //not using strided traversal
}
-// template <>
-// void GEMV<float, lang::Cpp>(//bool trans,
-// const std::vector<int> stridesA,
-// const size_t m, const size_t n,
-// const float alpha, const Tensor* A, const Tensor* v,
-// const float beta, Tensor* out, Context *ctx) {
-// const float *APtr = static_cast<const float *>(A->data());
-// const float *vPtr = static_cast<const float *>(v->data());
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// auto trans = (stridesA.back() == 1) ? true : false;
-// if (!trans) {
-// cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
-// beta, outPtr, 1);
-// } else {
-// cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta,
-// outPtr, 1);
-// }
-// }
-
template <>
-void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
+void GEMV<float, lang::Cpp>(const float alpha, const Tensor& A, const Tensor& v,
const float beta, Tensor *out, Context *ctx) {
- const float *APtr = static_cast<const float *>(A->block()->data());
- const float *vPtr = static_cast<const float *>(v->block()->data());
+ const float *APtr = static_cast<const float *>(A.block()->data());
+ const float *vPtr = static_cast<const float *>(v.block()->data());
float *outPtr = static_cast<float *>(out->block()->mutable_data());
- const size_t m = A->shape()[0];
- const size_t n = A->shape()[1];
- if (A->transpose()) {
+ const size_t m = A.shape()[0];
+ const size_t n = A.shape()[1];
+ if (A.transpose()) {
cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta,
outPtr, 1);
} else {
@@ -889,147 +562,36 @@ void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
}
}
-// template <>
-// void GEMM<float, lang::Cpp>(//const bool transA, const bool transB,
-// const std::vector<int> stridesA, const std::vector<int> stridesB,
-// const size_t nrowA, const size_t ncolB,
-// const size_t ncolA, const float alpha,
-// const Tensor* A, const Tensor* B, const float beta,
-// Tensor* C, Context *ctx) {
-// auto transA = (stridesA.back() == 1) ? true : false;
-// auto transa = transA ? CblasTrans : CblasNoTrans;
-// auto transB = (stridesB.back() == 1) ? true : false;
-// auto transb = transB ? CblasTrans : CblasNoTrans;
-// auto lda = transA ? nrowA : ncolA;
-// auto ldb = transB ? ncolA : ncolB;
-// auto ldc = ncolB;
-// const float *APtr = static_cast<const float *>(A->data());
-// const float *BPtr = static_cast<const float *>(B->data());
-// float *CPtr = static_cast<float *>(C->mutable_data());
-// cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
-// lda, BPtr, ldb, beta, CPtr, ldc);
-// }
-
template <>
void GEMM<float, lang::Cpp>(const float alpha,
- const Tensor *A, const Tensor *B, const float beta,
+ const Tensor& A, const Tensor& B, const float beta,
Tensor *C, Context *ctx) {
- auto transA = A->transpose();
+ auto transA = A.transpose();
auto transa = transA ? CblasTrans : CblasNoTrans;
- auto transB = B->transpose();
+ auto transB = B.transpose();
auto transb = transB ? CblasTrans : CblasNoTrans;
- const size_t nrowA = A->shape()[0];
- const size_t ncolA = A->shape()[1];
- const size_t ncolB = B->shape()[1];
+ const size_t nrowA = A.shape()[0];
+ const size_t ncolA = A.shape()[1];
+ const size_t ncolB = B.shape()[1];
auto lda = transA ? nrowA : ncolA;
auto ldb = transB ? ncolA : ncolB;
auto ldc = ncolB;
- const float *APtr = static_cast<const float *>(A->block()->data());
- const float *BPtr = static_cast<const float *>(B->block()->data());
+ const float *APtr = static_cast<const float *>(A.block()->data());
+ const float *BPtr = static_cast<const float *>(B.block()->data());
float *CPtr = static_cast<float *>(C->block()->mutable_data());
cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
lda, BPtr, ldb, beta, CPtr, ldc);
}
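// note (illustrative, not part of the original commit): lda/ldb are the
// row-major leading dimensions of the underlying buffers; a logically
// transposed A is stored as an ncolA x nrowA buffer, hence lda = nrowA
// in that case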
-#else
-
-// template <>
-// void Amax<float, lang::Cpp>(const Tensor* in, size_t *out,
-// Context *ctx) {
-// size_t maxPos = 0;
-// float maxVal = 0;
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// if (i == 0) {
-// maxVal = inPtr[i];
-// } else if (inPtr[i] > maxVal) {
-// maxVal = inPtr[i];
-// maxPos = i;
-// }
-// }
-// *out = maxPos;
-// }
-// template <>
-// void Amin<float, lang::Cpp>(const Tensor* in, size_t *out,
-// Context *ctx) {
-// size_t minPos = 0;
-// float minVal = 0;
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// if (i == 0) {
-// minVal = inPtr[i];
-// } else if (inPtr[i] > minVal) {
-// minVal = inPtr[i];
-// minPos = i;
-// }
-// }
-// *out = minPos;
-// }
-
-// template <>
-// void Asum<float, lang::Cpp>(const Tensor* in, float *out,
-// Context *ctx) {
-// float sum = 0;
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// sum += fabs(inPtr[i]);
-// }
-// }
-
-// template <>
-// void Axpy<float, lang::Cpp>(const float alpha,
-// const Tensor* in, Tensor* out, Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *inPtr = static_cast<const float *>(in->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] += alpha * inPtr[i];
-// }
-// }
-
-// template <>
-// void Scale<float, lang::Cpp>(const float x, Tensor* out,
-// Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// outPtr[i] *= x;
-// }
-// }
-
-// template <>
-// void Dot<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
-// float *out, Context *ctx) {
-// float sum = 0;
-// const float *in1Ptr = static_cast<const float *>(in1->data());
-// const float *in2Ptr = static_cast<const float *>(in2->data());
-// for (size_t i = 0; i < in->Size(); i++) {
-// sum += in1Ptr[i] * in2Ptr[i];
-// }
-// }
-
-// template <>
-// void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
-// const float alpha, const Tensor* A, const Tensor* v,
-// const float beta, Tensor* out, Context *ctx) {
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// const float *APtr = static_cast<const float *>(A->data());
-// const float *vPtr = static_cast<const float *>(v->data());
-// for (size_t r = 0; r < m; r++) {
-// float sum = 0;
-// for (size_t c = 0; c < n; c++) {
-// size_t idx = trans ? c * m + r : r * n + c;
-// sum += APtr[idx] * vPtr[c];
-// }
-// outPtr[r] = alpha * sum + beta * outPtr[r];
-// }
-// }
+#else
template <>
-void Amax<float, lang::Cpp>(const Tensor *in, size_t *out,
+void Amax<float, lang::Cpp>(const Tensor& in, size_t *out,
Context *ctx) {
size_t maxPos = 0;
float maxVal = 0;
- const float *inPtr = static_cast<const float *>(in->block()->data());
- for (size_t i = 0; i < in->Size(); i++) { //not using strided traversal
+ const float *inPtr = static_cast<const float *>(in.block()->data());
+ for (size_t i = 0; i < in.Size(); i++) { //not using strided traversal
if (i == 0) {
maxVal = inPtr[i];
} else if (inPtr[i] > maxVal) {
@@ -1040,12 +602,12 @@ void Amax<float, lang::Cpp>(const Tensor *in, size_t *out,
*out = maxPos;
}
template <>
-void Amin<float, lang::Cpp>(const Tensor *in, size_t *out,
+void Amin<float, lang::Cpp>(const Tensor& in, size_t *out,
Context *ctx) {
size_t minPos = 0;
float minVal = 0;
- const float *inPtr = static_cast<const float *>(in->block()->data());
- for (size_t i = 0; i < in->Size(); i++) { //not using strided traversal
+ const float *inPtr = static_cast<const float *>(in.block()->data());
+ for (size_t i = 0; i < in.Size(); i++) { //not using strided traversal
if (i == 0) {
minVal = inPtr[i];
} else if (inPtr[i] < minVal) {
@@ -1057,24 +619,26 @@ void Amin<float, lang::Cpp>(const Tensor *in, size_t *out,
}
template <>
-void Asum<float, lang::Cpp>(const Tensor *in, float *out,
+void Asum<float, lang::Cpp>(const Tensor& in, float *out,
Context *ctx) {
float sum = 0;
- const float *inPtr = static_cast<const float *>(in->block()->data());
- for (size_t i = 0; i < in->Size(); i++) {
+ const float *inPtr = static_cast<const float *>(in.block()->data());
+ for (size_t i = 0; i < in.Size(); i++) {
sum += fabs(inPtr[i]); //not using strided traversal
}
*out = sum;
}
template <>
void Axpy<float, lang::Cpp>(const float alpha,
- const Tensor *in, Tensor *out, Context *ctx) {
+ const Tensor& in, Tensor *out, Context *ctx) {
float *outPtr = static_cast<float *>(out->block()->mutable_data());
- const float *inPtr = static_cast<const float *>(in->block()->data());
- vector<int> traversal_info = in->generate_traversal_info();
- for (size_t i = 0; i < in->Size(); i++) {
- outPtr[i] += alpha * inPtr[traversal_info[in->shape().size()]];
- in->traverse_next(traversal_info, i+1);
+ const float *inPtr = static_cast<const float *>(in.block()->data());
+ vector<int> traversal_info = generate_traversal_info(in);
+ vector<int> shape_multipliers = generate_shape_multipliers(in);
+
+ for (size_t i = 0; i < in.Size(); i++) {
+ outPtr[i] += alpha * inPtr[traversal_info[in.shape().size()]];
+ traverse_next(in, shape_multipliers, traversal_info, i+1);
}
}
@@ -1088,35 +652,38 @@ void Scale<float, lang::Cpp>(const float x, Tensor *out,
}
template <>
-void Dot<float, lang::Cpp>(const Tensor *in1, const Tensor *in2,
+void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
float *out, Context *ctx) {
float sum = 0;
- // const float *in1Ptr = static_cast<const float *>(in1->data());
- // const float *in2Ptr = static_cast<const float *>(in2->data());
- // for (size_t i = 0; i < in->Size(); i++) {
+ // const float *in1Ptr = static_cast<const float *>(in1.data());
+ // const float *in2Ptr = static_cast<const float *>(in2.data());
+ // for (size_t i = 0; i < in.Size(); i++) {
// sum += in1Ptr[i] * in2Ptr[i];
// }
- const float *in1Ptr = static_cast<const float *>(in1->block()->data());
- const float *in2Ptr = static_cast<const float *>(in2->block()->data());
- vector<int> traversal_info_in1 = in1->generate_traversal_info();
- vector<int> traversal_info_in2 = in2->generate_traversal_info();
- for (size_t i = 0; i < in1->Size(); i++) {
- sum += in1Ptr[traversal_info_in1[in1->shape().size()]] * in2Ptr[traversal_info_in2[in2->shape().size()]];
- in1->traverse_next(traversal_info_in1, i+1);
- in2->traverse_next(traversal_info_in2, i+1);
+ const float *in1Ptr = static_cast<const float *>(in1.block()->data());
+ const float *in2Ptr = static_cast<const float *>(in2.block()->data());
+ vector<int> traversal_info_in1 = generate_traversal_info(in1);
+ vector<int> traversal_info_in2 = generate_traversal_info(in2);
+ vector<int> shape_multipliers_in1 = generate_shape_multipliers(in1);
+ vector<int> shape_multipliers_in2 = generate_shape_multipliers(in2);
+
+ for (size_t i = 0; i < in1.Size(); i++) {
+ sum += in1Ptr[traversal_info_in1[in1.shape().size()]] * in2Ptr[traversal_info_in2[in2.shape().size()]];
+ traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i+1);
+ traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i+1);
}
*out = sum;
}
template <>
-void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
+void GEMV<float, lang::Cpp>(const float alpha, const Tensor& A, const Tensor& v,
const float beta, Tensor *out, Context *ctx) {
float *outPtr = static_cast<float *>(out->block()->mutable_data());
- const float *APtr = static_cast<const float *>(A->block()->data());
- const float *vPtr = static_cast<const float *>(v->block()->data());
- bool trans = A->transpose();
- const size_t m = A->shape(0);
- const size_t n = A->shape(1);
+ const float *APtr = static_cast<const float *>(A.block()->data());
+ const float *vPtr = static_cast<const float *>(v.block()->data());
+ bool trans = A.transpose();
+ const size_t m = A.shape(0);
+ const size_t n = A.shape(1);
for (size_t r = 0; r < m; r++) {
float sum = 0;
for (size_t c = 0; c < n; c++) {
@@ -1189,34 +756,21 @@ void SoftmaxCrossEntropyBwd<float, lang::Cpp>(bool int_target,
}
}
-// template <>
-// void RowMax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
-// const Tensor* in, Tensor* out, Context *ctx) {
-// const float *inPtr = static_cast<const float *>(in->data());
-// float *outPtr = static_cast<float *>(out->mutable_data());
-// for (size_t r = 0; r < nrow; r++) {
-// int offset = (int)(r * ncol);
-// float maxval = inPtr[offset];
-// for (size_t c = 1; c < ncol; c++)
-// maxval = (std::max)(maxval, inPtr[offset + c]);
-// outPtr[r] = maxval;
-// }
-// }
-
template <>
-void RowMax<float, lang::Cpp>(const Tensor *in, Tensor *out, Context *ctx) {
- const float *inPtr = static_cast<const float *>(in->block()->data());
+void RowMax<float, lang::Cpp>(const Tensor& in, Tensor *out, Context *ctx) {
+ const float *inPtr = static_cast<const float *>(in.block()->data());
float *outPtr = static_cast<float *>(out->block()->mutable_data());
- const size_t nrow = in->shape()[0];
- const size_t ncol = in->shape()[1];
- vector<int> traversal_info = in->generate_traversal_info();
+ const size_t nrow = in.shape()[0];
+ const size_t ncol = in.shape()[1];
+ vector<int> traversal_info = generate_traversal_info(in);
+ vector<int> shape_multipliers = generate_shape_multipliers(in);
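+ // counter_offset below keeps the element counter global across rows so
+ // that traverse_next can detect row boundaries via the shape multipliers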
for (size_t r = 0; r < nrow; r++) {
int counter_offset = (r * ncol);
float maxval = -FLT_MAX;
for (size_t c = 0; c < ncol; c++){
- maxval = (std::max)(maxval, inPtr[traversal_info[in->shape().size()]]);
- in->traverse_next(traversal_info, counter_offset+c+1);
+ maxval = (std::max)(maxval, inPtr[traversal_info[in.shape().size()]]);
+ traverse_next(in, shape_multipliers, traversal_info, counter_offset+c+1);
}
outPtr[r] = maxval;
}
@@ -1226,11 +780,11 @@ void RowMax<float, lang::Cpp>(const Tensor *in, Tensor *out, Context *ctx) {
/*
template <>
void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
- const Tensor* A, const Tensor* v, Tensor* out,
+ const Tensor& A, const Tensor& v, Tensor* out,
Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
- const float *APtr = static_cast<const float *>(A->data());
- const float *vPtr = static_cast<const float *>(v->data());
+ const float *APtr = static_cast<const float *>(A.data());
+ const float *vPtr = static_cast<const float *>(v.data());
for (size_t r = 0; r < nrow; r++) {
size_t offset = r * ncol;
for (size_t c = 0; c < ncol; c++) {
@@ -1241,11 +795,11 @@ void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
template <>
void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
- const Tensor* A, const Tensor* v, Tensor* out,
+ const Tensor& A, const Tensor& v, Tensor* out,
Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
- const float *APtr = static_cast<const float *>(A->data());
- const float *vPtr = static_cast<const float *>(v->data());
+ const float *APtr = static_cast<const float *>(A.data());
+ const float *vPtr = static_cast<const float *>(v.data());
for (size_t r = 0; r < nrow; r++) {
size_t offset = r * ncol;
for (size_t c = 0; c < ncol; c++) {
@@ -1254,11 +808,11 @@ void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
}
}
template <>
-void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Tensor* in1,
- const Tensor* in2, Tensor* out, Context *ctx) {
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Tensor& in1,
+ const Tensor& in2, Tensor* out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
- const float *in1Ptr = static_cast<const float *>(in1->data());
- const float *in2Ptr = static_cast<const float *>(in2->data());
+ const float *in1Ptr = static_cast<const float *>(in1.data());
+ const float *in2Ptr = static_cast<const float *>(in2.data());
for (size_t r = 0; r < m; r++) {
size_t offset = r * n;
for (size_t c = 0; c < n; c++) {
@@ -1268,9 +822,9 @@ void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Tensor* in1,
}
template <>
void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
- const Tensor* in, Tensor* out, Context *ctx) {
+ const Tensor& in, Tensor* out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
+ const float *inPtr = static_cast<const float *>(in.data());
float *bPtr = new float[ncol];
for (size_t r = 0; r < nrow; r++) {
size_t offset = r * ncol;
@@ -1289,9 +843,9 @@ void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
template <>
void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
- const Tensor* in, Tensor* out, Context *ctx) {
+ const Tensor& in, Tensor* out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
+ const float *inPtr = static_cast<const float *>(in.data());
for (size_t c = 0; c < ncol; c++) {
outPtr[c] = 0.f;
}
@@ -1305,9 +859,9 @@ void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
template <>
void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
- const Tensor* in, Tensor* out, Context *ctx) {
+ const Tensor& in, Tensor* out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
+ const float *inPtr = static_cast<const float *>(in.data());
for (size_t r = 0; r < nrow; r++) {
size_t offset = r * ncol;
outPtr[r] = 0.f;
[08/10] incubator-singa git commit: Streamlining of tensor.h file by
moving respective member functions to cpp or cuda file. Removal of
shape_multipliers_ attribute in tensor.h. Changed read-in tensors to be
passed as reference instead of pointer
Posted by wa...@apache.org.
Streamlining of tensor.h file by moving respective member functions to cpp or cuda file. Removal of shape_multipliers_ attribute in tensor.h. Changed read-in tensors to be passed as reference instead of pointer
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/c52e2aa3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/c52e2aa3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/c52e2aa3
Branch: refs/heads/master
Commit: c52e2aa3b5272750960ce6d3ae9f14bad1cee397
Parents: a44d2e7
Author: Vaan Ng <cm...@gmail.com>
Authored: Sun May 13 00:24:40 2018 +0800
Committer: Vaan Ng <cm...@gmail.com>
Committed: Sun May 13 00:24:40 2018 +0800
----------------------------------------------------------------------
include/singa/core/tensor.h | 152 +----
src/core/tensor/tensor.cc | 60 +-
src/core/tensor/tensor_math.h | 124 ++--
src/core/tensor/tensor_math_cpp.h | 1012 +++++++++----------------------
src/core/tensor/tensor_math_cuda.h | 499 ++++++++-------
5 files changed, 647 insertions(+), 1200 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c52e2aa3/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index b94a982..e25aafd 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -22,7 +22,6 @@
#include <vector>
#include <tuple>
#include <memory>
-#include <algorithm>
#include "singa/core/common.h"
#include "singa/core/device.h"
@@ -31,7 +30,6 @@
using std::vector;
using std::tuple;
-using std::reverse;
namespace singa {
typedef vector<size_t> Shape;
@@ -104,43 +102,6 @@ class Tensor {
return shape_.at(idx);
}
- /*
- cudnn requires tensor dimensions to fulfill 1 requirement:
- 1.) Dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors
- if input tensor is 5d, cudnn will take a 5d tensor as input. Beyond 5d, certain operations are not supported.
- (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
-
- for e.g. Tensor A has shape {3,3}, cudnn requires shape of {1,1,3,3} to be the input
- Tensor B has shape (2,3,4), cudnn requires shape of {1,2,3,4} to be the input
- */
- vector<int> generate_shape_cuda() const {
- vector<int> shape_arr;
- if(shape_.size() <= 4){
- for (size_t n=0; n<4-shape_.size(); ++n) {
- shape_arr.push_back(1);
- }
- for (size_t n=0; n<shape_.size(); ++n) {
- shape_arr.push_back(shape_.at(n));
- }
- return shape_arr;
- } else if(shape_.size() == 5){
- for (size_t n=0; n<shape_.size(); ++n) {
- shape_arr.push_back(shape_.at(n));
- }
- return shape_arr;
- } else {
- LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
- }
- }
-
- int generate_dim_cuda() const {
- if(shape_.size() <= 4){return 4;}
- else if(shape_.size() == 5){return 5;}
- else{
- LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
- }
- }
-
size_t nDim() const { return shape_.size(); }
bool empty() const { return nDim() == 0; }
@@ -150,40 +111,6 @@ class Tensor {
const vector<int>& strides() const { return strides_; }
- /*
- cudnn requires stride dimensions to conform to the format of the shape input as well
- 1.) Stride dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors
- If input tensor is 5d, cudnn will take a 5d tensor as input. Beyond 5d, certain operations are not supported.
- (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
-
- for e.g. Tensor A has shape {3,3}, stride {3,1}, cudnn requires shape {1,1,3,3} and stride {9, 9, 3, 1} or {9, 9, 1, 3} to be the inputs
- */
- vector<int> generate_strides_cuda() const {
- vector<int> strides_arr;
- int product = 1;
- for (size_t n=0; n<(shape_.size()); ++n) {
- product *= shape_[n];
- }
- if(shape_.size() <= 4){
- for (size_t n=0; n<4-shape_.size(); ++n) {
- strides_arr.push_back(product);
- }
- for (size_t n=0; n<strides_.size(); ++n) {
- strides_arr.push_back(strides_[n]);
- }
- return strides_arr;
- } else if(shape_.size() == 5){
- for (size_t n=0; n<strides_.size(); ++n) {
- strides_arr.push_back(strides_[n]);
- }
- return strides_arr;
- } else {
- LOG(FATAL) << "Dimensions (strides) beyond 5 are currently not supported" ;
- }
- }
-
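
A worked example of the padding these two removed helpers perform (per the commit message they move into the cpp/cuda files rather than disappear): cudnn wants at least 4 dimensions, so leading shape entries are filled with 1 and leading stride entries with the total element count. Below is a sketch of the shape half only, with an illustrative name; the 5-d and error branches are omitted.

// shape   {3, 3} -> {1, 1, 3, 3}
// strides {3, 1} -> {9, 9, 3, 1}   (9 = 3 * 3, the total element count)
#include <vector>
std::vector<int> pad_shape_for_cudnn(const std::vector<int> &shape) {
  std::vector<int> out(shape.size() < 4 ? 4 - shape.size() : 0, 1);
  out.insert(out.end(), shape.begin(), shape.end());
  return out;
}
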
- const vector<int>& shape_multipliers() const { return shape_multipliers_; }
-
/// return true if the content of the tensor is initialized
bool initailized() const {
return block_ != nullptr && block_->initialized();
@@ -292,7 +219,7 @@ class Tensor {
float L2() const;
//generate strides automatically if stride field is not passed
-void Generate_Strides(){
+void generate_strides(){
if(shape_.size()==0){
strides_ = {1};
return void();
@@ -306,84 +233,11 @@ void Generate_Strides(){
}
};
-void Set_Strides(const vector<int> new_strides){
+void set_strides(const vector<int> new_strides){
strides_ = new_strides;
}
-//generate shape multipliers
-//for e.g. tensor of shape (3,3), stride (1,3) will have shape multipliers of (3,1)
-//for e.g. tensor of shape (3,3), stride (3,1) will also have shape multipliers of (3,1)
-//this means that the 3rd, 6th, and 9th index of the array will always be the starting element of their respective rows
-//so we need to use the inner stride when jumping from the 1st to the 2nd element, and the outer stride when jumping from the 2nd to the 3rd
-vector<int> Generate_Shape_Multipliers(Shape y_shape) const {
- if(y_shape.size()==0){
- return {1};
- }
- reverse(y_shape.begin(), y_shape.end());
- vector<int> shape_multipliers = {};
- int cumulative_product = 1;
-
- shape_multipliers.push_back(1);
- for (size_t n=0; n<(y_shape.size()-1); ++n) {
- cumulative_product = cumulative_product*y_shape[n];
- shape_multipliers.push_back(cumulative_product);
- }
- reverse(shape_multipliers.begin(), shape_multipliers.end());
- return shape_multipliers;
-};
-
-// ******************************************************************************************
-// Some traversal operations (works on const declarations without modifying tensor variables)
-// ******************************************************************************************
-
-//generate a traversal_info vector based on the tensor's shape for the traverse_next function to work
-vector<int> generate_traversal_info() const {
- vector<int> traversal_info = {};
- for(size_t n=0; n<(shape_.size()+2); ++n) {
- traversal_info.push_back(0);
- }
- return traversal_info;
-};
-
-//this function checks whether the next index falls on a special multiplier of the outer shape
-//so the algorithm knows when to jump over/back to a starting element of the outer shape
-//for e.g. in [[1,4,7], [2,5,8], [3,6,9]], elements 1,2,3 are the starting elements of their respective rows
-//this additional check only has 1 loop for 2d matrix
-//but runtime performance might degrade to O(nlog(n)) for higher dimensional tensors
-int determine_order(int counter) const {
- for (size_t n=0; n<(shape_multipliers_.size()-1); ++n) {
- if((counter%shape_multipliers_[n])==0){
- return ((shape_multipliers_.size()) - 1 - n);
- }
- }
- return 0;
-};
-
-//this function updates the base indexes with the current index after every single traversal step, can be generalized beyond 2d cases
-void update_base_index(std::vector<int>& traversal_info) const {
- for (int n=0; n<(traversal_info[shape_.size()+1]+1); ++n) {
- traversal_info[n] = traversal_info[shape_.size()];
- }
-};
-
-//function to traverse a const strided tensor object
-//it requires an additional vector, traversal_info {0,0,0,0 ...}, comprising (shape_.size()+2) elements of 0
-//for e.g. 2d matrix:
-//index 0 and 1 store the base row and column index respectively
-//index 2 stores the current index of the traversal
-//index 3 stores the order of the traversal for e.g. if the order is 0, it means the next element can be navigated to using the innermost stride
-void traverse_next(std::vector<int>& traversal_info, int counter) const {
- update_base_index(traversal_info);
- traversal_info[shape_.size()+1] = determine_order(counter);
- traversal_info[shape_.size()] = traversal_info[traversal_info[shape_.size()+1]]+strides_[strides_.size()-traversal_info[shape_.size()+1]-1];
-};
-
-// ******************************************************************************************
-// traversal operations end
-// ******************************************************************************************
-
protected:
- //bool transpose_ = false;
DataType data_type_ = kFloat32;
std::shared_ptr<Device> device_ = nullptr;
/// Note: block_ is allocated in lazy manner to avoid frequent malloc/free.
@@ -391,8 +245,6 @@ void traverse_next(std::vector<int>& traversal_info, int counter) const {
Block *block_ = nullptr;
Shape shape_ = {};
vector<int> strides_ = {};
- vector<int> shape_multipliers_ = {};
-
}; //end of tensor class
typedef Shape::iterator ShapeIter;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c52e2aa3/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 9067242..a4efd64 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -34,7 +34,6 @@ Tensor::~Tensor() {
Tensor::Tensor() {
device_ = defaultDevice;
strides_ = {1};
- shape_multipliers_ = {1};
}
//non-strided constructors
@@ -43,16 +42,14 @@ Tensor::Tensor(const Shape &shape, DataType dtype)
size_t size = Product(shape_) * SizeOf(data_type_);
if (size)
block_ = device_->NewBlock((int)size);
- Generate_Strides();
- shape_multipliers_ = Generate_Shape_Multipliers(shape_);
+ generate_strides();
}
Tensor::Tensor(Shape &&shape, DataType dtype)
: data_type_(dtype), device_(defaultDevice), shape_(shape) {
size_t size = Product(shape_) * SizeOf(data_type_);
if (size)
block_ = device_->NewBlock((int)size);
- Generate_Strides();
- shape_multipliers_ = Generate_Shape_Multipliers(shape_);
+ generate_strides();
}
//non-strided constructors with device
@@ -62,16 +59,14 @@ Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device,
size_t size = Product(shape_) * SizeOf(data_type_);
if (size)
block_ = device_->NewBlock((int)size);
- Generate_Strides();
- shape_multipliers_ = Generate_Shape_Multipliers(shape_);
+ generate_strides();
}
Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype)
: data_type_(dtype), device_(device), shape_(shape) {
size_t size = Product(shape_) * SizeOf(data_type_);
if (size)
block_ = device_->NewBlock((int)size);
- Generate_Strides();
- shape_multipliers_ = Generate_Shape_Multipliers(shape_);
+ generate_strides();
}
@@ -81,8 +76,7 @@ Tensor::Tensor(const Tensor &in)
device_(in.device_),
block_(in.block()),
shape_(in.shape_),
- strides_(in.strides_),
- shape_multipliers_(in.shape_multipliers_) {
+ strides_(in.strides_) {
if (block_ != nullptr)
block_->IncRefCount();
}
@@ -95,7 +89,6 @@ Tensor::Tensor(const Tensor &in, Shape &new_shape, vector<int> &new_strides)
block_(in.block()),
shape_(new_shape),
strides_(new_strides) {
- shape_multipliers_ = Generate_Shape_Multipliers(shape_);
if (block_ != nullptr)
block_->IncRefCount();
}
@@ -105,8 +98,7 @@ Tensor::Tensor(Tensor &&in)
data_type_(in.data_type_),
device_(in.device_),
shape_(std::move(in.shape_)),
- strides_(in.strides_),
- shape_multipliers_(in.shape_multipliers_) {
+ strides_(in.strides_) {
block_ = in.block_;
in.block_ = nullptr;
}
@@ -129,7 +121,6 @@ void Tensor::ResetLike(const Tensor &in) {
}
shape_ = in.shape_;
strides_ = in.strides_;
- shape_multipliers_ = in.shape_multipliers_;
}
//if tensor is not transposed yet, i.e. strides == 1, then we simply change the shape and generate new default strides
@@ -146,8 +137,7 @@ void Tensor::Reshape(const Shape &shape) {
LOG(FATAL) << "Reshape Error: Reshape called on tranposed tensor. Not implemented yet." ;
}
shape_ = shape;
- Generate_Strides();
- shape_multipliers_ = Generate_Shape_Multipliers(shape_);
+ generate_strides();
}
void Tensor::Reshape(Shape &&shape) {
@@ -162,8 +152,7 @@ void Tensor::Reshape(Shape &&shape) {
LOG(FATAL) << "Reshape Error: Reshape called on tranposed tensor. Not implemented yet." ;
}
shape_ = std::move(shape);
- Generate_Strides();
- shape_multipliers_ = Generate_Shape_Multipliers(shape_);
+ generate_strides();
}
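
A usage sketch of the rule stated in the comment above the Reshape hunks; the demo function is illustrative only.

#include "singa/core/tensor.h"
using singa::Shape;
using singa::Tensor;
void reshape_demo() {
  Tensor a(Shape{2, 3});     // contiguous, default strides
  a.Reshape(Shape{3, 2});    // ok: same block, strides regenerated
  Tensor t = a.Transpose();  // strided view sharing a's block
  // t.Reshape(Shape{6});    // would LOG(FATAL): reshape of a transposed
                             //   view is not implemented yet
}
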
void Tensor::AsType(const DataType type) {
@@ -350,7 +339,6 @@ Tensor Tensor::T() const {
t.strides_.clear();
t.strides_.push_back(strides_[1]);
t.strides_.push_back(strides_[0]);
- t.shape_multipliers_ = Generate_Shape_Multipliers(t.shape_);
t.block_ = block_;
block_->IncRefCount();
return t;
@@ -359,7 +347,7 @@ Tensor Tensor::T() const {
//normal transpose without axes
Tensor Tensor::Transpose() const {
// if(shape_.size() != strides_.size())
- // Generate_Strides();
+ // generate_strides();
Tensor t;
t.device_ = device_;
@@ -369,7 +357,6 @@ Tensor Tensor::Transpose() const {
t.shape_.push_back(shape_[shape_.size()-n-1]);
t.strides_.push_back(strides_[shape_.size()-n-1]);
}
- t.shape_multipliers_ = Generate_Shape_Multipliers(t.shape_);
t.block_ = block_;
block_->IncRefCount();
return t;
@@ -382,7 +369,7 @@ Tensor Tensor::Transpose(Shape axes) const {
// return void();
// }
// if(shape_.size() != strides_.size())
- // Generate_Strides();
+ // generate_strides();
Tensor t;
t.device_ = device_;
@@ -392,7 +379,6 @@ Tensor Tensor::Transpose(Shape axes) const {
t.shape_.push_back(shape_[axes[n]]);
t.strides_.push_back(strides_[axes[n]]);
}
- t.shape_multipliers_ = Generate_Shape_Multipliers(t.shape_);
t.block_ = block_;
block_->IncRefCount();
return t;
@@ -564,7 +550,7 @@ float Tensor::L1() const {
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
device_->Exec([&nrm, this](Context *ctx) {
DType ret = DType(0);
- Asum<DType, Lang>(this, &ret, ctx);
+ Asum<DType, Lang>(*this, &ret, ctx);
nrm = TypeCast<DType, float>(ret);
}, {this->block()}, {});
});
@@ -577,7 +563,7 @@ float Tensor::L2() const {
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
device_->Exec([&nrm, this](Context *ctx) {
DType ret = DType(0);
- Nrm2<DType, Lang>(this, &ret, ctx);
+ Nrm2<DType, Lang>(*this, &ret, ctx);
nrm = TypeCast<DType, float>(ret);
}, {this->block()}, {});
});
@@ -603,7 +589,7 @@ template void Tensor::SetValue<int>(const int x);
do { \
TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
ret->device()->Exec([t, ret](Context * ctx) { \
- fn<DType, Lang>(&t, ret, ctx); \
+ fn<DType, Lang>(t, ret, ctx); \
}, {t.block()}, {ret->block()}); \
}); \
} while (0)
@@ -632,7 +618,7 @@ GenUnaryTensorFn(Tanh);
TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, { \
CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type())); \
ret->device()->Exec([lhs, rhs, ret](Context * ctx) { \
- fn<DType, Lang>(&lhs, &rhs, ret, \
+ fn<DType, Lang>(lhs, rhs, ret, \
ctx); \
}, {lhs.block(), rhs.block()}, {ret->block()}); \
}); \
@@ -663,7 +649,7 @@ GenBinaryTensorFn(operator>=, GE);
static_assert(std::is_same<SType, DType>::value, \
"The Scalar type must match the Tensor data type"); \
ret->device()->Exec([t, x, ret](Context * ctx) { \
- fn<DType, Lang>(&t, x, ret, ctx); \
+ fn<DType, Lang>(t, x, ret, ctx); \
}, {t.block()}, {ret->block()}); \
}); \
} while (0)
@@ -706,7 +692,7 @@ void Div(const SType alpha, const Tensor &in, Tensor *out) {
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
// TODO(wangwei) type cast SType to DType;
in.device()->Exec([alpha, in, out](Context *ctx) {
- Div<DType, Lang>(alpha, &in, out, ctx);
+ Div<DType, Lang>(alpha, in, out, ctx);
}, {in.block()}, {out->block()});
});
}
@@ -743,7 +729,7 @@ float Sum<float>(const Tensor &in) {
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
one.device()->Exec([in, one, &s](Context *ctx) {
DType ret = DType(0);
- Dot<DType, Lang>(&in, &one, &ret, ctx);
+ Dot<DType, Lang>(in, one, &ret, ctx);
s = ret;
}, {in.block(), one.block()}, {});
});
@@ -776,7 +762,7 @@ Tensor RowMax(const Tensor &in) {
//size_t nrow = 1;
//if (in.nDim() > 1) nrow = in.shape(0);
//size_t ncol = in.Size() / nrow;
- RowMax<DType, Lang>(&in, &ret, ctx);
+ RowMax<DType, Lang>(in, &ret, ctx);
}, {in.block()}, {ret.block()});
});
return ret;
@@ -1012,7 +998,7 @@ void MultColumn(const Tensor &v, Tensor *M) {
CheckDataTypeAndLang(*M, v);
TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
v.device()->Exec([M, v](Context *ctx) {
- DGMM<DType, Lang>(false, M, &v,
+ DGMM<DType, Lang>(false, *M, v,
M, ctx);
}, {M->block(), v.block()}, {M->block()});
});
@@ -1027,7 +1013,7 @@ void MultRow(const Tensor &v, Tensor *M) {
CheckDataTypeAndLang(*M, v);
TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
v.device()->Exec([M, v](Context *ctx) {
- DGMM<DType, Lang>(true, M, &v,
+ DGMM<DType, Lang>(true, *M, v,
M, ctx);
}, {M->block(), v.block()}, {M->block()});
});
@@ -1113,7 +1099,7 @@ void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
auto a = TypeCast<SType, DType>(alpha);
out->device()->Exec([a, in, out](Context *ctx) {
- Axpy<DType, Lang>(a, &in, out, ctx);
+ Axpy<DType, Lang>(a, in, out, ctx);
}, {in.block(), out->block()}, {out->block()});
});
}
@@ -1143,7 +1129,7 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
auto a = TypeCast<SType, DType>(alpha);
auto b = TypeCast<SType, DType>(beta);
C->device()->Exec([a, A, b, B, C](Context *ctx) {
- GEMV<DType, Lang>(a, &A, &B, b, C, ctx);
+ GEMV<DType, Lang>(a, A, B, b, C, ctx);
}, {A.block(), B.block()}, {C->block()});
});
} else {
@@ -1152,7 +1138,7 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
auto a = TypeCast<SType, DType>(alpha);
auto b = TypeCast<SType, DType>(beta);
C->device()->Exec([a, A, b, B, C](Context *ctx) {
- GEMM<DType, Lang>(a, &A, &B, b, C,
+ GEMM<DType, Lang>(a, A, B, b, C,
ctx);
}, {A.block(), B.block()}, {C->block()});
});
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c52e2aa3/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index c403f30..c7fdfe5 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -40,7 +40,7 @@ namespace singa {
/// 4. Function argument names, use 'num' for total number of elements in
/// elementwise operations; use 'in1' 'in2' for in Tensors; use 'out' for
/// output Tensor or value. With exceptions for some functions, e.g.,
-/// Scale(const float alpha, const Tensor* in, Tensor* out);
+/// Scale(const float alpha, const Tensor &in, Tensor* out);
/// For such cases, use x, v, alpha, etc for scalar types.
/// For blas functions, follow the blas style for argument names.
/// Use 'M' and 'v' for matrix and vector tensors in functions involving both
@@ -50,37 +50,6 @@ namespace singa {
/// 7. Use size_t for the number of elements, rows or columns.
/// 8. Use the same name for the Tensor and Tensor level math functions.
-// template <typename DType>
-// void TraverseUnary(const Tensor* in, Tensor* out, std::function<DType(DType)> func){}
-
-// template <typename DType>
-// void TraverseBinary(const Tensor* in1, const Tensor* in2, Tensor* out, std::function<DType(DType, DType)> func){}
-
-template <typename DType>
-void TraverseUnary(const Tensor* in, Tensor* out, std::function<DType(DType)> func){
- DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
- const DType *inPtr = static_cast<const DType *>(in->block()->data());
- vector<int> traversal_info = in->generate_traversal_info();
- for (size_t i = 0; i < in->Size(); i++) {
- outPtr[i] = func(inPtr[traversal_info[in->shape().size()]]);
- in->traverse_next(traversal_info, i+1);
- }
-}
-
-template <typename DType>
-void TraverseBinary(const Tensor* in1, const Tensor* in2, Tensor* out, std::function<DType(DType, DType)> func){
- DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
- const DType *in1Ptr = static_cast<const DType *>(in1->block()->data());
- const DType *in2Ptr = static_cast<const DType *>(in2->block()->data());
- vector<int> traversal_info_in1 = in1->generate_traversal_info();
- vector<int> traversal_info_in2 = in2->generate_traversal_info();
- for (size_t i = 0; i < in1->Size(); i++) {
- outPtr[i] = func(in1Ptr[traversal_info_in1[in1->shape().size()]], in2Ptr[traversal_info_in2[in2->shape().size()]]);
- in1->traverse_next(traversal_info_in1, i+1);
- in2->traverse_next(traversal_info_in2, i+1);
- }
-}
-
// **************************************
// Element-wise functions
@@ -88,41 +57,41 @@ void TraverseBinary(const Tensor* in1, const Tensor* in2, Tensor* out, std::func
/// out[i] = |in[i]|
template <typename DType, typename Lang>
-void Abs(const Tensor *in, Tensor *out, Context *ctx) {
+void Abs(const Tensor &in, Tensor *out, Context *ctx) {
LOG(FATAL) << "Abs Not Implemented";
}
/// out[i] = in[i] + x
template <typename DType, typename Lang>
-void Add(const Tensor *in, const DType x, Tensor *out,
+void Add(const Tensor &in, const DType x, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Add Not Implemented";
}
/// out[i] = in1[i] + in2[i]
template <typename DType, typename Lang>
-void Add(const Tensor *in1, const Tensor *in2, Tensor *out,
+void Add(const Tensor &in1, const Tensor &in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Add-Pair Not Implemented";
}
/// Clamp every element into [low, high]
/// if in[i]>high, then out[i]=high; if in[i]<low, then out[i]=low.
template <typename DType, typename Lang>
-void Clamp(const DType low, const DType high, const Tensor *in,
+void Clamp(const DType low, const DType high, const Tensor &in,
Tensor *out, Context *ctx) {
LOG(FATAL) << "Clamp Not Implemented";
}
/// out[i] = x / in[i]
template <typename DType, typename Lang>
-void Div(const DType x, const Tensor *in, Tensor *out,
+void Div(const DType x, const Tensor &in, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Div Not Implemented";
}
/// out[i] = in[i] / x
template <typename DType, typename Lang>
-void Div(const Tensor *in, const DType x, Tensor *out,
+void Div(const Tensor &in, const DType x, Tensor *out,
Context *ctx) {
CHECK_NE(x, 0.f);
EltwiseMult<DType, Lang>(in, DType(1) / x, out, ctx);
@@ -130,101 +99,101 @@ void Div(const Tensor *in, const DType x, Tensor *out,
/// out[i] = in1[i] / in2[i]
template <typename DType, typename Lang>
-void Div(const Tensor *in1, const Tensor *in2, Tensor *out,
+void Div(const Tensor &in1, const Tensor &in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Div-Pair Not Implemented";
}
/// out[i] = in[i] * x
template <typename DType, typename Lang>
-void EltwiseMult(const Tensor *in, const DType x, Tensor *out,
+void EltwiseMult(const Tensor &in, const DType x, Tensor *out,
Context *ctx) {
LOG(FATAL) << "EltwiseMult Not Implemented";
}
/// out[i] = in1[i] * in2[i]
template <typename DType, typename Lang>
-void EltwiseMult(const Tensor *in1, const Tensor *in2, Tensor *out,
+void EltwiseMult(const Tensor &in1, const Tensor &in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
}
/// Exponential with base e (the Neper number): out[i]=exp(in[i])
template <typename DType, typename Lang>
-void Exp(const Tensor *in, Tensor *out, Context *ctx) {
+void Exp(const Tensor &in, Tensor *out, Context *ctx) {
LOG(FATAL) << "Exp Not Implemented";
}
/// out[i]=(in[i]<=x)?1.f:0.f
template <typename DType, typename Lang>
-void LE(const Tensor *in, const DType x, Tensor *out,
+void LE(const Tensor &in, const DType x, Tensor *out,
Context *ctx) {
LOG(FATAL) << "LE Not Implemented";
}
/// out[i]=(in1[i]<=in2[i])?1.f:0.f
template <typename DType, typename Lang>
-void LE(const Tensor *in1, const Tensor *in2, Tensor *out,
+void LE(const Tensor &in1, const Tensor &in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Tensor-Tensor LE Not Implemented";
}
/// Natural logarithm, the base is e (the Neper number): out[i]=log(in[i]).
template <typename DType, typename Lang>
-void Log(const Tensor *in, Tensor *out, Context *ctx) {
+void Log(const Tensor &in, Tensor *out, Context *ctx) {
LOG(FATAL) << "Log Not Implemented";
}
/// out[i]=(in[i]<x)?1.f:0.f
template <typename DType, typename Lang>
-void LT(const Tensor *in, const DType x, Tensor *out,
+void LT(const Tensor &in, const DType x, Tensor *out,
Context *ctx) {
LOG(FATAL) << "LT Not Implemented";
}
/// out[i]=(in1[i]<in2[i])?1.f:0.f
template <typename DType, typename Lang>
-void LT(const Tensor *in1, const Tensor *in2, Tensor *out,
+void LT(const Tensor &in1, const Tensor &in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Tensor-Tensor LT Not Implemented";
}
/// out[i]=(in[i]>=x)?1.f:0.f
template <typename DType, typename Lang>
-void GE(const Tensor *in, const DType x, Tensor *out,
+void GE(const Tensor &in, const DType x, Tensor *out,
Context *ctx) {
LOG(FATAL) << "GE Not Implemented";
}
/// out[i]=(in1[i]>=in2[i])?1.f:0.f
template <typename DType, typename Lang>
-void GE(const Tensor *in1, const Tensor *in2, Tensor *out,
+void GE(const Tensor &in1, const Tensor &in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Tensor-Tensor GE Not Implemented";
}
/// out[i]=(in[i]>x)?1.f:0.f
template <typename DType, typename Lang>
-void GT(const Tensor *in, const DType x, Tensor *out,
+void GT(const Tensor &in, const DType x, Tensor *out,
Context *ctx) {
LOG(FATAL) << "GT Not Implemented";
}
/// out[i]=(in[i]>in2[i])?1.f:0.f
template <typename DType, typename Lang>
-void GT(const Tensor *in, const Tensor *in2, Tensor *out,
+void GT(const Tensor &in, const Tensor &in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Tensor-Tensor GT Not Implemented";
}
/// out[i] = pow(in[i], x)
template <typename DType, typename Lang>
-void Pow(const Tensor *in, const DType x, Tensor *out,
+void Pow(const Tensor &in, const DType x, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Pow Not Implemented";
}
/// out[i]=pow(in1[i], in2[i])
template <typename DType, typename Lang>
-void Pow(const Tensor *in1, const Tensor *in2, Tensor *out,
+void Pow(const Tensor &in1, const Tensor &in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Pow-Pair Not Implemented";
}
/// out[i]=max(0, in[i])
template <typename DType, typename Lang>
-void ReLU(const Tensor *in, Tensor *out, Context *ctx) {
+void ReLU(const Tensor &in, Tensor *out, Context *ctx) {
LOG(FATAL) << "ReLU Not Implemented";
}
@@ -235,50 +204,50 @@ void Set(const DType x, Tensor *out, Context *ctx) {
}
/// out[i]=sigmoid(in[i])
template <typename DType, typename Lang>
-void Sigmoid(const Tensor *in, Tensor *out, Context *ctx) {
+void Sigmoid(const Tensor &in, Tensor *out, Context *ctx) {
LOG(FATAL) << "Sigmoid Not Implemented";
}
/// out[i] = sign(in[i])
template <typename DType, typename Lang>
-void Sign(const Tensor *in, Tensor *out, Context *ctx) {
+void Sign(const Tensor &in, Tensor *out, Context *ctx) {
LOG(FATAL) << "Sign Not Implemented";
}
/// out[i]=sqrt(in[i])
template <typename DType, typename Lang>
-void Sqrt(const Tensor *in, Tensor *out, Context *ctx) {
+void Sqrt(const Tensor &in, Tensor *out, Context *ctx) {
LOG(FATAL) << "Sqrt Not Implemented";
}
/// out[i]=square(in[i])
template <typename DType, typename Lang>
-void Square(const Tensor *in, Tensor *out, Context *ctx) {
+void Square(const Tensor &in, Tensor *out, Context *ctx) {
EltwiseMult<DType, Lang>(in, in, out, ctx);
}
/// out[i] = in[i] - x
template <typename DType, typename Lang>
-void Sub(const Tensor *in, const DType x, Tensor *out,
+void Sub(const Tensor &in, const DType x, Tensor *out,
Context *ctx) {
Add<DType, Lang>(in, -x, out, ctx);
}
/// out[i] = in1[i] - in2[i]
template <typename DType, typename Lang>
-void Sub(const Tensor *in1, const Tensor *in2, Tensor *out,
+void Sub(const Tensor &in1, const Tensor &in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Sub-Pair Not Implemented";
}
/// sum all elements of in into out
template <typename DType, typename Lang>
-void Sum(const Tensor *in, DType *out, Context *ctx) {
+void Sum(const Tensor &in, DType *out, Context *ctx) {
LOG(FATAL) << "Sum Not Implemented";
}
/// out[i]=tanh(in[i])
template <typename DType, typename Lang>
-void Tanh(const Tensor *in, Tensor *out, Context *ctx) {
+void Tanh(const Tensor &in, Tensor *out, Context *ctx) {
LOG(FATAL) << "Tanh Not Implemented";
}
@@ -313,31 +282,31 @@ void Uniform(const float low, const float high, Tensor *out,
/// Return the index of the element with the max value.
template <typename DType, typename Lang>
-void Amax(const Tensor *in, size_t *out, Context *ctx) {
+void Amax(const Tensor &in, size_t *out, Context *ctx) {
LOG(FATAL) << "Amax Not Implemented";
}
/// Return the index of the element with the min value.
template <typename DType, typename Lang>
-void Amin(const Tensor *in, size_t *out, Context *ctx) {
+void Amin(const Tensor &in, size_t *out, Context *ctx) {
LOG(FATAL) << "Amin Not Implemented";
}
/// out = sum |x| for all x in in
template <typename DType, typename Lang>
-void Asum(const Tensor *in, DType *out, Context *ctx) {
+void Asum(const Tensor &in, DType *out, Context *ctx) {
LOG(FATAL) << "Asum Not Implemented";
}
/// out = alpha * in + out
template <typename DType, typename Lang>
-void Axpy(const DType alpha, const Tensor *in, Tensor *out,
+void Axpy(const DType alpha, const Tensor &in, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Axpy Not Implemented";
}
/// out = ||in||_2^2, i.e, L2 norm.
template <typename DType, typename Lang>
-void Nrm2(const Tensor *in, float *out, Context *ctx) {
+void Nrm2(const Tensor &in, float *out, Context *ctx) {
LOG(FATAL) << "Nrm2 Not Implemented";
}
@@ -349,7 +318,7 @@ void Scale(const DType x, Tensor *out, Context *ctx) {
/// inner product of array in1 and in2
template <typename DType, typename Lang>
-void Dot(const Tensor *in1, const Tensor *in2, DType *out,
+void Dot(const Tensor &in1, const Tensor &in2, DType *out,
Context *ctx) {
LOG(FATAL) << "Dot Not Implemented";
}
@@ -358,7 +327,7 @@ void Dot(const Tensor *in1, const Tensor *in2, DType *out,
/// transA indicates if the internal data layout is transposed of A
template <typename DType, typename Lang>
void GEMV(const DType alpha,
- const Tensor *A, const Tensor *v, const DType beta, Tensor *out,
+ const Tensor &A, const Tensor &v, const DType beta, Tensor *out,
Context *ctx) {
LOG(FATAL) << "GEMV Not Implemented";
}
@@ -367,7 +336,7 @@ void GEMV(const DType alpha,
/// if matrix_lef_side is true, do M*v; else do v*M
template <typename DType, typename Lang>
void DGMM(const bool side_right,
- const Tensor *M, const Tensor *v, Tensor *out, Context *ctx) {
+ const Tensor &M, const Tensor &v, Tensor *out, Context *ctx) {
LOG(FATAL) << "DGMM Not Implemented";
}
@@ -375,7 +344,7 @@ void DGMM(const bool side_right,
/// transA indicates if the internal data layout is transposed of A
template <typename DType, typename Lang>
void GEMM(const DType alpha,
- const Tensor *A, const Tensor *B, const DType beta, Tensor *C,
+ const Tensor &A, const Tensor &B, const DType beta, Tensor *C,
Context *ctx) {
LOG(FATAL) << "GEMM Not Implemented";
}
@@ -396,7 +365,7 @@ void SoftmaxCrossEntropyBwd(bool int_target, const size_t batchsize,
}
template <typename DType, typename Lang>
-void RowMax(const Tensor *in, Tensor *out, Context* ctx) {
+void RowMax(const Tensor &in, Tensor *out, Context* ctx) {
LOG(FATAL) << "Not Implemented";
}
// **************************************
@@ -405,28 +374,28 @@ void RowMax(const Tensor *in, Tensor *out, Context* ctx) {
/*
/// Add the vector v to every column of A as the column of out
template <typename DType, typename Lang>
-void AddCol(const size_t nrow, const size_t ncol, const Tensor *A, const Tensor *v,
+void AddCol(const size_t nrow, const size_t ncol, const Tensor &A, const Tensor &v,
Tensor *out, Context *ctx) {
LOG(FATAL) << "AddCol Not Implemented";
}
// TODO(wangwei) unify AddRow and AddCol.
/// Add the vector v to every row of A as the row of out
template <typename DType, typename Lang>
-void AddRow(const size_t nrow, const size_t ncol, const Tensor *A, const Tensor *v,
+void AddRow(const size_t nrow, const size_t ncol, const Tensor &A, const Tensor &v,
Tensor *out, Context *ctx) {
LOG(FATAL) << "AddRow Not Implemented";
}
/// outer-product.
/// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
template <typename DType, typename Lang>
-void Outer(const size_t m, const size_t n, const Tensor *in1, const Tensor *in2,
+void Outer(const size_t m, const size_t n, const Tensor &in1, const Tensor &in2,
Tensor *out, Context *ctx) {
LOG(FATAL) << "Outer Not Implemented";
}
/// Sum the columns of the in matrix into a vector
template <typename DType, typename Lang>
-void SumColumns(const size_t nrow, const size_t ncol, const Tensor *in, Tensor *out,
+void SumColumns(const size_t nrow, const size_t ncol, const Tensor &in, Tensor *out,
Context *ctx) {
LOG(FATAL) << "SumColumns Not Implemented";
}
@@ -438,10 +407,11 @@ void Set(const DType x, Tensor *out, Context *ctx) {
// TODO(wangwei) unify SumRow and SumCol.
/// Sum the rows of the in matrix into a vector
template <typename DType, typename Lang>
-void SumRows(const size_t nrow, const size_t ncol, const Tensor *in, Tensor *out,
+void SumRows(const size_t nrow, const size_t ncol, const Tensor &in, Tensor *out,
Context *ctx) {
LOG(FATAL) << "SumRows Not Implemented";
}
*/
+
} // namespace singa
#endif // SINGA_CORE_MATH_H_
[02/10] incubator-singa git commit: Singa-341 Added stride
functionality to tensors for CPP
Posted by wa...@apache.org.
Singa-341 Added stride functionality to tensors for CPP
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/a88efa00
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/a88efa00
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/a88efa00
Branch: refs/heads/master
Commit: a88efa00c425f610c54a359e597ecaa82d41ff25
Parents: 060e7df
Author: Vaan Ng <cm...@gmail.com>
Authored: Tue Apr 17 20:09:19 2018 +0800
Committer: Vaan Ng <cm...@gmail.com>
Committed: Tue Apr 17 20:09:19 2018 +0800
----------------------------------------------------------------------
include/singa/core/tensor.h | 118 +++-
src/core/tensor/tensor.cc | 199 ++++--
src/core/tensor/tensor_math.h | 173 +++--
src/core/tensor/tensor_math_cpp.h | 1199 ++++++++++++++++++++++++--------
src/proto/core.proto | 21 +-
5 files changed, 1275 insertions(+), 435 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a88efa00/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 6621fa0..6eafbdf 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -22,6 +22,7 @@
#include <vector>
#include <tuple>
#include <memory>
+#include <algorithm>
#include "singa/core/common.h"
#include "singa/core/device.h"
@@ -30,6 +31,7 @@
using std::vector;
using std::tuple;
+using std::reverse;
namespace singa {
typedef vector<size_t> Shape;
@@ -58,12 +60,14 @@ class Tensor {
Tensor();
explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
explicit Tensor(const Shape &shape, DataType dtype = kFloat32);
+
Tensor(Shape &&shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
- Tensor(const Shape &shape, std::shared_ptr<Device> dev,
- DataType dtype = kFloat32);
+ Tensor(const Shape &shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
/// Copy Tensor to share the internal data. No deep copy.
Tensor(const Tensor &from);
+ /// Copy Tensor to share the internal data. No deep copy. For 2 tensors sharing same block but different strides.
+ Tensor(const Tensor &from, Shape &new_shape, vector<int> &new_strides);
/// Copy Tensor to share the internal data. No deep copy.
Tensor(Tensor &&from);
@@ -104,7 +108,12 @@ class Tensor {
bool empty() const { return nDim() == 0; }
- bool transpose() const { return transpose_; }
+ //bool transpose() const { return transpose_; }
+ bool transpose() const { return (strides_[0] != 1); }
+
+ const vector<int>& strides() const { return strides_; }
+
+ const vector<int>& shape_multipliers() const { return shape_multipliers_; }
/// return true if the content of the tensor is initialized
bool initailized() const {
@@ -171,6 +180,10 @@ class Tensor {
/// No data copy, just set the transpose_ filed of the returned tensor.
Tensor T() const;
+ Tensor Transpose() const;
+
+ Tensor Transpose(Shape axes) const;
+
/// Copy the meta info with data block shared.
Tensor &operator=(const Tensor &in);
@@ -209,15 +222,106 @@ class Tensor {
/// Return average L2 norm
float L2() const;
+ //generate strides automatically if stride field is not passed
+void Generate_Strides(){
+ if(shape_.size()==0){
+ strides_ = {1};
+ return void();
+ }
+ strides_.clear();
+ size_t dim = Size();
+ int cumulative_product = 1;
+ for (size_t n=0; n<shape_.size(); ++n) {
+ cumulative_product = cumulative_product*shape_[n];
+ strides_.push_back(dim/cumulative_product);
+ }
+ reverse(strides_.begin(), strides_.end());
+};
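
A trace of the loop above for shape_ = {2, 3, 4} (Size() = 24), as a worked example:

  n=0: cumulative_product = 2   -> push 24/2  = 12
  n=1: cumulative_product = 6   -> push 24/6  = 4
  n=2: cumulative_product = 24  -> push 24/24 = 1
  after reverse: strides_ = {1, 4, 12}

The innermost stride is stored first in this commit's ordering, which is why the transpose() accessor above tests strides_[0] != 1.
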
+
+//generate shape multipliers
+//for e.g. tensor of shape (3,3), stride (1,3) will have shape multipliers of (3,1)
+//for e.g. tensor of shape (3,3), stride (3,1) will also have shape multipliers of (3,1)
+//this means that the 3rd, 6th, and 9th index of the array will always be the starting element of their respective rows
+//so we need to use the inner stride when jumping from the 1st to the 2nd element, and the outer stride when jumping from the 2nd to the 3rd
+vector<int> Generate_Shape_Multipliers(Shape y_shape) const {
+ if(y_shape.size()==0){
+ return {1};
+ }
+ reverse(y_shape.begin(), y_shape.end());
+ vector<int> shape_multipliers = {};
+ int cumulative_product = 1;
+
+ shape_multipliers.push_back(1);
+ for (size_t n=0; n<(y_shape.size()-1); ++n) {
+ cumulative_product = cumulative_product*y_shape[n];
+ shape_multipliers.push_back(cumulative_product);
+ }
+ reverse(shape_multipliers.begin(), shape_multipliers.end());
+ return shape_multipliers;
+};
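
Likewise, tracing Generate_Shape_Multipliers for y_shape = {2, 3, 4}:

  reversed shape -> {4, 3, 2}
  push 1; cumulative 4 -> push 4; cumulative 12 -> push 12  => {1, 4, 12}
  after reverse: shape multipliers = {12, 4, 1}

Reading: every 4th traversal counter starts a new row and every 12th a new 3x4 slice, which is exactly the modulo test determine_order performs below.
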
+
+// ******************************************************************************************
+// Some traversal operations (works on const declarations without modifying tensor variables)
+// ******************************************************************************************
+
+//generate a traversal_info vector based on the tensor's shape for the traverse_next function to work
+vector<int> generate_traversal_info() const {
+ vector<int> traversal_info = {};
+ for(size_t n=0; n<(shape_.size()+2); ++n) {
+ traversal_info.push_back(0);
+ }
+ return traversal_info;
+};
+
+//this function checks whether the next index falls on a special multiplier of the outer shape
+//so the algorithm knows when to jump over/back to a starting element of the outer shape
+//for e.g. in [[1,4,7], [2,5,8], [3,6,9]], elements 1,2,3 are the starting elements of their respective rows
+//this additional check only has 1 loop for 2d matrix
+//but runtime performance might degrade to O(nlog(n)) for higher dimensional tensors
+int determine_order(int counter) const {
+ for (size_t n=0; n<(shape_multipliers_.size()-1); ++n) {
+ if((counter%shape_multipliers_[n])==0){
+ return ((shape_multipliers_.size()) - 1 - n);
+ }
+ }
+ return 0;
+};
+
+//this function updates the base indexes with the current index after every single traversal step, can be generalized beyond 2d cases
+void update_base_index(std::vector<int>& traversal_info) const {
+ for (int n=0; n<(traversal_info[shape_.size()+1]+1); ++n) {
+ traversal_info[n] = traversal_info[shape_.size()];
+ }
+};
+
+//function to traverse a const strided tensor object
+//it requires an additional vector, traversal_info {0,0,0,0 ...}, comprising (shape_.size()+2) elements of 0
+//for e.g. 2d matrix:
+//index 0 and 1 store the base row and column index respectively
+//index 2 stores the current index of the traversal
+//index 3 stores the order of the traversal for e.g. if the order is 0, it means the next element can be navigated to using the innermost stride
+void traverse_next(std::vector<int>& traversal_info, int counter) const {
+ update_base_index(traversal_info);
+ traversal_info[shape_.size()+1] = determine_order(counter);
+ traversal_info[shape_.size()] = traversal_info[traversal_info[shape_.size()+1]]+strides_[traversal_info[shape_.size()+1]];
+};
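
A hedged usage sketch of the three traversal helpers above, mirroring how the strided math kernels iterate a (possibly transposed) tensor; the free function and the raw data pointer are assumptions for illustration.

#include <vector>
#include "singa/core/tensor.h"
// visit every element of a const strided tensor in logical order
void walk(const singa::Tensor &t, const float *data) {
  std::vector<int> info = t.generate_traversal_info();  // (nDim + 2) zeros
  for (size_t i = 0; i < t.Size(); i++) {
    float v = data[info[t.shape().size()]];  // info[nDim] holds the flat index
    (void)v;                                 // consume the element here
    t.traverse_next(info, i + 1);            // advance along the strides
  }
}
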
+
+// ******************************************************************************************
+// traversal operations end
+// ******************************************************************************************
+
protected:
- bool transpose_ = false;
+ //bool transpose_ = false;
DataType data_type_ = kFloat32;
std::shared_ptr<Device> device_ = nullptr;
/// Note: block_ is allocated in lazy manner to avoid frequent malloc/free.
/// If you want to get an allocated Block, use block() instead of block_.
Block *block_ = nullptr;
Shape shape_ = {};
-};
+ vector<int> strides_ = {};
+ vector<int> shape_multipliers_ = {};
+
+}; //end of tensor class
typedef Shape::iterator ShapeIter;
inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
@@ -452,12 +556,16 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
/// each instance, t[i] could be 2 or [0, 0, 1]. If one instance could have
/// multiple labels, then t[i] could be [1, 0, 1].
/// The loss is computed into p.
+
void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss);
+
/// Compute the dx, given prediction probability 'p' (p=softmax(x)) and
/// the target (ground truth) labels 't'. 'p' and 't' are either 1-d vector
/// or 2-d matrix. 'grad' has the same shape as 'p'. dx is computed into p.
+
void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p);
+
/// Return a tensor consisting of rows ([start, end)) from 'in'. It copies the
/// values from 'in'. 'in' ia a 2D Tensor.
Tensor CopyRows(const Tensor &in, const size_t start, const size_t end);
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a88efa00/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index ed4da96..48751ef 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -21,6 +21,7 @@
#include "./tensor_math_cuda.h"
#include "./tensor_math_opencl.h"
#include <utility>
+#include <iostream>
namespace singa {
@@ -30,52 +31,87 @@ Tensor::~Tensor() {
block_ = nullptr;
}
-Tensor::Tensor() { device_ = defaultDevice; }
+Tensor::Tensor() {
+ device_ = defaultDevice;
+ strides_ = {1};
+ shape_multipliers_ = {1};
+}
+//non-strided constructors
Tensor::Tensor(const Shape &shape, DataType dtype)
: data_type_(dtype), device_(defaultDevice), shape_(shape) {
size_t size = Product(shape_) * SizeOf(data_type_);
if (size)
block_ = device_->NewBlock((int)size);
+ Generate_Strides();
+ shape_multipliers_ = Generate_Shape_Multipliers(shape_);
}
Tensor::Tensor(Shape &&shape, DataType dtype)
: data_type_(dtype), device_(defaultDevice), shape_(shape) {
size_t size = Product(shape_) * SizeOf(data_type_);
if (size)
block_ = device_->NewBlock((int)size);
+ Generate_Strides();
+ shape_multipliers_ = Generate_Shape_Multipliers(shape_);
}
+
+//non-strided constructors with device
Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device,
DataType dtype)
: data_type_(dtype), device_(device), shape_(shape) {
size_t size = Product(shape_) * SizeOf(data_type_);
if (size)
block_ = device_->NewBlock((int)size);
+ Generate_Strides();
+ shape_multipliers_ = Generate_Shape_Multipliers(shape_);
}
Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype)
: data_type_(dtype), device_(device), shape_(shape) {
size_t size = Product(shape_) * SizeOf(data_type_);
if (size)
block_ = device_->NewBlock((int)size);
+ Generate_Strides();
+ shape_multipliers_ = Generate_Shape_Multipliers(shape_);
}
+
+
Tensor::Tensor(const Tensor &in)
- : transpose_(in.transpose_),
+ : //transpose_(in.transpose_),
+ data_type_(in.data_type_),
+ device_(in.device_),
+ block_(in.block()),
+ shape_(in.shape_),
+ strides_(in.strides_),
+ shape_multipliers_(in.shape_multipliers_) {
+ if (block_ != nullptr)
+ block_->IncRefCount();
+}
+
+//strided constructor taking in a tensor, shape and strides
+Tensor::Tensor(const Tensor &in, Shape &new_shape, vector<int> &new_strides)
+ : //transpose_(in.transpose_),
data_type_(in.data_type_),
device_(in.device_),
block_(in.block()),
- shape_(in.shape_) {
+ shape_(new_shape),
+ strides_(new_strides) {
+ shape_multipliers_ = Generate_Shape_Multipliers(shape_);
if (block_ != nullptr)
block_->IncRefCount();
}
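
An illustrative use of the strided constructor above: building a 2-d transposed view by hand, which is what T() further down does internally. Names are for the sketch only.

#include <vector>
#include "singa/core/tensor.h"
using singa::Shape;
using singa::Tensor;
Tensor transposed_view(const Tensor &in) {                // in: shape {m, n}
  Shape s{in.shape(1), in.shape(0)};                      // swap dimensions
  std::vector<int> st{in.strides()[1], in.strides()[0]};  // swap strides
  return Tensor(in, s, st);                               // shares in's block
}
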
Tensor::Tensor(Tensor &&in)
- : transpose_(in.transpose_),
+ : //transpose_(in.transpose_),
data_type_(in.data_type_),
device_(in.device_),
- shape_(std::move(in.shape_)) {
+ shape_(std::move(in.shape_)),
+ strides_(in.strides_),
+ shape_multipliers_(in.shape_multipliers_) {
block_ = in.block_;
in.block_ = nullptr;
}
+
void Tensor::SetBlock(Block *block) {
LOG(WARNING) << "Pls avoid using this function, which may have side-effect.";
if (block_ != nullptr)
@@ -92,24 +128,46 @@ void Tensor::ResetLike(const Tensor &in) {
block_ = device_->NewBlock((int)in.MemSize());
}
shape_ = in.shape_;
+ strides_ = in.strides_;
+ shape_multipliers_ = in.shape_multipliers_;
}
+//yisen todo
+//if tensor is not transposed yet, i.e. strides == 1, then we simply change the shape and generate new default strides
+//if tensor is already transposed, i.e. strides != 1, it should be copied to a new tensor with newly generated default strides
+
void Tensor::Reshape(const Shape &shape) {
+ if(strides_.size()==0)
+ strides_.push_back(1);
+
if (Product(shape_) != Product(shape)) {
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
+ } else if (strides_[0] != 1) {
+ std::cout << "Reshape Error: Tranposed tensor must return new tensor. Not implemented yet." << std::endl;
+ return void();
}
shape_ = shape;
+ Generate_Strides();
+ shape_multipliers_ = Generate_Shape_Multipliers(shape_);
}
void Tensor::Reshape(Shape &&shape) {
+ if(strides_.size()==0)
+ strides_.push_back(1);
+
if (Product(shape_) != Product(shape)) {
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
+ } else if (strides_[0] != 1) {
+ std::cout << "Reshape Error: Tranposed tensor must return new tensor. Not implemented yet." << std::endl;
+ return void();
}
shape_ = std::move(shape);
+ Generate_Strides();
+ shape_multipliers_ = Generate_Shape_Multipliers(shape_);
}
void Tensor::AsType(const DataType type) {
@@ -177,7 +235,9 @@ void Tensor::FromProto(const singa::TensorProto &proto) {
for (uint32_t s : proto.shape()) shape.push_back(s);
data_type_ = proto.data_type();
Reshape(shape);
- transpose_ = proto.transpose();
+ //transpose_ = proto.transpose();
+ strides_.clear();
+ for (int32_t s : proto.strides()) strides_.push_back(s);
switch (data_type_) {
case kFloat32: {
std::unique_ptr<float[]> data_ptr(new float[Product(shape_)]);
@@ -226,7 +286,11 @@ void Tensor::ToProto(singa::TensorProto *proto) const {
proto->add_shape(s);
}
proto->set_data_type(data_type_);
- proto->set_transpose(transpose_);
+ //proto->set_transpose(transpose_);
+ proto->clear_strides();
+ for (auto s : strides_) {
+ proto->add_strides(s);
+ }
switch (data_type_) {
case kFloat32: {
proto->clear_float_data();
@@ -272,19 +336,67 @@ void Tensor::ToProto(singa::TensorProto *proto) const {
Tensor Tensor::Clone(std::shared_ptr<Device> device) const {
if (device == nullptr) device = device_;
Tensor t(shape_, device_, data_type_);
- t.transpose_ = transpose_;
+ //t.transpose_ = transpose_;
+ t.strides_ = strides_;
t.CopyData(*this);
return t;
}
+//yisen todo
Tensor Tensor::T() const {
+ // this function only works for 2d tensors
CHECK_EQ(shape_.size(), 2u);
Tensor t;
t.device_ = device_;
t.data_type_ = data_type_;
- t.transpose_ = !transpose_;
t.shape_.push_back(shape_[1]);
t.shape_.push_back(shape_[0]);
+ t.strides_.clear();
+ t.strides_.push_back(strides_[1]);
+ t.strides_.push_back(strides_[0]);
+ t.shape_multipliers_ = Generate_Shape_Multipliers(t.shape_);
+ t.block_ = block_;
+ block_->IncRefCount();
+ return t;
+}
+
+//normal transpose without axes
+Tensor Tensor::Transpose() const {
+ // if(shape_.size() != strides_.size())
+ // Generate_Strides();
+
+ Tensor t;
+ t.device_ = device_;
+ t.data_type_ = data_type_;
+ t.strides_.clear();
+ for(size_t n=0; n<shape_.size(); ++n){
+ t.shape_.push_back(shape_[shape_.size()-n-1]);
+ t.strides_.push_back(strides_[shape_.size()-n-1]);
+ }
+ t.shape_multipliers_ = Generate_Shape_Multipliers(t.shape_);
+ t.block_ = block_;
+ block_->IncRefCount();
+ return t;
+}
+
+//transpose with axes
+Tensor Tensor::Transpose(Shape axes) const {
+ // if(axes.size() != shape_.size()){
+ // std::cout << "Warning: Size of input axes doesn't match size of shape" << std::endl;
+ // return void();
+ // }
+ // if(shape_.size() != strides_.size())
+ // Generate_Strides();
+
+ Tensor t;
+ t.device_ = device_;
+ t.data_type_ = data_type_;
+ t.strides_.clear();
+ for(size_t n=0; n<axes.size(); ++n){
+ t.shape_.push_back(shape_[axes[n]]);
+ t.strides_.push_back(strides_[axes[n]]);
+ }
+ t.shape_multipliers_ = Generate_Shape_Multipliers(t.shape_);
t.block_ = block_;
block_->IncRefCount();
return t;
@@ -294,7 +406,8 @@ Tensor &Tensor::operator=(const Tensor &in) {
// LOG(ERROR) << "= const &";
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
- transpose_ = in.transpose_;
+ //transpose_ = in.transpose_;
+ strides_ = in.strides_;
data_type_ = in.data_type_;
shape_ = in.shape_;
device_ = in.device_;
@@ -308,7 +421,8 @@ Tensor &Tensor::operator=(Tensor &&in) {
// LOG(ERROR) << "= &&";
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
- transpose_ = in.transpose_;
+ //transpose_ = in.transpose_;
+ strides_ = in.strides_;
data_type_ = in.data_type_;
shape_ = std::move(in.shape_);
device_ = in.device_;
@@ -317,6 +431,7 @@ Tensor &Tensor::operator=(Tensor &&in) {
return *this;
}
+//yisen todo
Tensor Reshape(const Tensor &in, const Shape &s) {
Tensor out(in);
out.Reshape(s);
@@ -373,7 +488,7 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
(int)s_offset);
} else if (src_dev->lang() == kCpp) {
dst_dev->CopyDataToFrom(to, from, nBytes, kHostToDevice, (int)d_offset,
- (int)s_offset);
+ (int)s_offset);
} else {
LOG(FATAL) << "Not support mem copy betwee Cuda and OpenCL device";
}
@@ -453,7 +568,7 @@ float Tensor::L1() const {
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
device_->Exec([&nrm, this](Context *ctx) {
DType ret = DType(0);
- Asum<DType, Lang>(this->Size(), this->block(), &ret, ctx);
+ Asum<DType, Lang>(this, &ret, ctx);
nrm = TypeCast<DType, float>(ret);
}, {this->block()}, {});
});
@@ -466,7 +581,7 @@ float Tensor::L2() const {
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
device_->Exec([&nrm, this](Context *ctx) {
DType ret = DType(0);
- Nrm2<DType, Lang>(this->Size(), this->block(), &ret, ctx);
+ Nrm2<DType, Lang>(this, &ret, ctx);
nrm = TypeCast<DType, float>(ret);
}, {this->block()}, {});
});
@@ -476,12 +591,12 @@ float Tensor::L2() const {
template <typename SType>
void Tensor::SetValue(const SType x) {
CHECK_EQ(sizeof(SType), SizeOf(data_type_));
- auto size = Size();
+ //auto size = Size();
auto ptr = block_;
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
// TODO(wangwei) cast x to DType
- device_->Exec([size, x, ptr](Context *ctx) {
- Set<DType, Lang>(size, x, ptr, ctx);
+ device_->Exec([this, x, ptr](Context *ctx) {
+ Set<DType, Lang>(x, this, ctx);
}, {}, {ptr});
});
}
@@ -492,7 +607,7 @@ template void Tensor::SetValue<int>(const int x);
do { \
TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
ret->device()->Exec([t, ret](Context * ctx) { \
- fn<DType, Lang>(t.Size(), t.block(), ret->block(), ctx); \
+ fn<DType, Lang>(&t, ret, ctx); \
}, {t.block()}, {ret->block()}); \
}); \
} while (0)
@@ -521,7 +636,7 @@ GenUnaryTensorFn(Tanh);
TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, { \
CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type())); \
ret->device()->Exec([lhs, rhs, ret](Context * ctx) { \
- fn<DType, Lang>(lhs.Size(), lhs.block(), rhs.block(), ret->block(), \
+ fn<DType, Lang>(&lhs, &rhs, ret, \
ctx); \
}, {lhs.block(), rhs.block()}, {ret->block()}); \
}); \
@@ -552,7 +667,7 @@ GenBinaryTensorFn(operator>=, GE);
static_assert(std::is_same<SType, DType>::value, \
"The Scalar type must match the Tensor data type"); \
ret->device()->Exec([t, x, ret](Context * ctx) { \
- fn<DType, Lang>(t.Size(), t.block(), x, ret->block(), ctx); \
+ fn<DType, Lang>(&t, x, ret, ctx); \
}, {t.block()}, {ret->block()}); \
}); \
} while (0)
@@ -595,7 +710,7 @@ void Div(const SType alpha, const Tensor &in, Tensor *out) {
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
// TODO(wangwei) type cast SType to DType;
in.device()->Exec([alpha, in, out](Context *ctx) {
- Div<DType, Lang>(in.Size(), alpha, in.block(), out->block(), ctx);
+ Div<DType, Lang>(alpha, &in, out, ctx);
}, {in.block()}, {out->block()});
});
}
@@ -632,7 +747,7 @@ float Sum<float>(const Tensor &in) {
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
one.device()->Exec([in, one, &s](Context *ctx) {
DType ret = DType(0);
- Dot<DType, Lang>(in.Size(), in.block(), one.block(), &ret, ctx);
+ Dot<DType, Lang>(&in, &one, &ret, ctx);
s = ret;
}, {in.block(), one.block()}, {});
});
@@ -661,11 +776,11 @@ Tensor SoftMax(const Tensor &in) {
Tensor RowMax(const Tensor &in) {
Tensor ret({in.shape(0)}, in.device(), in.data_type());
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
- in.device()->Exec([in, ret](Context *ctx) {
- size_t nrow = 1;
- if (in.nDim() > 1) nrow = in.shape(0);
- size_t ncol = in.Size() / nrow;
- RowMax<DType, Lang>(nrow, ncol, in.block(), ret.block(), ctx);
+ in.device()->Exec([&in, &ret](Context *ctx) {
+ //size_t nrow = 1;
+ //if (in.nDim() > 1) nrow = in.shape(0);
+ //size_t ncol = in.Size() / nrow;
+ RowMax<DType, Lang>(&in, &ret, ctx);
}, {in.block()}, {ret.block()});
});
return ret;
@@ -708,13 +823,13 @@ void AddColumn(const SType alpha, const SType beta, const Tensor &v,
Tensor vmat = Reshape(v, Shape{nb_row, 1});
Mult(alpha, vmat, one, beta, M);
}
-}
+}
template
void AddColumn(const float alpha, const float beta, const Tensor &v, Tensor *M);
void AddRow(const Tensor &v, Tensor *M) { AddRow(1, 1, v, M); }
-/// Sub column 'v' by each column of matrix M; write results into 'out'
+/// Add row 'v' to each row of matrix M; write results into 'M'
template <typename SType>
void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M) {
if (M->transpose()) {
@@ -894,30 +1009,30 @@ void DivRow(const Tensor &v, Tensor *M) {
/// Multiply column 'v' with each column of matrix M; write results into 'out'
void MultColumn(const Tensor &v, Tensor *M) {
- CHECK(!M->transpose()) << "Not supported yet";
+ //CHECK(!M->transpose()) << "Not supported yet";
CHECK_EQ(M->nDim(), 2u);
// CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple
CHECK_EQ(v.Size(), M->shape(0));
CheckDataTypeAndLang(*M, v);
TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
v.device()->Exec([M, v](Context *ctx) {
- DGMM<DType, Lang>(false, M->shape(0), M->shape(1), M->block(), v.block(),
- M->block(), ctx);
+ DGMM<DType, Lang>(false, M, &v,
+ M, ctx);
}, {M->block(), v.block()}, {M->block()});
});
}
/// Multiply row 'v' with each row of matrix M; write results into 'out'
void MultRow(const Tensor &v, Tensor *M) {
- CHECK(!M->transpose()) << "Not supported yet";
+ //CHECK(!M->transpose()) << "Not supported yet";
CHECK_EQ(M->nDim(), 2u);
// CHECK_EQ(v.nDim(), 1u); (chonho) shape of v is 2-element tuple
CHECK_EQ(v.Size(), M->shape(1));
CheckDataTypeAndLang(*M, v);
TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
v.device()->Exec([M, v](Context *ctx) {
- DGMM<DType, Lang>(true, M->shape(0), M->shape(1), M->block(), v.block(),
- M->block(), ctx);
+ DGMM<DType, Lang>(true, M, &v,
+ M, ctx);
}, {M->block(), v.block()}, {M->block()});
});
}
@@ -963,7 +1078,7 @@ void Bernoulli(const SType p, Tensor *out) {
TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
auto prob = TypeCast<SType, DType>(p);
out->device()->Exec([prob, out](Context *ctx) {
- Bernoulli<DType, Lang>(out->Size(), prob, out->block(), ctx);
+ Bernoulli<DType, Lang>(prob, out, ctx);
}, {}, {out->block()}, true);
});
}
@@ -976,7 +1091,7 @@ void Uniform(const SType low, const SType high, Tensor *out) {
auto l = TypeCast<SType, DType>(low);
auto h = TypeCast<SType, DType>(high);
out->device()->Exec([l, h, out](Context *ctx) {
- Uniform<DType, Lang>(out->Size(), l, h, out->block(), ctx);
+ Uniform<DType, Lang>(l, h, out, ctx);
}, {}, {out->block()}, true);
});
}
@@ -989,7 +1104,7 @@ void Gaussian(const SType mean, const SType std, Tensor *out) {
auto m = TypeCast<SType, DType>(mean);
auto s = TypeCast<SType, DType>(std);
out->device()->Exec([m, s, out](Context *ctx) {
- Gaussian<DType, Lang>(out->Size(), m, s, out->block(), ctx);
+ Gaussian<DType, Lang>(m, s, out, ctx);
}, {}, {out->block()}, true);
});
}
@@ -1002,7 +1117,7 @@ void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
auto a = TypeCast<SType, DType>(alpha);
out->device()->Exec([a, in, out](Context *ctx) {
- Axpy<DType, Lang>(in.Size(), a, in.block(), out->block(), ctx);
+ Axpy<DType, Lang>(a, &in, out, ctx);
}, {in.block(), out->block()}, {out->block()});
});
}
@@ -1032,8 +1147,7 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
auto a = TypeCast<SType, DType>(alpha);
auto b = TypeCast<SType, DType>(beta);
C->device()->Exec([a, A, b, B, C](Context *ctx) {
- GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), a, A.block(),
- B.block(), b, C->block(), ctx);
+ GEMV<DType, Lang>(a, &A, &B, b, C, ctx);
}, {A.block(), B.block()}, {C->block()});
});
} else {
@@ -1042,8 +1156,7 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
auto a = TypeCast<SType, DType>(alpha);
auto b = TypeCast<SType, DType>(beta);
C->device()->Exec([a, A, b, B, C](Context *ctx) {
- GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0), B.shape(1),
- A.shape(1), a, A.block(), B.block(), b, C->block(),
+ GEMM<DType, Lang>(a, &A, &B, b, C,
ctx);
}, {A.block(), B.block()}, {C->block()});
});
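For readers tracing the macro changes above: below is a rough hand-expansion (not part of the commit) of what the updated unary-op plumbing generates for a single op, using Tanh purely as an illustration. GenUnaryTensorFn/EltwiseUnaryTensorFn emit the real code, and TYPE_LANG_SWITCH and Device::Exec are the existing SINGA utilities assumed here; the substantive change is that the backend kernel now receives whole Tensor pointers (carrying shape and stride metadata) instead of raw Blocks plus an element count.

    void Tanh(const Tensor &t, Tensor *ret) {
      TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
        // The lambda captures the Tensor by value; &t below is the captured
        // copy, which shares the underlying block with the caller's tensor.
        ret->device()->Exec([t, ret](Context *ctx) {
          Tanh<DType, Lang>(&t, ret, ctx);
        }, {t.block()}, {ret->block()});
      });
    }

    Tensor Tanh(const Tensor &t) {
      Tensor ret(t.shape(), t.device(), t.data_type());
      Tanh(t, &ret);
      return ret;
    }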
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/a88efa00/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index 6d42211..c403f30 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -19,7 +19,9 @@
#define SINGA_CORE_MATH_H_
#include <type_traits>
#include "singa/core/common.h"
+#include "singa/core/tensor.h"
#include "singa/utils/logging.h"
+#include <vector>
+#include <functional>
namespace singa {
@@ -33,20 +35,52 @@ namespace singa {
/// first
/// letter.
/// 2. Order functions based on function name in alphabetical order.
-/// 3. Function arguments order is [const basic type] [const Block] [mutable
-/// Block].
+/// 3. Function arguments order is [const basic type] [const Tensor] [mutable
+/// Tensor].
/// 4. Function argument names, use 'num' for total number of elements in
-/// elementwise operations; use 'in1' 'in2' for in blocks; use 'out' for
-/// output block or value. With exceptions for some functions, e.g.,
-/// Scale(const float alpha, const Block* in, Block* out);
+/// elementwise operations; use 'in1' 'in2' for in Tensors; use 'out' for
+/// output Tensor or value. With exceptions for some functions, e.g.,
+/// Scale(const float alpha, const Tensor* in, Tensor* out);
/// For such cases, use x, v, alpha, etc for scalar types.
/// For blas functions, follow the blas style for argument names.
/// Use 'M' and 'v' for matrix and vector tensors in functions involving both
/// matrix and vectors.
-/// 5. For Block argument xxx, name its raw pointer as xxxPtr.
+/// 5. For Tensor argument xxx, name its raw pointer as xxxPtr.
/// 6. Pass the 'cudaStream_t s' to every function in math_kernel.h
/// 7. Use size_t for the number of elements, rows or columns.
-/// 8. Use the same name for the Tensor and Block level math functions.
+/// 8. Use the same name for the Tensor-level API and these low-level math functions.
+
+// template <typename DType>
+// void TraverseUnary(const Tensor* in, Tensor* out, std::function<DType(DType)> func){}
+
+// template <typename DType>
+// void TraverseBinary(const Tensor* in1, const Tensor* in2, Tensor* out, std::function<DType(DType, DType)> func){}
+
+template <typename DType>
+void TraverseUnary(const Tensor* in, Tensor* out, std::function<DType(DType)> func){
+ DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
+ const DType *inPtr = static_cast<const DType *>(in->block()->data());
+ vector<int> traversal_info = in->generate_traversal_info();
+ for (size_t i = 0; i < in->Size(); i++) {
+ outPtr[i] = func(inPtr[traversal_info[in->shape().size()]]);
+ in->traverse_next(traversal_info, i+1);
+ }
+}
+
+template <typename DType>
+void TraverseBinary(const Tensor* in1, const Tensor* in2, Tensor* out, std::function<DType(DType, DType)> func){
+ DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
+ const DType *in1Ptr = static_cast<const DType *>(in1->block()->data());
+ const DType *in2Ptr = static_cast<const DType *>(in2->block()->data());
+ vector<int> traversal_info_in1 = in1->generate_traversal_info();
+ vector<int> traversal_info_in2 = in2->generate_traversal_info();
+ for (size_t i = 0; i < in1->Size(); i++) {
+ outPtr[i] = func(in1Ptr[traversal_info_in1[in1->shape().size()]], in2Ptr[traversal_info_in2[in2->shape().size()]]);
+ in1->traverse_next(traversal_info_in1, i+1);
+ in2->traverse_next(traversal_info_in2, i+1);
+ }
+}
+
// **************************************
// Element-wise functions
@@ -54,197 +88,197 @@ namespace singa {
/// out[i] = |in[i]|
template <typename DType, typename Lang>
-void Abs(const size_t num, const Block *in, Block *out, Context *ctx) {
+void Abs(const Tensor *in, Tensor *out, Context *ctx) {
LOG(FATAL) << "Abs Not Implemented";
}
/// out[i] = in[i] + x
template <typename DType, typename Lang>
-void Add(const size_t num, const Block *in, const DType x, Block *out,
+void Add(const Tensor *in, const DType x, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Add Not Implemented";
}
/// out[i] = in1[i] + in2[i]
template <typename DType, typename Lang>
-void Add(const size_t num, const Block *in1, const Block *in2, Block *out,
+void Add(const Tensor *in1, const Tensor *in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Add-Pair Not Implemented";
}
/// Clamp every element into [low, high]
/// if in[i]>high, then out[i]=high; if in[i]<low, then out[i]=low.
template <typename DType, typename Lang>
-void Clamp(const size_t num, const DType low, const DType high, const Block *in,
- Block *out, Context *ctx) {
+void Clamp(const DType low, const DType high, const Tensor *in,
+ Tensor *out, Context *ctx) {
LOG(FATAL) << "Clamp Not Implemented";
}
/// out[i] = x / in[i]
template <typename DType, typename Lang>
-void Div(const size_t num, const DType x, const Block *in, Block *out,
+void Div(const DType x, const Tensor *in, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Div Not Implemented";
}
/// out[i] = in[i] / x
template <typename DType, typename Lang>
-void Div(const size_t num, const Block *in, const DType x, Block *out,
+void Div(const Tensor *in, const DType x, Tensor *out,
Context *ctx) {
CHECK_NE(x, 0.f);
- EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
+ EltwiseMult<DType, Lang>(in, DType(1) / x, out, ctx);
}
/// out[i] = in1[i] / in2[i]
template <typename DType, typename Lang>
-void Div(const size_t num, const Block *in1, const Block *in2, Block *out,
+void Div(const Tensor *in1, const Tensor *in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Div-Pair Not Implemented";
}
/// out[i] = in[i] * x
template <typename DType, typename Lang>
-void EltwiseMult(const size_t num, const Block *in, const DType x, Block *out,
+void EltwiseMult(const Tensor *in, const DType x, Tensor *out,
Context *ctx) {
LOG(FATAL) << "EltwiseMult Not Implemented";
}
/// out[i] = in1[i] * in2[i]
template <typename DType, typename Lang>
-void EltwiseMult(const size_t num, const Block *in1, const Block *in2, Block *out,
+void EltwiseMult(const Tensor *in1, const Tensor *in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
}
/// Base is e (Napier's number). out[i]=exp(in[i])
template <typename DType, typename Lang>
-void Exp(const size_t num, const Block *in, Block *out, Context *ctx) {
+void Exp(const Tensor *in, Tensor *out, Context *ctx) {
LOG(FATAL) << "Exp Not Implemented";
}
/// out[i]=(in[i]<=x)?1.f:0.f
template <typename DType, typename Lang>
-void LE(const size_t num, const Block *in, const DType x, Block *out,
+void LE(const Tensor *in, const DType x, Tensor *out,
Context *ctx) {
LOG(FATAL) << "LE Not Implemented";
}
/// out[i]=(in1[i]<=in2[i])?1.f:0.f
template <typename DType, typename Lang>
-void LE(const size_t num, const Block *in1, const Block *in2, Block *out,
+void LE(const Tensor *in1, const Tensor *in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Tensor-Tensor LE Not Implemented";
}
/// Natural logarithm, base e (Napier's number): out[i]=log(in[i]).
template <typename DType, typename Lang>
-void Log(const size_t num, const Block *in, Block *out, Context *ctx) {
+void Log(const Tensor *in, Tensor *out, Context *ctx) {
LOG(FATAL) << "Log Not Implemented";
}
/// out[i]=(in[i]<x)?1.f:0.f
template <typename DType, typename Lang>
-void LT(const size_t num, const Block *in, const DType x, Block *out,
+void LT(const Tensor *in, const DType x, Tensor *out,
Context *ctx) {
LOG(FATAL) << "LT Not Implemented";
}
/// out[i]=(in1[i]<in2[i])?1.f:0.f
template <typename DType, typename Lang>
-void LT(const size_t num, const Block *in1, const Block *in2, Block *out,
+void LT(const Tensor *in1, const Tensor *in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Tensor-Tensor LT Not Implemented";
}
/// out[i]=(in[i]>=x)?1.f:0.f
template <typename DType, typename Lang>
-void GE(const size_t num, const Block *in, const DType x, Block *out,
+void GE(const Tensor *in, const DType x, Tensor *out,
Context *ctx) {
LOG(FATAL) << "GE Not Implemented";
}
/// out[i]=(in1[i]>=in2[i])?1.f:0.f
template <typename DType, typename Lang>
-void GE(const size_t num, const Block *in1, const Block *in2, Block *out,
+void GE(const Tensor *in1, const Tensor *in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Tensor-Tensor GE Not Implemented";
}
/// out[i]=(in[i]>x)?1.f:0.f
template <typename DType, typename Lang>
-void GT(const size_t num, const Block *in, const DType x, Block *out,
+void GT(const Tensor *in, const DType x, Tensor *out,
Context *ctx) {
LOG(FATAL) << "GT Not Implemented";
}
/// out[i]=(in[i]>in2[i])?1.f:0.f
template <typename DType, typename Lang>
-void GT(const size_t num, const Block *in, const Block *in2, Block *out,
+void GT(const Tensor *in, const Tensor *in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Tensor-Tensor GT Not Implemented";
}
/// out[i] = pow(in[i], x)
template <typename DType, typename Lang>
-void Pow(const size_t num, const Block *in, const DType x, Block *out,
+void Pow(const Tensor *in, const DType x, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Pow Not Implemented";
}
/// out[i]=pow(in1[i], in2[i])
template <typename DType, typename Lang>
-void Pow(const size_t num, const Block *in1, const Block *in2, Block *out,
+void Pow(const Tensor *in1, const Tensor *in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Pow-Pair Not Implemented";
}
/// out[i]=max(0, in[i])
template <typename DType, typename Lang>
-void ReLU(const size_t num, const Block *in, Block *out, Context *ctx) {
+void ReLU(const Tensor *in, Tensor *out, Context *ctx) {
LOG(FATAL) << "ReLU Not Implemented";
}
/// out[i] = x
template <typename DType, typename Lang>
-void Set(const size_t num, const DType x, Block *out, Context *ctx) {
+void Set(const DType x, Tensor *out, Context *ctx) {
LOG(FATAL) << "Set Not Implemented";
}
/// out[i]=sigmoid(in[i])
template <typename DType, typename Lang>
-void Sigmoid(const size_t num, const Block *in, Block *out, Context *ctx) {
+void Sigmoid(const Tensor *in, Tensor *out, Context *ctx) {
LOG(FATAL) << "Sigmoid Not Implemented";
}
/// out[i] = sign(in[i])
template <typename DType, typename Lang>
-void Sign(const size_t num, const Block *in, Block *out, Context *ctx) {
+void Sign(const Tensor *in, Tensor *out, Context *ctx) {
LOG(FATAL) << "Sign Not Implemented";
}
/// out[i]=sqrt(in[i])
template <typename DType, typename Lang>
-void Sqrt(const size_t num, const Block *in, Block *out, Context *ctx) {
+void Sqrt(const Tensor *in, Tensor *out, Context *ctx) {
LOG(FATAL) << "Sqrt Not Implemented";
}
/// out[i]=square(in[i])
template <typename DType, typename Lang>
-void Square(const size_t num, const Block *in, Block *out, Context *ctx) {
- EltwiseMult<DType, Lang>(num, in, in, out, ctx);
+void Square(const Tensor *in, Tensor *out, Context *ctx) {
+ EltwiseMult<DType, Lang>(in, in, out, ctx);
}
/// out[i] = in[i] - x
template <typename DType, typename Lang>
-void Sub(const size_t num, const Block *in, const DType x, Block *out,
+void Sub(const Tensor *in, const DType x, Tensor *out,
Context *ctx) {
- Add<DType, Lang>(num, in, -x, out, ctx);
+ Add<DType, Lang>(in, -x, out, ctx);
}
/// out[i] = in1[i] - in2[i]
template <typename DType, typename Lang>
-void Sub(const size_t num, const Block *in1, const Block *in2, Block *out,
+void Sub(const Tensor *in1, const Tensor *in2, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Sub-Pair Not Implemented";
}
/// sum all elements of in into out
template <typename DType, typename Lang>
-void Sum(const size_t num, const Block *in, DType *out, Context *ctx) {
+void Sum(const Tensor *in, DType *out, Context *ctx) {
LOG(FATAL) << "Sum Not Implemented";
}
/// out[i]=tanh(in[i])
template <typename DType, typename Lang>
-void Tanh(const size_t num, const Block *in, Block *out, Context *ctx) {
+void Tanh(const Tensor *in, Tensor *out, Context *ctx) {
LOG(FATAL) << "Tanh Not Implemented";
}
@@ -255,20 +289,20 @@ void Tanh(const size_t num, const Block *in, Block *out, Context *ctx) {
// Get the random generator from 'ctx'
// If DType is not float, then convert the threshold to DType
template <typename DType, typename Lang>
-void Bernoulli(const size_t num, const float p, Block *out, Context *ctx) {
+void Bernoulli(const float p, Tensor *out, Context *ctx) {
LOG(FATAL) << "Bernoulli Not Implemented";
}
// The random generator should be extracted from ctx.
// If DType is not float, then convert the mean and std to DType
template <typename DType, typename Lang>
-void Gaussian(const size_t num, const float mean, const float std, Block *out,
+void Gaussian(const float mean, const float std, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Gaussian Not Implemented";
}
// The random generator should be extracted from ctx.
// If DType is not float, then convert the low and high to DType
template <typename DType, typename Lang>
-void Uniform(const size_t num, const float low, const float high, Block *out,
+void Uniform(const float low, const float high, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Uniform Not Implemented";
}
@@ -279,43 +313,43 @@ void Uniform(const size_t num, const float low, const float high, Block *out,
/// return the index of the element with the max value.
template <typename DType, typename Lang>
-void Amax(const size_t num, const Block *in, size_t *out, Context *ctx) {
+void Amax(const Tensor *in, size_t *out, Context *ctx) {
LOG(FATAL) << "Amax Not Implemented";
}
/// return the index of the element with the min value.
template <typename DType, typename Lang>
-void Amin(const size_t num, const Block *in, size_t *out, Context *ctx) {
+void Amin(const Tensor *in, size_t *out, Context *ctx) {
LOG(FATAL) << "Amin Not Implemented";
}
/// out = sum |x| for all x in in
template <typename DType, typename Lang>
-void Asum(const size_t num, const Block *in, DType *out, Context *ctx) {
+void Asum(const Tensor *in, DType *out, Context *ctx) {
LOG(FATAL) << "Asum Not Implemented";
}
/// out = alpha * in + out
template <typename DType, typename Lang>
-void Axpy(const size_t num, const DType alpha, const Block *in, Block *out,
+void Axpy(const DType alpha, const Tensor *in, Tensor *out,
Context *ctx) {
LOG(FATAL) << "Axpy Not Implemented";
}
/// out = ||in||_2^2, i.e., L2 norm.
template <typename DType, typename Lang>
-void Nrm2(const size_t num, const Block *in, float *out, Context *ctx) {
+void Nrm2(const Tensor *in, float *out, Context *ctx) {
LOG(FATAL) << "Nrm2 Not Implemented";
}
/// out *= x
template <typename DType, typename Lang>
-void Scale(const size_t num, const DType x, Block *out, Context *ctx) {
+void Scale(const DType x, Tensor *out, Context *ctx) {
LOG(FATAL) << "Scale Not Implemented";
}
/// inner product of array in1 and in2
template <typename DType, typename Lang>
-void Dot(const size_t num, const Block *in1, const Block *in2, DType *out,
+void Dot(const Tensor *in1, const Tensor *in2, DType *out,
Context *ctx) {
LOG(FATAL) << "Dot Not Implemented";
}
@@ -323,8 +357,8 @@ void Dot(const size_t num, const Block *in1, const Block *in2, DType *out,
/// out = alpha * A * v + beta * out.
/// transA indicates whether the internal data layout of A is transposed
template <typename DType, typename Lang>
-void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
- const Block *A, const Block *v, const DType beta, Block *out,
+void GEMV(const DType alpha,
+ const Tensor *A, const Tensor *v, const DType beta, Tensor *out,
Context *ctx) {
LOG(FATAL) << "GEMV Not Implemented";
}
@@ -332,21 +366,21 @@ void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
/// multiply a matrix with a diagonal matrix constructed using values from 'v'.
/// if matrix_left_side is true, do M*v; else do v*M
template <typename DType, typename Lang>
-void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
- const Block *M, const Block *v, Block *out, Context *ctx) {
+void DGMM(const bool side_right,
+ const Tensor *M, const Tensor *v, Tensor *out, Context *ctx) {
LOG(FATAL) << "DGMM Not Implemented";
}
/// C = alpha * A * B + beta * C.
/// transA indicates whether the internal data layout of A is transposed
template <typename DType, typename Lang>
-void GEMM(const bool transA, const bool transB, const size_t nrowA,
- const size_t ncolB, const size_t ncolA, const DType alpha,
- const Block *A, const Block *B, const DType beta, Block *C,
+void GEMM(const DType alpha,
+ const Tensor *A, const Tensor *B, const DType beta, Tensor *C,
Context *ctx) {
LOG(FATAL) << "GEMM Not Implemented";
}
+//yisen todo
template <typename DType, typename Lang>
void ComputeCrossEntropy(bool int_target, const size_t batchsize,
const size_t dim, const Block *p, const Block *t,
@@ -362,8 +396,7 @@ void SoftmaxCrossEntropyBwd(bool int_target, const size_t batchsize,
}
template <typename DType, typename Lang>
-void RowMax(const size_t nrow, const size_t ncol, const Block *in,
- Block *ret, Context* ctx) {
+void RowMax(const Tensor *in, Tensor *out, Context* ctx) {
LOG(FATAL) << "Not Implemented";
}
// **************************************
@@ -372,40 +405,40 @@ void RowMax(const size_t nrow, const size_t ncol, const Block *in,
/*
/// Add the vector v to every column of A as the column of out
template <typename DType, typename Lang>
-void AddCol(const size_t nrow, const size_t ncol, const Block *A, const Block *v,
- Block *out, Context *ctx) {
+void AddCol(const size_t nrow, const size_t ncol, const Tensor *A, const Tensor *v,
+ Tensor *out, Context *ctx) {
LOG(FATAL) << "AddCol Not Implemented";
}
// TODO(wangwei) unify AddRow and AddCol.
/// Add the vector v to every row of A as the row of out
template <typename DType, typename Lang>
-void AddRow(const size_t nrow, const size_t ncol, const Block *A, const Block *v,
- Block *out, Context *ctx) {
+void AddRow(const size_t nrow, const size_t ncol, const Tensor *A, const Tensor *v,
+ Tensor *out, Context *ctx) {
LOG(FATAL) << "AddRow Not Implemented";
}
/// outer-product.
/// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
template <typename DType, typename Lang>
-void Outer(const size_t m, const size_t n, const Block *in1, const Block *in2,
- Block *out, Context *ctx) {
+void Outer(const size_t m, const size_t n, const Tensor *in1, const Tensor *in2,
+ Tensor *out, Context *ctx) {
LOG(FATAL) << "Outer Not Implemented";
}
/// Sum the columns of the in matrix into a vector
template <typename DType, typename Lang>
-void SumColumns(const size_t nrow, const size_t ncol, const Block *in, Block *out,
+void SumColumns(const size_t nrow, const size_t ncol, const Tensor *in, Tensor *out,
Context *ctx) {
LOG(FATAL) << "SumColumns Not Implemented";
}
template <typename DType, typename Lang>
-void Set(const size_t num, const DType x, Block *out, Context *ctx) {
+void Set(const DType x, Tensor *out, Context *ctx) {
LOG(FATAL) << "Not Implemented";
}
// TODO(wangwei) unify SumRow and SumCol.
/// Sum the rows of the in matrix into a vector
template <typename DType, typename Lang>
-void SumRows(const size_t nrow, const size_t ncol, const Block *in, Block *out,
+void SumRows(const size_t nrow, const size_t ncol, const Tensor *in, Tensor *out,
Context *ctx) {
LOG(FATAL) << "SumRows Not Implemented";
}
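As a usage note for the Tensor-based signatures above, the following sketch (not part of the commit) shows how a CPP backend kernel can be written directly on top of TraverseBinary; the element-wise subtraction lambda is only illustrative.

    template <>
    void Sub<float, lang::Cpp>(const Tensor* in1, const Tensor* in2,
                               Tensor* out, Context* ctx) {
      // TraverseBinary walks each input with its own stride/traversal info,
      // so transposed or otherwise strided inputs are handled transparently.
      TraverseBinary<float>(in1, in2, out,
                            [](float a, float b) { return a - b; });
    }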
[03/10] incubator-singa git commit: Singa-351 Added stride support
and cudnn codes to cuda
Posted by wa...@apache.org.
Singa-351 Added stride support and cudnn codes to cuda
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/26101eee
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/26101eee
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/26101eee
Branch: refs/heads/master
Commit: 26101eee95db67316d31bf96956b10a28c37b0e1
Parents: a88efa0
Author: Vaan Ng <cm...@gmail.com>
Authored: Sun May 6 23:24:35 2018 +0800
Committer: Vaan Ng <cm...@gmail.com>
Committed: Thu May 10 14:39:26 2018 +0800
----------------------------------------------------------------------
include/singa/core/tensor.h | 79 ++-
src/core/tensor/tensor_math_cuda.h | 860 +++++++++++++++++++++++++-------
2 files changed, 745 insertions(+), 194 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/26101eee/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 6eafbdf..2c28e0f 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -104,15 +104,83 @@ class Tensor {
return shape_.at(idx);
}
+ /*
+ cudnn requires tensor dimensions to fulfill 2 requirements:
+ 1.) dimensions have to be padded to a minimum of 4 for 4d and lower-dimensional tensors (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
+ 2.) dimensions have to be set to multiples of 8
+
+ e.g. Tensor A has shape {3,3}; cudnn requires shape {1,1,24,24} as the input
+ Tensor B has shape {2,3,4}; cudnn requires shape {1,16,24,32} as the input
+ */
+ vector<int> generate_shape_cuda() const {
+ vector<int> shape_arr;
+ if(shape_.size() <= 4){
+ for (size_t n=0; n<4-shape_.size(); ++n) {
+ shape_arr.push_back(1);
+ }
+ for (size_t n=0; n<shape_.size(); ++n) {
+ shape_arr.push_back(shape_.at(n));
+ }
+ return shape_arr;
+ } else if(shape_.size() == 5){
+ for (size_t n=0; n<shape_.size(); ++n) {
+ shape_arr.push_back(shape_.at(n));
+ }
+ return shape_arr;
+ } else {
+ LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
+ }
+ }
+
+ int generate_dim_cuda() const {
+ if(shape_.size() <= 4){return 4;}
+ else if(shape_.size() == 5){return 5;}
+ else{
+ LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
+ }
+ }
+
size_t nDim() const { return shape_.size(); }
bool empty() const { return nDim() == 0; }
//bool transpose() const { return transpose_; }
- bool transpose() const { return (strides_[0] != 1); }
+ bool transpose() const { return (strides_.back() != 1); }
const vector<int>& strides() const { return strides_; }
+ /*
+ cudnn requires stride dimensions to conform to the format of the shape input as well
+ 1.) stride dimensions have to be padded to a minimum of 4 for 4d and lower-dimensional tensors (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
+ 2.) stride dimensions have to be set to powers of 8, depending on the stride order (outer stride = higher power)
+
+ e.g. Tensor A has shape {3,3} and stride {3,1}; cudnn requires shape {1,1,24,24} and stride {576, 576, 24, 1} as the inputs;
+ if A is transposed with stride {1,3}, then the new cudnn stride becomes {576, 576, 8, 3}
+ */
+ vector<int> generate_strides_cuda() const {
+ vector<int> strides_arr;
+ int product = 1;
+ for (size_t n=0; n<(shape_.size()); ++n) {
+ product *= shape_[n];
+ }
+ if(shape_.size() <= 4){
+ for (size_t n=0; n<4-shape_.size(); ++n) {
+ strides_arr.push_back(product);
+ }
+ for (size_t n=0; n<strides_.size(); ++n) {
+ strides_arr.push_back(strides_[n]);
+ }
+ return strides_arr;
+ } else if(shape_.size() == 5){
+ for (size_t n=0; n<strides_.size(); ++n) {
+ strides_arr.push_back(strides_[n]);
+ }
+ return strides_arr;
+ } else {
+ LOG(FATAL) << "Dimensions (strides) beyond 3 are currently not supported" ;
+ }
+ }
+
const vector<int>& shape_multipliers() const { return shape_multipliers_; }
/// return true if the content of the tensor is initialized
@@ -235,9 +303,12 @@ void Generate_Strides(){
cumulative_product = cumulative_product*shape_[n];
strides_.push_back(dim/cumulative_product);
}
- reverse(strides_.begin(), strides_.end());
};
+void Set_Strides(const vector<int> new_strides){
+ strides_ = new_strides;
+}
+
//generate shape multipliers
//e.g. a tensor of shape (3,3) with stride (1,3) will have shape multipliers of (3,1)
//e.g. a tensor of shape (3,3) with stride (3,1) will also have shape multipliers of (3,1)
@@ -303,7 +374,7 @@ void update_base_index(std::vector<int>& traversal_info) const {
void traverse_next(std::vector<int>& traversal_info, int counter) const {
update_base_index(traversal_info);
traversal_info[shape_.size()+1] = determine_order(counter);
- traversal_info[shape_.size()] = traversal_info[traversal_info[shape_.size()+1]]+strides_[traversal_info[shape_.size()+1]];
+ traversal_info[shape_.size()] = traversal_info[traversal_info[shape_.size()+1]]+strides_[strides_.size()-traversal_info[shape_.size()+1]-1];
};
// ******************************************************************************************
@@ -498,6 +569,8 @@ void MultColumn(const Tensor &v, Tensor *M);
void MultRow(const Tensor &v, Tensor *M);
/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
Tensor SoftMax(const Tensor &in);
+
+Tensor RowMax(const Tensor &in);
/// Do softmax for each row. 'in' could be a 1-d or 2-d Tensor.
void SoftMax(const Tensor &in, Tensor *out);
/// Sub column 'v' by each column of matrix M
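As a self-contained illustration of the pad-to-4d rule that generate_shape_cuda() implements for cudnn descriptors, here is a minimal sketch with hypothetical names (only the padding loop mirrors the member function above):

    #include <cstdio>
    #include <vector>

    // Left-pad a shape with 1s until it has at least four dimensions,
    // mirroring the <=4d branch of Tensor::generate_shape_cuda().
    std::vector<int> PadShapeTo4d(const std::vector<int>& shape) {
      std::vector<int> padded;
      for (size_t n = shape.size(); n < 4; ++n) padded.push_back(1);
      for (int d : shape) padded.push_back(d);
      return padded;
    }

    int main() {
      for (int d : PadShapeTo4d({3, 3})) std::printf("%d ", d);  // prints: 1 1 3 3
      std::printf("\n");
      return 0;
    }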
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/26101eee/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 8a9e47a..f4839e3 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -20,6 +20,7 @@
#define SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
#include "singa/singa_config.h"
#ifdef USE_CUDA
+#include "singa/core/tensor.h"
#include "./tensor_math.h"
#include "./math_kernel.h"
#include "singa/utils/cuda_utils.h"
@@ -27,254 +28,636 @@
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "singa/utils/cuda_utils.h"
+#include <cudnn.h>
namespace singa {
/// out[i] = |in[i]|
template <>
-void Abs<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+void Abs<float, lang::Cuda>(const Tensor* in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
- cuda::abs(num, inPtr, outPtr, ctx->stream);
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+ cudnnOpTensorOp_t op = CUDNN_OP_TENSOR_MAX;
+ cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+ cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+ cudnnOpTensorDescriptor_t op_desc;
+ cudnnCreateOpTensorDescriptor(&op_desc);
+ cudnnSetOpTensorDescriptor(op_desc, op, cudnn_dtype, cudnn_propagation);
+
+ float alpha1[1] = {1.0};
+ float alpha2[1] = {-1.0};
+ float beta[1] = {0.0};
+ cudnnTensorDescriptor_t in_desc, out_desc;
+ cudnnCreateTensorDescriptor(&in_desc);
+ cudnnCreateTensorDescriptor(&out_desc);
+ cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+ cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+ cudnnOpTensor(ctx->cudnn_handle, op_desc, (void*)(&alpha1), in_desc, inPtr,
+ (void*)(&alpha2), in_desc, inPtr, (void*)(&beta), out_desc, outPtr);
+
+ cudnnDestroyTensorDescriptor(in_desc);
+ cudnnDestroyTensorDescriptor(out_desc);
}
-/// out = in + x
+
template <>
-void Add<float, lang::Cuda>(const size_t num, const Block* in, const float x,
- Block* out, Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
- cuda::add(num, inPtr, x, outPtr, ctx->stream);
+void Set<float, lang::Cuda>(const float x, Tensor* out,
+ Context* ctx) {
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ //float valuePtr[1] = {x};
+
+ cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+ cudnnTensorDescriptor_t out_desc;
+ cudnnCreateTensorDescriptor(&out_desc);
+ cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+ cudnnSetTensor(ctx->cudnn_handle, out_desc, outPtr, (void*)(&x));
+
+ cudnnDestroyTensorDescriptor(out_desc);
+}
+
+template <>
+void Add<float, lang::Cuda>(const Tensor* in, const float x,
+ Tensor* out, Context* ctx) {
+ Set<float, lang::Cuda>(x, out, ctx);
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+ float alpha = 1.0, beta=1.0;
+ cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+ cudnnTensorDescriptor_t in_desc, out_desc;
+ cudnnCreateTensorDescriptor(&in_desc);
+ cudnnCreateTensorDescriptor(&out_desc);
+ cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+ cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+ cudnnAddTensor(ctx->cudnn_handle, (void*)(&alpha), in_desc, inPtr, (void*)(&beta), out_desc, outPtr);
+
+ cudnnDestroyTensorDescriptor(in_desc);
+ cudnnDestroyTensorDescriptor(out_desc);
}
+
/// out = in1 + in2
template <>
-void Add<float, lang::Cuda>(const size_t num, const Block* in1,
- const Block* in2, Block* out, Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->data());
- const float* inPtr2 = static_cast<const float*>(in2->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
- cuda::add(num, inPtr1, inPtr2, outPtr, ctx->stream);
+void Add<float, lang::Cuda>(const Tensor* in1,
+ const Tensor* in2, Tensor* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+ cudnnOpTensorOp_t op = CUDNN_OP_TENSOR_ADD;
+ cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+ cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+ cudnnOpTensorDescriptor_t op_desc;
+ cudnnCreateOpTensorDescriptor(&op_desc);
+ cudnnSetOpTensorDescriptor(op_desc, op, cudnn_dtype, cudnn_propagation);
+
+ float alpha1[1] = {1.0};
+ float alpha2[1] = {1.0};
+ float beta[1] = {0.0};
+ cudnnTensorDescriptor_t in1_desc, in2_desc, out_desc;
+ cudnnCreateTensorDescriptor(&in1_desc);
+ cudnnCreateTensorDescriptor(&in2_desc);
+ cudnnCreateTensorDescriptor(&out_desc);
+ cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+ if((in1->nDim() == in2->nDim()) || (in2->nDim() == 1)){
+ cudnnSetTensorNdDescriptor(in2_desc, cudnn_dtype, in2->generate_dim_cuda(), in2->generate_shape_cuda().data(), in2->generate_strides_cuda().data());
+ } else {
+ cudnnSetTensorNdDescriptor(in2_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+ }
+
+ cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+ cudnnOpTensor(ctx->cudnn_handle, op_desc, (void*)(alpha1), in1_desc, inPtr1,
+ (void*)(alpha2), in2_desc, inPtr2, (void*)(beta), out_desc, outPtr);
+
+ cudnnDestroyTensorDescriptor(in1_desc);
+ cudnnDestroyTensorDescriptor(in2_desc);
+ cudnnDestroyTensorDescriptor(out_desc);
+}
+
+/// out = in1 - in2
+template <>
+void Sub<float, lang::Cuda>(const Tensor* in1,
+ const Tensor* in2, Tensor* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+ cudnnOpTensorOp_t op = CUDNN_OP_TENSOR_ADD;
+ cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+ cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+ cudnnOpTensorDescriptor_t op_desc;
+ cudnnCreateOpTensorDescriptor(&op_desc);
+ cudnnSetOpTensorDescriptor(op_desc, op, cudnn_dtype, cudnn_propagation);
+
+ float alpha1[1] = {1.0};
+ float alpha2[1] = {-1.0};
+ float beta[1] = {0.0};
+ cudnnTensorDescriptor_t in1_desc, in2_desc, out_desc;
+ cudnnCreateTensorDescriptor(&in1_desc);
+ cudnnCreateTensorDescriptor(&in2_desc);
+ cudnnCreateTensorDescriptor(&out_desc);
+ cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+ if((in1->nDim() == in2->nDim()) || (in2->nDim() == 1)){
+ cudnnSetTensorNdDescriptor(in2_desc, cudnn_dtype, in2->generate_dim_cuda(), in2->generate_shape_cuda().data(), in2->generate_strides_cuda().data());
+ } else {
+ cudnnSetTensorNdDescriptor(in2_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+ }
+
+ cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+ cudnnOpTensor(ctx->cudnn_handle, op_desc, (void*)(alpha1), in1_desc, inPtr1,
+ (void*)(alpha2), in2_desc, inPtr2, (void*)(beta), out_desc, outPtr);
+
+ cudnnDestroyTensorDescriptor(in1_desc);
+ cudnnDestroyTensorDescriptor(in2_desc);
+ cudnnDestroyTensorDescriptor(out_desc);
}
+
/// Element-wise operation, clamp every element into [low, high]
/// if x>high, then x=high; if x<low, then x=low.
template <>
-void Clamp<float, lang::Cuda>(const size_t num, const float low,
- const float high, const Block* in, Block* out,
+void Clamp<float, lang::Cuda>(const float low,
+ const float high, const Tensor* in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = in->Size();
cuda::clamp(num, low, high, inPtr, outPtr, ctx->stream);
+ out->Set_Strides(in->strides());
}
/// out = in1 / in2
template <>
-void Div<float, lang::Cuda>(const size_t num, const Block* in1,
- const Block* in2, Block* out, Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->data());
- const float* inPtr2 = static_cast<const float*>(in2->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
- cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
+void Div<float, lang::Cuda>(const Tensor* in1,
+ const Tensor* in2, Tensor* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = in1->Size();
+
+ if(in1->strides() == in2->strides()){ //if both in1 and in2 strides are the same, we proceed to normal cuda::div
+ cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
+ out->Set_Strides(in1->strides());
+ } else { //otherwise, first transform in1 into out using in2's strides
+ float alpha[1] = {1.0};
+ float beta[1] = {0.0};
+
+ cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+ cudnnTensorDescriptor_t in1_desc, out_desc;
+ cudnnCreateTensorDescriptor(&in1_desc);
+ cudnnCreateTensorDescriptor(&out_desc);
+ cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+ out->Set_Strides(in2->strides());
+ cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+ cudnnTransformTensor(ctx->cudnn_handle, (void*)(alpha), in1_desc, inPtr1,
+ (void*)(beta), out_desc, outPtr);
+
+ cuda::div(num, outPtr, inPtr2, outPtr, ctx->stream);
+ cudnnDestroyTensorDescriptor(in1_desc);
+ cudnnDestroyTensorDescriptor(out_desc);
+ }
}
template <>
-void Div<float, lang::Cuda>(const size_t num, const float x, const Block* in,
- Block* out, Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
+void Div<float, lang::Cuda>(const float x, const Tensor* in,
+ Tensor* out, Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = in->Size();
cuda::div(num, x, inPtr, outPtr, ctx->stream);
+ out->Set_Strides(in->strides());
}
/// out = in * x
template <>
-void EltwiseMult<float, lang::Cuda>(const size_t num, const Block* in,
- const float x, Block* out, Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
- cuda::mult(num, inPtr, x, outPtr, ctx->stream);
+void EltwiseMult<float, lang::Cuda>(const Tensor* in,
+ const float x, Tensor* out, Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+ float alpha = x, beta = 0.0;
+ cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+ cudnnTensorDescriptor_t in_desc, out_desc;
+ cudnnCreateTensorDescriptor(&in_desc);
+ cudnnCreateTensorDescriptor(&out_desc);
+ cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+ cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+ cudnnAddTensor(ctx->cudnn_handle, (void*)(&alpha), in_desc, inPtr, (void*)(&beta), out_desc, outPtr);
+
+ cudnnDestroyTensorDescriptor(in_desc);
+ cudnnDestroyTensorDescriptor(out_desc);
}
+
/// out = in1 * in2
template <>
-void EltwiseMult<float, lang::Cuda>(const size_t num, const Block* in1,
- const Block* in2, Block* out,
+void EltwiseMult<float, lang::Cuda>(const Tensor* in1,
+ const Tensor* in2, Tensor* out,
Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->data());
- const float* inPtr2 = static_cast<const float*>(in2->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
- cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
+ const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = in1->Size();
+
+ if(in1->strides() == in2->strides()){ //if both in1 and in2 strides are the same, we proceed to normal cuda::mult
+ cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
+ out->Set_Strides(in1->strides());
+ } else { //otherwise, first transform in1 into out using in2's strides
+ float alpha[1] = {1.0};
+ float beta[1] = {0.0};
+
+
+ cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+ cudnnTensorDescriptor_t in1_desc, out_desc;
+ cudnnCreateTensorDescriptor(&in1_desc);
+ cudnnCreateTensorDescriptor(&out_desc);
+ cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+ out->Set_Strides(in2->strides());
+ cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+ cudnnTransformTensor(ctx->cudnn_handle, (void*)(alpha), in1_desc, inPtr1,
+ (void*)(beta), out_desc, outPtr);
+
+ cuda::mult(num, outPtr, inPtr2, outPtr, ctx->stream);
+ cudnnDestroyTensorDescriptor(in1_desc);
+ cudnnDestroyTensorDescriptor(out_desc);
+ }
}
+
+
/// Base is e. out[i]=e^in[i]
template <>
-void Exp<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+void Exp<float, lang::Cuda>(const Tensor* in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = in->Size();
cuda::exp(num, inPtr, outPtr, ctx->stream);
+ out->Set_Strides(in->strides());
}
template <>
-void GE<float, lang::Cuda>(const size_t num, const Block* in, const float x,
- Block* out, Context* ctx) {
- float* outPtr = static_cast<float*>(out->mutable_data());
- const float* inPtr = static_cast<const float*>(in->data());
+void GE<float, lang::Cuda>(const Tensor* in, const float x,
+ Tensor* out, Context* ctx) {
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ const size_t num = in->Size();
cuda::ge(num, inPtr, x, outPtr, ctx->stream);
+ out->Set_Strides(in->strides());
}
template <>
-void GE<float, lang::Cuda>(const size_t num, const Block* in1, const Block* in2,
- Block* out, Context* ctx) {
- float* outPtr = static_cast<float*>(out->mutable_data());
- const float* inPtr1 = static_cast<const float*>(in1->data());
- const float* inPtr2 = static_cast<const float*>(in2->data());
- cuda::ge(num, inPtr1, inPtr2, outPtr, ctx->stream);
+void GE<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+ Tensor* out, Context* ctx) {
+ Sub<float, lang::Cuda>(in1, in2, out, ctx);
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+ // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+ const size_t num = in1->Size();
+ //cuda::ge(num, inPtr1, inPtr2, outPtr, ctx->stream);
+ cuda::ge(num, outPtr, 0.0, outPtr, ctx->stream);
}
template <>
-void GT<float, lang::Cuda>(const size_t num, const Block* in, const float x,
- Block* out, Context* ctx) {
- float* outPtr = static_cast<float*>(out->mutable_data());
- const float* inPtr = static_cast<const float*>(in->data());
+void GT<float, lang::Cuda>(const Tensor* in, const float x,
+ Tensor* out, Context* ctx) {
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ const size_t num = in->Size();
cuda::gt(num, inPtr, x, outPtr, ctx->stream);
+ out->Set_Strides(in->strides());
}
template <>
-void GT<float, lang::Cuda>(const size_t num, const Block* in1, const Block* in2,
- Block* out, Context* ctx) {
- float* outPtr = static_cast<float*>(out->mutable_data());
- const float* inPtr1 = static_cast<const float*>(in1->data());
- const float* inPtr2 = static_cast<const float*>(in2->data());
- cuda::gt(num, inPtr1, inPtr2, outPtr, ctx->stream);
+void GT<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+ Tensor* out, Context* ctx) {
+ Sub<float, lang::Cuda>(in1, in2, out, ctx);
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+ // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+ const size_t num = in1->Size();
+ //cuda::gt(num, inPtr1, inPtr2, outPtr, ctx->stream);
+ cuda::gt(num, outPtr, 0.0, outPtr, ctx->stream);
}
template <>
-void LE<float, lang::Cuda>(const size_t num, const Block* in, const float x,
- Block* out, Context* ctx) {
- float* outPtr = static_cast<float*>(out->mutable_data());
- const float* inPtr = static_cast<const float*>(in->data());
+void LE<float, lang::Cuda>(const Tensor* in, const float x,
+ Tensor* out, Context* ctx) {
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ const size_t num = in->Size();
cuda::le(num, inPtr, x, outPtr, ctx->stream);
+ out->Set_Strides(in->strides());
}
template <>
-void LE<float, lang::Cuda>(const size_t num, const Block* in1, const Block* in2,
- Block* out, Context* ctx) {
- float* outPtr = static_cast<float*>(out->mutable_data());
- const float* inPtr1 = static_cast<const float*>(in1->data());
- const float* inPtr2 = static_cast<const float*>(in2->data());
- cuda::le(num, inPtr1, inPtr2, outPtr, ctx->stream);
+void LE<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+ Tensor* out, Context* ctx) {
+ Sub<float, lang::Cuda>(in1, in2, out, ctx);
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+ // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+ const size_t num = in1->Size();
+ //cuda::le(num, inPtr1, inPtr2, outPtr, ctx->stream);
+ cuda::le(num, outPtr, 0.0, outPtr, ctx->stream);
}
/// Natural logarithm, base e (Napier's number): out[i]=ln(in[i]).
template <>
-void Log<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+void Log<float, lang::Cuda>(const Tensor* in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = in->Size();
cuda::log(num, inPtr, outPtr, ctx->stream);
+ out->Set_Strides(in->strides());
}
template <>
-void LT<float, lang::Cuda>(const size_t num, const Block* in, const float x,
- Block* out, Context* ctx) {
- float* outPtr = static_cast<float*>(out->mutable_data());
- const float* inPtr = static_cast<const float*>(in->data());
+void LT<float, lang::Cuda>(const Tensor* in, const float x,
+ Tensor* out, Context* ctx) {
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ const size_t num = in->Size();
cuda::lt(num, inPtr, x, outPtr, ctx->stream);
+ out->Set_Strides(in->strides());
}
template <>
-void LT<float, lang::Cuda>(const size_t num, const Block* in1, const Block* in2,
- Block* out, Context* ctx) {
- float* outPtr = static_cast<float*>(out->mutable_data());
- const float* inPtr1 = static_cast<const float*>(in1->data());
- const float* inPtr2 = static_cast<const float*>(in2->data());
- cuda::lt(num, inPtr1, inPtr2, outPtr, ctx->stream);
+void LT<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+ Tensor* out, Context* ctx) {
+ Sub<float, lang::Cuda>(in1, in2, out, ctx);
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+ // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+ const size_t num = in1->Size();
+ //cuda::lt(num, inPtr1, inPtr2, outPtr, ctx->stream);
+ cuda::lt(num, outPtr, 0.0, outPtr, ctx->stream);
}
/// Element-wise operation, out[i] = in[i]^x
template <>
-void Pow<float, lang::Cuda>(const size_t num, const Block* in, const float x,
- Block* out, Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
+void Pow<float, lang::Cuda>(const Tensor* in, const float x,
+ Tensor* out, Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = in->Size();
cuda::pow(num, inPtr, x, outPtr, ctx->stream);
+ out->Set_Strides(in->strides());
}
/// Element-wise operation, out[i] = in1[i]^in2[i]
template <>
-void Pow<float, lang::Cuda>(const size_t num, const Block* in1,
- const Block* in2, Block* out, Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->data());
- const float* inPtr2 = static_cast<const float*>(in2->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
- cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
+void Pow<float, lang::Cuda>(const Tensor* in1,
+ const Tensor* in2, Tensor* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = in1->Size();
+
+ if(in1->strides() == in2->strides()){ //if both in1 and in2 strides are the same, we proceed to normal cuda::pow
+ cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
+ out->Set_Strides(in1->strides());
+ } else { //otherwise, first transform in1 into out using in2's strides
+ float alpha[1] = {1.0};
+ float beta[1] = {0.0};
+
+ cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+ cudnnTensorDescriptor_t in1_desc, out_desc;
+ cudnnCreateTensorDescriptor(&in1_desc);
+ cudnnCreateTensorDescriptor(&out_desc);
+ cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+ out->Set_Strides(in2->strides());
+ cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+ cudnnTransformTensor(ctx->cudnn_handle, (void*)(alpha), in1_desc, inPtr1,
+ (void*)(beta), out_desc, outPtr);
+
+ cuda::pow(num, outPtr, inPtr2, outPtr, ctx->stream);
+ cudnnDestroyTensorDescriptor(in1_desc);
+ cudnnDestroyTensorDescriptor(out_desc);
+ }
}
/// Element-wise operation, out[i]=max(0, in[i])
+// template <>
+// void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// Context* ctx) {
+// const float* inPtr = static_cast<const float*>(in->block()->data());
+// float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+// cudnnActivationDescriptor_t act_desc;
+// cudnnActivationMode_t mode = CUDNN_ACTIVATION_RELU;
+// cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+// double coef = 0.0; //only used for CLIPPED_RELU or ELU
+// cudnnCreateActivationDescriptor(&act_desc);
+// cudnnSetActivationDescriptor(act_desc, mode, cudnn_propagation, coef);
+
+// float alpha[1] = {1.0};
+// float beta[1] = {0.0};
+// cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+// cudnnTensorDescriptor_t in_desc, out_desc;
+// cudnnCreateTensorDescriptor(&in_desc);
+// cudnnCreateTensorDescriptor(&out_desc);
+// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
+// (void*)(&beta), out_desc, outPtr);
+
+// cudnnDestroyTensorDescriptor(in_desc);
+// cudnnDestroyTensorDescriptor(out_desc);
+// cudnnDestroyActivationDescriptor(act_desc);
+// }
+
template <>
-void ReLU<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = in->Size();
cuda::relu(num, inPtr, outPtr, ctx->stream);
+ out->Set_Strides(in->strides());
}
-/// out[i] = x
-template <>
-void Set<float, lang::Cuda>(const size_t num, const float x, Block* out,
- Context* ctx) {
- float* outPtr = static_cast<float*>(out->mutable_data());
- cuda::set(num, x, outPtr, ctx->stream);
-}
+// /// Element-wise operation, out[i]=sigmoid(in[i])
+// template <>
+// void Sigmoid<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// Context* ctx) {
+// const float* inPtr = static_cast<const float*>(in->block()->data());
+// float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+// cudnnActivationDescriptor_t act_desc;
+// cudnnActivationMode_t mode = CUDNN_ACTIVATION_SIGMOID;
+// cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+// double coef = 0.0; //only used for CLIPPED_RELU or ELU
+// cudnnCreateActivationDescriptor(&act_desc);
+// cudnnSetActivationDescriptor(act_desc, mode, cudnn_propagation, coef);
+
+// float alpha[1] = {1.0};
+// float beta[1] = {0.0};
+// cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+// cudnnTensorDescriptor_t in_desc, out_desc;
+// cudnnCreateTensorDescriptor(&in_desc);
+// cudnnCreateTensorDescriptor(&out_desc);
+// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
+// (void*)(&beta), out_desc, outPtr);
+
+// cudnnDestroyTensorDescriptor(in_desc);
+// cudnnDestroyTensorDescriptor(out_desc);
+// cudnnDestroyActivationDescriptor(act_desc);
+// }
+
/// Element-wise operation, out[i]=sigmoid(in[i])
template <>
-void Sigmoid<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+void Sigmoid<float, lang::Cuda>(const Tensor* in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = in->Size();
cuda::sigmoid(num, inPtr, outPtr, ctx->stream);
+ out->Set_Strides(in->strides());
}
+
// out[i] = sign(in[i])
template <>
-void Sign<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+void Sign<float, lang::Cuda>(const Tensor* in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = in->Size();
cuda::sign(num, inPtr, outPtr, ctx->stream);
+ out->Set_Strides(in->strides());
}
-/// Element-wise operation, out[i]=sqrt([in[i])
+// Element-wise operation, out[i]=sqrt(in[i])
template <>
-void Sqrt<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+void Sqrt<float, lang::Cuda>(const Tensor* in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
- cuda::sqrt(num, inPtr, outPtr, ctx->stream);
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+ cudnnOpTensorOp_t op = CUDNN_OP_TENSOR_SQRT;
+ cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+ cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+ cudnnOpTensorDescriptor_t op_desc;
+ cudnnCreateOpTensorDescriptor(&op_desc);
+ cudnnSetOpTensorDescriptor(op_desc, op, cudnn_dtype, cudnn_propagation);
+
+ float alpha1[1] = {1.0};
+ float alpha2[1] = {0.0};
+ float beta[1] = {0.0};
+ cudnnTensorDescriptor_t in_desc, out_desc;
+ cudnnCreateTensorDescriptor(&in_desc);
+ cudnnCreateTensorDescriptor(&out_desc);
+ cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+ cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+ cudnnOpTensor(ctx->cudnn_handle, op_desc, (void*)(&alpha1), in_desc, inPtr,
+ (void*)(&alpha2), in_desc, inPtr, (void*)(&beta), out_desc, outPtr);
+
+ cudnnDestroyTensorDescriptor(in_desc);
+ cudnnDestroyTensorDescriptor(out_desc);
}
/// Element-wise operation, out[i]=in[i]^2
template <>
-void Square<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
+void Square<float, lang::Cuda>(const Tensor* in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = in->Size();
cuda::square(num, inPtr, outPtr, ctx->stream);
+ out->Set_Strides(in->strides());
}
-/// out = in1 - in2
-template <>
-void Sub<float, lang::Cuda>(const size_t num, const Block* in1,
- const Block* in2, Block* out, Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->data());
- const float* inPtr2 = static_cast<const float*>(in2->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
- cuda::sub(num, inPtr1, inPtr2, outPtr, ctx->stream);
-}
-/// sum all elements of input into out
+// template <>
+// void Sum<float, lang::Cuda>(const size_t num, const Block* in, float* out,
+// Context* ctx) {
+// LOG(FATAL) << "Cuda Sum is not implemented!";
+// // const float* inPtr = static_cast<const float*>(in->data());
+// // cuda::sum(num, inPtr, out, ctx->stream);
+// }
+
template <>
-void Sum<float, lang::Cuda>(const size_t num, const Block* in, float* out,
+void Sum<float, lang::Cuda>(const Tensor* in, float* out,
Context* ctx) {
- LOG(FATAL) << "Cuda Sum is not implemented!";
- // const float* inPtr = static_cast<const float*>(in->data());
- // cuda::sum(num, inPtr, out, ctx->stream);
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+
+  //reduce all axes to 1 for cudnnReduceTensor, e.g. a Tensor with shape {2,4} is reduced to {1}
+ Shape reduced_shape = {1};
+ Tensor t(reduced_shape, in->device(), in->data_type());
+ float* tPtr = static_cast<float*>(t.block()->mutable_data());
+ vector<int> reduce_all_axes = in->generate_shape_cuda();
+ for (size_t n=0; n<reduce_all_axes.size(); ++n) {
+ reduce_all_axes[n] = 1;
+ }
+
+ //reduce_desc
+ cudnnReduceTensorDescriptor_t reduce_desc;
+ cudnnReduceTensorOp_t reduce_op = CUDNN_REDUCE_TENSOR_ADD;
+ cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+ cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+ cudnnReduceTensorIndices_t cudnn_indices = CUDNN_REDUCE_TENSOR_NO_INDICES;
+ cudnnIndicesType_t cudnn_indices_type = CUDNN_32BIT_INDICES;
+ cudnnCreateReduceTensorDescriptor(&reduce_desc);
+ cudnnSetReduceTensorDescriptor(reduce_desc, reduce_op, cudnn_dtype,
+ cudnn_propagation, cudnn_indices, cudnn_indices_type);
+
+  //instantiate 2 new tensors whose Blocks provide the indices and workspace memory, instead of calling cudaMalloc directly
+ Shape reduction_size = {1000};
+ Tensor indices(reduction_size, in->device(), in->data_type());
+ Tensor workspace(reduction_size, in->device(), in->data_type());
+ size_t indices_bytes = indices.block()->size()*1000;
+ size_t workspace_bytes = workspace.block()->size()*1000;
+ size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
+ float* workspacePtr = static_cast<float*>(workspace.block()->mutable_data());
+ //void* indicesPtr{nullptr}; void* workspacePtr{nullptr};
+ //cudaMalloc(&indicesPtr, indices_bytes); cudaMalloc(&workspacePtr, workspace_bytes);
+
+ float alpha[1] = {1.0};
+ float beta[1] = {0.0};
+ cudnnTensorDescriptor_t in_desc, t_desc;
+ cudnnCreateTensorDescriptor(&in_desc);
+ cudnnCreateTensorDescriptor(&t_desc);
+ cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+ cudnnSetTensorNdDescriptor(t_desc, cudnn_dtype, t.generate_dim_cuda(), reduce_all_axes.data(), reduce_all_axes.data());
+ cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
+ indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
+ (void*)(&alpha), in_desc, inPtr, (void*)(&beta), t_desc, tPtr);
+
+ *out = tPtr[0];
+ cudnnDestroyTensorDescriptor(in_desc);
+ cudnnDestroyTensorDescriptor(t_desc);
}
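
The Sum implementation above leans on one cudnnReduceTensor idiom: giving the
output descriptor all-ones dims (and strides) makes the reduction collapse
every axis into a single element. Below is a minimal standalone sketch of the
same idiom; it is not part of the patch and assumes a live cudnnHandle_t,
device-resident float buffers, and a workspace sized via
cudnnGetReductionWorkspaceSize, with error checking omitted:

#include <cudnn.h>

// Sums the n floats behind d_in (device pointer) into d_out[0] (device).
void sum_all_sketch(cudnnHandle_t h, const float* d_in, float* d_out, int n,
                    void* d_ws, size_t ws_bytes) {
  // cudnn wants >= 4 dims, so describe the data as a {1,1,1,n} tensor.
  int in_dims[4] = {1, 1, 1, n}, in_strides[4] = {n, n, n, 1};
  int ones[4] = {1, 1, 1, 1};  // all-ones output collapses every axis
  cudnnTensorDescriptor_t in_d, out_d;
  cudnnCreateTensorDescriptor(&in_d);
  cudnnCreateTensorDescriptor(&out_d);
  cudnnSetTensorNdDescriptor(in_d, CUDNN_DATA_FLOAT, 4, in_dims, in_strides);
  cudnnSetTensorNdDescriptor(out_d, CUDNN_DATA_FLOAT, 4, ones, ones);
  cudnnReduceTensorDescriptor_t rd;
  cudnnCreateReduceTensorDescriptor(&rd);
  cudnnSetReduceTensorDescriptor(rd, CUDNN_REDUCE_TENSOR_ADD, CUDNN_DATA_FLOAT,
                                 CUDNN_PROPAGATE_NAN,
                                 CUDNN_REDUCE_TENSOR_NO_INDICES,
                                 CUDNN_32BIT_INDICES);
  float alpha = 1.0f, beta = 0.0f;
  cudnnReduceTensor(h, rd, /*indices=*/nullptr, 0, d_ws, ws_bytes,
                    &alpha, in_d, d_in, &beta, out_d, d_out);
  cudnnDestroyReduceTensorDescriptor(rd);
  cudnnDestroyTensorDescriptor(in_d);
  cudnnDestroyTensorDescriptor(out_d);
}

Note that d_out still lives on the device; the scalar has to be copied back to
the host (e.g. with cudaMemcpy) before it can be read.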
+
/// Element-wise operation, out[i]=tanh(in[i])
+// template <>
+// void Tanh<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// Context* ctx) {
+// const float* inPtr = static_cast<const float*>(in->block()->data());
+// float* outPtr = static_cast<float*>(out->block()->mutable_data());
+
+// cudnnActivationDescriptor_t act_desc;
+// cudnnActivationMode_t mode = CUDNN_ACTIVATION_TANH;
+// cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+// double coef = 0.0; //only used for CLIPPED_RELU or ELU
+// cudnnCreateActivationDescriptor(&act_desc);
+// cudnnSetActivationDescriptor(act_desc, mode, cudnn_propagation, coef);
+
+// float alpha[1] = {1.0};
+// float beta[1] = {0.0};
+// cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+// cudnnTensorDescriptor_t in_desc, out_desc;
+// cudnnCreateTensorDescriptor(&in_desc);
+// cudnnCreateTensorDescriptor(&out_desc);
+// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
+// (void*)(&beta), out_desc, outPtr);
+
+// cudnnDestroyTensorDescriptor(in_desc);
+// cudnnDestroyTensorDescriptor(out_desc);
+// cudnnDestroyActivationDescriptor(act_desc);
+// }
+
template <>
-void Tanh<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
- Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
+void Tanh<float, lang::Cuda>(const Tensor* in, Tensor* out,
+ Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = in->Size();
cuda::tanh(num, inPtr, outPtr, ctx->stream);
+ out->Set_Strides(in->strides());
}
// ================Random functions===========================================
@@ -282,10 +665,11 @@ void Tanh<float, lang::Cuda>(const size_t num, const Block* in, Block* out,
// Get the random generator from 'ctx'
// If DType is not float, then convert the threshold to DType
template <>
-void Bernoulli<float, lang::Cuda>(const size_t num, const float p, Block* out,
+void Bernoulli<float, lang::Cuda>(const float p, Tensor* out,
Context* ctx) {
auto rgen = ctx->curand_generator;
- float* outPtr = static_cast<float*>(out->mutable_data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = out->Size();
CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
cuda::threshold(num, p, outPtr, outPtr, ctx->stream);
}
@@ -293,10 +677,11 @@ void Bernoulli<float, lang::Cuda>(const size_t num, const float p, Block* out,
// The random generator should be extracted from ctx.
// If DType is not float, then convert the low and high to DType
template <>
-void Uniform<float, lang::Cuda>(const size_t num, const float low,
- const float high, Block* out, Context* ctx) {
+void Uniform<float, lang::Cuda>(const float low,
+ const float high, Tensor* out, Context* ctx) {
auto rgen = ctx->curand_generator;
- float* outPtr = static_cast<float*>(out->mutable_data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = out->Size();
CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
cuda::mult(num, outPtr, high - low, outPtr, ctx->stream);
cuda::add(num, outPtr, low, outPtr, ctx->stream);
@@ -305,88 +690,97 @@ void Uniform<float, lang::Cuda>(const size_t num, const float low,
// The random generator should be extracted from ctx.
// If DType is not float, then convert the mean and delta to DType
template <>
-void Gaussian<float, lang::Cuda>(const size_t num, const float mean,
- const float std, Block* out, Context* ctx) {
+void Gaussian<float, lang::Cuda>(const float mean,
+ const float std, Tensor* out, Context* ctx) {
auto rgen = ctx->curand_generator;
- float* outPtr = static_cast<float*>(out->mutable_data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = out->Size();
CURAND_CHECK(curandGenerateNormal(rgen, outPtr, num, mean, std));
}
// =========================Blas operations==================================
// ref to http://docs.nvidia.com/cuda/cublas
template <>
-void Amax<float, lang::Cuda>(const size_t num, const Block* in, size_t* out,
+void Amax<float, lang::Cuda>(const Tensor* in, size_t* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
+ const float* inPtr = static_cast<const float*>(in->block()->data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
int idx = 1;
+ const size_t num = in->Size();
CUBLAS_CHECK(cublasIsamax(handle, num, inPtr, 1, &idx));
*out = idx - 1; // cublas index starts from 1
}
/// return the index of the element with the min value.
template <>
-void Amin<float, lang::Cuda>(const size_t num, const Block* in, size_t* out,
+void Amin<float, lang::Cuda>(const Tensor* in, size_t* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
+ const float* inPtr = static_cast<const float*>(in->block()->data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
int idx = 1;
+ const size_t num = in->Size();
CUBLAS_CHECK(cublasIsamin(handle, num, inPtr, 1, &idx));
*out = idx - 1;
}
/// out = sum |x| for all x in in
template <>
-void Asum<float, lang::Cuda>(const size_t num, const Block* in, float* out,
+void Asum<float, lang::Cuda>(const Tensor* in, float* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
+ const float* inPtr = static_cast<const float*>(in->block()->data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+ const size_t num = in->Size();
CUBLAS_CHECK(cublasSasum(handle, num, inPtr, 1, out));
}
/// out = alpha * in + out
template <>
-void Axpy<float, lang::Cuda>(const size_t num, const float alpha,
- const Block* in, Block* out, Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
+void Axpy<float, lang::Cuda>(const float alpha,
+ const Tensor* in, Tensor* out, Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+ const size_t num = in->Size();
CUBLAS_CHECK(cublasSaxpy(handle, num, &alpha, inPtr, 1, outPtr, 1));
}
/// out = \sum_i in1[i] * in2[i]
template <>
-void Dot<float, lang::Cuda>(const size_t num, const Block* in1,
- const Block* in2, float* out, Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->data());
- const float* inPtr2 = static_cast<const float*>(in2->data());
+void Dot<float, lang::Cuda>(const Tensor* in1,
+ const Tensor* in2, float* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1->block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2->block()->data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+ const size_t num = in1->Size();
CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, out));
}
template <>
-void Nrm2<float, lang::Cuda>(const size_t num, const Block* in, float* out,
+void Nrm2<float, lang::Cuda>(const Tensor* in, float* out,
Context* ctx) {
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- const float* inPtr = static_cast<const float*>(in->data());
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ const size_t num = in->Size();
cublasSnrm2(handle, num, inPtr, 1, out);
}
template <>
-void Scale<float, lang::Cuda>(const size_t num, const float x, Block* out,
+void Scale<float, lang::Cuda>(const float x, Tensor* out,
Context* ctx) {
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- float* outPtr = static_cast<float*>(out->mutable_data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t num = out->Size();
CUBLAS_CHECK(cublasSscal(handle, num, &x, outPtr, 1));
}
// NOTE: cublas uses column major order.
// http://peterwittek.com/cublas-matrix-c-style.html
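// i.e. a row-major m x n matrix reinterpreted as column-major is its
// transpose, so C = A*B (row-major) is computed as C^T = B^T * A^T
// (column-major); this is why cublasSgemm in GEMM below receives B before A,
// and why DGMM below flips the side flag.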
template <>
-void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
- const size_t ncol, const Block* M, const Block* v,
- Block* out, Context* ctx) {
+void DGMM<float, lang::Cuda>(const bool side_right, const Tensor* M, const Tensor* v,
+ Tensor* out, Context* ctx) {
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- const float* MPtr = static_cast<const float*>(M->data());
- const float* vPtr = static_cast<const float*>(v->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
+ const float* MPtr = static_cast<const float*>(M->block()->data());
+ const float* vPtr = static_cast<const float*>(v->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t nrow = M->shape(0);
+ const size_t ncol = M->shape(1);
if (side_right) {
CUBLAS_CHECK(cublasSdgmm(handle, CUBLAS_SIDE_LEFT, ncol, nrow, MPtr, ncol,
vPtr, 1, outPtr, ncol));
@@ -396,14 +790,16 @@ void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
}
}
template <>
-void GEMV<float, lang::Cuda>(bool trans, const size_t m, const size_t n,
- const float alpha, const Block* A, const Block* v,
- const float beta, Block* out, Context* ctx) {
- const float* APtr = static_cast<const float*>(A->data());
- const float* vPtr = static_cast<const float*>(v->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
+void GEMV<float, lang::Cuda>(const float alpha, const Tensor* A, const Tensor* v,
+ const float beta, Tensor* out, Context* ctx) {
+ const float* APtr = static_cast<const float*>(A->block()->data());
+ const float* vPtr = static_cast<const float*>(v->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t m = A->shape()[0];
+ const size_t n = A->shape()[1];
+
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- if (!trans)
+ if (!(A->transpose()))
CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr,
1, &beta, outPtr, 1));
else
@@ -413,19 +809,22 @@ void GEMV<float, lang::Cuda>(bool trans, const size_t m, const size_t n,
// http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm
template <>
-void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
- const size_t nrowA, const size_t ncolB,
- const size_t ncolA, const float alpha,
- const Block* A, const Block* B, const float beta,
- Block* C, Context* ctx) {
+void GEMM<float, lang::Cuda>(const float alpha,
+ const Tensor* A, const Tensor* B, const float beta,
+ Tensor* C, Context* ctx) {
+ auto transA = A->transpose();
auto transa = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
+ auto transB = B->transpose();
auto transb = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
+ const size_t nrowA = A->shape()[0];
+ const size_t ncolA = A->shape()[1];
+ const size_t ncolB = B->shape()[1];
int lda = transA ? nrowA : ncolA;
int ldb = transB ? ncolA : ncolB;
int ldc = ncolB;
- const float* APtr = static_cast<const float*>(A->data());
- const float* BPtr = static_cast<const float*>(B->data());
- float* CPtr = static_cast<float*>(C->mutable_data());
+ const float* APtr = static_cast<const float*>(A->block()->data());
+ const float* BPtr = static_cast<const float*>(B->block()->data());
+ float* CPtr = static_cast<float*>(C->block()->mutable_data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
@@ -457,14 +856,93 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
ctx->stream);
}
+// template <>
+// void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// Context* ctx) {
+// const float* inPtr = static_cast<const float*>(in->block()->data());
+// float* outPtr = static_cast<float*>(out->block()->mutable_data());
+// // const size_t nrow = in->shape()[0];
+// // const size_t ncol = in->shape()[1];
+// // cuda::RowMax(nrow, ncol, inPtr, outPtr, ctx->stream);
+
+// //vector<int> reduce_row_axes_shape = in->generate_shape_cuda();
+// //reduce_row_axes_shape.back() = 1; //reduce axis 1, so we set last element d in shape {a,b,c,d} to 1
+
+// vector<int> reduce_row_axes_shape = {1,1,1,1};
+// vector<int> reduced_strides = {1,1,1,1};
+
+// //reduce_desc
+// cudnnReduceTensorDescriptor_t reduce_desc;
+// cudnnReduceTensorOp_t reduce_op = CUDNN_REDUCE_TENSOR_ADD;
+// cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+// cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
+// cudnnReduceTensorIndices_t cudnn_indices = CUDNN_REDUCE_TENSOR_NO_INDICES;
+// //cudnnReduceTensorIndices_t cudnn_indices = CUDNN_REDUCE_TENSOR_FLATTENED_INDICES;
+// cudnnIndicesType_t cudnn_indices_type = CUDNN_32BIT_INDICES;
+// cudnnCreateReduceTensorDescriptor(&reduce_desc);
+// cudnnSetReduceTensorDescriptor(reduce_desc, reduce_op, cudnn_dtype,
+// cudnn_propagation, cudnn_indices, cudnn_indices_type);
+
+// //instantiate new tensor to use new blocks as memory instead of cudaMalloc
+// //create 2 tensors of same size as input tensor
+// Shape reduction_size = {1000};
+// Tensor indices(reduction_size, in->device(), in->data_type());
+// Tensor workspace(reduction_size, in->device(), in->data_type());
+// size_t indices_bytes = indices.block()->size()*1000;
+// size_t workspace_bytes = workspace.block()->size()*1000;
+// size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
+// float* workspacePtr = static_cast<float*>(workspace.block()->mutable_data());
+// //void* indicesPtr{nullptr}; void* workspacePtr{nullptr};
+// //cudaMalloc(&indicesPtr, indices_bytes); cudaMalloc(&workspacePtr, workspace_bytes);
+
+// float alpha[1] = {1.0};
+// float beta[1] = {0.0};
+// cudnnTensorDescriptor_t in_desc, out_desc;
+// cudnnCreateTensorDescriptor(&in_desc);
+// cudnnCreateTensorDescriptor(&out_desc);
+// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+// //cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), reduce_row_axes_shape.data(), reduced_strides.data());
+// cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
+// indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
+// (void*)(&alpha), in_desc, inPtr, (void*)(&beta), out_desc, outPtr);
+
+// cudnnDestroyTensorDescriptor(in_desc);
+// cudnnDestroyTensorDescriptor(out_desc);
+// }
+
template <>
-void RowMax<float, lang::Cuda>(const size_t nrow, const size_t ncol,
- const Block* in, Block* out,
+void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->data());
- float* outPtr = static_cast<float*>(out->mutable_data());
- cuda::RowMax(nrow, ncol, inPtr, outPtr, ctx->stream);
+ const float* inPtr = static_cast<const float*>(in->block()->data());
+ float* outPtr = static_cast<float*>(out->block()->mutable_data());
+ const size_t nrow = in->shape()[0];
+ const size_t ncol = in->shape()[1];
+
+ if(in->transpose()){
+ Tensor t(in->shape(), in->device(), in->data_type());
+ float* tPtr = static_cast<float*>(t.block()->mutable_data());
+ float alpha[1] = {1.0};
+ float beta[1] = {0.0};
+
+ cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
+ cudnnTensorDescriptor_t in_desc, t_desc;
+ cudnnCreateTensorDescriptor(&in_desc);
+ cudnnCreateTensorDescriptor(&t_desc);
+ cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
+ cudnnSetTensorNdDescriptor(t_desc, cudnn_dtype, t.generate_dim_cuda(), t.generate_shape_cuda().data(), t.generate_strides_cuda().data());
+ cudnnTransformTensor(ctx->cudnn_handle, (void*)(alpha), in_desc, inPtr,
+ (void*)(beta), t_desc, tPtr);
+
+ const float* tPtr_const = static_cast<const float*>(t.block()->data());
+ cuda::RowMax(nrow, ncol, tPtr_const, outPtr, ctx->stream);
+ cudnnDestroyTensorDescriptor(in_desc);
+ cudnnDestroyTensorDescriptor(t_desc);
+ } else {
+ cuda::RowMax(nrow, ncol, inPtr, outPtr, ctx->stream);
+ }
}
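
For reference, what the cuda::RowMax kernel computes for a contiguous
row-major nrow x ncol matrix can be written on the host as follows (an
illustrative equivalent, not part of the patch):

#include <algorithm>
#include <cstddef>

// out[r] = max over c of in[r*ncol + c]
void row_max_ref(size_t nrow, size_t ncol, const float* in, float* out) {
  for (size_t r = 0; r < nrow; ++r)
    out[r] = *std::max_element(in + r * ncol, in + (r + 1) * ncol);
}

The transposed branch above exists precisely because this layout assumption
fails for non-contiguous input, so the tensor is first made contiguous with
cudnnTransformTensor before the kernel runs.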
+
} // namespace singa
#endif // USE_CUDA
[06/10] incubator-singa git commit: Streamlining of tensor.h by moving the
respective member functions into the cpp or cuda files. Removal of the
shape_multipliers_ attribute from tensor.h. Changed read-in tensors to be
passed by reference instead of by pointer
Posted by wa...@apache.org.
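
Concretely, the pass-by-reference change turns every read-only Tensor*
parameter into a const Tensor&, e.g. (sketch drawn from the diff below, using
Abs as the example):

// before: read-in tensor passed by pointer
template <>
void Abs<float, lang::Cuda>(const Tensor* in, Tensor* out, Context* ctx);
// after: read-in tensor passed by const reference
template <>
void Abs<float, lang::Cuda>(const Tensor& in, Tensor* out, Context* ctx);

Call sites change from in->block() to in.block(); output tensors remain
pointers since they are mutated.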
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/c52e2aa3/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 3e36877..6e86ca7 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -32,13 +32,88 @@
namespace singa {
-cudnnTensorDescriptor_t generate_tensorND_desc(const Tensor* x){
+// ===================== Helper Functions =============================
+
+ /*
+ cudnn imposes one requirement on tensor dimensions:
+ 1.) tensors of 4 or fewer dimensions must be padded to exactly 4 dimensions.
+ A 5d input tensor is passed to cudnn as-is. Beyond 5d, certain operations are not supported
+ (cudnnOpTensor supports up to 5d, cudnnReduceTensor supports up to 8d).
+
+ e.g. Tensor A with shape {3,3} must be given to cudnn with shape {1,1,3,3};
+ Tensor B with shape {2,3,4} must be given with shape {1,2,3,4}
+ */
+ vector<int> generate_shape_cuda(const Tensor& x) {
+ Shape shape_ = x.shape();
+ vector<int> shape_arr;
+ if(shape_.size() <= 4){
+ for (size_t n=0; n<4-shape_.size(); ++n) {
+ shape_arr.push_back(1);
+ }
+ for (size_t n=0; n<shape_.size(); ++n) {
+ shape_arr.push_back(shape_.at(n));
+ }
+ return shape_arr;
+ } else if(shape_.size() == 5){
+ for (size_t n=0; n<shape_.size(); ++n) {
+ shape_arr.push_back(shape_.at(n));
+ }
+ return shape_arr;
+ } else {
+      LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported";
+ }
+ }
+
+ int generate_dim_cuda(const Tensor& x) {
+ if(x.shape().size() <= 4){return 4;}
+ else if(x.shape().size() == 5){return 5;}
+ else{
+      LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported";
+ }
+ }
+
+/*
+ cudnn requires the strides to conform to the padded shape format as well:
+ 1.) strides must likewise be padded to a minimum of 4 entries for tensors of 4 or fewer dimensions.
+ A 5d input tensor is passed to cudnn as-is. Beyond 5d, certain operations are not supported
+ (cudnnOpTensor supports up to 5d, cudnnReduceTensor supports up to 8d).
+
+ e.g. Tensor A with shape {3,3} and stride {3,1} must be given to cudnn with shape {1,1,3,3}
+ and stride {9, 9, 3, 1} (or {9, 9, 1, 3} for the transposed layout with stride {1,3})
+ */
+ vector<int> generate_strides_cuda(const Tensor& x) {
+ Shape shape_ = x.shape();
+ vector<int> strides_ = x.strides();
+ vector<int> strides_arr;
+ int product = 1;
+ for (size_t n=0; n<(shape_.size()); ++n) {
+ product *= shape_[n];
+ }
+ if(shape_.size() <= 4){
+ for (size_t n=0; n<4-shape_.size(); ++n) {
+ strides_arr.push_back(product);
+ }
+ for (size_t n=0; n<strides_.size(); ++n) {
+ strides_arr.push_back(strides_[n]);
+ }
+ return strides_arr;
+ } else if(shape_.size() == 5){
+ for (size_t n=0; n<strides_.size(); ++n) {
+ strides_arr.push_back(strides_[n]);
+ }
+ return strides_arr;
+ } else {
+    LOG(FATAL) << "Dimensions (strides) beyond 5 are currently not supported";
+ }
+ }
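
The padding rule for strides can be checked in isolation: the dummy leading
axes get the total element count as their stride, so advancing along an axis
of extent 1 never moves the pointer. A standalone illustration (a hypothetical
helper, not part of the patch), assuming a tensor of 4 or fewer dimensions:

#include <vector>

// pad_strides({3,3}, {3,1}) -> {9, 9, 3, 1}
// pad_strides({3,3}, {1,3}) -> {9, 9, 1, 3}   (transposed layout)
std::vector<int> pad_strides(const std::vector<int>& shape,
                             const std::vector<int>& strides) {
  int product = 1;
  for (int d : shape) product *= d;
  std::vector<int> padded(4 - shape.size(), product);  // dummy leading axes
  padded.insert(padded.end(), strides.begin(), strides.end());
  return padded;
}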
+
+cudnnTensorDescriptor_t generate_tensorND_desc(const Tensor& x){
cudnnTensorDescriptor_t x_desc;
cudnnCreateTensorDescriptor(&x_desc);
cudnnSetTensorNdDescriptor(x_desc, CUDNN_DATA_FLOAT,
- x->generate_dim_cuda(),
- x->generate_shape_cuda().data(),
- x->generate_strides_cuda().data()
+ generate_dim_cuda(x),
+ generate_shape_cuda(x).data(),
+ generate_strides_cuda(x).data()
);
return x_desc;
@@ -55,12 +130,13 @@ cudnnOpTensorDescriptor_t generate_Op_desc(cudnnOpTensorOp_t op){
return op_desc;
}
+// ===================== CUDA Functions =============================
/// out[i] = |in[i]|
template <>
-void Abs<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Abs<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha1 = 1.0;
@@ -70,7 +146,7 @@ void Abs<float, lang::Cuda>(const Tensor* in, Tensor* out,
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_MAX),
(void*)(&alpha1), in_desc, inPtr,
(void*)(&alpha2), in_desc, inPtr,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
cudnnDestroyTensorDescriptor(in_desc);
}
@@ -80,74 +156,74 @@ void Set<float, lang::Cuda>(const float x, Tensor* out,
Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- cudnnSetTensor(ctx->cudnn_handle, generate_tensorND_desc(out),
+ cudnnSetTensor(ctx->cudnn_handle, generate_tensorND_desc(*out),
outPtr, (void*)(&x));
}
template <>
-void Add<float, lang::Cuda>(const Tensor* in, const float x,
+void Add<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
Set<float, lang::Cuda>(x, out, ctx);
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha = 1.0, beta = 1.0;
cudnnAddTensor(ctx->cudnn_handle,
(void*)(&alpha), generate_tensorND_desc(in), inPtr,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
}
/// out = in1 + in2
template <>
-void Add<float, lang::Cuda>(const Tensor* in1,
- const Tensor* in2, Tensor* out, Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Add<float, lang::Cuda>(const Tensor& in1,
+ const Tensor& in2, Tensor* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha1 = 1.0;
float alpha2 = 1.0;
float beta = 0.0;
- if((in1->nDim() == in2->nDim()) || (in2->nDim() == 1)){
+ if((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)){
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
(void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
(void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
} else {
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
(void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
(void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
}
}
/// out = in1 - in2
template <>
-void Sub<float, lang::Cuda>(const Tensor* in1,
- const Tensor* in2, Tensor* out, Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Sub<float, lang::Cuda>(const Tensor& in1,
+ const Tensor& in2, Tensor* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha1 = 1.0;
float alpha2 = -1.0;
float beta = 0.0;
- if((in1->nDim() == in2->nDim()) || (in2->nDim() == 1)){
+ if((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)){
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
(void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
(void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
} else {
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
(void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
(void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
}
}
@@ -156,35 +232,35 @@ void Sub<float, lang::Cuda>(const Tensor* in1,
/// if x>high, then x=high; if x<low, then x=low.
template <>
void Clamp<float, lang::Cuda>(const float low,
- const float high, const Tensor* in, Tensor* out,
+ const float high, const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::clamp(num, low, high, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
/// out = in1 / in2
template <>
-void Div<float, lang::Cuda>(const Tensor* in1,
- const Tensor* in2, Tensor* out, Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Div<float, lang::Cuda>(const Tensor& in1,
+ const Tensor& in2, Tensor* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in1->Size();
+ const size_t num = in1.Size();
//if both in1 and in2 strides are the same, we proceed to normal cuda::div
- if(in1->strides() == in2->strides()){
+ if(in1.strides() == in2.strides()){
cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
- out->Set_Strides(in1->strides());
+ out->set_strides(in1.strides());
} else { //else we transform in1 to out to store first
float alpha = 1.0;
float beta = 0.0;
- out->Set_Strides(in2->strides());
+ out->set_strides(in2.strides());
cudnnTransformTensor(ctx->cudnn_handle,
(void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
cuda::div(num, outPtr, inPtr2, outPtr, ctx->stream);
@@ -192,51 +268,51 @@ void Div<float, lang::Cuda>(const Tensor* in1,
}
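
This dispatch recurs in Div above and in EltwiseMult and Pow below: when the
two inputs share a stride layout, the element-wise kernel is called directly;
otherwise in1 is first re-laid-out into out via cudnnTransformTensor so the
kernel can then run on two buffers with matching layouts. Schematically (a
sketch of the control flow, not literal patch code):

// if (in1.strides() == in2.strides()) {
//   cuda::op(num, in1Ptr, in2Ptr, outPtr, stream);   // layouts match
// } else {
//   out->set_strides(in2.strides());                 // adopt in2's layout
//   cudnnTransformTensor(... in1 -> out ...);        // relayout in1 into out
//   cuda::op(num, outPtr, in2Ptr, outPtr, stream);   // apply in place
// }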
template <>
-void Div<float, lang::Cuda>(const float x, const Tensor* in,
+void Div<float, lang::Cuda>(const float x, const Tensor& in,
Tensor* out, Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::div(num, x, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
/// out = in * x
template <>
-void EltwiseMult<float, lang::Cuda>(const Tensor* in,
+void EltwiseMult<float, lang::Cuda>(const Tensor& in,
const float x, Tensor* out, Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha = x, beta = 0.0;
cudnnAddTensor(ctx->cudnn_handle,
(void*)(&alpha), generate_tensorND_desc(in), inPtr,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
}
/// out = in1 * in2
template <>
-void EltwiseMult<float, lang::Cuda>(const Tensor* in1,
- const Tensor* in2, Tensor* out,
+void EltwiseMult<float, lang::Cuda>(const Tensor& in1,
+ const Tensor& in2, Tensor* out,
Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+ const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in1->Size();
+ const size_t num = in1.Size();
//if both in1 and in2 strides are the same, we proceed to normal cuda::mult
- if(in1->strides() == in2->strides()){
+ if(in1.strides() == in2.strides()){
cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
- out->Set_Strides(in1->strides());
+ out->set_strides(in1.strides());
} else { //else we transform in1 to out to store first
float alpha = 1.0;
float beta = 0.0;
- out->Set_Strides(in2->strides());
+ out->set_strides(in2.strides());
cudnnTransformTensor(ctx->cudnn_handle,
(void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
cuda::mult(num, outPtr, inPtr2, outPtr, ctx->stream);
@@ -246,138 +322,138 @@ void EltwiseMult<float, lang::Cuda>(const Tensor* in1,
/// Base is e. out[i]=e^in[i]
template <>
-void Exp<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Exp<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::exp(num, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
template <>
-void GE<float, lang::Cuda>(const Tensor* in, const float x,
+void GE<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const float* inPtr = static_cast<const float*>(in->block()->data());
- const size_t num = in->Size();
+ const float* inPtr = static_cast<const float*>(in.block()->data());
+ const size_t num = in.Size();
cuda::ge(num, inPtr, x, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
template <>
-void GE<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+void GE<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
Tensor* out, Context* ctx) {
Sub<float, lang::Cuda>(in1, in2, out, ctx);
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
- const size_t num = in1->Size();
+ // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+ const size_t num = in1.Size();
//cuda::ge(num, inPtr1, inPtr2, outPtr, ctx->stream);
cuda::ge(num, outPtr, 0.0, outPtr, ctx->stream);
}
template <>
-void GT<float, lang::Cuda>(const Tensor* in, const float x,
+void GT<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const float* inPtr = static_cast<const float*>(in->block()->data());
- const size_t num = in->Size();
+ const float* inPtr = static_cast<const float*>(in.block()->data());
+ const size_t num = in.Size();
cuda::gt(num, inPtr, x, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
template <>
-void GT<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+void GT<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
Tensor* out, Context* ctx) {
Sub<float, lang::Cuda>(in1, in2, out, ctx);
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
- const size_t num = in1->Size();
+ // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+ const size_t num = in1.Size();
//cuda::gt(num, inPtr1, inPtr2, outPtr, ctx->stream);
cuda::gt(num, outPtr, 0.0, outPtr, ctx->stream);
}
template <>
-void LE<float, lang::Cuda>(const Tensor* in, const float x,
+void LE<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const float* inPtr = static_cast<const float*>(in->block()->data());
- const size_t num = in->Size();
+ const float* inPtr = static_cast<const float*>(in.block()->data());
+ const size_t num = in.Size();
cuda::le(num, inPtr, x, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
template <>
-void LE<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+void LE<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
Tensor* out, Context* ctx) {
Sub<float, lang::Cuda>(in1, in2, out, ctx);
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
- const size_t num = in1->Size();
+ // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+ const size_t num = in1.Size();
//cuda::le(num, inPtr1, inPtr2, outPtr, ctx->stream);
cuda::le(num, outPtr, 0.0, outPtr, ctx->stream);
}
/// Natural logarithm, base e (Napier's constant): out[i]=ln(in[i]).
template <>
-void Log<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Log<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::log(num, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
template <>
-void LT<float, lang::Cuda>(const Tensor* in, const float x,
+void LT<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const float* inPtr = static_cast<const float*>(in->block()->data());
- const size_t num = in->Size();
+ const float* inPtr = static_cast<const float*>(in.block()->data());
+ const size_t num = in.Size();
cuda::lt(num, inPtr, x, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
template <>
-void LT<float, lang::Cuda>(const Tensor* in1, const Tensor* in2,
+void LT<float, lang::Cuda>(const Tensor& in1, const Tensor& in2,
Tensor* out, Context* ctx) {
Sub<float, lang::Cuda>(in1, in2, out, ctx);
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- // const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- // const float* inPtr2 = static_cast<const float*>(in2->block()->data());
- const size_t num = in1->Size();
+ // const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ // const float* inPtr2 = static_cast<const float*>(in2.block()->data());
+ const size_t num = in1.Size();
//cuda::lt(num, inPtr1, inPtr2, outPtr, ctx->stream);
cuda::lt(num, outPtr, 0.0, outPtr, ctx->stream);
}
/// Element-wise operation, out[i] = in[i]^x
template <>
-void Pow<float, lang::Cuda>(const Tensor* in, const float x,
+void Pow<float, lang::Cuda>(const Tensor& in, const float x,
Tensor* out, Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::pow(num, inPtr, x, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
/// Element-wise operation, out[i] = in1[i]^in2[i]
template <>
-void Pow<float, lang::Cuda>(const Tensor* in1,
- const Tensor* in2, Tensor* out, Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Pow<float, lang::Cuda>(const Tensor& in1,
+ const Tensor& in2, Tensor* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in1->Size();
+ const size_t num = in1.Size();
- if(in1->strides() == in2->strides()){
+ if(in1.strides() == in2.strides()){
cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
- out->Set_Strides(in1->strides());
+ out->set_strides(in1.strides());
} else { //else we transform in1 to out to store first
float alpha = 1.0;
float beta = 0.0;
- out->Set_Strides(in2->strides());
+ out->set_strides(in2.strides());
cudnnTransformTensor(ctx->cudnn_handle,
(void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
cuda::pow(num, outPtr, inPtr2, outPtr, ctx->stream);
@@ -386,9 +462,9 @@ void Pow<float, lang::Cuda>(const Tensor* in1,
/// Element-wise operation, out[i]=max(0, in[i])
// template <>
-// void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// void ReLU<float, lang::Cuda>(const Tensor& in, Tensor* out,
// Context* ctx) {
-// const float* inPtr = static_cast<const float*>(in->block()->data());
+// const float* inPtr = static_cast<const float*>(in.block()->data());
// float* outPtr = static_cast<float*>(out->block()->mutable_data());
// cudnnActivationDescriptor_t act_desc;
@@ -404,8 +480,10 @@ void Pow<float, lang::Cuda>(const Tensor* in1,
// cudnnTensorDescriptor_t in_desc, out_desc;
// cudnnCreateTensorDescriptor(&in_desc);
// cudnnCreateTensorDescriptor(&out_desc);
-// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
+// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
+// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
// (void*)(&beta), out_desc, outPtr);
@@ -415,20 +493,20 @@ void Pow<float, lang::Cuda>(const Tensor* in1,
// }
template <>
-void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void ReLU<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::relu(num, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
// /// Element-wise operation, out[i]=sigmoid(in[i])
// template <>
-// void Sigmoid<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// void Sigmoid<float, lang::Cuda>(const Tensor& in, Tensor* out,
// Context* ctx) {
-// const float* inPtr = static_cast<const float*>(in->block()->data());
+// const float* inPtr = static_cast<const float*>(in.block()->data());
// float* outPtr = static_cast<float*>(out->block()->mutable_data());
// cudnnActivationDescriptor_t act_desc;
@@ -444,8 +522,10 @@ void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
// cudnnTensorDescriptor_t in_desc, out_desc;
// cudnnCreateTensorDescriptor(&in_desc);
// cudnnCreateTensorDescriptor(&out_desc);
-// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
+// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
+// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
// (void*)(&beta), out_desc, outPtr);
@@ -456,31 +536,31 @@ void ReLU<float, lang::Cuda>(const Tensor* in, Tensor* out,
/// Element-wise operation, out[i]=sigmoid(in[i])
template <>
-void Sigmoid<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Sigmoid<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::sigmoid(num, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
// out[i] = sign(in[i])
template <>
-void Sign<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Sign<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::sign(num, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
// Element-wise operation, out[i]=sqrt(in[i])
template <>
-void Sqrt<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Sqrt<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha1 = 1.0;
@@ -490,39 +570,39 @@ void Sqrt<float, lang::Cuda>(const Tensor* in, Tensor* out,
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_SQRT),
(void*)(&alpha1), in_desc, inPtr,
(void*)(&alpha2), in_desc, inPtr,
- (void*)(&beta), generate_tensorND_desc(out), outPtr
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
}
/// Element-wise operation, out[i]=in[i]^2
template <>
-void Square<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Square<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::square(num, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
// template <>
// void Sum<float, lang::Cuda>(const size_t num, const Block* in, float* out,
// Context* ctx) {
// LOG(FATAL) << "Cuda Sum is not implemented!";
-// // const float* inPtr = static_cast<const float*>(in->data());
+// // const float* inPtr = static_cast<const float*>(in.data());
// // cuda::sum(num, inPtr, out, ctx->stream);
// }
template <>
-void Sum<float, lang::Cuda>(const Tensor* in, float* out,
+void Sum<float, lang::Cuda>(const Tensor& in, float* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
//reduce all axes to 1 for cudnnReduceTensor, e.g. a Tensor with shape {2,4} is reduced to {1}
Shape reduced_shape = {1};
- Tensor t(reduced_shape, in->device(), in->data_type());
+ Tensor t(reduced_shape, in.device(), in.data_type());
float* tPtr = static_cast<float*>(t.block()->mutable_data());
- vector<int> reduce_all_axes = in->generate_shape_cuda();
+ vector<int> reduce_all_axes = generate_shape_cuda(in);
for (size_t n=0; n<reduce_all_axes.size(); ++n) {
reduce_all_axes[n] = 1;
}
@@ -539,10 +619,10 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
cudnn_propagation, cudnn_indices, cudnn_indices_type);
//instantiate 2 new tensors whose Blocks provide the indices and workspace memory, instead of calling cudaMalloc directly
- size_t reduction_size_int = Product(in->shape());
+ size_t reduction_size_int = Product(in.shape());
Shape reduction_size = {reduction_size_int*100};
- Tensor indices(reduction_size, in->device(), in->data_type());
- Tensor workspace(reduction_size, in->device(), in->data_type());
+ Tensor indices(reduction_size, in.device(), in.data_type());
+ Tensor workspace(reduction_size, in.device(), in.data_type());
size_t indices_bytes = indices.block()->size()*100;
size_t workspace_bytes = workspace.block()->size()*100;
size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
@@ -555,7 +635,7 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
(void*)(&alpha), generate_tensorND_desc(in), inPtr,
- (void*)(&beta), generate_tensorND_desc(&t), tPtr
+ (void*)(&beta), generate_tensorND_desc(t), tPtr
);
*out = tPtr[0];
@@ -564,9 +644,9 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
/// Element-wise operation, out[i]=tanh(in[i])
// template <>
-// void Tanh<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// void Tanh<float, lang::Cuda>(const Tensor& in, Tensor* out,
// Context* ctx) {
-// const float* inPtr = static_cast<const float*>(in->block()->data());
+// const float* inPtr = static_cast<const float*>(in.block()->data());
// float* outPtr = static_cast<float*>(out->block()->mutable_data());
// cudnnActivationDescriptor_t act_desc;
@@ -582,8 +662,10 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
// cudnnTensorDescriptor_t in_desc, out_desc;
// cudnnCreateTensorDescriptor(&in_desc);
// cudnnCreateTensorDescriptor(&out_desc);
-// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
+// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
+// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
// (void*)(&beta), out_desc, outPtr);
@@ -593,13 +675,13 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
// }
template <>
-void Tanh<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void Tanh<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t num = in->Size();
+ const size_t num = in.Size();
cuda::tanh(num, inPtr, outPtr, ctx->stream);
- out->Set_Strides(in->strides());
+ out->set_strides(in.strides());
}
// ================Random functions===========================================
@@ -643,65 +725,65 @@ void Gaussian<float, lang::Cuda>(const float mean,
// =========================Blas operations==================================
// ref to http://docs.nvidia.com/cuda/cublas
template <>
-void Amax<float, lang::Cuda>(const Tensor* in, size_t* out,
+void Amax<float, lang::Cuda>(const Tensor& in, size_t* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
int idx = 1;
- const size_t num = in->Size();
+ const size_t num = in.Size();
CUBLAS_CHECK(cublasIsamax(handle, num, inPtr, 1, &idx));
*out = idx - 1; // cublas index starts from 1
}
/// return the index of the element with the min value.
template <>
-void Amin<float, lang::Cuda>(const Tensor* in, size_t* out,
+void Amin<float, lang::Cuda>(const Tensor& in, size_t* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
int idx = 1;
- const size_t num = in->Size();
+ const size_t num = in.Size();
CUBLAS_CHECK(cublasIsamin(handle, num, inPtr, 1, &idx));
*out = idx - 1;
}
/// out = sum |x| for all x in in
template <>
-void Asum<float, lang::Cuda>(const Tensor* in, float* out,
+void Asum<float, lang::Cuda>(const Tensor& in, float* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- const size_t num = in->Size();
+ const size_t num = in.Size();
CUBLAS_CHECK(cublasSasum(handle, num, inPtr, 1, out));
}
/// out = alpha * in + out
template <>
void Axpy<float, lang::Cuda>(const float alpha,
- const Tensor* in, Tensor* out, Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const Tensor& in, Tensor* out, Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- const size_t num = in->Size();
+ const size_t num = in.Size();
CUBLAS_CHECK(cublasSaxpy(handle, num, &alpha, inPtr, 1, outPtr, 1));
}
/// out = \sum_i in1[i] * in2[i]
template <>
-void Dot<float, lang::Cuda>(const Tensor* in1,
- const Tensor* in2, float* out, Context* ctx) {
- const float* inPtr1 = static_cast<const float*>(in1->block()->data());
- const float* inPtr2 = static_cast<const float*>(in2->block()->data());
+void Dot<float, lang::Cuda>(const Tensor& in1,
+ const Tensor& in2, float* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1.block()->data());
+ const float* inPtr2 = static_cast<const float*>(in2.block()->data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- const size_t num = in1->Size();
+ const size_t num = in1.Size();
CUBLAS_CHECK(cublasSdot(handle, num, inPtr1, 1, inPtr2, 1, out));
}
template <>
-void Nrm2<float, lang::Cuda>(const Tensor* in, float* out,
+void Nrm2<float, lang::Cuda>(const Tensor& in, float* out,
Context* ctx) {
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- const float* inPtr = static_cast<const float*>(in->block()->data());
- const size_t num = in->Size();
+ const float* inPtr = static_cast<const float*>(in.block()->data());
+ const size_t num = in.Size();
cublasSnrm2(handle, num, inPtr, 1, out);
}
template <>
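One caveat shared by the BLAS-1 wrappers above (Amax, Amin, Asum, Axpy, Dot, Nrm2): each passes an increment of 1 to cuBLAS, reading the underlying block in storage order without consulting the tensor's strides. For single-tensor reductions this is harmless, but Axpy and Dot pair elements by storage position, so they implicitly assume both operands share the same layout, and Amax/Amin return storage-order indices. A host-side model of the Axpy semantics under that assumption:

#include <cstddef>

// Host model of the Axpy wrapper above; cublasSaxpy with increments of 1
// behaves like this loop over device memory (assumption: both buffers use
// an identical layout).
void axpy_model(float alpha, const float* in, float* out, size_t num) {
  for (size_t i = 0; i < num; i++)
    out[i] = alpha * in[i] + out[i];
}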
@@ -715,14 +797,14 @@ void Scale<float, lang::Cuda>(const float x, Tensor* out,
// NOTE: cublas uses column major order.
// http://peterwittek.com/cublas-matrix-c-style.html
template <>
-void DGMM<float, lang::Cuda>(const bool side_right, const Tensor* M, const Tensor* v,
+void DGMM<float, lang::Cuda>(const bool side_right, const Tensor& M, const Tensor& v,
Tensor* out, Context* ctx) {
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- const float* MPtr = static_cast<const float*>(M->block()->data());
- const float* vPtr = static_cast<const float*>(v->block()->data());
+ const float* MPtr = static_cast<const float*>(M.block()->data());
+ const float* vPtr = static_cast<const float*>(v.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t nrow = M->shape(0);
- const size_t ncol = M->shape(1);
+ const size_t nrow = M.shape(0);
+ const size_t ncol = M.shape(1);
if (side_right) {
CUBLAS_CHECK(cublasSdgmm(handle, CUBLAS_SIDE_LEFT, ncol, nrow, MPtr, ncol,
vPtr, 1, outPtr, ncol));
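The seemingly swapped CUBLAS_SIDE_LEFT here follows from the column-major note above: cuBLAS interprets the row-major nrow x ncol block as a column-major ncol x nrow matrix, so scaling the logical columns of M (the side_right case) means scaling the rows of the matrix cuBLAS sees. A host model of what the side_right branch computes, assuming a row-major M:

#include <cstddef>

// side_right == true on a row-major M of shape {nrow, ncol}: every logical
// column c of M is scaled by v[c], which becomes CUBLAS_SIDE_LEFT on the
// transposed view that cuBLAS sees.
void dgmm_right_model(const float* M, const float* v, float* out,
                      size_t nrow, size_t ncol) {
  for (size_t r = 0; r < nrow; r++)
    for (size_t c = 0; c < ncol; c++)
      out[r * ncol + c] = M[r * ncol + c] * v[c];
}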
@@ -732,16 +814,16 @@ void DGMM<float, lang::Cuda>(const bool side_right, const Tensor* M, const Tenso
}
}
template <>
-void GEMV<float, lang::Cuda>(const float alpha, const Tensor* A, const Tensor* v,
+void GEMV<float, lang::Cuda>(const float alpha, const Tensor& A, const Tensor& v,
const float beta, Tensor* out, Context* ctx) {
- const float* APtr = static_cast<const float*>(A->block()->data());
- const float* vPtr = static_cast<const float*>(v->block()->data());
+ const float* APtr = static_cast<const float*>(A.block()->data());
+ const float* vPtr = static_cast<const float*>(v.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t m = A->shape()[0];
- const size_t n = A->shape()[1];
+ const size_t m = A.shape()[0];
+ const size_t n = A.shape()[1];
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- if (!(A->transpose()))
+ if (!(A.transpose()))
CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr,
1, &beta, outPtr, 1));
else
@@ -752,20 +834,20 @@ void GEMV<float, lang::Cuda>(const float alpha, const Tensor* A, const Tensor* v
// http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm
template <>
void GEMM<float, lang::Cuda>(const float alpha,
- const Tensor* A, const Tensor* B, const float beta,
+ const Tensor& A, const Tensor& B, const float beta,
Tensor* C, Context* ctx) {
- auto transA = A->transpose();
+ auto transA = A.transpose();
auto transa = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
- auto transB = B->transpose();
+ auto transB = B.transpose();
auto transb = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
- const size_t nrowA = A->shape()[0];
- const size_t ncolA = A->shape()[1];
- const size_t ncolB = B->shape()[1];
+ const size_t nrowA = A.shape()[0];
+ const size_t ncolA = A.shape()[1];
+ const size_t ncolB = B.shape()[1];
int lda = transA ? nrowA : ncolA;
int ldb = transB ? ncolA : ncolB;
int ldc = ncolB;
- const float* APtr = static_cast<const float*>(A->block()->data());
- const float* BPtr = static_cast<const float*>(B->block()->data());
+ const float* APtr = static_cast<const float*>(A.block()->data());
+ const float* BPtr = static_cast<const float*>(B.block()->data());
float* CPtr = static_cast<float*>(C->block()->mutable_data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
@@ -799,15 +881,15 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
}
// template <>
-// void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
+// void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out,
// Context* ctx) {
-// const float* inPtr = static_cast<const float*>(in->block()->data());
+// const float* inPtr = static_cast<const float*>(in.block()->data());
// float* outPtr = static_cast<float*>(out->block()->mutable_data());
-// // const size_t nrow = in->shape()[0];
-// // const size_t ncol = in->shape()[1];
+// // const size_t nrow = in.shape()[0];
+// // const size_t ncol = in.shape()[1];
// // cuda::RowMax(nrow, ncol, inPtr, outPtr, ctx->stream);
-// //vector<int> reduce_row_axes_shape = in->generate_shape_cuda();
+// //vector<int> reduce_row_axes_shape = in.generate_shape_cuda();
// //reduce_row_axes_shape.back() = 1; //reduce axis 1, so we set last element d in shape {a,b,c,d} to 1
// vector<int> reduce_row_axes_shape = {1,1,1,1};
@@ -828,8 +910,8 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
// //instantiate new tensor to use new blocks as memory instead of cudaMalloc
// //create 2 tensors of same size as input tensor
// Shape reduction_size = {1000};
-// Tensor indices(reduction_size, in->device(), in->data_type());
-// Tensor workspace(reduction_size, in->device(), in->data_type());
+// Tensor indices(reduction_size, in.device(), in.data_type());
+// Tensor workspace(reduction_size, in.device(), in.data_type());
// size_t indices_bytes = indices.block()->size()*1000;
// size_t workspace_bytes = workspace.block()->size()*1000;
// size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
@@ -842,9 +924,12 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
// cudnnTensorDescriptor_t in_desc, out_desc;
// cudnnCreateTensorDescriptor(&in_desc);
// cudnnCreateTensorDescriptor(&out_desc);
-// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
-// //cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), reduce_row_axes_shape.data(), reduced_strides.data());
+// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
+// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
+// //cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
+// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
+// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
+// reduce_row_axes_shape.data(), reduced_strides.data());
// cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
// indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
// (void*)(&alpha), in_desc, inPtr, (void*)(&beta), out_desc, outPtr);
@@ -854,15 +939,15 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
// }
template <>
-void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
+void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
- const float* inPtr = static_cast<const float*>(in->block()->data());
+ const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- const size_t nrow = in->shape()[0];
- const size_t ncol = in->shape()[1];
+ const size_t nrow = in.shape()[0];
+ const size_t ncol = in.shape()[1];
- if(in->transpose()){
- Tensor t(in->shape(), in->device(), in->data_type());
+ if(in.transpose()){
+ Tensor t(in.shape(), in.device(), in.data_type());
float* tPtr = static_cast<float*>(t.block()->mutable_data());
float alpha = 1.0;
@@ -870,7 +955,7 @@ void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
cudnnTransformTensor(ctx->cudnn_handle,
(void*)(&alpha), generate_tensorND_desc(in), inPtr,
- (void*)(&beta), generate_tensorND_desc(&t), tPtr
+ (void*)(&beta), generate_tensorND_desc(t), tPtr
);
const float* tPtr_const = static_cast<const float*>(t.block()->data());
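The pattern for transposed input here: allocate a scratch tensor t of the same shape, let cudnnTransformTensor rewrite the strided view into contiguous layout, then run the row reduction on tPtr_const (the remainder of the function is elided in this hunk). A host-side sketch of the materialization step, assuming a 2-d float view with arbitrary strides:

#include <cstddef>
#include <vector>

// Copies a strided 2-d view into a contiguous row-major buffer, mirroring
// what cudnnTransformTensor does above before the reduction runs.
std::vector<float> materialize_2d(const float* src, size_t nrow, size_t ncol,
                                  int row_stride, int col_stride) {
  std::vector<float> dst(nrow * ncol);
  for (size_t r = 0; r < nrow; r++)
    for (size_t c = 0; c < ncol; c++)
      dst[r * ncol + c] = src[r * row_stride + c * col_stride];
  return dst;
}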
[09/10] incubator-singa git commit: reformat the code
Posted by wa...@apache.org.
reformat the code
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/3e2b75cb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/3e2b75cb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/3e2b75cb
Branch: refs/heads/master
Commit: 3e2b75cbe86908f551ac3f492a8aba07008b227b
Parents: c52e2aa
Author: Wang Wei <dc...@nus.edu.sg>
Authored: Sun May 13 20:42:52 2018 +0800
Committer: Wang Wei <dc...@nus.edu.sg>
Committed: Sun May 13 20:42:52 2018 +0800
----------------------------------------------------------------------
include/singa/core/tensor.h | 55 +++---
src/core/tensor/tensor.cc | 291 ++++++++++++++++----------------
src/core/tensor/tensor_math_cpp.h | 163 +++++++++---------
src/core/tensor/tensor_math_cuda.h | 286 +++++++++++++++----------------
4 files changed, 403 insertions(+), 392 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2b75cb/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index e25aafd..3cc28ff 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -36,7 +36,8 @@ typedef vector<size_t> Shape;
/// hardcode the width of types defined in DataType
const size_t kDataWidth[] = {sizeof(float), sizeof(float) / 2,
sizeof(int), sizeof(char),
- sizeof(double), sizeof(unsigned char)};
+ sizeof(double), sizeof(unsigned char)
+ };
inline size_t SizeOf(DataType t) {
static_assert(kNumDataType == sizeof(kDataWidth) / sizeof(size_t),
"Num of data types not match num of data width");
@@ -51,7 +52,7 @@ inline size_t SizeOf(DataType t) {
/// Tensor.
/// For all operations, if the result tensor is passed as an argument,
/// then it must be set up correctly (shape, device). Otherwise, runtime error
-/// like SegmentFault would happen. Simply type/device check would be conducted.
+/// like a segmentation fault would happen. Only simple type/device checks are conducted.
class Tensor {
public:
~Tensor();
@@ -59,12 +60,17 @@ class Tensor {
explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
explicit Tensor(const Shape &shape, DataType dtype = kFloat32);
- Tensor(Shape &&shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
- Tensor(const Shape &shape, std::shared_ptr<Device> dev, DataType dtype = kFloat32);
+ Tensor(Shape &&shape,
+ std::shared_ptr<Device> dev,
+ DataType dtype = kFloat32);
+ Tensor(const Shape &shape,
+ std::shared_ptr<Device> dev,
+ DataType dtype = kFloat32);
/// Copy Tensor to share the internal data. No deep copy.
Tensor(const Tensor &from);
- /// Copy Tensor to share the internal data. No deep copy. For 2 tensors sharing same block but different strides.
+ /// Copy Tensor to share the internal data. No deep copy.
+ /// For 2 tensors sharing same block but different strides.
Tensor(const Tensor &from, Shape &new_shape, vector<int> &new_strides);
/// Copy Tensor to share the internal data. No deep copy.
Tensor(Tensor &&from);
@@ -89,7 +95,7 @@ class Tensor {
void GetValue(SType *value, const size_t num) {
CHECK(device_ == defaultDevice);
const SType* ptr = data<SType>();
- for(size_t i = 0; i < num; i++) value[i] = ptr[i];
+ for (size_t i = 0; i < num; i++) value[i] = ptr[i];
}
/// data type, including kFloat16, kFloat32, kInt
@@ -106,7 +112,7 @@ class Tensor {
bool empty() const { return nDim() == 0; }
- //bool transpose() const { return transpose_; }
+ /// Return true if the tensor is transposed, i.e., its last stride is not 1
bool transpose() const { return (strides_.back() != 1); }
const vector<int>& strides() const { return strides_; }
@@ -131,9 +137,8 @@ class Tensor {
void Reshape(Shape &&shape);
/// Reset the shape, device, and data type as given tensor.
- /// If block size changes, then reallocate a new block. The previous block
- /// would
- /// be deleted.
+ /// If block size changes, then reallocate a new block.
+ /// The previous block would be deleted.
void ResetLike(const Tensor &t);
/// Reset the data type, it would reallocate block if type changes.
@@ -176,9 +181,11 @@ class Tensor {
/// No data copy, just set the transpose_ filed of the returned tensor.
Tensor T() const;
+ /// Reverse the shape vector
Tensor Transpose() const;
- Tensor Transpose(Shape axes) const;
+ /// Change the axes
+ Tensor Transpose(const vector<size_t>& axes) const;
/// Copy the meta info with data block shared.
Tensor &operator=(const Tensor &in);
@@ -219,23 +226,24 @@ class Tensor {
float L2() const;
//generate strides automatically if stride field is not passed
-void generate_strides(){
- if(shape_.size()==0){
- strides_ = {1};
- return void();
- }
+ void generate_strides() {
strides_.clear();
+ if (shape_.size() == 0) {
+ strides_.push_back(1);
+ return;
+ }
+
size_t dim = Size();
int cumulative_product = 1;
- for (size_t n=0; n<shape_.size(); ++n) {
- cumulative_product = cumulative_product*shape_[n];
- strides_.push_back(dim/cumulative_product);
+ for (size_t n = 0; n < shape_.size(); ++n) {
+ cumulative_product = cumulative_product * shape_[n];
+ strides_.push_back(dim / cumulative_product);
}
-};
+ }
-void set_strides(const vector<int> new_strides){
- strides_ = new_strides;
-}
+ void set_strides(const vector<int> new_strides) {
+ strides_ = new_strides;
+ }
protected:
DataType data_type_ = kFloat32;
@@ -247,7 +255,6 @@ void set_strides(const vector<int> new_strides){
vector<int> strides_ = {};
}; //end of tensor class
-typedef Shape::iterator ShapeIter;
inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
if (len == 0) len = shape.size();
if (len == 0)
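For reference, generate_strides() in the hunk above produces row-major strides: axis n receives the product of the sizes of all later axes. A standalone sketch with the same convention (not the class method itself), with a worked example in the comments:

#include <cstddef>
#include <vector>

// Row-major strides for a shape; the same loop as generate_strides() above.
// Worked example for shape {2, 3, 4} (Size() == 24):
//   n=0: cumulative_product = 2   -> 24 / 2  = 12
//   n=1: cumulative_product = 6   -> 24 / 6  = 4
//   n=2: cumulative_product = 24  -> 24 / 24 = 1
// giving strides {12, 4, 1}.
std::vector<int> row_major_strides(const std::vector<size_t>& shape) {
  if (shape.empty()) return {1};  // scalar case, as in generate_strides()
  size_t dim = 1;
  for (size_t s : shape) dim *= s;
  std::vector<int> strides;
  size_t cumulative_product = 1;
  for (size_t s : shape) {
    cumulative_product *= s;
    strides.push_back(static_cast<int>(dim / cumulative_product));
  }
  return strides;
}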
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2b75cb/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index a4efd64..d98e6a6 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -21,7 +21,6 @@
#include "./tensor_math_cuda.h"
#include "./tensor_math_opencl.h"
#include <utility>
-#include <iostream>
namespace singa {
@@ -31,21 +30,21 @@ Tensor::~Tensor() {
block_ = nullptr;
}
-Tensor::Tensor() {
+Tensor::Tensor() {
device_ = defaultDevice;
strides_ = {1};
}
-//non-strided constructors
+//non-strided constructors
Tensor::Tensor(const Shape &shape, DataType dtype)
- : data_type_(dtype), device_(defaultDevice), shape_(shape) {
+ : data_type_(dtype), device_(defaultDevice), shape_(shape) {
size_t size = Product(shape_) * SizeOf(data_type_);
if (size)
block_ = device_->NewBlock((int)size);
generate_strides();
}
Tensor::Tensor(Shape &&shape, DataType dtype)
- : data_type_(dtype), device_(defaultDevice), shape_(shape) {
+ : data_type_(dtype), device_(defaultDevice), shape_(shape) {
size_t size = Product(shape_) * SizeOf(data_type_);
if (size)
block_ = device_->NewBlock((int)size);
@@ -55,14 +54,14 @@ Tensor::Tensor(Shape &&shape, DataType dtype)
//non-strided constructors with device
Tensor::Tensor(const Shape &shape, std::shared_ptr<Device> device,
DataType dtype)
- : data_type_(dtype), device_(device), shape_(shape) {
+ : data_type_(dtype), device_(device), shape_(shape) {
size_t size = Product(shape_) * SizeOf(data_type_);
if (size)
block_ = device_->NewBlock((int)size);
generate_strides();
}
Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype)
- : data_type_(dtype), device_(device), shape_(shape) {
+ : data_type_(dtype), device_(device), shape_(shape) {
size_t size = Product(shape_) * SizeOf(data_type_);
if (size)
block_ = device_->NewBlock((int)size);
@@ -71,34 +70,34 @@ Tensor::Tensor(Shape &&shape, std::shared_ptr<Device> device, DataType dtype)
Tensor::Tensor(const Tensor &in)
- : //transpose_(in.transpose_),
- data_type_(in.data_type_),
- device_(in.device_),
- block_(in.block()),
- shape_(in.shape_),
- strides_(in.strides_) {
+ : //transpose_(in.transpose_),
+ data_type_(in.data_type_),
+ device_(in.device_),
+ block_(in.block()),
+ shape_(in.shape_),
+ strides_(in.strides_) {
if (block_ != nullptr)
block_->IncRefCount();
}
//strided constructor taking in a tensor, shape and strides
Tensor::Tensor(const Tensor &in, Shape &new_shape, vector<int> &new_strides)
- : //transpose_(in.transpose_),
- data_type_(in.data_type_),
- device_(in.device_),
- block_(in.block()),
- shape_(new_shape),
- strides_(new_strides) {
+ : //transpose_(in.transpose_),
+ data_type_(in.data_type_),
+ device_(in.device_),
+ block_(in.block()),
+ shape_(new_shape),
+ strides_(new_strides) {
if (block_ != nullptr)
block_->IncRefCount();
}
Tensor::Tensor(Tensor &&in)
- : //transpose_(in.transpose_),
- data_type_(in.data_type_),
- device_(in.device_),
- shape_(std::move(in.shape_)),
- strides_(in.strides_) {
+ : //transpose_(in.transpose_),
+ data_type_(in.data_type_),
+ device_(in.device_),
+ shape_(std::move(in.shape_)),
+ strides_(in.strides_) {
block_ = in.block_;
in.block_ = nullptr;
}
@@ -123,10 +122,13 @@ void Tensor::ResetLike(const Tensor &in) {
strides_ = in.strides_;
}
-//if tensor is not transposed yet i.e strides == 1, then we simply change the shape and generate new default strides
-//if tensor is already transposed i.e strides != 1, it should be copied to a new tensor with newly generated default strides
+// if tensor is not transposed yet i.e strides == 1,
+// then we simply change the shape and generate new default strides
+// if tensor is already transposed i.e strides != 1,
+// it should be copied to a new tensor with newly generated default strides
+// TODO(wangwei) raise an error if the shapes do not match
void Tensor::Reshape(const Shape &shape) {
- if(strides_.size()==0)
+ if (strides_.size() == 0)
strides_.push_back(1);
if (Product(shape_) != Product(shape)) {
@@ -141,7 +143,7 @@ void Tensor::Reshape(const Shape &shape) {
}
void Tensor::Reshape(Shape &&shape) {
- if(strides_.size()==0)
+ if (strides_.size() == 0)
strides_.push_back(1);
if (Product(shape_) != Product(shape)) {
@@ -196,12 +198,12 @@ void Tensor::CopyDataFromHostPtr(const DType *src, const size_t num,
}
}
template void Tensor::CopyDataFromHostPtr(const unsigned char *src,
- const size_t num,
- const size_t offset);
+ const size_t num,
+ const size_t offset);
template void Tensor::CopyDataFromHostPtr(const float *src, const size_t num,
- const size_t offset);
+ const size_t offset);
template void Tensor::CopyDataFromHostPtr(const int *src, const size_t num,
- const size_t offset);
+ const size_t offset);
void Tensor::CopyData(const Tensor &src) {
CHECK_EQ(Size(), src.Size());
@@ -224,44 +226,44 @@ void Tensor::FromProto(const singa::TensorProto &proto) {
strides_.clear();
for (int32_t s : proto.strides()) strides_.push_back(s);
switch (data_type_) {
- case kFloat32: {
- std::unique_ptr<float[]> data_ptr(new float[Product(shape_)]);
- for (size_t i = 0; i < Product(shape_); ++i)
- data_ptr[i] = static_cast<float>(proto.float_data((int)i));
- CopyDataFromHostPtr<float>(data_ptr.get(), Product(shape_));
- break;
- }
- case kDouble: {
- std::unique_ptr<double[]> data(new double[Product(shape_)]);
- for (size_t i = 0; i < Product(shape_); ++i)
- data[i] = proto.double_data((int)i);
- CopyDataFromHostPtr<double>(data.get(), Product(shape_));
- break;
- }
- case kInt: {
- std::unique_ptr<int[]> data(new int[Product(shape_)]);
- for (size_t i = 0; i < Product(shape_); ++i) data[i] = proto.int_data((int)i);
- CopyDataFromHostPtr<int>(data.get(), Product(shape_));
- break;
- }
- ///TODO(wangji): Implement to support C++ type char using bytes type in protobuf
- /// which is equivalent to string type is different from the other cases. The kchar
- /// and kUChar case is to be implemented.
- /*
- case kChar: {
- std::unique_ptr<char[]> data(new char[Product(shape_)]);
- for (size_t i = 0; i < Product(shape_); ++i)
- data[i] = static_cast<char>(proto.bytes_data(i));
- break;
- }
- case kUChar: {
- std::unique_ptr<unsigned char[]> data(new unsigned char[Product(shape_)]);
- for (size_t i = 0; i < Product(shape_); ++i)
- data[i] = static_cast<unsigned char>(proto.bytes_data(i));
- break;
- }
- */
- default: { LOG(FATAL) << "Unsupported Type" << DataType_Name(data_type_); }
+ case kFloat32: {
+ std::unique_ptr<float[]> data_ptr(new float[Product(shape_)]);
+ for (size_t i = 0; i < Product(shape_); ++i)
+ data_ptr[i] = static_cast<float>(proto.float_data((int)i));
+ CopyDataFromHostPtr<float>(data_ptr.get(), Product(shape_));
+ break;
+ }
+ case kDouble: {
+ std::unique_ptr<double[]> data(new double[Product(shape_)]);
+ for (size_t i = 0; i < Product(shape_); ++i)
+ data[i] = proto.double_data((int)i);
+ CopyDataFromHostPtr<double>(data.get(), Product(shape_));
+ break;
+ }
+ case kInt: {
+ std::unique_ptr<int[]> data(new int[Product(shape_)]);
+ for (size_t i = 0; i < Product(shape_); ++i) data[i] = proto.int_data((int)i);
+ CopyDataFromHostPtr<int>(data.get(), Product(shape_));
+ break;
+ }
+ /// TODO(wangji): Implement support for the C++ char type using the protobuf
+ /// bytes type, which (unlike the other cases) is equivalent to a string type.
+ /// The kChar and kUChar cases are still to be implemented.
+ /*
+ case kChar: {
+ std::unique_ptr<char[]> data(new char[Product(shape_)]);
+ for (size_t i = 0; i < Product(shape_); ++i)
+ data[i] = static_cast<char>(proto.bytes_data(i));
+ break;
+ }
+ case kUChar: {
+ std::unique_ptr<unsigned char[]> data(new unsigned char[Product(shape_)]);
+ for (size_t i = 0; i < Product(shape_); ++i)
+ data[i] = static_cast<unsigned char>(proto.bytes_data(i));
+ break;
+ }
+ */
+ default: { LOG(FATAL) << "Unsupported Type" << DataType_Name(data_type_); }
}
}
@@ -277,44 +279,44 @@ void Tensor::ToProto(singa::TensorProto *proto) const {
proto->add_strides(s);
}
switch (data_type_) {
- case kFloat32: {
- proto->clear_float_data();
- const float *data_ptr = data<float>();
- for (size_t i = 0; i < Product(shape_); ++i)
- proto->add_float_data(data_ptr[i]);
- break;
- }
- case kDouble: {
- proto->clear_double_data();
- const double *data_ptr = data<double>();
- for (size_t i = 0; i < Product(shape_); ++i)
- proto->add_double_data(data_ptr[i]);
- break;
- }
- case kInt: {
- proto->clear_int_data();
- const int *data_ptr = data<int>();
- for (size_t i = 0; i < Product(shape_); ++i)
- proto->add_int_data(data_ptr[i]);
- break;
- }
- /*
- case kChar: {
- proto->clear_bytes_data();
- const char *data = data<char>();
- for (size_t i = 0; i < Product(shape_); ++i)
- proto->add_bytes_data(static_cast<unsigned char>(data[i]));
- break;
- }
- case kUChar: {
- proto->clear_bytes_data();
- const unsigned char *data = data<unsigned char>();
- for (size_t i = 0; i < Product(shape_); ++i)
- proto->add_bytes_data(static_cast<unsigned char>(data[i]));
- break;
- }
- */
- default: { LOG(FATAL) << "Unsupported Type" << DataType_Name(data_type_); }
+ case kFloat32: {
+ proto->clear_float_data();
+ const float *data_ptr = data<float>();
+ for (size_t i = 0; i < Product(shape_); ++i)
+ proto->add_float_data(data_ptr[i]);
+ break;
+ }
+ case kDouble: {
+ proto->clear_double_data();
+ const double *data_ptr = data<double>();
+ for (size_t i = 0; i < Product(shape_); ++i)
+ proto->add_double_data(data_ptr[i]);
+ break;
+ }
+ case kInt: {
+ proto->clear_int_data();
+ const int *data_ptr = data<int>();
+ for (size_t i = 0; i < Product(shape_); ++i)
+ proto->add_int_data(data_ptr[i]);
+ break;
+ }
+ /*
+ case kChar: {
+ proto->clear_bytes_data();
+ const char *data = data<char>();
+ for (size_t i = 0; i < Product(shape_); ++i)
+ proto->add_bytes_data(static_cast<unsigned char>(data[i]));
+ break;
+ }
+ case kUChar: {
+ proto->clear_bytes_data();
+ const unsigned char *data = data<unsigned char>();
+ for (size_t i = 0; i < Product(shape_); ++i)
+ proto->add_bytes_data(static_cast<unsigned char>(data[i]));
+ break;
+ }
+ */
+ default: { LOG(FATAL) << "Unsupported Type" << DataType_Name(data_type_); }
}
}
@@ -353,9 +355,9 @@ Tensor Tensor::Transpose() const {
t.device_ = device_;
t.data_type_ = data_type_;
t.strides_.clear();
- for(size_t n=0; n<shape_.size(); ++n){
- t.shape_.push_back(shape_[shape_.size()-n-1]);
- t.strides_.push_back(strides_[shape_.size()-n-1]);
+ for (size_t n = 0; n < shape_.size(); ++n) {
+ t.shape_.push_back(shape_[shape_.size() - n - 1]);
+ t.strides_.push_back(strides_[shape_.size() - n - 1]);
}
t.block_ = block_;
block_->IncRefCount();
@@ -363,6 +365,7 @@ Tensor Tensor::Transpose() const {
}
//transpose with axes
+// TODO(wangwei) the shape and axes should match
Tensor Tensor::Transpose(Shape axes) const {
// if(axes.size() != shape_.size()){
// std::cout << "Warning: Size of input axes doesn't match size of shape" << std::endl;
@@ -375,7 +378,7 @@ Tensor Tensor::Transpose(Shape axes) const {
t.device_ = device_;
t.data_type_ = data_type_;
t.strides_.clear();
- for(size_t n=0; n<axes.size(); ++n){
+ for (size_t n = 0; n < axes.size(); ++n) {
t.shape_.push_back(shape_[axes[n]]);
t.strides_.push_back(strides_[axes[n]]);
}
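Both Transpose() overloads are zero-copy: they permute shape_ and strides_ and bump the block's reference count. A worked example of the reversing overload, assuming a row-major 2x3 tensor:

// Before: shape {2, 3}, strides {3, 1}   (element (r, c) at offset r*3 + c)
// After:  shape {3, 2}, strides {1, 3}   (element (c, r) at offset c + r*3)
// The same block is shared and no data moves; strides_.back() is now 3 != 1,
// which is exactly the condition the transpose() accessor tests.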
@@ -404,7 +407,7 @@ Tensor &Tensor::operator=(Tensor &&in) {
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
//transpose_ = in.transpose_;
- strides_ = in.strides_;
+ strides_ = std::move(in.strides_);
data_type_ = in.data_type_;
shape_ = std::move(in.shape_);
device_ = in.device_;
@@ -470,7 +473,7 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
(int)s_offset);
} else if (src_dev->lang() == kCpp) {
dst_dev->CopyDataToFrom(to, from, nBytes, kHostToDevice, (int)d_offset,
- (int)s_offset);
+ (int)s_offset);
} else {
LOG(FATAL) << "Not support mem copy betwee Cuda and OpenCL device";
}
@@ -548,7 +551,7 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
float Tensor::L1() const {
float nrm = 0.0f;
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
- device_->Exec([&nrm, this](Context *ctx) {
+ device_->Exec([&nrm, this](Context * ctx) {
DType ret = DType(0);
Asum<DType, Lang>(*this, &ret, ctx);
nrm = TypeCast<DType, float>(ret);
@@ -561,7 +564,7 @@ float Tensor::L1() const {
float Tensor::L2() const {
float nrm = 0.0f;
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
- device_->Exec([&nrm, this](Context *ctx) {
+ device_->Exec([&nrm, this](Context * ctx) {
DType ret = DType(0);
Nrm2<DType, Lang>(*this, &ret, ctx);
nrm = TypeCast<DType, float>(ret);
@@ -577,7 +580,7 @@ void Tensor::SetValue(const SType x) {
auto ptr = block_;
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
// TODO(wangwei) cast x to DType
- device_->Exec([this, x, ptr](Context *ctx) {
+ device_->Exec([this, x, ptr](Context * ctx) {
Set<DType, Lang>(x, this, ctx);
}, {}, {ptr});
});
@@ -691,7 +694,7 @@ void Div(const SType alpha, const Tensor &in, Tensor *out) {
CHECK(in.shape() == out->shape());
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
// TODO(wangwei) type cast SType to DType;
- in.device()->Exec([alpha, in, out](Context *ctx) {
+ in.device()->Exec([alpha, in, out](Context * ctx) {
Div<DType, Lang>(alpha, in, out, ctx);
}, {in.block()}, {out->block()});
});
@@ -727,7 +730,7 @@ float Sum<float>(const Tensor &in) {
Tensor one(in.shape(), in.device(), in.data_type());
one.SetValue(1.0f);
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
- one.device()->Exec([in, one, &s](Context *ctx) {
+ one.device()->Exec([in, one, &s](Context * ctx) {
DType ret = DType(0);
Dot<DType, Lang>(in, one, &ret, ctx);
s = ret;
@@ -758,7 +761,7 @@ Tensor SoftMax(const Tensor &in) {
Tensor RowMax(const Tensor &in) {
Tensor ret({in.shape(0)}, in.device(), in.data_type());
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
- in.device()->Exec([&in, &ret](Context *ctx) {
+ in.device()->Exec([&in, &ret](Context * ctx) {
//size_t nrow = 1;
//if (in.nDim() > 1) nrow = in.shape(0);
//size_t ncol = in.Size() / nrow;
@@ -805,7 +808,7 @@ void AddColumn(const SType alpha, const SType beta, const Tensor &v,
Tensor vmat = Reshape(v, Shape{nb_row, 1});
Mult(alpha, vmat, one, beta, M);
}
-}
+}
template
void AddColumn(const float alpha, const float beta, const Tensor &v, Tensor *M);
@@ -846,16 +849,16 @@ Tensor ConcatOn(const vector<Tensor> &in, int axis) {
CHECK_GE(dim, 2u) << " Only work for tensor of dim >=2 ";
size_t size = in[0].Size() / in[0].shape(axis);
size_t new_size = 0u;
- for (const auto& t: in) {
+ for (const auto& t : in) {
CHECK_EQ(dim, t.shape().size()) << "All tensors should have the same dim";
CHECK_EQ(size, t.Size() / t.shape(axis)) << "The sizes of all axes should "
- <<" be the same except the concatenated axis";
+ << " be the same except the concatenated axis";
new_size += t.shape(axis);
}
out_shape[axis] = new_size;
if (axis == 0) {
size_t nrow = 0;
- for (const auto& t: in) {
+ for (const auto& t : in) {
nrow += t.shape(0);
tmp.push_back(Reshape(t, {t.shape(0), t.Size() / t.shape(0)}));
}
@@ -863,7 +866,7 @@ Tensor ConcatOn(const vector<Tensor> &in, int axis) {
ret.Reshape(out_shape);
return ret;
} else {
- for (const auto& t: in) {
+ for (const auto& t : in) {
size_t nrow = 1;
for (int i = 0; i < axis; i++)
nrow *= t.shape(i);
@@ -944,7 +947,7 @@ Tensor SliceOn(const Tensor&in, const size_t start, const size_t end, int axis)
out_shape[axis] = end - start;
if (axis == 0) {
auto ret = SliceRows(Reshape(in, {in.shape(0), in.Size() / in.shape(0)}),
- start, end);
+ start, end);
ret.Reshape(out_shape);
return ret;
} else {
@@ -953,7 +956,7 @@ Tensor SliceOn(const Tensor&in, const size_t start, const size_t end, int axis)
nrow *= in.shape(i);
auto suffix = in.Size() / nrow / in.shape(axis);
auto ret = SliceColumns(Reshape(in, {nrow, in.Size() / nrow}),
- start * suffix, end * suffix);
+ start * suffix, end * suffix);
ret.Reshape(out_shape);
return ret;
}
@@ -997,9 +1000,9 @@ void MultColumn(const Tensor &v, Tensor *M) {
CHECK_EQ(v.Size(), M->shape(0));
CheckDataTypeAndLang(*M, v);
TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
- v.device()->Exec([M, v](Context *ctx) {
+ v.device()->Exec([M, v](Context * ctx) {
DGMM<DType, Lang>(false, *M, v,
- M, ctx);
+ M, ctx);
}, {M->block(), v.block()}, {M->block()});
});
}
@@ -1012,9 +1015,9 @@ void MultRow(const Tensor &v, Tensor *M) {
CHECK_EQ(v.Size(), M->shape(1));
CheckDataTypeAndLang(*M, v);
TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
- v.device()->Exec([M, v](Context *ctx) {
+ v.device()->Exec([M, v](Context * ctx) {
DGMM<DType, Lang>(true, *M, v,
- M, ctx);
+ M, ctx);
}, {M->block(), v.block()}, {M->block()});
});
}
@@ -1059,7 +1062,7 @@ template <typename SType>
void Bernoulli(const SType p, Tensor *out) {
TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
auto prob = TypeCast<SType, DType>(p);
- out->device()->Exec([prob, out](Context *ctx) {
+ out->device()->Exec([prob, out](Context * ctx) {
Bernoulli<DType, Lang>(prob, out, ctx);
}, {}, {out->block()}, true);
});
@@ -1072,7 +1075,7 @@ void Uniform(const SType low, const SType high, Tensor *out) {
TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
auto l = TypeCast<SType, DType>(low);
auto h = TypeCast<SType, DType>(high);
- out->device()->Exec([l, h, out](Context *ctx) {
+ out->device()->Exec([l, h, out](Context * ctx) {
Uniform<DType, Lang>(l, h, out, ctx);
}, {}, {out->block()}, true);
});
@@ -1085,7 +1088,7 @@ void Gaussian(const SType mean, const SType std, Tensor *out) {
TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
auto m = TypeCast<SType, DType>(mean);
auto s = TypeCast<SType, DType>(std);
- out->device()->Exec([m, s, out](Context *ctx) {
+ out->device()->Exec([m, s, out](Context * ctx) {
Gaussian<DType, Lang>(m, s, out, ctx);
}, {}, {out->block()}, true);
});
@@ -1098,7 +1101,7 @@ template <typename SType>
void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
auto a = TypeCast<SType, DType>(alpha);
- out->device()->Exec([a, in, out](Context *ctx) {
+ out->device()->Exec([a, in, out](Context * ctx) {
Axpy<DType, Lang>(a, in, out, ctx);
}, {in.block(), out->block()}, {out->block()});
});
@@ -1128,7 +1131,7 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
auto a = TypeCast<SType, DType>(alpha);
auto b = TypeCast<SType, DType>(beta);
- C->device()->Exec([a, A, b, B, C](Context *ctx) {
+ C->device()->Exec([a, A, b, B, C](Context * ctx) {
GEMV<DType, Lang>(a, A, B, b, C, ctx);
}, {A.block(), B.block()}, {C->block()});
});
@@ -1137,9 +1140,9 @@ void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
auto a = TypeCast<SType, DType>(alpha);
auto b = TypeCast<SType, DType>(beta);
- C->device()->Exec([a, A, b, B, C](Context *ctx) {
+ C->device()->Exec([a, A, b, B, C](Context * ctx) {
GEMM<DType, Lang>(a, A, B, b, C,
- ctx);
+ ctx);
}, {A.block(), B.block()}, {C->block()});
});
}
@@ -1155,10 +1158,10 @@ void ComputeCrossEntropy(const Tensor &p, const Tensor &t, Tensor *loss) {
if (p.nDim() == 2u) batchsize = p.shape(0);
size_t dim = p.Size() / batchsize;
TYPE_LANG_SWITCH(p.data_type(), DType, p.device()->lang(), Lang, {
- p.device()->Exec([batchsize, dim, t, p, loss](Context *ctx) {
- bool int_target = t.Size() == batchsize;
- ComputeCrossEntropy<DType, Lang>(int_target, batchsize, dim, p.block(),
- t.block(), loss->block(), ctx);
+ p.device()->Exec([batchsize, dim, t, p, loss](Context * ctx) {
+ bool int_target = t.Size() == batchsize;
+ ComputeCrossEntropy<DType, Lang>(int_target, batchsize, dim, p.block(),
+ t.block(), loss->block(), ctx);
}, {p.block(), t.block()}, {loss->block()});
});
}
@@ -1170,10 +1173,10 @@ void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p) {
if (p->nDim() == 2u) batchsize = p->shape(0);
size_t dim = p->Size() / batchsize;
TYPE_LANG_SWITCH(p->data_type(), DType, p->device()->lang(), Lang, {
- p->device()->Exec([batchsize, dim, t, p](Context *ctx) {
+ p->device()->Exec([batchsize, dim, t, p](Context * ctx) {
bool int_target = t.Size() == batchsize;
SoftmaxCrossEntropyBwd<DType, Lang>(int_target, batchsize, dim,
- p->block(), t.block(), p->block(), ctx);
+ p->block(), t.block(), p->block(), ctx);
}, {p->block(), t.block()}, {p->block()});
});
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2b75cb/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 1ca312a..bfdd026 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -32,13 +32,14 @@ namespace singa {
// ===================== Helper Functions =============================
-//generate a traversal_info vector based on the tensor's shape for the traverse_next function to work
+// generate a traversal_info vector based on the tensor's shape for the
+// traverse_next function to work
vector<int> generate_traversal_info(const Tensor& x) {
- vector<int> traversal_info = {};
- for(size_t n=0; n<(x.shape().size()+2); ++n) {
- traversal_info.push_back(0);
- }
- return traversal_info;
+ vector<int> traversal_info = {};
+ for (size_t n = 0; n < (x.shape().size() + 2); ++n) {
+ traversal_info.push_back(0);
+ }
+ return traversal_info;
};
//generate shape multipliers
@@ -47,18 +48,18 @@ vector<int> generate_traversal_info(const Tensor& x) {
//this means that the 3rd, 6th, and 9th index of the array will always be the starting element of their respective rows
//so we need to use the inner stride when jumping from 1st->2nd element, and the outer stride when jumping from 2nd->3rd
vector<int> generate_shape_multipliers(const Tensor& x) {
- Shape y_shape = x.shape();
- if(y_shape.size()==0){
- return {1};
- }
- vector<int> shape_multipliers = {1};
- int cumulative_product = 1;
+ Shape y_shape = x.shape();
+ if (y_shape.size() == 0) {
+ return {1};
+ }
+ vector<int> shape_multipliers = {1};
+ int cumulative_product = 1;
- for (size_t n=0; n<(y_shape.size()-1); ++n) {
- cumulative_product = cumulative_product*y_shape[y_shape.size()-1-n];
- shape_multipliers.insert(shape_multipliers.begin(), cumulative_product);
- }
- return shape_multipliers;
+ for (size_t n = 0; n < (y_shape.size() - 1); ++n) {
+ cumulative_product = cumulative_product * y_shape[y_shape.size() - 1 - n];
+ shape_multipliers.insert(shape_multipliers.begin(), cumulative_product);
+ }
+ return shape_multipliers;
};
// ******************************************************************************************
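A worked example of generate_shape_multipliers() (the reformatting above does not change its behaviour), assuming shape {3, 3}:

// shape {3, 3} -> one loop iteration, cumulative_product = 3,
// so shape_multipliers = {3, 1}.
// element index: 0 1 2 | 3 4 5 | 6 7 8
// counter % 3 == 0 at 3 and 6: these start new rows, so traverse_next
// must jump by the outer stride there instead of the inner one.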
@@ -71,20 +72,20 @@ vector<int> generate_shape_multipliers(const Tensor& x) {
//this additional check only has 1 loop for 2d matrix
//but runtime performance might degrade to O(nlog(n)) for higher dimensional tensors
int determine_order(vector<int>& shape_multipliers, int counter) {
- for (size_t n=0; n<(shape_multipliers.size()-1); ++n) {
- if((counter%shape_multipliers[n])==0){
- return ((shape_multipliers.size()) - 1 - n);
- }
+ for (size_t n = 0; n < (shape_multipliers.size() - 1); ++n) {
+ if ((counter % shape_multipliers[n]) == 0) {
+ return ((shape_multipliers.size()) - 1 - n);
}
- return 0;
+ }
+ return 0;
};
//this function updates the base indexes with the current index after every single traversal step,
//can be generalized beyond 2d cases
void update_base_index(const Tensor& x, vector<int>& traversal_info) {
- for (int n=0; n<(traversal_info[x.shape().size()+1]+1); ++n) {
- traversal_info[n] = traversal_info[x.shape().size()];
- }
+ for (int n = 0; n < (traversal_info[x.shape().size() + 1] + 1); ++n) {
+ traversal_info[n] = traversal_info[x.shape().size()];
+ }
};
//function to traverse a const strided tensor object
@@ -95,32 +96,32 @@ void update_base_index(const Tensor& x, vector<int>& traversal_info) {
//index 3 stores the order of the traversal for e.g. if the order is 0,
//it means the next element can be navigated to using the innermost stride
void traverse_next(const Tensor& x,
- vector<int>& shape_multipliers,
+ vector<int>& shape_multipliers,
vector<int>& traversal_info,
int counter) {
- update_base_index(x, traversal_info);
- traversal_info[x.shape().size()+1] = determine_order(shape_multipliers, counter);
- traversal_info[x.shape().size()] = traversal_info[traversal_info[x.shape().size()+1]] +
- x.strides()[x.strides().size()-traversal_info[x.shape().size()+1]-1];
+ update_base_index(x, traversal_info);
+ traversal_info[x.shape().size() + 1] = determine_order(shape_multipliers, counter);
+ traversal_info[x.shape().size()] = traversal_info[traversal_info[x.shape().size() + 1]] +
+ x.strides()[x.strides().size() - traversal_info[x.shape().size() + 1] - 1];
};
template <typename DType>
-void TraverseUnary(const Tensor &in, Tensor* out, std::function<DType(DType)> func){
+void TraverseUnary(const Tensor &in, Tensor* out, std::function<DType(DType)> func) {
DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
const DType *inPtr = static_cast<const DType *>(in.block()->data());
vector<int> traversal_info = generate_traversal_info(in);
vector<int> shape_multipliers = generate_shape_multipliers(in);
- for (size_t i = 0; i < in.Size(); i++) {
+ for (size_t i = 0; i < in.Size(); i++) {
outPtr[i] = func(inPtr[traversal_info[in.shape().size()]]);
- traverse_next(in, shape_multipliers, traversal_info, i+1);
+ traverse_next(in, shape_multipliers, traversal_info, i + 1);
}
}
template <typename DType>
-void TraverseBinary(const Tensor &in1, const Tensor &in2, Tensor* out,
- std::function<DType(DType, DType)> func){
+void TraverseBinary(const Tensor &in1, const Tensor &in2, Tensor* out,
+ std::function<DType(DType, DType)> func) {
DType *outPtr = static_cast<DType *>(out->block()->mutable_data());
const DType *in1Ptr = static_cast<const DType *>(in1.block()->data());
const DType *in2Ptr = static_cast<const DType *>(in2.block()->data());
@@ -132,8 +133,8 @@ void TraverseBinary(const Tensor &in1, const Tensor &in2, Tensor* out,
for (size_t i = 0; i < in1.Size(); i++) {
outPtr[i] = func(in1Ptr[traversal_info_in1[in1.shape().size()]],
in2Ptr[traversal_info_in2[in2.shape().size()]]);
- traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i+1);
- traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i+1);
+ traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i + 1);
+ traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i + 1);
}
}
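A usage sketch for TraverseUnary, assuming in may be an arbitrarily strided view and out is a preallocated tensor with the same Size(); reads follow the view through traversal_info while writes are dense:

// auto negate = [](float a) { return -a; };
// TraverseUnary<float>(in, out, negate);
// out's i-th storage slot receives negate() of the i-th element of the view
// in logical (row-major) order, regardless of in's strides.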
@@ -151,7 +152,7 @@ void Abs<float, lang::Cpp>(const Tensor& in, Tensor* out, Context *ctx) {
template <>
void Add<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out, Context *ctx) {
auto add_lambda = [&x](float a) {
- return (a+x);
+ return (a + x);
};
TraverseUnary<float>(in, out, add_lambda);
}
@@ -160,10 +161,10 @@ template <>
void Add<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out, Context *ctx) {
// CHECK_EQ(ctx->stream, nullptr);
auto add_lambda_binary = [](float a, float b) {
- return (a+b);
+ return (a + b);
};
TraverseBinary<float>(in1, in2, out, add_lambda_binary);
-
+
}
template <>
@@ -171,8 +172,8 @@ void Clamp<float, lang::Cpp>(const float low, const float high,
const Tensor& in, Tensor* out,
Context *ctx) {
auto clamp_lambda = [&low, &high](float a) {
- if(a < low){return low;}
- else if(a > high){return high;}
+ if (a < low) {return low;}
+ else if (a > high) {return high;}
else {return a;}
};
TraverseUnary<float>(in, out, clamp_lambda);
@@ -189,7 +190,7 @@ void Div<float, lang::Cpp>(const float x, const Tensor& in, Tensor* out,
for (size_t i = 0; i < in.Size(); i++) {
CHECK_NE(inPtr[traversal_info[in.shape().size()]], 0.f);
outPtr[i] = x / inPtr[traversal_info[in.shape().size()]];
- traverse_next(in, shape_multipliers, traversal_info, i+1);
+ traverse_next(in, shape_multipliers, traversal_info, i + 1);
}
}
@@ -207,8 +208,8 @@ void Div<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
for (size_t i = 0; i < in1.Size(); i++) {
CHECK_NE(in2Ptr[traversal_info_in2[in2.shape().size()]], 0.f);
outPtr[i] = in1Ptr[traversal_info_in1[in1.shape().size()]] / in2Ptr[traversal_info_in2[in2.shape().size()]];
- traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i+1);
- traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i+1);
+ traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i + 1);
+ traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i + 1);
}
}
@@ -216,16 +217,16 @@ template <>
void EltwiseMult<float, lang::Cpp>(const Tensor& in, const float x, Tensor* out,
Context *ctx) {
auto eltwisemult_lambda = [&x](float a) {
- return (a*x);
+ return (a * x);
};
TraverseUnary<float>(in, out, eltwisemult_lambda);
}
template <>
-void EltwiseMult<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
+void EltwiseMult<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
Context *ctx) {
auto eltwisemult_lambda_binary = [](float a, float b) {
- return (a*b);
+ return (a * b);
};
TraverseBinary<float>(in1, in2, out, eltwisemult_lambda_binary);
}
@@ -300,7 +301,7 @@ void Log<float, lang::Cpp>(const Tensor& in, Tensor* out,
for (size_t i = 0; i < in.Size(); i++) {
CHECK_GT(inPtr[traversal_info[in.shape().size()]], 0.f);
outPtr[i] = log(inPtr[traversal_info[in.shape().size()]]);
- traverse_next(in, shape_multipliers, traversal_info, i+1);
+ traverse_next(in, shape_multipliers, traversal_info, i + 1);
}
}
@@ -325,21 +326,21 @@ void LT<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
template <>
void Pow<float, lang::Cpp>(const Tensor& in, const float x, Tensor *out, Context *ctx) {
- TraverseUnary<float>(in, out, [x](float y) {return pow(y,x);});
+ TraverseUnary<float>(in, out, [x](float y) {return pow(y, x);});
}
template <>
void Pow<float, lang::Cpp>(const Tensor& in1, const Tensor& in2, Tensor* out,
Context *ctx) {
auto pow_lambda_binary = [](float a, float b) {
- return pow(a,b);
+ return pow(a, b);
};
TraverseBinary<float>(in1, in2, out, pow_lambda_binary);
}
template <>
void ReLU<float, lang::Cpp>(const Tensor& in, Tensor* out,
- Context *ctx) {
+ Context *ctx) {
auto relu_lambda = [](float a) {
return (a >= 0.f) ? a : 0.f;
};
@@ -355,14 +356,14 @@ void Set<float, lang::Cpp>(const float x, Tensor* out,
template <>
void Set<int, lang::Cpp>(const int x, Tensor* out,
- Context *ctx) {
+ Context *ctx) {
int *outPtr = static_cast<int *>(out->block()->mutable_data());
for (size_t i = 0; i < out->Size(); i++) outPtr[i] = x;
}
template <>
void Sigmoid<float, lang::Cpp>(const Tensor& in, Tensor* out,
- Context *ctx) {
+ Context *ctx) {
auto sigmoid_lambda = [](float a) {
return 1.f / (1.f + exp(-a));
};
@@ -371,7 +372,7 @@ void Sigmoid<float, lang::Cpp>(const Tensor& in, Tensor* out,
template <>
void Sign<float, lang::Cpp>(const Tensor& in, Tensor* out,
- Context *ctx) {
+ Context *ctx) {
auto sign_lambda = [](float a) {
return (a > 0) - (a < 0);
};
@@ -389,7 +390,7 @@ void Sqrt<float, lang::Cpp>(const Tensor& in, Tensor* out,
for (size_t i = 0; i < in.Size(); i++) {
CHECK_GE(inPtr[traversal_info[in.shape().size()]], 0.f);
outPtr[i] = sqrt(inPtr[traversal_info[in.shape().size()]]);
- traverse_next(in, shape_multipliers, traversal_info, i+1);
+ traverse_next(in, shape_multipliers, traversal_info, i + 1);
}
}
@@ -398,7 +399,7 @@ void Sub<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
Tensor* out, Context *ctx) {
// CHECK_EQ(ctx->stream, nullptr);
auto sub_lambda_binary = [](float a, float b) {
- return (a-b);
+ return (a - b);
};
TraverseBinary<float>(in1, in2, out, sub_lambda_binary);
}
@@ -418,7 +419,7 @@ void Sum<float, lang::Cpp>(const Tensor& in, float *out,
template <>
void Tanh<float, lang::Cpp>(const Tensor& in, Tensor* out,
- Context *ctx) {
+ Context *ctx) {
auto tanh_lambda = [](float a) {
return tanh(a);
};
@@ -475,7 +476,7 @@ void DGMM<float, lang::Cpp>(const bool side_right,
size_t offset = r * ncol;
for (size_t c = 0; c < ncol; c++) {
outPtr[traversal_info[M.shape().size()]] = MPtr[traversal_info[M.shape().size()]] * vPtr[c];
- traverse_next(M, shape_multipliers, traversal_info, offset+c+1);
+ traverse_next(M, shape_multipliers, traversal_info, offset + c + 1);
}
}
} else {
@@ -483,7 +484,7 @@ void DGMM<float, lang::Cpp>(const bool side_right,
size_t offset = r * ncol;
for (size_t c = 0; c < ncol; c++) {
outPtr[traversal_info[M.shape().size()]] = MPtr[traversal_info[M.shape().size()]] * vPtr[r];
- traverse_next(M, shape_multipliers, traversal_info, offset+c+1);
+ traverse_next(M, shape_multipliers, traversal_info, offset + c + 1);
}
}
}
@@ -509,7 +510,7 @@ template <>
void Axpy<float, lang::Cpp>(const float alpha,
const Tensor& in, Tensor *out, Context *ctx) {
//check input tensor for strides first
- if(in.strides() == out->strides()){
+ if (in.strides() == out->strides()) {
const float *inPtr = static_cast<const float *>(in.block()->data());
float *outPtr = static_cast<float *>(out->block()->mutable_data());
cblas_saxpy(in.Size(), alpha, inPtr, 1, outPtr, 1);
@@ -522,7 +523,7 @@ template <>
void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
float *out, Context *ctx) {
//check input tensor for strides first
- if(!(in1.transpose()) && !(in2.transpose())){
+ if (!(in1.transpose()) && !(in2.transpose())) {
const float *in1Ptr = static_cast<const float *>(in1.block()->data());
const float *in2Ptr = static_cast<const float *>(in2.block()->data());
*out = cblas_sdot(in1.Size(), in1Ptr, 1, in2Ptr, 1);
@@ -580,10 +581,10 @@ void GEMM<float, lang::Cpp>(const float alpha,
const float *BPtr = static_cast<const float *>(B.block()->data());
float *CPtr = static_cast<float *>(C->block()->mutable_data());
cblas_sgemm(CblasRowMajor, transa, transb, nrowA, ncolB, ncolA, alpha, APtr,
- lda, BPtr, ldb, beta, CPtr, ldc);
+ lda, BPtr, ldb, beta, CPtr, ldc);
}
-#else
+#else
template <>
void Amax<float, lang::Cpp>(const Tensor& in, size_t *out,
@@ -636,9 +637,9 @@ void Axpy<float, lang::Cpp>(const float alpha,
vector<int> traversal_info = generate_traversal_info(in);
vector<int> shape_multipliers = generate_shape_multipliers(in);
- for (size_t i = 0; i < in.Size(); i++) {
+ for (size_t i = 0; i < in.Size(); i++) {
outPtr[i] += alpha * inPtr[traversal_info[in.shape().size()]];
- traverse_next(in, shape_multipliers, traversal_info, i+1);
+ traverse_next(in, shape_multipliers, traversal_info, i + 1);
}
}
@@ -658,7 +659,7 @@ void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
// const float *in1Ptr = static_cast<const float *>(in1.data());
// const float *in2Ptr = static_cast<const float *>(in2.data());
// for (size_t i = 0; i < in.Size(); i++) {
- // sum += in1Ptr[i] * in2Ptr[i];
+ // sum += in1Ptr[i] * in2Ptr[i];
// }
float *outPtr = static_cast<float *>(out->block()->mutable_data());
const float *in1Ptr = static_cast<const float *>(in1.block()->data());
@@ -670,8 +671,8 @@ void Dot<float, lang::Cpp>(const Tensor& in1, const Tensor& in2,
for (size_t i = 0; i < in1.Size(); i++) {
sum += in1Ptr[traversal_info_in1[in1.shape().size()]] * in2Ptr[traversal_info_in2[in2.shape().size()]];
- traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i+1);
- traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i+1);
+ traverse_next(in1, shape_multipliers_in1, traversal_info_in1, i + 1);
+ traverse_next(in2, shape_multipliers_in2, traversal_info_in2, i + 1);
}
}
@@ -697,10 +698,10 @@ void GEMV<float, lang::Cpp>(const float alpha, const Tensor& A, const Tensor& v,
#endif // USE_CBLAS
template <>
void ComputeCrossEntropy<float, lang::Cpp>(bool int_target,
- const size_t batchsize,
- const size_t dim, const Block *p,
- const Block *t, Block *loss,
- Context *ctx) {
+ const size_t batchsize,
+ const size_t dim, const Block *p,
+ const Block *t, Block *loss,
+ Context *ctx) {
const float *pPtr = static_cast<const float *>(p->data());
const int *tPtr = static_cast<const int *>(t->data());
float *lossPtr = static_cast<float *>(loss->mutable_data());
@@ -712,7 +713,7 @@ void ComputeCrossEntropy<float, lang::Cpp>(bool int_target,
lossPtr[i] = -std::log((std::max)(prob_of_truth, FLT_MIN));
}
} else {
- for (size_t i = 0;i < batchsize; i++) {
+ for (size_t i = 0; i < batchsize; i++) {
float sum = 0.f;
for (size_t j = 0; j < dim; j++) {
sum += tPtr[i * dim + j];
@@ -728,10 +729,10 @@ void ComputeCrossEntropy<float, lang::Cpp>(bool int_target,
template <>
void SoftmaxCrossEntropyBwd<float, lang::Cpp>(bool int_target,
- const size_t batchsize,
- const size_t dim, const Block *p,
- const Block *t, Block *grad,
- Context *ctx) {
+ const size_t batchsize,
+ const size_t dim, const Block *p,
+ const Block *t, Block *grad,
+ Context *ctx) {
CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
// const float* pPtr = static_cast<const float*>(p->data());
const int *tPtr = static_cast<const int *>(t->data());
@@ -764,13 +765,13 @@ void RowMax<float, lang::Cpp>(const Tensor& in, Tensor *out, Context *ctx) {
const size_t ncol = in.shape()[1];
vector<int> traversal_info = generate_traversal_info(in);
vector<int> shape_multipliers = generate_shape_multipliers(in);
-
+
for (size_t r = 0; r < nrow; r++) {
int counter_offset = (r * ncol);
float maxval = 0;
- for (size_t c = 0; c < ncol; c++){
+ for (size_t c = 0; c < ncol; c++) {
maxval = (std::max)(maxval, inPtr[traversal_info[in.shape().size()]]);
- traverse_next(in, shape_multipliers, traversal_info, counter_offset+c+1);
+ traverse_next(in, shape_multipliers, traversal_info, counter_offset + c + 1);
}
outPtr[r] = maxval;
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/3e2b75cb/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 6e86ca7..55d6a1b 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -34,45 +34,45 @@ namespace singa {
// ===================== Helper Functions =============================
- /*
- cudnn requires tensor dimensions to fulfill 1 requirement:
- 1.) Dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors
- if input tensor is 5d, cudnn will take a 5d tensor as input. Beyond 5d, certain operations are not supported.
- (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
-
- for e.g. Tensor A has shape {3,3}, cudnn requires shape of {1,1,3,3} to be the input
- Tensor B has shape (2,3,4), cudnn requires shape of {1,2,3,4} to be the input
- */
- vector<int> generate_shape_cuda(const Tensor& x) {
- Shape shape_ = x.shape();
- vector<int> shape_arr;
- if(shape_.size() <= 4){
- for (size_t n=0; n<4-shape_.size(); ++n) {
- shape_arr.push_back(1);
- }
- for (size_t n=0; n<shape_.size(); ++n) {
- shape_arr.push_back(shape_.at(n));
- }
- return shape_arr;
- } else if(shape_.size() == 5){
- for (size_t n=0; n<shape_.size(); ++n) {
- shape_arr.push_back(shape_.at(n));
- }
- return shape_arr;
- } else {
- LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
+/*
+cudnn requires tensor dimensions to fulfill 1 requirement:
+ 1.) Dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors
+ if input tensor is 5d, cudnn will take a 5d tensor as input. Beyond 5d, certain operations are not supported.
+ (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
+
+ for e.g. Tensor A has shape {3,3}, cudnn requires shape of {1,1,3,3} to be the input
+ Tensor B has shape (2,3,4), cudnn requires shape of {1,2,3,4} to be the input
+*/
+vector<int> generate_shape_cuda(const Tensor& x) {
+ Shape shape_ = x.shape();
+ vector<int> shape_arr;
+ if (shape_.size() <= 4) {
+ for (size_t n = 0; n < 4 - shape_.size(); ++n) {
+ shape_arr.push_back(1);
+ }
+ for (size_t n = 0; n < shape_.size(); ++n) {
+ shape_arr.push_back(shape_.at(n));
}
+ return shape_arr;
+ } else if (shape_.size() == 5) {
+ for (size_t n = 0; n < shape_.size(); ++n) {
+ shape_arr.push_back(shape_.at(n));
+ }
+ return shape_arr;
+ } else {
+ LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
}
+}
- int generate_dim_cuda(const Tensor& x) {
- if(x.shape().size() <= 4){return 4;}
- else if(x.shape().size() == 5){return 5;}
- else{
- LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
- }
+int generate_dim_cuda(const Tensor& x) {
+ if (x.shape().size() <= 4) {return 4;}
+ else if (x.shape().size() == 5) {return 5;}
+ else {
+ LOG(FATAL) << "Dimensions (shape) beyond 5 are currently not supported" ;
}
+}
-/*
+/*
cudnn requires stride dimensions to conform to the format of the shape input as well
1.) Stride dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors
If input tensor is 5d, cudnn will take a 5d tensor as input. Beyond 5d, certain operations are not supported.
@@ -81,51 +81,51 @@ namespace singa {
for e.g. Tensor A has shape {3,3}, stride {3,1}, cudnn requires shape {1,1,3,3}
and stride {9, 9, 3, 1} or {9, 9, 1, 3} to be the inputs
*/
- vector<int> generate_strides_cuda(const Tensor& x) {
- Shape shape_ = x.shape();
- vector<int> strides_ = x.strides();
- vector<int> strides_arr;
- int product = 1;
- for (size_t n=0; n<(shape_.size()); ++n) {
- product *= shape_[n];
+vector<int> generate_strides_cuda(const Tensor& x) {
+ Shape shape_ = x.shape();
+ vector<int> strides_ = x.strides();
+ vector<int> strides_arr;
+ int product = 1;
+ for (size_t n = 0; n < (shape_.size()); ++n) {
+ product *= shape_[n];
+ }
+ if (shape_.size() <= 4) {
+ for (size_t n = 0; n < 4 - shape_.size(); ++n) {
+ strides_arr.push_back(product);
+ }
+ for (size_t n = 0; n < strides_.size(); ++n) {
+ strides_arr.push_back(strides_[n]);
}
- if(shape_.size() <= 4){
- for (size_t n=0; n<4-shape_.size(); ++n) {
- strides_arr.push_back(product);
- }
- for (size_t n=0; n<strides_.size(); ++n) {
- strides_arr.push_back(strides_[n]);
- }
- return strides_arr;
- } else if(shape_.size() == 5){
- for (size_t n=0; n<strides_.size(); ++n) {
- strides_arr.push_back(strides_[n]);
- }
- return strides_arr;
- } else {
- LOG(FATAL) << "Dimensions (strides) beyond 5 are currently not supported" ;
+ return strides_arr;
+ } else if (shape_.size() == 5) {
+ for (size_t n = 0; n < strides_.size(); ++n) {
+ strides_arr.push_back(strides_[n]);
}
+ return strides_arr;
+ } else {
+ LOG(FATAL) << "Dimensions (strides) beyond 5 are currently not supported" ;
}
+}
-cudnnTensorDescriptor_t generate_tensorND_desc(const Tensor& x){
+cudnnTensorDescriptor_t generate_tensorND_desc(const Tensor& x) {
cudnnTensorDescriptor_t x_desc;
cudnnCreateTensorDescriptor(&x_desc);
cudnnSetTensorNdDescriptor(x_desc, CUDNN_DATA_FLOAT,
generate_dim_cuda(x),
generate_shape_cuda(x).data(),
generate_strides_cuda(x).data()
- );
+ );
return x_desc;
}
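
// Editorial sketch, not part of this patch: generate_tensorND_desc returns
// a raw descriptor, and call sites that pass it inline, e.g.
// generate_tensorND_desc(*out), never destroy it. A small RAII guard would
// release descriptors automatically; the name TensorDescGuard is
// illustrative only.
class TensorDescGuard {
 public:
  explicit TensorDescGuard(cudnnTensorDescriptor_t d) : desc_(d) {}
  ~TensorDescGuard() { cudnnDestroyTensorDescriptor(desc_); }
  cudnnTensorDescriptor_t get() const { return desc_; }
 private:
  cudnnTensorDescriptor_t desc_;
};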
-cudnnOpTensorDescriptor_t generate_Op_desc(cudnnOpTensorOp_t op){
+cudnnOpTensorDescriptor_t generate_Op_desc(cudnnOpTensorOp_t op) {
cudnnOpTensorDescriptor_t op_desc;
cudnnCreateOpTensorDescriptor(&op_desc);
cudnnSetOpTensorDescriptor(op_desc, op,
CUDNN_DATA_FLOAT,
CUDNN_PROPAGATE_NAN
- );
+ );
return op_desc;
}
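
To make the padding rules above concrete, a self-contained sketch over plain shape and stride vectors (no singa::Tensor needed; all names illustrative):

#include <cstdio>
#include <vector>

// Pad a <=4-d shape with leading 1s, and its strides with leading
// `product` entries, mirroring generate_shape_cuda/generate_strides_cuda.
int main() {
  std::vector<int> shape = {3, 3}, strides = {3, 1};
  int product = 1;
  for (int d : shape) product *= d;            // 9
  std::vector<int> shape4(4 - shape.size(), 1);
  shape4.insert(shape4.end(), shape.begin(), shape.end());
  std::vector<int> strides4(4 - strides.size(), product);
  strides4.insert(strides4.end(), strides.begin(), strides.end());
  for (int d : shape4) printf("%d ", d);       // 1 1 3 3
  printf("/ ");
  for (int s : strides4) printf("%d ", s);     // 9 9 3 1
  printf("\n");
}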
@@ -144,10 +144,10 @@ void Abs<float, lang::Cuda>(const Tensor& in, Tensor* out,
float beta = 0.0;
cudnnTensorDescriptor_t in_desc = generate_tensorND_desc(in);
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_MAX),
- (void*)(&alpha1), in_desc, inPtr,
+ (void*)(&alpha1), in_desc, inPtr,
(void*)(&alpha2), in_desc, inPtr,
(void*)(&beta), generate_tensorND_desc(*out), outPtr
- );
+ );
cudnnDestroyTensorDescriptor(in_desc);
}
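
The MAX trick above works because, with alpha1 = 1 and alpha2 = -1 applied to the same input, cudnnOpTensor computes max(x, -x), which equals |x|. A one-line CPU check (illustrative only):

#include <algorithm>
#include <cassert>
#include <cmath>

int main() {
  for (float x : {-2.5f, 0.0f, 3.0f})
    assert(std::max(1.0f * x, -1.0f * x) == std::fabs(x));
}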
@@ -156,8 +156,8 @@ void Set<float, lang::Cuda>(const float x, Tensor* out,
Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- cudnnSetTensor(ctx->cudnn_handle, generate_tensorND_desc(*out),
- outPtr, (void*)(&x));
+ cudnnSetTensor(ctx->cudnn_handle, generate_tensorND_desc(*out),
+ outPtr, (void*)(&x));
}
template <>
@@ -171,7 +171,7 @@ void Add<float, lang::Cuda>(const Tensor& in, const float x,
cudnnAddTensor(ctx->cudnn_handle,
(void*)(&alpha), generate_tensorND_desc(in), inPtr,
(void*)(&beta), generate_tensorND_desc(*out), outPtr
- );
+ );
}
/// out = in1 + in2
@@ -186,18 +186,18 @@ void Add<float, lang::Cuda>(const Tensor& in1,
float alpha2 = 1.0;
float beta = 0.0;
- if((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)){
+ if ((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)) {
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
- (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
- (void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
- (void*)(&beta), generate_tensorND_desc(*out), outPtr
- );
+ (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
+ (void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
+ );
} else {
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
- (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
- (void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
- (void*)(&beta), generate_tensorND_desc(*out), outPtr
- );
+ (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
+ (void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
+ );
}
}
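
For readers tracking the scaling factors: cudnnOpTensor with CUDNN_OP_TENSOR_ADD computes C = alpha1*A + alpha2*B + beta*C, which is why Sub below reuses the ADD op with alpha2 = -1. A CPU reference for same-shaped contiguous tensors (illustrative only):

#include <cstddef>

void op_tensor_add_ref(const float* A, const float* B, float* C, size_t n,
                       float alpha1, float alpha2, float beta) {
  for (size_t i = 0; i < n; i++)
    C[i] = alpha1 * A[i] + alpha2 * B[i] + beta * C[i];
}

Note also that when the ranks differ, the else branch describes inPtr2 with in1's descriptor, so in2's buffer is read with in1's geometry.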
@@ -213,18 +213,18 @@ void Sub<float, lang::Cuda>(const Tensor& in1,
float alpha2 = -1.0;
float beta = 0.0;
- if((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)){
+ if ((in1.nDim() == in2.nDim()) || (in2.nDim() == 1)) {
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
- (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
- (void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
- (void*)(&beta), generate_tensorND_desc(*out), outPtr
- );
+ (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
+ (void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
+ );
} else {
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
- (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
- (void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
- (void*)(&beta), generate_tensorND_desc(*out), outPtr
- );
+ (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
+ (void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
+ );
}
}
@@ -250,17 +250,17 @@ void Div<float, lang::Cuda>(const Tensor& in1,
const size_t num = in1.Size();
//if both in1 and in2 strides are the same, we proceed to normal cuda::div
- if(in1.strides() == in2.strides()){
- cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
- out->set_strides(in1.strides());
+ if (in1.strides() == in2.strides()) {
+ cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
+ out->set_strides(in1.strides());
} else { //else we transform in1 to out to store first
float alpha = 1.0;
float beta = 0.0;
out->set_strides(in2.strides());
cudnnTransformTensor(ctx->cudnn_handle,
- (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
- (void*)(&beta), generate_tensorND_desc(*out), outPtr
+ (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
cuda::div(num, outPtr, inPtr2, outPtr, ctx->stream);
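
The same fallback pattern (materialize in1 into out's layout, then run the plain element-wise kernel) recurs below in EltwiseMult and Pow; a CPU analogue using row-major versus column-major layouts (names illustrative only):

#include <cstddef>

// in1 is row-major, in2/out are column-major: first transform in1 into
// out's layout (a stand-in for cudnnTransformTensor), then divide.
void div_with_layout_fixup(const float* in1_rm, const float* in2_cm,
                           float* out_cm, size_t rows, size_t cols) {
  for (size_t r = 0; r < rows; r++)
    for (size_t c = 0; c < cols; c++)
      out_cm[c * rows + r] = in1_rm[r * cols + c];
  for (size_t i = 0; i < rows * cols; i++)
    out_cm[i] /= in2_cm[i];                    // as cuda::div does on the GPU
}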
@@ -286,8 +286,8 @@ void EltwiseMult<float, lang::Cuda>(const Tensor& in,
float alpha = x, beta = 0.0;
cudnnAddTensor(ctx->cudnn_handle,
- (void*)(&alpha), generate_tensorND_desc(in), inPtr,
- (void*)(&beta), generate_tensorND_desc(*out), outPtr
+ (void*)(&alpha), generate_tensorND_desc(in), inPtr,
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
}
@@ -302,17 +302,17 @@ void EltwiseMult<float, lang::Cuda>(const Tensor& in1,
const size_t num = in1.Size();
//if both in1 and in2 strides are the same, we proceed to normal cuda::mult
- if(in1.strides() == in2.strides()){
- cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
- out->set_strides(in1.strides());
+ if (in1.strides() == in2.strides()) {
+ cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
+ out->set_strides(in1.strides());
} else { //else we transform in1 to out to store first
float alpha = 1.0;
float beta = 0.0;
out->set_strides(in2.strides());
cudnnTransformTensor(ctx->cudnn_handle,
- (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
- (void*)(&beta), generate_tensorND_desc(*out), outPtr
+ (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
cuda::mult(num, outPtr, inPtr2, outPtr, ctx->stream);
@@ -443,17 +443,17 @@ void Pow<float, lang::Cuda>(const Tensor& in1,
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in1.Size();
- if(in1.strides() == in2.strides()){
- cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
- out->set_strides(in1.strides());
+ if (in1.strides() == in2.strides()) {
+ cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
+ out->set_strides(in1.strides());
} else { //else we transform in1 to out to store first
float alpha = 1.0;
float beta = 0.0;
out->set_strides(in2.strides());
cudnnTransformTensor(ctx->cudnn_handle,
- (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
- (void*)(&beta), generate_tensorND_desc(*out), outPtr
+ (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
+ (void*)(&beta), generate_tensorND_desc(*out), outPtr
);
cuda::pow(num, outPtr, inPtr2, outPtr, ctx->stream);
@@ -473,18 +473,18 @@ void Pow<float, lang::Cuda>(const Tensor& in1,
// double coef = 0.0; //only used for CLIPPED_RELU or ELU
// cudnnCreateActivationDescriptor(&act_desc);
// cudnnSetActivationDescriptor(act_desc, mode, cudnn_propagation, coef);
-
+
// float alpha[1] = {1.0};
// float beta[1] = {0.0};
// cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
// cudnnTensorDescriptor_t in_desc, out_desc;
// cudnnCreateTensorDescriptor(&in_desc);
// cudnnCreateTensorDescriptor(&out_desc);
-// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
+// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
-// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
+// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
+// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
// (void*)(&beta), out_desc, outPtr);
// cudnnDestroyTensorDescriptor(in_desc);
@@ -515,18 +515,18 @@ void ReLU<float, lang::Cuda>(const Tensor& in, Tensor* out,
// double coef = 0.0; //only used for CLIPPED_RELU or ELU
// cudnnCreateActivationDescriptor(&act_desc);
// cudnnSetActivationDescriptor(act_desc, mode, cudnn_propagation, coef);
-
+
// float alpha[1] = {1.0};
// float beta[1] = {0.0};
// cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
// cudnnTensorDescriptor_t in_desc, out_desc;
// cudnnCreateTensorDescriptor(&in_desc);
// cudnnCreateTensorDescriptor(&out_desc);
-// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
+// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
-// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
+// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
+// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
// (void*)(&beta), out_desc, outPtr);
// cudnnDestroyTensorDescriptor(in_desc);
@@ -562,16 +562,16 @@ void Sqrt<float, lang::Cuda>(const Tensor& in, Tensor* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
-
+
float alpha1 = 1.0;
float alpha2 = 0.0;
float beta = 0.0;
cudnnTensorDescriptor_t in_desc = generate_tensorND_desc(in);
cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_SQRT),
- (void*)(&alpha1), in_desc, inPtr,
+ (void*)(&alpha1), in_desc, inPtr,
(void*)(&alpha2), in_desc, inPtr,
(void*)(&beta), generate_tensorND_desc(*out), outPtr
- );
+ );
}
/// Element-wise operation, out[i]=in[i]^2
@@ -598,15 +598,15 @@ void Sum<float, lang::Cuda>(const Tensor& in, float* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
- //reduce all axes to 1 for cudnnReduce, e.g. Tensor A with shape (2,4) will be reduced to (1)
- Shape reduced_shape = {1};
- Tensor t(reduced_shape, in.device(), in.data_type());
- float* tPtr = static_cast<float*>(t.block()->mutable_data());
- vector<int> reduce_all_axes = generate_shape_cuda(in);
- for (size_t n=0; n<reduce_all_axes.size(); ++n) {
+ //reduce all axes to 1 for cudnnReduce, e.g. Tensor A with shape (2,4) will be reduced to (1)
+ Shape reduced_shape = {1};
+ Tensor t(reduced_shape, in.device(), in.data_type());
+ float* tPtr = static_cast<float*>(t.block()->mutable_data());
+ vector<int> reduce_all_axes = generate_shape_cuda(in);
+ for (size_t n = 0; n < reduce_all_axes.size(); ++n) {
reduce_all_axes[n] = 1;
- }
-
+ }
+
//reduce_desc
cudnnReduceTensorDescriptor_t reduce_desc;
cudnnReduceTensorOp_t reduce_op = CUDNN_REDUCE_TENSOR_ADD;
@@ -620,11 +620,11 @@ void Sum<float, lang::Cuda>(const Tensor& in, float* out,
//instantiate 2 new tensors to use new blocks as memory instead of cudaMalloc
size_t reduction_size_int = Product(in.shape());
- Shape reduction_size = {reduction_size_int*100};
+ Shape reduction_size = {reduction_size_int * 100};
Tensor indices(reduction_size, in.device(), in.data_type());
Tensor workspace(reduction_size, in.device(), in.data_type());
- size_t indices_bytes = indices.block()->size()*100;
- size_t workspace_bytes = workspace.block()->size()*100;
+ size_t indices_bytes = indices.block()->size() * 100;
+ size_t workspace_bytes = workspace.block()->size() * 100;
size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
float* workspacePtr = static_cast<float*>(workspace.block()->mutable_data());
//void* indicesPtr{nullptr}; void* workspacePtr{nullptr};
@@ -636,7 +636,7 @@ void Sum<float, lang::Cuda>(const Tensor& in, float* out,
indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
(void*)(&alpha), generate_tensorND_desc(in), inPtr,
(void*)(&beta), generate_tensorND_desc(t), tPtr
- );
+ );
*out = tPtr[0];
}
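
The indices and workspace buffers above are over-allocated by a factor of 100. cuDNN can report the exact sizes instead; a hedged sketch, assuming reduce_desc and the two tensor descriptors are built exactly as in the surrounding code:

#include <cudnn.h>

// Query exact scratch sizes for a reduction (editorial suggestion; the
// patch above over-allocates instead).
void query_reduction_sizes(cudnnHandle_t handle,
                           cudnnReduceTensorDescriptor_t reduce_desc,
                           cudnnTensorDescriptor_t in_desc,
                           cudnnTensorDescriptor_t out_desc,
                           size_t* workspace_bytes, size_t* indices_bytes) {
  cudnnGetReductionWorkspaceSize(handle, reduce_desc, in_desc, out_desc,
                                 workspace_bytes);
  cudnnGetReductionIndicesSize(handle, reduce_desc, in_desc, out_desc,
                               indices_bytes);
}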
@@ -655,18 +655,18 @@ void Sum<float, lang::Cuda>(const Tensor& in, float* out,
// double coef = 0.0; //only used for CLIPPED_RELU or ELU
// cudnnCreateActivationDescriptor(&act_desc);
// cudnnSetActivationDescriptor(act_desc, mode, cudnn_propagation, coef);
-
+
// float alpha[1] = {1.0};
// float beta[1] = {0.0};
// cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
// cudnnTensorDescriptor_t in_desc, out_desc;
// cudnnCreateTensorDescriptor(&in_desc);
// cudnnCreateTensorDescriptor(&out_desc);
-// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
+// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
-// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
+// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
+// cudnnActivationForward(ctx->cudnn_handle, act_desc, (void*)(&alpha), in_desc, inPtr,
// (void*)(&beta), out_desc, outPtr);
// cudnnDestroyTensorDescriptor(in_desc);
@@ -676,7 +676,7 @@ void Sum<float, lang::Cuda>(const Tensor& in, float* out,
template <>
void Tanh<float, lang::Cuda>(const Tensor& in, Tensor* out,
- Context* ctx) {
+ Context* ctx) {
const float* inPtr = static_cast<const float*>(in.block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in.Size();
@@ -856,22 +856,22 @@ void GEMM<float, lang::Cuda>(const float alpha,
template <>
void ComputeCrossEntropy<float, lang::Cuda>(bool int_target,
- const size_t batchsize,
- const size_t dim, const Block* p,
- const Block* t, Block* loss,
- Context* ctx) {
+ const size_t batchsize,
+ const size_t dim, const Block* p,
+ const Block* t, Block* loss,
+ Context* ctx) {
const float* pPtr = static_cast<const float*>(p->data());
const int* tPtr = static_cast<const int*>(t->data());
float* lossPtr = static_cast<float*>(loss->mutable_data());
cuda::ComputeCrossEntropy(int_target, batchsize, dim, pPtr, tPtr, lossPtr,
- ctx->stream);
+ ctx->stream);
}
template <>
void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
- const size_t batchsize,
- const size_t dim, const Block* p,
- const Block* t, Block* grad,
- Context* ctx) {
+ const size_t batchsize,
+ const size_t dim, const Block* p,
+ const Block* t, Block* grad,
+ Context* ctx) {
CHECK_EQ(p, grad) << "Use the same pointer to optimize performance";
const float* pPtr = static_cast<const float*>(p->data());
const int* tPtr = static_cast<const int*>(t->data());
@@ -924,11 +924,11 @@ void SoftmaxCrossEntropyBwd<float, lang::Cuda>(bool int_target,
// cudnnTensorDescriptor_t in_desc, out_desc;
// cudnnCreateTensorDescriptor(&in_desc);
// cudnnCreateTensorDescriptor(&out_desc);
-// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
+// cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in.generate_dim_cuda(),
// in.generate_shape_cuda().data(), in.generate_strides_cuda().data());
-// //cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
+// //cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
// out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
-// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
+// cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(),
// reduce_row_axes_shape.data(), reduced_strides.data());
// cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
// indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
@@ -946,7 +946,7 @@ void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out,
const size_t nrow = in.shape()[0];
const size_t ncol = in.shape()[1];
- if(in.transpose()){
+ if (in.transpose()) {
Tensor t(in.shape(), in.device(), in.data_type());
float* tPtr = static_cast<float*>(t.block()->mutable_data());
@@ -954,8 +954,8 @@ void RowMax<float, lang::Cuda>(const Tensor& in, Tensor* out,
float beta = 0.0;
cudnnTransformTensor(ctx->cudnn_handle,
- (void*)(&alpha), generate_tensorND_desc(in), inPtr,
- (void*)(&beta), generate_tensorND_desc(t), tPtr
+ (void*)(&alpha), generate_tensorND_desc(in), inPtr,
+ (void*)(&beta), generate_tensorND_desc(t), tPtr
);
const float* tPtr_const = static_cast<const float*>(t.block()->data());
[05/10] incubator-singa git commit: Merge branch 'master' of
github.com:apache/incubator-singa into SINGA-341-351
Posted by wa...@apache.org.
Merge branch 'master' of github.com:apache/incubator-singa into SINGA-341-351
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/a44d2e76
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/a44d2e76
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/a44d2e76
Branch: refs/heads/master
Commit: a44d2e76b70a54e6ad1e063c0b8e895c43018b63
Parents: 75f9a0e b5600d3
Author: Vaan Ng <cm...@gmail.com>
Authored: Fri May 11 15:29:56 2018 +0800
Committer: Vaan Ng <cm...@gmail.com>
Committed: Fri May 11 15:29:56 2018 +0800
----------------------------------------------------------------------
.gitignore | 1 +
.travis.yml | 3 +-
CMakeLists.txt | 14 +-
cmake/Dependencies.cmake | 14 +-
doc/_static/style.css | 3 +
doc/_templates/layout.html | 2 +-
doc/conf.py | 8 +-
doc/en/community/team-list.rst | 2 +-
doc/en/docs/install_macos1013.rst | 18 ++
doc/en/docs/install_win.rst | 178 +++++++++++++++++++
doc/en/docs/installation.md | 36 +---
doc/zh/community/issue-tracking.md | 9 +
doc/zh/community/mail-lists.rst | 28 +++
doc/zh/community/source-repository.md | 22 +++
doc/zh/community/team-list.rst | 84 +++++++++
doc/zh/develop/contribute-code.md | 48 +++++
doc/zh/develop/how-contribute.md | 9 +
doc/zh/develop/schedule.rst | 66 +++++++
doc/zh/docs.rst | 23 +++
doc/zh/downloads.md | 109 ++++++++++++
doc/zh/index.rst | 42 ++++-
python/CMakeLists.txt | 10 +-
python/setup.py.in | 2 +-
python/singa/layer.py | 6 +-
python/singa/net.py | 24 +--
python/singa/tensor.py | 135 ++++++++++++++
src/api/model_layer.i | 21 +++
src/core/device/opencl_func.h | 6 +-
src/model/layer/cudnn_activation.cc | 13 --
src/model/layer/cudnn_convolution.cc | 13 +-
src/model/layer/cudnn_pooling.cc | 8 -
src/model/layer/cudnn_rnn.cc | 4 +-
test/python/run.py | 11 +-
test/python/test_tensor.py | 16 ++
tool/conda/README.md | 33 ++++
tool/conda/build.sh | 29 +--
tool/conda/meta.yaml | 9 +-
tool/docker/README.md | 33 ++--
tool/docker/build.sh | 19 +-
tool/docker/devel/Dockerfile | 36 ----
tool/docker/devel/conda/cuda/Dockerfile | 52 ++++++
tool/docker/devel/cuda/Dockerfile | 39 ----
tool/docker/devel/native/centos6/Dockerfile | 48 +++++
.../devel/native/ubuntu/cuda/py2/Dockerfile | 53 ++++++
.../devel/native/ubuntu/cuda/py3/Dockerfile | 54 ++++++
tool/docker/runtime/Dockerfile | 31 +++-
tool/docker/runtime/cuda/Dockerfile | 31 ----
tool/jenkins/README.md | 97 +++++-----
tool/jenkins/docker/devel/centos6/Dockerfile | 64 -------
tool/jenkins/docker/devel/ubuntu/Dockerfile | 70 --------
tool/jenkins/docker/runtime/Dockerfile | 51 ------
tool/jenkins/gen_doc.sh | 39 ++++
tool/jenkins/jenkins_doc.sh | 37 ----
tool/jenkins/jenkins_test.sh | 57 ------
tool/jenkins/test.sh | 63 +++++++
tool/opencl/clsrc_to_str.py | 4 +-
tool/travis/build.sh | 40 +++--
tool/travis/conda.sh | 38 ----
tool/travis/depends.sh | 41 ++---
59 files changed, 1369 insertions(+), 687 deletions(-)
----------------------------------------------------------------------
[04/10] incubator-singa git commit: misc. changes and further
abstraction of some cudnn codes
Posted by wa...@apache.org.
misc. changes and further abstraction of some cudnn codes
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/75f9a0e3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/75f9a0e3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/75f9a0e3
Branch: refs/heads/master
Commit: 75f9a0e39520fe86f6e774f5295d65830bd274ab
Parents: 26101ee
Author: Vaan Ng <cm...@gmail.com>
Authored: Thu May 10 18:34:44 2018 +0800
Committer: Vaan Ng <cm...@gmail.com>
Committed: Thu May 10 18:34:44 2018 +0800
----------------------------------------------------------------------
include/singa/core/tensor.h | 21 +--
src/core/tensor/tensor.cc | 12 +-
src/core/tensor/tensor_math_cpp.h | 31 ++--
src/core/tensor/tensor_math_cuda.h | 309 +++++++++++++-------------------
4 files changed, 152 insertions(+), 221 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/75f9a0e3/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 2c28e0f..b94a982 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -105,12 +105,13 @@ class Tensor {
}
/*
- cudnn requires tensor dimensions to fulfill 2 requirements:
- 1.) dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
- 2.) dimensions have to be set to multiples of 8
+ cudnn requires tensor dimensions to fulfill one requirement:
+ 1.) dimensions must be padded to a minimum of 4 for 4d and lower-dimensional tensors.
+ If the input tensor is 5d, cudnn takes the 5d tensor as input; beyond 5d, certain operations are not supported
+ (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
- for e.g. Tensor A has shape {3,3}, cudnn requires shape of {1,1,24,24} to be the input
- Tensor B has shape (2,3,4), cudnn requires shape of {1,16,24,32} to be the input
+ e.g. Tensor A with shape {3,3} must be given to cudnn as shape {1,1,3,3};
+ Tensor B with shape (2,3,4) must be given as shape {1,2,3,4}
*/
vector<int> generate_shape_cuda() const {
vector<int> shape_arr;
@@ -151,11 +152,11 @@ class Tensor {
/*
cudnn requires stride dimensions to conform to the format of the shape input as well
- 1.) stride dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
- 2.) stride dimensions have to be set to powers of 8, depending on the stride order (outer stride = higher power)
+ 1.) Stride dimensions to be set to a minimum of 4 for 4d and lower dimensional tensors
+ If input tensor is 5d, cudnn will take a 5d tensor as input. Beyond 5d, certain operations are not supported.
+ (cudnnOp supports up to 5d, cudnnReduce supports up to 8d)
- for e.g. Tensor A has shape {3,3}, stride {3,1}, cudnn requires shape {1,1,24,24} and stride {576, 576, 24, 1} to be the inputs,
- if A is transposed with stride {1,3}, then the new cudnn stride becomes {576, 576, 8, 3}
+ for e.g. Tensor A has shape {3,3}, stride {3,1}, cudnn requires shape {1,1,3,3} and stride {9, 9, 3, 1} or {9, 9, 1, 3} to be the inputs
*/
vector<int> generate_strides_cuda() const {
vector<int> strides_arr;
@@ -177,7 +178,7 @@ class Tensor {
}
return strides_arr;
} else {
- LOG(FATAL) << "Dimensions (strides) beyond 3 are currently not supported" ;
+ LOG(FATAL) << "Dimensions (strides) beyond 5 are currently not supported" ;
}
}
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/75f9a0e3/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 48751ef..9067242 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -132,10 +132,8 @@ void Tensor::ResetLike(const Tensor &in) {
shape_multipliers_ = in.shape_multipliers_;
}
-//yisen todo
//if the tensor is not transposed yet (i.e. it still has default strides), we simply change the shape and generate new default strides
//if the tensor is already transposed (i.e. non-default strides), it should be copied to a new tensor with newly generated default strides
-
void Tensor::Reshape(const Shape &shape) {
if(strides_.size()==0)
strides_.push_back(1);
@@ -144,9 +142,8 @@ void Tensor::Reshape(const Shape &shape) {
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
- } else if (strides_[0] != 1) {
- std::cout << "Reshape Error: Tranposed tensor must return new tensor. Not implemented yet." << std::endl;
- return void();
+ } else if (transpose()) {
+ LOG(FATAL) << "Reshape Error: Reshape called on tranposed tensor. Not implemented yet." ;
}
shape_ = shape;
Generate_Strides();
@@ -161,9 +158,8 @@ void Tensor::Reshape(Shape &&shape) {
if (block_ != nullptr && block_->DecRefCount() == 0)
device_->FreeBlock(block_);
block_ = device_->NewBlock((int)(Product(shape) * SizeOf(data_type_)));
- } else if (strides_[0] != 1) {
- std::cout << "Reshape Error: Tranposed tensor must return new tensor. Not implemented yet." << std::endl;
- return void();
+ } else if (transpose()) {
+ LOG(FATAL) << "Reshape Error: Reshape called on tranposed tensor. Not implemented yet." ;
}
shape_ = std::move(shape);
Generate_Strides();
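
A minimal sketch of the Reshape contract enforced above, using a hypothetical MiniTensor in place of singa::Tensor: contiguous tensors reshape in place, while transposed tensors are rejected until copy-on-reshape is implemented:

#include <stdexcept>
#include <vector>

struct MiniTensor {
  std::vector<int> shape;
  bool transposed = false;          // true once strides are non-default
  void Reshape(const std::vector<int>& new_shape) {
    if (transposed)
      throw std::runtime_error("Reshape on a transposed tensor is not implemented yet");
    shape = new_shape;              // default strides would be regenerated here
  }
};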
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/75f9a0e3/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 01d9fe3..d4cd5da 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -724,7 +724,7 @@ void Uniform<float, lang::Cpp>(const float low,
// ====================Blas operations======================================
-//yisen todo, this function has block M overwritting to block M itself
+//warning: this function overwrites block M in place
template <>
void DGMM<float, lang::Cpp>(const bool side_right,
const Tensor* M, const Tensor* v,
@@ -817,26 +817,26 @@ template <>
void Axpy<float, lang::Cpp>(const float alpha,
const Tensor *in, Tensor *out, Context *ctx) {
//check input tensor for strides first
- if((in->strides())[0] == 1){
+ if(in->strides() == out->strides()){
const float *inPtr = static_cast<const float *>(in->block()->data());
float *outPtr = static_cast<float *>(out->block()->mutable_data());
cblas_saxpy(in->Size(), alpha, inPtr, 1, outPtr, 1);
+ } else {
+ LOG(FATAL) << "Axpy, input and output strides do not match." ;
}
- //yisen todo
- //else throw error
}
template <>
void Dot<float, lang::Cpp>(const Tensor *in1, const Tensor *in2,
float *out, Context *ctx) {
//check input tensor for strides first
- if(((in1->strides())[0] == 1) && ((in2->strides())[0] == 1)){
+ if(!(in1->transpose()) && !(in2->transpose())){
const float *in1Ptr = static_cast<const float *>(in1->block()->data());
const float *in2Ptr = static_cast<const float *>(in2->block()->data());
*out = cblas_sdot(in1->Size(), in1Ptr, 1, in2Ptr, 1);
+ } else {
+ LOG(FATAL) << "Dot, one of the input is tranposed. Not implemented yet." ;
}
- //yisen todo
- //else throw error
}
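
The stride guards in Axpy and Dot above exist because cblas_saxpy and cblas_sdot walk both buffers with a fixed increment (1 here), so the operands must share a single contiguous layout. A CPU reference for the guarded fast path (illustrative only):

#include <cstddef>

// axpy: out[i] += alpha * in[i]; dot: sum of products. Both assume the
// two buffers are laid out identically and contiguously (inc = 1).
void axpy_ref(size_t n, float alpha, const float* in, float* out) {
  for (size_t i = 0; i < n; i++) out[i] += alpha * in[i];
}

float dot_ref(size_t n, const float* a, const float* b) {
  float s = 0.0f;
  for (size_t i = 0; i < n; i++) s += a[i] * b[i];
  return s;
}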
template <>
@@ -878,15 +878,14 @@ void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
const float *APtr = static_cast<const float *>(A->block()->data());
const float *vPtr = static_cast<const float *>(v->block()->data());
float *outPtr = static_cast<float *>(out->block()->mutable_data());
- auto trans = ((A->strides())[0] != 1) ? true : false;
const size_t m = A->shape()[0];
const size_t n = A->shape()[1];
- if (!trans) {
- cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
- beta, outPtr, 1);
- } else {
+ if (A->transpose()) {
cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta,
outPtr, 1);
+ } else {
+ cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
+ beta, outPtr, 1);
}
}
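
In the CblasTrans branch above, A's buffer stores the transpose of the logical m-by-n matrix, so element A[r][c] lives at APtr[c*m + r]; a plain reference computation (illustrative only):

#include <cstddef>

void gemv_transposed_ref(const float* A_t /* n-by-m row-major */,
                         const float* v, float* y, size_t m, size_t n,
                         float alpha, float beta) {
  for (size_t r = 0; r < m; r++) {
    float acc = 0.0f;
    for (size_t c = 0; c < n; c++)
      acc += A_t[c * m + r] * v[c];
    y[r] = alpha * acc + beta * y[r];
  }
}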
@@ -915,9 +914,9 @@ template <>
void GEMM<float, lang::Cpp>(const float alpha,
const Tensor *A, const Tensor *B, const float beta,
Tensor *C, Context *ctx) {
- auto transA = ((A->strides())[0] != 1) ? true : false;
+ auto transA = A->transpose();
auto transa = transA ? CblasTrans : CblasNoTrans;
- auto transB = ((B->strides())[0] != 1) ? true : false;
+ auto transB = B->transpose();
auto transb = transB ? CblasTrans : CblasNoTrans;
const size_t nrowA = A->shape()[0];
const size_t ncolA = A->shape()[1];
@@ -1088,7 +1087,6 @@ void Scale<float, lang::Cpp>(const float x, Tensor *out,
}
}
-//yisen todo check purpose of sum in this function
template <>
void Dot<float, lang::Cpp>(const Tensor *in1, const Tensor *in2,
float *out, Context *ctx) {
@@ -1116,7 +1114,7 @@ void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
float *outPtr = static_cast<float *>(out->block()->mutable_data());
const float *APtr = static_cast<const float *>(A->block()->data());
const float *vPtr = static_cast<const float *>(v->block()->data());
- bool trans = ((A->strides())[0] != 1) ? true : false;
+ bool trans = A->transpose();
const size_t m = A->shape(0);
const size_t n = A->shape(1);
for (size_t r = 0; r < m; r++) {
@@ -1129,7 +1127,6 @@ void GEMV<float, lang::Cpp>(const float alpha, const Tensor *A, const Tensor *v,
}
}
-//yisen todo
#endif // USE_CBLAS
template <>
void ComputeCrossEntropy<float, lang::Cpp>(bool int_target,
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/75f9a0e3/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index f4839e3..3e36877 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -32,6 +32,30 @@
namespace singa {
+cudnnTensorDescriptor_t generate_tensorND_desc(const Tensor* x){
+ cudnnTensorDescriptor_t x_desc;
+ cudnnCreateTensorDescriptor(&x_desc);
+ cudnnSetTensorNdDescriptor(x_desc, CUDNN_DATA_FLOAT,
+ x->generate_dim_cuda(),
+ x->generate_shape_cuda().data(),
+ x->generate_strides_cuda().data()
+ );
+
+ return x_desc;
+}
+
+cudnnOpTensorDescriptor_t generate_Op_desc(cudnnOpTensorOp_t op){
+ cudnnOpTensorDescriptor_t op_desc;
+ cudnnCreateOpTensorDescriptor(&op_desc);
+ cudnnSetOpTensorDescriptor(op_desc, op,
+ CUDNN_DATA_FLOAT,
+ CUDNN_PROPAGATE_NAN
+ );
+
+ return op_desc;
+}
+
+
/// out[i] = |in[i]|
template <>
void Abs<float, lang::Cuda>(const Tensor* in, Tensor* out,
@@ -39,41 +63,25 @@ void Abs<float, lang::Cuda>(const Tensor* in, Tensor* out,
const float* inPtr = static_cast<const float*>(in->block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- cudnnOpTensorOp_t op = CUDNN_OP_TENSOR_MAX;
- cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
- cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
- cudnnOpTensorDescriptor_t op_desc;
- cudnnCreateOpTensorDescriptor(&op_desc);
- cudnnSetOpTensorDescriptor(op_desc, op, cudnn_dtype, cudnn_propagation);
-
- float alpha1[1] = {1.0};
- float alpha2[1] = {-1.0};
- float beta[1] = {0.0};
- cudnnTensorDescriptor_t in_desc, out_desc;
- cudnnCreateTensorDescriptor(&in_desc);
- cudnnCreateTensorDescriptor(&out_desc);
- cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
- cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
- cudnnOpTensor(ctx->cudnn_handle, op_desc, (void*)(&alpha1), in_desc, inPtr,
- (void*)(&alpha2), in_desc, inPtr, (void*)(&beta), out_desc, outPtr);
-
+ float alpha1 = 1.0;
+ float alpha2 = -1.0;
+ float beta = 0.0;
+ cudnnTensorDescriptor_t in_desc = generate_tensorND_desc(in);
+ cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_MAX),
+ (void*)(&alpha1), in_desc, inPtr,
+ (void*)(&alpha2), in_desc, inPtr,
+ (void*)(&beta), generate_tensorND_desc(out), outPtr
+ );
cudnnDestroyTensorDescriptor(in_desc);
- cudnnDestroyTensorDescriptor(out_desc);
}
template <>
void Set<float, lang::Cuda>(const float x, Tensor* out,
Context* ctx) {
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- //float valuePtr[1] = {x};
-
- cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
- cudnnTensorDescriptor_t out_desc;
- cudnnCreateTensorDescriptor(&out_desc);
- cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
- cudnnSetTensor(ctx->cudnn_handle, out_desc, outPtr, (void*)(&x));
- cudnnDestroyTensorDescriptor(out_desc);
+ cudnnSetTensor(ctx->cudnn_handle, generate_tensorND_desc(out),
+ outPtr, (void*)(&x));
}
template <>
@@ -83,17 +91,11 @@ void Add<float, lang::Cuda>(const Tensor* in, const float x,
const float* inPtr = static_cast<const float*>(in->block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- float alpha = 1.0, beta=1.0;
- cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
- cudnnTensorDescriptor_t in_desc, out_desc;
- cudnnCreateTensorDescriptor(&in_desc);
- cudnnCreateTensorDescriptor(&out_desc);
- cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
- cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
- cudnnAddTensor(ctx->cudnn_handle, (void*)(&alpha), in_desc, inPtr, (void*)(&beta), out_desc, outPtr);
-
- cudnnDestroyTensorDescriptor(in_desc);
- cudnnDestroyTensorDescriptor(out_desc);
+ float alpha = 1.0, beta = 1.0;
+ cudnnAddTensor(ctx->cudnn_handle,
+ (void*)(&alpha), generate_tensorND_desc(in), inPtr,
+ (void*)(&beta), generate_tensorND_desc(out), outPtr
+ );
}
/// out = in1 + in2
@@ -104,34 +106,23 @@ void Add<float, lang::Cuda>(const Tensor* in1,
const float* inPtr2 = static_cast<const float*>(in2->block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- cudnnOpTensorOp_t op = CUDNN_OP_TENSOR_ADD;
- cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
- cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
- cudnnOpTensorDescriptor_t op_desc;
- cudnnCreateOpTensorDescriptor(&op_desc);
- cudnnSetOpTensorDescriptor(op_desc, op, cudnn_dtype, cudnn_propagation);
-
- float alpha1[1] = {1.0};
- float alpha2[1] = {1.0};
- float beta[1] = {0.0};
- cudnnTensorDescriptor_t in1_desc, in2_desc, out_desc;
- cudnnCreateTensorDescriptor(&in1_desc);
- cudnnCreateTensorDescriptor(&in2_desc);
- cudnnCreateTensorDescriptor(&out_desc);
- cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+ float alpha1 = 1.0;
+ float alpha2 = 1.0;
+ float beta = 0.0;
+
if((in1->nDim() == in2->nDim()) || (in2->nDim() == 1)){
- cudnnSetTensorNdDescriptor(in2_desc, cudnn_dtype, in2->generate_dim_cuda(), in2->generate_shape_cuda().data(), in2->generate_strides_cuda().data());
+ cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
+ (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
+ (void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
+ (void*)(&beta), generate_tensorND_desc(out), outPtr
+ );
} else {
- cudnnSetTensorNdDescriptor(in2_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+ cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
+ (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
+ (void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
+ (void*)(&beta), generate_tensorND_desc(out), outPtr
+ );
}
-
- cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
- cudnnOpTensor(ctx->cudnn_handle, op_desc, (void*)(alpha1), in1_desc, inPtr1,
- (void*)(alpha2), in2_desc, inPtr2, (void*)(beta), out_desc, outPtr);
-
- cudnnDestroyTensorDescriptor(in1_desc);
- cudnnDestroyTensorDescriptor(in2_desc);
- cudnnDestroyTensorDescriptor(out_desc);
}
/// out = in1 - in2
@@ -142,34 +133,23 @@ void Sub<float, lang::Cuda>(const Tensor* in1,
const float* inPtr2 = static_cast<const float*>(in2->block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
- cudnnOpTensorOp_t op = CUDNN_OP_TENSOR_ADD;
- cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
- cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
- cudnnOpTensorDescriptor_t op_desc;
- cudnnCreateOpTensorDescriptor(&op_desc);
- cudnnSetOpTensorDescriptor(op_desc, op, cudnn_dtype, cudnn_propagation);
-
- float alpha1[1] = {1.0};
- float alpha2[1] = {-1.0};
- float beta[1] = {0.0};
- cudnnTensorDescriptor_t in1_desc, in2_desc, out_desc;
- cudnnCreateTensorDescriptor(&in1_desc);
- cudnnCreateTensorDescriptor(&in2_desc);
- cudnnCreateTensorDescriptor(&out_desc);
- cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+ float alpha1 = 1.0;
+ float alpha2 = -1.0;
+ float beta = 0.0;
+
if((in1->nDim() == in2->nDim()) || (in2->nDim() == 1)){
- cudnnSetTensorNdDescriptor(in2_desc, cudnn_dtype, in2->generate_dim_cuda(), in2->generate_shape_cuda().data(), in2->generate_strides_cuda().data());
+ cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
+ (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
+ (void*)(&alpha2), generate_tensorND_desc(in2), inPtr2,
+ (void*)(&beta), generate_tensorND_desc(out), outPtr
+ );
} else {
- cudnnSetTensorNdDescriptor(in2_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+ cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_ADD),
+ (void*)(&alpha1), generate_tensorND_desc(in1), inPtr1,
+ (void*)(&alpha2), generate_tensorND_desc(in1), inPtr2,
+ (void*)(&beta), generate_tensorND_desc(out), outPtr
+ );
}
-
- cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
- cudnnOpTensor(ctx->cudnn_handle, op_desc, (void*)(alpha1), in1_desc, inPtr1,
- (void*)(alpha2), in2_desc, inPtr2, (void*)(beta), out_desc, outPtr);
-
- cudnnDestroyTensorDescriptor(in1_desc);
- cudnnDestroyTensorDescriptor(in2_desc);
- cudnnDestroyTensorDescriptor(out_desc);
}
/// Element-wise operation, clamp every element into [low, high]
@@ -193,26 +173,21 @@ void Div<float, lang::Cuda>(const Tensor* in1,
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in1->Size();
- if(in1->strides() == in2->strides()){ //if both in1 and in2 strides are the same, we proceed to normal cuda::div
+ //if both in1 and in2 strides are the same, we proceed to normal cuda::div
+ if(in1->strides() == in2->strides()){
cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
out->Set_Strides(in1->strides());
} else { //else we transform in1 to out to store first
- float alpha[1] = {1.0};
- float beta[1] = {0.0};
-
- cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
- cudnnTensorDescriptor_t in1_desc, out_desc;
- cudnnCreateTensorDescriptor(&in1_desc);
- cudnnCreateTensorDescriptor(&out_desc);
- cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+ float alpha = 1.0;
+ float beta = 0.0;
+
out->Set_Strides(in2->strides());
- cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
- cudnnTransformTensor(ctx->cudnn_handle, (void*)(alpha), in1_desc, inPtr1,
- (void*)(beta), out_desc, outPtr);
+ cudnnTransformTensor(ctx->cudnn_handle,
+ (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
+ (void*)(&beta), generate_tensorND_desc(out), outPtr
+ );
cuda::div(num, outPtr, inPtr2, outPtr, ctx->stream);
- cudnnDestroyTensorDescriptor(in1_desc);
- cudnnDestroyTensorDescriptor(out_desc);
}
}
@@ -234,16 +209,10 @@ void EltwiseMult<float, lang::Cuda>(const Tensor* in,
float* outPtr = static_cast<float*>(out->block()->mutable_data());
float alpha = x, beta = 0.0;
- cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
- cudnnTensorDescriptor_t in_desc, out_desc;
- cudnnCreateTensorDescriptor(&in_desc);
- cudnnCreateTensorDescriptor(&out_desc);
- cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
- cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
- cudnnAddTensor(ctx->cudnn_handle, (void*)(&alpha), in_desc, inPtr, (void*)(&beta), out_desc, outPtr);
-
- cudnnDestroyTensorDescriptor(in_desc);
- cudnnDestroyTensorDescriptor(out_desc);
+ cudnnAddTensor(ctx->cudnn_handle,
+ (void*)(&alpha), generate_tensorND_desc(in), inPtr,
+ (void*)(&beta), generate_tensorND_desc(out), outPtr
+ );
}
/// out = in1 * in2
@@ -256,27 +225,21 @@ void EltwiseMult<float, lang::Cuda>(const Tensor* in1,
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in1->Size();
- if(in1->strides() == in2->strides()){ //if both in1 and in2 strides are the same, we proceed to normal cuda::mult
+ //if both in1 and in2 strides are the same, we proceed to normal cuda::mult
+ if(in1->strides() == in2->strides()){
cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
out->Set_Strides(in1->strides());
} else { //else we transform in1 to out to store first
- float alpha[1] = {1.0};
- float beta[1] = {0.0};
+ float alpha = 1.0;
+ float beta = 0.0;
-
- cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
- cudnnTensorDescriptor_t in1_desc, out_desc;
- cudnnCreateTensorDescriptor(&in1_desc);
- cudnnCreateTensorDescriptor(&out_desc);
- cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
out->Set_Strides(in2->strides());
- cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
- cudnnTransformTensor(ctx->cudnn_handle, (void*)(alpha), in1_desc, inPtr1,
- (void*)(beta), out_desc, outPtr);
+ cudnnTransformTensor(ctx->cudnn_handle,
+ (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
+ (void*)(&beta), generate_tensorND_desc(out), outPtr
+ );
cuda::mult(num, outPtr, inPtr2, outPtr, ctx->stream);
- cudnnDestroyTensorDescriptor(in1_desc);
- cudnnDestroyTensorDescriptor(out_desc);
}
}
@@ -404,26 +367,20 @@ void Pow<float, lang::Cuda>(const Tensor* in1,
float* outPtr = static_cast<float*>(out->block()->mutable_data());
const size_t num = in1->Size();
- if(in1->strides() == in2->strides()){ //if both in1 and in2 strides are the same, we proceed to normal cuda::pow
+ if(in1->strides() == in2->strides()){
cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
out->Set_Strides(in1->strides());
} else { //else we transform in1 to out to store first
- float alpha[1] = {1.0};
- float beta[1] = {0.0};
-
- cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
- cudnnTensorDescriptor_t in1_desc, out_desc;
- cudnnCreateTensorDescriptor(&in1_desc);
- cudnnCreateTensorDescriptor(&out_desc);
- cudnnSetTensorNdDescriptor(in1_desc, cudnn_dtype, in1->generate_dim_cuda(), in1->generate_shape_cuda().data(), in1->generate_strides_cuda().data());
+ float alpha = 1.0;
+ float beta = 0.0;
+
out->Set_Strides(in2->strides());
- cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
- cudnnTransformTensor(ctx->cudnn_handle, (void*)(alpha), in1_desc, inPtr1,
- (void*)(beta), out_desc, outPtr);
+ cudnnTransformTensor(ctx->cudnn_handle,
+ (void*)(&alpha), generate_tensorND_desc(in1), inPtr1,
+ (void*)(&beta), generate_tensorND_desc(out), outPtr
+ );
cuda::pow(num, outPtr, inPtr2, outPtr, ctx->stream);
- cudnnDestroyTensorDescriptor(in1_desc);
- cudnnDestroyTensorDescriptor(out_desc);
}
}
@@ -525,27 +482,16 @@ void Sqrt<float, lang::Cuda>(const Tensor* in, Tensor* out,
Context* ctx) {
const float* inPtr = static_cast<const float*>(in->block()->data());
float* outPtr = static_cast<float*>(out->block()->mutable_data());
-
- cudnnOpTensorOp_t op = CUDNN_OP_TENSOR_SQRT;
- cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
- cudnnNanPropagation_t cudnn_propagation = CUDNN_PROPAGATE_NAN;
- cudnnOpTensorDescriptor_t op_desc;
- cudnnCreateOpTensorDescriptor(&op_desc);
- cudnnSetOpTensorDescriptor(op_desc, op, cudnn_dtype, cudnn_propagation);
- float alpha1[1] = {1.0};
- float alpha2[1] = {0.0};
- float beta[1] = {0.0};
- cudnnTensorDescriptor_t in_desc, out_desc;
- cudnnCreateTensorDescriptor(&in_desc);
- cudnnCreateTensorDescriptor(&out_desc);
- cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
- cudnnSetTensorNdDescriptor(out_desc, cudnn_dtype, out->generate_dim_cuda(), out->generate_shape_cuda().data(), out->generate_strides_cuda().data());
- cudnnOpTensor(ctx->cudnn_handle, op_desc, (void*)(&alpha1), in_desc, inPtr,
- (void*)(&alpha2), in_desc, inPtr, (void*)(&beta), out_desc, outPtr);
-
- cudnnDestroyTensorDescriptor(in_desc);
- cudnnDestroyTensorDescriptor(out_desc);
+ float alpha1 = 1.0;
+ float alpha2 = 0.0;
+ float beta = 0.0;
+ cudnnTensorDescriptor_t in_desc = generate_tensorND_desc(in);
+ cudnnOpTensor(ctx->cudnn_handle, generate_Op_desc(CUDNN_OP_TENSOR_SQRT),
+ (void*)(&alpha1), in_desc, inPtr,
+ (void*)(&alpha2), in_desc, inPtr,
+ (void*)(&beta), generate_tensorND_desc(out), outPtr
+ );
}
/// Element-wise operation, out[i]=in[i]^2
@@ -593,30 +539,26 @@ void Sum<float, lang::Cuda>(const Tensor* in, float* out,
cudnn_propagation, cudnn_indices, cudnn_indices_type);
//instantiate 2 new tensors to use new blocks as memory instead of cudaMalloc
- Shape reduction_size = {1000};
+ size_t reduction_size_int = Product(in->shape());
+ Shape reduction_size = {reduction_size_int*100};
Tensor indices(reduction_size, in->device(), in->data_type());
Tensor workspace(reduction_size, in->device(), in->data_type());
- size_t indices_bytes = indices.block()->size()*1000;
- size_t workspace_bytes = workspace.block()->size()*1000;
+ size_t indices_bytes = indices.block()->size()*100;
+ size_t workspace_bytes = workspace.block()->size()*100;
size_t* indicesPtr = static_cast<size_t*>(indices.block()->mutable_data());
float* workspacePtr = static_cast<float*>(workspace.block()->mutable_data());
//void* indicesPtr{nullptr}; void* workspacePtr{nullptr};
//cudaMalloc(&indicesPtr, indices_bytes); cudaMalloc(&workspacePtr, workspace_bytes);
- float alpha[1] = {1.0};
- float beta[1] = {0.0};
- cudnnTensorDescriptor_t in_desc, t_desc;
- cudnnCreateTensorDescriptor(&in_desc);
- cudnnCreateTensorDescriptor(&t_desc);
- cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
- cudnnSetTensorNdDescriptor(t_desc, cudnn_dtype, t.generate_dim_cuda(), reduce_all_axes.data(), reduce_all_axes.data());
+ float alpha = 1.0;
+ float beta = 0.0;
cudnnReduceTensor(ctx->cudnn_handle, reduce_desc,
indicesPtr, indices_bytes, workspacePtr, workspace_bytes,
- (void*)(&alpha), in_desc, inPtr, (void*)(&beta), t_desc, tPtr);
+ (void*)(&alpha), generate_tensorND_desc(in), inPtr,
+ (void*)(&beta), generate_tensorND_desc(&t), tPtr
+ );
*out = tPtr[0];
- cudnnDestroyTensorDescriptor(in_desc);
- cudnnDestroyTensorDescriptor(t_desc);
}
@@ -922,22 +864,17 @@ void RowMax<float, lang::Cuda>(const Tensor* in, Tensor* out,
if(in->transpose()){
Tensor t(in->shape(), in->device(), in->data_type());
float* tPtr = static_cast<float*>(t.block()->mutable_data());
- float alpha[1] = {1.0};
- float beta[1] = {0.0};
-
- cudnnDataType_t cudnn_dtype = CUDNN_DATA_FLOAT;
- cudnnTensorDescriptor_t in_desc, t_desc;
- cudnnCreateTensorDescriptor(&in_desc);
- cudnnCreateTensorDescriptor(&t_desc);
- cudnnSetTensorNdDescriptor(in_desc, cudnn_dtype, in->generate_dim_cuda(), in->generate_shape_cuda().data(), in->generate_strides_cuda().data());
- cudnnSetTensorNdDescriptor(t_desc, cudnn_dtype, t.generate_dim_cuda(), t.generate_shape_cuda().data(), t.generate_strides_cuda().data());
- cudnnTransformTensor(ctx->cudnn_handle, (void*)(alpha), in_desc, inPtr,
- (void*)(beta), t_desc, tPtr);
+
+ float alpha = 1.0;
+ float beta = 0.0;
+
+ cudnnTransformTensor(ctx->cudnn_handle,
+ (void*)(&alpha), generate_tensorND_desc(in), inPtr,
+ (void*)(&beta), generate_tensorND_desc(&t), tPtr
+ );
const float* tPtr_const = static_cast<const float*>(t.block()->data());
cuda::RowMax(nrow, ncol, tPtr_const, outPtr, ctx->stream);
- cudnnDestroyTensorDescriptor(in_desc);
- cudnnDestroyTensorDescriptor(t_desc);
} else {
cuda::RowMax(nrow, ncol, inPtr, outPtr, ctx->stream);
}