You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by zh...@apache.org on 2016/06/12 07:27:53 UTC
[1/5] incubator-singa git commit: SINGA-182 Clean math function APIs
and implementations
Repository: incubator-singa
Updated Branches:
refs/heads/dev 01aaf4900 -> 6d69047ad
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 94ca283..38a9291 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -5,17 +5,17 @@ using singa::Shape;
using singa::Device;
class TestTensorMath : public ::testing::Test {
-protected:
+ protected:
virtual void SetUp() {
a.Reshape(singa::Shape{6});
b.Reshape(singa::Shape{6});
c.Reshape(singa::Shape{6, 1});
d.Reshape(singa::Shape{3, 2});
- e.Reshape(singa::Shape{3, 2});
+ e.Reshape(singa::Shape{3, 2});
a.CopyDataFromHostPtr<float>(dat1, 6);
b.CopyDataFromHostPtr<float>(dat2, 6);
- e.CopyDataFromHostPtr<float>(dat1, 6);
+ e.CopyDataFromHostPtr<float>(dat1, 6);
}
Tensor a, b, c, d, e;
const float dat1[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
@@ -23,264 +23,262 @@ protected:
};
TEST_F(TestTensorMath, MemberAbs) {
- Tensor aa = a.Clone();
- Tensor bb = b.Clone();
- Tensor cc = aa - bb;
- const float* dptr = cc.data<const float*>();
- EXPECT_NEAR(-0.1, dptr[0], 1e-5);
+ Tensor aa = a.Clone();
+ Tensor bb = b.Clone();
+ Tensor cc = aa - bb;
+ const float *dptr = cc.data<const float *>();
+ EXPECT_NEAR(-0.1, dptr[0], 1e-5);
EXPECT_NEAR(-0.1, dptr[1], 1e-5);
EXPECT_NEAR(-0.1, dptr[2], 1e-5);
- Tensor p = Abs(cc);
- const float* dptr1 = p.data<const float*>();
- EXPECT_NEAR(0.1, dptr1[0], 1e-5);
+ Tensor p = Abs(cc);
+ const float *dptr1 = p.data<const float *>();
+ EXPECT_NEAR(0.1, dptr1[0], 1e-5);
EXPECT_NEAR(0.1, dptr1[1], 1e-5);
EXPECT_NEAR(0.1, dptr1[2], 1e-5);
}
TEST_F(TestTensorMath, MemberExp) {
- Tensor p = Exp(a);
- const float* dptr1 = p.data<const float*>();
- EXPECT_NEAR(exp(1.0f), dptr1[0], 1e-5);
+ Tensor p = Exp(a);
+ const float *dptr1 = p.data<const float *>();
+ EXPECT_NEAR(exp(1.0f), dptr1[0], 1e-5);
EXPECT_NEAR(exp(2.0f), dptr1[1], 1e-5);
EXPECT_NEAR(exp(3.0f), dptr1[2], 1e-5);
}
TEST_F(TestTensorMath, MemberLog) {
- Tensor p = Log(a);
- const float* dptr1 = p.data<const float*>();
- EXPECT_NEAR(log(1.0f), dptr1[0], 1e-5);
+ Tensor p = Log(a);
+ const float *dptr1 = p.data<const float *>();
+ EXPECT_NEAR(log(1.0f), dptr1[0], 1e-5);
EXPECT_NEAR(log(2.0f), dptr1[1], 1e-5);
EXPECT_NEAR(log(3.0f), dptr1[2], 1e-5);
}
TEST_F(TestTensorMath, MemberReLU) {
- Tensor aa = a.Clone();
- Tensor cc = aa - 2.0f;
- const float* dptr = cc.data<const float*>();
- EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+ Tensor aa = a.Clone();
+ Tensor cc = aa - 2.0f;
+ const float *dptr = cc.data<const float *>();
+ EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
EXPECT_NEAR(0.0f, dptr[1], 1e-5);
EXPECT_NEAR(1.0f, dptr[2], 1e-5);
- Tensor p = ReLU(cc);
- const float* dptr1 = p.data<const float*>();
- EXPECT_NEAR(0.0f, dptr1[0], 1e-5);
+ Tensor p = ReLU(cc);
+ const float *dptr1 = p.data<const float *>();
+ EXPECT_NEAR(0.0f, dptr1[0], 1e-5);
EXPECT_NEAR(0.0f, dptr1[1], 1e-5);
EXPECT_NEAR(1.0f, dptr1[2], 1e-5);
}
TEST_F(TestTensorMath, MemberSigmoid) {
- Tensor p = Sigmoid(a);
- const float* dptr1 = p.data<const float*>();
- EXPECT_NEAR(1.0f/(1.0f + exp(-1.0f)), dptr1[0], 1e-5);
- EXPECT_NEAR(1.0f/(1.0f + exp(-2.0f)), dptr1[1], 1e-5);
- EXPECT_NEAR(1.0f/(1.0f + exp(-3.0f)), dptr1[2], 1e-5);
+ Tensor p = Sigmoid(a);
+ const float *dptr1 = p.data<const float *>();
+ EXPECT_NEAR(1.0f / (1.0f + exp(-1.0f)), dptr1[0], 1e-5);
+ EXPECT_NEAR(1.0f / (1.0f + exp(-2.0f)), dptr1[1], 1e-5);
+ EXPECT_NEAR(1.0f / (1.0f + exp(-3.0f)), dptr1[2], 1e-5);
}
TEST_F(TestTensorMath, MemberSign) {
- Tensor aa = a.Clone();
- Tensor cc = aa - 2.0f;
- const float* dptr = cc.data<const float*>();
- EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+ Tensor aa = a.Clone();
+ Tensor cc = aa - 2.0f;
+ const float *dptr = cc.data<const float *>();
+ EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
EXPECT_NEAR(0.0f, dptr[1], 1e-5);
EXPECT_NEAR(1.0f, dptr[2], 1e-5);
- Tensor p = Sign(cc);
- const float* dptr1 = p.data<const float*>();
- EXPECT_EQ(0.0f, dptr1[0]);
+ Tensor p = Sign(cc);
+ const float *dptr1 = p.data<const float *>();
+ EXPECT_EQ(0.0f, dptr1[0]);
EXPECT_EQ(0.0f, dptr1[1]);
EXPECT_EQ(1.0f, dptr1[2]);
}
TEST_F(TestTensorMath, MemberSqrt) {
- Tensor p = Sqrt(a);
- const float* dptr1 = p.data<const float*>();
- EXPECT_NEAR(sqrt(1.0), dptr1[0], 1e-5);
+ Tensor p = Sqrt(a);
+ const float *dptr1 = p.data<const float *>();
+ EXPECT_NEAR(sqrt(1.0), dptr1[0], 1e-5);
EXPECT_NEAR(sqrt(2.0), dptr1[1], 1e-5);
EXPECT_NEAR(sqrt(3.0), dptr1[2], 1e-5);
}
TEST_F(TestTensorMath, MemberSquare) {
- Tensor p = Square(a);
- const float* dptr1 = p.data<const float*>();
- EXPECT_NEAR(1.0, dptr1[0], 1e-5);
+ Tensor p = Square(a);
+ const float *dptr1 = p.data<const float *>();
+ EXPECT_NEAR(1.0, dptr1[0], 1e-5);
EXPECT_NEAR(4.0, dptr1[1], 1e-5);
EXPECT_NEAR(9.0, dptr1[2], 1e-5);
}
TEST_F(TestTensorMath, MemberTanh) {
- Tensor p = Tanh(a);
- const float* dptr1 = p.data<const float*>();
- EXPECT_NEAR(tanh(1.0), dptr1[0], 1e-5);
+ Tensor p = Tanh(a);
+ const float *dptr1 = p.data<const float *>();
+ EXPECT_NEAR(tanh(1.0), dptr1[0], 1e-5);
EXPECT_NEAR(tanh(2.0), dptr1[1], 1e-5);
EXPECT_NEAR(tanh(3.0), dptr1[2], 1e-5);
}
TEST_F(TestTensorMath, Sum) {
- Tensor p1 = Sum(e, 0);
+ Tensor p1 = Sum(e, 0);
const float *dptr1 = p1.data<const float *>();
- EXPECT_FLOAT_EQ(9.0f,dptr1[0]);
- EXPECT_FLOAT_EQ(12.0f,dptr1[1]);
+ EXPECT_FLOAT_EQ(9.0f, dptr1[0]);
+ EXPECT_FLOAT_EQ(12.0f, dptr1[1]);
- Tensor p2(Shape{3,1});
- p2 = Sum(e, 1);
+ Tensor p2(Shape{3, 1});
+ p2 = Sum(e, 1);
const float *dptr2 = p2.data<const float *>();
- EXPECT_FLOAT_EQ(3.0f,dptr2[0]);
- EXPECT_FLOAT_EQ(7.0f,dptr2[1]);
- EXPECT_FLOAT_EQ(11.0f,dptr2[2]);
+ EXPECT_FLOAT_EQ(3.0f, dptr2[0]);
+ EXPECT_FLOAT_EQ(7.0f, dptr2[1]);
+ EXPECT_FLOAT_EQ(11.0f, dptr2[2]);
}
TEST_F(TestTensorMath, SoftMax) {
- Tensor p1(Shape{3,2});
- p1 = SoftMax(e,0);
+ Tensor p1(Shape{3, 2});
+ p1 = SoftMax(e, 0);
const float *dptr1 = p1.data<const float *>();
- float sum = 0;
- for(int i = 0; i < 6; i++) sum += exp(i+1);
- EXPECT_NEAR(exp(1)/sum, dptr1[0],1e-5);
- EXPECT_NEAR(exp(3)/sum, dptr1[2],1e-5);
- EXPECT_NEAR(exp(5)/sum, dptr1[4],1e-5);
- EXPECT_NEAR(exp(2)/sum, dptr1[1],1e-5);
- EXPECT_NEAR(exp(4)/sum, dptr1[3],1e-5);
- EXPECT_NEAR(exp(6)/sum, dptr1[5],1e-5);
-
- Tensor p2(Shape{3,2});
- p2 = SoftMax(e,1);
+ float sum = 0;
+ for (int i = 0; i < 6; i++) sum += exp(i + 1);
+ EXPECT_NEAR(exp(1) / sum, dptr1[0], 1e-5);
+ EXPECT_NEAR(exp(3) / sum, dptr1[2], 1e-5);
+ EXPECT_NEAR(exp(5) / sum, dptr1[4], 1e-5);
+ EXPECT_NEAR(exp(2) / sum, dptr1[1], 1e-5);
+ EXPECT_NEAR(exp(4) / sum, dptr1[3], 1e-5);
+ EXPECT_NEAR(exp(6) / sum, dptr1[5], 1e-5);
+
+ Tensor p2(Shape{3, 2});
+ p2 = SoftMax(e, 1);
const float *dptr2 = p2.data<const float *>();
- EXPECT_NEAR(exp(1)/(exp(1)+exp(2)),dptr2[0], 1e-5);
- EXPECT_NEAR(exp(2)/(exp(1)+exp(2)),dptr2[1], 1e-5);
+ EXPECT_NEAR(exp(1) / (exp(1) + exp(2)), dptr2[0], 1e-5);
+ EXPECT_NEAR(exp(2) / (exp(1) + exp(2)), dptr2[1], 1e-5);
}
TEST_F(TestTensorMath, MemberLT) {
- Tensor p1 = a < 2.0f;
- const float *dptr1 = p1.data<const float *>();
- EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
- EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
- EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+ Tensor p1 = a < 2.0f;
+ const float *dptr1 = p1.data<const float *>();
+ EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+ EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+ EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
}
TEST_F(TestTensorMath, MemberLE) {
- Tensor p1 = a <= 2.0f;
- const float *dptr1 = p1.data<const float *>();
- EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
- EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
- EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+ Tensor p1 = a <= 2.0f;
+ const float *dptr1 = p1.data<const float *>();
+ EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+ EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+ EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
}
TEST_F(TestTensorMath, MemberGT) {
- Tensor p1 = a > 2.0f;
- const float *dptr1 = p1.data<const float *>();
- EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
- EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
- EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+ Tensor p1 = a > 2.0f;
+ const float *dptr1 = p1.data<const float *>();
+ EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+ EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+ EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
}
TEST_F(TestTensorMath, MemberGE) {
- Tensor p1 = a >= 2.0f;
- const float *dptr1 = p1.data<const float *>();
- EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
- EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
- EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+ Tensor p1 = a >= 2.0f;
+ const float *dptr1 = p1.data<const float *>();
+ EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+ EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+ EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
}
TEST_F(TestTensorMath, MemberPow) {
- Tensor p1 = Pow(b,3.0f);
- const float *dptr1 = p1.data<const float *>();
- EXPECT_FLOAT_EQ(pow(1.1f,3.0f), dptr1[0]);
- EXPECT_FLOAT_EQ(pow(2.1f,3.0f), dptr1[1]);
- EXPECT_FLOAT_EQ(pow(3.1f,3.0f), dptr1[2]);
+ Tensor p1 = Pow(b, 3.0f);
+ const float *dptr1 = p1.data<const float *>();
+ EXPECT_FLOAT_EQ(pow(1.1f, 3.0f), dptr1[0]);
+ EXPECT_FLOAT_EQ(pow(2.1f, 3.0f), dptr1[1]);
+ EXPECT_FLOAT_EQ(pow(3.1f, 3.0f), dptr1[2]);
- //TODO(Yuchen): check pow(tensor a, tensor b) and add testcase after the function is complete
- //Tensor p2 = Pow(a,b);
- //const float *dptr2 = p2.data<const float *>();
- //EXPECT_FLOAT_EQ(pow(1.0f,1.1f), dptr2[0]);
- //EXPECT_FLOAT_EQ(pow(2.0f,2.1f), dptr2[1]);
- //EXPECT_FLOAT_EQ(pow(3.0f,3.1f), dptr2[2]);
+ // TODO(Yuchen): check pow(tensor a, tensor b) and add testcase after the
+ // function is complete
+ // Tensor p2 = Pow(a,b);
+ // const float *dptr2 = p2.data<const float *>();
+ // EXPECT_FLOAT_EQ(pow(1.0f,1.1f), dptr2[0]);
+ // EXPECT_FLOAT_EQ(pow(2.0f,2.1f), dptr2[1]);
+ // EXPECT_FLOAT_EQ(pow(3.0f,3.1f), dptr2[2]);
}
-
TEST_F(TestTensorMath, MemberSub) {
- Tensor p1 = a - b;
- const float* dptr1 = p1.data<const float*>();
- EXPECT_NEAR(-0.1, dptr1[0], 1e-5);
+ Tensor p1 = a - b;
+ const float *dptr1 = p1.data<const float *>();
+ EXPECT_NEAR(-0.1, dptr1[0], 1e-5);
EXPECT_NEAR(-0.1, dptr1[1], 1e-5);
EXPECT_NEAR(-0.1, dptr1[2], 1e-5);
}
TEST_F(TestTensorMath, MemberEltwiseMult) {
- Tensor p1 = a * b;
- const float* dptr1 = p1.data<const float*>();
- EXPECT_NEAR(1.0*1.1, dptr1[0], 1e-5);
- EXPECT_NEAR(2.0*2.1, dptr1[1], 1e-5);
- EXPECT_NEAR(3.0*3.1, dptr1[2], 1e-5);
+ Tensor p1 = a * b;
+ const float *dptr1 = p1.data<const float *>();
+ EXPECT_NEAR(1.0 * 1.1, dptr1[0], 1e-5);
+ EXPECT_NEAR(2.0 * 2.1, dptr1[1], 1e-5);
+ EXPECT_NEAR(3.0 * 3.1, dptr1[2], 1e-5);
}
TEST_F(TestTensorMath, MemberDiv) {
- Tensor p1 = a / b;
- const float* dptr1 = p1.data<const float*>();
- EXPECT_NEAR(1.0/1.1, dptr1[0], 1e-5);
- EXPECT_NEAR(2.0/2.1, dptr1[1], 1e-5);
- EXPECT_NEAR(3.0/3.1, dptr1[2], 1e-5);
+ Tensor p1 = a / b;
+ const float *dptr1 = p1.data<const float *>();
+ EXPECT_NEAR(1.0 / 1.1, dptr1[0], 1e-5);
+ EXPECT_NEAR(2.0 / 2.1, dptr1[1], 1e-5);
+ EXPECT_NEAR(3.0 / 3.1, dptr1[2], 1e-5);
- Tensor p2 = Div(10.0f,b);
- const float* dptr2 = p2.data<const float*>();
- EXPECT_NEAR(10.0/1.1, dptr2[0], 1e-5);
- EXPECT_NEAR(10.0/2.1, dptr2[1], 1e-5);
- EXPECT_NEAR(10.0/3.1, dptr2[2], 1e-5);
+ Tensor p2 = Div(10.0f, b);
+ const float *dptr2 = p2.data<const float *>();
+ EXPECT_NEAR(10.0 / 1.1, dptr2[0], 1e-5);
+ EXPECT_NEAR(10.0 / 2.1, dptr2[1], 1e-5);
+ EXPECT_NEAR(10.0 / 3.1, dptr2[2], 1e-5);
- Tensor p3 = a / 8.0f;
- const float* dptr3 = p3.data<const float*>();
- EXPECT_NEAR(1.0/8.0, dptr3[0], 1e-5);
- EXPECT_NEAR(2.0/8.0, dptr3[1], 1e-5);
- EXPECT_NEAR(3.0/8.0, dptr3[2], 1e-5);
+ Tensor p3 = a / 8.0f;
+ const float *dptr3 = p3.data<const float *>();
+ EXPECT_NEAR(1.0 / 8.0, dptr3[0], 1e-5);
+ EXPECT_NEAR(2.0 / 8.0, dptr3[1], 1e-5);
+ EXPECT_NEAR(3.0 / 8.0, dptr3[2], 1e-5);
}
TEST_F(TestTensorMath, MemberBernoulli) {
- Tensor p1(Shape{10000});
- Bernoulli(0.3f, &p1);
- const float* dptr1 = p1.data<const float*>();
- float sum = 0;
- for(int i = 0; i < 10000; i++) sum += dptr1[i];
- float mean = sum/10000;
- EXPECT_NEAR(mean, 0.3f, 1e-2);
+ Tensor p1(Shape{10000});
+ Bernoulli(0.3f, &p1);
+ const float *dptr1 = p1.data<const float *>();
+ float sum = 0;
+ for (int i = 0; i < 10000; i++) sum += dptr1[i];
+ float mean = sum / 10000;
+ EXPECT_NEAR(mean, 0.3f, 1e-2);
- sum = 0;
- for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
- float variance = sum/9999;
- EXPECT_NEAR(variance, 0.3*0.7, 1e-2);
+ sum = 0;
+ for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+ float variance = sum / 9999;
+ EXPECT_NEAR(variance, 0.3 * 0.7, 1e-2);
}
TEST_F(TestTensorMath, MemberUniform) {
- Tensor p1(Shape{10000});
- Uniform(0.1f,0.2f,&p1);
- const float* dptr1 = p1.data<const float*>();
- float sum = 0;
- for(int i = 0; i < 10000; i++) sum += dptr1[i];
- float mean = sum/10000;
- EXPECT_NEAR(mean, 0.15f, 1e-3);
+ Tensor p1(Shape{10000});
+ Uniform(0.1f, 0.2f, &p1);
+ const float *dptr1 = p1.data<const float *>();
+ float sum = 0;
+ for (int i = 0; i < 10000; i++) sum += dptr1[i];
+ float mean = sum / 10000;
+ EXPECT_NEAR(mean, 0.15f, 1e-3);
- sum = 0;
- for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
- float variance = sum/9999;
- EXPECT_NEAR(variance, 0.01f/12, 1e-3);
+ sum = 0;
+ for (int i = 0; i < 10000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+ float variance = sum / 9999;
+ EXPECT_NEAR(variance, 0.01f / 12, 1e-3);
}
TEST_F(TestTensorMath, MemberGaussian) {
- Tensor p1(Shape{50000});
- Gaussian(0.0f,1.0f,&p1);
- const float* dptr1 = p1.data<const float*>();
- float sum = 0;
- for(int i = 0; i < 50000; i++) sum += dptr1[i];
- float mean = sum/50000;
- EXPECT_NEAR(mean, 0.0, 1e-2);
+ Tensor p1(Shape{50000});
+ Gaussian(0.0f, 1.0f, &p1);
+ const float *dptr1 = p1.data<const float *>();
+ float sum = 0;
+ for (int i = 0; i < 50000; i++) sum += dptr1[i];
+ float mean = sum / 50000;
+ EXPECT_NEAR(mean, 0.0, 1e-2);
- sum = 0;
- for(int i = 0; i < 50000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
- float variance = sum/49999;
- EXPECT_NEAR(variance, 1.0, 1e-2);
+ sum = 0;
+ for (int i = 0; i < 50000; i++) sum += (dptr1[i] - mean) * (dptr1[i] - mean);
+ float variance = sum / 49999;
+ EXPECT_NEAR(variance, 1.0, 1e-2);
}
-
-
TEST_F(TestTensorMath, MemberAddTensor) {
Tensor aa = a.Clone();
aa += a;
@@ -333,8 +331,7 @@ TEST_F(TestTensorMath, SetValue) {
Tensor t(Shape{4});
t.SetValue(0.3f);
const float *ptr = t.data<const float *>();
- for (int i = 0; i < 4; i++)
- EXPECT_FLOAT_EQ(ptr[i], 0.3f);
+ for (int i = 0; i < 4; i++) EXPECT_FLOAT_EQ(ptr[i], 0.3f);
}
TEST_F(TestTensorMath, Reshape) {
@@ -344,10 +341,15 @@ TEST_F(TestTensorMath, Reshape) {
const float *ptr = t.data<const float *>();
EXPECT_EQ(p.shape(0), 4u);
EXPECT_EQ(p.shape(1), 1u);
- for (int i = 0; i < 4; i++)
- EXPECT_FLOAT_EQ(ptr[i], 0.3f);
+ for (int i = 0; i < 4; i++) EXPECT_FLOAT_EQ(ptr[i], 0.3f);
}
#ifdef USE_CBLAS
+TEST_F(TestTensorMath, L2Cpp) {
+ float l2 = a.L2();
+ float target = 0.0f;
+ for (size_t i = 0; i < a.Size(); i++) target += dat1[i] * dat1[i];
+ EXPECT_FLOAT_EQ(l2, sqrt(target));
+}
TEST_F(TestTensorMath, MultCpp) {
const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
Tensor t(Shape{2, 2});
@@ -368,8 +370,7 @@ TEST_F(TestTensorMath, MultCpp) {
Tensor s(Shape{4, 2});
s.CopyDataFromHostPtr(y, 8);
const float *sPtr = s.data<const float *>();
- for (int i = 0; i < 8; i++)
- EXPECT_FLOAT_EQ(sPtr[i], y[i]);
+ for (int i = 0; i < 8; i++) EXPECT_FLOAT_EQ(sPtr[i], y[i]);
Tensor D = Mult(d, s.T());
const float *DPtr = D.data<const float *>();
for (int i = 0; i < 3; i++) {
@@ -423,7 +424,6 @@ TEST_F(TestTensorMath, SubColumnCpp) {
}
}
-
TEST_F(TestTensorMath, DivColumnCpp) {
const float x[3] = {1.0f, 2.0f, 3.0f};
Tensor t(Shape{3});
@@ -438,7 +438,6 @@ TEST_F(TestTensorMath, DivColumnCpp) {
}
}
-
TEST_F(TestTensorMath, AddRowCpp) {
const float x[2] = {1.1f, 2.1f};
Tensor t(Shape{2});
@@ -453,7 +452,6 @@ TEST_F(TestTensorMath, AddRowCpp) {
}
}
-
TEST_F(TestTensorMath, SubRowCpp) {
const float x[2] = {1.1f, 2.1f};
Tensor t(Shape{2});
@@ -468,7 +466,6 @@ TEST_F(TestTensorMath, SubRowCpp) {
}
}
-
TEST_F(TestTensorMath, MultRowCpp) {
const float x[2] = {1.1f, 2.1f};
Tensor t(Shape{2});
@@ -483,7 +480,6 @@ TEST_F(TestTensorMath, MultRowCpp) {
}
}
-
TEST_F(TestTensorMath, SumRowsCpp) {
Tensor t(Shape{2});
d.CopyDataFromHostPtr(dat1, 6);
@@ -498,7 +494,6 @@ TEST_F(TestTensorMath, SumRowsCpp) {
}
}
-
TEST_F(TestTensorMath, SumColumnsCpp) {
Tensor t(Shape{3});
d.CopyDataFromHostPtr(dat1, 6);
@@ -514,6 +509,15 @@ TEST_F(TestTensorMath, SumColumnsCpp) {
}
#endif
#ifdef USE_CUDA
+TEST_F(TestTensorMath, L2Cuda) {
+ singa::CudaGPU dev;
+ Tensor t(Shape{3, 2}, &dev);
+ t.CopyDataFromHostPtr(dat1, 6);
+ float l2 = t.L2();
+ float target = 0.0f;
+ for (size_t i = 0; i < t.Size(); i++) target += dat1[i] * dat1[i];
+ EXPECT_FLOAT_EQ(l2, sqrt(target));
+}
TEST_F(TestTensorMath, MultCuda) {
const float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
singa::CudaGPU dev;
@@ -582,7 +586,6 @@ TEST_F(TestTensorMath, AddColumnCuda) {
}
}
-
TEST_F(TestTensorMath, SubColumnCuda) {
const float x[3] = {1.0f, 2.0f, 3.0f};
singa::CudaGPU dev;
@@ -757,4 +760,5 @@ TEST_F(TestTensorMath, SumColumnCuda) {
EXPECT_FLOAT_EQ(tptr[i], tmp);
}
}
+
#endif
[3/5] incubator-singa git commit: SINGA-182 Clean math function APIs
and implementations
Posted by zh...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index ec7a892..2c5c272 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -25,12 +25,11 @@
#include <cblas.h>
#endif
-/// TODO(wangwei) Clean the implementations following the comments in
-/// tensor_math.h.
namespace singa {
-template<>
-void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+template <>
+void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+ Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
for (size_t i = 0; i < num; i++) {
@@ -39,180 +38,150 @@ void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context
}
template <>
-void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out, Context *ctx) {
+void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+ Blob *out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
- for (size_t i = 0; i < num; i++) outPtr[i] = x;
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = inPtr[i] + x;
+ }
}
-// sum all elements of input into out
-// TODO(wangwei) optimize using omp
template <>
-void Sum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, Context *ctx) {
- float s = 0.f;
- const float *inPtr = static_cast<const float *>(in->data());
+void Add<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
+ // CHECK_EQ(ctx->stream, nullptr);
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *in1Ptr = static_cast<const float *>(in1->data());
+ const float *in2Ptr = static_cast<const float *>(in2->data());
for (size_t i = 0; i < num; i++) {
- s += inPtr[i];
+ outPtr[i] = in1Ptr[i] + in2Ptr[i];
}
- *out = s;
}
template <>
-void Sign<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Clamp<float, lang::Cpp>(const size_t num, const float low,
+ const float high, const Blob *in, Blob *out,
+ Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float*>(in->data());
+ const float *inPtr = static_cast<const float *>(in->data());
for (size_t i = 0; i < num; i++) {
- outPtr[i] = inPtr[i] > 0 ? 1.0f : 0.0f;
+ if (inPtr[i] > high) {
+ outPtr[i] = high;
+ } else if (inPtr[i] < low) {
+ outPtr[i] = low;
+ } else {
+ outPtr[i] = inPtr[i];
+ }
}
}
template <>
-void Exp<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Div<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *in1Ptr = static_cast<const float *>(in1->data());
+ const float *in2Ptr = static_cast<const float *>(in2->data());
+ for (size_t i = 0; i < num; i++) {
+ CHECK_NE(in2Ptr[i], 0.f);
+ outPtr[i] = in1Ptr[i] / in2Ptr[i];
+ }
+}
+
+template <>
+void Div<float, lang::Cpp>(const size_t num, const float x, const Blob *in,
+ Blob *out, Context *ctx) {
const float *inPtr = static_cast<const float *>(in->data());
+ float *outPtr = static_cast<float *>(out->mutable_data());
for (size_t i = 0; i < num; i++) {
- outPtr[i] = exp(inPtr[i]);
+ CHECK_NE(inPtr[i], 0.f);
+ outPtr[i] = x / inPtr[i];
}
}
template <>
-void Log<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in,
+ const float x, Blob *out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
for (size_t i = 0; i < num; i++) {
- CHECK_GT(inPtr[i], 0.f);
- outPtr[i] = log(inPtr[i]);
+ outPtr[i] = inPtr[i] * x;
}
}
template <>
-void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1,
+ const Blob *in2, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *in1Ptr = static_cast<const float *>(in1->data());
+ const float *in2Ptr = static_cast<const float *>(in2->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = in1Ptr[i] * in2Ptr[i];
+ }
+}
+template <>
+void Exp<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+ Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
for (size_t i = 0; i < num; i++) {
- CHECK_GT(inPtr[i], 0.f);
- outPtr[i] = sqrt(inPtr[i]);
+ outPtr[i] = exp(inPtr[i]);
}
}
template <>
-void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+ Blob *out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
for (size_t i = 0; i < num; i++) {
- outPtr[i] = inPtr[i] * inPtr[i];
+ outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
}
}
template <>
-void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+ Blob *out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
for (size_t i = 0; i < num; i++) {
- outPtr[i] = tanh(inPtr[i]);
+ outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
}
}
-
template <>
-void ReLU<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+ Blob *out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
+ outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
}
}
-
template <>
-void Sigmoid<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+void Log<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+ Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
for (size_t i = 0; i < num; i++) {
- outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
+ CHECK_GT(inPtr[i], 0.f);
+ outPtr[i] = log(inPtr[i]);
}
}
-
template <>
-void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in,
- Blob *out, Context *ctx) {
+void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+ Blob *out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
- float *bPtr = new float[ncol];
- for (size_t r = 0; r < nrow; r++) {
- size_t offset = r * ncol;
- float denom = 0.f;
- for (size_t c = 0; c < ncol; c++) {
- bPtr[c] = exp(inPtr[offset + c]);
- denom += bPtr[c];
- }
- for (size_t c = 0; c < ncol; c++) {
- size_t idx = offset + c;
- outPtr[idx] = bPtr[c] / denom;
- }
- }
- delete bPtr;
-}
-
-template <>
-void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in,
- Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t r = 0; r < nrow; r++) {
- size_t offset = r * ncol;
- outPtr[r] = 0.f;
- for (size_t c = 0; c < ncol; c++) {
- outPtr[r] += inPtr[offset + c];
- }
- }
-}
-
-template <>
-void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in, Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t c = 0; c < ncol; c++) {
- outPtr[c] = 0.f;
- }
- for (size_t r = 0; r < nrow; r++) {
- size_t offset = r * ncol;
- for (size_t c = 0; c < ncol; c++) {
- outPtr[c] += inPtr[offset + c];
- }
- }
-}
-
-template <>
-void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
- Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *APtr = static_cast<const float *>(A->data());
- const float *vPtr = static_cast<const float *>(v->data());
- for (size_t r = 0; r < nrow; r++) {
- size_t offset = r * ncol;
- for (size_t c = 0; c < ncol; c++) {
- outPtr[offset + c] = APtr[offset + c] + vPtr[c];
- }
- }
-}
-
-template <>
-void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
- Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *APtr = static_cast<const float *>(A->data());
- const float *vPtr = static_cast<const float *>(v->data());
- for (size_t r = 0; r < nrow; r++) {
- size_t offset = r * ncol;
- for (size_t c = 0; c < ncol; c++) {
- outPtr[offset + c] = APtr[offset + c] + vPtr[r];
- }
- }
-}
-
-template <>
-void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x, Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
+ }
+}
+template <>
+void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
outPtr[i] = pow(inPtr[i], x);
}
}
@@ -220,252 +189,230 @@ void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x, Blob
template <>
void Pow<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
Blob *out, Context *ctx) {
- float *outPtr= static_cast<float *>(out->mutable_data());
- const float *in1Ptr= static_cast<const float *>(in1->data());
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *in1Ptr = static_cast<const float *>(in1->data());
const float *in2Ptr = static_cast<const float *>(in2->data());
for (size_t i = 0; i < num; i++) {
outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
}
}
-
template <>
-void Clamp<float, lang::Cpp>(const size_t num, const float low, const float high, const Blob *in,
- Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- if (inPtr[i] > high) {
- outPtr[i] = high;
- }
- else if (inPtr[i] < low) {
- outPtr[i] = low;
- }
- else {
- outPtr[i] = inPtr[i];
- }
- }
+void ReLU<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+ Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
+ }
}
-
template <>
-void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
- Blob *out, Context *ctx) {
+void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+ Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ for (size_t i = 0; i < num; i++) outPtr[i] = x;
+}
+template <>
+void Sigmoid<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+ Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
for (size_t i = 0; i < num; i++) {
- outPtr[i] = inPtr[i] + x;
+ outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
}
}
template <>
-void Add<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
- Blob *out, Context *ctx) {
- // CHECK_EQ(ctx->stream, nullptr);
- float *outPtr= static_cast<float *>(out->mutable_data());
- const float *in1Ptr = static_cast<const float *>(in1->data());
- const float *in2Ptr = static_cast<const float *>(in2->data());
+void Sign<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+ Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
for (size_t i = 0; i < num; i++) {
- outPtr[i] = in1Ptr[i] + in2Ptr[i];
+ outPtr[i] = inPtr[i] > 0 ? 1.0f : 0.0f;
}
}
template <>
-void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
- Blob *out, Context *ctx) {
- // CHECK_EQ(ctx->stream, nullptr);
- float *outPtr= static_cast<float *>(out->mutable_data());
- const float *in1Ptr = static_cast<const float *>(in1->data());
- const float *in2Ptr = static_cast<const float *>(in2->data());
+void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+ Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
for (size_t i = 0; i < num; i++) {
- outPtr[i] = in1Ptr[i] - in2Ptr[i];
+ CHECK_GT(inPtr[i], 0.f);
+ outPtr[i] = sqrt(inPtr[i]);
}
}
template <>
-void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
- Blob *out, Context *ctx) {
- float *outPtr= static_cast<float *>(out->mutable_data());
+void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+ Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
for (size_t i = 0; i < num; i++) {
- outPtr[i] = inPtr[i] * x;
+ outPtr[i] = inPtr[i] * inPtr[i];
}
}
template <>
-void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
- Blob *out, Context *ctx) {
- float *outPtr= static_cast<float *>(out->mutable_data());
+void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
+ // CHECK_EQ(ctx->stream, nullptr);
+ float *outPtr = static_cast<float *>(out->mutable_data());
const float *in1Ptr = static_cast<const float *>(in1->data());
const float *in2Ptr = static_cast<const float *>(in2->data());
for (size_t i = 0; i < num; i++) {
- outPtr[i] = in1Ptr[i] * in2Ptr[i];
+ outPtr[i] = in1Ptr[i] - in2Ptr[i];
}
}
+// sum all elements of input into out
+// TODO(wangwei) optimize using omp
template <>
-void Div<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
- Blob *out, Context *ctx) {
- float *outPtr= static_cast<float *>(out->mutable_data());
- const float *in1Ptr = static_cast<const float *>(in1->data());
- const float *in2Ptr = static_cast<const float *>(in2->data());
+void Sum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+ Context *ctx) {
+ float s = 0.f;
+ const float *inPtr = static_cast<const float *>(in->data());
for (size_t i = 0; i < num; i++) {
- CHECK_NE(in2Ptr[i],0.f);
- outPtr[i] = in1Ptr[i] / in2Ptr[i];
+ s += inPtr[i];
}
+ *out = s;
}
template <>
-void Div<float, lang::Cpp>(const size_t num, const float x, const Blob *in,
- Blob *out, Context *ctx) {
- float *outPtr= static_cast<float *>(out->mutable_data());
+void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+ Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
for (size_t i = 0; i < num; i++) {
- CHECK_NE(inPtr[i],0.f);
- outPtr[i] = x / inPtr[i];
+ outPtr[i] = tanh(inPtr[i]);
}
}
+// =========Matrix operations ================================================
+
template <>
-void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
- Blob *out, Context *ctx) {
- float *outPtr= static_cast<float *>(out->mutable_data());
- const float *in1Ptr = static_cast<const float *>(in1->data());
- const float *in2Ptr = static_cast<const float *>(in2->data());
- for (size_t r = 0; r < m ; r++) {
- size_t offset = r * n;
- for (size_t c = 0; c < n; c++) {
- outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
- }
- }
+void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+ const Blob *A, const Blob *v, Blob *out,
+ Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *APtr = static_cast<const float *>(A->data());
+ const float *vPtr = static_cast<const float *>(v->data());
+ for (size_t r = 0; r < nrow; r++) {
+ size_t offset = r * ncol;
+ for (size_t c = 0; c < ncol; c++) {
+ outPtr[offset + c] = APtr[offset + c] + vPtr[r];
+ }
+ }
}
template <>
-void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
- Blob *out, Context *ctx) {
+void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+ const Blob *A, const Blob *v, Blob *out,
+ Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr[i] < x) ? 1.f : 0.f;
+ const float *APtr = static_cast<const float *>(A->data());
+ const float *vPtr = static_cast<const float *>(v->data());
+ for (size_t r = 0; r < nrow; r++) {
+ size_t offset = r * ncol;
+ for (size_t c = 0; c < ncol; c++) {
+ outPtr[offset + c] = APtr[offset + c] + vPtr[c];
+ }
}
}
-
template <>
-void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
- Blob *out, Context *ctx) {
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
+ const Blob *in2, Blob *out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+ const float *in1Ptr = static_cast<const float *>(in1->data());
+ const float *in2Ptr = static_cast<const float *>(in2->data());
+ for (size_t r = 0; r < m; r++) {
+ size_t offset = r * n;
+ for (size_t c = 0; c < n; c++) {
+ outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
+ }
}
}
-
template <>
-void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
- Blob *out, Context *ctx) {
+void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+ const Blob *in, Blob *out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+ float *bPtr = new float[ncol];
+ for (size_t r = 0; r < nrow; r++) {
+ size_t offset = r * ncol;
+ float denom = 0.f;
+ for (size_t c = 0; c < ncol; c++) {
+ bPtr[c] = exp(inPtr[offset + c]);
+ denom += bPtr[c];
+ }
+ for (size_t c = 0; c < ncol; c++) {
+ size_t idx = offset + c;
+ outPtr[idx] = bPtr[c] / denom;
+ }
}
+ delete bPtr;
}
template <>
-void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
- Blob *out, Context *ctx) {
+void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+ const Blob *in, Blob *out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+ for (size_t c = 0; c < ncol; c++) {
+ outPtr[c] = 0.f;
+ }
+ for (size_t r = 0; r < nrow; r++) {
+ size_t offset = r * ncol;
+ for (size_t c = 0; c < ncol; c++) {
+ outPtr[c] += inPtr[offset + c];
+ }
}
}
template <>
-void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, Context *ctx) {
- size_t maxPos = 0;
- float maxVal = 0;
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- if (i == 0) {
- maxVal = inPtr[i];
- }
- else if (inPtr[i] > maxVal) {
- maxVal = inPtr[i];
- maxPos = i;
- }
- }
- *out = maxPos;
-}
-
-template <>
-void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, Context *ctx) {
- size_t minPos = 0;
- float minVal = 0;
+void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+ const Blob *in, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- if (i == 0) {
- minVal = inPtr[i];
- }
- else if (inPtr[i] > minVal) {
- minVal = inPtr[i];
- minPos = i;
- }
- }
- *out = minPos;
+ for (size_t r = 0; r < nrow; r++) {
+ size_t offset = r * ncol;
+ outPtr[r] = 0.f;
+ for (size_t c = 0; c < ncol; c++) {
+ outPtr[r] += inPtr[offset + c];
+ }
+ }
}
+// ===============Random operations==========================================
template <>
-void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, Context *ctx) {
- float sum = 0;
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- sum += fabs(inPtr[i]);
- }
+void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out,
+ Context *ctx) {
+ std::bernoulli_distribution distribution(p);
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
+ }
}
template <>
-void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
- Blob *out, Context *ctx) {
+void Gaussian<float, lang::Cpp>(const size_t num, const float mean,
+ const float std, Blob *out, Context *ctx) {
+ std::normal_distribution<float> distribution(mean, std);
float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] += alpha * inPtr[i];
- }
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
+ }
}
-
template <>
-void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] *= x;
- }
+void Uniform<float, lang::Cpp>(const size_t num, const float low,
+ const float high, Blob *out, Context *ctx) {
+ std::uniform_real_distribution<float> distribution(low, high);
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
+ }
}
-//template <>
-//void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
-// float *out, Context *ctx) {
-// float sum = 0;
-// const float *in1Ptr = static_cast<const float *>(in1->data());
-// const float *in2Ptr = static_cast<const float *>(in2->data());
-// for (size_t i = 0; i < num; i++) {
-// sum += in1Ptr[i] * in2Ptr[i];
-// }
-//}
-
-template <>
-void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n, const float alpha,
- const Blob *A, const Blob *v, const float beta,
- Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float* APtr = static_cast<const float *>(A->data());
- const float* vPtr = static_cast<const float *>(v->data());
- for (size_t r = 0; r < m; r++) {
- float sum = 0;
- for (size_t c = 0; c < n; c++) {
- size_t idx = trans ? c * m + r : r * n + c;
- sum += APtr[idx] * vPtr[c];
- }
- outPtr[r] = alpha * sum + beta * outPtr[r];
- }
-}
+// ====================Blas operations======================================
template <>
void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
@@ -491,37 +438,21 @@ void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
}
}
+#ifdef USE_CBLAS
template <>
-void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out, Context *ctx) {
- std::bernoulli_distribution distribution(p);
+void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
+ Blob *out, Context *ctx) {
+ const float *inPtr = static_cast<const float *>(in->data());
float *outPtr = static_cast<float *>(out->mutable_data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
- }
+ cblas_saxpy(num, alpha, inPtr, 1, outPtr, 1);
}
-
-template <>
-void Uniform<float, lang::Cpp>(const size_t num, const float low, const float high, Blob *out,
- Context *ctx) {
- std::uniform_real_distribution<float> distribution(low, high);
- float *outPtr= static_cast<float *>(out->mutable_data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
- }
-}
-
template <>
-void Gaussian<float, lang::Cpp>(const size_t num, const float mean, const float std, Blob *out,
- Context *ctx) {
- std::normal_distribution<float> distribution(mean, std);
+void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+ Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
- for (size_t i = 0; i < num; i++) {
- outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
- }
+ cblas_sscal(num, x, outPtr, 1);
}
-
-#ifdef USE_CBLAS
template <>
void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
float *out, Context *ctx) {
@@ -529,6 +460,21 @@ void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
const float *in2Ptr = static_cast<const float *>(in2->data());
*out = cblas_sdot(num, in1Ptr, 1, in2Ptr, 1);
}
+template <>
+void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+ const float alpha, const Blob *A, const Blob *v,
+ const float beta, Blob *out, Context *ctx) {
+ const float *APtr = static_cast<const float *>(A->data());
+ const float *vPtr = static_cast<const float *>(v->data());
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ if (!trans) {
+ cblas_sgemv(CblasRowMajor, CblasNoTrans, m, n, alpha, APtr, n, vPtr, 1,
+ beta, outPtr, 1);
+ } else {
+ cblas_sgemv(CblasRowMajor, CblasTrans, n, m, alpha, APtr, m, vPtr, 1, beta,
+ outPtr, 1);
+ }
+}
template <>
void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
@@ -548,6 +494,98 @@ void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
lda, BPtr, ldb, beta, CPtr, ldc);
}
+#else
+
+template <>
+void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+ Context *ctx) {
+ size_t maxPos = 0;
+ float maxVal = 0;
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ if (i == 0) {
+ maxVal = inPtr[i];
+ } else if (inPtr[i] > maxVal) {
+ maxVal = inPtr[i];
+ maxPos = i;
+ }
+ }
+ *out = maxPos;
+}
+template <>
+void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+ Context *ctx) {
+ size_t minPos = 0;
+ float minVal = 0;
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ if (i == 0) {
+ minVal = inPtr[i];
+ } else if (inPtr[i] > minVal) {
+ minVal = inPtr[i];
+ minPos = i;
+ }
+ }
+ *out = minPos;
+}
+
+template <>
+void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+ Context *ctx) {
+ float sum = 0;
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ sum += fabs(inPtr[i]);
+ }
+}
+
+template <>
+void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] += alpha * inPtr[i];
+ }
+}
+
+template <>
+void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+ Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] *= x;
+ }
+}
+
+template <>
+void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+ float *out, Context *ctx) {
+ float sum = 0;
+ const float *in1Ptr = static_cast<const float *>(in1->data());
+ const float *in2Ptr = static_cast<const float *>(in2->data());
+ for (size_t i = 0; i < num; i++) {
+ sum += in1Ptr[i] * in2Ptr[i];
+ }
+}
+
+template <>
+void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
+ const float alpha, const Blob *A, const Blob *v,
+ const float beta, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *APtr = static_cast<const float *>(A->data());
+ const float *vPtr = static_cast<const float *>(v->data());
+ for (size_t r = 0; r < m; r++) {
+ float sum = 0;
+ for (size_t c = 0; c < n; c++) {
+ size_t idx = trans ? c * m + r : r * n + c;
+ sum += APtr[idx] * vPtr[c];
+ }
+ outPtr[r] = alpha * sum + beta * outPtr[r];
+ }
+}
+
#endif // USE_CBLAS
} // namespace singa
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index 4a2ba66..f9841a3 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -26,75 +26,100 @@
#include "singa/core/common.h"
namespace singa {
-
-// TODO(wangwei) Clean implementations following comments in tensor_math_cpp.h.
-// TODO(wangwei) optimize using stream
+// =================Elementwise operations===================================
template <>
-void Add<float, lang::Cuda>(int count, const Blob *lhs, const Blob *rhs,
- Blob *ret, Context *ctx) {
- const float *a = static_cast<const float *>(lhs->data());
- const float *b = static_cast<const float *>(rhs->data());
- float *c = static_cast<float *>(ret->mutable_data());
- cuda::add(count, a, b, c);
+void Add<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
+ const float *in1Ptr = static_cast<const float *>(in1->data());
+ const float *in2Ptr = static_cast<const float *>(in2->data());
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ cuda::add(num, in1Ptr, in2Ptr, outPtr);
}
-// TODO(wangwei) optimize using stream
+// follow the consistency guide of math API
template <>
-void Sub<float, lang::Cuda>(int count, const Blob *lhs, const Blob *rhs,
- Blob *ret, Context *ctx) {
- const float *a = static_cast<const float *>(lhs->data());
- const float *b = static_cast<const float *>(rhs->data());
- float *c = static_cast<float *>(ret->mutable_data());
- cuda::sub(count, a, b, c);
+void Div<float, lang::Cuda>(const size_t num, const float x, const Blob *in,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ cuda::Div(num, x, inPtr, outPtr, ctx->stream);
}
template <>
-void EltwiseMult<float, lang::Cuda>(int count, const Blob *input, float x,
- Blob *ret, Context *ctx) {
- float *dptr = static_cast<float *>(ret->mutable_data());
- const float *lptr = static_cast<const float *>(input->data());
- cuda::mult(count, lptr, x, dptr);
+void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob *in,
+ const float x, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ cuda::mult(num, inPtr, x, outPtr);
}
-// TODO(wangwei) optimize using stream
template <>
-void Square<float, lang::Cuda>(int count, const Blob *input, Blob *ret,
- Context *ctx) {
- const float *in = static_cast<const float *>(input->data());
- float *out = static_cast<float *>(ret->mutable_data());
- cuda::square(count, in, out);
+void GE<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ cuda::GE(num, inPtr, x, outPtr, ctx->stream);
}
-
-// sum all elements of input into ret
-// TODO(wangwei) optimize using stream
template <>
-void Sum<float, lang::Cuda>(int count, const Blob *input, float *ret,
- Context *ctx) {
- const float *in = static_cast<const float *>(input->data());
- cuda::sum(count, in, ret);
+void GT<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ cuda::GT(num, inPtr, x, outPtr, ctx->stream);
}
-
-// follow the consistency guide of math API
template <>
-void Div<float, lang::Cuda>(const size_t num, const float alpha, const Blob *in,
- Blob *out, Context *ctx) {
+void LE<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
+ Blob *out, Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
- cuda::Div(num, alpha, inPtr, outPtr, ctx->stream);
+ cuda::LE(num, inPtr, x, outPtr, ctx->stream);
+}
+template <>
+void LT<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ cuda::LT(num, inPtr, x, outPtr, ctx->stream);
}
-
template <>
void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out,
Context *ctx) {
float *outPtr = static_cast<float *>(out->mutable_data());
cuda::Set(num, x, outPtr, ctx->stream);
}
+// TODO(wangwei) optimize using stream
+template <>
+void Square<float, lang::Cuda>(const size_t num, const Blob *in, Blob *out,
+ Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ cuda::square(num, inPtr, outPtr);
+}
+// TODO(wangwei) optimize using stream
+template <>
+void Sub<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *in1Ptr = static_cast<const float *>(in1->data());
+ const float *in2Ptr = static_cast<const float *>(in2->data());
+ cuda::sub(num, in1Ptr, in2Ptr, outPtr);
+}
+// sum all elements of input into out
+// TODO(wangwei) optimize using stream
+template <>
+void Sum<float, lang::Cuda>(const size_t num, const Blob *in, float *out,
+ Context *ctx) {
+ const float *inPtr = static_cast<const float *>(in->data());
+ cuda::sum(num, inPtr, out);
+}
+
+// =========================Blas operations==================================
// NOTE: cublas uses column major order.
// http://peterwittek.com/cublas-matrix-c-style.html
template <>
void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
const size_t ncol, const Blob *M, const Blob *v,
Blob *out, Context *ctx) {
- auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+ auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
const float *MPtr = static_cast<const float *>(M->data());
const float *vPtr = static_cast<const float *>(v->data());
float *outPtr = static_cast<float *>(out->mutable_data());
@@ -106,6 +131,22 @@ void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
vPtr, 1, outPtr, ncol));
}
}
+template <>
+void GEMV<float, lang::Cuda>(bool trans, const size_t m, const size_t n,
+ const float alpha, const Blob *A, const Blob *v,
+ const float beta, Blob *out, Context *ctx) {
+ const float *APtr = static_cast<const float *>(A->data());
+ const float *vPtr = static_cast<const float *>(v->data());
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+ if (!trans)
+ CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr,
+ 1, &beta, outPtr, 1));
+ else
+ CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_N, m, n, &alpha, APtr, m, vPtr,
+ 1, &beta, outPtr, 1));
+}
+
// http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm
template <>
void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
@@ -121,44 +162,11 @@ void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
const float *APtr = static_cast<const float *>(A->data());
const float *BPtr = static_cast<const float *>(B->data());
float *CPtr = static_cast<float *>(C->mutable_data());
- auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+ auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
}
-template <>
-void GE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
- Blob* out, Context *ctx) {
- float* outPtr = static_cast<float*>(out->mutable_data());
- const float* inPtr = static_cast<const float*>(in->data());
- cuda::GE(num, inPtr, x, outPtr, ctx->stream);
-}
-template <>
-void GT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
- Blob* out, Context *ctx) {
- float* outPtr = static_cast<float*>(out->mutable_data());
- const float* inPtr = static_cast<const float*>(in->data());
- cuda::GT(num, inPtr, x, outPtr, ctx->stream);
-}
-template <>
-void LE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
- Blob* out, Context *ctx) {
- float* outPtr = static_cast<float*>(out->mutable_data());
- const float* inPtr = static_cast<const float*>(in->data());
- cuda::LE(num, inPtr, x, outPtr, ctx->stream);
-}
-template <>
-void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
- Blob* out, Context *ctx) {
- float* outPtr = static_cast<float*>(out->mutable_data());
- const float* inPtr = static_cast<const float*>(in->data());
- cuda::LT(num, inPtr, x, outPtr, ctx->stream);
-}
-
-
-
-
-
} // namespace singa
#endif // USE_CUDA
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 823445f..94ca283 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -117,12 +117,11 @@ TEST_F(TestTensorMath, MemberTanh) {
}
TEST_F(TestTensorMath, Sum) {
- Tensor p1(Shape{1,2});
- p1 = Sum(e, 0);
+ Tensor p1 = Sum(e, 0);
const float *dptr1 = p1.data<const float *>();
EXPECT_FLOAT_EQ(9.0f,dptr1[0]);
EXPECT_FLOAT_EQ(12.0f,dptr1[1]);
-
+
Tensor p2(Shape{3,1});
p2 = Sum(e, 1);
const float *dptr2 = p2.data<const float *>();
@@ -143,9 +142,9 @@ TEST_F(TestTensorMath, SoftMax) {
EXPECT_NEAR(exp(2)/sum, dptr1[1],1e-5);
EXPECT_NEAR(exp(4)/sum, dptr1[3],1e-5);
EXPECT_NEAR(exp(6)/sum, dptr1[5],1e-5);
-
+
Tensor p2(Shape{3,2});
- p2 = SoftMax(e,1);
+ p2 = SoftMax(e,1);
const float *dptr2 = p2.data<const float *>();
EXPECT_NEAR(exp(1)/(exp(1)+exp(2)),dptr2[0], 1e-5);
EXPECT_NEAR(exp(2)/(exp(1)+exp(2)),dptr2[1], 1e-5);
@@ -237,12 +236,12 @@ TEST_F(TestTensorMath, MemberDiv) {
TEST_F(TestTensorMath, MemberBernoulli) {
Tensor p1(Shape{10000});
- Bernoulli(0.3,&p1);
+ Bernoulli(0.3f, &p1);
const float* dptr1 = p1.data<const float*>();
float sum = 0;
for(int i = 0; i < 10000; i++) sum += dptr1[i];
float mean = sum/10000;
- EXPECT_NEAR(mean, 0.3, 1e-2);
+ EXPECT_NEAR(mean, 0.3f, 1e-2);
sum = 0;
for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
@@ -267,7 +266,7 @@ TEST_F(TestTensorMath, MemberUniform) {
TEST_F(TestTensorMath, MemberGaussian) {
Tensor p1(Shape{50000});
- Gaussian(0.0,1.0,&p1);
+ Gaussian(0.0f,1.0f,&p1);
const float* dptr1 = p1.data<const float*>();
float sum = 0;
for(int i = 0; i < 50000; i++) sum += dptr1[i];
[2/5] incubator-singa git commit: SINGA-182 Clean math function APIs
and implementations
Posted by zh...@apache.org.
SINGA-182 Clean math function APIs and implementations
Merge branch 'cuda' from #jinyangturbo.
Clean the cuda related code (tensor_math_cuda.h, math_kernel.h and math_kernel.cu)
by unifying the function arguments (names and arg order).
Need to reorder the functions.
Add Nrm2 for L2 norm using cblas and cublas.
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/6d69047a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/6d69047a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/6d69047a
Branch: refs/heads/dev
Commit: 6d69047addc46e5c9f381b7e1d4cebd20ce9b2e3
Parents: 564c88a
Author: Wei Wang <wa...@comp.nus.edu.sg>
Authored: Sun Jun 12 12:08:48 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Jun 12 12:15:11 2016 +0800
----------------------------------------------------------------------
include/singa/core/tensor.h | 2 +
src/core/tensor/math_kernel.cu | 656 +++++++++++++++++---------------
src/core/tensor/math_kernel.h | 93 ++---
src/core/tensor/tensor.cc | 14 +
src/core/tensor/tensor_math.h | 140 ++++---
src/core/tensor/tensor_math_cpp.h | 227 ++++++-----
src/core/tensor/tensor_math_cuda.h | 384 +++++++++++++++----
test/singa/test_tensor_math.cc | 346 ++++++++---------
8 files changed, 1092 insertions(+), 770 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 82bbe81..cd750c5 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -173,6 +173,8 @@ class Tensor {
template <typename SType>
Tensor &operator/=(const SType x);
+ float L2() const;
+
protected:
bool transpose_ = false;
DataType data_type_ = kFloat32;
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/math_kernel.cu
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.cu b/src/core/tensor/math_kernel.cu
index aed6add..b618f9b 100644
--- a/src/core/tensor/math_kernel.cu
+++ b/src/core/tensor/math_kernel.cu
@@ -35,36 +35,16 @@
namespace singa {
// Cuda Kernel Functions
namespace cuda {
-__global__ void kernel_softmax_loss(const float *prob, const int *label,
- float *loss, int n, int dim) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- float prob_of_truth = prob[index * dim + label[index]];
- loss[index] -= std::log(max(prob_of_truth, FLT_MIN));
- }
-}
-
-__global__ void kernel_softmax_gradient(float *grad, const int *label, int n,
- int dim, float scale) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- int pos = index * dim + label[index];
- grad[pos] = (grad[pos] - 1.0f) * scale;
- }
-}
-
-__global__ void kernel_sum_vec(const float *data, float *sum, int n) {
+__global__ void KernelSum(const size_t n, const float *in, float *out) {
int THREADS = blockDim.x;
__shared__ float aux[CU1DBLOCK];
int steps = (n - 1) / THREADS + 1;
- aux[threadIdx.x] = data[threadIdx.x];
+ aux[threadIdx.x] = in[threadIdx.x];
for (int i = 1; i < steps; ++i) {
if (threadIdx.x + i * THREADS < n) {
- aux[threadIdx.x] += data[threadIdx.x + i * THREADS];
+ aux[threadIdx.x] += in[threadIdx.x + i * THREADS];
}
}
@@ -83,432 +63,484 @@ __global__ void kernel_sum_vec(const float *data, float *sum, int n) {
}
__syncthreads();
- *sum = aux[0];
+ *out = aux[0];
}
-__global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
- int rows, int cols, int stride) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < rows; index += num_threads) {
- dst_vec_data[index] = 0.0f;
- for (int k = 0; k < cols; k++) {
- dst_vec_data[index] += src_mat_data[index * stride + k];
- }
+__global__ void KernelAdd(const size_t n, const float *in1, const float *in2,
+ float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = in1[i] + in2[i];
}
}
-__global__ void kernel_sum_row(const float *src_mat_data, float *dst_vec_data,
- int rows, int cols, int stride) {
- int j = blockIdx.x;
- int THREADS = blockDim.x;
- if (j >= cols) {
- return;
- }
-
- __shared__ float aux[CU1DBLOCK];
- int steps = (rows - 1) / THREADS + 1;
- aux[threadIdx.x] = src_mat_data[j + threadIdx.x * stride];
- for (int i = 1; i < steps; ++i) {
- if (threadIdx.x + i * THREADS < rows) {
- aux[threadIdx.x] +=
- src_mat_data[j + (threadIdx.x + i * THREADS) * stride];
- }
+__global__ void KernelAdd(const size_t n, const float *in, const float x,
+ float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = in[i] + x;
}
+}
- int total_threads = THREADS;
- __syncthreads();
- while (total_threads > 1) {
- int half_point = ((1 + total_threads) >> 1);
- if (threadIdx.x < half_point) {
- if (threadIdx.x + half_point < total_threads) {
- aux[threadIdx.x] += aux[threadIdx.x + half_point];
- }
- }
- __syncthreads();
- total_threads = ((total_threads + 1) >> 1);
+__global__ void KernelSub(const size_t n, const float *in1, const float *in2,
+ float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = in1[i] - in2[i];
}
-
- __syncthreads();
- dst_vec_data[j] = aux[0];
}
-__global__ void kernel_add_vec_row(const float *src_vec_data,
- const float *src_mat_data,
- float *des_mat_data, int rows, int cols,
- int stride) {
- int i = blockIdx.x * blockDim.x + threadIdx.x;
- int j = blockIdx.y * blockDim.y + threadIdx.y;
- int num_threads_x = blockDim.x * gridDim.x;
- int num_threads_y = blockDim.y * gridDim.y;
- int index = 0;
- for (; i < cols && j < rows; i += num_threads_x, j += num_threads_y) {
- index = j * stride + i;
- des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
+__global__ void KernelExp(const size_t n, const float *in, float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = std::exp(in[i]);
}
}
-__global__ void kernel_add(const float *src1, const float *src2, float *out,
- int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- out[index] = src1[index] + src2[index];
+
+__global__ void KernelLog(const size_t n, const float *in, float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = std::log(in[i]);
}
}
-__global__ void kernel_sub(const float *src1, const float *src2, float *out,
- int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- out[index] = src1[index] - src2[index];
+__global__ void KernelSigmoid(const size_t n, const float *in, float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = 1.0f / (1.0f + expf(-in[i]));
}
}
-__global__ void kernel_exp(const float *src_data, float *des_data, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = std::exp(src_data[index]);
+__global__ void KernelSign(const size_t n, const float *in, float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ if (in[i] > 0.0f)
+ out[i] = 1.0f;
+ else if (in[i] < 0.0f)
+ out[i] = -1.0f;
+ else
+ out[i] = 0.0f;
}
}
-__global__ void kernel_log(const float *src_data, float *des_data, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = std::log(src_data[index]);
+__global__ void KernelClamp(const size_t n, const float low, const float high,
+ const float *in, float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ if (in[i] > high)
+ out[i] = high;
+ else if (in[i] < low)
+ out[i] = low;
+ else
+ out[i] = in[i];
}
}
-__global__ void kernel_sigmoid(const float *src_data, float *des_data, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+__global__ void KernelRelu(const size_t n, const float *in, float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = max(in[i], 0.0f);
}
}
-__global__ void kernel_sigmoid_grad(const float *src_data, float *des_data,
- int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = src_data[index] * (1.0f - src_data[index]);
+__global__ void KernelAbs(const size_t n, const float *in, float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = max(in[i], -in[i]);
}
}
-__global__ void kernel_relu(const float *src_data, float *des_data, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = max(src_data[index], 0.0f);
+__global__ void KernelTanh(const size_t n, const float *in, float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = tanhf(in[i]);
}
}
-__global__ void kernel_relu_grad(const float *src_data, float *des_data,
- int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = src_data[index] > 0.0f ? 1.0f : 0.0f;
+__global__ void KernelSoftplus(const size_t n, const float *in, float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = logf(1 + expf(in[i]));
}
}
-
-__global__ void kernel_tanh(const float *src_data, float *des_data, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = tanhf(src_data[index]);
+__global__ void KernelSquare(const size_t n, const float *in, float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = in[i] * in[i];
}
}
-
-__global__ void kernel_tanh_grad(const float *src_data, float *des_data,
- int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = (1.0f - src_data[index] * src_data[index]);
+__global__ void KernelSqrt(const size_t n, const float *in, float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = std::sqrt(in[i]);
}
}
-__global__ void kernel_softplus(const float *src_data, float *des_data, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = logf(1 + expf(src_data[index]));
+__global__ void KernelPow(const size_t n, const float *in1, const float *in2,
+ float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = std::pow(in1[i], in2[i]);
}
}
-__global__ void kernel_softplus_grad(const float *src_data, float *des_data,
- int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+__global__ void KernelPow(const size_t n, const float *in, const float x,
+ float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = std::pow(in[i], x);
}
}
-__global__ void kernel_square(const float *src_data, float *des_data, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = src_data[index] * src_data[index];
+__global__ void KernelMult(const size_t n, const float *in1, const float *in2,
+ float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = in1[i] * in2[i];
}
}
-__global__ void kernel_square_grad(const float *src_data, float *des_data,
- int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = 2 * src_data[index];
+__global__ void KernelMult(const size_t n, const float *in, const float x,
+ float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = in[i] * x;
}
}
-__global__ void kernel_sqrt(const float *src_data, float *des_data, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = std::sqrt(src_data[index]);
+__global__ void KernelDiv(const size_t n, const float *in1, const float *in2,
+ float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = in1[i] / in2[i];
}
}
-
-__global__ void kernel_pow(const float *src_data_a, const float *src_data_b,
- float *des_data, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = std::pow(src_data_a[index], src_data_b[index]);
+__global__ void KernelDiv(const size_t n, const float x, const float *in,
+ float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = x / in[i];
}
}
-
-__global__ void kernel_mult(const float *src_data_a, const float *src_data_b,
- float *des_data, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = src_data_a[index] * src_data_b[index];
+__global__ static void KernelSet(const size_t n, const float x, float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = x;
}
}
-__global__ void kernel_mult(const float *src_data_a, const float x,
- float *des_data, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = src_data_a[index] * x;
+__global__ void KernelThreshold(const size_t n, const float x, const float *in,
+ float *out) {
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+ i += blockDim.x * gridDim.x) {
+ out[i] = in[i] < x ? 1.0f : 0.0f;
}
}
-__global__ void kernel_div(const float *src_data_a, const float *src_data_b,
- float *des_data, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = src_data_a[index] / src_data_b[index];
+__global__ void KernelGE(const int num, const float *in, const float x,
+ float *out) {
+ for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+ idx += blockDim.x * gridDim.x) {
+ out[idx] = in[idx] >= x ? 1.0f : 0.0f;
}
}
-
-__global__ static void kernel_set_value(float *data, float value, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- data[index] = value;
+__global__ void KernelGT(const int num, const float *in, const float x,
+ float *out) {
+ for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+ idx += blockDim.x * gridDim.x) {
+ out[idx] = in[idx] > x ? 1.0f : 0.0f;
}
}
-
-__global__ void kernel_threshold(const float *src_data, float *des_data,
- float alpha, int n) {
- int index = blockIdx.x * blockDim.x + threadIdx.x;
- int num_threads = blockDim.x * gridDim.x;
- for (; index < n; index += num_threads) {
- des_data[index] = src_data[index] < alpha ? 1.0f : 0.0f;
+__global__ void KernelLE(const int num, const float *in, const float x,
+ float *out) {
+ for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+ idx += blockDim.x * gridDim.x) {
+ out[idx] = in[idx] <= x ? 1.0f : 0.0f;
}
}
-void sum(int n, const float *in, float *out) {
- int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
- // here, we only need one block
- int num_blocks = 1;
- kernel_sum_vec << <num_blocks, threads_per_block>>> (in, out, n);
+__global__ void KernelLT(const int num, const float *in, const float x,
+ float *out) {
+ for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
+ idx += blockDim.x * gridDim.x) {
+ out[idx] = in[idx] < x ? 1.0f : 0.0f;
+ }
}
-void sum_row(int rows, int cols, int stride, const float *in, float *out) {
- int threads_per_block = rows > CU1DBLOCK ? CU1DBLOCK : rows;
- int num_blocks = cols;
+// ********************************
+// Functions call kernels
+// ********************************
- kernel_sum_row << <num_blocks, threads_per_block>>>
- (in, out, rows, cols, stride);
+void set(const size_t n, const float v, float *out, cudaStream_t s) {
+ KernelSet <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, v, out);
}
-void sum_col(int rows, int cols, int stride, const float *in, float *out) {
- int threads_per_block = cols > CU1DBLOCK ? CU1DBLOCK : cols;
- int num_blocks = rows;
+void abs(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelAbs <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
- kernel_sum_col << <num_blocks, threads_per_block>>>
- (in, out, rows, cols, stride);
+void sign(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelSign <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
}
-void add_row(int rows, int cols, int stride, const float *in_row,
- const float *in_mat, float *out) {
- dim3 threads_per_block(CU2DBLOCK_X, CU2DBLOCK_Y);
- dim3 num_blocks(
- cols / threads_per_block.x + (cols % threads_per_block.x == 0 ? 0 : 1),
- rows / threads_per_block.y + (rows % threads_per_block.y == 0 ? 0 : 1));
- kernel_add_vec_row << <num_blocks, threads_per_block>>>
- (in_row, in_mat, out, rows, cols, stride);
+
+void exp(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelExp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
}
-void add(int n, const float *a, const float *b, float *out) {
- kernel_add << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+
+void log(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelLog <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
}
-void sub(int n, const float *a, const float *b, float *out) {
- kernel_sub << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+
+void sqrt(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelSqrt <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
}
-void exp(int n, const float *in, float *out) {
- kernel_exp << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+
+void square(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelSquare <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
}
-void log(int n, const float *in, float *out) {
- kernel_log << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void tanh(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelTanh <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
}
-void sigmoid(int n, const float *in, float *out) {
- kernel_sigmoid << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void relu(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelRelu <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
+void sigmoid(const int n, const float *in, float *out, cudaStream_t s) {
+ KernelSigmoid <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
+void softplus(const size_t n, const float *in, float *out, cudaStream_t s) {
+ KernelSoftplus <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, out);
+}
+void clamp(const size_t n, const float low, const float high, const float *in,
+ float *out, cudaStream_t s) {
+ KernelClamp <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, low, high, in, out);
}
-void sigmoid_grad(int n, const float *in, float *out) {
- kernel_sigmoid_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void pow(const size_t n, const float *in, const float x, float *out,
+ cudaStream_t s) {
+ KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
}
-void relu(int n, const float *in, float *out) {
- kernel_relu << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void add(const size_t n, const float *in, const float x, float *out,
+ cudaStream_t s) {
+ KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
}
-void relu_grad(int n, const float *in, float *out) {
- kernel_relu_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void mult(const size_t n, const float *in, const float x, float *out,
+ cudaStream_t s) {
+ KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in, x, out);
}
-void tanh(int n, const float *in, float *out) {
- kernel_tanh << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void div(const size_t n, const float x, const float *in, float *out,
+ cudaStream_t s) {
+ KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
}
-void tanh_grad(int n, const float *in, float *out) {
- kernel_tanh_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void threshold(const size_t n, const float x, const float *in, float *out,
+ cudaStream_t s) {
+ KernelThreshold <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, x, in, out);
}
-void softplus(int n, const float *in, float *out) {
- kernel_softplus << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void gt(const size_t num, const float *in, const float x, float *out,
+ cudaStream_t s) {
+ KernelGT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void ge(const size_t num, const float *in, const float x, float *out,
+ cudaStream_t s) {
+ KernelGE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void lt(const size_t num, const float *in, const float x, float *out,
+ cudaStream_t s) {
+ KernelLT <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+}
+void le(const size_t num, const float *in, const float x, float *out,
+ cudaStream_t s) {
+ KernelLE <<<ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
}
-void softplus_grad(int n, const float *in, float *out) {
- kernel_softplus_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void pow(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s) {
+ KernelPow <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
}
-void square(int n, const float *in, float *out) {
- kernel_square << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void add(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s) {
+ KernelAdd <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
}
-void square_grad(int n, const float *in, float *out) {
- kernel_square_grad << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void sub(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s) {
+ KernelSub <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
}
-void sqrt(int n, const float *in, float *out) {
- kernel_sqrt << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
+void mult(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s) {
+ KernelMult <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
}
-void pow(int n, const float *a, const float *b, float *out) {
- kernel_pow << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+void div(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s) {
+ KernelDiv <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (n, in1, in2, out);
}
-void mult(int n, const float *a, const float *b, float *out) {
- kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+void sum(const size_t n, const float *in, float *out, cudaStream_t s) {
+ int threads_per_block = n > CU1DBLOCK ? CU1DBLOCK : n;
+ // here, we only need one block
+ int num_blocks = 1;
+ KernelSum <<<num_blocks, threads_per_block>>> (n, in, out);
+}
+/*
+void square_grad(int n, const float *in, float *out, cudaStream_t s) {
+ kernel_square_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
}
-void mult(int n, const float *a, const float x, float *out) {
- kernel_mult << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, x, out, n);
+void tanh_grad(int n, const float *in, float *out, cudaStream_t s) {
+ kernel_tanh_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
}
-void div(int n, const float *a, const float *b, float *out) {
- kernel_div << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (a, b, out, n);
+
+void relu_grad(int n, const float *in, float *out, cudaStream_t s) {
+ kernel_relu_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
}
-void set_value(int n, float v, float *out) {
- kernel_set_value << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (out, v, n);
+
+void sigmoid_grad(int n, const float *in, float *out, cudaStream_t s) {
+ kernel_sigmoid_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
}
-void threshold(int n, float alpha, const float *in, float *out) {
- kernel_threshold << <ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, alpha, n);
+void softplus_grad(int n, const float *in, float *out, cudaStream_t s) {
+ kernel_softplus_grad <<<ceil(n / CU1DBLOCKF), CU1DBLOCKF>>> (in, out, n);
}
-// follow the consistency guide for math API
-__global__ void KernelDiv(const size_t num, const float alpha, const float *in,
- float *out) {
- for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
- idx += blockDim.x * gridDim.x) {
- out[idx] = alpha / in[idx];
+
+__global__ void kernel_sum_col(const float *src_mat_data, float *dst_vec_data,
+ int rows, int cols, int stride) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int num_threads = blockDim.x * gridDim.x;
+ for (; index < rows; index += num_threads) {
+ dst_vec_data[index] = 0.0f;
+ for (int k = 0; k < cols; k++) {
+ dst_vec_data[index] += src_mat_data[index * stride + k];
+ }
}
}
-__global__ void KernelGE(const int num, const float *in, const float x,
- float *out) {
- for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
- idx += blockDim.x * gridDim.x) {
- out[idx] = in[idx] >= x ? 1.0f : 0.0f;
+__global__ void kernel_sum_row(const float *src_mat_data, float *dst_vec_data,
+ int rows, int cols, int stride) {
+ int j = blockIdx.x;
+ int THREADS = blockDim.x;
+ if (j >= cols) {
+ return;
}
-}
-__global__ void KernelGT(const int num, const float *in, const float x,
- float *out) {
- for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
- idx += blockDim.x * gridDim.x) {
- out[idx] = in[idx] > x ? 1.0f : 0.0f;
+
+ __shared__ float aux[CU1DBLOCK];
+ int steps = (rows - 1) / THREADS + 1;
+ aux[threadIdx.x] = src_mat_data[j + threadIdx.x * stride];
+ for (int i = 1; i < steps; ++i) {
+ if (threadIdx.x + i * THREADS < rows) {
+ aux[threadIdx.x] +=
+ src_mat_data[j + (threadIdx.x + i * THREADS) * stride];
+ }
}
-}
-__global__ void KernelLE(const int num, const float *in, const float x,
- float *out) {
- for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
- idx += blockDim.x * gridDim.x) {
- out[idx] = in[idx] <= x ? 1.0f : 0.0f;
+
+ int total_threads = THREADS;
+ __syncthreads();
+ while (total_threads > 1) {
+ int half_point = ((1 + total_threads) >> 1);
+ if (threadIdx.x < half_point) {
+ if (threadIdx.x + half_point < total_threads) {
+ aux[threadIdx.x] += aux[threadIdx.x + half_point];
+ }
+ }
+ __syncthreads();
+ total_threads = ((total_threads + 1) >> 1);
}
+
+ __syncthreads();
+ dst_vec_data[j] = aux[0];
}
-__global__ void KernelLT(const int num, const float *in, const float x,
- float *out) {
- for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
- idx += blockDim.x * gridDim.x) {
- out[idx] = in[idx] < x ? 1.0f : 0.0f;
+
+__global__ void kernel_add_vec_row(const float *src_vec_data,
+ const float *src_mat_data,
+ float *des_mat_data, int rows, int cols,
+ int stride) {
+ int i = blockIdx.x * blockDim.x + threadIdx.x;
+ int j = blockIdx.y * blockDim.y + threadIdx.y;
+ int num_threads_x = blockDim.x * gridDim.x;
+ int num_threads_y = blockDim.y * gridDim.y;
+ int index = 0;
+ for (; i < cols && j < rows; i += num_threads_x, j += num_threads_y) {
+ index = j * stride + i;
+ des_mat_data[index] = src_mat_data[index] + src_vec_data[i];
}
}
-__global__ void KernelSet(const size_t num, const float x, float *out) {
- for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num;
- idx += blockDim.x * gridDim.x) {
- out[idx] = x;
+__global__ void kernel_sigmoid_grad(const float *src_data, float *des_data,
+ int n) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int num_threads = blockDim.x * gridDim.x;
+ for (; index < n; index += num_threads) {
+ des_data[index] = src_data[index] * (1.0f - src_data[index]);
}
}
-void Set(const size_t num, const float x, float *out, cudaStream_t s) {
- KernelSet << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, x, out);
+
+__global__ void kernel_relu_grad(const float *src_data, float *des_data,
+ int n) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int num_threads = blockDim.x * gridDim.x;
+ for (; index < n; index += num_threads) {
+ des_data[index] = src_data[index] > 0.0f ? 1.0f : 0.0f;
+ }
}
-void Div(const size_t num, float alpha, const float *in, float *out,
- cudaStream_t s) {
- KernelDiv << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, alpha, in, out);
+
+__global__ void kernel_tanh_grad(const float *src_data, float *des_data,
+ int n) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int num_threads = blockDim.x * gridDim.x;
+ for (; index < n; index += num_threads) {
+ des_data[index] = (1.0f - src_data[index] * src_data[index]);
+ }
}
-void GT(const size_t num, const float *in, const float x, float *out,
- cudaStream_t s) {
- KernelGT << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+
+__global__ void kernel_softplus_grad(const float *src_data, float *des_data,
+ int n) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int num_threads = blockDim.x * gridDim.x;
+ for (; index < n; index += num_threads) {
+ des_data[index] = 1.0f / (1.0f + expf(-src_data[index]));
+ }
}
-void GE(const size_t num, const float *in, const float x, float *out,
- cudaStream_t s) {
- KernelGE << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+__global__ void KernelSquareGrad(const float *src_data, float *des_data,
+ int n) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int num_threads = blockDim.x * gridDim.x;
+ for (; index < n; index += num_threads) {
+ des_data[index] = 2 * src_data[index];
+ }
}
-void LT(const size_t num, const float *in, const float x, float *out,
- cudaStream_t s) {
- KernelLT << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+__global__ void kernel_softmax_loss(const float *prob, const int *label,
+ float *loss, int n, int dim) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int num_threads = blockDim.x * gridDim.x;
+ for (; index < n; index += num_threads) {
+ float prob_of_truth = prob[index * dim + label[index]];
+ loss[index] -= std::log(max(prob_of_truth, FLT_MIN));
+ }
}
-void LE(const size_t num, const float *in, const float x, float *out,
- cudaStream_t s) {
- KernelLE << <ceil(num / CU1DBLOCKF), CU1DBLOCKF>>> (num, in, x, out);
+__global__ void kernel_softmax_gradient(float *grad, const int *label, int n,
+ int dim, float scale) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ int num_threads = blockDim.x * gridDim.x;
+ for (; index < n; index += num_threads) {
+ int pos = index * dim + label[index];
+ grad[pos] = (grad[pos] - 1.0f) * scale;
+ }
}
+*/
+
} // namespace cuda
} // namespace singa
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/math_kernel.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/math_kernel.h b/src/core/tensor/math_kernel.h
index 5c906a9..d8a58a5 100644
--- a/src/core/tensor/math_kernel.h
+++ b/src/core/tensor/math_kernel.h
@@ -31,65 +31,66 @@ namespace singa {
// TODO(wangwei) make all function templates.
namespace cuda {
-void sum(int n, const float *in, float *out);
-void sum_row(int rows, int cols, int stride, const float *in, float *out);
-
-void sum_col(int rows, int cols, int stride, const float *in, float *out);
-
-void add_row(int rows, int cols, int stride, const float *in_row,
- const float *in_mat, float *out);
-
-void add(int n, const float *a, const float *b, float *out);
-
-void sub(int n, const float *a, const float *b, float *out);
-
-void exp(int n, const float *in, float *out);
-
-void log(int n, const float *in, float *out);
-
-void sigmoid(int n, const float *in, float *out);
-
-void sigmoid_grad(int n, const float *in, float *out);
-
-void relu(int n, const float *in, float *out);
-
-void relu_grad(int n, const float *in, float *out);
-
-void tanh(int n, const float *in, float *out);
-
-void tanh_grad(int n, const float *in, float *out);
+// 0 input
+void set(const size_t n, const float v, float *out, cudaStream_t s);
+
+// 1 input
+void abs(const size_t n, const float *in, float *out, cudaStream_t s);
+void sign(const size_t n, const float *in, float *out, cudaStream_t s);
+void exp(const size_t n, const float *in, float *out, cudaStream_t s);
+void log(const size_t n, const float *in, float *out, cudaStream_t s);
+void sqrt(const size_t n, const float *in, float *out, cudaStream_t s);
+void square(const size_t n, const float *in, float *out, cudaStream_t s);
+void tanh(const size_t n, const float *in, float *out, cudaStream_t s);
+void relu(const size_t n, const float *in, float *out, cudaStream_t s);
+void sigmoid(const int n, const float *in, float *out, cudaStream_t s);
+void softplus(const size_t n, const float *in, float *out, cudaStream_t s);
+void clamp(const size_t n, const float low, const float high, const float *in,
+ float *out, cudaStream_t s);
+
+void pow(const size_t n, const float *in, const float x, float *out,
+ cudaStream_t s);
-void softplus(int n, const float *in, float *out);
+void add(const size_t n, const float *in, const float x, float *out,
+ cudaStream_t s);
-void softplus_grad(int n, const float *in, float *out);
+void mult(const size_t n, const float *in, const float x, float *out,
+ cudaStream_t s);
-void square(int n, const float *in, float *out);
+void div(const size_t n, const float x, const float *in, float *out,
+ cudaStream_t s);
-void square_grad(int n, const float *in, float *out);
+void threshold(const size_t n, const float x, const float *in, float *out,
+ cudaStream_t s);
-void sqrt(int n, const float *in, float *out);
+void gt(const size_t num, const float *in, const float x, float *out,
+ cudaStream_t s);
+void ge(const size_t num, const float *in, const float x, float *out,
+ cudaStream_t s);
+void lt(const size_t num, const float *in, const float x, float *out,
+ cudaStream_t s);
+void le(const size_t num, const float *in, const float x, float *out,
+ cudaStream_t s);
-void pow(int n, const float *a, const float *b, float *out);
+// 2 inputs
+void pow(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s);
-void mult(int n, const float *a, const float *b, float *out);
+void add(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s);
-void mult(int n, const float *a, const float x, float *out);
+void sub(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s);
-void div(int n, const float *a, const float *b, float *out);
+void mult(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s);
-void set_value(int n, float v, float *out);
+void div(const size_t n, const float *in1, const float *in2, float *out,
+ cudaStream_t s);
-void threshold(int n, float alpha, const float *in, float *out);
+void sum(const size_t n, const float *in, float *out, cudaStream_t s);
-// follow the consistency guide for math API
-void Div(const size_t num, const float x, const float *in, float *out,
- cudaStream_t s);
-void Set(const size_t num, const float x, float *out, cudaStream_t s);
-void GT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
-void GE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
-void LT(size_t num, const float *in, const float x, float *out, cudaStream_t s);
-void LE(size_t num, const float *in, const float x, float *out, cudaStream_t s);
} // cuda
} // namespace singa
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index f4e9da2..e62386a 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -219,6 +219,8 @@ GenUnaryScalarArgMemberFn(operator+=, Add);
GenUnaryScalarArgMemberFn(operator*=, EltwiseMult);
GenUnaryScalarArgMemberFn(operator/=, Div);
+
+
// ====================Tensor Operations=======================================
void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
const size_t dst_offset, const size_t src_offset) {
@@ -309,6 +311,18 @@ void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
} while (0)
// =============Element-wise operations====================================
+/// L2 norm, Do not use Nrm2 (name conflict).
+float Tensor::L2() const {
+ float nrm = 0.0f;
+ TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
+ device_->Exec([&nrm, this](Context *ctx) {
+ DType ret;
+ Nrm2<DType, Lang>(this->Size(), this->blob(), &ret, ctx);
+ nrm = TypeCast<DType, float>(ret);
+ }, {this->blob()}, {});
+ });
+ return nrm;
+}
template <typename SType>
void Tensor::SetValue(const SType x) {
CHECK_EQ(sizeof(SType), SizeOf(data_type_));
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index b5d0ba9..b86e1cb 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -48,41 +48,45 @@ namespace singa {
/// 7. Use size_t for the number of elements, rows or columns.
/// 8. Use the same name for the Tensor and Blob level math functions.
-// =============Element-wise operations====================================
+// **************************************
+// Element-wise functions
+// **************************************
+
/// out[i] = |in[i]|
template <typename DType, typename Lang>
void Abs(const size_t num, const Blob *in, Blob *out, Context *ctx) {
LOG(FATAL) << "Abs Not Implemented";
}
-/// out = in + x
+/// out[i] = in[i] + x
template <typename DType, typename Lang>
void Add(const size_t num, const Blob *in, const DType x, Blob *out,
Context *ctx) {
LOG(FATAL) << "Add Not Implemented";
}
-/// out = in1 + in2
+/// out[i] = in1[i] + in2[i]
template <typename DType, typename Lang>
void Add(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
Context *ctx) {
LOG(FATAL) << "Add-Pair Not Implemented";
}
-/// Element-wise operation, clamp every element into [low, high]
-/// if x>high, then x=high; if x<low, then x=low.
+/// Clamp every element into [low, high]
+/// if in[i]>high, then out[i]=high; if in[i]<low, then out[i]=low.
template <typename DType, typename Lang>
void Clamp(const size_t num, const DType low, const DType high, const Blob *in,
Blob *out, Context *ctx) {
LOG(FATAL) << "Clamp Not Implemented";
}
-/// out = x / in
+/// out[i] = x / in[i]
template <typename DType, typename Lang>
void Div(const size_t num, const DType x, const Blob *in, Blob *out,
Context *ctx) {
LOG(FATAL) << "Div Not Implemented";
}
+/// out[i] = in[i] / x
template <typename DType, typename Lang>
void Div(const size_t num, const Blob *in, const DType x, Blob *out,
Context *ctx) {
@@ -90,21 +94,21 @@ void Div(const size_t num, const Blob *in, const DType x, Blob *out,
EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
}
-/// out = in1 / in2
+/// out[i] = in1[i] / in2[i]
template <typename DType, typename Lang>
void Div(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
Context *ctx) {
LOG(FATAL) << "Div-Pair Not Implemented";
}
-/// out = in * x
+/// out[i] = in[i] * x
template <typename DType, typename Lang>
void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
Context *ctx) {
LOG(FATAL) << "EltwiseMult Not Implemented";
}
-/// out = in2 * in2
+/// out[i] = in1[i] * in2[i]
template <typename DType, typename Lang>
void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
Context *ctx) {
@@ -146,31 +150,32 @@ void GT(const size_t num, const Blob *in, const DType x, Blob *out,
Context *ctx) {
LOG(FATAL) << "GT Not Implemented";
}
-/// Element-wise operation, do v^x for every v from the in tensor
+/// out[i] = pow(in[i], x)
template <typename DType, typename Lang>
void Pow(const size_t num, const Blob *in, const DType x, Blob *out,
Context *ctx) {
LOG(FATAL) << "Pow Not Implemented";
}
-/// Element-wise operation, do v^x for every v from the lhs and every x from rhs
+/// out[i]=pow(in1[i], in2[i])
template <typename DType, typename Lang>
void Pow(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
Context *ctx) {
LOG(FATAL) << "Pow-Pair Not Implemented";
}
-/// Element-wise operation, out[i]=max(0, in[i])
+/// out[i]=max(0, in[i])
template <typename DType, typename Lang>
void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
LOG(FATAL) << "ReLU Not Implemented";
}
+/// out[i] = x
template <typename DType, typename Lang>
void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
LOG(FATAL) << "Set Not Implemented";
}
-/// Element-wise operation, out[i]=sigmoid([in[i])
+/// out[i]=sigmoid(in[i])
template <typename DType, typename Lang>
void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
LOG(FATAL) << "Sigmoid Not Implemented";
@@ -181,85 +186,47 @@ template <typename DType, typename Lang>
void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
LOG(FATAL) << "Sign Not Implemented";
}
-/// Element-wise operation, out[i]=sqrt([in[i])
+/// out[i]=sqrt(in[i])
template <typename DType, typename Lang>
void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
LOG(FATAL) << "Sqrt Not Implemented";
}
-/// Element-wise operation, out[i]=square([in[i])
+/// out[i]=square(in[i])
template <typename DType, typename Lang>
void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
- LOG(FATAL) << "Square Not Implemented";
+ EltwiseMult<DType, Lang>(num, in, in, out, ctx);
}
-/// out = in - x
+/// out[i] = in[i] - x
template <typename DType, typename Lang>
void Sub(const size_t num, const Blob *in, const DType x, Blob *out,
Context *ctx) {
Add<DType, Lang>(num, in, -x, out, ctx);
}
-/// out = in1 - in2
+/// out[i] = in1[i] - in2[i]
template <typename DType, typename Lang>
void Sub(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
Context *ctx) {
LOG(FATAL) << "Sub-Pair Not Implemented";
}
+
/// sum all elements of in into out
template <typename DType, typename Lang>
void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
LOG(FATAL) << "Sum Not Implemented";
}
-/// Element-wise operation, out[i]=tanh([in[i])
+/// out[i]=tanh(in[i])
template <typename DType, typename Lang>
void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
LOG(FATAL) << "Tanh Not Implemented";
}
-// =========== Matrix operations ===========================================
-/// Add the vector v to every column of A as the column of out
-template <typename DType, typename Lang>
-void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
- Blob *out, Context *ctx) {
- LOG(FATAL) << "AddCol Not Implemented";
-}
-// TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of out
-template <typename DType, typename Lang>
-void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
- Blob *out, Context *ctx) {
- LOG(FATAL) << "AddRow Not Implemented";
-}
-/// outer-product.
-/// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
-template <typename DType, typename Lang>
-void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
- Blob *out, Context *ctx) {
- LOG(FATAL) << "Outer Not Implemented";
-}
-// Do softmax for each row invidually
-template <typename DType, typename Lang>
-void Softmax(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
- Context *ctx) {
- LOG(FATAL) << "Softmax Not Implemented";
-}
-/// Sum the columns of the in matrix into a vector
-template <typename DType, typename Lang>
-void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
- Context *ctx) {
- LOG(FATAL) << "SumColumns Not Implemented";
-}
-// TODO(wangwei) unify SumRow and SumCol.
-/// Sum the rows of the in matrix into a vector
-template <typename DType, typename Lang>
-void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
- Context *ctx) {
- LOG(FATAL) << "SumRows Not Implemented";
-}
-
-// ================Random functions===========================================
+// **************************************
+// Random functions
+// **************************************
/// Each element of out would be 1 with prob p and 0 with 1-p. 0<= p <= 1
// Get the random generator from 'ctx'
// If DType is not float, then convert the threshold to DType
@@ -282,7 +249,10 @@ void Uniform(const size_t num, const float low, const float high, Blob *out,
LOG(FATAL) << "Uniform Not Implemented";
}
-// ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// *********************************************************
+// BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
+// *********************************************************
+
/// return the index of the element with the max value.
template <typename DType, typename Lang>
void Amax(const size_t num, const Blob *in, size_t *out, Context *ctx) {
@@ -307,12 +277,19 @@ void Axpy(const size_t num, const DType alpha, const Blob *in, Blob *out,
LOG(FATAL) << "Axpy Not Implemented";
}
+/// out = ||in||_2, i.e., the L2 (Euclidean) norm of in.
+template <typename DType, typename Lang>
+void Nrm2(const size_t num, const Blob *in, float *out, Context *ctx) {
+ LOG(FATAL) << "Nrm2 Not Implemented";
+}
+
/// out *= x
template <typename DType, typename Lang>
void Scale(const size_t num, const DType x, Blob *out, Context *ctx) {
LOG(FATAL) << "Scale Not Implemented";
}
+/// inner product of array in1 and in2
template <typename DType, typename Lang>
void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
Context *ctx) {
@@ -346,5 +323,44 @@ void GEMM(const bool transA, const bool transB, const size_t nrowA,
LOG(FATAL) << "GEMM Not Implemented";
}
+// **************************************
+// Matrix functions
+// **************************************
+/*
+/// Add the vector v to every column of A as the column of out
+template <typename DType, typename Lang>
+void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "AddCol Not Implemented";
+}
+// TODO(wangwei) unify AddRow and AddCol.
+/// Add the vector v to every row of A as the row of out
+template <typename DType, typename Lang>
+void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "AddRow Not Implemented";
+}
+/// outer-product.
+/// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
+template <typename DType, typename Lang>
+void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "Outer Not Implemented";
+}
+
+/// Sum the columns of the in matrix into a vector
+template <typename DType, typename Lang>
+void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "SumColumns Not Implemented";
+}
+// TODO(wangwei) unify SumRow and SumCol.
+/// Sum the rows of the in matrix into a vector
+template <typename DType, typename Lang>
+void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "SumRows Not Implemented";
+}
+*/
} // namespace singa
#endif // SINGA_CORE_MATH_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 2c5c272..0b280a3 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -241,7 +241,7 @@ void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
outPtr[i] = sqrt(inPtr[i]);
}
}
-
+/*
template <>
void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
Context *ctx) {
@@ -251,6 +251,7 @@ void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
outPtr[i] = inPtr[i] * inPtr[i];
}
}
+*/
template <>
void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
@@ -287,101 +288,6 @@ void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
}
}
-// =========Matrix operations ================================================
-
-template <>
-void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
- const Blob *A, const Blob *v, Blob *out,
- Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *APtr = static_cast<const float *>(A->data());
- const float *vPtr = static_cast<const float *>(v->data());
- for (size_t r = 0; r < nrow; r++) {
- size_t offset = r * ncol;
- for (size_t c = 0; c < ncol; c++) {
- outPtr[offset + c] = APtr[offset + c] + vPtr[r];
- }
- }
-}
-
-template <>
-void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
- const Blob *A, const Blob *v, Blob *out,
- Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *APtr = static_cast<const float *>(A->data());
- const float *vPtr = static_cast<const float *>(v->data());
- for (size_t r = 0; r < nrow; r++) {
- size_t offset = r * ncol;
- for (size_t c = 0; c < ncol; c++) {
- outPtr[offset + c] = APtr[offset + c] + vPtr[c];
- }
- }
-}
-template <>
-void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
- const Blob *in2, Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *in1Ptr = static_cast<const float *>(in1->data());
- const float *in2Ptr = static_cast<const float *>(in2->data());
- for (size_t r = 0; r < m; r++) {
- size_t offset = r * n;
- for (size_t c = 0; c < n; c++) {
- outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
- }
- }
-}
-template <>
-void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
- const Blob *in, Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- float *bPtr = new float[ncol];
- for (size_t r = 0; r < nrow; r++) {
- size_t offset = r * ncol;
- float denom = 0.f;
- for (size_t c = 0; c < ncol; c++) {
- bPtr[c] = exp(inPtr[offset + c]);
- denom += bPtr[c];
- }
- for (size_t c = 0; c < ncol; c++) {
- size_t idx = offset + c;
- outPtr[idx] = bPtr[c] / denom;
- }
- }
- delete bPtr;
-}
-
-template <>
-void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
- const Blob *in, Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t c = 0; c < ncol; c++) {
- outPtr[c] = 0.f;
- }
- for (size_t r = 0; r < nrow; r++) {
- size_t offset = r * ncol;
- for (size_t c = 0; c < ncol; c++) {
- outPtr[c] += inPtr[offset + c];
- }
- }
-}
-
-template <>
-void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
- const Blob *in, Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- for (size_t r = 0; r < nrow; r++) {
- size_t offset = r * ncol;
- outPtr[r] = 0.f;
- for (size_t c = 0; c < ncol; c++) {
- outPtr[r] += inPtr[offset + c];
- }
- }
-}
-
// ===============Random operations==========================================
template <>
void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out,
@@ -440,18 +346,26 @@ void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
#ifdef USE_CBLAS
template <>
+void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out,
+ Context *ctx) {
+ const float *inPtr = static_cast<const float *>(in->data());
+ *out = cblas_isamax(num, inPtr, 1);
+}
+
+template <>
+void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+ Context *ctx) {
+ const float *inPtr = static_cast<const float *>(in->data());
+ *out = cblas_sasum(num, inPtr, 1);
+}
+
+template <>
void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
Blob *out, Context *ctx) {
const float *inPtr = static_cast<const float *>(in->data());
float *outPtr = static_cast<float *>(out->mutable_data());
cblas_saxpy(num, alpha, inPtr, 1, outPtr, 1);
}
-template <>
-void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
- Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- cblas_sscal(num, x, outPtr, 1);
-}
template <>
void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
@@ -461,6 +375,19 @@ void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
*out = cblas_sdot(num, in1Ptr, 1, in2Ptr, 1);
}
template <>
+void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out,
+ Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ cblas_sscal(num, x, outPtr, 1);
+}
+template <>
+void Nrm2<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+ Context *ctx) {
+ const float *inPtr = static_cast<const float *>(in->data());
+ *out = cblas_snrm2(num, inPtr, 1);
+}
+
+template <>
void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
const float alpha, const Blob *A, const Blob *v,
const float beta, Blob *out, Context *ctx) {
@@ -587,6 +514,102 @@ void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n,
}
#endif // USE_CBLAS
+
+// =========Matrix operations ================================================
+/*
+template <>
+void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+ const Blob *A, const Blob *v, Blob *out,
+ Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *APtr = static_cast<const float *>(A->data());
+ const float *vPtr = static_cast<const float *>(v->data());
+ for (size_t r = 0; r < nrow; r++) {
+ size_t offset = r * ncol;
+ for (size_t c = 0; c < ncol; c++) {
+ outPtr[offset + c] = APtr[offset + c] + vPtr[r];
+ }
+ }
+}
+
+template <>
+void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+ const Blob *A, const Blob *v, Blob *out,
+ Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *APtr = static_cast<const float *>(A->data());
+ const float *vPtr = static_cast<const float *>(v->data());
+ for (size_t r = 0; r < nrow; r++) {
+ size_t offset = r * ncol;
+ for (size_t c = 0; c < ncol; c++) {
+ outPtr[offset + c] = APtr[offset + c] + vPtr[c];
+ }
+ }
+}
+template <>
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1,
+ const Blob *in2, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *in1Ptr = static_cast<const float *>(in1->data());
+ const float *in2Ptr = static_cast<const float *>(in2->data());
+ for (size_t r = 0; r < m; r++) {
+ size_t offset = r * n;
+ for (size_t c = 0; c < n; c++) {
+ outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
+ }
+ }
+}
+template <>
+void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+ const Blob *in, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ float *bPtr = new float[ncol];
+ for (size_t r = 0; r < nrow; r++) {
+ size_t offset = r * ncol;
+ float denom = 0.f;
+ for (size_t c = 0; c < ncol; c++) {
+ bPtr[c] = exp(inPtr[offset + c]);
+ denom += bPtr[c];
+ }
+ for (size_t c = 0; c < ncol; c++) {
+ size_t idx = offset + c;
+ outPtr[idx] = bPtr[c] / denom;
+ }
+ }
+ delete bPtr;
+}
+
+template <>
+void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+ const Blob *in, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t c = 0; c < ncol; c++) {
+ outPtr[c] = 0.f;
+ }
+ for (size_t r = 0; r < nrow; r++) {
+ size_t offset = r * ncol;
+ for (size_t c = 0; c < ncol; c++) {
+ outPtr[c] += inPtr[offset + c];
+ }
+ }
+}
+
+template <>
+void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol,
+ const Blob *in, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t r = 0; r < nrow; r++) {
+ size_t offset = r * ncol;
+ outPtr[r] = 0.f;
+ for (size_t c = 0; c < ncol; c++) {
+ outPtr[r] += inPtr[offset + c];
+ }
+ }
+}
+*/
} // namespace singa
#endif // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/6d69047a/src/core/tensor/tensor_math_cuda.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cuda.h b/src/core/tensor/tensor_math_cuda.h
index f9841a3..e2597d5 100644
--- a/src/core/tensor/tensor_math_cuda.h
+++ b/src/core/tensor/tensor_math_cuda.h
@@ -24,105 +24,336 @@
#include "./math_kernel.h"
#include "singa/utils/cuda_utils.h"
#include "singa/core/common.h"
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include "singa/utils/cuda_utils.h"
namespace singa {
-// =================Elementwise operations===================================
+
+/// out[i] = |in[i]|
+template <>
+void Abs<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+ Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::abs(num, inPtr, outPtr, ctx->stream);
+}
+/// out = in + x
+template <>
+void Add<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+ Blob* out, Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::add(num, inPtr, x, outPtr, ctx->stream);
+}
+/// out = in1 + in2
+template <>
+void Add<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+ Blob* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1->data());
+ const float* inPtr2 = static_cast<const float*>(in2->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::add(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+/// Element-wise operation, clamp every element into [low, high]
+/// if in[i]>high, then out[i]=high; if in[i]<low, then out[i]=low.
+template <>
+void Clamp<float, lang::Cuda>(const size_t num, const float low,
+ const float high, const Blob* in, Blob* out,
+ Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::clamp(num, low, high, inPtr, outPtr, ctx->stream);
+}
+/// out = in1 / in2
+template <>
+void Div<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+ Blob* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1->data());
+ const float* inPtr2 = static_cast<const float*>(in2->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::div(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+
+template <>
+void Div<float, lang::Cuda>(const size_t num, const float x, const Blob* in,
+ Blob* out, Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::div(num, x, inPtr, outPtr, ctx->stream);
+}
+
+/// out = in * x
+template <>
+void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob* in,
+ const float x, Blob* out, Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::mult(num, inPtr, x, outPtr, ctx->stream);
+}
+/// out = in1 * in2
+template <>
+void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob* in1,
+ const Blob* in2, Blob* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1->data());
+ const float* inPtr2 = static_cast<const float*>(in2->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::mult(num, inPtr1, inPtr2, outPtr, ctx->stream);
+}
+/// Base is e. out[i]=e^in[i]
+template <>
+void Exp<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+ Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::exp(num, inPtr, outPtr, ctx->stream);
+}
+
+template <>
+void GE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+ Blob* out, Context* ctx) {
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ const float* inPtr = static_cast<const float*>(in->data());
+ cuda::ge(num, inPtr, x, outPtr, ctx->stream);
+}
+
+template <>
+void GT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+ Blob* out, Context* ctx) {
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ const float* inPtr = static_cast<const float*>(in->data());
+ cuda::gt(num, inPtr, x, outPtr, ctx->stream);
+}
+
+template <>
+void LE<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+ Blob* out, Context* ctx) {
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ const float* inPtr = static_cast<const float*>(in->data());
+ cuda::le(num, inPtr, x, outPtr, ctx->stream);
+}
+
+/// Natural logarithm, the base is e (Neper number): out[i]=ln(in[i]).
+template <>
+void Log<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+ Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::log(num, inPtr, outPtr, ctx->stream);
+}
+template <>
+void LT<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+ Blob* out, Context* ctx) {
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ const float* inPtr = static_cast<const float*>(in->data());
+ cuda::lt(num, inPtr, x, outPtr, ctx->stream);
+}
+
+/// Element-wise operation, out[i] = in[i]^x
+template <>
+void Pow<float, lang::Cuda>(const size_t num, const Blob* in, const float x,
+ Blob* out, Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::pow(num, inPtr, x, outPtr, ctx->stream);
+}
+/// Element-wise operation, out[i] = in1[i]^in2[i]
template <>
-void Add<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2,
- Blob *out, Context *ctx) {
- const float *in1Ptr = static_cast<const float *>(in1->data());
- const float *in2Ptr = static_cast<const float *>(in2->data());
- float *outPtr = static_cast<float *>(out->mutable_data());
- cuda::add(num, in1Ptr, in2Ptr, outPtr);
+void Pow<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+ Blob* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1->data());
+ const float* inPtr2 = static_cast<const float*>(in2->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::pow(num, inPtr1, inPtr2, outPtr, ctx->stream);
}
-// follow the consistency guide of math API
+/// Element-wise operation, out[i]=max(0, in[i])
template <>
-void Div<float, lang::Cuda>(const size_t num, const float x, const Blob *in,
- Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- cuda::Div(num, x, inPtr, outPtr, ctx->stream);
+void ReLU<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+ Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::relu(num, inPtr, outPtr, ctx->stream);
}
+/// out[i] = x
template <>
-void EltwiseMult<float, lang::Cuda>(const size_t num, const Blob *in,
- const float x, Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- cuda::mult(num, inPtr, x, outPtr);
+void Set<float, lang::Cuda>(const size_t num, const float x, Blob* out,
+ Context* ctx) {
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::set(num, x, outPtr, ctx->stream);
}
+/// Element-wise operation, out[i]=sigmoid(in[i])
template <>
-void GE<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
- Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- cuda::GE(num, inPtr, x, outPtr, ctx->stream);
+void Sigmoid<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+ Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::sigmoid(num, inPtr, outPtr, ctx->stream);
}
+// out[i] = sign(in[i])
template <>
-void GT<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
- Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- cuda::GT(num, inPtr, x, outPtr, ctx->stream);
+void Sign<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+ Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::sign(num, inPtr, outPtr, ctx->stream);
}
+
+/// Element-wise operation, out[i]=sqrt(in[i])
+template <>
+void Sqrt<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+ Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::sqrt(num, inPtr, outPtr, ctx->stream);
+}
+
+/// Element-wise operation, out[i]=in[i]^2
template <>
-void LE<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
- Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- cuda::LE(num, inPtr, x, outPtr, ctx->stream);
+void Square<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+ Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::square(num, inPtr, outPtr, ctx->stream);
}
+/// out = in1 - in2
template <>
-void LT<float, lang::Cuda>(const size_t num, const Blob *in, const float x,
- Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- cuda::LT(num, inPtr, x, outPtr, ctx->stream);
+void Sub<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+ Blob* out, Context* ctx) {
+ const float* inPtr1 = static_cast<const float*>(in1->data());
+ const float* inPtr2 = static_cast<const float*>(in2->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::sub(num, inPtr1, inPtr2, outPtr, ctx->stream);
}
+
+/// sum all elements of input into out
template <>
-void Set<float, lang::Cuda>(const size_t num, const float x, Blob *out,
- Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- cuda::Set(num, x, outPtr, ctx->stream);
+void Sum<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
+ Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->data());
+ cuda::sum(num, inPtr, out, ctx->stream);
}
-// TODO(wangwei) optimize using stream
+
+/// Element-wise operation, out[i]=tanh(in[i])
template <>
-void Square<float, lang::Cuda>(const size_t num, const Blob *in, Blob *out,
- Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
- cuda::square(num, inPtr, outPtr);
+void Tanh<float, lang::Cuda>(const size_t num, const Blob* in, Blob* out,
+ Context* ctx) {
+ const float* inPtr = static_cast<const float*>(in->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ cuda::tanh(num, inPtr, outPtr, ctx->stream);
}
-// TODO(wangwei) optimize using stream
+
+// ================Random functions===========================================
+/// Each element of out would be 1 with prob p and 0 with 1-p. 0<= p <= 1
+// Get the random generator from 'ctx'
+// If DType is not float, then convert the threshold to DType
template <>
-void Sub<float, lang::Cuda>(const size_t num, const Blob *in1, const Blob *in2,
- Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *in1Ptr = static_cast<const float *>(in1->data());
- const float *in2Ptr = static_cast<const float *>(in2->data());
- cuda::sub(num, in1Ptr, in2Ptr, outPtr);
+void Bernoulli<float, lang::Cuda>(const size_t num, const float p, Blob* out,
+ Context* ctx) {
+ auto rgen = ctx->curand_generator;
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
+ cuda::threshold(num, p, outPtr, outPtr, ctx->stream);
}
-// sum all elements of input into ret
-// TODO(wangwei) optimize using stream
+
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the low and high to DType
template <>
-void Sum<float, lang::Cuda>(const size_t num, const Blob *in, float *out,
- Context *ctx) {
- const float *inPtr = static_cast<const float *>(in->data());
- cuda::sum(num, inPtr, out);
+void Uniform<float, lang::Cuda>(const size_t num, const float low,
+ const float high, Blob* out, Context* ctx) {
+ auto rgen = ctx->curand_generator;
+ float* outPtr = static_cast<float*>(out->mutable_data());
+ CURAND_CHECK(curandGenerateUniform(rgen, outPtr, num));
+ cuda::mult(num, outPtr, high - low, outPtr, ctx->stream);
+ cuda::add(num, outPtr, low, outPtr, ctx->stream);
+}
+
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the mean and std to DType
+template <>
+void Gaussian<float, lang::Cuda>(const size_t num, const float mean,
+ const float std, Blob* out, Context* ctx) {
+ // Fill 'out' in place with 'num' floats produced by curandGenerateNormal
+ // (parameterised by 'mean' and 'std') using the generator held by 'ctx'.
+ // NOTE(review): the cuRAND docs say pseudo-random generators require an even
+ // sample count for curandGenerateNormal - confirm callers never pass odd num.
+ float* dst = static_cast<float*>(out->mutable_data());
+ CURAND_CHECK(
+ curandGenerateNormal(ctx->curand_generator, dst, num, mean, std));
}
// =========================Blas operations==================================
+// ref to http://docs.nvidia.com/cuda/cublas
+/// Write to *out the 0-based index chosen by cublasIsamax among the first
+/// 'num' floats of 'in' (per the cuBLAS docs: the largest-magnitude element).
+template <>
+void Amax<float, lang::Cuda>(const size_t num, const Blob* in, size_t* out,
+ Context* ctx) {
+ // TODO(wangwei) set cudastream
+ const auto* src = static_cast<const float*>(in->data());
+ int pos = 1;
+ CUBLAS_CHECK(cublasIsamax(ctx->cublas_handle, num, src, 1, &pos));
+ *out = pos - 1; // cuBLAS positions are 1-based
+}
+
+/// Write to *out the 0-based index chosen by cublasIsamin among the first
+/// 'num' floats of 'in' (per the cuBLAS docs: the smallest-magnitude element).
+template <>
+void Amin<float, lang::Cuda>(const size_t num, const Blob* in, size_t* out,
+ Context* ctx) {
+ // TODO(wangwei) set cudastream
+ const auto* src = static_cast<const float*>(in->data());
+ int pos = 1;
+ CUBLAS_CHECK(cublasIsamin(ctx->cublas_handle, num, src, 1, &pos));
+ *out = pos - 1; // cuBLAS positions are 1-based
+}
+
+/// out = \sum_i |in[i]| over the first 'num' floats of 'in' (cublasSasum).
+template <>
+void Asum<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
+ Context* ctx) {
+ // TODO(wangwei) set cudastream
+ const auto* src = static_cast<const float*>(in->data());
+ CUBLAS_CHECK(cublasSasum(ctx->cublas_handle, num, src, 1, out));
+}
+
+/// out[i] += alpha * in[i] for i in [0, num), via cublasSaxpy.
+template <>
+void Axpy<float, lang::Cuda>(const size_t num, const float alpha,
+ const Blob* in, Blob* out, Context* ctx) {
+ // TODO(wangwei) set cudastream
+ const auto* x = static_cast<const float*>(in->data());
+ auto* y = static_cast<float*>(out->mutable_data());
+ // Both vectors are walked with unit stride.
+ CUBLAS_CHECK(cublasSaxpy(ctx->cublas_handle, num, &alpha, x, 1, y, 1));
+}
+
+/// out = \sum_i in1[i] * in2[i] for i in [0, num), via cublasSdot.
+template <>
+void Dot<float, lang::Cuda>(const size_t num, const Blob* in1, const Blob* in2,
+ float* out, Context* ctx) {
+ // TODO(wangwei) set cudastream
+ const auto* lhs = static_cast<const float*>(in1->data());
+ const auto* rhs = static_cast<const float*>(in2->data());
+ CUBLAS_CHECK(cublasSdot(ctx->cublas_handle, num, lhs, 1, rhs, 1, out));
+}
+/// out = sqrt(\sum_i in[i]^2), i.e. the Euclidean (L2) norm of the first
+/// 'num' floats of 'in', computed by cublasSnrm2.
+template <>
+void Nrm2<float, lang::Cuda>(const size_t num, const Blob* in, float* out,
+ Context* ctx) {
+ auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
+ const float* inPtr = static_cast<const float*>(in->data());
+ // Check the returned status like the other cuBLAS wrappers in this file;
+ // previously a failed call was silently ignored.
+ CUBLAS_CHECK(cublasSnrm2(handle, num, inPtr, 1, out));
+}
+/// out[i] *= x for i in [0, num), done in place via cublasSscal.
+template <>
+void Scale<float, lang::Cuda>(const size_t num, const float x, Blob* out,
+ Context* ctx) {
+ // TODO(wangwei) set cudastream
+ auto* dst = static_cast<float*>(out->mutable_data());
+ CUBLAS_CHECK(cublasSscal(ctx->cublas_handle, num, &x, dst, 1));
+}
// NOTE: cublas uses column major order.
// http://peterwittek.com/cublas-matrix-c-style.html
template <>
void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
- const size_t ncol, const Blob *M, const Blob *v,
- Blob *out, Context *ctx) {
+ const size_t ncol, const Blob* M, const Blob* v,
+ Blob* out, Context* ctx) {
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
- const float *MPtr = static_cast<const float *>(M->data());
- const float *vPtr = static_cast<const float *>(v->data());
- float *outPtr = static_cast<float *>(out->mutable_data());
+ const float* MPtr = static_cast<const float*>(M->data());
+ const float* vPtr = static_cast<const float*>(v->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
if (side_right) {
CUBLAS_CHECK(cublasSdgmm(handle, CUBLAS_SIDE_LEFT, ncol, nrow, MPtr, ncol,
vPtr, 1, outPtr, ncol));
@@ -133,11 +364,11 @@ void DGMM<float, lang::Cuda>(const bool side_right, const size_t nrow,
}
template <>
void GEMV<float, lang::Cuda>(bool trans, const size_t m, const size_t n,
- const float alpha, const Blob *A, const Blob *v,
- const float beta, Blob *out, Context *ctx) {
- const float *APtr = static_cast<const float *>(A->data());
- const float *vPtr = static_cast<const float *>(v->data());
- float *outPtr = static_cast<float *>(out->mutable_data());
+ const float alpha, const Blob* A, const Blob* v,
+ const float beta, Blob* out, Context* ctx) {
+ const float* APtr = static_cast<const float*>(A->data());
+ const float* vPtr = static_cast<const float*>(v->data());
+ float* outPtr = static_cast<float*>(out->mutable_data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
if (!trans)
CUBLAS_CHECK(cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, APtr, n, vPtr,
@@ -152,16 +383,16 @@ template <>
void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
const size_t nrowA, const size_t ncolB,
const size_t ncolA, const float alpha,
- const Blob *A, const Blob *B, const float beta,
- Blob *C, Context *ctx) {
+ const Blob* A, const Blob* B, const float beta,
+ Blob* C, Context* ctx) {
auto transa = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
auto transb = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
int lda = transA ? nrowA : ncolA;
int ldb = transB ? ncolA : ncolB;
int ldc = ncolB;
- const float *APtr = static_cast<const float *>(A->data());
- const float *BPtr = static_cast<const float *>(B->data());
- float *CPtr = static_cast<float *>(C->mutable_data());
+ const float* APtr = static_cast<const float*>(A->data());
+ const float* BPtr = static_cast<const float*>(B->data());
+ float* CPtr = static_cast<float*>(C->mutable_data());
auto handle = ctx->cublas_handle; // TODO(wangwei) set cudastream
CUBLAS_CHECK(cublasSgemm(handle, transb, transa, ncolB, nrowA, ncolA, &alpha,
BPtr, ldb, APtr, lda, &beta, CPtr, ldc));
@@ -171,4 +402,3 @@ void GEMM<float, lang::Cuda>(const bool transA, const bool transB,
#endif // USE_CUDA
#endif // SINGA_CORE_TENSOR_TENSOR_MATH_CUDA_H_
-
[5/5] incubator-singa git commit: SINGA-168 Implement Cpp Math
functions APIs
Posted by zh...@apache.org.
SINGA-168 Implement Cpp Math functions APIs
Update error log for tensor_math.h to include the function name, e.g.
"Foo is not implemented".
Add Tensor Math Cpp Implementation and Test Cases
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/07c49da5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/07c49da5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/07c49da5
Branch: refs/heads/dev
Commit: 07c49da5b1ee6582780f5faef6c6bf3418a7a0b6
Parents: 01aaf49
Author: liyuchenmike@gmail.com <li...@gmail.com>
Authored: Fri Jun 3 20:46:16 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Jun 12 12:15:11 2016 +0800
----------------------------------------------------------------------
src/core/tensor/tensor_math.h | 293 +++++++++----------
src/core/tensor/tensor_math_cpp.h | 508 ++++++++++++++++++++++++---------
test/singa/test_tensor_math.cc | 264 ++++++++++++++++-
3 files changed, 774 insertions(+), 291 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/07c49da5/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index ff865e0..1bf6fc7 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -50,277 +50,259 @@ namespace singa {
// ================Linear algebra functions====================================
/// ret[i] = |input[i]|
template <typename DType, typename Lang>
-void Abs(int count, const Blob *input, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Abs(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Abs Not Implemented";
}
template <typename DType, typename Lang>
-void Set(int count, DType x, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Set Not Implemented";
}
+
/// sum all elements of input into ret
template <typename DType, typename Lang>
-void Sum(int count, const Blob *input, DType *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
+ LOG(FATAL) << "Sum Not Implemented";
}
/// ret[i] = sign(input[i])
template <typename DType, typename Lang>
-void Sign(int count, const Blob *input, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Sign Not Implemented";
}
/// Base is e, Neper number. ret[i]=exp(input[i])
template <typename DType, typename Lang>
-void Exp(int count, const Blob *input, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Exp(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Exp Not Implemented";
}
/// Natual logarithm, the base is e, Neper number ret[i]=log(input[i]).
template <typename DType, typename Lang>
-void Log(int count, const Blob *input, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Log(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Log Not Implemented";
}
-
/// Element-wise operation, ret[i]=sqrt([input[i])
template <typename DType, typename Lang>
-void Sqrt(int count, const Blob *input, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Sqrt Not Implemented";
}
/// Element-wise operation, ret[i]=square([input[i])
template <typename DType, typename Lang>
-void Square(int count, const Blob *input, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Square Not Implemented";
}
/// Element-wise operation, ret[i]=tanh([input[i])
template <typename DType, typename Lang>
-void Tanh(int count, const Blob *input, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Tanh Not Implemented";
}
/// Element-wise operation, ret[i]=max(0, input[i])
template <typename DType, typename Lang>
-void ReLU(int count, const Blob *input, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "ReLU Not Implemented";
}
/// Element-wise operation, ret[i]=sigmoid([input[i])
template <typename DType, typename Lang>
-void Sigmoid(int count, const Blob *input, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Sigmoid Not Implemented";
}
-/// Do softmax for each row invidually
+/// Do softmax for each row individually
template <typename DType, typename Lang>
-void Softmax(int nrow, int ncol, const Blob *input, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Softmax(const size_t nrow, const size_t ncol, const Blob *in,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "Softmax Not Implemented";
}
// TODO(wangwei) unify SumRow and SumCol.
/// Sum the rows of the input matrix into a vector
template <typename DType, typename Lang>
-void SumRows(int nrow, int ncol, const Blob *input, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void SumRows(const size_t nrow, const size_t ncol, const Blob *in,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "SumRows Not Implemented";
}
/// Sum the columns of the input matrix into a vector
template <typename DType, typename Lang>
-void SumColumns(int nrow, int ncol, const Blob *input, Blob *ret,
- Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void SumColumns(const size_t nrow, const size_t ncol, const Blob *in,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "SumColumns Not Implemented";
}
// TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of ret
+/// Add the vector v to every row of A as the row of out
template <typename DType, typename Lang>
-void AddRow(int nrow, int ncol, const Blob *A, const Blob *v, Blob *ret,
- Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "AddRow Not Implemented";
}
-/// Add the vector v to every column of A as the column of ret
+/// Add the vector v to every column of A as the column of out
template <typename DType, typename Lang>
-void AddCol(int nrow, int ncol, const Blob *A, const Blob *v, Blob *ret,
- Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "AddCol Not Implemented";
}
/// Element-wise operation, do v^x for every v from the input tensor
template <typename DType, typename Lang>
-void Pow(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Pow(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Pow Not Implemented";
}
/// Element-wise operation, do v^x for every v from the lhs and every x from rhs
template <typename DType, typename Lang>
-void Pow(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Pow(const size_t num, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "Pow-Pair Not Implemented";
}
/// Element-wise operation, clamp every element into [low, high]
/// if x>high, then x=high; if x<low, then x=low.
template <typename DType, typename Lang>
-void Clamp(int count, DType low, DType high, const Blob *input, Blob *ret,
- Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Clamp(const size_t num, const DType low, const DType high, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Clamp Not Implemented";
}
/// ret = input + x
template <typename DType, typename Lang>
-void Add(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Add(const size_t num, const Blob *in, const DType x,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "Add Not Implemented";
}
+
+/// out[i] = in1[i] + in2[i] for i in [0, num).
+/// Generic fallback: reached only when no <DType, Lang> specialization exists.
+template <typename DType, typename Lang>
+void Add(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "Add-Pair Not Implemented";
+}
+
/// ret = input - x
template <typename DType, typename Lang>
-void Sub(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
- Add<DType, Lang>(count, input, -x, ret, ctx);
+void Sub(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+ Add<DType, Lang>(num, in, -x, out, ctx);
}
-/// ret = input * x
+
+/// ret = lhs - rhs
template <typename DType, typename Lang>
-void EltwiseMult(int count, const Blob *input, DType x, Blob *ret,
- Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Sub(const size_t num, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "Sub-Pair Not Implemented";
}
-/// ret = input / x
+
+/// ret = input * x
template <typename DType, typename Lang>
-void Div(int count, const Blob *input, DType x, Blob *ret, Context *ctx) {
- EltwiseMult<DType, Lang>(count, input, DType(1) / x, ret, ctx);
+void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "EltwiseMult Not Implemented";
}
-/// ret = lhs + rhs
+/// ret = lhs * rhs
template <typename DType, typename Lang>
-void Add(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
}
-/// ret = lhs - rhs
+/// ret[i] = x / input[i], i.e. divide the scalar x by each element of 'in'
template <typename DType, typename Lang>
-void Sub(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Div(const size_t num, const DType x, const Blob *in,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "Div Not Implemented";
}
-/// ret = lhs * rhs
template <typename DType, typename Lang>
-void EltwiseMult(int count, const Blob *lhs, const Blob *rhs, Blob *ret,
- Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Div(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+ CHECK_NE(x,0.f);
+ EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
}
/// ret = lhs / rhs
template <typename DType, typename Lang>
-void Div(int count, const Blob *lhs, const Blob *rhs, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Div(const size_t num, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "Div-Pair Not Implemented";
}
/// outer-product.
/// lhs and rhs are vectors of len m and n. ret is matrix of shape m * n
template <typename DType, typename Lang>
-void Outer(int m, int n, const Blob *lhs, const Blob *rhs, Blob *ret,
- Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "Outer Not Implemented";
}
/// ret[i]=(input[i]<x)?1.f:0.f
template <typename DType, typename Lang>
-void LT(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void LT(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+ LOG(FATAL) << "LT Not Implemented";
}
/// ret[i]=(input[i]<=x)?1.f:0.f
template <typename DType, typename Lang>
-void LE(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void LE(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+ LOG(FATAL) << "LE Not Implemented";
}
/// ret[i]=(input[i]>x)?1.f:0.f
template <typename DType, typename Lang>
-void GT(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void GT(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+ LOG(FATAL) << "GT Not Implemented";
}
-/// ret[i]=(input[i]>x)?1.f:0.f
+/// ret[i]=(input[i]>=x)?1.f:0.f
template <typename DType, typename Lang>
-void GE(int count, const Blob *input, float x, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void GE(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+ LOG(FATAL) << "GE Not Implemented";
}
// ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
// ===== Level 1
/// return the index of the element with the max value.
template <typename DType, typename Lang>
-void Amax(int count, const Blob *input, int *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Amax(const size_t num, const Blob *in, size_t *out, Context *ctx) {
+ LOG(FATAL) << "Amax Not Implemented";
}
/// return the index of the element with the min value.
template <typename DType, typename Lang>
-void Amin(int count, const Blob *input, int *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Amin(const size_t num, const Blob *in, size_t *out, Context *ctx) {
+ LOG(FATAL) << "Amin Not Implemented";
}
/// ret = sum |x| for all x in input
template <typename DType, typename Lang>
-void Asum(int count, const Blob *input, DType *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Asum(const size_t num, const Blob *in, DType *out, Context *ctx) {
+ LOG(FATAL) << "Asum Not Implemented";
}
/// ret = alpha * input + ret
template <typename DType, typename Lang>
-void Axpy(int count, DType alpha, const Blob *input, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Axpy(const size_t num, const DType alpha, const Blob *in,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "Axpy Not Implemented";
}
/// ret *= x
template <typename DType, typename Lang>
-void Scale(int count, DType x, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Scale(const size_t num, const DType x, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Scale Not Implemented";
}
template <typename DType, typename Lang>
-void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
- Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Dot(const size_t num, const Blob *in1, const Blob *in2,
+ DType *out, Context *ctx) {
+ LOG(FATAL) << "Dot Not Implemented";
}
// ===== Level 2
/// ret = alpha * op(A) * v + beta * ret.
/// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
template <typename DType, typename Lang>
-void GEMV(bool trans, int m, int n, DType alpha, const Blob *A, const Blob *v,
- DType beta, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
-}
-
-// ===== Level 3
-
-// ================Random functions===========================================
-/// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
-// Get the random generator from 'ctx'
-// If DType is not float, then convert the threshold to DType
-template <typename DType, typename Lang>
-void Bernoulli(int count, float p, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
-}
-// The random generator should be extracted from ctx.
-// If DType is not float, then convert the low and high to DType
-template <typename DType, typename Lang>
-void Uniform(int count, float low, float high, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
-}
-// The random generator should be extracted from ctx.
-// If DType is not float, then convert the mean and std to DType
-template <typename DType, typename Lang>
-void Gaussian(int count, float mean, float std, Blob *ret, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
-}
-
-// ========follow the consistency guide of math API
-
-template <typename DType, typename Lang>
-void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
-}
-/// Divide alpha by each element of 'in'.
-template <typename DType, typename Lang>
-void Div(const size_t num, const DType alpha, const Blob *in, Blob *out,
- Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
+ const Blob *A, const Blob *v,
+ const DType beta, Blob *out, Context *ctx) {
+ LOG(FATAL) << "GEMV Not Implemented";
}
/// multiply a matrix with a diagnoal matrix constructed using values from 'v'.
@@ -328,7 +310,7 @@ void Div(const size_t num, const DType alpha, const Blob *in, Blob *out,
template <typename DType, typename Lang>
void DGMM(const bool side_right, const size_t nrow, const size_t ncol,
const Blob *M, const Blob *v, Blob *out, Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+ LOG(FATAL) << "DGMM Not Implemented";
}
/// C = alpha * A * B + beta * C.
@@ -338,32 +320,37 @@ void GEMM(const bool transA, const bool transB, const size_t nrowA,
const size_t ncolB, const size_t ncolA, const DType alpha,
const Blob *A, const Blob *B, const DType beta, Blob *C,
Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+ LOG(FATAL) << "GEMM Not Implemented";
}
-/// ret[i]=(input[i]<x)?1.f:0.f
-template <typename DType, typename Lang>
-void LT(const size_t num, const Blob *in, const DType x, Blob *out,
- Context *ctx) {
- LOG(FATAL) << "Not Implemented";
-}
-/// ret[i]=(input[i]<=x)?1.f:0.f
+
+
+// ===== Level 3
+
+// ================Random functions===========================================
+/// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
+// Get the random generator from 'ctx'
+// If DType is not float, then convert the threshold to DType
template <typename DType, typename Lang>
-void LE(const size_t num, const Blob *in, const DType x, Blob *out,
- Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Bernoulli(const size_t num, const float p, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Bernoulli Not Implemented";
}
-/// ret[i]=(input[i]>x)?1.f:0.f
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the low and high to DType
template <typename DType, typename Lang>
-void GT(const size_t num, const Blob *in, const DType x, Blob *out,
- Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Uniform(const size_t num, const float low, const float high,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "Uniform Not Implemented";
}
-/// ret[i]=(input[i]>=x)?1.f:0.f
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the mean and std to DType
template <typename DType, typename Lang>
-void GE(const size_t num, const Blob *in, const DType x, Blob *out,
- Context *ctx) {
- LOG(FATAL) << "Not Implemented";
+void Gaussian(const size_t num, const float mean, const float std,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "Gaussian Not Implemented";
}
+
+
+
} // namespace singa
#endif // SINGA_CORE_MATH_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/07c49da5/src/core/tensor/tensor_math_cpp.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math_cpp.h b/src/core/tensor/tensor_math_cpp.h
index 693f09c..ec7a892 100644
--- a/src/core/tensor/tensor_math_cpp.h
+++ b/src/core/tensor/tensor_math_cpp.h
@@ -27,195 +27,317 @@
/// TODO(wangwei) Clean the implementations following the comments in
/// tensor_math.h.
-/// For Blob argument xxx, name its pointer as xxxPtr.
namespace singa {
+
+/// out[i] = |in[i]| for i in [0, num); plain sequential host loop.
+template <>
+void Abs<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out,
+ Context *ctx) {
+ float *dst = static_cast<float *>(out->mutable_data());
+ const float *src = static_cast<const float *>(in->data());
+ for (size_t k = 0; k != num; ++k) dst[k] = fabs(src[k]);
+}
+
template <>
-void Square<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
- Context *ctx) {
- float *dptr = static_cast<float *>(ret->mutable_data());
- const float *in = static_cast<const float *>(input->data());
- for (int i = 0; i < count; i++) {
- dptr[i] = in[i] * in[i];
+void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ for (size_t i = 0; i < num; i++) outPtr[i] = x;
+}
+
+/// *out = in[0] + in[1] + ... + in[num - 1], accumulated left to right in a
+/// single float.
+// TODO(wangwei) optimize using omp
+template <>
+void Sum<float, lang::Cpp>(const size_t num, const Blob *in, float *out,
+ Context *ctx) {
+ const float *src = static_cast<const float *>(in->data());
+ float total = 0.f;
+ for (size_t k = 0; k != num; ++k) {
+ total += src[k];
}
+ *out = total;
}
template <>
-void Add<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
- Blob *ret, Context *ctx) {
- // CHECK_EQ(ctx->stream, nullptr);
- float *dptr = static_cast<float *>(ret->mutable_data());
- const float *lptr = static_cast<const float *>(lhs->data());
- const float *rptr = static_cast<const float *>(rhs->data());
- for (int i = 0; i < count; i++) {
- dptr[i] = lptr[i] + rptr[i];
+void Sign<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float*>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = inPtr[i] > 0 ? 1.0f : 0.0f;
}
}
template <>
-void Add<float, lang::Cpp>(int count, const Blob *input, float x, Blob *ret,
- Context *ctx) {
- float *dptr = static_cast<float *>(ret->mutable_data());
- const float *lptr = static_cast<const float *>(input->data());
- for (int i = 0; i < count; i++) {
- dptr[i] = lptr[i] + x;
+void Exp<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = exp(inPtr[i]);
}
}
template <>
-void Sub<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
- Blob *ret, Context *ctx) {
- // CHECK_EQ(ctx->stream, nullptr);
- float *dptr = static_cast<float *>(ret->mutable_data());
- const float *lptr = static_cast<const float *>(lhs->data());
- const float *rptr = static_cast<const float *>(rhs->data());
- for (int i = 0; i < count; i++) {
- dptr[i] = lptr[i] - rptr[i];
+void Log<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ CHECK_GT(inPtr[i], 0.f);
+ outPtr[i] = log(inPtr[i]);
}
}
-// sum all elements of input into ret
-// TODO(wangwei) optimize using omp
template <>
-void Sum<float, lang::Cpp>(int count, const Blob *input, float *ret,
- Context *ctx) {
- float s = 0.f;
- const float *in = static_cast<const float *>(input->data());
- for (int i = 0; i < count; i++) {
- s += in[i];
+void Sqrt<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ CHECK_GT(inPtr[i], 0.f);
+ outPtr[i] = sqrt(inPtr[i]);
}
- *ret = s;
}
template <>
-void EltwiseMult<float, lang::Cpp>(int count, const Blob *input, float x,
- Blob *ret, Context *ctx) {
- float *dptr = static_cast<float *>(ret->mutable_data());
- const float *lptr = static_cast<const float *>(input->data());
- for (int i = 0; i < count; i++) {
- dptr[i] = lptr[i] * x;
+void Square<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = inPtr[i] * inPtr[i];
}
}
template <>
-void EltwiseMult<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
- Blob *ret, Context *ctx) {
- float *dptr = static_cast<float *>(ret->mutable_data());
- const float *lptr = static_cast<const float *>(lhs->data());
- const float *rptr = static_cast<const float *>(rhs->data());
- for (int i = 0; i < count; i++) {
- dptr[i] = lptr[i] * rptr[i];
+void Tanh<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = tanh(inPtr[i]);
}
}
template <>
-void Exp<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
- Context *ctx) {
- float *dptr = static_cast<float *>(ret->mutable_data());
- const float *lptr = static_cast<const float *>(input->data());
- for (int i = 0; i < count; i++) {
- dptr[i] = exp(lptr[i]);
+void ReLU<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = (inPtr[i] >= 0.f) ? inPtr[i] : 0.f;
}
}
template <>
-void Log<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
- Context *ctx) {
- float *dptr = static_cast<float *>(ret->mutable_data());
- const float *lptr = static_cast<const float *>(input->data());
- for (int i = 0; i < count; i++) {
- CHECK_GT(lptr[i], 0.f);
- dptr[i] = log(lptr[i]);
+void Sigmoid<float, lang::Cpp>(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = 1.f / (1.f + exp(-inPtr[i]));
}
}
template <>
-void Tanh<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
- Context *ctx) {
- float *dptr = static_cast<float *>(ret->mutable_data());
- const float *lptr = static_cast<const float *>(input->data());
- for (int i = 0; i < count; i++) {
- dptr[i] = tanh(lptr[i]);
+void Softmax<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ float *bPtr = new float[ncol];
+ for (size_t r = 0; r < nrow; r++) {
+ size_t offset = r * ncol;
+ float denom = 0.f;
+ for (size_t c = 0; c < ncol; c++) {
+ bPtr[c] = exp(inPtr[offset + c]);
+ denom += bPtr[c];
+ }
+ for (size_t c = 0; c < ncol; c++) {
+ size_t idx = offset + c;
+ outPtr[idx] = bPtr[c] / denom;
+ }
}
+ delete bPtr;
}
template <>
-void ReLU<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
- Context *ctx) {
- float *dptr = static_cast<float *>(ret->mutable_data());
- const float *lptr = static_cast<const float *>(input->data());
- for (int i = 0; i < count; i++) {
- dptr[i] = (lptr[i] >= 0.f) ? lptr[i] : 0.f;
+void SumRows<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t r = 0; r < nrow; r++) {
+ size_t offset = r * ncol;
+ outPtr[r] = 0.f;
+ for (size_t c = 0; c < ncol; c++) {
+ outPtr[r] += inPtr[offset + c];
+ }
+ }
+}
+
+template <>
+void SumColumns<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *in, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t c = 0; c < ncol; c++) {
+ outPtr[c] = 0.f;
+ }
+ for (size_t r = 0; r < nrow; r++) {
+ size_t offset = r * ncol;
+ for (size_t c = 0; c < ncol; c++) {
+ outPtr[c] += inPtr[offset + c];
+ }
+ }
+}
+
+template <>
+void AddRow<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *APtr = static_cast<const float *>(A->data());
+ const float *vPtr = static_cast<const float *>(v->data());
+ for (size_t r = 0; r < nrow; r++) {
+ size_t offset = r * ncol;
+ for (size_t c = 0; c < ncol; c++) {
+ outPtr[offset + c] = APtr[offset + c] + vPtr[c];
+ }
+ }
+}
+
+template <>
+void AddCol<float, lang::Cpp>(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *APtr = static_cast<const float *>(A->data());
+ const float *vPtr = static_cast<const float *>(v->data());
+ for (size_t r = 0; r < nrow; r++) {
+ size_t offset = r * ncol;
+ for (size_t c = 0; c < ncol; c++) {
+ outPtr[offset + c] = APtr[offset + c] + vPtr[r];
+ }
+ }
+}
+
+template <>
+void Pow<float, lang::Cpp>(const size_t num, const Blob *in, const float x, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = pow(inPtr[i], x);
}
}
template <>
-void Sigmoid<float, lang::Cpp>(int count, const Blob *input, Blob *ret,
- Context *ctx) {
- float *dptr = static_cast<float *>(ret->mutable_data());
- const float *lptr = static_cast<const float *>(input->data());
- for (int i = 0; i < count; i++) {
- dptr[i] = 1.f / (1.f + exp(-lptr[i]));
+void Pow<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
+ float *outPtr= static_cast<float *>(out->mutable_data());
+ const float *in1Ptr= static_cast<const float *>(in1->data());
+ const float *in2Ptr = static_cast<const float *>(in2->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = pow(in1Ptr[i], in2Ptr[i]);
}
}
template <>
-void Pow<float, lang::Cpp>(int count, const Blob *input, float x, Blob *ret,
- Context *ctx) {
- float *dptr = static_cast<float *>(ret->mutable_data());
- const float *lptr = static_cast<const float *>(input->data());
- for (int i = 0; i < count; i++) {
- dptr[i] = pow(lptr[i], x);
+void Clamp<float, lang::Cpp>(const size_t num, const float low, const float high, const Blob *in,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ if (inPtr[i] > high) {
+ outPtr[i] = high;
+ }
+ else if (inPtr[i] < low) {
+ outPtr[i] = low;
+ }
+ else {
+ outPtr[i] = inPtr[i];
+ }
+ }
+}
+
+template <>
+void Add<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = inPtr[i] + x;
}
}
template <>
-void Pow<float, lang::Cpp>(int count, const Blob *lhs, const Blob *rhs,
- Blob *ret, Context *ctx) {
- float *dptr = static_cast<float *>(ret->mutable_data());
- const float *lptr = static_cast<const float *>(lhs->data());
- const float *rptr = static_cast<const float *>(rhs->data());
- for (int i = 0; i < count; i++) {
- dptr[i] = pow(lptr[i], rptr[i]);
+void Add<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
+ // CHECK_EQ(ctx->stream, nullptr);
+ float *outPtr= static_cast<float *>(out->mutable_data());
+ const float *in1Ptr = static_cast<const float *>(in1->data());
+ const float *in2Ptr = static_cast<const float *>(in2->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = in1Ptr[i] + in2Ptr[i];
}
}
template <>
-void Bernoulli<float, lang::Cpp>(int count, float p, Blob *ret, Context *ctx) {
- std::bernoulli_distribution distribution(p);
- float *ptr = static_cast<float *>(ret->mutable_data());
- for (int i = 0; i < count; i++) {
- ptr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
+void Sub<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
+ // CHECK_EQ(ctx->stream, nullptr);
+ float *outPtr= static_cast<float *>(out->mutable_data());
+ const float *in1Ptr = static_cast<const float *>(in1->data());
+ const float *in2Ptr = static_cast<const float *>(in2->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = in1Ptr[i] - in2Ptr[i];
}
}
template <>
-void Uniform<float, lang::Cpp>(int count, float low, float high, Blob *ret,
- Context *ctx) {
- std::uniform_real_distribution<float> distribution(low, high);
- float *ptr = static_cast<float *>(ret->mutable_data());
- for (int i = 0; i < count; i++) {
- ptr[i] = static_cast<float>(distribution(ctx->random_generator));
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+ Blob *out, Context *ctx) {
+ float *outPtr= static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = inPtr[i] * x;
}
}
template <>
-void Gaussian<float, lang::Cpp>(int count, float mean, float std, Blob *ret,
- Context *ctx) {
- std::normal_distribution<float> distribution(mean, std);
- float *ptr = static_cast<float *>(ret->mutable_data());
- for (int i = 0; i < count; i++) {
- ptr[i] = static_cast<float>(distribution(ctx->random_generator));
+void EltwiseMult<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
+ float *outPtr= static_cast<float *>(out->mutable_data());
+ const float *in1Ptr = static_cast<const float *>(in1->data());
+ const float *in2Ptr = static_cast<const float *>(in2->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = in1Ptr[i] * in2Ptr[i];
}
}
-// follow the consistency guide of math API
template <>
-void Div<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
- Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
+void Div<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
+ float *outPtr= static_cast<float *>(out->mutable_data());
+ const float *in1Ptr = static_cast<const float *>(in1->data());
+ const float *in2Ptr = static_cast<const float *>(in2->data());
+ for (size_t i = 0; i < num; i++) {
+ CHECK_NE(in2Ptr[i],0.f);
+ outPtr[i] = in1Ptr[i] / in2Ptr[i];
+ }
+}
+
+template <>
+void Div<float, lang::Cpp>(const size_t num, const float x, const Blob *in,
+ Blob *out, Context *ctx) {
+ float *outPtr= static_cast<float *>(out->mutable_data());
const float *inPtr = static_cast<const float *>(in->data());
- for (size_t i = 0; i < num; i++) outPtr[i] = alpha / inPtr[i];
+ for (size_t i = 0; i < num; i++) {
+ CHECK_NE(inPtr[i],0.f);
+ outPtr[i] = x / inPtr[i];
+ }
}
+
+template <>
+void Outer<float, lang::Cpp>(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
+ float *outPtr= static_cast<float *>(out->mutable_data());
+ const float *in1Ptr = static_cast<const float *>(in1->data());
+ const float *in2Ptr = static_cast<const float *>(in2->data());
+ for (size_t r = 0; r < m ; r++) {
+ size_t offset = r * n;
+ for (size_t c = 0; c < n; c++) {
+ outPtr[offset + c] = in1Ptr[r] * in2Ptr[c];
+ }
+ }
+}
+
template <>
void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
Blob *out, Context *ctx) {
@@ -227,6 +349,125 @@ void LT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
}
template <>
+void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+ }
+}
+
+template <>
+void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+ }
+}
+
+template <>
+void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+ }
+}
+
+template <>
+void Amax<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, Context *ctx) {
+ size_t maxPos = 0;
+ float maxVal = 0;
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ if (i == 0) {
+ maxVal = inPtr[i];
+ }
+ else if (inPtr[i] > maxVal) {
+ maxVal = inPtr[i];
+ maxPos = i;
+ }
+ }
+ *out = maxPos;
+}
+
+template <>
+void Amin<float, lang::Cpp>(const size_t num, const Blob *in, size_t *out, Context *ctx) {
+ size_t minPos = 0;
+ float minVal = 0;
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ if (i == 0) {
+ minVal = inPtr[i];
+ }
+ else if (inPtr[i] > minVal) {
+ minVal = inPtr[i];
+ minPos = i;
+ }
+ }
+ *out = minPos;
+}
+
+template <>
+void Asum<float, lang::Cpp>(const size_t num, const Blob *in, float *out, Context *ctx) {
+ float sum = 0;
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ sum += fabs(inPtr[i]);
+ }
+}
+
+template <>
+void Axpy<float, lang::Cpp>(const size_t num, const float alpha, const Blob *in,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float *inPtr = static_cast<const float *>(in->data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] += alpha * inPtr[i];
+ }
+}
+
+template <>
+void Scale<float, lang::Cpp>(const size_t num, const float x, Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ for (size_t i = 0; i < num; i++) {
+ outPtr[i] *= x;
+ }
+}
+
+//template <>
+//void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
+// float *out, Context *ctx) {
+// float sum = 0;
+// const float *in1Ptr = static_cast<const float *>(in1->data());
+// const float *in2Ptr = static_cast<const float *>(in2->data());
+// for (size_t i = 0; i < num; i++) {
+// sum += in1Ptr[i] * in2Ptr[i];
+// }
+//}
+
+template <>
+void GEMV<float, lang::Cpp>(bool trans, const size_t m, const size_t n, const float alpha,
+ const Blob *A, const Blob *v, const float beta,
+ Blob *out, Context *ctx) {
+ float *outPtr = static_cast<float *>(out->mutable_data());
+ const float* APtr = static_cast<const float *>(A->data());
+ const float* vPtr = static_cast<const float *>(v->data());
+ for (size_t r = 0; r < m; r++) {
+ float sum = 0;
+ for (size_t c = 0; c < n; c++) {
+ size_t idx = trans ? c * m + r : r * n + c;
+ sum += APtr[idx] * vPtr[c];
+ }
+ outPtr[r] = alpha * sum + beta * outPtr[r];
+ }
+}
+
+template <>
void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
const size_t ncol, const Blob *M, const Blob *v,
Blob *out, Context *ctx) {
@@ -251,41 +492,35 @@ void DGMM<float, lang::Cpp>(const bool side_right, const size_t nrow,
}
template <>
-void Set<float, lang::Cpp>(const size_t num, const float x, Blob *out,
- Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- for (size_t i = 0; i < num; i++) outPtr[i] = x;
-}
-template <>
-void LE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
- Blob *out, Context *ctx) {
+void Bernoulli<float, lang::Cpp>(const size_t num, const float p, Blob *out, Context *ctx) {
+ std::bernoulli_distribution distribution(p);
float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr[i] <= x) ? 1.f : 0.f;
+ outPtr[i] = distribution(ctx->random_generator) ? 1.0f : 0.0f;
}
}
template <>
-void GT<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
- Blob *out, Context *ctx) {
- float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
+void Uniform<float, lang::Cpp>(const size_t num, const float low, const float high, Blob *out,
+ Context *ctx) {
+ std::uniform_real_distribution<float> distribution(low, high);
+ float *outPtr= static_cast<float *>(out->mutable_data());
for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr[i] > x) ? 1.f : 0.f;
+ outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
}
}
template <>
-void GE<float, lang::Cpp>(const size_t num, const Blob *in, const float x,
- Blob *out, Context *ctx) {
+void Gaussian<float, lang::Cpp>(const size_t num, const float mean, const float std, Blob *out,
+ Context *ctx) {
+ std::normal_distribution<float> distribution(mean, std);
float *outPtr = static_cast<float *>(out->mutable_data());
- const float *inPtr = static_cast<const float *>(in->data());
for (size_t i = 0; i < num; i++) {
- outPtr[i] = (inPtr[i] >= x) ? 1.f : 0.f;
+ outPtr[i] = static_cast<float>(distribution(ctx->random_generator));
}
}
+
#ifdef USE_CBLAS
template <>
void Dot<float, lang::Cpp>(const size_t num, const Blob *in1, const Blob *in2,
@@ -314,7 +549,6 @@ void GEMM<float, lang::Cpp>(const bool transA, const bool transB,
}
#endif // USE_CBLAS
-
} // namespace singa
#endif // SINGA_CORE_TENSOR_TENSOR_MATH_CPP_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/07c49da5/test/singa/test_tensor_math.cc
----------------------------------------------------------------------
diff --git a/test/singa/test_tensor_math.cc b/test/singa/test_tensor_math.cc
index 170b96c..823445f 100644
--- a/test/singa/test_tensor_math.cc
+++ b/test/singa/test_tensor_math.cc
@@ -11,15 +11,277 @@ protected:
b.Reshape(singa::Shape{6});
c.Reshape(singa::Shape{6, 1});
d.Reshape(singa::Shape{3, 2});
+ e.Reshape(singa::Shape{3, 2});
a.CopyDataFromHostPtr<float>(dat1, 6);
b.CopyDataFromHostPtr<float>(dat2, 6);
+ e.CopyDataFromHostPtr<float>(dat1, 6);
}
- Tensor a, b, c, d;
+ Tensor a, b, c, d, e;
const float dat1[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
const float dat2[6] = {1.1f, 2.1f, 3.1f, 4.1f, 5.1f, 6.1f};
};
+TEST_F(TestTensorMath, MemberAbs) {
+ Tensor aa = a.Clone();
+ Tensor bb = b.Clone();
+ Tensor cc = aa - bb;
+ const float* dptr = cc.data<const float*>();
+ EXPECT_NEAR(-0.1, dptr[0], 1e-5);
+ EXPECT_NEAR(-0.1, dptr[1], 1e-5);
+ EXPECT_NEAR(-0.1, dptr[2], 1e-5);
+
+ Tensor p = Abs(cc);
+ const float* dptr1 = p.data<const float*>();
+ EXPECT_NEAR(0.1, dptr1[0], 1e-5);
+ EXPECT_NEAR(0.1, dptr1[1], 1e-5);
+ EXPECT_NEAR(0.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberExp) {
+ Tensor p = Exp(a);
+ const float* dptr1 = p.data<const float*>();
+ EXPECT_NEAR(exp(1.0f), dptr1[0], 1e-5);
+ EXPECT_NEAR(exp(2.0f), dptr1[1], 1e-5);
+ EXPECT_NEAR(exp(3.0f), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberLog) {
+ Tensor p = Log(a);
+ const float* dptr1 = p.data<const float*>();
+ EXPECT_NEAR(log(1.0f), dptr1[0], 1e-5);
+ EXPECT_NEAR(log(2.0f), dptr1[1], 1e-5);
+ EXPECT_NEAR(log(3.0f), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberReLU) {
+ Tensor aa = a.Clone();
+ Tensor cc = aa - 2.0f;
+ const float* dptr = cc.data<const float*>();
+ EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+ EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+ EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+
+ Tensor p = ReLU(cc);
+ const float* dptr1 = p.data<const float*>();
+ EXPECT_NEAR(0.0f, dptr1[0], 1e-5);
+ EXPECT_NEAR(0.0f, dptr1[1], 1e-5);
+ EXPECT_NEAR(1.0f, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSigmoid) {
+ Tensor p = Sigmoid(a);
+ const float* dptr1 = p.data<const float*>();
+ EXPECT_NEAR(1.0f/(1.0f + exp(-1.0f)), dptr1[0], 1e-5);
+ EXPECT_NEAR(1.0f/(1.0f + exp(-2.0f)), dptr1[1], 1e-5);
+ EXPECT_NEAR(1.0f/(1.0f + exp(-3.0f)), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSign) {
+ Tensor aa = a.Clone();
+ Tensor cc = aa - 2.0f;
+ const float* dptr = cc.data<const float*>();
+ EXPECT_NEAR(-1.0f, dptr[0], 1e-5);
+ EXPECT_NEAR(0.0f, dptr[1], 1e-5);
+ EXPECT_NEAR(1.0f, dptr[2], 1e-5);
+
+ Tensor p = Sign(cc);
+ const float* dptr1 = p.data<const float*>();
+ EXPECT_EQ(0.0f, dptr1[0]);
+ EXPECT_EQ(0.0f, dptr1[1]);
+ EXPECT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberSqrt) {
+ Tensor p = Sqrt(a);
+ const float* dptr1 = p.data<const float*>();
+ EXPECT_NEAR(sqrt(1.0), dptr1[0], 1e-5);
+ EXPECT_NEAR(sqrt(2.0), dptr1[1], 1e-5);
+ EXPECT_NEAR(sqrt(3.0), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberSquare) {
+ Tensor p = Square(a);
+ const float* dptr1 = p.data<const float*>();
+ EXPECT_NEAR(1.0, dptr1[0], 1e-5);
+ EXPECT_NEAR(4.0, dptr1[1], 1e-5);
+ EXPECT_NEAR(9.0, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberTanh) {
+ Tensor p = Tanh(a);
+ const float* dptr1 = p.data<const float*>();
+ EXPECT_NEAR(tanh(1.0), dptr1[0], 1e-5);
+ EXPECT_NEAR(tanh(2.0), dptr1[1], 1e-5);
+ EXPECT_NEAR(tanh(3.0), dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, Sum) {
+ Tensor p1(Shape{1,2});
+ p1 = Sum(e, 0);
+ const float *dptr1 = p1.data<const float *>();
+ EXPECT_FLOAT_EQ(9.0f,dptr1[0]);
+ EXPECT_FLOAT_EQ(12.0f,dptr1[1]);
+
+ Tensor p2(Shape{3,1});
+ p2 = Sum(e, 1);
+ const float *dptr2 = p2.data<const float *>();
+ EXPECT_FLOAT_EQ(3.0f,dptr2[0]);
+ EXPECT_FLOAT_EQ(7.0f,dptr2[1]);
+ EXPECT_FLOAT_EQ(11.0f,dptr2[2]);
+}
+
+TEST_F(TestTensorMath, SoftMax) {
+ Tensor p1(Shape{3,2});
+ p1 = SoftMax(e,0);
+ const float *dptr1 = p1.data<const float *>();
+ float sum = 0;
+ for(int i = 0; i < 6; i++) sum += exp(i+1);
+ EXPECT_NEAR(exp(1)/sum, dptr1[0],1e-5);
+ EXPECT_NEAR(exp(3)/sum, dptr1[2],1e-5);
+ EXPECT_NEAR(exp(5)/sum, dptr1[4],1e-5);
+ EXPECT_NEAR(exp(2)/sum, dptr1[1],1e-5);
+ EXPECT_NEAR(exp(4)/sum, dptr1[3],1e-5);
+ EXPECT_NEAR(exp(6)/sum, dptr1[5],1e-5);
+
+ Tensor p2(Shape{3,2});
+ p2 = SoftMax(e,1);
+ const float *dptr2 = p2.data<const float *>();
+ EXPECT_NEAR(exp(1)/(exp(1)+exp(2)),dptr2[0], 1e-5);
+ EXPECT_NEAR(exp(2)/(exp(1)+exp(2)),dptr2[1], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberLT) {
+ Tensor p1 = a < 2.0f;
+ const float *dptr1 = p1.data<const float *>();
+ EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+ EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+ EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberLE) {
+ Tensor p1 = a <= 2.0f;
+ const float *dptr1 = p1.data<const float *>();
+ EXPECT_FLOAT_EQ(1.0f, dptr1[0]);
+ EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+ EXPECT_FLOAT_EQ(0.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberGT) {
+ Tensor p1 = a > 2.0f;
+ const float *dptr1 = p1.data<const float *>();
+ EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+ EXPECT_FLOAT_EQ(0.0f, dptr1[1]);
+ EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberGE) {
+ Tensor p1 = a >= 2.0f;
+ const float *dptr1 = p1.data<const float *>();
+ EXPECT_FLOAT_EQ(0.0f, dptr1[0]);
+ EXPECT_FLOAT_EQ(1.0f, dptr1[1]);
+ EXPECT_FLOAT_EQ(1.0f, dptr1[2]);
+}
+
+TEST_F(TestTensorMath, MemberPow) {
+ Tensor p1 = Pow(b,3.0f);
+ const float *dptr1 = p1.data<const float *>();
+ EXPECT_FLOAT_EQ(pow(1.1f,3.0f), dptr1[0]);
+ EXPECT_FLOAT_EQ(pow(2.1f,3.0f), dptr1[1]);
+ EXPECT_FLOAT_EQ(pow(3.1f,3.0f), dptr1[2]);
+
+ //TODO(Yuchen): check pow(tensor a, tensor b) and add testcase after the function is complete
+ //Tensor p2 = Pow(a,b);
+ //const float *dptr2 = p2.data<const float *>();
+ //EXPECT_FLOAT_EQ(pow(1.0f,1.1f), dptr2[0]);
+ //EXPECT_FLOAT_EQ(pow(2.0f,2.1f), dptr2[1]);
+ //EXPECT_FLOAT_EQ(pow(3.0f,3.1f), dptr2[2]);
+}
+
+
+TEST_F(TestTensorMath, MemberSub) {
+ Tensor p1 = a - b;
+ const float* dptr1 = p1.data<const float*>();
+ EXPECT_NEAR(-0.1, dptr1[0], 1e-5);
+ EXPECT_NEAR(-0.1, dptr1[1], 1e-5);
+ EXPECT_NEAR(-0.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberEltwiseMult) {
+ Tensor p1 = a * b;
+ const float* dptr1 = p1.data<const float*>();
+ EXPECT_NEAR(1.0*1.1, dptr1[0], 1e-5);
+ EXPECT_NEAR(2.0*2.1, dptr1[1], 1e-5);
+ EXPECT_NEAR(3.0*3.1, dptr1[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberDiv) {
+ Tensor p1 = a / b;
+ const float* dptr1 = p1.data<const float*>();
+ EXPECT_NEAR(1.0/1.1, dptr1[0], 1e-5);
+ EXPECT_NEAR(2.0/2.1, dptr1[1], 1e-5);
+ EXPECT_NEAR(3.0/3.1, dptr1[2], 1e-5);
+
+ Tensor p2 = Div(10.0f,b);
+ const float* dptr2 = p2.data<const float*>();
+ EXPECT_NEAR(10.0/1.1, dptr2[0], 1e-5);
+ EXPECT_NEAR(10.0/2.1, dptr2[1], 1e-5);
+ EXPECT_NEAR(10.0/3.1, dptr2[2], 1e-5);
+
+ Tensor p3 = a / 8.0f;
+ const float* dptr3 = p3.data<const float*>();
+ EXPECT_NEAR(1.0/8.0, dptr3[0], 1e-5);
+ EXPECT_NEAR(2.0/8.0, dptr3[1], 1e-5);
+ EXPECT_NEAR(3.0/8.0, dptr3[2], 1e-5);
+}
+
+TEST_F(TestTensorMath, MemberBernoulli) {
+ Tensor p1(Shape{10000});
+ Bernoulli(0.3,&p1);
+ const float* dptr1 = p1.data<const float*>();
+ float sum = 0;
+ for(int i = 0; i < 10000; i++) sum += dptr1[i];
+ float mean = sum/10000;
+ EXPECT_NEAR(mean, 0.3, 1e-2);
+
+ sum = 0;
+ for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
+ float variance = sum/9999;
+ EXPECT_NEAR(variance, 0.3*0.7, 1e-2);
+}
+
+TEST_F(TestTensorMath, MemberUniform) {
+ Tensor p1(Shape{10000});
+ Uniform(0.1f,0.2f,&p1);
+ const float* dptr1 = p1.data<const float*>();
+ float sum = 0;
+ for(int i = 0; i < 10000; i++) sum += dptr1[i];
+ float mean = sum/10000;
+ EXPECT_NEAR(mean, 0.15f, 1e-3);
+
+ sum = 0;
+ for(int i = 0; i < 10000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
+ float variance = sum/9999;
+ EXPECT_NEAR(variance, 0.01f/12, 1e-3);
+}
+
+TEST_F(TestTensorMath, MemberGaussian) {
+ Tensor p1(Shape{50000});
+ Gaussian(0.0,1.0,&p1);
+ const float* dptr1 = p1.data<const float*>();
+ float sum = 0;
+ for(int i = 0; i < 50000; i++) sum += dptr1[i];
+ float mean = sum/50000;
+ EXPECT_NEAR(mean, 0.0, 1e-2);
+
+ sum = 0;
+ for(int i = 0; i < 50000; i++) sum += (dptr1[i]-mean)*(dptr1[i]-mean);
+ float variance = sum/49999;
+ EXPECT_NEAR(variance, 1.0, 1e-2);
+}
+
+
+
TEST_F(TestTensorMath, MemberAddTensor) {
Tensor aa = a.Clone();
aa += a;
[4/5] incubator-singa git commit: SINGA-182 Clean math function APIs
and implementations
Posted by zh...@apache.org.
SINGA-182 Clean math function APIs and implementations
Clean tensor.h/.cc, tensor_math.h, and tensor_math_cpp.h:
re-order the functions by (type, name), where type is one of a) element-wise
function, b) matrix function, c) random function, or d) BLAS function.
Implement GEMV using CBLAS and cuBLAS.
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/564c88ad
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/564c88ad
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/564c88ad
Branch: refs/heads/dev
Commit: 564c88ad95e976e6067198c832f4fcd9a8878cd7
Parents: 07c49da
Author: wangwei <wa...@gmail.com>
Authored: Fri Jun 10 23:12:09 2016 +0800
Committer: Wei Wang <wa...@comp.nus.edu.sg>
Committed: Sun Jun 12 12:15:11 2016 +0800
----------------------------------------------------------------------
include/singa/core/tensor.h | 396 +++++++++---------
src/core/tensor/tensor.cc | 688 ++++++++++++++++----------------
src/core/tensor/tensor_math.h | 336 ++++++++--------
src/core/tensor/tensor_math_cpp.h | 640 +++++++++++++++--------------
src/core/tensor/tensor_math_cuda.h | 158 ++++----
test/singa/test_tensor_math.cc | 15 +-
6 files changed, 1131 insertions(+), 1102 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/include/singa/core/tensor.h
----------------------------------------------------------------------
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index bb8d7f8..82bbe81 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -32,17 +32,6 @@ using std::tuple;
namespace singa {
typedef vector<size_t> Shape;
-typedef Shape::iterator ShapeIter;
-inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
- if (len == 0)
- len = shape.size();
- CHECK_LE(len, shape.size());
- size_t v = 1;
- for (unsigned int i = start; i < len; i++)
- v *= shape[i];
- return v;
-}
-
/// hardcode the width of types defined in DataType
const size_t kDataWidth[] = {sizeof(float), sizeof(float) / 2, sizeof(int),
sizeof(char), sizeof(double)};
@@ -65,10 +54,10 @@ class Tensor {
public:
~Tensor();
Tensor();
- explicit Tensor(Shape &&shape, DataType dtype = kFloat32);
- explicit Tensor(const Shape &shape, DataType dtype = kFloat32);
- Tensor(Shape &&shape, Device *dev, DataType dtype = kFloat32);
- Tensor(const Shape &shape, Device *dev, DataType dtype = kFloat32);
+ explicit Tensor(Shape &&shape, const DataType dtype = kFloat32);
+ explicit Tensor(const Shape &shape, const DataType dtype = kFloat32);
+ Tensor(Shape &&shape, Device *dev, const DataType dtype = kFloat32);
+ Tensor(const Shape &shape, Device *dev, const DataType dtype = kFloat32);
/// Copy Tensor to share the internal data. No deep copy.
Tensor(const Tensor &from);
@@ -82,10 +71,10 @@ class Tensor {
Device *device() const { return device_; }
- /// Return immutable Tensor values with given type.
- template <typename DType>
- DType data() const {
- return static_cast<DType>(blob()->data());
+ /// return immutable Tensor values with given type.
+ template <typename SType>
+ SType data() const {
+ return static_cast<SType>(blob()->data());
}
/// data type, including kFloat16, kFloat32, kInt
@@ -93,7 +82,7 @@ class Tensor {
const Shape &shape() const { return shape_; }
- const size_t shape(size_t idx) const {
+ const size_t shape(const size_t idx) const {
CHECK_LT(idx, shape_.size());
return shape_.at(idx);
}
@@ -102,13 +91,13 @@ class Tensor {
bool transpose() const { return transpose_; }
- /// Return number of total elements
+ /// return number of total elements
size_t Size() const {
CHECK_EQ(blob_->size() % SizeOf(data_type_), 0u);
return blob_->size() / SizeOf(data_type_);
}
- /// Return memory size (i.e., Bytes)
+ /// return memory size (i.e., Bytes)
size_t MemSize() const { return blob_->size(); }
/// Reset the tensor shape, it may reallocate blob, if MemSize() changes.
@@ -121,7 +110,7 @@ class Tensor {
void ResetLike(const Tensor &t);
/// Reset the data type, it would reallocate blob if type changes.
- void AsType(DataType type);
+ void AsType(const DataType type);
/// Reset the device.
/// If the target device is a diff device, then do deep data copy.
@@ -135,14 +124,14 @@ class Tensor {
void SetValue(const SType x);
/// For init the tensor values, copy 'num' elements.
- template <typename DType>
- void CopyDataFromHostPtr(const DType *src, size_t num);
+ template <typename SType>
+ void CopyDataFromHostPtr(const SType *src, const size_t num);
/// Copy data from another Tensor which may be on a diff device.
/// Meta data would not be copied!
void CopyData(const Tensor &other);
- /// Return an exactly the same Tensor with data been deep copied.
+ /// return an exactly the same Tensor with data been deep copied.
Tensor Clone() const;
// Tensor operations
@@ -152,42 +141,37 @@ class Tensor {
Tensor T() const;
/// Copy the meta info with data blob shared.
- Tensor &operator=(const Tensor &t);
+ Tensor &operator=(const Tensor &in);
/// Copy the meta info with data blob shared.
- Tensor &operator=(Tensor &&t);
+ Tensor &operator=(Tensor &&in);
- Tensor &operator+=(const Tensor &t);
- // void operator+=(Tensor&& t);
- Tensor &operator-=(const Tensor &t);
- // void operator-=(Tensor&& t);
- Tensor &operator*=(const Tensor &t);
- // void operator*=(Tensor&& t);
- Tensor &operator/=(const Tensor &t);
- // void operator/=(Tensor&& t);
+ Tensor &operator+=(const Tensor &in);
+ // void operator+=(Tensor&& in);
+ Tensor &operator-=(const Tensor &in);
+ // void operator-=(Tensor&& in);
+ Tensor &operator*=(const Tensor &in);
+ // void operator*=(Tensor&& in);
+ Tensor &operator/=(const Tensor &in);
+ // void operator/=(Tensor&& in);
// Scalar operations.
- /// T is a scalar type
- template <typename DType>
- Tensor &operator+=(DType x);
-
- /// T is a scalar type
- template <typename DType>
- Tensor &operator-=(const DType x);
+ /// SType is a scalar type
+ template <typename SType>
+ Tensor &operator+=(const SType x);
- /// T is a scalar type
- template <typename DType>
- Tensor &operator*=(const DType x);
+ /// SType is a scalar type
+ template <typename SType>
+ Tensor &operator-=(const SType x);
- /// T is a scalar type
- template <typename DType>
- Tensor &operator/=(const DType x);
+ /// SType is a scalar type
+ template <typename SType>
+ Tensor &operator*=(const SType x);
- /// save Tensor into a proto msg
- // void ToProto(TensorProto* t);
- /// load Tensor from proto msg
- // void FromProto(const TensorProto& t);
+ /// SType is a scalar type
+ template <typename SType>
+ Tensor &operator/=(const SType x);
protected:
bool transpose_ = false;
@@ -196,14 +180,29 @@ class Tensor {
/// Note: blob_ is allocated in lazy manner to avoid frequent malloc/free.
/// If you want to get an allocated Blob, use blob() instead of blob_.
Blob *blob_ = nullptr;
- Shape shape_;
+ Shape shape_ = {};
};
+typedef Shape::iterator ShapeIter;
+inline size_t Product(const Shape &shape, int start = 0, size_t len = 0) {
+ if (len == 0) len = shape.size();
+ CHECK_LE(len, shape.size());
+ size_t v = 1;
+ for (unsigned int i = start; i < len; i++) v *= shape[i];
+ return v;
+}
+
inline void CheckDataTypeAndLang(const Tensor &in1, const Tensor &in2) {
CHECK_EQ(in1.data_type(), in2.data_type());
CHECK_EQ(in1.device()->lang(), in2.device()->lang());
}
+template <typename FromType, typename ToType>
+ToType TypeCast(const FromType &x) {
+ // TODO(wangwei) cast fp16; prevent some casts, e.g., float to char
+ return static_cast<ToType>(x);
+}
+
Tensor Reshape(const Tensor &in, const Shape &s);
Tensor Reshape(const Tensor &in, Shape &&s);
@@ -212,192 +211,171 @@ Tensor Reshape(const Tensor &in, Shape &&s);
/// Copy 'num' elements of src to dst.
/// The first 'src_offset' ('dst_offset') elements will be skipped.
-void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
- size_t src_offset = 0, size_t dst_offset = 0);
-
-// ==================Simple Linear Algebra Operations=========================
-Tensor Abs(const Tensor &t);
-Tensor Exp(const Tensor &t);
-Tensor Log(const Tensor &t);
-Tensor ReLU(const Tensor &t);
-Tensor Sigmoid(const Tensor &t);
-Tensor Sign(const Tensor &t);
-Tensor Sqrt(const Tensor &t);
-Tensor Square(const Tensor &t);
-Tensor Tanh(const Tensor &t);
+void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
+ const size_t src_offset = 0, const size_t dst_offset = 0);
+
+// =============Element-wise operations====================================
+Tensor Abs(const Tensor &in);
+Tensor Exp(const Tensor &in);
+Tensor Log(const Tensor &in);
+Tensor ReLU(const Tensor &in);
+Tensor Sigmoid(const Tensor &in);
+Tensor Sign(const Tensor &in);
+Tensor Sqrt(const Tensor &in);
+Tensor Square(const Tensor &in);
+Tensor Tanh(const Tensor &in);
+
+/// Element-wise operation, out[i]=in[i]^x
+template <typename SType>
+Tensor Pow(const Tensor &in, const SType x);
+/// Element-wise operation, out[i]=in[i]^x
+template <typename SType>
+void Pow(const Tensor &in, const SType x, Tensor *out);
+/// Element-wise operation, out[i]=base[i]^exp[i]
+Tensor Pow(const Tensor &base, const Tensor &exp);
+/// Element-wise operation, out[i]=base[i]^exp[i]
+void Pow(const Tensor &base, const Tensor &exp, Tensor *out);
+/// Element-wise operation, out[i]= (in[i] < x) ? 1.f : 0.f
template <typename SType>
-SType Sum(const Tensor &t);
-/// Sum elements in the Tensor, currently only support vector and matrix.
-/// if 'axis' is 0, sum all rows into a single row
-/// if 'axis' is 1, sum all columns into a single column
-/// TODO(wangwei) support arbitrary Tensor like numpy.sum
-Tensor Sum(const Tensor &t, int axis);
+Tensor operator<(const Tensor &in, const SType x);
+template <typename SType>
+void LT(const Tensor &in, const SType x, Tensor *out);
-/// Average elements in the Tensor, currently only support vector and matrix.
-/// if 'axis' is 0, average all rows into a single row
-/// if 'axis' is 1, average all columns into a single column
-/// TODO(wangwei) support arbitrary Tensor like numpy.average
-Tensor Average(const Tensor &t, int axis);
-/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis-1] rows,
-/// and shape_[axis]*...*shape_[nDim()] columns.
-/// and do softmax along each row.
-Tensor SoftMax(const Tensor &t, int axis = 0);
-void SoftMax(const Tensor &t, int axis, Tensor *ret);
+/// Element-wise operation, out[i]= (in[i] <= x) ? 1.f : 0.f
+template <typename SType>
+Tensor operator<=(const Tensor &in, const SType x);
+template <typename SType>
+void LE(const Tensor &in, const SType x, Tensor *out);
-/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis] rows,
-/// and shape_[axis+1]*...*shape_[nDim()] columns.
-/// and do softmax along each row.
-// Tensor Softmax(const Tensor& t, int axis = -1);
-// void Softmax(const Tensor& t, Tensor* ret, int axis = -1);
-
-/// Element-wise operation, ret[i]= (t[i] < x) ? 1.f : 0.f
-template <typename DType>
-Tensor operator<(const Tensor &t, const DType x);
-template <typename DType>
-void LT(const Tensor &t, DType x, Tensor *ret);
-
-/// Element-wise operation, ret[i]= (t[i] <= x) ? 1.f : 0.f
-template <typename DType>
-Tensor operator<=(const Tensor &t, const DType x);
-template <typename DType>
-void LE(const Tensor &t, DType x, Tensor *ret);
-
-/// Element-wise operation, ret[i]= (t[i] > x) ? 1.f : 0.f
-template <typename DType>
-Tensor operator>(const Tensor &t, const DType x);
-template <typename DType>
-void GT(const Tensor &t, DType x, Tensor *ret);
-
-/// Element-wise operation, ret[i]= (t[i] >= x) ? 1.f : 0.f
-template <typename DType>
-Tensor operator>=(const Tensor &t, const DType x);
-template <typename DType>
-void GE(const Tensor &t, DType x, Tensor *ret);
-
-/// Element-wise opeartion, ret[i]=t[i]^x
-template <typename DType>
-Tensor Pow(const Tensor &t, DType x);
-/// Element-wise opeartion, ret[i]=t[i]^x
-template <typename DType>
-void Pow(const Tensor &t, DType x, Tensor *ret);
-/// Element-wise opeartion, ret[i]=baes[i]^exp[i]
-Tensor Pow(const Tensor &base, Tensor exp);
-/// Element-wise opeartion, ret[i]=baes[i]^exp[i]
-void Pow(const Tensor &base, const Tensor &exp, Tensor *ret);
+/// Element-wise operation, out[i]= (in[i] > x) ? 1.f : 0.f
+template <typename SType>
+Tensor operator>(const Tensor &in, const SType x);
+template <typename SType>
+void GT(const Tensor &in, const SType x, Tensor *out);
+
+/// Element-wise operation, out[i]= (in[i] >= x) ? 1.f : 0.f
+template <typename SType>
+Tensor operator>=(const Tensor &in, const SType x);
+template <typename SType>
+void GE(const Tensor &in, const SType x, Tensor *out);
Tensor operator+(const Tensor &lhs, const Tensor &rhs);
-void Add(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+void Add(const Tensor &lhs, const Tensor &rhs, Tensor *out);
Tensor operator-(const Tensor &lhs, const Tensor &rhs);
-void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+void Sub(const Tensor &lhs, const Tensor &rhs, Tensor *out);
Tensor operator*(const Tensor &lhs, const Tensor &rhs);
-void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+void EltwiseMult(const Tensor &lhs, const Tensor &rhs, Tensor *out);
Tensor operator/(const Tensor &lhs, const Tensor &rhs);
-void Div(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
+void Div(const Tensor &lhs, const Tensor &rhs, Tensor *out);
-template <typename DType>
-Tensor operator+(const Tensor &t, DType x);
-template <typename DType>
-void Add(const Tensor &t, DType x, Tensor *ret);
-
-template <typename DType>
-Tensor operator-(const Tensor &t, DType x);
-template <typename DType>
-void Sub(const Tensor &t, DType x, Tensor *ret);
-
-template <typename DType>
-Tensor operator*(const Tensor &t, DType x);
-template <typename DType>
-void EltwiseMult(const Tensor &t, DType x, Tensor *ret);
-
-template <typename DType>
-Tensor operator/(const Tensor &t, DType x);
-template <typename DType>
-void Div(const Tensor &t, DType x, Tensor *ret);
+template <typename SType>
+Tensor operator+(const Tensor &in, const SType x);
+template <typename SType>
+void Add(const Tensor &in, const SType x, Tensor *out);
-// ================Blas operations============================================
-// We fix the scalar argument type to be float.
+template <typename SType>
+Tensor operator-(const Tensor &in, const SType x);
+template <typename SType>
+void Sub(const Tensor &in, const SType x, Tensor *out);
-// ===== Level 1
-// TODO(wangwei) make amax/amin/asum a member function of tensor
-// void Amax(Tensor, Context* ctx); Get the index of the max value in a vector
-// void Asum(Tensor Context* ctx);
+template <typename SType>
+Tensor operator*(const Tensor &in, const SType x);
+template <typename SType>
+void EltwiseMult(const Tensor &in, const SType x, Tensor *out);
-// template <typename DType>
-// void Axpy(DType x, const Blob& t, Blob* ret, Context* ctx);
+/// For each element e of Tensor 'in', compute e / x
+template <typename SType>
+Tensor operator/(const Tensor &in, const SType x);
+/// For each element e of Tensor 'in', compute e / x into out
+template <typename SType>
+void Div(const Tensor &in, const SType x, Tensor *out);
-/// Do matrix vector multipication or matrix matrix multiplication depdending
-/// on the Tensor shape. result = A * B
-Tensor Mult(const Tensor &A, const Tensor &B);
-/// Do matrix vector multipication or matrix matrix multiplication depdending
-/// on the Tensor shape. C = A * B
-void Mult(const Tensor &A, const Tensor &B, Tensor *C);
+/// For each element e of Tensor 'in', compute x/e
+template <typename SType>
+Tensor Div(const SType x, const Tensor &in);
+/// For each element e of Tensor 'in', compute x/e into 'out'
+template <typename SType>
+void Div(const SType x, const Tensor &in, Tensor *out);
-/// Do matrix vector multipication or matrix matrix multiplication depdending
-/// on the Tensor shape. ret = alpha lhs * rhs + beta * ret
-void Mult(const float alpha, const Tensor &lhs, const Tensor &rhs,
- const float beta, Tensor *C);
+template <typename SType>
+SType Sum(const Tensor &in);
-// ================Random operations==========================================
-/// For each element x set x = 1 if random() < p; otherwise x = 1.
-void Bernoulli(float p, Tensor *t);
-/// Fill in Tensor 't' following uniform distribution.
-void Uniform(float low, float high, Tensor *t);
-/// Fill in Tensor 't' following Gaussian distribution.
-void Gaussian(float mean, float std, Tensor *t);
+// ============Matrix (row/column) operations==================================
+/// Average elements in the Tensor, currently only support vector and matrix.
+/// if 'axis' is 0, average all rows into a single row
+/// if 'axis' is 1, average all columns into a single column
+/// TODO(wangwei) support arbitrary Tensor like numpy.average
+Tensor Average(const Tensor &in, const int axis);
+/// Sum elements in the Tensor, currently only support vector and matrix.
+/// if 'axis' is 0, sum all rows into a single row
+/// if 'axis' is 1, sum all columns into a single column
+/// TODO(wangwei) support arbitrary Tensor like numpy.sum
+Tensor Sum(const Tensor &in, const int axis);
+/// Regarding the internal data as 2d, with shape_[0]*...*shape_[axis-1] rows,
+/// and shape_[axis]*...*shape_[nDim()] columns.
+/// and do softmax along each row.
+Tensor SoftMax(const Tensor &in, const int axis = 0);
+void SoftMax(const Tensor &in, const int axis, Tensor *out);
-// follow the consistency guide
-// https://issues.apache.org/jira/browse/SINGA-182
-// ============Matrix vector operations=======================================
/// Add column 'v' with each column of matrix M
void AddColumn(const Tensor &v, Tensor *M);
-void AddColumn(const float alpha, const float beta, const Tensor &v,
+/// For each column 'c' of matrix out, do c=alpha*v + beta*c
+template <typename SType>
+void AddColumn(const SType alpha, const SType beta, const Tensor &v,
Tensor *out);
-/// Sub column 'v' by each column of matrix M
-void SubColumn(const Tensor &v, Tensor *M);
-/// Multiply column 'v' and each column of matrix M; write results into 'out'
-void MultColumn(const Tensor &v, Tensor *M);
-/// Divide column 'v' by each column of matrix M; write results into 'out'
-void DivColumn(const Tensor &v, Tensor *M);
-
/// Add row 'v' with each row of matrix M; write results into 'out'
void AddRow(const Tensor &v, Tensor *out);
-void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M);
-/// Sub row 'v' by each row of matrix M; write results into 'out'
-void SubRow(const Tensor &v, Tensor *M);
-/// Multiply row 'v' with each row of matrix M; write results into 'out'
-void MultRow(const Tensor &v, Tensor *M);
+/// For each row 'r' of matrix out, do r=alpha*v + beta*r
+template <typename SType>
+void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M);
+/// Divide column 'v' by each column of matrix M; write results into 'out'
+void DivColumn(const Tensor &v, Tensor *M);
/// Divide row 'v' by each row of matrix M; write results into 'out'
void DivRow(const Tensor &v, Tensor *M);
-
-/// Sum all rows of matrix M into a single row as 'out'
-void SumRows(const Tensor &M, Tensor *out);
+/// Multiply column 'v' and each column of matrix M; write results into 'out'
+void MultColumn(const Tensor &v, Tensor *M);
+/// Multiply row 'v' with each row of matrix M; write results into 'out'
+void MultRow(const Tensor &v, Tensor *M);
+/// Sub column 'v' by each column of matrix M
+void SubColumn(const Tensor &v, Tensor *M);
+/// Sub row 'v' by each row of matrix M; write results into 'out'
+void SubRow(const Tensor &v, Tensor *M);
/// Sum all columns of matrix M into a single column as 'out'
void SumColumns(const Tensor &M, Tensor *out);
+/// Sum all rows of matrix M into a single row as 'out'
+void SumRows(const Tensor &M, Tensor *out);
-/// For each element x of Tensor 'in', compute alpha/x
+// ================Random operations==========================================
+/// For each element x set x = 1 if random() < p; otherwise x = 0.
template <typename SType>
-Tensor Div(const SType alpha, const Tensor &in);
+void Bernoulli(const SType p, Tensor *out);
+/// Fill in Tensor 'out' following Gaussian distribution.
+template <typename SType>
+void Gaussian(const SType mean, const SType std, Tensor *out);
+/// Fill in Tensor 'out' following uniform distribution.
+template <typename SType>
+void Uniform(const SType low, const SType high, Tensor *out);
-/// For each element x of Tensor 'in', compute alpha/x into 'out'
+// ================Blas operations============================================
+// TODO(wangwei) make amax/amin/asum a member function of tensor
+
+/// out = alpha*in + out
template <typename SType>
-void Div(const SType alpha, const Tensor &in, Tensor *out);
-
-/*
-/// Multiply each column of the lhs matrix with the rhs column
-Tensor MultColumn(const Tensor &lhs, const Tensor &rhs);
-void MultColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
-/// Multiply each row of the lhs matrix with the rhs row
-Tensor MultRow(const Tensor &lhs, const Tensor &rhs);
-void MultRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
-/// Div each row of the lhs matrix with the rhs column
-Tensor DivColumn(const Tensor &lhs, const Tensor &rhs);
-void DivColumn(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
-/// Divide each row of the lhs matrix by the rhs row
-Tensor DivRow(const Tensor &lhs, const Tensor &rhs);
-void DivRow(const Tensor &lhs, const Tensor &rhs, Tensor *ret);
-*/
+void Axpy(SType alpha, const Tensor &in, Tensor *out);
+
+/// Do matrix vector multiplication or matrix matrix multiplication depending
+/// on the Tensor shape. result = A * B
+Tensor Mult(const Tensor &A, const Tensor &B);
+/// Do matrix vector multiplication or matrix matrix multiplication depending
+/// on the Tensor shape. C = A * B
+void Mult(const Tensor &A, const Tensor &B, Tensor *C);
+/// Do matrix vector multiplication or matrix matrix multiplication depending
+/// on the Tensor shape. C = alpha * A * B + beta * C
+template <typename SType>
+void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+ Tensor *C);
} // namespace singa
#endif // SINGA_CORE_TENSOR_H_
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor.cc
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor.cc b/src/core/tensor/tensor.cc
index 5ae375c..f4e9da2 100644
--- a/src/core/tensor/tensor.cc
+++ b/src/core/tensor/tensor.cc
@@ -26,61 +26,61 @@ namespace singa {
Tensor::~Tensor() {
// LOG(ERROR) << "~";
- if (blob_ != nullptr && blob_->DecRefCount() == 0)
- device_->FreeBlob(blob_);
+ if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
blob_ = nullptr;
}
Tensor::Tensor() { device_ = &defaultDevice; }
-Tensor::Tensor(const Shape &shape, DataType dtype)
+Tensor::Tensor(const Shape &shape, const DataType dtype)
: data_type_(dtype), device_(&defaultDevice), shape_(shape) {
device_ = &defaultDevice;
blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
}
-Tensor::Tensor(Shape &&shape, DataType dtype)
+Tensor::Tensor(Shape &&shape, const DataType dtype)
: data_type_(dtype), device_(&defaultDevice), shape_(shape) {
device_ = &defaultDevice;
blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
}
-Tensor::Tensor(const Shape &shape, Device *device, DataType dtype)
+Tensor::Tensor(const Shape &shape, Device *device, const DataType dtype)
: data_type_(dtype), device_(device), shape_(shape) {
blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
}
-Tensor::Tensor(Shape &&shape, Device *device, DataType dtype)
+Tensor::Tensor(Shape &&shape, Device *device, const DataType dtype)
: data_type_(dtype), device_(device), shape_(shape) {
blob_ = device_->NewBlob(Product(shape_) * SizeOf(data_type_));
}
-Tensor::Tensor(const Tensor &t)
- : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_),
- blob_(t.blob()), shape_(t.shape_) {
+Tensor::Tensor(const Tensor &in)
+ : transpose_(in.transpose_),
+ data_type_(in.data_type_),
+ device_(in.device_),
+ blob_(in.blob()),
+ shape_(in.shape_) {
blob_->IncRefCount();
- // LOG(ERROR) << "const&";
}
-Tensor::Tensor(Tensor &&t)
- : transpose_(t.transpose_), data_type_(t.data_type_), device_(t.device_),
- shape_(std::move(t.shape_)) {
- blob_ = t.blob_;
- t.blob_ = nullptr;
- // LOG(ERROR) << "&&";
+Tensor::Tensor(Tensor &&in)
+ : transpose_(in.transpose_),
+ data_type_(in.data_type_),
+ device_(in.device_),
+ shape_(std::move(in.shape_)) {
+ blob_ = in.blob_;
+ in.blob_ = nullptr;
}
-void Tensor::ResetLike(const Tensor &t) {
- if (blob_ == nullptr || device_ != t.device_ || MemSize() != t.MemSize()) {
- if (blob_ != nullptr && blob_->DecRefCount() == 0)
- device_->FreeBlob(blob_);
- shape_ = t.shape_;
- device_ = t.device_;
- data_type_ = t.data_type_;
- blob_ = device_->NewBlob(t.MemSize());
+void Tensor::ResetLike(const Tensor &in) {
+ if (blob_ == nullptr || device_ != in.device_ || MemSize() != in.MemSize()) {
+ if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+ shape_ = in.shape_;
+ device_ = in.device_;
+ data_type_ = in.data_type_;
+ blob_ = device_->NewBlob(in.MemSize());
}
}
void Tensor::Reshape(const Shape &shape) {
if (Product(shape_) != Product(shape)) {
- if (blob_ != nullptr && blob_->DecRefCount() == 0)
- device_->FreeBlob(blob_);
+ if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
}
shape_ = shape;
@@ -88,17 +88,15 @@ void Tensor::Reshape(const Shape &shape) {
void Tensor::Reshape(Shape &&shape) {
if (Product(shape_) != Product(shape)) {
- if (blob_ != nullptr && blob_->DecRefCount() == 0)
- device_->FreeBlob(blob_);
+ if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
blob_ = device_->NewBlob(Product(shape) * SizeOf(data_type_));
}
shape_ = std::move(shape);
}
-void Tensor::AsType(DataType type) {
+void Tensor::AsType(const DataType type) {
if (data_type_ != type) {
- if (blob_ != nullptr && blob_->DecRefCount() == 0)
- device_->FreeBlob(blob_);
+ if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
blob_ = device_->NewBlob(Product(shape_) * SizeOf(type));
data_type_ = type;
}
@@ -109,8 +107,7 @@ void Tensor::ToDevice(Device *dst) {
if (device_ != dst) {
Tensor tmp(shape_, dst, data_type_);
tmp.CopyData(*this);
- if (blob_ != nullptr && blob_->DecRefCount() == 0)
- device_->FreeBlob(blob_);
+ if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
blob_ = tmp.blob_;
tmp.blob_ = nullptr;
device_ = dst;
@@ -120,7 +117,7 @@ void Tensor::ToDevice(Device *dst) {
void Tensor::ToHost() { ToDevice(device_->host()); }
template <typename DType>
-void Tensor::CopyDataFromHostPtr(const DType *src, size_t num) {
+void Tensor::CopyDataFromHostPtr(const DType *src, const size_t num) {
CHECK_EQ(sizeof(DType), SizeOf(data_type_))
<< "data_type is " << DataType_Name(data_type_)
<< " user given type is of size " << sizeof(DType);
@@ -130,8 +127,8 @@ void Tensor::CopyDataFromHostPtr(const DType *src, size_t num) {
LOG(WARNING) << "Copy data from null host ptr";
}
}
-template void Tensor::CopyDataFromHostPtr(const float *src, size_t num);
-template void Tensor::CopyDataFromHostPtr(const int *src, size_t num);
+template void Tensor::CopyDataFromHostPtr(const float *src, const size_t num);
+template void Tensor::CopyDataFromHostPtr(const int *src, const size_t num);
void Tensor::CopyData(const Tensor &src) {
CHECK_EQ(Size(), src.Size());
@@ -162,29 +159,27 @@ Tensor Tensor::T() const {
return t;
}
-Tensor &Tensor::operator=(const Tensor &t) {
+Tensor &Tensor::operator=(const Tensor &in) {
// LOG(ERROR) << "= const &";
- if (blob_ != nullptr && blob_->DecRefCount() == 0)
- device_->FreeBlob(blob_);
- transpose_ = t.transpose_;
- data_type_ = t.data_type_;
- shape_ = t.shape_;
- device_ = t.device_;
- blob_ = t.blob();
+ if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+ transpose_ = in.transpose_;
+ data_type_ = in.data_type_;
+ shape_ = in.shape_;
+ device_ = in.device_;
+ blob_ = in.blob();
blob_->IncRefCount();
return *this;
}
-Tensor &Tensor::operator=(Tensor &&t) {
+Tensor &Tensor::operator=(Tensor &&in) {
// LOG(ERROR) << "= &&";
- if (blob_ != nullptr && blob_->DecRefCount() == 0)
- device_->FreeBlob(blob_);
- transpose_ = t.transpose_;
- data_type_ = t.data_type_;
- shape_ = std::move(t.shape_);
- device_ = t.device_;
- blob_ = t.blob_;
- t.blob_ = nullptr;
+ if (blob_ != nullptr && blob_->DecRefCount() == 0) device_->FreeBlob(blob_);
+ transpose_ = in.transpose_;
+ data_type_ = in.data_type_;
+ shape_ = std::move(in.shape_);
+ device_ = in.device_;
+ blob_ = in.blob_;
+ in.blob_ = nullptr;
return *this;
}
@@ -200,10 +195,10 @@ Tensor Reshape(const Tensor &in, Shape &&s) {
return out;
}
-#define GenUnaryTensorArgMemberFn(op, fn) \
- Tensor &Tensor::op(const Tensor &t) { \
- fn(*this, t, this); \
- return *this; \
+#define GenUnaryTensorArgMemberFn(op, fn) \
+ Tensor &Tensor::op(const Tensor &in) { \
+ fn(*this, in, this); \
+ return *this; \
}
GenUnaryTensorArgMemberFn(operator+=, Add);
@@ -211,12 +206,13 @@ GenUnaryTensorArgMemberFn(operator-=, Sub);
GenUnaryTensorArgMemberFn(operator*=, EltwiseMult);
GenUnaryTensorArgMemberFn(operator/=, Div);
-#define GenUnaryScalarArgMemberFn(op, fn) \
- template <typename DType> Tensor &Tensor::op(DType x) { \
- fn(*this, x, this); \
- return *this; \
- } \
- template Tensor &Tensor::op<float>(float x)
+#define GenUnaryScalarArgMemberFn(op, fn) \
+ template <typename DType> \
+ Tensor &Tensor::op(const DType x) { \
+ fn(*this, x, this); \
+ return *this; \
+ } \
+ template Tensor &Tensor::op<float>(const float x)
GenUnaryScalarArgMemberFn(operator-=, Sub);
GenUnaryScalarArgMemberFn(operator+=, Add);
@@ -224,103 +220,105 @@ GenUnaryScalarArgMemberFn(operator*=, EltwiseMult);
GenUnaryScalarArgMemberFn(operator/=, Div);
// ====================Tensor Operations=======================================
-void CopyDataToFrom(Tensor *dst, const Tensor &src, size_t num,
- size_t dst_offset, size_t src_offset) {
+void CopyDataToFrom(Tensor *dst, const Tensor &src, const size_t num,
+ const size_t dst_offset, const size_t src_offset) {
auto width = SizeOf(src.data_type());
CHECK_EQ(width, SizeOf(dst->data_type()));
size_t nBytes = num * width;
- dst_offset *= width;
- src_offset *= width;
- CHECK_GE(src.MemSize(), src_offset + nBytes);
- CHECK_GE(dst->MemSize(), dst_offset + nBytes);
+ auto d_offset = dst_offset * width;
+ auto s_offset = src_offset * width;
+ CHECK_GE(src.MemSize(), s_offset + nBytes);
+ CHECK_GE(dst->MemSize(), d_offset + nBytes);
Device *src_dev = src.device(), *dst_dev = dst->device();
Blob *from = src.blob(), *to = dst->blob();
if (dst_dev->lang() != src_dev->lang()) {
// let the none cpp device conduct copy op
if (dst_dev->lang() == kCpp) {
- src_dev->CopyDataToFrom(to, from, nBytes, kDeviceToHost, dst_offset,
- src_offset);
+ src_dev->CopyDataToFrom(to, from, nBytes, kDeviceToHost, d_offset,
+ s_offset);
} else if (src_dev->lang() == kCpp) {
- dst_dev->CopyDataToFrom(to, from, nBytes, kHostToDevice, dst_offset,
- src_offset);
+ dst_dev->CopyDataToFrom(to, from, nBytes, kHostToDevice, d_offset,
+ s_offset);
} else {
LOG(FATAL) << "Not support mem copy betwee Cuda and OpenCL device";
}
} else {
auto direct = src_dev->lang() == kCpp ? kHostToHost : kDeviceToDevice;
- src_dev->CopyDataToFrom(to, from, nBytes, direct, dst_offset, src_offset);
+ src_dev->CopyDataToFrom(to, from, nBytes, direct, d_offset, s_offset);
}
}
//============================================================================
/// typedef DType accroding to type value.
/// DType would be used in the code block __VA_ARGS__.
-#define TYPE_SWITCH(type, DType, ...) \
- do { \
- switch (type) { \
- case kFloat32: { \
- typedef float DType; \
- { __VA_ARGS__ } \
- break; \
- } \
- case kInt: { \
- typedef int DType; \
- { __VA_ARGS__ } \
- break; \
- } \
- case kChar: { \
- typedef char DType; \
- { __VA_ARGS__ } \
- break; \
- } \
- default: \
- LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \
- } \
+#define TYPE_SWITCH(type, DType, ...) \
+ do { \
+ switch (type) { \
+ case kFloat32: { \
+ typedef float DType; \
+ { __VA_ARGS__ } \
+ break; \
+ } \
+ case kInt: { \
+ typedef int DType; \
+ { __VA_ARGS__ } \
+ break; \
+ } \
+ case kChar: { \
+ typedef char DType; \
+ { __VA_ARGS__ } \
+ break; \
+ } \
+ default: \
+ LOG(FATAL) << "Unknow data type = " << DataType_Name(type); \
+ } \
} while (0)
/// typedef DType and Lang according to data type and device programming
/// language respectively.
/// type is from DataType, and lang is from LangType.
/// DType and Lang would be used in __VA_ARGS__.
-#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...) \
- do { \
- const int _SwitchShift = 3; \
- int _SwitchHash = ((dtype) << _SwitchShift) + (ltype); \
- switch (_SwitchHash) { \
- case ((kFloat32 << _SwitchShift) + kCuda): { \
- typedef float DType; \
- typedef lang::Cuda Lang; \
- { __VA_ARGS__ } \
- break; \
- } \
- case ((kFloat32 << _SwitchShift) + kCpp): { \
- typedef float DType; \
- typedef lang::Cpp Lang; \
- { __VA_ARGS__ } \
- break; \
- } \
- case ((kFloat32 << _SwitchShift) + kOpencl): { \
- typedef float DType; \
- typedef lang::Opencl Lang; \
- { __VA_ARGS__ } \
- break; \
- } \
- default: \
- LOG(FATAL) << "Unknown combination of data type " \
- << DataType_Name(dtype) << " and language " \
- << LangType_Name(ltype); \
- } \
+#define TYPE_LANG_SWITCH(dtype, DType, ltype, Lang, ...) \
+ do { \
+ const int _SwitchShift = 3; \
+ int _SwitchHash = ((dtype) << _SwitchShift) + (ltype); \
+ switch (_SwitchHash) { \
+ case ((kFloat32 << _SwitchShift) + kCuda): { \
+ typedef float DType; \
+ typedef lang::Cuda Lang; \
+ { __VA_ARGS__ } \
+ break; \
+ } \
+ case ((kFloat32 << _SwitchShift) + kCpp): { \
+ typedef float DType; \
+ typedef lang::Cpp Lang; \
+ { __VA_ARGS__ } \
+ break; \
+ } \
+ case ((kFloat32 << _SwitchShift) + kOpencl): { \
+ typedef float DType; \
+ typedef lang::Opencl Lang; \
+ { __VA_ARGS__ } \
+ break; \
+ } \
+ default: \
+ LOG(FATAL) << "Unknown combination of data type " \
+ << DataType_Name(dtype) << " and language " \
+ << LangType_Name(ltype); \
+ } \
} while (0)
-template <typename SType> void Tensor::SetValue(const SType x) {
+// =============Element-wise operations====================================
+template <typename SType>
+void Tensor::SetValue(const SType x) {
CHECK_EQ(sizeof(SType), SizeOf(data_type_));
auto size = Size();
auto ptr = blob_;
TYPE_LANG_SWITCH(data_type_, DType, device_->lang(), Lang, {
// cast x to DType
- device_->Exec(
- [size, x, ptr](Context *ctx) { Set<DType, Lang>(size, x, ptr, ctx); },
- {}, {ptr});
+ device_->Exec([size, x, ptr](Context *ctx) {
+ Set<DType, Lang>(size, x, ptr, ctx);
+ }, {}, {ptr});
});
}
template void Tensor::SetValue<float>(const float x);
@@ -328,21 +326,19 @@ template void Tensor::SetValue<float>(const float x);
#define EltwiseUnaryTensorFn(fn, t, ret) \
do { \
TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
- ret->device()->Exec( \
- [t, ret](Context* ctx) { \
- fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx); \
- }, \
- {t.blob()}, {ret->blob()}); \
+ ret->device()->Exec([t, ret](Context * ctx) { \
+ fn<DType, Lang>(t.Size(), t.blob(), ret->blob(), ctx); \
+ }, {t.blob()}, {ret->blob()}); \
}); \
} while (0)
-#define GenUnaryTensorFn(fn) \
- Tensor fn(const Tensor &t) { \
- Tensor ret(t.shape(), t.device(), t.data_type()); \
- auto *retptr = &ret; \
- EltwiseUnaryTensorFn(fn, t, retptr); \
- return ret; \
- } \
+#define GenUnaryTensorFn(fn) \
+ Tensor fn(const Tensor &in) { \
+ Tensor ret(in.shape(), in.device(), in.data_type()); \
+ auto *retptr = &ret; \
+ EltwiseUnaryTensorFn(fn, in, retptr); \
+ return ret; \
+ } \
void fn(const Tensor &in, Tensor *out) { EltwiseUnaryTensorFn(fn, in, out); }
GenUnaryTensorFn(Abs);
@@ -355,33 +351,89 @@ GenUnaryTensorFn(Sqrt);
GenUnaryTensorFn(Square);
GenUnaryTensorFn(Tanh);
-// TODO(wangwei) conside async exec
-template <> float Sum<float>(const Tensor &t) {
- float s = 0.0f;
- TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, {
- t.device()->Exec(
- [t, &s](Context *ctx) {
- Sum<DType, Lang>(t.Size(), t.blob(), &s, ctx);
- },
- {t.blob()}, {});
- });
- return s;
-}
+#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret) \
+ do { \
+ TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, { \
+ CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type())); \
+ ret->device()->Exec([lhs, rhs, ret](Context * ctx) { \
+ fn<DType, Lang>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(), ctx); \
+ }, {lhs.blob(), rhs.blob()}, {ret->blob()}); \
+ }); \
+ } while (0)
-Tensor Sum(const Tensor &M, int axis) {
- if (axis == 0) {
- Tensor out(Shape{M.shape(1)}, M.device(), M.data_type());
- SumRows(M, &out);
- return out;
- } else {
- CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis;
- Tensor out(Shape{M.shape(0)}, M.device(), M.data_type());
- SumColumns(M, &out);
- return out;
+#define GenBinaryTensorFn(op, fn) \
+ Tensor op(const Tensor &lhs, const Tensor &rhs) { \
+ Tensor ret(lhs.shape(), lhs.device(), lhs.data_type()); \
+ fn(lhs, rhs, &ret); \
+ return ret; \
+ } \
+ void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) { \
+ EltwiseBinaryTensorFn(fn, lhs, rhs, ret); \
}
+
+GenBinaryTensorFn(operator+, Add);
+GenBinaryTensorFn(operator-, Sub);
+GenBinaryTensorFn(operator*, EltwiseMult);
+GenBinaryTensorFn(operator/, Div);
+GenBinaryTensorFn(Pow, Pow);
+
+#define EltwiseTensorScalarFn(fn, t, x, ret) \
+ do { \
+ TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
+ static_assert(std::is_same<SType, DType>::value, \
+ "The Scalar type must match the Tensor data type"); \
+ ret->device()->Exec([t, x, ret](Context * ctx) { \
+ fn<DType, Lang>(t.Size(), t.blob(), x, ret->blob(), ctx); \
+ }, {t.blob()}, {ret->blob()}); \
+ }); \
+ } while (0)
+
+#define GenTensorScalarFn(op, fn) \
+ template <typename SType> \
+ Tensor op(const Tensor &in, const SType x) { \
+ Tensor ret(in.shape(), in.device(), in.data_type()); \
+ fn(in, x, &ret); \
+ return ret; \
+ } \
+ template <typename SType> \
+ void fn(const Tensor &in, const SType x, Tensor *ret) { \
+ EltwiseTensorScalarFn(fn, in, x, ret); \
+ } \
+ template Tensor op<float>(const Tensor &in, const float x); \
+ template void fn<float>(const Tensor &in, const float x, Tensor *ret)
+
+GenTensorScalarFn(operator+, Add);
+GenTensorScalarFn(operator-, Sub);
+GenTensorScalarFn(operator*, EltwiseMult);
+GenTensorScalarFn(operator/, Div);
+GenTensorScalarFn(Pow, Pow);
+GenTensorScalarFn(operator<, LT);
+GenTensorScalarFn(operator<=, LE);
+GenTensorScalarFn(operator>, GT);
+GenTensorScalarFn(operator>=, GE);
+template <typename SType>
+Tensor Div(const SType alpha, const Tensor &in) {
+ Tensor out(in.shape(), in.device(), in.data_type());
+ Div(alpha, in, &out);
+ return out;
}
+template Tensor Div<float>(const float, const Tensor &);
-Tensor Average(const Tensor &t, int axis) {
+template <typename SType>
+void Div(const SType alpha, const Tensor &in, Tensor *out) {
+ CheckDataTypeAndLang(in, *out);
+ CHECK(in.shape() == out->shape());
+ TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+ // TODO(wangwei) type cast SType to DType;
+ in.device()->Exec([alpha, in, out](Context *ctx) {
+ Div<DType, Lang>(in.Size(), alpha, in.blob(), out->blob(), ctx);
+ }, {in.blob()}, {out->blob()});
+ });
+}
+template void Div<float>(const float, const Tensor &, Tensor *);
+
+// =============Matrix operations============================================
+Tensor Average(const Tensor &M, int axis) {
// operator/ only has implementation for float scalar type, hence it is
// necessary to cast the denominator to a float.
// TODO(wangwei) implement function for cast scalar type involved in Tensor
@@ -396,10 +448,34 @@ Tensor Average(const Tensor &t, int axis) {
// ....
// }
if (axis == 0) {
- return Sum(t, 0) / (1.0f * t.shape().at(0));
+ return Sum(M, 0) / (1.0f * M.shape(0));
} else {
CHECK_EQ(axis, 1);
- return Sum(t, 1) / (1.0f * t.shape().at(1));
+ return Sum(M, 1) / (1.0f * M.shape(1));
+ }
+}
+// TODO(wangwei) consider async exec
+template <>
+float Sum<float>(const Tensor &in) {
+ float s = 0.0f;
+ TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+ in.device()->Exec([in, &s](Context *ctx) {
+ Sum<DType, Lang>(in.Size(), in.blob(), &s, ctx);
+ }, {in.blob()}, {});
+ });
+ return s;
+}
+
+Tensor Sum(const Tensor &M, int axis) {
+ if (axis == 0) {
+ Tensor out(Shape{M.shape(1)}, M.device(), M.data_type());
+ SumRows(M, &out);
+ return out;
+ } else {
+ CHECK_EQ(axis, 1) << "Not support Sum over axis = " << axis;
+ Tensor out(Shape{M.shape(0)}, M.device(), M.data_type());
+ SumColumns(M, &out);
+ return out;
}
}
@@ -424,141 +500,10 @@ void SoftMax(const Tensor &in, int axis, Tensor *out) {
DivColumn(sum, out);
}
-#define EltwiseBinaryTensorFn(fn, lhs, rhs, ret) \
- do { \
- TYPE_LANG_SWITCH(lhs.data_type(), DType, lhs.device()->lang(), Lang, { \
- CHECK_EQ(sizeof(DType), SizeOf(rhs.data_type())); \
- ret->device()->Exec( \
- [lhs, rhs, ret](Context *ctx) { \
- fn<DType, Lang>(lhs.Size(), lhs.blob(), rhs.blob(), ret->blob(), \
- ctx); \
- }, \
- {lhs.blob(), rhs.blob()}, {ret->blob()}); \
- }); \
- } while (0)
-
-#define GenBinaryTensorFn(op, fn) \
- Tensor op(const Tensor &lhs, const Tensor &rhs) { \
- Tensor ret(lhs.shape(), lhs.device(), lhs.data_type()); \
- fn(lhs, rhs, &ret); \
- return ret; \
- } \
- void fn(const Tensor &lhs, const Tensor &rhs, Tensor *ret) { \
- EltwiseBinaryTensorFn(fn, lhs, rhs, ret); \
- }
-
-GenBinaryTensorFn(operator+, Add);
-GenBinaryTensorFn(operator-, Sub);
-GenBinaryTensorFn(operator*, EltwiseMult);
-GenBinaryTensorFn(operator/, Div);
-GenBinaryTensorFn(Pow, Pow);
-
-#define EltwiseTensorScalarFn(fn, t, x, ret) \
- do { \
- TYPE_LANG_SWITCH(t.data_type(), DType, t.device()->lang(), Lang, { \
- static_assert(std::is_same<SType, DType>::value, \
- "The Scalar type must match the Tensor data type"); \
- ret->device()->Exec( \
- [t, x, ret](Context *ctx) { \
- fn<DType, Lang>(t.Size(), t.blob(), x, ret->blob(), ctx); \
- }, \
- {t.blob()}, {ret->blob()}); \
- }); \
- } while (0)
-
-#define GenTensorScalarFn(op, fn) \
- template <typename SType> Tensor op(const Tensor &t, SType x) { \
- Tensor ret(t.shape(), t.device(), t.data_type()); \
- fn(t, x, &ret); \
- return ret; \
- } \
- template <typename SType> void fn(const Tensor &t, SType x, Tensor *ret) { \
- EltwiseTensorScalarFn(fn, t, x, ret); \
- } \
- template Tensor op<float>(const Tensor &t, float x); \
- template void fn<float>(const Tensor &t, const float x, Tensor *ret)
-
-GenTensorScalarFn(operator+, Add);
-GenTensorScalarFn(operator-, Sub);
-GenTensorScalarFn(operator*, EltwiseMult);
-GenTensorScalarFn(operator/, Div);
-GenTensorScalarFn(Pow, Pow);
-GenTensorScalarFn(operator<, LT);
-GenTensorScalarFn(operator<=, LE);
-GenTensorScalarFn(operator>, GT);
-GenTensorScalarFn(operator>=, GE);
-
-// ================Blas operations============================================
-Tensor Mult(const Tensor &lhs, const Tensor &rhs) {
- Tensor ret(Shape{lhs.shape(0), rhs.shape(1)}, lhs.device(), lhs.data_type());
- Mult(lhs, rhs, &ret);
- return ret;
-}
-
-void Mult(const Tensor &lhs, const Tensor &rhs, Tensor *ret) {
- Mult(1.0f, lhs, rhs, 0.0f, ret);
-}
-
-void Mult(const float alpha, const Tensor &A, const Tensor &B, const float beta,
- Tensor *C) {
- CHECK_EQ(A.shape().size(), 2u);
- if (B.nDim() == 1u) {
- TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
- C->device()->Exec(
- [alpha, A, beta, B, C](Context *ctx) {
- GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), alpha,
- A.blob(), B.blob(), beta, C->blob(), ctx);
- },
- {A.blob(), B.blob()}, {C->blob()});
- });
- } else {
- CHECK(!C->transpose());
- TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
- C->device()->Exec(
- [alpha, A, beta, B, C](Context *ctx) {
- GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0),
- B.shape(1), A.shape(1), alpha, A.blob(), B.blob(),
- beta, C->blob(), ctx);
- },
- {A.blob(), B.blob()}, {C->blob()});
- });
- }
-}
-
-void Bernoulli(float p, Tensor *t) {
- TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
- t->device()->Exec(
- [p, t](Context *ctx) {
- Bernoulli<DType, Lang>(t->Size(), p, t->blob(), ctx);
- },
- {}, {t->blob()}, true);
- });
-}
-
-void Uniform(float low, float high, Tensor *t) {
- TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
- t->device()->Exec(
- [low, high, t](Context *ctx) {
- Uniform<DType, Lang>(t->Size(), low, high, t->blob(), ctx);
- },
- {}, {t->blob()}, true);
- });
-}
-
-void Gaussian(float mean, float std, Tensor *t) {
- TYPE_LANG_SWITCH(t->data_type(), DType, t->device()->lang(), Lang, {
- t->device()->Exec(
- [mean, std, t](Context *ctx) {
- Gaussian<DType, Lang>(t->Size(), mean, std, t->blob(), ctx);
- },
- {}, {t->blob()}, true);
- });
-}
-
-// ======follow the consistency guide
void AddColumn(const Tensor &v, Tensor *M) { AddColumn(1, 1, v, M); }
/// Add column 'v' onto each column of matrix M;
-void AddColumn(const float alpha, const float beta, const Tensor &v,
+template <typename SType>
+void AddColumn(const SType alpha, const SType beta, const Tensor &v,
Tensor *M) {
if (M->transpose()) {
Tensor X = M->T();
@@ -570,15 +515,19 @@ void AddColumn(const float alpha, const float beta, const Tensor &v,
CHECK_EQ(nb_row, v.Size());
Tensor one(Shape{1, nb_col}, M->device(), M->data_type());
- one.SetValue(1.0f); // TODO(wangwei) cast type
+ one.SetValue(1.0f); // TODO(wangwei) cast type
Tensor vmat = Reshape(v, Shape{nb_row, 1});
Mult(alpha, vmat, one, beta, M);
}
}
+template <>
+void AddColumn(const float alpha, const float beta, const Tensor &v, Tensor *M);
+
void AddRow(const Tensor &v, Tensor *M) { AddRow(1, 1, v, M); }
/// Sub column 'v' by each column of matrix M; write results into 'out'
-void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M) {
+template <typename SType>
+void AddRow(const SType alpha, const SType beta, const Tensor &v, Tensor *M) {
if (M->transpose()) {
Tensor X = M->T();
AddColumn(v, &X);
@@ -594,29 +543,8 @@ void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M) {
Mult(alpha, one, vmat, beta, M);
}
}
-
-template <typename SType> Tensor Div(const SType alpha, const Tensor &in) {
- Tensor out(in.shape(), in.device(), in.data_type());
- Div(alpha, in, &out);
- return out;
-}
-
-template Tensor Div<float>(const float, const Tensor &);
-
-template <typename SType>
-void Div(const SType alpha, const Tensor &in, Tensor *out) {
- CheckDataTypeAndLang(in, *out);
- CHECK(in.shape() == out->shape());
- TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
- // TODO(wangwei) type cast SType to DType;
- in.device()->Exec(
- [alpha, in, out](Context *ctx) {
- Div<DType, Lang>(in.Size(), alpha, in.blob(), out->blob(), ctx);
- },
- {in.blob()}, {out->blob()});
- });
-}
-template void Div<float>(const float, const Tensor &, Tensor *);
+template <>
+void AddRow(const float alpha, const float beta, const Tensor &v, Tensor *M);
/// Divide column 'v' by each column of matrix M; write results into 'out'
void DivColumn(const Tensor &v, Tensor *M) {
@@ -640,12 +568,10 @@ void MultColumn(const Tensor &v, Tensor *M) {
CHECK_EQ(v.Size(), M->shape(0));
CheckDataTypeAndLang(*M, v);
TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
- v.device()->Exec(
- [M, v](Context *ctx) {
- DGMM<DType, Lang>(false, M->shape(0), M->shape(1), M->blob(),
- v.blob(), M->blob(), ctx);
- },
- {M->blob(), v.blob()}, {M->blob()});
+ v.device()->Exec([M, v](Context *ctx) {
+ DGMM<DType, Lang>(false, M->shape(0), M->shape(1), M->blob(), v.blob(),
+ M->blob(), ctx);
+ }, {M->blob(), v.blob()}, {M->blob()});
});
}
@@ -657,12 +583,10 @@ void MultRow(const Tensor &v, Tensor *M) {
CHECK_EQ(v.Size(), M->shape(1));
CheckDataTypeAndLang(*M, v);
TYPE_LANG_SWITCH(v.data_type(), DType, v.device()->lang(), Lang, {
- v.device()->Exec(
- [M, v](Context *ctx) {
- DGMM<DType, Lang>(true, M->shape(0), M->shape(1), M->blob(), v.blob(),
- M->blob(), ctx);
- },
- {M->blob(), v.blob()}, {M->blob()});
+ v.device()->Exec([M, v](Context *ctx) {
+ DGMM<DType, Lang>(true, M->shape(0), M->shape(1), M->blob(), v.blob(),
+ M->blob(), ctx);
+ }, {M->blob(), v.blob()}, {M->blob()});
});
}
@@ -680,8 +604,8 @@ void SumColumns(const Tensor &M, Tensor *v) {
size_t nb_row = M.shape().at(0), nb_col = M.shape().at(1);
CHECK_EQ(nb_row, v->Size());
- Tensor one(Shape{nb_col, 1}, M.device(), M.data_type());
- one.SetValue(1.0f); // TODO(wangwei) cast type
+ Tensor one(Shape{nb_col}, M.device(), M.data_type());
+ one.SetValue(1.0f); // TODO(wangwei) cast type
Mult(M, one, v);
}
}
@@ -695,10 +619,98 @@ void SumRows(const Tensor &M, Tensor *v) {
size_t nb_row = M.shape(0), nb_col = M.shape(1);
CHECK_EQ(nb_col, v->Size());
- Tensor one(Shape{nb_row, 1}, M.device(), M.data_type());
- one.SetValue(1.0f); // TODO(wangwei) cast type
+ Tensor one(Shape{nb_row}, M.device(), M.data_type());
+ one.SetValue(1.0f); // TODO(wangwei) cast type
Tensor X = M.T();
Mult(X, one, v);
}
}
+// ====================Random operations=====================================
+template <typename SType>
+void Bernoulli(const SType p, Tensor *out) {
+ TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+ auto prob = TypeCast<SType, DType>(p);
+ out->device()->Exec([prob, out](Context *ctx) {
+ Bernoulli<DType, Lang>(out->Size(), prob, out->blob(), ctx);
+ }, {}, {out->blob()}, true);
+ });
+}
+template void Bernoulli<float>(const float p, Tensor *out);
+
+template <typename SType>
+void Uniform(const SType low, const SType high, Tensor *out) {
+ TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+ auto l = TypeCast<SType, DType>(low);
+ auto h = TypeCast<SType, DType>(high);
+ out->device()->Exec([l, h, out](Context *ctx) {
+ Uniform<DType, Lang>(out->Size(), l, h, out->blob(), ctx);
+ }, {}, {out->blob()}, true);
+ });
+}
+template void Uniform<float>(const float low, const float high, Tensor *out);
+
+template <typename SType>
+void Gaussian(const SType mean, const SType std, Tensor *out) {
+ TYPE_LANG_SWITCH(out->data_type(), DType, out->device()->lang(), Lang, {
+ auto m = TypeCast<SType, DType>(mean);
+ auto s = TypeCast<SType, DType>(std);
+ out->device()->Exec([m, s, out](Context *ctx) {
+ Gaussian<DType, Lang>(out->Size(), m, s, out->blob(), ctx);
+ }, {}, {out->blob()}, true);
+ });
+}
+template void Gaussian<float>(const float mean, const float std, Tensor *out);
+
+// ================Blas operations============================================
+template <typename SType>
+void Axpy(const SType alpha, const Tensor &in, Tensor *out) {
+ TYPE_LANG_SWITCH(in.data_type(), DType, in.device()->lang(), Lang, {
+ auto a = TypeCast<SType, DType>(alpha);
+ out->device()->Exec([a, in, out](Context *ctx) {
+ Axpy<DType, Lang>(in.Size(), a, in.blob(), out->blob(), ctx);
+ }, {in.blob(), out->blob()}, {out->blob()});
+ });
+}
+template <>
+void Axpy(const float alpha, const Tensor &in, Tensor *out);
+
+Tensor Mult(const Tensor &A, const Tensor &B) {
+ Shape s;
+ s.push_back(A.shape(0));
+ if (B.nDim() == 2) s.push_back(B.shape(1));
+ Tensor out(s, A.device(), A.data_type());
+ Mult(A, B, &out);
+ return out;
+}
+
+void Mult(const Tensor &A, const Tensor &B, Tensor *out) {
+ Mult(1.0f, A, B, 0.0f, out);
+}
+
+template <typename SType>
+void Mult(const SType alpha, const Tensor &A, const Tensor &B, const SType beta,
+ Tensor *C) {
+ CHECK_EQ(A.shape().size(), 2u);
+ if (B.nDim() == 1u) {
+ TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+ auto a = TypeCast<SType, DType>(alpha);
+ auto b = TypeCast<SType, DType>(beta);
+ C->device()->Exec([a, A, b, B, C](Context *ctx) {
+ GEMV<DType, Lang>(A.transpose(), A.shape(0), A.shape(1), a, A.blob(),
+ B.blob(), b, C->blob(), ctx);
+ }, {A.blob(), B.blob()}, {C->blob()});
+ });
+ } else {
+ CHECK(!C->transpose());
+ TYPE_LANG_SWITCH(A.data_type(), DType, A.device()->lang(), Lang, {
+ auto a = TypeCast<SType, DType>(alpha);
+ auto b = TypeCast<SType, DType>(beta);
+ C->device()->Exec([a, A, b, B, C](Context *ctx) {
+ GEMM<DType, Lang>(A.transpose(), B.transpose(), A.shape(0), B.shape(1),
+ A.shape(1), a, A.blob(), B.blob(), b, C->blob(), ctx);
+ }, {A.blob(), B.blob()}, {C->blob()});
+ });
+ }
+}
+
} // namespace singa
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/564c88ad/src/core/tensor/tensor_math.h
----------------------------------------------------------------------
diff --git a/src/core/tensor/tensor_math.h b/src/core/tensor/tensor_math.h
index 1bf6fc7..b5d0ba9 100644
--- a/src/core/tensor/tensor_math.h
+++ b/src/core/tensor/tensor_math.h
@@ -29,12 +29,14 @@ namespace singa {
/// device programming language, e.g., Langice::kCpp, Langice::kCuda
///
/// TODO(wangwei) Clean the functions to make the function APIs consistent:
-/// 1. All function names should be like XxxYyy or XY, i.e., capitablize the first
+/// 1. All function names should be like XxxYyy or XY, i.e., capitalize the
+/// first
/// letter.
/// 2. Order functions based on function name in alphabetical order.
-/// 3. Function arguments order is [const basic type] [const Blob] [mutable Blob].
+/// 3. Function arguments order is [const basic type] [const Blob] [mutable
+/// Blob].
/// 4. Function argument names, use 'num' for total number of elements in
+/// elementwise operations; use 'in1' 'in2' for input blobs; use 'out' for
+/// elementwise operations; use 'in1' 'in2' for in blobs; use 'out' for
/// output blob or value. With exceptions for some functions, e.g.,
/// Scale(const float alpha, const Blob* in, Blob* out);
/// For such cases, use x, v, alpha, etc for scalar types.
@@ -46,262 +48,283 @@ namespace singa {
/// 7. Use size_t for the number of elements, rows or columns.
/// 8. Use the same name for the Tensor and Blob level math functions.
-
-// ================Linear algebra functions====================================
-/// ret[i] = |input[i]|
+// =============Element-wise operations====================================
+/// out[i] = |in[i]|
template <typename DType, typename Lang>
void Abs(const size_t num, const Blob *in, Blob *out, Context *ctx) {
LOG(FATAL) << "Abs Not Implemented";
}
+/// out = in + x
template <typename DType, typename Lang>
-void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
- LOG(FATAL) << "Set Not Implemented";
+void Add(const size_t num, const Blob *in, const DType x, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "Add Not Implemented";
}
-/// sum all elements of input into ret
+/// out = in1 + in2
template <typename DType, typename Lang>
-void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
- LOG(FATAL) << "Sum Not Implemented";
+void Add(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "Add-Pair Not Implemented";
}
-
-/// ret[i] = sign(input[i])
+/// Element-wise operation, clamp every element into [low, high]
+/// if x>high, then x=high; if x<low, then x=low.
template <typename DType, typename Lang>
-void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
- LOG(FATAL) << "Sign Not Implemented";
+void Clamp(const size_t num, const DType low, const DType high, const Blob *in,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "Clamp Not Implemented";
}
-/// Base is e, Neper number. ret[i]=exp(input[i])
+/// out = x / in
template <typename DType, typename Lang>
-void Exp(const size_t num, const Blob *in, Blob *out, Context *ctx) {
- LOG(FATAL) << "Exp Not Implemented";
+void Div(const size_t num, const DType x, const Blob *in, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "Div Not Implemented";
}
-/// Natual logarithm, the base is e, Neper number ret[i]=log(input[i]).
-template <typename DType, typename Lang>
-void Log(const size_t num, const Blob *in, Blob *out, Context *ctx) {
- LOG(FATAL) << "Log Not Implemented";
-}
-/// Element-wise operation, ret[i]=sqrt([input[i])
template <typename DType, typename Lang>
-void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
- LOG(FATAL) << "Sqrt Not Implemented";
+void Div(const size_t num, const Blob *in, const DType x, Blob *out,
+ Context *ctx) {
+ CHECK_NE(x, 0.f);
+ EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
}
-/// Element-wise operation, ret[i]=square([input[i])
+/// out = in1 / in2
template <typename DType, typename Lang>
-void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
- LOG(FATAL) << "Square Not Implemented";
+void Div(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "Div-Pair Not Implemented";
}
-/// Element-wise operation, ret[i]=tanh([input[i])
+/// out = in * x
template <typename DType, typename Lang>
-void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
- LOG(FATAL) << "Tanh Not Implemented";
+void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "EltwiseMult Not Implemented";
}
-/// Element-wise operation, ret[i]=max(0, input[i])
+
+/// out = in1 * in2
template <typename DType, typename Lang>
-void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
- LOG(FATAL) << "ReLU Not Implemented";
+void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
}
-/// Element-wise operation, ret[i]=sigmoid([input[i])
+
+/// Base is e, Neper number. out[i]=exp(in[i])
template <typename DType, typename Lang>
-void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
- LOG(FATAL) << "Sigmoid Not Implemented";
+void Exp(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Exp Not Implemented";
}
-// Do softmax for each row invidually
+/// out[i]=(in[i]<=x)?1.f:0.f
template <typename DType, typename Lang>
-void Softmax(const size_t nrow, const size_t ncol, const Blob *in,
- Blob *out, Context *ctx) {
- LOG(FATAL) << "Softmax Not Implemented";
+void LE(const size_t num, const Blob *in, const DType x, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "LE Not Implemented";
}
-
-// TODO(wangwei) unify SumRow and SumCol.
-/// Sum the rows of the input matrix into a vector
+/// Natural logarithm, the base is e, Neper number: out[i]=log(in[i]).
template <typename DType, typename Lang>
-void SumRows(const size_t nrow, const size_t ncol, const Blob *in,
- Blob *out, Context *ctx) {
- LOG(FATAL) << "SumRows Not Implemented";
+void Log(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Log Not Implemented";
}
-
-/// Sum the columns of the input matrix into a vector
+/// out[i]=(in[i]<x)?1.f:0.f
template <typename DType, typename Lang>
-void SumColumns(const size_t nrow, const size_t ncol, const Blob *in,
- Blob *out, Context *ctx) {
- LOG(FATAL) << "SumColumns Not Implemented";
+void LT(const size_t num, const Blob *in, const DType x, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "LT Not Implemented";
}
-
-// TODO(wangwei) unify AddRow and AddCol.
-/// Add the vector v to every row of A as the row of out
+/// out[i]=(in[i]>=x)?1.f:0.f
template <typename DType, typename Lang>
-void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
- Blob *out, Context *ctx) {
- LOG(FATAL) << "AddRow Not Implemented";
+void GE(const size_t num, const Blob *in, const DType x, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "GE Not Implemented";
}
-
-/// Add the vector v to every column of A as the column of out
+/// out[i]=(in[i]>x)?1.f:0.f
template <typename DType, typename Lang>
-void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
- Blob *out, Context *ctx) {
- LOG(FATAL) << "AddCol Not Implemented";
+void GT(const size_t num, const Blob *in, const DType x, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "GT Not Implemented";
}
-
-/// Element-wise operation, do v^x for every v from the input tensor
+/// Element-wise operation, do v^x for every v from the input tensor
template <typename DType, typename Lang>
-void Pow(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
+void Pow(const size_t num, const Blob *in, const DType x, Blob *out,
+ Context *ctx) {
LOG(FATAL) << "Pow Not Implemented";
}
/// Element-wise operation, do v^x for every v from the lhs and every x from rhs
template <typename DType, typename Lang>
-void Pow(const size_t num, const Blob *in1, const Blob *in2,
- Blob *out, Context *ctx) {
+void Pow(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+ Context *ctx) {
LOG(FATAL) << "Pow-Pair Not Implemented";
}
-/// Element-wise operation, clamp every element into [low, high]
-/// if x>high, then x=high; if x<low, then x=low.
+/// Element-wise operation, out[i]=max(0, in[i])
template <typename DType, typename Lang>
-void Clamp(const size_t num, const DType low, const DType high, const Blob *in, Blob *out, Context *ctx) {
- LOG(FATAL) << "Clamp Not Implemented";
+void ReLU(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "ReLU Not Implemented";
}
-/// ret = input + x
template <typename DType, typename Lang>
-void Add(const size_t num, const Blob *in, const DType x,
- Blob *out, Context *ctx) {
- LOG(FATAL) << "Add Not Implemented";
+void Set(const size_t num, const DType x, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Set Not Implemented";
}
-
-/// ret = lhs + rhs
+/// Element-wise operation, out[i]=sigmoid(in[i])
template <typename DType, typename Lang>
-void Add(const size_t num, const Blob *in1, const Blob *in2,
- Blob *out, Context *ctx) {
- LOG(FATAL) << "Add-Pair Not Implemented";
+void Sigmoid(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Sigmoid Not Implemented";
}
-/// ret = input - x
+/// out[i] = sign(in[i])
template <typename DType, typename Lang>
-void Sub(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
- Add<DType, Lang>(num, in, -x, out, ctx);
+void Sign(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Sign Not Implemented";
}
-
-/// ret = lhs - rhs
+/// Element-wise operation, out[i]=sqrt(in[i])
template <typename DType, typename Lang>
-void Sub(const size_t num, const Blob *in1, const Blob *in2,
- Blob *out, Context *ctx) {
- LOG(FATAL) << "Sub-Pair Not Implemented";
+void Sqrt(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Sqrt Not Implemented";
}
-/// ret = input * x
+/// Element-wise operation, out[i]=square(in[i])
template <typename DType, typename Lang>
-void EltwiseMult(const size_t num, const Blob *in, const DType x, Blob *out,
- Context *ctx) {
- LOG(FATAL) << "EltwiseMult Not Implemented";
+void Square(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Square Not Implemented";
}
-/// ret = lhs * rhs
+/// out = in - x
template <typename DType, typename Lang>
-void EltwiseMult(const size_t num, const Blob *in1, const Blob *in2,
- Blob *out, Context *ctx) {
- LOG(FATAL) << "EltwiseMult-Pair Not Implemented";
+void Sub(const size_t num, const Blob *in, const DType x, Blob *out,
+ Context *ctx) {
+ Add<DType, Lang>(num, in, -x, out, ctx);
}
-/// ret = input / x
+/// out = in1 - in2
template <typename DType, typename Lang>
-void Div(const size_t num, const DType x, const Blob *in,
- Blob *out, Context *ctx) {
- LOG(FATAL) << "Div Not Implemented";
+void Sub(const size_t num, const Blob *in1, const Blob *in2, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "Sub-Pair Not Implemented";
}
-
+/// sum all elements of in into out
template <typename DType, typename Lang>
-void Div(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
- CHECK_NE(x,0.f);
- EltwiseMult<DType, Lang>(num, in, DType(1) / x, out, ctx);
+void Sum(const size_t num, const Blob *in, DType *out, Context *ctx) {
+ LOG(FATAL) << "Sum Not Implemented";
}
-/// ret = lhs / rhs
+/// Element-wise operation, out[i]=tanh(in[i])
template <typename DType, typename Lang>
-void Div(const size_t num, const Blob *in1, const Blob *in2,
- Blob *out, Context *ctx) {
- LOG(FATAL) << "Div-Pair Not Implemented";
+void Tanh(const size_t num, const Blob *in, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Tanh Not Implemented";
}
+// =========== Matrix operations ===========================================
+/// Add the vector v to every column of A as the column of out
+template <typename DType, typename Lang>
+void AddCol(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "AddCol Not Implemented";
+}
+// TODO(wangwei) unify AddRow and AddCol.
+/// Add the vector v to every row of A as the row of out
+template <typename DType, typename Lang>
+void AddRow(const size_t nrow, const size_t ncol, const Blob *A, const Blob *v,
+ Blob *out, Context *ctx) {
+ LOG(FATAL) << "AddRow Not Implemented";
+}
/// outer-product.
-/// lhs and rhs are vectors of len m and n. ret is matrix of shape m * n
+/// in1 and in2 are vectors of len m and n. out is matrix of shape m * n
template <typename DType, typename Lang>
-void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
- Blob *out, Context *ctx) {
+void Outer(const size_t m, const size_t n, const Blob *in1, const Blob *in2,
+ Blob *out, Context *ctx) {
LOG(FATAL) << "Outer Not Implemented";
}
-
-/// ret[i]=(input[i]<x)?1.f:0.f
+// Do softmax for each row individually
template <typename DType, typename Lang>
-void LT(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
- LOG(FATAL) << "LT Not Implemented";
+void Softmax(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "Softmax Not Implemented";
}
-/// ret[i]=(input[i]<=x)?1.f:0.f
+/// Sum the columns of the input matrix into a vector
template <typename DType, typename Lang>
-void LE(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
- LOG(FATAL) << "LE Not Implemented";
+void SumColumns(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "SumColumns Not Implemented";
}
-/// ret[i]=(input[i]>x)?1.f:0.f
+// TODO(wangwei) unify SumRow and SumCol.
+/// Sum the rows of the input matrix into a vector
template <typename DType, typename Lang>
-void GT(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
- LOG(FATAL) << "GT Not Implemented";
+void SumRows(const size_t nrow, const size_t ncol, const Blob *in, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "SumRows Not Implemented";
+}
+
+// ================Random functions===========================================
+/// Each element of out would be 1 with prob p and 0 with 1-p. 0<= p <= 1
+// Get the random generator from 'ctx'
+// If DType is not float, then convert the threshold to DType
+template <typename DType, typename Lang>
+void Bernoulli(const size_t num, const float p, Blob *out, Context *ctx) {
+ LOG(FATAL) << "Bernoulli Not Implemented";
}
-/// ret[i]=(input[i]>=x)?1.f:0.f
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the mean and std to DType
template <typename DType, typename Lang>
-void GE(const size_t num, const Blob *in, const DType x, Blob *out, Context *ctx) {
- LOG(FATAL) << "GE Not Implemented";
+void Gaussian(const size_t num, const float mean, const float std, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "Gaussian Not Implemented";
+}
+// The random generator should be extracted from ctx.
+// If DType is not float, then convert the low and high to DType
+template <typename DType, typename Lang>
+void Uniform(const size_t num, const float low, const float high, Blob *out,
+ Context *ctx) {
+ LOG(FATAL) << "Uniform Not Implemented";
}
// ===== BLAS functions, ref to http://docs.nvidia.com/cuda/cublas
-// ===== Level 1
-/// return the index of the element with the max value.
+/// return the index of the element with the max value.
template <typename DType, typename Lang>
void Amax(const size_t num, const Blob *in, size_t *out, Context *ctx) {
LOG(FATAL) << "Amax Not Implemented";
}
-/// return the index of the element with the min value.
+/// return the index of the element with the min value.
template <typename DType, typename Lang>
void Amin(const size_t num, const Blob *in, size_t *out, Context *ctx) {
LOG(FATAL) << "Amin Not Implemented";
}
-/// ret = sum |x| for all x in input
+/// out = sum |x| for all x in in
template <typename DType, typename Lang>
void Asum(const size_t num, const Blob *in, DType *out, Context *ctx) {
LOG(FATAL) << "Asum Not Implemented";
}
-/// ret = alpha * input + ret
+/// out = alpha * in + out
template <typename DType, typename Lang>
-void Axpy(const size_t num, const DType alpha, const Blob *in,
- Blob *out, Context *ctx) {
+void Axpy(const size_t num, const DType alpha, const Blob *in, Blob *out,
+ Context *ctx) {
LOG(FATAL) << "Axpy Not Implemented";
}
-/// ret *= x
+/// out *= x
template <typename DType, typename Lang>
void Scale(const size_t num, const DType x, Blob *out, Context *ctx) {
LOG(FATAL) << "Scale Not Implemented";
}
template <typename DType, typename Lang>
-void Dot(const size_t num, const Blob *in1, const Blob *in2,
- DType *out, Context *ctx) {
+void Dot(const size_t num, const Blob *in1, const Blob *in2, DType *out,
+ Context *ctx) {
LOG(FATAL) << "Dot Not Implemented";
}
-// ===== Level 2
-/// ret = alpha * op(A) * v + beta * ret.
-/// op(A) = A if trans = false; A^T otherwise; rows(op(A)) = m, cols(op(A)) = n.
+/// out = alpha * A * v + beta * out.
+/// trans indicates if the internal data layout of A is transposed
template <typename DType, typename Lang>
-void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
- const Blob *A, const Blob *v,
- const DType beta, Blob *out, Context *ctx) {
+void GEMV(bool trans, const size_t m, const size_t n, const DType alpha,
+ const Blob *A, const Blob *v, const DType beta, Blob *out,
+ Context *ctx) {
LOG(FATAL) << "GEMV Not Implemented";
}
@@ -323,34 +346,5 @@ void GEMM(const bool transA, const bool transB, const size_t nrowA,
LOG(FATAL) << "GEMM Not Implemented";
}
-
-// ===== Level 3
-
-// ================Random functions===========================================
-/// Each element of ret would be 1 with prob p and 0 with 1-p. 0<= p <= 1
-// Get the random generator from 'ctx'
-// If DType is not float, then convert the threshold to DType
-template <typename DType, typename Lang>
-void Bernoulli(const size_t num, const float p, Blob *out, Context *ctx) {
- LOG(FATAL) << "Bernoulli Not Implemented";
-}
-// The random generator should be extracted from ctx.
-// If DType is not float, then convert the low and high to DType
-template <typename DType, typename Lang>
-void Uniform(const size_t num, const float low, const float high,
- Blob *out, Context *ctx) {
- LOG(FATAL) << "Uniform Not Implemented";
-}
-// The random generator should be extracted from ctx.
-// If DType is not float, then convert the mean and std to DType
-template <typename DType, typename Lang>
-void Gaussian(const size_t num, const float mean, const float std,
- Blob *out, Context *ctx) {
- LOG(FATAL) << "Gaussian Not Implemented";
-}
-
-
-
-
} // namespace singa
#endif // SINGA_CORE_MATH_H_