Posted to commits@mxnet.apache.org by zh...@apache.org on 2018/03/10 05:46:37 UTC
[incubator-mxnet] branch master updated: Cpu lstm inference (#9977)
This is an automated email from the ASF dual-hosted git repository.
zhasheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 13ae4d1 Cpu lstm inference (#9977)
13ae4d1 is described below
commit 13ae4d1dbbd117a5b5484bf593d24469b4d2d345
Author: Jerry Zhang <zh...@cs.washington.edu>
AuthorDate: Fri Mar 9 21:46:34 2018 -0800
Cpu lstm inference (#9977)
* fix autograd import path
* cpu lstm working
* remove fatal log
* add simple unittest
remove redundant log
enable openmp
* fused input2hidden gemm
* fix lint
* fix pylint
* fix windows build error
* fix gluon rnn interface
* Update dataloader.py
* address cr
* address cr
* fix import
* revert some cosmetic change
* fix typo
* remove newline
* rm virtual
mv hardcoded number to constant
* address cr
add tests
* simplify test
* fix test
* fix tests
* change magic number scope
---
python/mxnet/gluon/rnn/rnn_layer.py | 14 ++-
src/operator/rnn-inl.h | 210 ++++++++++++++++++++++++++++++++
src/operator/rnn.cc | 1 -
tests/python/gpu/test_operator_gpu.py | 19 ++-
tests/python/unittest/test_gluon_rnn.py | 14 +++
5 files changed, 251 insertions(+), 7 deletions(-)
diff --git a/python/mxnet/gluon/rnn/rnn_layer.py b/python/mxnet/gluon/rnn/rnn_layer.py
index 204f3c9..2fac399 100644
--- a/python/mxnet/gluon/rnn/rnn_layer.py
+++ b/python/mxnet/gluon/rnn/rnn_layer.py
@@ -23,6 +23,7 @@
from __future__ import print_function
__all__ = ['RNN', 'LSTM', 'GRU']
+from ...autograd import is_training
from ... import ndarray
from .. import Block
from . import rnn_cell
@@ -185,15 +186,17 @@ class _RNNLayer(Block):
for i in range(self._dir):
self.i2h_weight[i].shape = (self._gates*self._hidden_size, inputs.shape[2])
self.i2h_weight[i]._finish_deferred_init()
- if inputs.context.device_type == 'gpu':
- out = self._forward_gpu(inputs, states)
+ if inputs.context.device_type == 'gpu' or \
+ (not is_training() and self._mode == 'lstm'):
+ out = self._forward_kernel(inputs, states)
else:
- out = self._forward_cpu(inputs, states)
+ out = self._forward(inputs, states)
# out is (output, state)
return out[0] if skip_states else out
- def _forward_cpu(self, inputs, states):
+ def _forward(self, inputs, states):
+ """forward using gluon cell"""
ns = len(states)
axis = self._layout.find('T')
states = sum(zip(*((j for j in i) for i in states)), ())
@@ -207,7 +210,8 @@ class _RNNLayer(Block):
return outputs, new_states
- def _forward_gpu(self, inputs, states):
+ def _forward_kernel(self, inputs, states):
+ """ forward using CUDNN or CPU kenrel"""
if self._layout == 'NTC':
inputs = ndarray.swapaxes(inputs, dim1=0, dim2=1)
ctx = inputs.context
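With this change, the dispatch in _RNNLayer.forward routes to the fused
kernel not only on GPU but also for LSTM inference on CPU. A minimal
sketch (not part of the commit) of exercising the new path from Gluon;
outside autograd.record(), is_training() is False, so an LSTM on CPU now
takes _forward_kernel instead of the per-step cell path:

    import mxnet as mx
    from mxnet import autograd, gluon

    layer = gluon.rnn.LSTM(100, num_layers=3)
    layer.initialize(ctx=mx.cpu(0))

    x = mx.nd.uniform(shape=(10, 16, 30))   # TNC: (seq_len, batch, input)
    states = layer.begin_state(batch_size=16)

    # Not inside autograd.record(), so this is inference and hits the
    # fused CPU LSTM kernel added in this commit.
    out, new_states = layer(x, states)
    assert not autograd.is_training()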
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index b4735b8..13c077d 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -34,7 +34,11 @@
#include <vector>
#include <string>
#include <utility>
+#include "./math.h"
+#include "./math_functions-inl.h"
#include "./operator_common.h"
+#include "./mshadow_op.h"
+#include "./linalg.h"
namespace mxnet {
namespace op {
@@ -153,6 +157,212 @@ class RNNOp : public Operator {
RNNParam param_;
}; // class RNNOp
+template<typename DType>
+class RNNOp<cpu, DType> : public Operator {
+ public:
+ explicit RNNOp(RNNParam param) {
+ this->param_ = param;
+ // RNN Mode
+ param_.lstm_q_ = false;
+ switch (param_.mode) {
+ case rnn_enum::kLstm:
+ param_.lstm_q_ = true;
+ break;
+ default:
+ LOG(FATAL) << "only LSTM is implmented on CPU";
+ }
+ }
+
+ virtual void Forward(const OpContext &ctx,
+ const std::vector<TBlob> &in_data,
+ const std::vector<OpReqType> &req,
+ const std::vector<TBlob> &out_data,
+ const std::vector<TBlob> &aux_args) {
+ // Layout TNC
+ CHECK(!ctx.is_train) << "only inference mode is available "
+ "for cpu at the moment.";
+ size_t in_expected = param_.lstm_q_ ? 4 : 3;
+ size_t out_expected = param_.lstm_q_ ? 3 : 2;
+
+ if (!param_.state_outputs)
+ LOG(FATAL) << "no state outputs is currently not supported for cpu.";
+
+ CHECK_EQ(req[rnn_enum::kOut], kWriteTo);
+ CHECK_EQ(in_data.size(), in_expected);
+ CHECK_EQ(out_data.size(), out_expected);
+
+ mshadow::Stream<cpu> *s = ctx.get_stream<cpu>();
+ // get input + output tensors
+ // w layout i2h_w, h2h_w, i2h_b, h2h_b
+ Tensor<cpu, 3, DType> x =
+ in_data[rnn_enum::kData].get<cpu, 3, DType>(s); // TNC
+ Tensor<cpu, 1, DType> w = in_data[rnn_enum::kParams].get<cpu, 1, DType>(s);
+ Tensor<cpu, 3, DType> hx =
+ in_data[rnn_enum::kState].get<cpu, 3, DType>(s); // LNC
+ Tensor<cpu, 3, DType> y =
+ out_data[rnn_enum::kOut].get<cpu, 3, DType>(s); // TNC
+ int64_t seq_len = x.shape_[0];
+ int64_t num_layers = hx.shape_[0];
+ int64_t batch_size = x.shape_[1];
+ int64_t h_channel = hx.shape_[2];
+ int64_t in_channel = x.shape_[2];
+ Tensor<cpu, 2, DType> x_flatten = in_data[rnn_enum::kData]
+ .get_with_shape<cpu, 2, DType>(
+ mshadow::Shape2(seq_len * batch_size, in_channel), s); // (T*N)C
+ Tensor<cpu, 2, DType> y_flatten = out_data[rnn_enum::kOut]
+ .get_with_shape<cpu, 2, DType>(
+ mshadow::Shape2(
+ y.shape_[0] * y.shape_[1], y.shape_[2]), s); // (T*N)C
+
+ CHECK(x.CheckContiguous());
+ CHECK(w.CheckContiguous());
+ CHECK(hx.CheckContiguous());
+ CHECK(y.CheckContiguous());
+
+ if (param_.lstm_q_) {
+ const size_t kNumMat = 4;
+ int64_t fused_h_ch = kNumMat * h_channel;
+ int64_t h_size = batch_size * fused_h_ch;
+ int64_t num_dir = 1 + param_.bidirectional;
+ int64_t h2h_w_size = h_channel * fused_h_ch;
+
+ Tensor<cpu, 3, DType> cx =
+ in_data[rnn_enum::kStateCell].get<cpu, 3, DType>(s);
+ CHECK(cx.CheckContiguous());
+
+ Tensor<cpu, 3, DType> cy =
+ out_data[rnn_enum::kStateCellOut].get<cpu, 3, DType>(s);
+ Tensor<cpu, 3, DType> hy =
+ out_data[rnn_enum::kStateOut].get<cpu, 3, DType>(s);
+ CHECK(cy.CheckContiguous());
+ CHECK(hy.CheckContiguous());
+
+ DType* workspace_addr =
+ static_cast<DType *>(ctx.requested[rnn_enum::kTempSpace]
+ .get_host_space_internal(sizeof(DType) *
+ (seq_len * h_size + h_size
+ + y.shape_[0] * y.shape_[1] * y.shape_[2])));
+ Tensor<cpu, 3, DType> i2h_y(
+ workspace_addr, mshadow::Shape3(seq_len, batch_size, fused_h_ch));
+ Tensor<cpu, 2, DType> i2h_y_flatten(
+ workspace_addr, mshadow::Shape2(seq_len * batch_size, fused_h_ch));
+ Tensor<cpu, 2, DType> h2h_y(workspace_addr
+ + seq_len * h_size, mshadow::Shape2(batch_size, fused_h_ch));
+ Tensor<cpu, 3, DType> y_tmp(workspace_addr
+ + (seq_len + 1) * h_size, y.shape_);
+ Tensor<cpu, 2, DType> y_flatten_tmp(workspace_addr
+ + (seq_len + 1) * h_size, y_flatten.shape_);
+ CHECK(i2h_y.CheckContiguous());
+ CHECK(h2h_y.CheckContiguous());
+ CHECK(y_tmp.CheckContiguous());
+
+ for (int64_t layer = 0; layer < num_layers; layer++) {
+ int reverse_dir = 0;
+ int out_tmp = 0;
+ if (param_.bidirectional && layer % 2)
+ reverse_dir = 1;
+ if (layer / num_dir % 2 == 0)
+ out_tmp = 1;
+ mshadow::Shape<2> i2h_w_shape = mshadow::Shape2(fused_h_ch,
+ (layer < num_dir) ? in_channel : num_dir * h_channel);
+ mshadow::Shape<2> h2h_w_shape = mshadow::Shape2(fused_h_ch, h_channel);
+ int64_t start = layer < num_dir ?
+ (layer * (in_channel * fused_h_ch + h2h_w_size)) : // input layer
+ (num_dir * (in_channel * fused_h_ch + h2h_w_size)
+ + (layer - num_dir) * (h2h_w_size * num_dir + h2h_w_size));
+ Tensor<cpu, 2, DType> i2h_w(w.dptr_ + start, i2h_w_shape);
+ start += layer < num_dir ?
+ in_channel * fused_h_ch : h2h_w_size * num_dir;
+ Tensor<cpu, 2, DType> h2h_w(w.dptr_ + start, h2h_w_shape);
+ start = num_dir * (in_channel * fused_h_ch + h2h_w_size)
+ + (num_layers - num_dir) * (h2h_w_size * (num_dir + 1))
+ + layer * fused_h_ch * 2;
+ Tensor<cpu, 1, DType> i2h_b = w.Slice(start, start + fused_h_ch);
+ start += fused_h_ch;
+ Tensor<cpu, 1, DType> h2h_b = w.Slice(start, start + fused_h_ch);
+ if (out_tmp) {
+ linalg_gemm(layer < num_dir ? x_flatten:y_flatten, i2h_w,
+ i2h_y_flatten, false, true, s);
+ } else {
+ linalg_gemm(layer < num_dir ? x_flatten:y_flatten_tmp, i2h_w,
+ i2h_y_flatten, false, true, s);
+ }
+ i2h_y_flatten += repmat(i2h_b, seq_len * batch_size);
+ for (int64_t t = 0; t < seq_len; t++) {
+ int64_t timestep = t;
+ if (reverse_dir)
+ timestep = seq_len - 1 - t;
+ linalg_gemm(t == 0 ? hx[layer]:hy[layer], h2h_w, h2h_y,
+ false, true, s);
+ h2h_y += repmat(h2h_b, batch_size);
+ // fused element-wise ops
+ LSTMFusedElementWiseCPUOps(i2h_y[timestep], cx[layer], h2h_y,
+ y[timestep], out_tmp ? y_tmp[timestep]: y[timestep],
+ hy[layer], cy[layer], batch_size, h_channel, t,
+ reverse_dir, out_tmp && (layer == num_layers - 1));
+ }
+ }
+ } else {
+ LOG(FATAL) << "only LSTM is available for cpu at the moment.";
+ }
+ }
+
+ virtual void Backward(const OpContext &ctx,
+ const std::vector<TBlob> &out_grad,
+ const std::vector<TBlob> &in_data,
+ const std::vector<TBlob> &out_data,
+ const std::vector<OpReqType> &req,
+ const std::vector<TBlob> &in_grad,
+ const std::vector<TBlob> &aux_args) {
+ LOG(FATAL) << "LSTM backward is not available for cpu at the moment.";
+ }
+
+ private:
+ RNNParam param_;
+
+ void LSTMFusedElementWiseCPUOps(const Tensor<cpu, 2, DType> &i2h_y,
+ const Tensor<cpu, 2, DType> &cx,
+ const Tensor<cpu, 2, DType> &h2h_y,
+ const Tensor<cpu, 2, DType> &y,
+ // holding intermediate layer output
+ const Tensor<cpu, 2, DType> &tmp,
+ const Tensor<cpu, 2, DType> &hy,
+ const Tensor<cpu, 2, DType> &cy,
+ const int64_t batch_size,
+ const int64_t h_channel,
+ const int64_t t,
+ const int reverse_dir,
+ const int copy_tmp2y) {
+ int64_t length = batch_size * h_channel;
+ #pragma omp parallel for
+ for (int64_t ji = 0; ji < length; ++ji) {
+ int64_t j = ji / h_channel; // batch dim
+ int64_t i = ji % h_channel;
+ int64_t f = i + h_channel;
+ int64_t c = i + h_channel * 2;
+ int64_t o = i + h_channel * 3;
+ int64_t j_pos = j * h_channel * 4;
+ h2h_y.dptr_[j_pos + i] += i2h_y.dptr_[j_pos + i];
+ h2h_y.dptr_[j_pos + f] += i2h_y.dptr_[j_pos + f];
+ h2h_y.dptr_[j_pos + o] += i2h_y.dptr_[j_pos + o];
+ h2h_y.dptr_[j_pos + c] += i2h_y.dptr_[j_pos + c];
+ h2h_y.dptr_[j_pos + i] = 1.0f / (1.0f + math::exp(-h2h_y.dptr_[j_pos + i]));
+ h2h_y.dptr_[j_pos + f] = 1.0f / (1.0f + math::exp(-h2h_y.dptr_[j_pos + f]));
+ h2h_y.dptr_[j_pos + o] = 1.0f / (1.0f + math::exp(-h2h_y.dptr_[j_pos + o]));
+ h2h_y.dptr_[j_pos + c] = tanh(h2h_y.dptr_[j_pos + c]);
+ cy[j][i] = h2h_y.dptr_[j_pos + f] * (t == 0 ? cx[j][i]:cy[j][i])
+ + h2h_y.dptr_[j_pos + i] * h2h_y.dptr_[j_pos + c];
+ hy[j][i] = h2h_y.dptr_[j_pos + o] * tanh(cy[j][i]);
+ tmp[j][i + h_channel * reverse_dir] = hy[j][i];
+ if (copy_tmp2y) {
+ y[j][i] = tmp[j][i];
+ if (reverse_dir)
+ y[j][i + h_channel] = tmp[j][i + h_channel];
+ }
+ }
+ }
+}; // class RNNOp
+
template<typename xpu>
Operator* CreateOp(RNNParam param, int dtype);
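After the single input-to-hidden GEMM over the whole sequence and the
per-timestep hidden-to-hidden GEMM, the remaining work per step is the
standard LSTM cell update, which LSTMFusedElementWiseCPUOps performs in
one OpenMP-parallel loop over batch_size * h_channel elements. A NumPy
sketch (not part of the commit) of the same element-wise math, with the
fused gate axis ordered [i, f, c~, o] as implied by the j_pos offsets
above:

    import numpy as np

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    def lstm_elementwise_step(i2h_y, h2h_y, c_prev):
        # i2h_y, h2h_y: (batch, 4*h) gate pre-activations from the two
        # GEMMs (biases already added via repmat); c_prev: (batch, h).
        h = c_prev.shape[1]
        gates = i2h_y + h2h_y                 # h2h_y += i2h_y in the C++ loop
        i = sigmoid(gates[:, 0*h:1*h])        # input gate
        f = sigmoid(gates[:, 1*h:2*h])        # forget gate
        c_tilde = np.tanh(gates[:, 2*h:3*h])  # candidate cell state
        o = sigmoid(gates[:, 3*h:4*h])        # output gate
        cy = f * c_prev + i * c_tilde         # new cell state
        hy = o * np.tanh(cy)                  # new hidden state
        return hy, cy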
diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc
index 908428b..a60adbc 100644
--- a/src/operator/rnn.cc
+++ b/src/operator/rnn.cc
@@ -30,7 +30,6 @@ namespace mxnet {
namespace op {
template<>
Operator *CreateOp<cpu>(RNNParam param, int dtype) {
- LOG(FATAL) << "RNN is only available for gpu at the moment.";
Operator *op = NULL;
MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
op = new RNNOp<cpu, DType>(param);
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 2cef29c..0d14968 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -1523,6 +1523,23 @@ def check_rnn_layer(layer):
for g, c in zip(gs, cs):
assert_almost_equal(g.asnumpy(), c.asnumpy(), rtol=1e-2, atol=1e-6)
+def check_rnn_layer_w_rand_inputs(layer):
+ layer.collect_params().initialize(ctx=[mx.cpu(0), mx.gpu(0)])
+ x = mx.nd.uniform(shape=(10, 16, 30))
+ with mx.gpu(0):
+ x = x.copyto(mx.gpu(0))
+ states = layer.begin_state(16)
+ go, gs = layer(x, states)
+
+ with mx.cpu(0):
+ x = x.copyto(mx.cpu(0))
+ states = layer.begin_state(16)
+ co, cs = layer(x, states)
+
+ assert_almost_equal(go.asnumpy(), co.asnumpy(), rtol=1e-2, atol=1e-6)
+ for g, c in zip(gs, cs):
+ assert_almost_equal(g.asnumpy(), c.asnumpy(), rtol=1e-2, atol=1e-6)
+
@with_seed()
def test_rnn_layer():
check_rnn_layer(gluon.rnn.RNN(100, num_layers=3))
@@ -1531,7 +1548,7 @@ def test_rnn_layer():
check_rnn_layer(gluon.rnn.GRU(100, num_layers=3))
check_rnn_layer(gluon.rnn.LSTM(100, num_layers=3, bidirectional=True))
-
+ check_rnn_layer_w_rand_inputs(gluon.rnn.LSTM(100, num_layers=3, bidirectional=True))
@with_seed()
def test_sequence_reverse():
diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py
index 871deeb..f22b13d 100644
--- a/tests/python/unittest/test_gluon_rnn.py
+++ b/tests/python/unittest/test_gluon_rnn.py
@@ -67,6 +67,20 @@ def test_lstm_forget_bias():
forget_bias * np.ones(100, ), np.zeros((2 * 100,))])
assert_allclose(mod.get_params()[0][bias_argument].asnumpy(), expected_bias)
+def test_lstm_cpu_inference():
+ # should behave the same as lstm cell
+ EXPECTED_LSTM_OUTPUT = np.array([[[0.72045636, 0.72045636, 0.95215213, 0.95215213],
+ [0.72045636, 0.72045636, 0.95215213, 0.95215213]],
+ [[0.95215213, 0.95215213, 0.72045636, 0.72045636],
+ [0.95215213, 0.95215213, 0.72045636, 0.72045636]]])
+ x = mx.nd.ones(shape=(2, 2, 2))
+ model = mx.gluon.rnn.LSTM(2, num_layers=6, bidirectional=True)
+ model.initialize(mx.init.One())
+ y = model(x).asnumpy()
+
+ mx.test_utils.assert_almost_equal(y, EXPECTED_LSTM_OUTPUT,
+ rtol=1e-3, atol=1e-5)
+
def test_gru():
cell = gluon.rnn.GRUCell(100, prefix='rnn_')
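The pinned constants in test_lstm_cpu_inference come from running the
fused CPU path with mx.init.One() initialization; the "should behave the
same as lstm cell" comment refers to the unfused per-step path. A minimal
sketch (not part of the commit) of checking that parity directly against
an unrolled LSTMCell:

    import mxnet as mx
    import numpy as np
    from mxnet import gluon

    layer = gluon.rnn.LSTM(2, num_layers=1)
    layer.initialize(mx.init.One())
    cell = gluon.rnn.LSTMCell(2)
    cell.initialize(mx.init.One())

    x = mx.nd.ones(shape=(2, 2, 2))   # TNC: (seq_len, batch, input)
    y_layer = layer(x)                # fused CPU inference path
    # LSTMCell.unroll defaults to NTC layout, so swap time/batch axes.
    y_cell, _ = cell.unroll(2, x.swapaxes(0, 1), merge_outputs=True)

    np.testing.assert_allclose(y_layer.asnumpy(),
                               y_cell.swapaxes(0, 1).asnumpy(),
                               rtol=1e-3, atol=1e-5)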