Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/01/16 18:38:11 UTC

[GitHub] piiswrong closed pull request #9306: Sequence Operator Improvements

URL: https://github.com/apache/incubator-mxnet/pull/9306

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

diff --git a/src/operator/sequence_last-inl.h b/src/operator/sequence_last-inl.h
index f71b8cf8e3..07c4709baf 100644
--- a/src/operator/sequence_last-inl.h
+++ b/src/operator/sequence_last-inl.h
@@ -45,16 +45,46 @@ namespace op {
 namespace seq_last {
 enum SequenceLastOpInputs { kData, kSequenceLength };
 enum SequenceLastOpOutputs { kOut };
+enum SequenceLastOpResource { kTempSpace };
 }
 
 struct SequenceLastParam : public dmlc::Parameter<SequenceLastParam> {
   bool use_sequence_length;
+  int axis;
   DMLC_DECLARE_PARAMETER(SequenceLastParam) {
     DMLC_DECLARE_FIELD(use_sequence_length)
         .set_default(false)
         .describe(
-            "If set to true, this layer takes in an extra input parameter `sequence_length` "
+            "If set to true, this layer takes in an extra input parameter "
+            "`sequence_length` "
             "to specify variable length sequence");
+    DMLC_DECLARE_FIELD(axis).set_default(0).describe(
+        "The sequence axis. Only values of 0 and 1 are currently supported.");
+  }
+};
+
+template <int req>
+struct SequenceLastKernel {
+  template <typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType *out, const DType *in,
+                                  const DType *idx, int offset1, int offset2,
+                                  mshadow::Shape<2> oshape) {
+    const auto opos = mxnet_op::unravel(i, oshape);
+    const int seqpos = static_cast<int>(idx[opos[0]]) - 1;
+    const int ipos = seqpos * offset1 + opos[0] * offset2 + opos[1];
+    KERNEL_ASSIGN(out[i], req, in[ipos]);
+  }
+};
+
+struct SequenceLastGradKernel {
+  template <typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType *in_grad, const DType *out_grad,
+                                  const DType *idx, int offset1, int offset2,
+                                  mshadow::Shape<2> oshape) {
+    const auto opos = mxnet_op::unravel(i, oshape);
+    const int seqpos = static_cast<int>(idx[opos[0]]) - 1;
+    const int ipos = seqpos * offset1 + opos[0] * offset2 + opos[1];
+    in_grad[ipos] += out_grad[i];
   }
 };
 
@@ -63,6 +93,47 @@ class SequenceLastOp : public Operator {
  public:
   explicit SequenceLastOp(SequenceLastParam p) { this->param_ = p; }
 
+  void sequence_last(const mshadow::Tensor<xpu, 3, DType> &data,
+                     const mshadow::Tensor<xpu, 2, DType> &out,
+                     const mshadow::Tensor<xpu, 1, DType> &indices,
+                     const OpReqType req, mshadow::Stream<xpu> *const s) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+
+    int axis = param_.axis;
+    int out_size = out.size(0) * out.size(1);
+    int max_seq_len = data.size(axis);
+    int offset1 = axis ? out.size(1) : out_size;
+    int offset2 = axis ? (max_seq_len * out.size(1)) : out.size(1);
+
+    MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
+      mxnet_op::Kernel<SequenceLastKernel<req_type>, xpu>::Launch(
+          s, out_size, out.dptr_, data.dptr_, indices.dptr_, offset1, offset2,
+          out.shape_);
+    });
+  }
+
+  void sequence_last_grad(const mshadow::Tensor<xpu, 3, DType> &in_grad,
+                          const mshadow::Tensor<xpu, 2, DType> &out_grad,
+                          const mshadow::Tensor<xpu, 1, DType> &indices,
+                          mshadow::Stream<xpu> *const s) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+
+    auto axis = param_.axis;
+    int batch = out_grad.size(0);
+    int rest = out_grad.size(1);
+    int out_size = batch * rest;
+
+    int max_seq_len = in_grad.size(axis);
+    int offset1 = axis ? rest : out_size;
+    int offset2 = axis ? (max_seq_len * rest) : rest;
+
+    mxnet_op::Kernel<SequenceLastGradKernel, xpu>::Launch(
+        s, out_size, in_grad.dptr_, out_grad.dptr_, indices.dptr_, offset1,
+        offset2, out_grad.shape_);
+  }
+
   virtual void Forward(const OpContext &ctx, const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &out_data,
@@ -74,33 +145,32 @@ class SequenceLastOp : public Operator {
     CHECK_EQ(out_data.size(), 1U);
     Stream<xpu> *s = ctx.get_stream<xpu>();
 
+    // only support axis of 0 or 1 for now
+    auto axis = param_.axis;
+
     // Get any size input + output into required form
-    index_t n = in_data[seq_last::kData].size(1);
-    int max_seq_len = in_data[seq_last::kData].size(0);
-    int total_size = in_data[seq_last::kData].Size();
-    Shape<2> s2 = Shape2(n, static_cast<int>(total_size / n / max_seq_len));
-    Shape<3> s3 =
-        Shape3(max_seq_len, n, static_cast<int>(total_size / n / max_seq_len));
+    auto d0 = in_data[seq_last::kData].size(0);
+    auto d1 = in_data[seq_last::kData].size(1);
+    auto dsize = in_data[seq_last::kData].Size();
+
+    auto batch = (axis != 0) ? d0 : d1;
+    auto max_seq_len = in_data[seq_last::kData].size(axis);
+    auto rest_size = dsize / (d0 * d1);
+
     Tensor<xpu, 3, DType> data =
-        in_data[seq_last::kData].get_with_shape<xpu, 3, DType>(s3, s);
+        in_data[seq_last::kData].get_with_shape<xpu, 3, DType>(
+            Shape3(d0, d1, rest_size), s);
     Tensor<xpu, 2, DType> out =
-        out_data[seq_last::kOut].get_with_shape<xpu, 2, DType>(s2, s);
-
-    if (param_.use_sequence_length) {
-      std::vector<index_t> indices_vec(n, max_seq_len);
-      IndexTensorToVector(
-          in_data[seq_last::kSequenceLength].get<xpu, 1, DType>(s),
-          &indices_vec);
-      if (req[seq_last::kOut] == kWriteTo) out = 0.0f;
-      index_t seq_ind;
-      for (index_t i = 0; i < n; ++i) {
-        seq_ind = indices_vec[i] - 1;  // 1-indexing
-        out[i] += data[seq_ind][i];
-      }
-    } else {
-      Assign(out, req[seq_last::kOut],
-             F<mshadow_op::identity>(data[max_seq_len - 1]));
-    }
+        out_data[seq_last::kOut].get_with_shape<xpu, 2, DType>(
+            Shape2(batch, rest_size), s);
+    Tensor<xpu, 1, DType> indices =
+        param_.use_sequence_length
+            ? in_data[seq_last::kSequenceLength].get<xpu, 1, DType>(s)
+            : ctx.requested[seq_last::kTempSpace]
+                  .get_space_typed<xpu, 1, DType>(Shape1(batch), s);
+    if (!param_.use_sequence_length) indices = max_seq_len;
+
+    sequence_last(data, out, indices, req[seq_last::kOut], s);
   }
 
   virtual void Backward(const OpContext &ctx,
@@ -119,33 +189,32 @@ class SequenceLastOp : public Operator {
     if (req[seq_last::kData] == kNullOp) return;
 
     Stream<xpu> *s = ctx.get_stream<xpu>();
+    // only support axis of 0 or 1 for now
+    auto axis = param_.axis;
 
     // Get any size input + output into required form
-    index_t n = in_grad[seq_last::kData].size(1);
-    int max_seq_len = in_grad[seq_last::kData].size(0);
-    int total_size = in_grad[seq_last::kData].Size();
-    Shape<2> s2 = Shape2(n, static_cast<int>(total_size / n / max_seq_len));
-    Shape<3> s3 =
-        Shape3(max_seq_len, n, static_cast<int>(total_size / n / max_seq_len));
+    auto d0 = in_data[seq_last::kData].size(0);
+    auto d1 = in_data[seq_last::kData].size(1);
+    auto dsize = in_data[seq_last::kData].Size();
+
+    auto batch = (axis != 0) ? d0 : d1;
+    auto max_seq_len = in_data[seq_last::kData].size(axis);
+    auto rest_size = dsize / (d0 * d1);
 
     Tensor<xpu, 3, DType> data_grad =
-        in_grad[seq_last::kData].get_with_shape<xpu, 3, DType>(s3, s);
+        in_grad[seq_last::kData].get_with_shape<xpu, 3, DType>(
+            Shape3(d0, d1, rest_size), s);
     Tensor<xpu, 2, DType> output_grad =
-        out_grad[seq_last::kOut].get_with_shape<xpu, 2, DType>(s2, s);
+        out_grad[seq_last::kOut].get_with_shape<xpu, 2, DType>(
+            Shape2(batch, rest_size), s);
+    Tensor<xpu, 1, DType> indices =
+        param_.use_sequence_length
+            ? in_data[seq_last::kSequenceLength].get<xpu, 1, DType>(s)
+            : ctx.requested[seq_last::kTempSpace]
+                  .get_space_typed<xpu, 1, DType>(Shape1(batch), s);
 
-    // copy indices to vector
-    std::vector<index_t> indices_vec(n, max_seq_len);
-    if (param_.use_sequence_length)
-      IndexTensorToVector(
-          in_data[seq_last::kSequenceLength].get<xpu, 1, DType>(s),
-          &indices_vec);
-
-    index_t seq_ind;
     if (req[seq_last::kData] == kWriteTo) data_grad = 0.0f;
-    for (index_t i = 0; i < n; ++i) {
-      seq_ind = indices_vec[i] - 1;
-      data_grad[seq_ind][i] += output_grad[i];
-    }
+    sequence_last_grad(data_grad, output_grad, indices, s);
   }
 
  private:
@@ -183,18 +252,21 @@ class SequenceLastProp : public OperatorProperty {
     using namespace mshadow;
     CHECK_EQ(in_shape->size(), param_.use_sequence_length ? 2U : 1U)
         << "Input:[data, sequence_length]";
+    CHECK((param_.axis == 0) || (param_.axis == 1))
+        << "Current implementation expects axis to be 0 or 1.";
 
     const TShape &dshape = (*in_shape)[seq_last::kData];
     CHECK_GT(dshape.ndim(), 1U)
         << "The data array must be of rank 2 or greater.";
     // seq length vector is same as batch size
+    int sbatch = param_.axis ? dshape[0] : dshape[1];
     if (param_.use_sequence_length)
-      SHAPE_ASSIGN_CHECK(*in_shape, seq_last::kSequenceLength,
-                         Shape1(dshape[1]));
+      SHAPE_ASSIGN_CHECK(*in_shape, seq_last::kSequenceLength, Shape1(sbatch));
 
     // calculate output size
     TShape shape_o(dshape.ndim() - 1);
-    for (index_t i = 0; i < shape_o.ndim(); ++i) shape_o[i] = dshape[i + 1];
+    shape_o[0] = sbatch;
+    for (index_t i = 1; i < shape_o.ndim(); ++i) shape_o[i] = dshape[i + 1];
 
     const TShape &oshape = shape_o;
     out_shape->clear();
@@ -227,6 +299,16 @@ class SequenceLastProp : public OperatorProperty {
 
   std::string TypeString() const override { return "SequenceLast"; }
 
+  std::vector<ResourceRequest> ForwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};
+  }
+
+  std::vector<ResourceRequest> BackwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};
+  }
+
   std::vector<int> DeclareBackwardDependency(
       const std::vector<int> &out_grad, const std::vector<int> &in_data,
       const std::vector<int> &out_data) const override {
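
For orientation: the new SequenceLastKernel views the output as a flat (batch, rest) buffer and reads each last element through the flat input index seqpos * offset1 + batch * offset2 + rest_index, with the two offsets selected by the sequence axis. A minimal NumPy sketch of that index arithmetic (sequence_last_ref is an illustrative helper, not a function in this PR):

    import numpy as np

    def sequence_last_ref(data, lengths, axis):
        # data is (seqlen, batch, rest) for axis=0, (batch, seqlen, rest) for axis=1
        batch = data.shape[1] if axis == 0 else data.shape[0]
        rest = data.shape[2]
        max_seq_len = data.shape[axis]
        offset1 = rest if axis else batch * rest
        offset2 = max_seq_len * rest if axis else rest
        flat = data.reshape(-1)
        out = np.empty((batch, rest), dtype=data.dtype)
        for i in range(batch * rest):
            b, r = divmod(i, rest)           # unravel i over the (batch, rest) output
            seqpos = int(lengths[b]) - 1     # sequence_length is 1-indexed
            out.reshape(-1)[i] = flat[seqpos * offset1 + b * offset2 + r]
        return out

This mirrors the offset1/offset2 selection in sequence_last() above and agrees with the sequence_last_numpy reference added to the tests below.
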
diff --git a/src/operator/sequence_mask-inl.h b/src/operator/sequence_mask-inl.h
index 7f53a0ba82..a34cea0496 100644
--- a/src/operator/sequence_mask-inl.h
+++ b/src/operator/sequence_mask-inl.h
@@ -32,12 +32,11 @@
 #include <mxnet/operator.h>
 #include <algorithm>
 #include <map>
-#include <vector>
 #include <string>
 #include <utility>
-#include "./operator_common.h"
+#include <vector>
 #include "./mshadow_op.h"
-#include "./nn/sequence_mask-inl.h"
+#include "./operator_common.h"
 
 namespace mxnet {
 namespace op {
@@ -45,19 +44,60 @@ namespace op {
 namespace seq_mask {
 enum SequenceMaskOpInputs { kData, kSequenceLength };
 enum SequenceMaskOpOutputs { kOut };
+enum SequenceMaskOpBackResource { kTempSpace };
 }
 
 struct SequenceMaskParam : public dmlc::Parameter<SequenceMaskParam> {
   bool use_sequence_length;
   float value;
+  int axis;
   DMLC_DECLARE_PARAMETER(SequenceMaskParam) {
     DMLC_DECLARE_FIELD(use_sequence_length)
         .set_default(false)
         .describe(
-            "If set to true, this layer takes in an extra input parameter `sequence_length` "
+            "If set to true, this layer takes in an extra input parameter "
+            "`sequence_length` "
             "to specify variable length sequence");
     DMLC_DECLARE_FIELD(value).set_default(0.).describe(
         "The value to be used as a mask.");
+    DMLC_DECLARE_FIELD(axis).set_default(0).describe(
+        "The sequence axis. Only values of 0 and 1 are currently supported.");
+  }
+};
+
+// (seqlen, batch, rest) case
+template <int req>
+struct SequenceMask0Kernel {
+  template <typename DType>
+  MSHADOW_XINLINE static void Map(int b, DType *in, const DType *idx,
+                                  index_t max_s_len, index_t batch_size,
+                                  index_t restsize, DType value) {
+    const index_t seqpos = static_cast<int>(idx[b]);
+#pragma unroll
+    for (index_t s = seqpos; s < max_s_len; ++s) {
+      index_t incr = (s * batch_size * restsize) + (b * restsize);
+#pragma unroll
+      for (index_t r = 0; r < restsize; ++r)
+        KERNEL_ASSIGN(in[incr + r], req, value);
+    }
+  }
+};
+
+// (batch, seqlen, rest) case
+template <int req>
+struct SequenceMask1Kernel {
+  template <typename DType>
+  MSHADOW_XINLINE static void Map(int b, DType *in, const DType *idx,
+                                  index_t max_s_len, index_t batch_size,
+                                  index_t restsize, DType value) {
+    const index_t seqpos = static_cast<int>(idx[b]);
+#pragma unroll
+    for (index_t s = seqpos; s < max_s_len; ++s) {
+      index_t incr = (b * max_s_len * restsize) + (s * restsize);
+#pragma unroll
+      for (index_t r = 0; r < restsize; ++r)
+        KERNEL_ASSIGN(in[incr + r], req, value);
+    }
   }
 };
 
@@ -66,6 +106,29 @@ class SequenceMaskOp : public Operator {
  public:
   explicit SequenceMaskOp(SequenceMaskParam p) { this->param_ = p; }
 
+  void sequence_mask(const mshadow::Tensor<xpu, 3, DType> &data,
+                     const mshadow::Tensor<xpu, 1, DType> &indices,
+                     const OpReqType req, mshadow::Stream<xpu> *const s,
+                     DType val) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+
+    index_t batch = indices.size(0);
+    index_t max_seq_len = data.size(param_.axis);
+    index_t restsize = data.size(2);
+
+    MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
+      if (param_.axis == 1)
+        mxnet_op::Kernel<SequenceMask1Kernel<req_type>, xpu>::Launch(
+            s, batch, data.dptr_, indices.dptr_, max_seq_len, batch, restsize,
+            val);
+      else
+        mxnet_op::Kernel<SequenceMask0Kernel<req_type>, xpu>::Launch(
+            s, batch, data.dptr_, indices.dptr_, max_seq_len, batch, restsize,
+            val);
+    });
+  }
+
   virtual void Forward(const OpContext &ctx, const std::vector<TBlob> &in_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &out_data,
@@ -77,21 +140,23 @@ class SequenceMaskOp : public Operator {
     Stream<xpu> *s = ctx.get_stream<xpu>();
 
     // Get any size input + output into required form
-    int max_seq_len = in_data[seq_mask::kData].size(0);
-    int n = in_data[seq_mask::kData].size(1);
-    int total_size = in_data[seq_mask::kData].Size();
-    int rest_dim = static_cast<int>(total_size / n / max_seq_len);
+    auto d0 = in_data[seq_mask::kData].size(0);
+    auto d1 = in_data[seq_mask::kData].size(1);
+    auto dsize = in_data[seq_mask::kData].Size();
+    auto rest_size = dsize / (d0 * d1);
 
-    Shape<3> s3 = Shape3(max_seq_len, n, rest_dim);
+    Shape<3> s3 = Shape3(d0, d1, rest_size);
     Tensor<xpu, 3, DType> data =
         in_data[seq_mask::kData].get_with_shape<xpu, 3, DType>(s3, s);
     Tensor<xpu, 3, DType> out =
         out_data[seq_mask::kOut].get_with_shape<xpu, 3, DType>(s3, s);
+    // Actual implementation of masking
     Assign(out, req[seq_mask::kOut], F<mshadow_op::identity>(data));
     if (param_.use_sequence_length) {
       Tensor<xpu, 1, DType> indices =
           in_data[seq_mask::kSequenceLength].get<xpu, 1, DType>(s);
-      mxnet_op::SequenceMask(out, indices, static_cast<DType>(param_.value));
+      sequence_mask(out, indices, req[seq_mask::kOut], s,
+                    static_cast<DType>(param_.value));
     }
   }
 
@@ -109,25 +174,36 @@ class SequenceMaskOp : public Operator {
     Stream<xpu> *s = ctx.get_stream<xpu>();
 
     // Get any size input + output into required form
-    int max_seq_len = in_grad[seq_mask::kData].size(0);
-    int n = in_grad[seq_mask::kData].size(1);
-    int total_size = in_grad[seq_mask::kData].Size();
-    int rest_dim = static_cast<int>(total_size / n / max_seq_len);
-
-    Shape<3> s3 = Shape3(max_seq_len, n, rest_dim);
+    auto d0 = in_grad[seq_mask::kData].size(0);
+    auto d1 = in_grad[seq_mask::kData].size(1);
+    auto dsize = in_grad[seq_mask::kData].Size();
+    auto rest_size = dsize / (d0 * d1);
 
-    Tensor<xpu, 3, DType> data_grad =
+    Shape<3> s3 = Shape3(d0, d1, rest_size);
+    Tensor<xpu, 3, DType> data_g =
         in_grad[seq_mask::kData].get_with_shape<xpu, 3, DType>(s3, s);
-    Tensor<xpu, 3, DType> output_grad =
+    Tensor<xpu, 3, DType> out_g =
         out_grad[seq_mask::kOut].get_with_shape<xpu, 3, DType>(s3, s);
 
-    Assign(data_grad, req[seq_mask::kData],
-           F<mshadow_op::identity>(output_grad));
-
-    if (param_.use_sequence_length) {
+    // Actual implementation of masking
+    if (req[seq_mask::kData] == kNullOp) return;
+    if (!param_.use_sequence_length) {
+      Assign(data_g, req[seq_mask::kData], F<mshadow_op::identity>(out_g));
+    } else {
       Tensor<xpu, 1, DType> indices =
           in_data[seq_mask::kSequenceLength].get<xpu, 1, DType>(s);
-      mxnet_op::SequenceMask(data_grad, indices, DType(0));
+      if (req[seq_mask::kData] == kAddTo) {
+        Tensor<xpu, 3, DType> out_g_temp =
+            ctx.requested[seq_mask::kTempSpace].get_space_typed<xpu, 3, DType>(
+                s3, s);
+        out_g_temp = F<mshadow_op::identity>(out_g);
+        out_g = out_g_temp;
+        sequence_mask(out_g, indices, kWriteInplace, s, DType(0.));
+        Assign(data_g, kAddTo, F<mshadow_op::identity>(out_g));
+      } else {
+        Assign(data_g, req[seq_mask::kData], F<mshadow_op::identity>(out_g));
+        sequence_mask(data_g, indices, req[seq_mask::kData], s, DType(0.));
+      }
     }
   }
 
@@ -172,10 +248,13 @@ class SequenceMaskProp : public OperatorProperty {
     const TShape &dshape = (*in_shape)[seq_mask::kData];
     CHECK_GT(dshape.ndim(), 1U)
         << "The data array must be of rank 2 or greater.";
+    CHECK((param_.axis == 0) || (param_.axis == 1))
+        << "Current implementation expects axis to be 0 or 1.";
+
     // seq length vector is same as batch size
+    int sbatch = param_.axis ? dshape[0] : dshape[1];
     if (param_.use_sequence_length)
-      SHAPE_ASSIGN_CHECK(*in_shape, seq_mask::kSequenceLength,
-                         Shape1(dshape[1]));
+      SHAPE_ASSIGN_CHECK(*in_shape, seq_mask::kSequenceLength, Shape1(sbatch));
 
     const TShape &oshape = dshape;
     out_shape->clear();
@@ -222,6 +301,19 @@ class SequenceMaskProp : public OperatorProperty {
     return {ResourceRequest::kTempSpace};
   }
 
+  std::vector<std::pair<int, void *> > BackwardInplaceOption(
+      const std::vector<int> &out_grad, const std::vector<int> &in_data,
+      const std::vector<int> &out_data,
+      const std::vector<void *> &in_grad) const override {
+    return {{out_grad[seq_mask::kOut], in_grad[seq_mask::kData]}};
+  }
+
+  std::vector<std::pair<int, void *> > ForwardInplaceOption(
+      const std::vector<int> &in_data,
+      const std::vector<void *> &out_data) const override {
+    return {{in_data[seq_mask::kData], out_data[seq_mask::kOut]}};
+  }
+
   Operator *CreateOperator(Context ctx) const override {
     LOG(FATAL) << "Not Implemented.";
     return NULL;
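
One detail worth noting in the new SequenceMask backward path: the masking kernels write in place, so when req is kAddTo the gradient is first copied into the requested temp space, masked there, and only then accumulated into data_grad, leaving out_grad untouched. A rough NumPy analogue of that masked-gradient step (assumed semantics; masked_out_grad is an illustrative name):

    import numpy as np

    def masked_out_grad(out_grad, lengths, axis):
        # Zero the gradient at padded positions t >= lengths[b] along the sequence axis.
        g = np.moveaxis(out_grad.copy(), axis, 0)   # scratch copy viewed as (seqlen, batch, ...)
        for b, l in enumerate(lengths):
            g[int(l):, b] = 0
        return np.moveaxis(g, 0, axis)

    # kAddTo:   data_grad += masked_out_grad(out_grad, lengths, axis)
    # kWriteTo: data_grad  = masked_out_grad(out_grad, lengths, axis)
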
diff --git a/src/operator/sequence_reverse-inl.h b/src/operator/sequence_reverse-inl.h
index 47154011bc..943ca6e933 100644
--- a/src/operator/sequence_reverse-inl.h
+++ b/src/operator/sequence_reverse-inl.h
@@ -51,6 +51,7 @@ enum SequenceReverseOpOutputs { kOut };
 
 struct SequenceReverseParam : public dmlc::Parameter<SequenceReverseParam> {
   bool use_sequence_length;
+  int axis;
   DMLC_DECLARE_PARAMETER(SequenceReverseParam) {
     DMLC_DECLARE_FIELD(use_sequence_length)
         .set_default(false)
@@ -58,20 +59,23 @@ struct SequenceReverseParam : public dmlc::Parameter<SequenceReverseParam> {
             "If set to true, this layer takes in an extra input parameter "
             "`sequence_length` "
             "to specify variable length sequence");
+    DMLC_DECLARE_FIELD(axis).set_default(0).describe(
+        "The sequence axis. Only 0 is currently supported.");
   }
 };
 
 struct ReverseKernel {
   template <typename DType>
-  MSHADOW_XINLINE static void Map(
-      const int i, DType *const out_data, const DType *const in_data,
-      const OpReqType req, const index_t max_seq_len, const index_t batch_size,
-      const index_t other_dim, const index_t numel, const DType *const indices
-      ) {
+  MSHADOW_XINLINE static void Map(const int i, DType *const out_data,
+                                  const DType *const in_data,
+                                  const OpReqType req,
+                                  const index_t max_seq_len,
+                                  const index_t batch_size,
+                                  const index_t other_dim, const index_t numel,
+                                  const DType *const indices) {
     for (index_t batch = 0; batch < batch_size; ++batch) {
-      const index_t num_seq = indices
-                                  ? static_cast<index_t>(indices[batch])
-                                  : max_seq_len;
+      const index_t num_seq =
+          indices ? static_cast<index_t>(indices[batch]) : max_seq_len;
       const index_t padded_periods = max_seq_len - num_seq;
       // padded part
       if (padded_periods > 0 && i < static_cast<int>(padded_periods)) {
@@ -130,10 +134,10 @@ class SequenceReverseOp : public Operator {
     Stream<xpu> *const s = ctx.get_stream<xpu>();
 
     // Get any size input + output into required form
-    int max_seq_len = in_data[seq_reverse::kData].size(0);
-    int n = in_data[seq_reverse::kData].size(1);
-    int total_size = in_data[seq_reverse::kData].Size();
-    int rest_dim = static_cast<int>(total_size / n / max_seq_len);
+    auto max_seq_len = in_data[seq_reverse::kData].size(0);
+    auto n = in_data[seq_reverse::kData].size(1);
+    auto total_size = in_data[seq_reverse::kData].Size();
+    auto rest_dim = static_cast<int>(total_size / n / max_seq_len);
 
     Shape<3> s3 = Shape3(max_seq_len, n, rest_dim);
     Tensor<xpu, 3, DType> data =
@@ -163,10 +167,10 @@ class SequenceReverseOp : public Operator {
     Stream<xpu> *s = ctx.get_stream<xpu>();
 
     // Get any size input + output into required form
-    int max_seq_len = in_grad[seq_reverse::kData].size(0);
-    int n = in_grad[seq_reverse::kData].size(1);
-    int total_size = in_grad[seq_reverse::kData].Size();
-    int rest_dim = static_cast<int>(total_size / n / max_seq_len);
+    auto max_seq_len = in_grad[seq_reverse::kData].size(0);
+    auto n = in_grad[seq_reverse::kData].size(1);
+    auto total_size = in_grad[seq_reverse::kData].Size();
+    auto rest_dim = static_cast<int>(total_size / n / max_seq_len);
 
     Shape<3> s3 = Shape3(max_seq_len, n, rest_dim);
 
@@ -180,7 +184,8 @@ class SequenceReverseOp : public Operator {
             ? in_data[seq_reverse::kSequenceLength].dptr<DType>()
             : nullptr;
 
-    sequence_reverse(output_grad, data_grad, req[seq_reverse::kData], indices, s);
+    sequence_reverse(output_grad, data_grad, req[seq_reverse::kData], indices,
+                     s);
   }
 
  private:
@@ -220,6 +225,7 @@ class SequenceReverseProp : public OperatorProperty {
     using namespace mshadow;
     CHECK_EQ(in_shape->size(), param_.use_sequence_length ? 2U : 1U)
         << "Input:[data, sequence_length]";
+    CHECK_EQ(param_.axis, 0) << "Current implementation expects axis to be 0.";
 
     const TShape &dshape = (*in_shape)[seq_reverse::kData];
     CHECK_GT(dshape.ndim(), 1U)
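
Taken together, all three operators now expose an axis parameter, though only SequenceLast and SequenceMask accept axis=1; SequenceReverse still checks for axis=0. A quick usage sketch of the extended Python bindings (assuming the generated ops pick up the new parameter; shapes and values are illustrative):

    import mxnet as mx

    x = mx.random.uniform(-1, 1, (2, 4, 3))   # (batch, seqlen, rest) layout for axis=1
    lens = mx.nd.array([2, 4])                 # valid length per batch element

    last = mx.nd.SequenceLast(x, lens, use_sequence_length=True, axis=1)
    masked = mx.nd.SequenceMask(x, lens, use_sequence_length=True, value=-1.0, axis=1)
    # SequenceReverse still requires the sequence axis to come first:
    rev = mx.nd.SequenceReverse(x.swapaxes(0, 1), lens, use_sequence_length=True, axis=0)
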
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 56dc27c493..e92e9caf12 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -2311,39 +2311,91 @@ def test_l2_normalization():
                         check_l2_normalization((nbatch, nchannel, height, width), mode)
 
 
-def sequence_mask_numpy(array, lengths, value):
+# Numpy Implementation of Sequence Ops
+def sequence_last_numpy(array, lengths, axis):
+    # create new array of dims [batch, seqlen, ...]
+    array2 = np.moveaxis(array, axis, 1)
+    dims = array2.shape
+    if lengths is None:
+        return array2[:, -1]
+    lengths = list(lengths)
+    return np.array([array2[i, int(lengths[i]) - 1] for i in range(dims[0])])
+
+
+def sequence_mask_numpy(array, lengths, axis, value):
+    if lengths is None:
+        return array
     arrayMask = array.copy()
-    shape = array.shape
-    batch = shape[1]
-    for i in range(batch):
-        arrayMask[int(lengths[i]):, i] = value
-    return arrayMask
-
-def check_sequence_mask(shape, xpu, mask_value):
+    # conform to [batch, seqlen, ...]
+    arrayMask = np.moveaxis(arrayMask, axis, 1)
+    shape = arrayMask.shape
+    lengths = list(lengths)
+    for i in range(shape[0]):
+        arrayMask[i, int(lengths[i]):] = value
+    return np.moveaxis(arrayMask, 1, axis)
+
+
+def sequence_reverse_numpy(array, lengths, axis):
+    rarray = array.copy()
+    # conform to [batch, seqlen, ...]
+    rarray = np.moveaxis(rarray, axis, 1)
+    shape = rarray.shape
+    if lengths is None:
+        lengths = [shape[1]] * shape[0]
+    lengths = list(lengths)
+    for i in range(shape[0]):
+        j = int(lengths[i])
+        rarray[i,:j] = rarray[i,:j][::-1]
+    return np.moveaxis(rarray, 1, axis)
+
+
+def check_sequence_func(ftype, mask_value=0, axis=0):
     # bind with label
+    xpu = default_context()
     X = mx.symbol.Variable('X')
     L = mx.symbol.Variable('L') # lengths
-    Y = mx.symbol.SequenceMask(data=X, use_sequence_length=True, sequence_length=L, value=mask_value)
-    x = mx.random.uniform(-1, 1, shape, ctx=mx.cpu()).copyto(xpu)
-    l = mx.nd.array(np.random.randint(1, shape[0] + 1, shape[1]), ctx=mx.cpu()).copyto(xpu)
-    # numpy result
-    np_out = sequence_mask_numpy(x.asnumpy(), l.asnumpy(), mask_value)
-    # mxnet result
-    exec1 = Y.bind(xpu, args = [x, l], grad_req={'X':'null', 'L':'null'})
-    exec1.forward()
-    out = exec1.outputs[0].asnumpy()
-    # compare numpy + mxnet
-    assert_almost_equal(out, np_out, rtol=1e-5)
-    # grad check
-    check_numeric_gradient(Y, [x.asnumpy(), l.asnumpy()], grad_nodes={'X':'write'},
-        numeric_eps=1e-3, rtol=1e-2)
+    shapes = [(3, 4), (1, 1), (3, 4, 3, 1, 1)]
+    for seqlenQ in [True, False]:
+        for s in shapes:
+            x = mx.random.uniform(-1, 1, s, ctx=mx.cpu()).copyto(xpu)
+            batch = s[1] if (axis == 0) else s[0]
+            seqlen = s[axis]
+            l_np = np.random.randint(1, seqlen + 1, batch)
+            l = mx.nd.array(l_np, ctx=mx.cpu()).copyto(xpu)
+            if not seqlenQ:
+                l_np = None
+            args = {'data':X, 'use_sequence_length':seqlenQ, "axis":axis}
+            if seqlenQ:
+                args['sequence_length'] = L
+            if ftype == "last":
+                Y = mx.symbol.SequenceLast(**args)
+                np_out = sequence_last_numpy(x.asnumpy(), l_np, axis)
+            elif ftype == "mask":
+                args['value'] = mask_value
+                Y = mx.symbol.SequenceMask(**args)
+                np_out = sequence_mask_numpy(x.asnumpy(), l_np, axis, mask_value)
+            elif ftype == "reverse":
+                Y = mx.symbol.SequenceReverse(**args)
+                np_out = sequence_reverse_numpy(x.asnumpy(), l_np, axis)
+            fargs = [x, l] if seqlenQ else [x]
+            gargs = [x.asnumpy(), l_np] if seqlenQ else [x.asnumpy()]
+            check_symbolic_forward(Y, fargs, [np_out])
+            check_numeric_gradient(Y, gargs, grad_nodes={'X':'write'},
+                numeric_eps=1e-2, rtol=1e-2)
+            check_numeric_gradient(Y, gargs, grad_nodes={'X':'add'},
+                numeric_eps=1e-3, rtol=1e-2, atol=1E-4)
+            check_numeric_gradient(Y, gargs, grad_nodes={'X':'null'},
+                numeric_eps=1e-3, rtol=1e-2, atol=1E-4)
+
+
+def test_sequence_last():
+    check_sequence_func("last", axis=0)
+    check_sequence_func("last", axis=1)
+
 
 def test_sequence_mask():
-    shape1 = (4, 2, 2, 3)
-    shape2 = (1, 2, 2, 3, 1, 1)
-    check_sequence_mask(shape1, default_context(), 2.1)
-    check_sequence_mask(shape2, default_context(), 0.1)
-    check_sequence_mask((3, 4), default_context(), 0.14)
+    check_sequence_func("mask", axis = 0, mask_value=-2.3)
+    check_sequence_func("mask", axis = 1, mask_value=0.3)
 
 
 def check_sequence_reverse(xpu):
@@ -2410,7 +2462,9 @@ def test_wrapper(arr, xpu, sequence_length=None, use_sequence_length=False):
     assert_array_equal(test_wrapper(arr, xpu, sequence_length=[2, 3], use_sequence_length=True), arr3)
     assert_array_equal(test_wrapper(arr_4, xpu, sequence_length=seq_len_1, use_sequence_length=True), arr_5)
 
+
 def test_sequence_reverse():
+    check_sequence_func("reverse", axis=0)
     check_sequence_reverse(mx.cpu())
 
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services