Posted to commits@tvm.apache.org by ma...@apache.org on 2021/03/25 04:25:33 UTC

[tvm] branch main updated: [Topi, Relay] Add cumprod (#7722)

This is an automated email from the ASF dual-hosted git repository.

masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 8e23806  [Topi, Relay] Add cumprod (#7722)
8e23806 is described below

commit 8e23806d2d522b71979d0a2730b38cc5c3bf6185
Author: AndrewZhaoLuo <an...@gmail.com>
AuthorDate: Wed Mar 24 21:25:18 2021 -0700

    [Topi, Relay] Add cumprod (#7722)
    
    * make cumbinop, refactor cumsum, add cumprod
    
    * cumsum exclusive test
    
    * Add cumprod + flesh out cumsum tests
    
    add cumprod and tests
    
    reinstate tests
    
    rethink
    
    * add rudimentary scan implementation
    
    * add attributes of cumprod node
    
    * add cumprod strategy
    
    * add cuda strategy
    
    * python relay node construction
    
    * change attrs to be reusable
    
    * add cumprod nodes
    
    * complete tests
    
    * Fix some typos about sum --> prod
    
    typos fix sum -> prod
    
    more typos
    
    more typo fixes
    
    more typos
    
    add doc strings
    
    * Use Bool instead of int to represent exclusive
    
    make exclusive a bool up and down stack
    
    fix x
    
    fix bool err
    
    it is a bool now
    
    fix
    
    fix thing
    
    formatting to pass linter
    
    lint python
    
    cumprod pylint
    
    fix attribute
    
    fix ordering
    
    add exclusivity tests for end to end
    
    fix things
    
    cuda identity_value
    
    * Overall improve formatting, add doc message corrections
    
    simplify construction
    
    clang-format
    
    more tests
    
    undo simpler construction due to function passing stuff
    
    fix docs
    
    more exclusive doc changes
    
    more fixes
    
    * merge cumsum and cumprod to scan, merge tests
    
    fix stuff
    
    * remove other mentions of cumbinop -> scanop
    
    * lint formatting
    
    Co-authored-by: Andrew Zhao Luo <an...@Andrews-MacBook-Pro.local>
---
 include/tvm/relay/attrs/transform.h          |  14 +-
 python/tvm/relay/op/_transform.py            |  19 ++-
 python/tvm/relay/op/strategy/cuda.py         |  19 ++-
 python/tvm/relay/op/strategy/generic.py      |  29 +++-
 python/tvm/relay/op/transform.py             |  65 +++++++-
 python/tvm/topi/__init__.py                  |   2 +-
 python/tvm/topi/cuda/scan.py                 | 196 +++++++++++++++++++---
 python/tvm/topi/cumsum.py                    | 121 --------------
 python/tvm/topi/scan.py                      | 236 +++++++++++++++++++++++++++
 python/tvm/topi/unique.py                    |   2 +-
 src/relay/op/tensor/transform.cc             |  34 +++-
 tests/python/relay/test_op_level3.py         |  77 ++++++---
 tests/python/topi/python/test_topi_cumsum.py |  79 ---------
 tests/python/topi/python/test_topi_scan.py   | 144 ++++++++++++++++
 14 files changed, 758 insertions(+), 279 deletions(-)
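
For orientation before the diff: the patch folds the cumsum-specific pieces into a shared "scanop" path and adds a cumprod operator at the TOPI, strategy, and Relay levels. A rough sketch of the intended Relay-level usage, not taken from the patch itself (executor setup may differ slightly by TVM version):

    import numpy as np
    import tvm
    from tvm import relay

    # Build a small Relay function around the new cumprod op added by this patch.
    data = relay.var("data", shape=(2, 3), dtype="int32")
    out = relay.op.cumprod(data, axis=1, exclusive=False)  # inclusive product along rows
    func = relay.Function([data], out)

    x = np.array([[1, 2, 3], [4, 5, 6]], dtype="int32")
    result = relay.create_executor("graph").evaluate(func)(x)
    # NumPy reference: np.cumprod(x, axis=1) -> [[1, 2, 6], [4, 20, 120]]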

diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index ff344f5..a5544c8 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -438,17 +438,19 @@ struct MatrixSetDiagAttrs : public tvm::AttrsNode<MatrixSetDiagAttrs> {
   }
 };  // struct MatrixSetDiagAttrs
 
-/*! \brief Attributes used in cumsum operator */
-struct CumsumAttrs : public tvm::AttrsNode<CumsumAttrs> {
+/*! \brief Attributes used in cumsum and cumprod operator */
+struct ScanopAttrs : public tvm::AttrsNode<ScanopAttrs> {
   Integer axis;
   DataType dtype;
-  Integer exclusive;
-  TVM_DECLARE_ATTRS(CumsumAttrs, "relay.attrs.CumsumAttrs") {
-    TVM_ATTR_FIELD(axis).describe("The axis to sum over").set_default(NullValue<Integer>());
+  Bool exclusive = Bool(false);
+  TVM_DECLARE_ATTRS(ScanopAttrs, "relay.attrs.ScanopAttrs") {
+    TVM_ATTR_FIELD(axis).describe("The axis to operate over").set_default(NullValue<Integer>());
     TVM_ATTR_FIELD(dtype).describe("Output data type").set_default(NullValue<DataType>());
+
+    // Default is false
     TVM_ATTR_FIELD(exclusive)
         .describe("The first element is not included")
-        .set_default(NullValue<Integer>());
+        .set_default(Bool(false));
   }
 };
 
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index e90263d..1626283 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -19,16 +19,17 @@
 # pylint: disable=too-many-local-variables, too-many-arguments, no-else-return
 
 from __future__ import absolute_import
+
 import tvm
-from tvm import te
-from tvm.te.hybrid import script
+from tvm import te, topi
 from tvm.runtime import convert
-from tvm import topi
+from tvm.te.hybrid import script
 from tvm.topi.utils import get_const_int, get_const_tuple
+
 from . import op as _reg
 from . import strategy
-from .op import OpPattern
 from ._tensor import elemwise_shape_func
+from .op import OpPattern
 
 _reg.register_broadcast_schedule("broadcast_to")
 _reg.register_broadcast_schedule("broadcast_to_like")
@@ -159,6 +160,16 @@ def compute_cumsum(attrs, inputs, output_type):
 _reg.register_strategy("cumsum", strategy.cumsum_strategy)
 _reg.register_shape_func("cumsum", False, elemwise_shape_func)
 
+# cumprod
+@_reg.register_compute("cumprod")
+def compute_cumprod(attrs, inputs, output_type):
+    """Compute definition of cumprod"""
+    return [topi.cumprod(inputs[0], attrs.axis, attrs.dtype, attrs.exclusive)]
+
+
+_reg.register_strategy("cumprod", strategy.cumprod_strategy)
+_reg.register_shape_func("cumprod", False, elemwise_shape_func)
+
 
 @_reg.register_compute("unique")
 def compute_unique(attrs, inputs, output_type):
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index e0d0f16..1a67425 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -18,11 +18,12 @@
 # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
 from tvm import topi
 from tvm.auto_scheduler import is_auto_scheduler_enabled
-from tvm.te import SpecializedCondition
 from tvm.contrib import nvcc
 from tvm.contrib.thrust import can_use_thrust
-from .generic import *
+from tvm.te import SpecializedCondition
+
 from .. import op as _op
+from .generic import *
 
 
 @schedule_injective.register(["cuda", "gpu"])
@@ -1017,13 +1018,25 @@ def cumsum_strategy_cuda(attrs, inputs, out_type, target):
     """cumsum cuda strategy"""
     strategy = _op.OpStrategy()
     strategy.add_implementation(
-        wrap_compute_cumsum(topi.cuda.cumsum),
+        wrap_compute_scanop(topi.cuda.cumsum),
         wrap_topi_schedule(topi.cuda.schedule_scan),
         name="cumsum.cuda",
     )
     return strategy
 
 
+@cumprod_strategy.register(["cuda", "gpu"])
+def cumprod_strategy_cuda(attrs, inputs, out_type, target):
+    """cumprod cuda strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_scanop(topi.cuda.cumprod),
+        wrap_topi_schedule(topi.cuda.schedule_scan),
+        name="cumprod.cuda",
+    )
+    return strategy
+
+
 @unique_strategy.register(["cuda", "gpu"])
 def unique_strategy_cuda(attrs, inputs, out_type, target):
     """unique cuda strategy"""
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index 04f2564..322a360 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -17,11 +17,12 @@
 """Definition of generic operator strategy."""
 # pylint: disable=invalid-name,unused-argument
 import logging
-
 import re
-from tvm import topi, _ffi, te, ir
-from tvm.topi.utils import get_const_int, get_const_float, get_const_tuple, get_float_tuple
+
+from tvm import _ffi, ir, te, topi
 from tvm.target import generic_func, override_native_generic_func
+from tvm.topi.utils import get_const_float, get_const_int, get_const_tuple, get_float_tuple
+
 from .. import op as _op
 
 logger = logging.getLogger("strategy")
@@ -1463,13 +1464,13 @@ def threefry_split_strategy(attrs, inputs, out_type, target):
     return strategy
 
 
-def wrap_compute_cumsum(topi_compute):
-    """Wrap cumsum topi compute"""
+def wrap_compute_scanop(topi_compute):
+    """Wrap scanop style topi compute"""
 
-    def _compute_cumsum(attrs, inputs, _):
+    def _compute_scanop(attrs, inputs, _):
         return [topi_compute(inputs[0], attrs.axis, attrs.dtype, attrs.exclusive)]
 
-    return _compute_cumsum
+    return _compute_scanop
 
 
 @override_native_generic_func("cumsum_strategy")
@@ -1477,13 +1478,25 @@ def cumsum_strategy(attrs, inputs, out_type, target):
     """cumsum generic strategy"""
     strategy = _op.OpStrategy()
     strategy.add_implementation(
-        wrap_compute_cumsum(topi.cumsum),
+        wrap_compute_scanop(topi.cumsum),
         wrap_topi_schedule(topi.generic.schedule_extern),
         name="cumsum.generic",
     )
     return strategy
 
 
+@override_native_generic_func("cumprod_strategy")
+def cumprod_strategy(attrs, inputs, out_type, target):
+    """cumprod generic strategy"""
+    strategy = _op.OpStrategy()
+    strategy.add_implementation(
+        wrap_compute_scanop(topi.cumprod),
+        wrap_topi_schedule(topi.generic.schedule_extern),
+        name="cumprod.generic",
+    )
+    return strategy
+
+
 def wrap_compute_unique(topi_compute):
     """Wrap unique topi compute"""
 
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index df0ae76..f94a00d 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -18,11 +18,11 @@
 # pylint: disable=import-outside-toplevel
 """Transform operators."""
 
+from ...tir import expr as _expr
+from ..expr import Constant, Expr, Tuple, TupleWrapper, const
 from . import _make
 from .dyn import _make as _dyn_make
 from .tensor import shape_of
-from ..expr import TupleWrapper, const, Constant, Expr, Tuple
-from ...tir import expr as _expr
 
 
 def cast(data, dtype):
@@ -1539,9 +1539,9 @@ def cumsum(data, axis=None, dtype=None, exclusive=None):
         Type of the returned array and of the accumulator in which the elements are summed.
         If dtype is not specified, it defaults to the dtype of data.
 
-    exclusive : int, optional
-        If set to 1 will return exclusive sum in which the first element is not
-        included. In other terms, if set to 1, the j-th output element would be
+    exclusive : bool, optional
+        If True, will return exclusive sum in which the first element is not
+        included. In other terms, if True, the j-th output element would be
         the sum of the first (j-1) elements. Otherwise, it would be the sum of
         the first j elements.
 
@@ -1577,6 +1577,61 @@ def cumsum(data, axis=None, dtype=None, exclusive=None):
     return _make.cumsum(data, axis, dtype, exclusive)
 
 
+def cumprod(data, axis=None, dtype=None, exclusive=None):
+    """Numpy style cumprod op. Return the cumulative inclusive product of the elements along
+    a given axis.
+
+    Parameters
+    ----------
+    data : relay.Expr
+        The input data to the operator.
+
+    axis : int, optional
+        Axis along which the cumulative product is computed. The default (None) is to compute
+        the cumprod over the flattened array.
+
+    dtype : string, optional
+        Type of the returned array and of the accumulator in which the elements are multiplied.
+        If dtype is not specified, it defaults to the dtype of data.
+
+    exclusive : bool, optional
+        If True, will return exclusive product in which the first element is not
+        included. In other terms, if True, the j-th output element would be
+        the product of the first (j-1) elements. Otherwise, it would be the product of
+        the first j elements. The product of zero elements will be 1.
+
+    Returns
+    -------
+    result : relay.Expr
+        The result has the same size as data, and the same shape as data if axis is not None.
+        If axis is None, the result is a 1-d array.
+
+    Examples
+    --------
+    .. code-block:: python
+        a = [[1,2,3], [4,5,6]]
+
+        cumprod(a)  # if axis is not provided, cumprod is done over the flattened input.
+        -> [ 1,  2,  6, 24, 120, 720]
+
+        cumprod(a, dtype="float32")
+        -> [  1.,  2.,  6., 24., 120., 720.]
+
+        cumprod(a, axis=0)  # multiply over rows for each of the 3 columns
+        -> [[1, 2, 3],
+            [4, 10, 18]]
+
+        cumprod(a, axis=1)
+        -> [[ 1,  2,  6],
+            [ 4,  20, 120]]
+
+        a = [1, 1, 1, 0, 1, 1, 0]  # a is an integer array with boolean values
+        cumprod(a, dtype="int32")  # dtype should be provided to get the expected results
+        -> [1, 1, 1, 0, 0, 0, 0]
+    """
+    return _make.cumprod(data, axis, dtype, exclusive)
+
+
 def unique(data, is_sorted=True, return_counts=False):
     """
     Find the unique elements of a 1-D tensor. Please note `output` and `counts` are all padded to
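
The exclusive semantics documented above can be phrased directly in NumPy; the new tests later in this patch build their expected values the same way (a sketch for illustration, not code from the patch):

    import numpy as np

    a = np.array([2, 3, 4])
    inclusive = np.cumprod(a)          # [2, 6, 24]
    # Exclusive scan: shift right by one and seed with the identity (1 for a product).
    exclusive = np.roll(inclusive, 1)
    exclusive[0] = 1                   # -> [1, 2, 6]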
diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py
index c196b33..ef2c5c1 100644
--- a/python/tvm/topi/__init__.py
+++ b/python/tvm/topi/__init__.py
@@ -42,7 +42,7 @@ from .sparse_fill_empty_rows import *
 from .sparse_reshape import *
 from .scatter_add import *
 from .argwhere import *
-from .cumsum import *
+from .scan import *
 from .einsum import *
 from .unique import *
 from . import generic
diff --git a/python/tvm/topi/cuda/scan.py b/python/tvm/topi/cuda/scan.py
index 84ab5dc..3240ebc 100644
--- a/python/tvm/topi/cuda/scan.py
+++ b/python/tvm/topi/cuda/scan.py
@@ -16,13 +16,16 @@
 # under the License.
 # pylint: disable=invalid-name, too-many-locals, too-many-statements
 "Scan related operators"
+from typing import Callable, Optional, Union
+
 import tvm
 from tvm import te
-from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust
-from ..transform import expand_dims, squeeze, transpose, reshape
-from ..utils import ceil_div, swap, prod, get_const_int
-from ..math import cast
+from tvm.contrib.thrust import can_use_rocthrust, can_use_thrust
+
 from .. import tag
+from ..math import cast
+from ..transform import expand_dims, reshape, squeeze, transpose
+from ..utils import ceil_div, get_const_int, prod, swap
 from .injective import schedule_injective_from_existing
 
 
@@ -32,7 +35,7 @@ def _get_thrust_func_name(tvmop):
     return tvmop_to_thrust_func_name[tvmop]
 
 
-def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add):
+def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add, identity_value=0):
     """Low level IR to do exclusive sum scan along rows of 2D input.
 
     Parameters
@@ -50,6 +53,11 @@ def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add):
         A binary associative op to use for scan. The function takes two TIR expressions
         and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute
         prefix sum.
+
+    identity_value: int or float
+        A value for the binary operation which provides the identity property. E.g. if * is
+        your operator and i is the identity_value then a * i = a for all a in the domain of
+        your operation.
     """
 
     batch_size = prod(data.shape[:-1])
@@ -134,7 +142,7 @@ def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add):
             with ib.if_scope(bx < batch_size):
                 if reduction is not None:
                     reduction[bx] = output[(bx + 1) * scan_axis_size - 1]
-                output[(bx + 1) * scan_axis_size - 1] = cast(0, out_dtype)
+                output[(bx + 1) * scan_axis_size - 1] = cast(identity_value, out_dtype)
 
         with ib.for_range(0, lim, dtype="int64") as l2_width:
             width = 2 << (lim - l2_width - 1)
@@ -309,7 +317,12 @@ def scan_thrust(
 
 
 def exclusive_scan(
-    data, axis=-1, return_reduction=False, output_dtype=None, binop=tvm.tir.generic.add
+    data,
+    axis=-1,
+    return_reduction=False,
+    output_dtype=None,
+    binop=tvm.tir.generic.add,
+    identity_value=0,
 ):
     """Do exclusive scan on 1D or multidimensional input.
 
@@ -335,6 +348,11 @@ def exclusive_scan(
         and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute
         prefix sum.
 
+    identity_value: int or float
+        A value for the binary operation which provides the identity property. E.g. if * is
+        your operator and i is the identity_value then a * i = a for all a in the domain of
+        your operation.
+
     Returns
     -------
     output : tvm.te.Tensor
@@ -347,9 +365,15 @@ def exclusive_scan(
 
     def do_scan(data, output_dtype):
         target = tvm.target.Target.current()
-        if target and (
-            can_use_thrust(target, "tvm.contrib.thrust.sum_scan")
-            or can_use_rocthrust(target, "tvm.contrib.thrust.sum_scan")
+
+        # TODO: add support for a prod_scan
+        if (
+            target
+            and binop == tvm.tir.generic.add
+            and (
+                can_use_thrust(target, "tvm.contrib.thrust.sum_scan")
+                or can_use_rocthrust(target, "tvm.contrib.thrust.sum_scan")
+            )
         ):
             return scan_thrust(
                 data, output_dtype, exclusive=True, return_reduction=return_reduction, binop=binop
@@ -366,7 +390,9 @@ def exclusive_scan(
             output, reduction = te.extern(
                 [data.shape, data.shape[:-1]],
                 [data],
-                lambda ins, outs: exclusive_scan_ir(ins[0], outs[0], outs[1], binop=binop),
+                lambda ins, outs: exclusive_scan_ir(
+                    ins[0], outs[0], outs[1], binop=binop, identity_value=identity_value
+                ),
                 dtype=[data.dtype, output_dtype],
                 in_buffers=[data_buf],
                 name="exclusive_scan",
@@ -376,7 +402,9 @@ def exclusive_scan(
             output = te.extern(
                 [data.shape],
                 [data],
-                lambda ins, outs: exclusive_scan_ir(ins[0], outs[0], binop=binop),
+                lambda ins, outs: exclusive_scan_ir(
+                    ins[0], outs[0], binop=binop, identity_value=identity_value
+                ),
                 dtype=[output_dtype],
                 in_buffers=[data_buf],
                 out_buffers=[output_buf],
@@ -423,7 +451,7 @@ def exclusive_scan(
     return output
 
 
-def inclusive_scan(data, axis=-1, output_dtype=None, binop=tvm.tir.generic.add):
+def inclusive_scan(data, axis=-1, output_dtype=None, binop=tvm.tir.generic.add, identity_value=0):
     """Do inclusive scan on 1D or multidimensional input.
 
     Parameters
@@ -442,12 +470,19 @@ def inclusive_scan(data, axis=-1, output_dtype=None, binop=tvm.tir.generic.add):
         and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute
         prefix sum.
 
+    identity_value: int or float
+        A value for the binary operation which provides the identity property. E.g. if * is
+        your operator and i is the identity_value then a * i = a for all a in the domain of
+        your operation.
+
     Returns
     -------
     output : tvm.te.Tensor
         A N-D tensor of the same rank N as the input data.
     """
-    ex_scan = exclusive_scan(data, axis, output_dtype=output_dtype, binop=binop)
+    ex_scan = exclusive_scan(
+        data, axis, output_dtype=output_dtype, binop=binop, identity_value=identity_value
+    )
 
     if output_dtype is not None and data.dtype != output_dtype and output_dtype != "":
         data = cast(data, output_dtype)
@@ -486,7 +521,74 @@ def schedule_scan(outs):
     return s
 
 
-def cumsum(data, axis=None, dtype=None, exclusive=None):
+def scanop(
+    data: tvm.te.Tensor,
+    binop: Callable[["tvm.Expr", "tvm.Expr"], "tvm.Expr"],
+    identity_value: Union[float, int],
+    axis: Optional[int] = None,
+    dtype: Optional[str] = None,
+    exclusive: Optional[bool] = None,
+) -> tvm.te.Tensor:
+    """Cumulative binary operator (scan) with similar axis behavior as np.cumsum and np.cumprod.
+
+    See cumprod and cumsum for an example of use.
+
+    E.g. if * is your binary operator and the input tensor is [1, 2, 3, 4] the output may be
+    [1, 1 * 2, 1 * 2 * 3, 1 * 2 * 3 * 4]
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        The input data to the operator.
+
+    binop: Callable (tvm.Expr, tvm.Expr) -> tvm.Expr
+        A binary operator which should be associative and commutative. E.g. if * is your
+        operator then a * (b * c) = (a * b) * c and a * b = b * a
+
+    identity_value: int or float
+        A value for the binary operation which provides the identity property. E.g. if * is
+        your operator and i is the identity_value then a * i = a for all a in the domain of
+        your operation.
+
+    axis : int, optional
+        Axis along which the operation is computed. The default (None) is to compute
+        the cumulative operation over the flattened array.
+
+    dtype : string, optional
+        Type of the returned array and of the accumulator in which the elements are computed.
+        If dtype is not specified, it defaults to the dtype of data.
+
+    exclusive : bool, optional
+        If True, will return exclusive cumulative operation in which the first element is not
+        included. In other terms, if True, the j-th output element would be
+        the cumulative operation of the first (j-1) elements. Otherwise, it would be the
+        cumulative operation of the first j elements.
+
+    Returns
+    -------
+    result : tvm.te.Tensor
+        The result has the same size as data, and the same shape as data if axis is not None.
+        If axis is None, the result is a 1-d array.
+    """
+    if axis is None:
+        axis = 0
+        data = reshape(data, (prod(data.shape),))
+    axis = get_const_int(axis)
+    if exclusive is not None and exclusive:
+        return exclusive_scan(
+            data, axis, output_dtype=dtype, binop=binop, identity_value=identity_value
+        )
+    return inclusive_scan(
+        data, axis, output_dtype=dtype, binop=binop, identity_value=identity_value
+    )
+
+
+def cumsum(
+    data: tvm.te.Tensor,
+    axis: Optional[int] = None,
+    dtype: Optional[str] = None,
+    exclusive: Optional[bool] = None,
+) -> tvm.te.Tensor:
     """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis.
 
     Parameters
@@ -502,9 +604,9 @@ def cumsum(data, axis=None, dtype=None, exclusive=None):
         Type of the returned array and of the accumulator in which the elements are summed.
         If dtype is not specified, it defaults to the dtype of data.
 
-    exclusive : int, optional
-        If set to 1 will return exclusive sum in which the first element is not
-        included. In other terms, if set to 1, the j-th output element would be
+    exclusive : bool, optional
+        If True, will return exclusive sum in which the first element is not
+        included. In other terms, if True, the j-th output element would be
         the sum of the first (j-1) elements. Otherwise, it would be the sum of
         the first j elements.
 
@@ -514,10 +616,54 @@ def cumsum(data, axis=None, dtype=None, exclusive=None):
         The result has the same size as data, and the same shape as data if axis is not None.
         If axis is None, the result is a 1-d array.
     """
-    if axis is None:
-        axis = 0
-        data = reshape(data, (prod(data.shape),))
-    axis = get_const_int(axis)
-    if exclusive is not None and exclusive != 0:
-        return exclusive_scan(data, axis, output_dtype=dtype, binop=tvm.tir.generic.add)
-    return inclusive_scan(data, axis, output_dtype=dtype, binop=tvm.tir.generic.add)
+    return scanop(
+        data=data,
+        binop=tvm.tir.generic.add,
+        identity_value=0,
+        axis=axis,
+        dtype=dtype,
+        exclusive=exclusive,
+    )
+
+
+def cumprod(
+    data: tvm.te.Tensor,
+    axis: Optional[int] = None,
+    dtype: Optional[str] = None,
+    exclusive: Optional[bool] = None,
+):
+    """Numpy style cumprod op. Return the cumulative product of the elements along a given axis.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        The input data to the operator.
+
+    axis : int, optional
+        Axis along which the cumulative product is computed. The default (None) is to compute
+        the cumprod over the flattened array.
+
+    dtype : string, optional
+        Type of the returned array and of the accumulator in which the elements are multiplied.
+        If dtype is not specified, it defaults to the dtype of data.
+
+    exclusive : bool, optional
+        If True, will return exclusive product in which the first element is not
+        included. In other terms, if True, the j-th output element would be
+        the product of the first (j-1) elements. Otherwise, it would be the product of
+        the first j elements.
+
+    Returns
+    -------
+    result : tvm.te.Tensor
+        The result has the same size as data, and the same shape as data if axis is not None.
+        If axis is None, the result is a 1-d array.
+    """
+    return scanop(
+        data=data,
+        binop=tvm.tir.generic.multiply,
+        identity_value=1,
+        axis=axis,
+        dtype=dtype,
+        exclusive=exclusive,
+    )
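
With binop and identity_value threaded through, the same CUDA scan kernels now serve product scans as well. A hedged sketch of how the generalized helper is parameterized (illustrative only; scheduling and building are omitted):

    import tvm
    from tvm import te, topi

    data = te.placeholder((4, 16), name="data", dtype="float32")
    # Exclusive product scan along the last axis, using the new parameters added above.
    out = topi.cuda.exclusive_scan(
        data, axis=-1, binop=tvm.tir.generic.multiply, identity_value=1
    )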
diff --git a/python/tvm/topi/cumsum.py b/python/tvm/topi/cumsum.py
deleted file mode 100644
index 2013a35..0000000
--- a/python/tvm/topi/cumsum.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-name
-"""Cumsum operator"""
-from ..tir import decl_buffer, ir_builder
-from ..te import extern
-from .utils import prod, get_const_int
-from .math import cast
-
-
-def cumsum(data, axis=None, dtype=None, exclusive=None):
-    """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis.
-
-    Parameters
-    ----------
-    data : tvm.te.Tensor
-        The input data to the operator.
-
-    axis : int, optional
-        Axis along which the cumulative sum is computed. The default (None) is to compute
-        the cumsum over the flattened array.
-
-    dtype : string, optional
-        Type of the returned array and of the accumulator in which the elements are summed.
-        If dtype is not specified, it defaults to the dtype of data.
-
-    exclusive : int, optional
-        If set to 1 will return exclusive sum in which the first element is not
-        included. In other terms, if set to 1, the j-th output element would be
-        the sum of the first (j-1) elements. Otherwise, it would be the sum of
-        the first j elements.
-
-    Returns
-    -------
-    result : tvm.te.Tensor
-        The result has the same size as data, and the same shape as data if axis is not None.
-        If axis is None, the result is a 1-d array.
-    """
-    if dtype is None or dtype == "":
-        dtype = data.dtype
-
-    def maybe_cast(x):
-        if dtype != data.dtype:
-            return cast(x, dtype)
-        return x
-
-    axis_mul_before = 1
-    axis_mul_after = 1
-
-    if axis is None:
-        axis = 0
-        cumsum_axis_len = prod(data.shape)
-        shape = (cumsum_axis_len,)
-    else:
-        if not isinstance(axis, int):
-            axis = get_const_int(axis)
-
-        shape = data.shape
-        cumsum_axis_len = shape[axis]
-
-        if axis < 0:
-            axis = len(shape) + axis
-
-        for i, value in enumerate(shape, 0):
-            if i < axis:
-                axis_mul_before *= value
-            elif i > axis:
-                axis_mul_after *= value
-
-    if exclusive is None:
-        exclusive = 0
-
-    def gen_ir(data_buf, out_buf):
-        ib = ir_builder.create()
-        data_buf = ib.buffer_ptr(data_buf)
-        out_buf = ib.buffer_ptr(out_buf)
-
-        with ib.for_range(0, axis_mul_before * axis_mul_after, "fused", kind="parallel") as fused:
-            i = fused // axis_mul_after
-            j = fused % axis_mul_after
-            base_idx = i * cumsum_axis_len * axis_mul_after + j
-            if exclusive == 0:
-                out_buf[base_idx] = maybe_cast(data_buf[base_idx])
-            else:
-                out_buf[base_idx] = cast(0, dtype)
-            with ib.for_range(0, cumsum_axis_len - 1, "_k") as _k:
-                k = _k + 1
-                cur_idx = base_idx + k * axis_mul_after
-                prev_idx = base_idx + (k - 1) * axis_mul_after
-                if exclusive == 0:
-                    out_buf[cur_idx] = out_buf[prev_idx] + maybe_cast(data_buf[cur_idx])
-                else:
-                    out_buf[cur_idx] = out_buf[prev_idx] + maybe_cast(data_buf[prev_idx])
-
-        return ib.get()
-
-    out_buf = decl_buffer(shape, dtype, "out_buf")
-
-    return extern(
-        [shape],
-        [data],
-        lambda ins, outs: gen_ir(ins[0], outs[0]),
-        dtype=dtype,
-        out_buffers=[out_buf],
-        name="cumsum_generic",
-        tag="cumsum_generic",
-    )
diff --git a/python/tvm/topi/scan.py b/python/tvm/topi/scan.py
new file mode 100644
index 0000000..f579673
--- /dev/null
+++ b/python/tvm/topi/scan.py
@@ -0,0 +1,236 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name
+"""Scan (cumulative binary) operators"""
+from typing import Callable, Optional
+
+import tvm
+
+from ..te import extern
+from ..tir import decl_buffer, generic, ir_builder
+from .math import cast
+from .utils import get_const_int, prod
+
+
+def scanop(
+    data: tvm.te.Tensor,
+    binop: Callable[["tvm.Expr", "tvm.Expr"], "tvm.Expr"],
+    identity_value: "tvm.Expr",
+    op_name: str,
+    axis: Optional[int] = None,
+    dtype: Optional[str] = None,
+    exclusive: Optional[bool] = None,
+) -> tvm.te.Tensor:
+    """Cumulative binary operator (scan) with similar axis behavior as np.cumsum and np.cumprod.
+
+    See cumprod and cumsum for an example of use.
+
+    E.g. if * is your binary operator and the input tensor is [1, 2, 3, 4] the output may be
+    [1, 1 * 2, 1 * 2 * 3, 1 * 2 * 3 * 4]
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        The input data to the operator.
+
+    binop: Callable (tvm.Expr, tvm.Expr) -> tvm.Expr
+        A binary operator which should be associative and commutative. E.g. if * is your
+        operator then a * (b * c) = (a * b) * c and a * b = b * a
+
+    identity_value: tvm.Expr
+        A value for the binary operation which provides the identity property. E.g. if * is
+        your operator and i is the identity_value then a * i = a for all a in the domain of
+        your operation.
+
+    axis : int, optional
+        Axis along which the operation is computed. The default (None) is to compute
+        the cumulative operation over the flattened array.
+
+    dtype : string, optional
+        Type of the returned array and of the accumulator in which the elements are computed.
+        If dtype is not specified, it defaults to the dtype of data.
+
+    exclusive : bool, optional
+        If True, will return exclusive cumulative operation in which the first element is not
+        included. In other terms, if True, the j-th output element would be
+        the cumulative operation of the first (j-1) elements. Otherwise, it would be the
+        cumulative operation of the first j elements. The cumulative operation of zero elements
+        is assumed to be the identity_value.
+
+    Returns
+    -------
+    result : tvm.te.Tensor
+        The result has the same size as data, and the same shape as data if axis is not None.
+        If axis is None, the result is a 1-d array.
+    """
+    if dtype is None or dtype == "":
+        dtype = data.dtype
+
+    if exclusive is None:
+        exclusive = False
+
+    def maybe_cast(x):
+        if dtype != data.dtype:
+            return cast(x, dtype)
+        return x
+
+    axis_mul_before = 1
+    axis_mul_after = 1
+
+    if axis is None:
+        axis = 0
+        cumsum_axis_len = prod(data.shape)
+        shape = (cumsum_axis_len,)
+    else:
+        if not isinstance(axis, int):
+            axis = get_const_int(axis)
+
+        shape = data.shape
+        cumsum_axis_len = shape[axis]
+
+        if axis < 0:
+            axis = len(shape) + axis
+
+        for i, value in enumerate(shape, 0):
+            if i < axis:
+                axis_mul_before *= value
+            elif i > axis:
+                axis_mul_after *= value
+
+    def gen_ir(data_buf, out_buf):
+        ib = ir_builder.create()
+        data_buf = ib.buffer_ptr(data_buf)
+        out_buf = ib.buffer_ptr(out_buf)
+
+        with ib.for_range(0, axis_mul_before * axis_mul_after, "fused", kind="parallel") as fused:
+            i = fused // axis_mul_after
+            j = fused % axis_mul_after
+            base_idx = i * cumsum_axis_len * axis_mul_after + j
+            if exclusive:
+                out_buf[base_idx] = cast(identity_value, dtype)
+            else:
+                out_buf[base_idx] = maybe_cast(data_buf[base_idx])
+            with ib.for_range(0, cumsum_axis_len - 1, "_k") as _k:
+                k = _k + 1
+                cur_idx = base_idx + k * axis_mul_after
+                prev_idx = base_idx + (k - 1) * axis_mul_after
+                if exclusive:
+                    out_buf[cur_idx] = binop(out_buf[prev_idx], maybe_cast(data_buf[prev_idx]))
+                else:
+                    out_buf[cur_idx] = binop(out_buf[prev_idx], maybe_cast(data_buf[cur_idx]))
+
+        return ib.get()
+
+    out_buf = decl_buffer(shape, dtype, "out_buf")
+
+    return extern(
+        [shape],
+        [data],
+        lambda ins, outs: gen_ir(ins[0], outs[0]),
+        dtype=dtype,
+        out_buffers=[out_buf],
+        name=op_name,
+        tag=op_name,
+    )
+
+
+def cumsum(
+    data: tvm.te.Tensor,
+    axis: Optional[int] = None,
+    dtype: Optional[str] = None,
+    exclusive: Optional[bool] = None,
+) -> tvm.te.Tensor:
+    """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        The input data to the operator.
+
+    axis : int, optional
+        Axis along which the cumulative sum is computed. The default (None) is to compute
+        the cumsum over the flattened array.
+
+    dtype : string, optional
+        Type of the returned array and of the accumulator in which the elements are summed.
+        If dtype is not specified, it defaults to the dtype of data.
+
+    exclusive : bool, optional
+        If True, will return exclusive sum in which the first element is not
+        included. In other terms, if True, the j-th output element would be
+        the sum of the first (j-1) elements. Otherwise, it would be the sum of
+        the first j elements.
+
+    Returns
+    -------
+    result : tvm.te.Tensor
+        The result has the same size as data, and the same shape as data if axis is not None.
+        If axis is None, the result is a 1-d array.
+    """
+    return scanop(
+        data=data,
+        binop=generic.add,
+        identity_value=0,
+        op_name="cumsum_generic",
+        axis=axis,
+        dtype=dtype,
+        exclusive=exclusive,
+    )
+
+
+def cumprod(
+    data: tvm.te.Tensor,
+    axis: Optional[int] = None,
+    dtype: Optional[str] = None,
+    exclusive: Optional[bool] = None,
+) -> tvm.te.Tensor:
+    """Numpy style cumprod op. Return the cumulative product of the elements along a given axis.
+
+    Parameters
+    ----------
+    data : tvm.te.Tensor
+        The input data to the operator.
+
+    axis : int, optional
+        Axis along which the cumulative product is computed. The default (None) is to compute
+        the cumprod over the flattened array.
+
+    dtype : string, optional
+        Type of the returned array and of the accumulator in which the elements are multiplied.
+        If dtype is not specified, it defaults to the dtype of data.
+
+    exclusive : bool, optional
+        If True, will return exclusive product in which the first element is not
+        included. In other terms, if True, the j-th output element would be
+        the product of the first (j-1) elements. Otherwise, it would be the product of
+        the first j elements.
+
+    Returns
+    -------
+    result : tvm.te.Tensor
+        The result has the same size as data, and the same shape as data if axis is not None.
+        If axis is None, the result is a 1-d array.
+    """
+    return scanop(
+        data=data,
+        binop=generic.multiply,
+        identity_value=1,
+        op_name="cumprod_generic",
+        axis=axis,
+        dtype=dtype,
+        exclusive=exclusive,
+    )
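
The IR emitted by gen_ir above is a sequential scan over each row; a plain-Python model of the recurrence it implements, for reference only (not part of the patch):

    def scan_row(row, binop, identity_value, exclusive):
        # First output: identity for an exclusive scan, else the first input element.
        out = [None] * len(row)
        out[0] = identity_value if exclusive else row[0]
        for k in range(1, len(row)):
            # Exclusive folds in the previous input; inclusive folds in the current one.
            out[k] = binop(out[k - 1], row[k - 1] if exclusive else row[k])
        return out

    mul = lambda a, b: a * b
    assert scan_row([1, 2, 3, 4], mul, 1, exclusive=False) == [1, 2, 6, 24]
    assert scan_row([1, 2, 3, 4], mul, 1, exclusive=True) == [1, 1, 2, 6]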
diff --git a/python/tvm/topi/unique.py b/python/tvm/topi/unique.py
index b4f27b3..e725655 100644
--- a/python/tvm/topi/unique.py
+++ b/python/tvm/topi/unique.py
@@ -18,7 +18,7 @@
 """Unique operator"""
 from tvm import te, tir
 from ..te import hybrid
-from .cumsum import cumsum
+from .scan import cumsum
 from .sort import sort, argsort
 
 
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index b65068b..6fb9f77 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -3772,20 +3772,20 @@ RELAY_REGISTER_OP("adv_index")
     .set_attr<TOpPattern>("TOpPattern", kInjective)
     .set_attr<FTVMCompute>("FTVMCompute", AdvIndexCompute);
 
-TVM_REGISTER_NODE_TYPE(CumsumAttrs);
+TVM_REGISTER_NODE_TYPE(ScanopAttrs);
 
-bool CumsumRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+bool ScanopRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
                const TypeReporter& reporter) {
   // types: [data, output]
   ICHECK_EQ(types.size(), 2) << "Expects two types, one for the input and another for the output";
   const auto* data = types[0].as<TensorTypeNode>();
   if (data == nullptr) {
     ICHECK(types[0].as<IncompleteTypeNode>())
-        << "cumsum: expect input type to be TensorType but get " << types[0];
+        << "Scanop: expect input type to be TensorType but get " << types[0];
     return false;
   }
 
-  const auto* param = attrs.as<CumsumAttrs>();
+  const auto* param = attrs.as<ScanopAttrs>();
 
   auto dtype = param->dtype;
   if (dtype.is_void()) {
@@ -3805,8 +3805,8 @@ bool CumsumRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   return true;
 }
 
-Expr MakeCumsum(Expr data, Integer axis, DataType dtype, Integer exclusive) {
-  auto attrs = make_object<CumsumAttrs>();
+Expr MakeCumsum(Expr data, Integer axis, DataType dtype, Bool exclusive) {
+  auto attrs = make_object<ScanopAttrs>();
   attrs->dtype = dtype;
   attrs->axis = axis;
   attrs->exclusive = exclusive;
@@ -3822,7 +3822,27 @@ RELAY_REGISTER_OP("cumsum")
     .set_num_inputs(1)
     .add_argument("data", "Tensor", "The input tensor.")
     .set_support_level(3)
-    .add_type_rel("Cumsum", CumsumRel)
+    .add_type_rel("Cumsum", ScanopRel)
+    .set_attr<TOpPattern>("TOpPattern", kOpaque);
+
+Expr MakeCumprod(Expr data, Integer axis, DataType dtype, Bool exclusive) {
+  auto attrs = make_object<ScanopAttrs>();
+  attrs->dtype = dtype;
+  attrs->axis = axis;
+  attrs->exclusive = exclusive;
+  static const Op& op = Op::Get("cumprod");
+  return Call(op, {data}, Attrs(attrs), {});
+}
+
+TVM_REGISTER_GLOBAL("relay.op._make.cumprod").set_body_typed(MakeCumprod);
+
+RELAY_REGISTER_OP("cumprod")
+    .describe(
+        R"doc(Return the cumulative product of the elements along a given axis.)doc" TVM_ADD_FILELINE)
+    .set_num_inputs(1)
+    .add_argument("data", "Tensor", "The input tensor.")
+    .set_support_level(3)
+    .add_type_rel("Cumprod", ScanopRel)
     .set_attr<TOpPattern>("TOpPattern", kOpaque);
 
 TVM_REGISTER_NODE_TYPE(UniqueAttrs);
diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py
index d2a5090..7e443aa 100644
--- a/tests/python/relay/test_op_level3.py
+++ b/tests/python/relay/test_op_level3.py
@@ -16,16 +16,16 @@
 # under the License.
 """ Support level3 operator test cases.
 """
+from typing import Callable, Optional
+
 import numpy as np
 import pytest
 import tvm
-from tvm import te
-from tvm import relay
+import tvm.testing
+from tvm import relay, te
 from tvm.error import TVMError
 from tvm.relay import create_executor, transform
 from tvm.relay.testing import check_grad, run_infer_type
-from typing import Optional
-import tvm.testing
 
 
 def test_zeros_ones():
@@ -1758,12 +1758,27 @@ def test_adv_index():
     verify_adv_index((10, 5, 15), [(1, 2, 1), (1, 2, 7)])
 
 
-@tvm.testing.parametrize_targets
-def test_cumsum(target, ctx):
-    def verify_cumsum(data_np, np_out, axis=None, out_dtype=None, rtol=1e-5, atol=1e-5):
+# Helper for testing binop functions
+scanops_supported = {"cumsum": relay.op.cumsum, "cumprod": relay.op.cumprod}
+
+
+def run_binop_tests(
+    target, ctx, binop_type: str, gt_func: Callable[..., np.ndarray], identity_value: int
+):
+    def assert_relay_scanop(
+        data_np: np.ndarray,
+        np_out: np.ndarray,
+        axis: Optional[int] = None,
+        out_dtype: Optional[str] = None,
+        rtol: float = 1e-5,
+        atol: float = 1e-5,
+        exclusive: bool = False,
+    ):
         inp = relay.var("data", relay.TensorType(data_np.shape, str(data_np.dtype)))
 
-        out = relay.op.cumsum(inp, axis, out_dtype)
+        if binop_type not in scanops_supported.keys():
+            raise ValueError(f"Unknown function {binop_type}. Options: {scanops_supported.keys()}")
+        out = scanops_supported[binop_type](inp, axis, out_dtype, exclusive=exclusive)
         func = relay.Function([inp], out)
 
         for kind in ["graph", "debug"]:
@@ -1772,24 +1787,48 @@ def test_cumsum(target, ctx):
             tvm.testing.assert_allclose(op_res.asnumpy(), np_out, rtol=rtol, atol=atol)
 
     data = np.array([2, 3, 0])
-    verify_cumsum(data, np.cumsum(data))
-    verify_cumsum(data, np.cumsum(data), out_dtype="int64")
+    assert_relay_scanop(data, gt_func(data))
+    assert_relay_scanop(data, gt_func(data), out_dtype="int64")
 
     data = np.random.randn(10, 10)
-    verify_cumsum(data, np.cumsum(data))
-    verify_cumsum(data, np.cumsum(data, axis=0), axis=0)
-    verify_cumsum(data, np.cumsum(data, axis=1), axis=1)
+    assert_relay_scanop(data, gt_func(data))
+    assert_relay_scanop(data, gt_func(data, axis=0), axis=0)
+    assert_relay_scanop(data, gt_func(data, axis=1), axis=1)
 
     data = np.random.randn(10, 5, 10).astype("float32")
-    verify_cumsum(data, np.cumsum(data), rtol=1e-4, atol=1e-4)
-    verify_cumsum(data, np.cumsum(data, axis=0), axis=0, rtol=1e-4, atol=1e-4)
-    verify_cumsum(data, np.cumsum(data, axis=1), axis=1, rtol=1e-4, atol=1e-4)
-    verify_cumsum(data, np.cumsum(data, axis=-1), axis=-1, rtol=1e-4, atol=1e-4)
+    assert_relay_scanop(data, gt_func(data), rtol=1e-4, atol=1e-4)
+    assert_relay_scanop(data, gt_func(data, axis=0), axis=0, rtol=1e-4, atol=1e-4)
+    assert_relay_scanop(data, gt_func(data, axis=1), axis=1, rtol=1e-4, atol=1e-4)
+    assert_relay_scanop(data, gt_func(data, axis=-1), axis=-1, rtol=1e-4, atol=1e-4)
 
     data = np.random.rand(10) > 0.5
     data = data.astype(np.int32)
-    verify_cumsum(data, np.cumsum(data, dtype=np.int32))
-    verify_cumsum(data, np.cumsum(data, dtype="int64"), out_dtype="int64")
+    assert_relay_scanop(data, gt_func(data, dtype=np.int32))
+    assert_relay_scanop(data, gt_func(data, dtype="int64"), out_dtype="int64")
+
+    # Test exclusivity operations
+    data = np.random.randint(-100, 100, size=(10, 10)).astype("int64")
+    expected_result = np.roll(gt_func(data), 1)
+    expected_result[0] = identity_value
+    assert_relay_scanop(data, expected_result, exclusive=True)
+
+    expected_result = np.roll(gt_func(data, axis=0), 1, axis=0)
+    expected_result[0, :] = identity_value
+    assert_relay_scanop(data, expected_result, exclusive=True, axis=0)
+
+    expected_result = np.roll(gt_func(data, axis=1), 1, axis=1)
+    expected_result[:, 0] = identity_value
+    assert_relay_scanop(data, expected_result, exclusive=True, axis=1)
+
+
+@tvm.testing.parametrize_targets
+def test_cumsum(target, ctx):
+    run_binop_tests(target, ctx, binop_type="cumsum", gt_func=np.cumsum, identity_value=0)
+
+
+@tvm.testing.parametrize_targets
+def test_cumprod(target, ctx):
+    run_binop_tests(target, ctx, binop_type="cumprod", gt_func=np.cumprod, identity_value=1)
 
 
 @tvm.testing.parametrize_targets
diff --git a/tests/python/topi/python/test_topi_cumsum.py b/tests/python/topi/python/test_topi_cumsum.py
deleted file mode 100644
index cfe5130..0000000
--- a/tests/python/topi/python/test_topi_cumsum.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import numpy as np
-import tvm
-import tvm.testing
-from tvm import topi
-import tvm.topi.testing
-
-
-@tvm.testing.parametrize_targets
-def test_cumsum(ctx, target):
-    def check_cumsum(np_ref, data, axis=None, dtype=None):
-        implementations = {
-            "generic": (lambda x: topi.cumsum(x, axis, dtype), topi.generic.schedule_extern),
-            "cuda": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan),
-            "nvptx": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan),
-            "vulkan": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan),
-            "metal": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan),
-        }
-        fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations)
-        tvm.topi.testing.compare_numpy_tvm([data], np_ref, target, ctx, fcompute, fschedule)
-
-    data = np.array([2, 3, 0])
-    check_cumsum(np.cumsum(data), data)
-
-    data = np.random.rand(10) > 0.5
-    data = data.astype(np.int32)
-    check_cumsum(np.cumsum(data, dtype=np.int32), data)
-    check_cumsum(np.cumsum(data), data, dtype="int64")
-
-    data = np.random.rand(10) > 0.5
-    check_cumsum(np.cumsum(data, dtype=np.int32), data, dtype="int32")
-
-    for in_dtype in ["float32", "float64"]:
-        if target == "metal" and in_dtype == "float64":
-            # float64 is not supported in metal
-            continue
-        data = np.random.randn(10, 10).astype(in_dtype)
-        check_cumsum(np.cumsum(data), data)
-        check_cumsum(np.cumsum(data, axis=0), data, axis=0)
-        check_cumsum(np.cumsum(data, axis=1), data, axis=1)
-
-        data = np.random.randn(10, 5, 10).astype(in_dtype)
-        check_cumsum(np.cumsum(data), data)
-        check_cumsum(np.cumsum(data, axis=0), data, axis=0)
-        check_cumsum(np.cumsum(data, axis=1), data, axis=1)
-        check_cumsum(np.cumsum(data, axis=-1), data, axis=-1)
-
-    for in_dtype in ["int32", "int64"]:
-        data = np.random.randint(-100, 100, size=(100, 100)).astype(in_dtype)
-        check_cumsum(np.cumsum(data, dtype=in_dtype), data)
-        check_cumsum(np.cumsum(data), data, dtype="int64")
-        check_cumsum(np.cumsum(data, axis=0, dtype=in_dtype), data, axis=0)
-        check_cumsum(np.cumsum(data, axis=1, dtype=in_dtype), data, axis=1)
-
-        data = np.random.randint(1 << 30, (1 << 31) - 1, size=(100)).astype(in_dtype)
-        check_cumsum(np.cumsum(data), data, dtype="int64")
-
-
-if __name__ == "__main__":
-    test_cumsum(tvm.context("cpu"), tvm.target.Target("llvm"))
-    test_cumsum(tvm.context("cuda"), tvm.target.Target("cuda"))
-    test_cumsum(tvm.context("nvptx"), tvm.target.Target("nvptx"))
-    test_cumsum(tvm.context("vulkan"), tvm.target.Target("vulkan"))
-    test_cumsum(tvm.context("metal"), tvm.target.Target("metal"))
diff --git a/tests/python/topi/python/test_topi_scan.py b/tests/python/topi/python/test_topi_scan.py
new file mode 100644
index 0000000..020fde5
--- /dev/null
+++ b/tests/python/topi/python/test_topi_scan.py
@@ -0,0 +1,144 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+from typing import Callable
+
+import numpy as np
+import tvm
+import tvm.testing
+import tvm.topi.testing
+from tvm import topi
+
+topi_funcs = {
+    "cumsum": {"generic": topi.cumsum, "cuda": topi.cuda.cumsum},
+    "cumprod": {"generic": topi.cumprod, "cuda": topi.cuda.cumprod},
+}
+
+identity_value = {"cumsum": 0, "cumprod": 1}
+
+
+def get_implementations(name, axis, dtype, exclusive):
+    topi_func_generic = topi_funcs[name]["generic"]
+    topi_func_cuda = topi_funcs[name]["cuda"]
+
+    return {
+        "generic": (
+            lambda x: topi_func_generic(x, axis, dtype, exclusive=exclusive),
+            topi.generic.schedule_extern,
+        ),
+        "cuda": (
+            lambda x: topi_func_cuda(x, axis, dtype, exclusive=exclusive),
+            topi.cuda.schedule_scan,
+        ),
+        "nvptx": (
+            lambda x: topi_func_cuda(x, axis, dtype, exclusive=exclusive),
+            topi.cuda.schedule_scan,
+        ),
+        "vulkan": (
+            lambda x: topi_func_cuda(x, axis, dtype, exclusive=exclusive),
+            topi.cuda.schedule_scan,
+        ),
+        "metal": (
+            lambda x: topi_func_cuda(x, axis, dtype, exclusive=exclusive),
+            topi.cuda.schedule_scan,
+        ),
+    }
+
+
+def _run_tests(
+    ctx,
+    target,
+    op_name: str = "cumsum",
+    gt_func: Callable[..., np.ndarray] = np.cumsum,
+):
+    def check_scan(np_ref, data, axis=None, dtype=None, exclusive=False):
+        implementations = get_implementations(op_name, axis, dtype, exclusive)
+        fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations)
+        tvm.topi.testing.compare_numpy_tvm([data], np_ref, target, ctx, fcompute, fschedule)
+
+    data = np.array([2, 3, 0])
+    check_scan(gt_func(data), data)
+
+    data = np.random.rand(10) > 0.5
+    data = data.astype(np.int32)
+    check_scan(gt_func(data, dtype=np.int32), data)
+    check_scan(gt_func(data), data, dtype="int64")
+
+    data = np.random.rand(10) > 0.5
+    check_scan(gt_func(data, dtype=np.int32), data, dtype="int32")
+
+    for in_dtype in ["float32", "float64"]:
+        if target == "metal" and in_dtype == "float64":
+            # float64 is not supported in metal
+            continue
+        data = np.random.randn(10, 10).astype(in_dtype)
+        check_scan(gt_func(data), data)
+        check_scan(gt_func(data, axis=0), data, axis=0)
+        check_scan(gt_func(data, axis=1), data, axis=1)
+
+        data = np.random.randn(10, 5, 10).astype(in_dtype)
+        check_scan(gt_func(data), data)
+        check_scan(gt_func(data, axis=0), data, axis=0)
+        check_scan(gt_func(data, axis=1), data, axis=1)
+        check_scan(gt_func(data, axis=-1), data, axis=-1)
+
+    for in_dtype in ["int32", "int64"]:
+        data = np.random.randint(-100, 100, size=(100, 100)).astype(in_dtype)
+        check_scan(gt_func(data, dtype=in_dtype), data)
+        check_scan(gt_func(data), data, dtype="int64")
+        check_scan(gt_func(data, axis=0, dtype=in_dtype), data, axis=0)
+        check_scan(gt_func(data, axis=1, dtype=in_dtype), data, axis=1)
+
+        data = np.random.randint(1 << 30, (1 << 31) - 1, size=(100)).astype(in_dtype)
+        check_scan(gt_func(data), data, dtype="int64")
+
+    data = np.random.randint(-100, 100, size=(100, 100)).astype("int64")
+
+    expected_result = np.roll(gt_func(data), 1)
+    expected_result[0] = identity_value[op_name]
+    check_scan(expected_result, data, dtype="int64", exclusive=True)
+
+    expected_result = np.roll(gt_func(data, axis=0, dtype="int64"), 1, axis=0)
+    expected_result[0, :] = identity_value[op_name]
+    check_scan(expected_result, data, axis=0, exclusive=True)
+
+    expected_result = np.roll(gt_func(data, axis=1, dtype="int64"), 1, axis=1)
+    expected_result[:, 0] = identity_value[op_name]
+    check_scan(expected_result, data, axis=1, exclusive=True)
+
+
+@tvm.testing.parametrize_targets
+def test_cumsum(ctx, target):
+    _run_tests(ctx, target, op_name="cumsum", gt_func=np.cumsum)
+
+
+@tvm.testing.parametrize_targets
+def test_cumprod(ctx, target):
+    _run_tests(ctx, target, op_name="cumprod", gt_func=np.cumprod)
+
+
+if __name__ == "__main__":
+    test_cumsum(tvm.context("cpu"), tvm.target.Target("llvm"))
+    test_cumsum(tvm.context("cuda"), tvm.target.Target("cuda"))
+    test_cumsum(tvm.context("nvptx"), tvm.target.Target("nvptx"))
+    test_cumsum(tvm.context("vulkan"), tvm.target.Target("vulkan"))
+    test_cumsum(tvm.context("metal"), tvm.target.Target("metal"))
+
+    test_cumprod(tvm.context("cpu"), tvm.target.Target("llvm"))
+    test_cumprod(tvm.context("cuda"), tvm.target.Target("cuda"))
+    test_cumprod(tvm.context("nvptx"), tvm.target.Target("nvptx"))
+    test_cumprod(tvm.context("vulkan"), tvm.target.Target("vulkan"))
+    test_cumprod(tvm.context("metal"), tvm.target.Target("metal"))