cuda.tile.cumsum

cuda.tile.cumsum(
x,
/,
axis=0,
*,
reverse=False,
rounding_mode=None,
flush_to_zero=False,
)

Computes the cumulative sum (inclusive prefix sum) of the elements of the input tile x along the given axis.

Parameters:
  • x (Tile) – input tile

  • axis (const int) – the axis along which the scan is performed; default is 0.

  • reverse (const bool) – if True, the scan is performed in the reverse direction, accumulating from the last element along the axis toward the first; default is False.

  • rounding_mode (RoundingMode) – The rounding mode for the operation. Only supported for floating-point types; the default is RoundingMode.RN when applicable.

  • flush_to_zero (const bool) – If True, flushes subnormal inputs and results to sign-preserving zero, default is False.

Return type:

Tile

Examples

Scan along axis 1.

tx = ct.arange(8, dtype=ct.int32).reshape((2, 4))
print("input:", tx)
print("cumsum:", ct.cumsum(tx, 1))

Full runnable example:

import cuda.tile as ct
import torch

# Example kernel: builds a small tile and prints its cumulative sum along axis 1.
@ct.kernel
def kernel():
    # 2x4 int32 tile holding the values 0..7 in row-major order.
    tx = ct.arange(8, dtype=ct.int32).reshape((2, 4))
    print("input:", tx)
    # Inclusive running sum within each row (axis 1).
    print("cumsum:", ct.cumsum(tx, 1))


# Initialize PyTorch's CUDA state before asking for a stream.
torch.cuda.init()
# Launch the kernel on the current CUDA stream. The (1,) argument is presumably
# the launch grid (a single block) and () the empty tuple of kernel arguments
# -- confirm against the ct.launch reference.
ct.launch(torch.cuda.current_stream(), (1,), kernel, ())
# Wait for the device to finish so the kernel's printed output is produced
# before the script continues.
torch.cuda.synchronize()

Output

input: [[0, 1, 2, 3], [4, 5, 6, 7]]
cumsum: [[0, 1, 3, 6], [4, 9, 15, 22]]

Reverse scan.

tx = ct.arange(8, dtype=ct.int32).reshape((2, 4))
print("input:", tx)
print("reverse:", ct.cumsum(tx, 1, reverse=True))

Full runnable example:

import cuda.tile as ct
import torch

# Example kernel: builds a small tile and prints its reverse cumulative sum along axis 1.
@ct.kernel
def kernel():
    # 2x4 int32 tile holding the values 0..7 in row-major order.
    tx = ct.arange(8, dtype=ct.int32).reshape((2, 4))
    print("input:", tx)
    # With reverse=True each row is accumulated from its last element toward
    # its first, so position i holds the sum of elements i..end of the row.
    print("reverse:", ct.cumsum(tx, 1, reverse=True))


# Initialize PyTorch's CUDA state before asking for a stream.
torch.cuda.init()
# Launch the kernel on the current CUDA stream. The (1,) argument is presumably
# the launch grid (a single block) and () the empty tuple of kernel arguments
# -- confirm against the ct.launch reference.
ct.launch(torch.cuda.current_stream(), (1,), kernel, ())
# Wait for the device to finish so the kernel's printed output is produced
# before the script continues.
torch.cuda.synchronize()

Output

input: [[0, 1, 2, 3], [4, 5, 6, 7]]
reverse: [[6, 6, 5, 3], [22, 18, 13, 7]]