tcgen05 submodule#

class cutlass.cute.nvgpu.tcgen05.Repetition(value)#

Bases: Enum

An enumeration for the number of repetitions of a given TMEM copy within the instruction.

x1 = 1#
x2 = 2#
x4 = 4#
x8 = 8#
x16 = 16#
x32 = 32#
x64 = 64#
x128 = 128#
class cutlass.cute.nvgpu.tcgen05.Pack(value)#

Bases: Enum

An enumeration for the possible packing patterns for TMEM to RMEM copies.

NONE = 1#
PACK_16b_IN_32b = 2#
class cutlass.cute.nvgpu.tcgen05.Unpack(value)#

Bases: Enum

An enumeration for the possible unpacking patterns for RMEM to TMEM copies.

NONE = 1#
UNPACK_32b_IN_16b = 2#
class cutlass.cute.nvgpu.tcgen05.Ld16x64bOp(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
)#

Bases: _LdBase

16x64b TMEM load Operation.

See the PTX documentation. This Operation corresponds to the .16x64b qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
) None#
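
A load Operation can be wrapped in a Copy Atom for retiling TMEM data into registers. A minimal sketch, assuming 32-bit elements and the cute.make_copy_atom helper; the repetition count and element type are illustrative:

    import cutlass
    import cutlass.cute as cute
    from cutlass.cute.nvgpu import tcgen05

    # 16 data paths x 64 bits, repeated 32 times, no packing.
    op = tcgen05.Ld16x64bOp(tcgen05.Repetition.x32, tcgen05.Pack.NONE)
    # Wrap the Operation in a Copy Atom; Float32 is an illustrative element type.
    ld_atom = cute.make_copy_atom(op, cutlass.Float32)
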
class cutlass.cute.nvgpu.tcgen05.Ld16x128bOp(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
)#

Bases: _LdBase

16x128b TMEM load Operation.

See the PTX documentation. This Operation corresponds to the .16x128b qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.Ld16x256bOp(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
)#

Bases: _LdBase

16x256b TMEM load Operation.

See the PTX documentation. This Operation corresponds to the .16x256b qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.Ld16x32bx2Op(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
)#

Bases: _LdBase

16x32bx2 TMEM load Operation.

See the PTX documentation. This Operation corresponds to the .16x32bx2 qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.Ld32x32bOp(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
)#

Bases: _LdBase

32x32b TMEM load Operation.

See the PTX documentation. This Operation corresponds to the .32x32b qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.St16x64bOp(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
)#

Bases: _StBase

16x64b TMEM store Operation.

See the PTX documentation. This Operation corresponds to the .16x64b qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
) None#
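
Unlike the load Operations, the store Operations have no default repetition count. A minimal sketch, assuming the same imports as in the load example above:

    # repeat is required for stores; unpack defaults to Unpack.NONE.
    st_op = tcgen05.St16x64bOp(tcgen05.Repetition.x16)
    st_atom = cute.make_copy_atom(st_op, cutlass.Float32)
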
class cutlass.cute.nvgpu.tcgen05.St16x128bOp(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
)#

Bases: _StBase

16x128b TMEM store Operation.

See the PTX documentation. This Operation corresponds to the .16x128b qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.St16x256bOp(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
)#

Bases: _StBase

16x256b TMEM store Operation.

See the PTX documentation. This Operation corresponds to the .16x256b qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.St16x32bx2Op(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
)#

Bases: _StBase

16x32bx2 TMEM store Operation.

See the PTX documentation. This Operation corresponds to the .16x32bx2 qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.St32x32bOp(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
)#

Bases: _StBase

32x32b TMEM store Operation.

See the PTX documentation. This Operation corresponds to the .32x32b qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.OperandMajorMode(value)#

Bases: Enum

An enumeration for the majorness of the input operands of the MMA.

class cutlass.cute.nvgpu.tcgen05.OperandSource(value)#

Bases: Enum

An enumeration for the source memory location of the A input operand of the MMA.

class cutlass.cute.nvgpu.tcgen05.CtaGroup(value)#

Bases: Enum

An enumeration for the cta_group qualifier of the MMA.

ONE = 1#
TWO = 2#
class cutlass.cute.nvgpu.tcgen05.Field(value)#

Bases: Enum

An enumeration for the fields of the MMA Atom that can be modified at runtime.

NEGATE_A = 'neg_a'#
NEGATE_B = 'neg_b'#
ACCUMULATE = 'accum_c'#
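
These fields can be modified on a constructed MMA Atom at runtime, for example to overwrite rather than accumulate into C on the first MMA of a tile. A minimal sketch, assuming a hypothetical tiled_mma built from one of the MMA Operations below and a set method as used in the CUTLASS DSL examples:

    # Hypothetical tiled_mma; disable accumulation for the first MMA of a tile.
    tiled_mma.set(tcgen05.Field.ACCUMULATE, False)
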
class cutlass.cute.nvgpu.tcgen05.MmaTF32Op(
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
)#

Bases: MmaOp

TF32 tcgen05 MMA Operation.

See the PTX documentation. This Operation corresponds to the .kind::tf32 qualifier.

descriptive_name = 'tcgen05 TF32 MMA Operation'#
__init__(
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
) None#
class cutlass.cute.nvgpu.tcgen05.MmaF16BF16Op(
ab_dtype: Type[cutlass.cute.typing.Numeric],
acc_dtype: Type[cutlass.cute.typing.Numeric],
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
)#

Bases: MmaOp

F16/BF16 tcgen05 MMA Operation.

See the PTX documentation. This Operation corresponds to the .kind::f16 qualifier.

descriptive_name = 'tcgen05 F16/BF16 MMA Operation'#
__init__(
ab_dtype: Type[cutlass.cute.typing.Numeric],
acc_dtype: Type[cutlass.cute.typing.Numeric],
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
) None#
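
A minimal construction sketch; the instruction shape and operand choices are illustrative, and cute.make_mma_atom is assumed to wrap the Operation into an MMA Atom:

    op = tcgen05.MmaF16BF16Op(
        cutlass.Float16,             # ab_dtype
        cutlass.Float32,             # acc_dtype
        (128, 256, 16),              # instruction_shape (M, N, K), illustrative
        tcgen05.CtaGroup.ONE,
        tcgen05.OperandSource.SMEM,  # A is read from SMEM
        tcgen05.OperandMajorMode.K,  # A is K-major
        tcgen05.OperandMajorMode.K,  # B is K-major
    )
    mma_atom = cute.make_mma_atom(op)
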
class cutlass.cute.nvgpu.tcgen05.MmaI8Op(
ab_dtype: Type[cutlass.cute.typing.Numeric],
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
)#

Bases: MmaOp

I8 tcgen05 MMA Operation.

See the PTX documentation. This Operation corresponds to the .kind::i8 qualifier.

descriptive_name = 'tcgen05 I8 MMA Operation'#
__init__(
ab_dtype: Type[cutlass.cute.typing.Numeric],
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
) None#
class cutlass.cute.nvgpu.tcgen05.MmaFP8Op(
ab_dtype: Type[cutlass.cute.typing.Numeric],
acc_dtype: Type[cutlass.cute.typing.Numeric],
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
)#

Bases: MmaOp

F8 tcgen05 MMA Operation.

See the PTX documentation. This Operation corresponds to the .kind::f8f6f4 qualifier.

descriptive_name = 'tcgen05 F8 MMA Operation'#
__init__(
ab_dtype: Type[cutlass.cute.typing.Numeric],
acc_dtype: Type[cutlass.cute.typing.Numeric],
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
) None#
class cutlass.cute.nvgpu.tcgen05.SmemLayoutAtomKind(value)#

Bases: Enum

Enum class for the kinds of SMEM layout atoms for SM100.

Given a swizzle kind, an SMEM layout atom is the smallest compact layout that can be used, via blocked product, to construct an SMEM layout for operand A or B such that the resulting layout is legal for both TMA and UMMA.

Note that there are other ways of creating legal layouts for operands A and B.

MN_INTER = 1#
MN_SW32 = 2#
MN_SW64 = 3#
MN_SW128 = 4#
MN_SW128_32B = 5#
K_INTER = 6#
K_SW32 = 7#
K_SW64 = 8#
K_SW128 = 9#
cutlass.cute.nvgpu.tcgen05.make_smem_layout_atom(
kind: SmemLayoutAtomKind,
element_type: Type[cutlass.cute.typing.Numeric],
*,
loc=None,
ip=None,
) core.ComposedLayout#

Makes an SMEM layout Atom.

This function creates a composed layout in units of elements consistent with the requested layout Atom kind and element data type.

Parameters:
  • kind (SmemLayoutAtomKind) – The kind of layout Atom

  • element_type (Type[Numeric]) – The element data type to construct the layout for

Returns:

The SMEM layout atom

Return type:

core.ComposedLayout
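
A minimal sketch, assuming 16-bit operands with a 128B swizzle in the K mode:

    sA_atom = tcgen05.make_smem_layout_atom(
        tcgen05.SmemLayoutAtomKind.K_SW128,
        cutlass.Float16,
    )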

cutlass.cute.nvgpu.tcgen05.tile_to_mma_shape(
atom,
mma_tile_shape: cutlass.cute.typing.Shape,
order: cutlass.cute.typing.IntTuple | None = None,
*,
loc=None,
ip=None,
)#

Tiles a layout Atom to the given MMA tile shape, optionally reordering modes with order.
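
Continuing the sketch above, the layout Atom can be tiled to cover a per-CTA MMA tile; the (M, K) shape below is illustrative:

    sA_layout = tcgen05.tile_to_mma_shape(sA_atom, (128, 64))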

cutlass.cute.nvgpu.tcgen05.commit(
mbar_ptr: cutlass.cute.typing.Pointer,
mask=None,
cta_group: ~cutlass.cute.nvgpu.tcgen05.mma.CtaGroup = <CtaGroup.ONE>,
*,
loc=None,
ip=None,
) None#

Performs an arrive operation on an mbarrier upon completion of the previously issued MMA operations.

Parameters:
  • mbar_ptr (Pointer) – A pointer to the mbarrier in SMEM

  • mask (Int) – An optional multicast mask for the CTAs in the cluster to signal arrival to
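
A minimal sketch, assuming mbar_ptr is a hypothetical pointer to an mbarrier in SMEM:

    # Arrive on the mbarrier once all previously issued MMAs have completed.
    tcgen05.commit(mbar_ptr)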

cutlass.cute.nvgpu.tcgen05.is_tmem_load(atom: CopyAtom) bool#

Returns whether a CopyAtom instance is a TMEM load.

cutlass.cute.nvgpu.tcgen05.is_tmem_store(atom: CopyAtom) bool#

Returns whether a CopyAtom instance is a TMEM store.

cutlass.cute.nvgpu.tcgen05.get_tmem_copy_properties(
atom: CopyAtom,
) Tuple[int, int, int, Pack | Unpack]#

Returns the properties of a TMEM copy atom (number of data paths, bits, repetitions, and whether packing/unpacking is used).
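
These helpers make it possible to branch on the Atom kind and query its shape parameters. A minimal sketch, assuming the ld_atom from the load example above:

    if tcgen05.is_tmem_load(ld_atom):
        dp, bits, rep, pack = tcgen05.get_tmem_copy_properties(ld_atom)
        # e.g. 16 data paths, 64 bits, 32 repetitions, Pack.NONE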

cutlass.cute.nvgpu.tcgen05.find_tmem_tensor_col_offset(
tmem_tensor: cutlass.cute.typing.Tensor,
*,
loc=None,
ip=None,
) cutlass.cute.typing.Int#

Computes the TMEM column offset given a TMEM tensor.

Parameters:

tmem_tensor (Tensor) – The TMEM tensor to use to compute the column offset

Returns:

The column offset

Return type:

Int
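
A minimal sketch, assuming tmem_tensor is a hypothetical tensor already allocated in TMEM; the resulting offset can be used to place a second tensor past the first one:

    col_offset = tcgen05.find_tmem_tensor_col_offset(tmem_tensor)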

cutlass.cute.nvgpu.tcgen05.make_tmem_copy(
atom: CopyAtom,
tmem_tensor: cutlass.cute.typing.Tensor,
*,
loc=None,
ip=None,
) TiledCopy#

Makes a Tiled Copy instance from a TMEM Copy Atom and a TMEM tensor.
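
A minimal sketch, assuming the ld_atom and tmem_tensor from the examples above and a hypothetical thread index tidx; the resulting TiledCopy can be sliced per thread as with other CuTe copies:

    tiled_copy = tcgen05.make_tmem_copy(ld_atom, tmem_tensor)
    thr_copy = tiled_copy.get_slice(tidx)  # per-thread slice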