tcgen05 submodule#

class cutlass.cute.nvgpu.tcgen05.Repetition(value)#

Bases: Enum

An enumeration for the number of repetitions of a given TMEM copy within the instruction.

x1 = 1#
x2 = 2#
x4 = 4#
x8 = 8#
x16 = 16#
x32 = 32#
x64 = 64#
x128 = 128#
class cutlass.cute.nvgpu.tcgen05.TmemLoadRedOp(value)#

Bases: Enum

An enumeration for the possible reduce operations for TMEM load operations.

class cutlass.cute.nvgpu.tcgen05.Pack(value)#

Bases: Enum

An enumeration for the possible packing patterns for TMEM to RMEM copies.

NONE = 1#
PACK_16b_IN_32b = 2#
class cutlass.cute.nvgpu.tcgen05.Unpack(value)#

Bases: Enum

An enumeration for the possible unpacking patterns for RMEM to TMEM copies.

NONE = 1#
UNPACK_32b_IN_16b = 2#
class cutlass.cute.nvgpu.tcgen05.Ld16x64bOp(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
)#

Bases: _LdBase

16x64b TMEM load Operation.

See the PTX documentation. This Operation corresponds to the .16x64b qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.Ld16x128bOp(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
)#

Bases: _LdBase

16x128b TMEM load Operation.

See the PTX documentation. This Operation corresponds to the .16x128b qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.Ld16x256bOp(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
)#

Bases: _LdBase

16x256b TMEM load Operation.

See the PTX documentation. This Operation corresponds to the .16x256b qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.Ld16x32bx2Op(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
)#

Bases: _LdBase

16x32bx2 TMEM load Operation.

See the PTX documentation. This Operation corresponds to the .16x32bx2 qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.Ld32x32bOp(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
)#

Bases: _LdBase

32x32b TMEM load Operation.

See the PTX documentation. This Operation corresponds to the .32x32b qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition = <Repetition.x1>,
pack: ~cutlass.cute.nvgpu.tcgen05.copy.Pack = <Pack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.St16x64bOp(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
)#

Bases: _StBase

16x64b TMEM store Operation.

See the PTX documentation. This Operation corresponds to the .16x64b qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.St16x128bOp(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
)#

Bases: _StBase

16x128b TMEM store Operation.

See the PTX documentation. This Operation corresponds to the .16x128b qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.St16x256bOp(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
)#

Bases: _StBase

16x256b TMEM store Operation.

See the PTX documentation. This Operation corresponds to the .16x256b qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.St16x32bx2Op(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
)#

Bases: _StBase

16x32bx2 TMEM store Operation.

See the PTX documentation. This Operation corresponds to the .16x32bx2 qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.St32x32bOp(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
)#

Bases: _StBase

32x32b TMEM store Operation.

See the PTX documentation. This Operation corresponds to the .32x32b qualifier.

__init__(
repeat: ~cutlass.cute.nvgpu.tcgen05.copy.Repetition,
unpack: ~cutlass.cute.nvgpu.tcgen05.copy.Unpack = <Unpack.NONE>,
) None#
class cutlass.cute.nvgpu.tcgen05.OperandMajorMode(value)#

Bases: Enum

An enumeration for the majorness of the input operands of the MMA.

class cutlass.cute.nvgpu.tcgen05.OperandSource(value)#

Bases: Enum

An enumeration for the source memory location of the A input operand of the MMA.

class cutlass.cute.nvgpu.tcgen05.CtaGroup(value)#

Bases: Enum

An enumeration for the cta_group qualifier of the MMA.

ONE = 1#
TWO = 2#
class cutlass.cute.nvgpu.tcgen05.Field(value)#

Bases: Enum

An enumeration for the fields of the MMA Atom that can be modified at runtime.

NEGATE_A = 'neg_a'#
NEGATE_B = 'neg_b'#
ACCUMULATE = 'accum_c'#
SFA = 'sf_a'#
SFB = 'sf_b'#
class cutlass.cute.nvgpu.tcgen05.MmaTF32Op(
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
)#

Bases: MmaOp

TF32 tcgen05 MMA Operation.

See the PTX documentation. This Operation corresponds to the .kind::tf32 qualifier.

descriptive_name = 'tcgen05 TF32 MMA Operation'#
__init__(
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
) None#
class cutlass.cute.nvgpu.tcgen05.MmaF16BF16Op(
ab_dtype: Type[cutlass.cute.typing.Numeric],
acc_dtype: Type[cutlass.cute.typing.Numeric],
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
)#

Bases: MmaOp

F16/BF16 tcgen05 MMA Operation.

See the PTX documentation. This Operation corresponds to the .kind::f16 qualifier.

descriptive_name = 'tcgen05 F16/BF16 MMA Operation'#
__init__(
ab_dtype: Type[cutlass.cute.typing.Numeric],
acc_dtype: Type[cutlass.cute.typing.Numeric],
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
) None#
class cutlass.cute.nvgpu.tcgen05.MmaI8Op(
ab_dtype: Type[cutlass.cute.typing.Numeric],
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
)#

Bases: MmaOp

I8 tcgen05 MMA Operation.

See the PTX documentation. This Operation corresponds to the .kind::i8 qualifier.

descriptive_name = 'tcgen05 I8 MMA Operation'#
__init__(
ab_dtype: Type[cutlass.cute.typing.Numeric],
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
) None#
class cutlass.cute.nvgpu.tcgen05.MmaFP8Op(
ab_dtype: Type[cutlass.cute.typing.Numeric],
acc_dtype: Type[cutlass.cute.typing.Numeric],
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
)#

Bases: MmaOp

F8 tcgen05 MMA Operation.

See the PTX documentation.

descriptive_name = 'tcgen05 F8 MMA Operation'#
__init__(
ab_dtype: Type[cutlass.cute.typing.Numeric],
acc_dtype: Type[cutlass.cute.typing.Numeric],
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
) None#
class cutlass.cute.nvgpu.tcgen05.MmaMXF8Op(
ab_dtype: Type[cutlass.cute.typing.Numeric],
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
)#

Bases: BlockScaledMmaOp

MXF8 tcgen05 BlockScaled MMA Operation.

See the PTX documentation. This Operation corresponds to the .kind::mxf8f6f4 qualifier.

descriptive_name = 'tcgen05 MXF8 BlockScaled MMA Operation'#
__init__(
ab_dtype: Type[cutlass.cute.typing.Numeric],
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
a_major_mode: OperandMajorMode,
b_major_mode: OperandMajorMode,
) None#
class cutlass.cute.nvgpu.tcgen05.MmaMXF4Op(
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
)#

Bases: BlockScaledMmaOp

MXF4 tcgen05 BlockScaled MMA Operation.

See the PTX documentation. This Operation corresponds to the .kind::mxf4 qualifier.

descriptive_name = 'tcgen05 MXF4 BlockScaled MMA Operation'#
__init__(
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
) None#
class cutlass.cute.nvgpu.tcgen05.MmaMXF4NVF4Op(
sf_dtype: Type[cutlass.cute.typing.Numeric],
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
)#

Bases: BlockScaledMmaOp

MXF4NVF4 tcgen05 BlockScaled MMA Operation.

See the PTX documentation. This Operation corresponds to the .kind::mxf4nvf4 qualifier.

descriptive_name = 'tcgen05 MXF4NVF4 BlockScaled MMA Operation'#
__init__(
sf_dtype: Type[cutlass.cute.typing.Numeric],
instruction_shape: cutlass.cute.typing.Shape,
cta_group: CtaGroup,
a_src: OperandSource,
) None#
class cutlass.cute.nvgpu.tcgen05.SmemLayoutAtomKind(value)#

Bases: Enum

Enum class for the kinds of SMEM layout atoms for SM100.

Given a swizzle kind, an SMEM layout atom is the compact layout of smallest size that can be used to construct an SMEM layout using blocked product for operand A or B such that the resulting layout is legal for both TMA and UMMA.

Note that there are other ways of creating legal layouts for operand A and B.

MN_INTER = 1#
MN_SW32 = 2#
MN_SW64 = 3#
MN_SW128 = 4#
MN_SW128_32B = 5#
K_INTER = 6#
K_SW32 = 7#
K_SW64 = 8#
K_SW128 = 9#
cutlass.cute.nvgpu.tcgen05.make_smem_layout_atom(
kind: SmemLayoutAtomKind,
element_type: Type[cutlass.cute.typing.Numeric],
*,
loc=None,
ip=None,
) cutlass.cute.typing.ComposedLayout#

Makes a SMEM layout Atom.

This function creates a composed layout in unit of elements consistent with the requested layout Atom kind and element data type.

Parameters:
  • kind (SmemLayoutAtomKind) – The kind of layout Atom

  • element_type (Type[Numeric]) – The element data type to construct the layout for

Returns:

The SMEM layout atom

Return type:

ComposedLayout

cutlass.cute.nvgpu.tcgen05.tile_to_mma_shape(
atom,
mma_tile_shape: cutlass.cute.typing.Shape,
order: cutlass.cute.typing.IntTuple | None = None,
*,
loc=None,
ip=None,
)#

Tiles a layout to an MMA shape.

cutlass.cute.nvgpu.tcgen05.commit(
mbar_ptr: cutlass.cute.typing.Pointer,
mask=None,
cta_group: ~cutlass.cute.nvgpu.tcgen05.mma.CtaGroup = <CtaGroup.ONE>,
*,
loc=None,
ip=None,
) None#

Perform an arrive operation on a mbarrier upon completion of previous MMA operations.

Parameters:
  • mbar_ptr (Pointer) – A pointer to the mbarrier in SMEM

  • mask (Int) – An optional multicast mask for the CTAs in the cluster to signal arrival to

cutlass.cute.nvgpu.tcgen05.is_tmem_load(atom: CopyAtom) bool#

Returns whether a CopyAtom instance is a TMEM load.

cutlass.cute.nvgpu.tcgen05.is_tmem_store(atom: CopyAtom) bool#

Returns whether a CopyAtom instance is a TMEM store.

cutlass.cute.nvgpu.tcgen05.get_tmem_copy_properties(
atom: CopyAtom,
) Tuple[int, int, int, Pack | Unpack]#

Returns the properties of a TMEM copy atom (number of data paths, bits, repetitions, and whether packing/unpacking is used).

cutlass.cute.nvgpu.tcgen05.find_tmem_tensor_col_offset(
tmem_tensor: cutlass.cute.typing.Tensor,
*,
loc=None,
ip=None,
) cutlass.cute.typing.Int#

Computes the TMEM column offset given a TMEM tensor.

Parameters:

tmem_tensor (Tensor) – The TMEM tensor to use to compute the column offset

Returns:

The column offset

Return type:

Int

cutlass.cute.nvgpu.tcgen05.make_tmem_copy(
atom: CopyAtom,
tmem_tensor: cutlass.cute.typing.Tensor,
*,
loc=None,
ip=None,
) TiledCopy#

Makes a Tiled Copy instance from a TMEM Copy Atom and a TMEM tensor.

cutlass.cute.nvgpu.tcgen05.make_s2t_copy(
atom: CopyAtom,
tmem_tensor: cutlass.cute.typing.Tensor,
*,
loc=None,
ip=None,
) TiledCopy#

Makes a Tiled Copy instance from a TMEM Copy Atom and a TMEM tensor.

cutlass.cute.nvgpu.tcgen05.get_s2t_smem_desc_tensor(
atom: CopyAtom,
smem_tensor: cutlass.cute.typing.Tensor,
*,
loc=None,
ip=None,
) cutlass.cute.typing.Tensor#

Returns the SMEM descriptor tensor from a S2T copy atom and a SMEM tensor.

cutlass.cute.nvgpu.tcgen05.make_umma_smem_desc(
src: cutlass.cute.typing.Pointer,
layout: cutlass.cute.typing.Layout,
major: str,
next_src: cutlass.cute.typing.Pointer | None = None,
*,
loc=None,
ip=None,
)#

Construct shared memory descriptor for UMMA.

The make_umma_smem_desc operation accepts an input cute.ptr (optionally a next_src pointer for the second buffer in a circular buffer scheme), alongside a cute.layout and a major attr, then constructs the shared memory descriptor and returns it. The layout must describe the buffer pointed to by the input pointer, and the iterator must carry valid swizzle information.

There are 5 supported swizzle variants:

  • S<0, 4, 3> | SWIZZLE_NONE

  • S<1, 4, 3> | SWIZZLE_32B

  • S<2, 4, 3> | SWIZZLE_64B

  • S<3, 4, 3> | SWIZZLE_128B

  • S<2, 5, 2> | SWIZZLE_128B_BASE32B

The cute.ptr must carry shared address space and must be aligned to 16B.

Parameters:
  • src (Pointer) – The source pointer to shared memory

  • layout (Layout) – The layout describing the buffer

  • major (str) – The major mode attribute

  • next_src (Optional[Pointer]) – Optional next source pointer for circular buffer scheme

Returns:

The shared memory descriptor

Return type:

SmemDescType