cutlass.cute.arch#

The cute.arch module contains wrappers around NVVM-level MLIR Op builders that seamlessly inter-operate with the Python types used in CUTLASS Python. Another benefit of wrapping these Op builders is that the source location can be tracked with the @dsl_user_op decorator. Available functions include

basic API like thr_idx;
functions related to the direct management of mbarriers;
low-level SMEM management (prefer using the SmemAllocator class);
TMEM management.

API documentation#

cutlass.cute.arch.make_warp_uniform( value: cutlass.cute.typing.Int, *, loc=None, ip=None, ) → cutlass.cute.typing.Int32#

Creates a warp-uniform value from the given integer input.

Parameters:: value (Int) – The integer to make warp uniform.
Returns:: The warp-uniform value equal to the input.
Return type:: Int32

cutlass.cute.arch.elect_one(*, loc=None, ip=None) → IfOpRegion#

Elects one thread within a warp.

with elect_one():
    # Only one thread in the warp executes the code in this context
    pass

cutlass.cute.arch.mbarrier_init( mbar_ptr: cutlass.cute.typing.Pointer, cnt: cutlass.cute.typing.Int, *, loc=None, ip=None, ) → None#

Initializes a mbarrier with the specified thread arrival count.

Parameters:

mbar_ptr (Pointer) – A pointer to the mbarrier in SMEM
cnt (Int) – The arrival count of the mbarrier

cutlass.cute.arch.mbarrier_init_fence(*, loc=None, ip=None) → None#: A fence operation that applies to the mbarrier initializations.

cutlass.cute.arch.mbarrier_arrive_and_expect_tx( mbar_ptr: cutlass.cute.typing.Pointer, bytes: cutlass.cute.typing.Int, peer_cta_rank_in_cluster=None, *, loc=None, ip=None, ) → None#

Arrives on a mbarrier and expects a specified number of transaction bytes.

Parameters:

mbar_ptr (Pointer) – A pointer to the mbarrier in SMEM
bytes (Int) – The number of transaction bytes
peer_cta_rank_in_cluster – An optional CTA rank in cluster. If provided, the pointer to the mbarrier is converted to a remote address in the peer CTA’s SMEM.

cutlass.cute.arch.mbarrier_expect_tx( mbar_ptr: cutlass.cute.typing.Pointer, bytes: cutlass.cute.typing.Int, peer_cta_rank_in_cluster=None, *, loc=None, ip=None, ) → None#

Expects a specified number of transaction bytes without an arrive.

Parameters:

mbar_ptr (Pointer) – A pointer to the mbarrier in SMEM
bytes (Int) – The number of transaction bytes
peer_cta_rank_in_cluster – An optional CTA rank in cluster. If provided, the pointer to the mbarrier is converted to a remote address in the peer CTA’s SMEM.

cutlass.cute.arch.mbarrier_wait( mbar_ptr: cutlass.cute.typing.Pointer, phase: cutlass.cute.typing.Int, *, loc=None, ip=None, ) → None#

Waits on a mbarrier with a specified phase.

Parameters:

mbar_ptr (Pointer) – A pointer to the mbarrier in SMEM
phase (Int) – The phase to wait for (either 0 or 1)

cutlass.cute.arch.mbarrier_try_wait( mbar_ptr: cutlass.cute.typing.Pointer, phase: cutlass.cute.typing.Int, *, loc=None, ip=None, ) → cutlass.cute.typing.Boolean#

Attempts to wait on a mbarrier with a specified phase in a non-blocking fashion.

Parameters:

mbar_ptr (Pointer) – A pointer to the mbarrier in SMEM
phase (Int) – The phase to wait for (either 0 or 1)

Returns:

A boolean value indicating whether the wait operation was successful

Return type:

Boolean

cutlass.cute.arch.mbarrier_conditional_try_wait( cond, mbar_ptr: cutlass.cute.typing.Pointer, phase: cutlass.cute.typing.Int, *, loc=None, ip=None, ) → cutlass.cute.typing.Boolean#

Conditionally attempts to wait on a mbarrier with a specified phase in a non-blocking fashion.

Parameters:

cond – A boolean predicate
mbar_ptr (Pointer) – A pointer to the mbarrier in SMEM
phase (Int) – The phase to wait for (either 0 or 1)

Returns:

A boolean value indicating whether the wait operation was successful

Return type:

Boolean

cutlass.cute.arch.mbarrier_arrive( mbar_ptr: cutlass.cute.typing.Pointer, peer_cta_rank_in_cluster: cutlass.cute.typing.Int | None = None, *, loc=None, ip=None, ) → None#

Arrives on an mbarrier.

Parameters:

mbar_ptr (Pointer) – A pointer to the mbarrier in SMEM
peer_cta_rank_in_cluster – An optional CTA rank in cluster. If provided, the pointer to the mbarrier is converted to a remote address in the peer CTA’s SMEM.

cutlass.cute.arch.lane_idx(*, loc=None, ip=None) → cutlass.cute.typing.Int32#: Returns the lane index of the current thread within the warp.

cutlass.cute.arch.warp_idx(*, loc=None, ip=None) → cutlass.cute.typing.Int32#: Returns the warp index within a CTA.

cutlass.cute.arch.thread_idx( *, loc=None, ip=None, ) → Tuple[cutlass.cute.typing.Int32, cutlass.cute.typing.Int32, cutlass.cute.typing.Int32]#: Returns the thread index within a CTA.

cutlass.cute.arch.block_dim( *, loc=None, ip=None, ) → Tuple[cutlass.cute.typing.Int32, cutlass.cute.typing.Int32, cutlass.cute.typing.Int32]#: Returns the number of threads in each dimension of the CTA.

cutlass.cute.arch.block_idx( *, loc=None, ip=None, ) → Tuple[cutlass.cute.typing.Int32, cutlass.cute.typing.Int32, cutlass.cute.typing.Int32]#: Returns the CTA identifier within a grid.

cutlass.cute.arch.grid_dim( *, loc=None, ip=None, ) → Tuple[cutlass.cute.typing.Int32, cutlass.cute.typing.Int32, cutlass.cute.typing.Int32]#: Returns the number of CTAs in each dimension of the grid.

cutlass.cute.arch.cluster_idx( *, loc=None, ip=None, ) → Tuple[cutlass.cute.typing.Int32, cutlass.cute.typing.Int32, cutlass.cute.typing.Int32]#: Returns the cluster identifier within a grid.

cutlass.cute.arch.cluster_dim( *, loc=None, ip=None, ) → Tuple[cutlass.cute.typing.Int32, cutlass.cute.typing.Int32, cutlass.cute.typing.Int32]#: Returns the number of clusters in each dimension of the grid.

cutlass.cute.arch.block_in_cluster_idx( *, loc=None, ip=None, ) → Tuple[cutlass.cute.typing.Int32, cutlass.cute.typing.Int32, cutlass.cute.typing.Int32]#: Returns the CTA index within a cluster across all dimensions.

cutlass.cute.arch.block_in_cluster_dim( *, loc=None, ip=None, ) → Tuple[cutlass.cute.typing.Int32, cutlass.cute.typing.Int32, cutlass.cute.typing.Int32]#: Returns the dimensions of the cluster.

cutlass.cute.arch.block_idx_in_cluster( *, loc=None, ip=None, ) → cutlass.cute.typing.Int32#: Returns the linearized identifier of the CTA within the cluster.

cutlass.cute.arch.barrier( *, barrier_id=None, number_of_threads=None, loc=None, ip=None, ) → None#: Creates a barrier, optionally named.

cutlass.cute.arch.barrier_arrive( *, barrier_id=None, number_of_threads=None, loc=None, ip=None, ) → None#

cutlass.cute.arch.sync_threads(*, loc=None, ip=None) → None#: Synchronizes all threads within a CTA.

cutlass.cute.arch.sync_warp( mask: cutlass.cute.typing.Int = 4294967295, *, loc=None, ip=None, ) → None#: Performs a warp-wide sync with an optional mask.

cutlass.cute.arch.fence_acq_rel_cta(*, loc=None, ip=None) → None#

Fence operation with acquire-release semantics.

See the PTX documentation.

cutlass.cute.arch.fence_acq_rel_cluster(*, loc=None, ip=None) → None#

Fence operation with acquire-release semantics.

See the PTX documentation.

cutlass.cute.arch.fence_acq_rel_gpu(*, loc=None, ip=None) → None#

Fence operation with acquire-release semantics.

See the PTX documentation.

cutlass.cute.arch.fence_acq_rel_sys(*, loc=None, ip=None) → None#

Fence operation with acquire-release semantics.

See the PTX documentation.

cutlass.cute.arch.cp_async_commit_group(*, loc=None, ip=None) → None#

Commits all prior initiated but uncommitted cp.async instructions.

See the PTX documentation.

cutlass.cute.arch.cp_async_wait_group(n, *, loc=None, ip=None) → None#

Waits till only a specified numbers of cp.async groups are pending.

See the PTX documentation.

cutlass.cute.arch.cp_async_bulk_commit_group(*, loc=None, ip=None) → None#

Commits all prior initiated but uncommitted cp.async.bulk instructions.

See the PTX documentation.

cutlass.cute.arch.cp_async_bulk_wait_group( group, *, read=None, loc=None, ip=None, ) → None#

Waits till only a specified numbers of cp.async.bulk groups are pending.

See the PTX documentation.

cutlass.cute.arch.cluster_wait(*, loc=None, ip=None) → None#: A cluster-wide wait operation.

cutlass.cute.arch.cluster_arrive(*, aligned=None, loc=None, ip=None) → None#: A cluster-wide arrive operation.

cutlass.cute.arch.cluster_arrive_relaxed(*, aligned=None, loc=None, ip=None) → None#: A cluster-wide arrive operation with relaxed semantics.

cutlass.cute.arch.fence_proxy( kind: cutlass._mlir.dialects.nvvm.ProxyKind, *, space: cutlass._mlir.dialects.nvvm.SharedSpace | None = None, use_intrinsic=None, loc=None, ip=None, ) → None#

cutlass.cute.arch.vote_ballot_sync( pred: cutlass.cute.typing.Boolean, mask: cutlass.cute.typing.Int = 4294967295, *, loc=None, ip=None, ) → cutlass.cute.typing.Int32#: Performs a ballot operation across the warp.

cutlass.cute.arch.popc( value: cutlass.cute.typing.Numeric, *, loc=None, ip=None, ) → cutlass.cute.typing.Numeric#: Performs a population count operation.

cutlass.cute.arch.fmax( a: float | cutlass.cute.typing.Float32, b: float | cutlass.cute.typing.Float32, *, loc=None, ip=None, ) → cutlass.cute.typing.Float32#

cutlass.cute.arch.rcp_approx( a: float | cutlass.cute.typing.Float32, *, loc=None, ip=None, )#

cutlass.cute.arch.exp2( a: float | cutlass.cute.typing.Float32, *, loc=None, ip=None, ) → cutlass.cute.typing.Float32#

cutlass.cute.arch.alloc_smem( element_type: Type[cutlass.cute.typing.Numeric], size_in_elems: int, alignment: None | int = None, *, loc=None, ip=None, ) → cutlass.cute.typing.Pointer#

Statically allocates SMEM.

Parameters:

element_type (Type[Numeric]) – The pointee type of the pointer.
size_in_elems (int) – The size of the allocation in terms of number of elements of the pointee type
alignment (int) – An optional pointer alignment for the allocation

Returns:

A pointer to the start of the allocation

Return type:

Pointer

cutlass.cute.arch.get_dyn_smem( element_type: Type[cutlass.cute.typing.Numeric], alignment: None | int = None, *, loc=None, ip=None, ) → cutlass.cute.typing.Pointer#

Retrieves a pointer to a dynamic SMEM allocation.

Parameters:

element_type (Type[Numeric]) – The pointee type of the pointer.
alignment (int) – An optional pointer alignment, the result pointer is offset appropriately

Returns:

A pointer to the start of the dynamic SMEM allocation with a correct alignement

Return type:

Pointer

cutlass.cute.arch.retrieve_tmem_ptr( element_type: Type[cutlass.cute.typing.Numeric], alignment: int, ptr_to_buffer_holding_addr: cutlass.cute.typing.Pointer, *, loc=None, ip=None, ) → cutlass.cute.typing.Pointer#

Retrieves a pointer to TMEM with the provided element type and alignment.

Parameters:

element_type (Type[Numeric]) – The pointee type of the pointer.
alignment (int) – The alignment of the result pointer
ptr_to_buffer_holding_addr (Pointer) – A pointer to a SMEM buffer holding the TMEM address of the start of the allocation allocation

Returns:

A pointer to TMEM

Return type:

Pointer

cutlass.cute.arch.alloc_tmem( num_columns: cutlass.cute.typing.Int, smem_ptr_to_write_address: cutlass.cute.typing.Pointer, is_two_cta=None, *, loc=None, ip=None, ) → None#

Allocates TMEM.

Parameters:

num_columns (Int) – The number of TMEM columns to allocate
smem_ptr_to_write_address (Pointer) – A pointer to a SMEM buffer where the TMEM address is written to
is_two_cta – Optional boolean parameter for 2-CTA MMAs

cutlass.cute.arch.relinquish_tmem_alloc_permit( is_two_cta=None, *, loc=None, ip=None, ) → None#: Relinquishes the right to allocate TMEM so that other CTAs potentially in a different grid can allocate.

cutlass.cute.arch.dealloc_tmem( tmem_ptr: cutlass.cute.typing.Pointer, num_columns: cutlass.cute.typing.Int, is_two_cta=None, *, loc=None, ip=None, ) → None#

Deallocates TMEM using the provided pointer and number of columns.

Parameters:

tmem_ptr (Pointer) – A pointer to the TMEM allocation to de-allocate
num_columns (Int) – The number of columns in the TMEM allocation
is_two_cta – Optional boolean parameter for 2-CTA MMAs