Python API reference#

class nvMatmulHeuristics.GemmConfig#

Bases: object

A GEMM kernel configuration.

class nvMatmulHeuristics.MatmulProblem#

Bases: object

Description of a matrix multiplication problem.

class nvMatmulHeuristics.NvMatmulHeuristicsTarget(value)#

Bases: IntEnum

Enumeration of supported heuristic target libraries and frameworks.

GENERIC = 0#

NVFUSER = 1#

CUTLASS = 2#

TRITON = 3#

CUTLASS3 = 4#

RESERVED_1 = 5#

RESERVED_2 = 6#

END = 7#

class nvMatmulHeuristics.NvMatmulHeuristicsFlags(value)#

Bases: IntEnum

Bit-flag options controlling nvMatmulHeuristics behavior.

NONE = 0#

DISABLE_OPT_PIPELINE = 1#

REDUCE_OUTPUT_SPACE = 2#

REFINE_CANDIDATES_USING_TIMING_MODEL = 4#

PERF_MODEL_BASED_AUTO_TUNING = 8#

AUTO_TUNE_THE_PERF_MODEL = 16#

MAFI_SEEDS = 32#

END = 64#

class nvMatmulHeuristics.NvMatmulHeuristicsMatmulLayout(value)#

Bases: IntEnum

Matrix memory layout combinations for the A, B, and C operands.

NN_ROW_MAJOR = 0#

NT_ROW_MAJOR = 1#

TN_ROW_MAJOR = 2#

TT_ROW_MAJOR = 3#

NN_COL_MAJOR = 4#

NT_COL_MAJOR = 5#

TN_COL_MAJOR = 6#

TT_COL_MAJOR = 7#

END = 8#

class nvMatmulHeuristics.NvMatmulHeuristicsSiliconMetric(value)#

Bases: IntEnum

Metrics that can be estimated for a given kernel/problem pair.

RUNTIME_S = 0#

L2_HIT_RATE = 1#

COMPUTE_S = 2#

LOAD_S = 3#

STORE_S = 4#

GMEM_LOAD_BYTES = 5#

GMEM_STORE_BYTES = 6#

L2_LOAD_BYTES = 7#

STATIC_LATENCIES_S = 8#

SMEM_LOAD_BYTES = 9#

SMEM_STORE_BYTES = 10#

ENERGY_JOULES = 11#

L2_FAR_LOAD_BYTES = 12#

EDP = 13#

RUNTIME_RELATIVE_FAST_S = 14#

END = 15#

class nvMatmulHeuristics.NvMatmulHeuristicsDependencyConfiguration(value)#

Bases: IntEnum

How nvMatmulHeuristics links to CUDA.

NONE = 0#

STATIC_LINK = 1#

DYNAMIC_LINK = 2#

RUNTIME_LOAD = 3#

class nvMatmulHeuristics.NvMatmulHeuristicsNvidiaGpu(value)#

Bases: IntEnum

Pre-defined NVIDIA GPU targets.

A100_SXM_80GB = 8000#

A100_PCIE_80GB = 8001#

A30_PCIE = 8002#

A10_PCIE = 8003#

A40_PCIE = 8600#

RTX_3090 = 8601#

RTX_A6000 = 8602#

L20 = 8900#

L40 = 8901#

L40S = 8902#

L4 = 8903#

RTX_4090 = 8904#

RTX_6000_ADA = 8905#

H100_SXM = 9000#

H100_PCIE = 9001#

H100_NVL = 9002#

H200_SXM = 9003#

H20_SXM = 9004#

B200 = 10000#

GB200_NVL = 10001#

GB300_NVL = 10300#

RTX_5080 = 12000#

RTX_5090 = 12001#

RTX_PRO_6000 = 12002#

END = 4294967295#

class nvMatmulHeuristics.NvMatmulHeuristicsBackendProperty(value)#

Bases: IntEnum

Tunable backend properties that influence kernel selection.

HAS_SLICE_K = 0#

HAS_COL_MAJOR_RASTER = 1#

REQUIRES_WARP_CONFIG = 2#

SUPPORTS_CLUSTER_CONFIG = 3#

HIGH_SMEM_ALIGNMENT = 4#

SMEM_EPILOGUE = 5#

SPLIT_K_KIND = 6#

CTA_SWIZZLER_BUILTIN_KIND = 7#

WORKSPACE_SIZE = 8#

DISABLE_FAST_ACC_FOR_FP8 = 9#

SUPPORTS_FALLBACK_CLUSTER = 10#

SUPPORTS_ODD_CLUSTER_N = 11#

EPILOGUE_REGISTERS = 12#

CTA_TILE_M_DIV_REQUIREMENT = 13#

CTA_TILE_N_DIV_REQUIREMENT = 14#

SMEM_CARVEOUT_SIZE = 15#

END = 16#

class nvMatmulHeuristics.NvMatmulHeuristicsBackendPropertyCallbackKind(value)#

Bases: IntEnum

Kinds of callback hooks a backend can expose.

KERNEL_ADDITIONAL_VALIDITY_CHECK = 0#

SHARED_MEMORY_USAGE = 1#

CONCURRENT_CTAS = 2#

END = 3#

class nvMatmulHeuristics.NvMatmulHeuristicsSplitKKind(value)#

Bases: IntEnum

Enumeration of the supported split-K kinds.

NONE = 0#

IN_PLACE = 1#

OUT_OF_PLACE = 2#

STREAM_K = 3#

SEGMENT_K = 4#

END = 5#

nvMatmulHeuristics.layoutToStr( matmulLayout: NvMatmulHeuristicsMatmulLayout, )#: Return the short string identifier corresponding to matmulLayout.

nvMatmulHeuristics.boolsToNvMatmulHeuristicsLayout(trans_a: bool, trans_b: bool)#: Map boolean transpose flags to a NvMatmulHeuristicsMatmulLayout value.

class nvMatmulHeuristics.NvMatmulHeuristicsInterface( backend: NvMatmulHeuristicsTarget = NvMatmulHeuristicsTarget.GENERIC, precision: str = 'HSS', path: str = None, flags: NvMatmulHeuristicsFlags = NvMatmulHeuristicsFlags.NONE, )#

Bases: object

Python wrapper of the nvMatmulHeuristics C API.

class nvmmhKernelConfiguration#

Bases: Structure

cluster#: Structure/Union member

cta#: Structure/Union member

ctaOrder#: Structure/Union member

gridSwizzle#: Structure/Union member

instr#: Structure/Union member

loadStages#: Structure/Union member

splitK#: Structure/Union member

warp#: Structure/Union member

class nvmmhMatmulProblem#

Bases: Structure

asGemmProblem()#

K#: Structure/Union member

M#: Structure/Union member

N#: Structure/Union member

batchSize#: Structure/Union member

matmulLayout#: Structure/Union member

class nvmmhHardwareDescriptor#

Bases: Structure

data#: Structure/Union member

class nvmmhBackend#

Bases: Structure

data#: Structure/Union member

resetLibraryState()#: Reset the library state and create a new handle.

get( problem: nvmmhMatmulProblem, count: int, hardware_descriptor=None, )#

Get GEMM configurations using a problem object.

Args:

problem: nvmmhMatmulProblem object
count: Number of configurations to retrieve
hardware_descriptor: Optional hardware descriptor

get_with_mnk( m: int, n: int, k: int, matmulLayout: NvMatmulHeuristicsMatmulLayout, count: int, hardware_descriptor=None, )#

Get GEMM configurations using problem dimensions.

Args:

m: Output matrix height
n: Output matrix width
k: Reduced dimension
matmulLayout: Matrix layout
count: Number of configurations to retrieve
hardware_descriptor: Optional hardware descriptor

unpackGemmConfig( output: nvmmhKernelConfiguration, )#: Convert a C kernel-configuration struct into a Python GemmConfig.

getDiscoverySet( matmulLayout: NvMatmulHeuristicsMatmulLayout, )#: Return the discovery set for the requested matrix layout.

commitDiscoverySet( dicts, matmulLayout: NvMatmulHeuristicsMatmulLayout, )#: Upload measured discovery-set runtimes to refine heuristics.

loadInternalDiscoverySet( matmulLayout: NvMatmulHeuristicsMatmulLayout, hardware_descriptor=None, ) → bool#

Load internal discovery set for a specific matrix multiplication layout.

Args:

matmulLayout: The matrix multiplication layout to load discovery set for
hardware_descriptor: Optional hardware descriptor

Returns:

True if the discovery set was successfully loaded, False otherwise

estimateSiliconMetric( problem, kernel_config, metric: NvMatmulHeuristicsSiliconMetric, hardware_descriptor=None, )#

Estimate silicon metrics for a given problem and kernel configuration.

Args:

problem: Problem configuration
kernel_config: Kernel configuration
metric: Silicon metric to estimate
hardware_descriptor: Optional hardware descriptor

createHardwareDescriptor()#: Creates a hardware descriptor.

destroyHardwareDescriptor(descriptor)#: Destroys a hardware descriptor.

setHardwarePredefinedGpu( descriptor, gpu: NvMatmulHeuristicsNvidiaGpu, )#: Sets the hardware descriptor to a predefined GPU configuration.

getDependencyConfiguration() → NvMatmulHeuristicsDependencyConfiguration#: Gets CUDA and cuBLAS dependency configuration.

makeNvMatmulHeuristicsProblem( m: int, n: int, k: int, matmulLayout: NvMatmulHeuristicsMatmulLayout, batch_size: int = 1, )#: Create a C problem struct from Python dimension arguments.

setBackendValueProperty( backend, property: NvMatmulHeuristicsBackendProperty, value: bytes, value_size: int, ) → bool#

Sets a backend value property.

Args:

backend: Backend object (ctypes.POINTER(self.nvmmhBackend))
property: Property to set
value: Value to set
value_size: Size of the value

getBackendProperty( backend, property: NvMatmulHeuristicsBackendProperty, buffer: bytes, buffer_size: int, ) → int#

Gets a backend property.

Args:

backend: Backend object (ctypes.POINTER(self.nvmmhBackend))
property: Property to get
buffer: Buffer to store the property value
buffer_size: Size of the buffer

Returns:

Status code

setBackendCallbackProperty( backend: nvmmhBackend, callback_kind: NvMatmulHeuristicsBackendPropertyCallbackKind, callback, ) → bool#

Sets a backend callback property.

Args:

backend: Backend object
callback_kind: Type of callback
callback: Callback function

Returns:

True if successful

getBackendStringProperty( backend: nvmmhBackend, property: NvMatmulHeuristicsBackendProperty, ) → str#

Helper method to get string properties.

Args:

backend: Backend object
property: Property to get

Returns:

Property value as string

getEx( problem: nvmmhMatmulProblem, count: int, backend, hardware_descriptor=None, )#

Get GEMM configurations using a problem object and custom backend.

Args:

problem: Problem configuration
count: Number of configurations to retrieve
backend: Backend object (ctypes.POINTER(self.nvmmhBackend))
hardware_descriptor: Optional hardware descriptor

createBackend( target: NvMatmulHeuristicsTarget, )#

Creates a backend object.

Args:

target: Target backend type

Returns:

Backend object

destroyBackend(backend)#: Destroys a backend object.

class nvMatmulHeuristics.NvMatmulHeuristicsInterfaceEx( backend: NvMatmulHeuristicsTarget = NvMatmulHeuristicsTarget.GENERIC, path: str = None, flags: NvMatmulHeuristicsFlags = NvMatmulHeuristicsFlags.NONE, load_discovery_implicitly: bool = True, gpu: NvMatmulHeuristicsNvidiaGpu = None, )#

Bases: NvMatmulHeuristicsInterface

Extended version of NvMatmulHeuristicsInterface that manages discovery profiles internally and allows precision to be specified per-call rather than at construction time.

Initialize the extended interface.

Args:

backend: Target backend type
path: Path to nvMatmulHeuristics library
flags: Flags to use for operations
load_discovery_implicitly: Whether to automatically load discovery sets when needed
gpu: Optional GPU to use. If None, no GPU will be set.

loadInternalDiscoverySet( matmulLayout: NvMatmulHeuristicsMatmulLayout, precision: str = None, ) → bool#

Override to track loaded discovery sets.

Args:

matmulLayout: Matrix layout
hardware_descriptor: Hardware descriptor (note: this parameter does not appear in the signature shown above)
precision: Optional precision override

Returns:

True if the discovery set was loaded successfully

get( problem: nvmmhMatmulProblem, count: int, precision: str = None, ) → list#

Get GEMM configurations with optional precision override.

Args:

problem: Problem configuration
count: Number of configurations to retrieve
precision: Optional precision override

Returns:

List of kernel configurations

getEx( problem: nvmmhMatmulProblem, count: int, backend, precision: str = None, ) → list#

Get GEMM configurations with custom backend and optional precision override.

Args:

problem: Problem configuration
count: Number of configurations to retrieve
backend: Backend object
precision: Optional precision override

Returns:

List of kernel configurations

estimateRuntime( problem: nvmmhMatmulProblem, kernel_config: nvmmhKernelConfiguration, precision: str = None, ) → float#

Estimate runtime with optional precision override.

Args:

problem: Problem configuration
kernel_config: Kernel configuration
precision: Optional precision override

Returns:

Estimated runtime in seconds

estimateSiliconMetric( problem: nvmmhMatmulProblem, kernel_config: nvmmhKernelConfiguration, metric: NvMatmulHeuristicsSiliconMetric, precision: str = None, ) → float#

Estimate silicon metric with optional precision override.

Args:

problem: Problem configuration
kernel_config: Kernel configuration
metric: Metric to estimate
precision: Optional precision override

Returns:

Estimated metric value