Python API reference#

class nvMatmulHeuristics.GemmConfig#

Bases: object

A GEMM kernel configuration.

class nvMatmulHeuristics.MatmulProblem#

Bases: object

Description of a matrix multiplication problem.

class nvMatmulHeuristics.NvMatmulHeuristicsTarget(value)#

Bases: IntEnum

Enumeration of supported heuristic target libraries and frameworks.

GENERIC = 0#
NVFUSER = 1#
CUTLASS = 2#
TRITON = 3#
CUTLASS3 = 4#
RESERVED_1 = 5#
RESERVED_2 = 6#
END = 7#
class nvMatmulHeuristics.NvMatmulHeuristicsFlags(value)#

Bases: IntEnum

Bit-flag options controlling nvMatmulHeuristics behavior.

NONE = 0#
DISABLE_OPT_PIPELINE = 1#
REDUCE_OUTPUT_SPACE = 2#
REFINE_CANDIDATES_USING_TIMING_MODEL = 4#
PERF_MODEL_BASED_AUTO_TUNING = 8#
AUTO_TUNE_THE_PERF_MODEL = 16#
MAFI_SEEDS = 32#
END = 64#
class nvMatmulHeuristics.NvMatmulHeuristicsMatmulLayout(value)#

Bases: IntEnum

Matrix memory layout combinations for the A, B, and C operands.

NN_ROW_MAJOR = 0#
NT_ROW_MAJOR = 1#
TN_ROW_MAJOR = 2#
TT_ROW_MAJOR = 3#
NN_COL_MAJOR = 4#
NT_COL_MAJOR = 5#
TN_COL_MAJOR = 6#
TT_COL_MAJOR = 7#
END = 8#
class nvMatmulHeuristics.NvMatmulHeuristicsSiliconMetric(value)#

Bases: IntEnum

Metrics that can be estimated for a given kernel/problem pair.

RUNTIME_S = 0#
L2_HIT_RATE = 1#
COMPUTE_S = 2#
LOAD_S = 3#
STORE_S = 4#
GMEM_LOAD_BYTES = 5#
GMEM_STORE_BYTES = 6#
L2_LOAD_BYTES = 7#
STATIC_LATENCIES_S = 8#
SMEM_LOAD_BYTES = 9#
SMEM_STORE_BYTES = 10#
ENERGY_JOULES = 11#
L2_FAR_LOAD_BYTES = 12#
EDP = 13#
RUNTIME_RELATIVE_FAST_S = 14#
END = 15#
class nvMatmulHeuristics.NvMatmulHeuristicsDependencyConfiguration(value)#

Bases: IntEnum

How nvMatmulHeuristics links to CUDA.

NONE = 0#
RUNTIME_LOAD = 3#
class nvMatmulHeuristics.NvMatmulHeuristicsNvidiaGpu(value)#

Bases: IntEnum

Pre-defined NVIDIA GPU targets.

A100_SXM_80GB = 8000#
A100_PCIE_80GB = 8001#
A30_PCIE = 8002#
A10_PCIE = 8003#
A40_PCIE = 8600#
RTX_3090 = 8601#
RTX_A6000 = 8602#
L20 = 8900#
L40 = 8901#
L40S = 8902#
L4 = 8903#
RTX_4090 = 8904#
RTX_6000_ADA = 8905#
H100_SXM = 9000#
H100_PCIE = 9001#
H100_NVL = 9002#
H200_SXM = 9003#
H20_SXM = 9004#
B200 = 10000#
GB200_NVL = 10001#
GB300_NVL = 10300#
RTX_5080 = 12000#
RTX_5090 = 12001#
RTX_PRO_6000 = 12002#
END = 4294967295#
class nvMatmulHeuristics.NvMatmulHeuristicsBackendProperty(value)#

Bases: IntEnum

Tunable backend properties that influence kernel selection.

HAS_SLICE_K = 0#
HAS_COL_MAJOR_RASTER = 1#
REQUIRES_WARP_CONFIG = 2#
SUPPORTS_CLUSTER_CONFIG = 3#
HIGH_SMEM_ALIGNMENT = 4#
SMEM_EPILOGUE = 5#
SPLIT_K_KIND = 6#
CTA_SWIZZLER_BUILTIN_KIND = 7#
WORKSPACE_SIZE = 8#
DISABLE_FAST_ACC_FOR_FP8 = 9#
SUPPORTS_FALLBACK_CLUSTER = 10#
SUPPORTS_ODD_CLUSTER_N = 11#
EPILOGUE_REGISTERS = 12#
CTA_TILE_M_DIV_REQUIREMENT = 13#
CTA_TILE_N_DIV_REQUIREMENT = 14#
SMEM_CARVEOUT_SIZE = 15#
END = 16#
class nvMatmulHeuristics.NvMatmulHeuristicsBackendPropertyCallbackKind(value)#

Bases: IntEnum

Kinds of callback hooks a backend can expose.

KERNEL_ADDITIONAL_VALIDITY_CHECK = 0#
SHARED_MEMORY_USAGE = 1#
CONCURRENT_CTAS = 2#
END = 3#
class nvMatmulHeuristics.NvMatmulHeuristicsSplitKKind(value)#

Bases: IntEnum

Split-K kind enum.

NONE = 0#
IN_PLACE = 1#
OUT_OF_PLACE = 2#
STREAM_K = 3#
SEGMENT_K = 4#
END = 5#
nvMatmulHeuristics.layoutToStr(
matmulLayout: NvMatmulHeuristicsMatmulLayout,
)#

Return the short string identifier corresponding to matmulLayout.

nvMatmulHeuristics.boolsToNvMatmulHeuristicsLayout(trans_a: bool, trans_b: bool)#

Map boolean transpose flags to an NvMatmulHeuristicsMatmulLayout value.

class nvMatmulHeuristics.NvMatmulHeuristicsInterface(
backend: NvMatmulHeuristicsTarget = NvMatmulHeuristicsTarget.GENERIC,
precision: str = 'HSS',
path: str = None,
flags: NvMatmulHeuristicsFlags = NvMatmulHeuristicsFlags.NONE,
)#

Bases: object

Python wrapper of the nvMatmulHeuristics C API.

class nvmmhKernelConfiguration#

Bases: Structure

cluster#

Structure/Union member

cta#

Structure/Union member

ctaOrder#

Structure/Union member

gridSwizzle#

Structure/Union member

instr#

Structure/Union member

loadStages#

Structure/Union member

splitK#

Structure/Union member

warp#

Structure/Union member

class nvmmhMatmulProblem#

Bases: Structure

asGemmProblem()#
K#

Structure/Union member

M#

Structure/Union member

N#

Structure/Union member

batchSize#

Structure/Union member

matmulLayout#

Structure/Union member

class nvmmhHardwareDescriptor#

Bases: Structure

data#

Structure/Union member

class nvmmhBackend#

Bases: Structure

data#

Structure/Union member

resetLibraryState()#

Reset the library state and create a new handle.

get(
problem: nvmmhMatmulProblem,
count: int,
hardware_descriptor=None,
)#

Get GEMM configurations using a problem object.

Args:
  • problem: nvmmhMatmulProblem object

  • count: Number of configurations to retrieve

  • hardware_descriptor: Optional hardware descriptor

get_with_mnk(
m: int,
n: int,
k: int,
matmulLayout: NvMatmulHeuristicsMatmulLayout,
count: int,
hardware_descriptor=None,
)#

Get GEMM configurations using problem dimensions.

Args:
  • m: Output matrix height

  • n: Output matrix width

  • k: Reduced dimension

  • matmulLayout: Matrix layout

  • count: Number of configurations to retrieve

  • hardware_descriptor: Optional hardware descriptor

unpackGemmConfig(
output: nvmmhKernelConfiguration,
)#

Convert a C kernel-configuration struct into a Python GemmConfig.

getDiscoverySet(
matmulLayout: NvMatmulHeuristicsMatmulLayout,
)#

Return the discovery set for the requested matrix layout.

commitDiscoverySet(
dicts,
matmulLayout: NvMatmulHeuristicsMatmulLayout,
)#

Upload measured discovery-set runtimes to refine heuristics.

loadInternalDiscoverySet(
matmulLayout: NvMatmulHeuristicsMatmulLayout,
hardware_descriptor=None,
) → bool#

Load internal discovery set for a specific matrix multiplication layout.

Args:
  • matmulLayout: The matrix multiplication layout to load discovery set for

  • hardware_descriptor: Optional hardware descriptor

Returns:

True if the discovery set was successfully loaded, False otherwise

estimateSiliconMetric(
problem,
kernel_config,
metric: NvMatmulHeuristicsSiliconMetric,
hardware_descriptor=None,
)#

Estimate silicon metrics for a given problem and kernel configuration.

Args:
  • problem: Problem configuration

  • kernel_config: Kernel configuration

  • metric: Silicon metric to estimate

  • hardware_descriptor: Optional hardware descriptor

createHardwareDescriptor()#

Creates a hardware descriptor.

destroyHardwareDescriptor(descriptor)#

Destroys a hardware descriptor.

setHardwarePredefinedGpu(
descriptor,
gpu: NvMatmulHeuristicsNvidiaGpu,
)#

Sets the hardware descriptor to a predefined GPU configuration.

getDependencyConfiguration() → NvMatmulHeuristicsDependencyConfiguration#

Gets CUDA and cuBLAS dependency configuration.

makeNvMatmulHeuristicsProblem(
m: int,
n: int,
k: int,
matmulLayout: NvMatmulHeuristicsMatmulLayout,
batch_size: int = 1,
)#

Create a C problem struct from Python dimension arguments.

setBackendValueProperty(
backend,
property: NvMatmulHeuristicsBackendProperty,
value: bytes,
value_size: int,
) → bool#

Sets a backend value property.

Args:
  • backend: Backend object (ctypes.POINTER(self.nvmmhBackend))

  • property: Property to set

  • value: Value to set

  • value_size: Size of the value

getBackendProperty(
backend,
property: NvMatmulHeuristicsBackendProperty,
buffer: bytes,
buffer_size: int,
) → int#

Gets a backend property.

Args:
  • backend: Backend object (ctypes.POINTER(self.nvmmhBackend))

  • property: Property to get

  • buffer: Buffer to store the property value

  • buffer_size: Size of the buffer

Returns:

Status code

setBackendCallbackProperty(
backend: nvmmhBackend,
callback_kind: NvMatmulHeuristicsBackendPropertyCallbackKind,
callback,
) → bool#

Sets a backend callback property.

Args:
  • backend: Backend object

  • callback_kind: Type of callback

  • callback: Callback function

Returns:

True if successful

getBackendStringProperty(
backend: nvmmhBackend,
property: NvMatmulHeuristicsBackendProperty,
) → str#

Helper method to get string properties.

Args:
  • backend: Backend object

  • property: Property to get

Returns:

Property value as string

getEx(
problem: nvmmhMatmulProblem,
count: int,
backend,
hardware_descriptor=None,
)#

Get GEMM configurations using a problem object and custom backend.

Args:
  • problem: Problem configuration

  • count: Number of configurations to retrieve

  • backend: Backend object (ctypes.POINTER(self.nvmmhBackend))

  • hardware_descriptor: Optional hardware descriptor

createBackend(
target: NvMatmulHeuristicsTarget,
)#

Creates a backend object.

Args:
  • target: Target backend type

Returns:

Backend object

destroyBackend(backend)#

Destroys a backend object.

class nvMatmulHeuristics.NvMatmulHeuristicsInterfaceEx(
backend: NvMatmulHeuristicsTarget = NvMatmulHeuristicsTarget.GENERIC,
path: str = None,
flags: NvMatmulHeuristicsFlags = NvMatmulHeuristicsFlags.NONE,
load_discovery_implicitly: bool = True,
gpu: NvMatmulHeuristicsNvidiaGpu = None,
)#

Bases: NvMatmulHeuristicsInterface

Extended version of NvMatmulHeuristicsInterface that manages discovery profiles internally and allows precision to be specified per-call rather than at construction time.

Initialize the extended interface.

Args:
  • backend: Target backend type

  • path: Path to nvMatmulHeuristics library

  • flags: Flags to use for operations

  • load_discovery_implicitly: Whether to automatically load discovery sets when needed

  • gpu: Optional GPU to use. If None, no GPU will be set.

loadInternalDiscoverySet(
matmulLayout: NvMatmulHeuristicsMatmulLayout,
precision: str = None,
) → bool#

Override to track loaded discovery sets.

Args:
  • matmulLayout: Matrix layout

  • hardware_descriptor: Hardware descriptor (note: not a parameter in this override's signature)

  • precision: Optional precision override

Returns:

True if the discovery set was loaded successfully

get(
problem: nvmmhMatmulProblem,
count: int,
precision: str = None,
) → list#

Get GEMM configurations with optional precision override.

Args:
  • problem: Problem configuration

  • count: Number of configurations to retrieve

  • precision: Optional precision override

Returns:

List of kernel configurations

getEx(
problem: nvmmhMatmulProblem,
count: int,
backend,
precision: str = None,
) → list#

Get GEMM configurations with custom backend and optional precision override.

Args:
  • problem: Problem configuration

  • count: Number of configurations to retrieve

  • backend: Backend object

  • precision: Optional precision override

Returns:

List of kernel configurations

estimateRuntime(
problem: nvmmhMatmulProblem,
kernel_config: nvmmhKernelConfiguration,
precision: str = None,
) → float#

Estimate runtime with optional precision override.

Args:
  • problem: Problem configuration

  • kernel_config: Kernel configuration

  • precision: Optional precision override

Returns:

Estimated runtime in seconds

estimateSiliconMetric(
problem: nvmmhMatmulProblem,
kernel_config: nvmmhKernelConfiguration,
metric: NvMatmulHeuristicsSiliconMetric,
precision: str = None,
) → float#

Estimate silicon metric with optional precision override.

Args:
  • problem: Problem configuration

  • kernel_config: Kernel configuration

  • metric: Metric to estimate

  • precision: Optional precision override

Returns:

Estimated metric value