API reference#
Typedefs and structs#
-
typedef void *nvmmhHandle_t#
An opaque handle to an nvMatmulHeuristics instance.
-
struct nvmmhMatmulProblem_t#
Describes a matmul problem
-
struct nvmmhKernelConfiguration_t#
Holds the result of the matmul heuristic, i.e., the kernel configuration.
-
typedef struct nvmmhHardwareDescriptor *nvmmhHardwareDescriptor_t#
Opaque handle to the nvMatmulHeuristics hardware descriptor.
-
typedef struct nvmmhBackend *nvmmhBackend_t#
Opaque handle describing a matmul backend.
-
struct nvmmhDim3_t#
A tuple of three int32_t elements.
-
struct nvmmhSwizzling_t#
Description of the CTA swizzling mode.
-
typedef nvmmhDim3_t (*nvmmhSwizzler_t)(nvmmhDim3_t blockIdx, int timestamp, const nvmmhMatmulProblem_t *problem, const nvmmhKernelConfiguration_t *kernelConfig)#
A callback describing the CTA swizzling mode.
- Param blockIdx:
Current block index on the GPU.
- Param timestamp:
Iteration of the main loop. Used to determine which tiles along the K dimension are processed.
- Param problem:
Problem.
- Param kernelConfig:
Kernel configuration.
- Return:
Coordinates of the processed output tile and the “depth” of the reduction, where depth refers to the K position. Return {-1, -1, -1} to indicate that the GPU block has no more work.
-
typedef nvmmhDim3_t (*nvmmhGridLauncher_t)(const nvmmhMatmulProblem_t *problem, const nvmmhKernelConfiguration_t *kernelConfig)#
A callback computing the dimensions of the grid launched on the GPU.
- Param problem:
Problem.
- Param kernelConfig:
Kernel configuration.
- Return:
The dimensions of the grid launched on the GPU.
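The sketch below shows what a matching pair of callbacks could look like for a plain row-major schedule with one output tile per CTA and no reduction splitting; they would be registered with nvMatmulHeuristicsBackendSetSwizzler (documented below). It is illustrative only: the include path, the x/y/z members of nvmmhDim3_t, the M/N members of nvmmhMatmulProblem_t, and the cta member of nvmmhKernelConfiguration_t are assumptions; check the actual header for the exact names.

#include <stdint.h>
#include <nvMatmulHeuristics/nvMatmulHeuristics.h> /* assumed include path */

/* One output tile per CTA, identity mapping, whole K handled by one CTA. */
static nvmmhDim3_t exampleSwizzler(nvmmhDim3_t blockIdx, int timestamp,
                                   const nvmmhMatmulProblem_t *problem,
                                   const nvmmhKernelConfiguration_t *kernelConfig) {
    (void)problem; (void)kernelConfig;
    nvmmhDim3_t out = {-1, -1, -1};      /* end-of-work marker */
    if (timestamp == 0) {                /* first and only tile for this CTA */
        out.x = blockIdx.x;              /* tile coordinates (assumed member names) */
        out.y = blockIdx.y;
        out.z = 0;                       /* K position: no split-k */
    }
    return out;
}

/* Grid: ceil(M / ctaM) x ceil(N / ctaN) x 1 (assumed member names). */
static nvmmhDim3_t exampleGridLauncher(const nvmmhMatmulProblem_t *problem,
                                       const nvmmhKernelConfiguration_t *kernelConfig) {
    nvmmhDim3_t grid;
    grid.x = (int32_t)((problem->M + kernelConfig->cta.x - 1) / kernelConfig->cta.x);
    grid.y = (int32_t)((problem->N + kernelConfig->cta.y - 1) / kernelConfig->cta.y);
    grid.z = 1;
    return grid;
}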
-
typedef double (*nvmmhPerformanceModel_t)(const nvmmhMatmulProblem_t *problem, const nvmmhKernelConfiguration_t *kernelConfig, double nvMatmulHeuristics_internal_timing_model_prediction)#
Callback to override the internal performance model estimates in a few select places, primarily for performance-model-based auto-tuning and split-k computation.
- Param problem:
The matmul problem
- Param kernelConfig:
Pointer to the currently tested kernel configuration.
- Param nvMatmulHeuristics_internal_timing_model_prediction:
Expected timing by nvMatmulHeuristics in seconds.
- Return:
Timing in seconds.
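A minimal sketch of such an override: trust the library's own estimate but penalize configurations with a large split-k factor. The splitK member name follows nvmmhKernelConfiguration_t::splitK referenced later in this page and should be verified against the header; the callback is installed with nvMatmulHeuristicsBackendSetPerformanceModel.

static double examplePerfModel(const nvmmhMatmulProblem_t *problem,
                               const nvmmhKernelConfiguration_t *kernelConfig,
                               double nvMatmulHeuristics_internal_timing_model_prediction) {
    (void)problem;
    double t = nvMatmulHeuristics_internal_timing_model_prediction; /* seconds */
    if (kernelConfig->splitK > 8) {
        t *= 1.5; /* e.g. discourage very aggressive split-k on this backend */
    }
    return t; /* must be a timing in seconds */
}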
-
typedef int (*nvmmhPropertyCallback_t)(const nvmmhKernelConfiguration_t *kernelConfig)#
Callback called by nvMatmulHeuristics.
- Param kernelConfig:
The kernel configuration.
- Return:
An int whose meaning depends on the nvmmhBackendPropertyCallbackKind_t property.
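For example, a backend could report its per-CTA shared memory usage through a callback of this type and register it for NVMMH_CALLBACK_SHARED_MEMORY_USAGE (see nvMatmulHeuristicsBackendSetCallbackProperty below). The fixed value here is a placeholder; a real backend would compute it from the kernel configuration.

static int exampleSharedMemoryUsage(const nvmmhKernelConfiguration_t *kernelConfig) {
    (void)kernelConfig;
    /* Placeholder: pretend every configuration uses 96 KiB of shared memory. */
    return 96 * 1024; /* bytes, as expected for NVMMH_CALLBACK_SHARED_MEMORY_USAGE */
}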
Enumerations#
-
enum nvmmhStatus_t#
Return status
Values:
-
enumerator NVMMH_STATUS_ERROR#
Error
-
enumerator NVMMH_STATUS_SUCCESS#
Success
-
enumerator NVMMH_STATUS_PROFILE_NOT_ENTIRELY_LOADED#
The operation succeeded, but some or all of the internal profile data was missing.
-
enumerator NVMMH_STATUS_INVALID_INPUT#
Invalid input passed into the function
-
enumerator NVMMH_STATUS_INVALID_ENUM_INPUT#
Invalid enum input passed into the function
-
enumerator NVMMH_STATUS_INVALID_DESCRIPTOR#
Invalid descriptor used.
-
enumerator NVMMH_STATUS_DRIVER_ALREADY_INITIALIZED#
CUDA Driver was already loaded by nvMatmulHeuristics
-
enumerator NVMMH_STATUS_UNSUPPORTED_FEATURE#
Attempt to use something that is not supported
-
enumerator NVMMH_STATUS_MISSING_RUNTIME_DISCOVERY_PROFILE#
The operation requires runtime discovery data to be present, but it is not.
-
enumerator NVMMH_STATUS_EXECUTION_FAILED#
Execution failed: nvMatmulHeuristics cannot accept the current inputs.
-
enumerator NVMMH_STATUS_BUFFER_TOO_SMALL#
Some buffer is too small
-
enumerator NVMMH_STATUS_INVALID_HANDLE#
Input handle is invalid
-
enumerator NVMMH_STATUS_END#
End
-
enum nvmmhDependencyConfiguration_t#
Dependency configuration. Describes the state of a potential nvMatmulHeuristics dependency. It also allows detecting enabled features, for example whether nvMatmulHeuristics was built with CUDA support.
Values:
-
enumerator NVMMH_DEP_NONE#
The dependency is absent and not required
-
enumerator NVMMH_DEP_STATIC_LINK#
The dependency is statically linked into the shared library.
-
enumerator NVMMH_DEP_DYNAMIC_LINK#
The dependency is dynamically linked (i.e., as a shared library) against the library.
-
enumerator NVMMH_DEP_RUNTIME_LOAD#
The dependency is loaded at runtime.
-
enum nvmmhMatmulLayout_t#
Defines the layout of the matmul.
Values:
-
enumerator NVMMH_MATMUL_LAYOUT_NN_ROW_MAJOR#
No transpose. Row Major memory layouts.
-
enumerator NVMMH_MATMUL_LAYOUT_NT_ROW_MAJOR#
Transposing B while loading. Row Major memory layouts.
-
enumerator NVMMH_MATMUL_LAYOUT_TN_ROW_MAJOR#
Transposing A while loading. Row Major memory layouts.
-
enumerator NVMMH_MATMUL_LAYOUT_TT_ROW_MAJOR#
Transposing A and B while loading. Row Major memory layouts.
-
enumerator NVMMH_MATMUL_LAYOUT_NN_COL_MAJOR#
No transpose. Col Major memory layouts.
-
enumerator NVMMH_MATMUL_LAYOUT_NT_COL_MAJOR#
Transposing B while loading. Col Major memory layouts.
-
enumerator NVMMH_MATMUL_LAYOUT_TN_COL_MAJOR#
Transposing A while loading. Col Major memory layouts.
-
enumerator NVMMH_MATMUL_LAYOUT_TT_COL_MAJOR#
Transposing A and B while loading. Col Major memory layouts.
-
enumerator NVMMH_MATMUL_LAYOUT_END#
End Marker
-
enum nvmmhBetaMode_t#
Defines the beta mode, i.e., whether beta is statically known to be 0 or 1, or can take any value.
Values:
-
enumerator LH_BETA_DEFAULT#
-
enumerator LH_BETA_ZERO#
-
enumerator LH_BETA_ONE#
-
enumerator LH_BETA_ANY#
-
enum nvmmhNvidiaGpu_t#
List of predefined supported NVIDIA GPUs.
For devices not listed here, it is strongly recommended to use the automatic hardware detection mode (by passing nullptr to APIs that accept a hardware descriptor) which is the preferred way of using the library. This mode provides better adaptability and future compatibility.
Even for listed devices, using automatic hardware detection is recommended as it ensures the hardware descriptor exactly matches your specific GPU variant/SKU, which may differ from the base model in terms of memory, clock speeds, or other characteristics that can affect performance.
Warning
The list of devices included in this enum is subject to change. The library authors make no commitment to maintain this list in any specific shape or form.
Values:
-
enumerator NVMMH_NVGPU_A100_SXM_80GB#
-
enumerator NVMMH_NVGPU_A100_PCIE_80GB#
-
enumerator NVMMH_NVGPU_A30_PCIE#
-
enumerator NVMMH_NVGPU_A10_PCIE#
-
enumerator NVMMH_NVGPU_A40_PCIE#
-
enumerator NVMMH_NVGPU_RTX_3090#
-
enumerator NVMMH_NVGPU_RTX_A6000#
-
enumerator NVMMH_NVGPU_L20#
-
enumerator NVMMH_NVGPU_L40#
-
enumerator NVMMH_NVGPU_L40S#
-
enumerator NVMMH_NVGPU_L4#
-
enumerator NVMMH_NVGPU_RTX_4090#
-
enumerator NVMMH_NVGPU_RTX_6000_ADA#
-
enumerator NVMMH_NVGPU_H100_SXM#
-
enumerator NVMMH_NVGPU_H100_PCIE#
-
enumerator NVMMH_NVGPU_H100_NVL#
-
enumerator NVMMH_NVGPU_H200_SXM#
-
enumerator NVMMH_NVGPU_H20_SXM#
-
enumerator NVMMH_NVGPU_B200#
-
enumerator NVMMH_NVGPU_GB200_NVL#
-
enumerator NVMMH_NVGPU_GB300_NVL#
-
enumerator NVMMH_NVGPU_RTX_5080#
-
enumerator NVMMH_NVGPU_RTX_5090#
-
enumerator NVMMH_NVGPU_RTX_PRO_6000#
-
enumerator NVMMH_NVGPU_END#
-
enum nvmmhTarget_t#
nvMatmulHeuristics matmul targets.
Values:
-
enumerator NVMMH_TARGET_GENERIC#
Targets some abstract gemm implementation. Returns a configuration, but there’s no guarantee that there exists a backend that can implement this configuration
-
enumerator NVMMH_TARGET_NVFUSER#
Targets nvFuser
-
enumerator NVMMH_TARGET_CUTLASS#
Targets CUTLASS
-
enumerator NVMMH_TARGET_TRITON#
Targets Triton
-
enumerator NVMMH_TARGET_CUTLASS3#
Targets CUTLASS3
-
enumerator NVMMH_TARGET_RESERVED_1#
Reserved target
-
enumerator NVMMH_TARGET_RESERVED_2#
Reserved target
-
enumerator NVMMH_TARGET_END#
End Marker
-
enum nvmmhSplitKKind_t#
Various K-dimension reduction methods.
Values:
-
enumerator NVMMH_SPLIT_K_NONE#
No support for parallelization on the reduced dimension
-
enumerator NVMMH_SPLIT_K_IN_PLACE#
In-place split-k.
-
enumerator NVMMH_SPLIT_K_OUT_OF_PLACE#
Out-of-place split-k
-
enumerator NVMMH_SPLIT_K_STREAM_K#
Stream-k
-
enumerator NVMMH_SPLIT_K_SEGMENT_K#
Segment-k
-
enumerator NVMMH_SPLIT_K_END#
End Marker
-
enum nvmmhSwizzlerKind_t#
Defines an internal swizzling method. These swizzlers are overridden by nvMatmulHeuristicsBackendSetSwizzler.
Values:
-
enumerator NVMMH_SWIZZLER_GENERIC#
Some unspecified and generic CTA swizzling method.
-
enumerator NVMMH_SWIZZLER_NVFUSER#
CTA swizzling method used in nvFuser
-
enumerator NVMMH_SWIZZLER_CUTLASS#
CTA swizzling method used in CUTLASS
-
enumerator NVMMH_SWIZZLER_TRITON#
CTA swizzling method used in Triton
-
enumerator NVMMH_SWIZZLER_END#
End Marker
-
enum nvmmhBackendProperty_t#
Specifies which backend property to read or write.
Values:
-
enumerator NVMMH_BACKEND_PROP_HAS_SLICE_K#
Boolean, int32_t. Indicates whether the backend supports slice-k. If there is no slice-k, then Cta.K must be equal to Warp.K.
-
enumerator NVMMH_BACKEND_PROP_HAS_COL_MAJOR_RASTER#
Boolean, int32_t. Indicates whether the backend supports col-major rasterization order.
-
enumerator NVMMH_BACKEND_PROP_REQUIRES_WARP_CONFIG#
Boolean, int32_t. Indicates whether the backend takes the specific warp configuration into account; otherwise it is assumed to only require a number of warps.
-
enumerator NVMMH_BACKEND_PROP_SUPPORTS_CLUSTER_CONFIG#
Boolean, int32_t. Indicates whether the backend supports thread block clusters (SM90+).
-
enumerator NVMMH_BACKEND_PROP_HIGH_SMEM_ALIGNMENT#
Boolean, int32_t. Indicates whether nvMatmulHeuristics needs to assume high alignment for shared memory allocations.
-
enumerator NVMMH_BACKEND_PROP_SMEM_EPILOGUE#
Boolean, int32_t. Indicates whether the backend has a shared-memory epilogue. This is for nvFuser.
-
enumerator NVMMH_BACKEND_PROP_SPLIT_K_KIND#
nvmmhSplitKKind_t, int32_t. Indicates the backend’s split-k kind.
-
enumerator NVMMH_BACKEND_PROP_CTA_SWIZZLER_BUILTIN_KIND#
nvmmhSwizzlerKind_t, int32_t. Indicates which swizzling method nvMatmulHeuristics must choose. This setting will be overridden by the use of nvMatmulHeuristicsBackendSetSwizzler.
-
enumerator NVMMH_BACKEND_PROP_WORKSPACE_SIZE#
int32_t. Workspace size for parallel split-k and similar methods.
-
enumerator NVMMH_BACKEND_PROP_DISABLE_FAST_ACC_FOR_FP8#
int32_t. If set, disables fast accumulation for FP8 kernels.
-
enumerator NVMMH_BACKEND_PROP_SUPPORTS_FALLBACK_CLUSTER#
int32_t. Whether the kernel supports a fallback thread block cluster when used in combination with preferred cluster sizes.
-
enumerator NVMMH_BACKEND_PROP_SUPPORTS_ODD_CLUSTER_N#
int32_t. Whether the kernel supports an odd cluster N value.
-
enumerator NVMMH_BACKEND_PROP_EPILOGUE_REGISTERS#
int32_t. A positive integer that represents how many registers are used by the epilogue part of the kernel.
-
enumerator NVMMH_BACKEND_PROP_CTA_TILE_M_DIV_REQUIREMENT#
int32_t. A positive integer that specifies that the CTA tile M dimension must be divisible by the provided number.
-
enumerator NVMMH_BACKEND_PROP_CTA_TILE_N_DIV_REQUIREMENT#
int32_t. A positive integer that specifies that the CTA tile N dimension must be divisible by the provided number.
-
enumerator NVMMH_BACKEND_PROP_SMEM_CARVEOUT_SIZE#
int32_t. A positive integer that specifies the size of the SMEM carveout in bytes (e.g., the SMEM required by the epilogue).
-
enumerator NVMMH_BACKEND_PROP_END#
End Marker
-
enum nvmmhBackendPropertyCallbackKind_t#
Callback kind.
Values:
-
enumerator NVMMH_CALLBACK_KERNEL_ADDITIONAL_VALIDITY_CHECK#
Extra kernel validity check that will be called by nvMatmulHeuristics when required.
-
enumerator NVMMH_CALLBACK_SHARED_MEMORY_USAGE#
Callback that returns the amount of shared memory used by the block, in bytes
-
enumerator NVMMH_CALLBACK_CONCURRENT_CTAS#
Callback that returns the number of concurrent CTAs that can be scheduled on an SM. The goal of this is to capture the register and shared memory pressure.
-
enumerator NVMMH_CALLBACK_END#
End Marker
-
enum nvmmhFlags_t#
Flags to control heuristics behavior.
Values:
-
enumerator NVMMH_FLAG_NONE#
No flag
-
enumerator NVMMH_FLAG_DISABLE_OPT_PIPELINE#
Disables internal optimization pipeline
-
enumerator NVMMH_FLAG_REDUCE_OUTPUT_SPACE#
Tries to reduce the number of different kernels generated
-
enumerator NVMMH_FLAG_REFINE_CANDIDATES_USING_TIMING_MODEL#
Sorts and/or prunes candidates using a performance model.
-
enumerator NVMMH_FLAG_PERF_MODEL_BASED_AUTO_TUNING#
Generates a very large number of candidates internally, reorders them using the performance model, and returns only the top N requested. Should be enabled only when discovery is used.
-
enumerator NVMMH_FLAG_AUTO_TUNE_THE_PERF_MODEL#
Secret mode that tunes the performance model to the results using extra runtime values. Very expensive.
-
enumerator NVMMH_FLAG_MAFI_SEEDS#
Shmoos (sweeps) the CTA tile size seeds in the optimization pipeline.
-
enumerator NVMMH_FLAG_END#
End Marker.
-
enum nvmmhKernelConfigurationPropertyMask_t#
Bitmasks that can be used to select some properties of a kernel configuration.
Values:
-
enumerator NVMMH_KERNEL_CONFIG_PROPERTY_NONE#
No property
-
enumerator NVMMH_KERNEL_CONFIG_PROPERTY_CTA_TILE#
Refers to nvmmhKernelConfiguration_t::cta
-
enumerator NVMMH_KERNEL_CONFIG_PROPERTY_WARP_TILE#
Refers to nvmmhKernelConfiguration_t::warp
-
enumerator NVMMH_KERNEL_CONFIG_PROPERTY_INSTR_TILE#
Refers to nvmmhKernelConfiguration_t::instr
-
enumerator NVMMH_KERNEL_CONFIG_PROPERTY_SPLIT_K#
Refers to nvmmhKernelConfiguration_t::splitK
-
enumerator NVMMH_KERNEL_CONFIG_PROPERTY_GRID_SWIZZLE#
Refers to nvmmhKernelConfiguration_t::gridSwizzle
-
enumerator NVMMH_KERNEL_CONFIG_PROPERTY_CTA_ORDER#
Refers to nvmmhKernelConfiguration_t::ctaOrder
-
enumerator NVMMH_KERNEL_CONFIG_PROPERTY_CLUSTER_CONFIG#
Refers to nvmmhKernelConfiguration_t::clusterConfig
-
enumerator NVMMH_KERNEL_CONFIG_PROPERTY_LOAD_STAGES#
Refers to nvmmhKernelConfiguration_t::loadStages
-
enumerator NVMMH_KERNEL_CONFIG_PROPERTY_SPLIT_K_MODE#
Refers to nvmmhSplitKKind_t
-
enumerator NVMMH_KERNEL_CONFIG_PROPERTY_ALL#
All properties
-
enumerator NVMMH_KERNEL_CONFIG_PROPERTY_END#
End Marker
-
enumerator NVMMH_KERNEL_CONFIG_PROPERTY_CTA_SCHEDULING#
Compound flag to select all CTA scheduling properties
-
enum nvmmhSiliconMetric_t#
Silicon metrics that nvMatmulHeuristics can estimate.
For comparing kernel performance, NVMMH_METRIC_RUNTIME_RELATIVE_FAST_S is recommended over NVMMH_METRIC_RUNTIME_S as it is specifically tuned for relative ordering of kernels rather than absolute runtime prediction. This can provide more reliable results when the goal is to determine which kernel configuration might perform better than another.
Warning
These metrics are estimates used internally by the performance model and should not be treated as exact measurements of real hardware behavior. They are primarily designed for relative comparisons and internal decision-making.
Values:
-
enumerator NVMMH_METRIC_RUNTIME_S#
Runtime in seconds
-
enumerator NVMMH_METRIC_L2_HIT_RATE#
L2 HitRate
-
enumerator NVMMH_METRIC_COMPUTE_S#
Compute time
-
enumerator NVMMH_METRIC_LOAD_S#
Memory load time
-
enumerator NVMMH_METRIC_STORE_S#
Memory store time
-
enumerator NVMMH_METRIC_GMEM_LOAD_BYTES#
Bytes read from global memory
-
enumerator NVMMH_METRIC_GMEM_STORE_BYTES#
Bytes stored to global memory
-
enumerator NVMMH_METRIC_L2_LOAD_BYTES#
Bytes read from L2
-
enumerator NVMMH_METRIC_STATIC_LATENCIES_S#
Static latencies (kernel launch, split-k, etc)
-
enumerator NVMMH_METRIC_SMEM_LOAD_BYTES#
Bytes loaded from shared memory
-
enumerator NVMMH_METRIC_SMEM_STORE_BYTES#
Bytes stored to shared memory
-
enumerator NVMMH_METRIC_ENERGY_JOULES#
Gemm Energy
-
enumerator NVMMH_METRIC_L2_FAR_LOAD_BYTES#
Bytes read from far L2
-
enumerator NVMMH_METRIC_EDP#
EDP (NVMMH_METRIC_ENERGY_JOULES * NVMMH_METRIC_RUNTIME_S)
-
enumerator NVMMH_METRIC_RUNTIME_RELATIVE_FAST_S#
Runtime tuned for relative comparison
-
enumerator NVMMH_METRIC_END#
End Marker
APIs#
-
nvmmhStatus_t nvMatmulHeuristicsCreate(nvmmhHandle_t *handle)#
Create a new nvMatmulHeuristics thread-safe handle. Handles created using this API must be destroyed using nvMatmulHeuristicsDestroy. Multiple host threads may manipulate a given handle through various API calls without explicit synchronization, except for nvMatmulHeuristicsDestroy.
- Parameters:
handle – A pointer to a handle.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
-
nvmmhStatus_t nvMatmulHeuristicsDestroy(nvmmhHandle_t *handle)#
Destroys a handle created using nvMatmulHeuristicsCreate. A single host thread must call this API for a given handle.
- Parameters:
handle – A pointer to the handle to destroy.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
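A minimal lifetime sketch, assuming the header is named nvMatmulHeuristics.h (check your installation for the actual include path):

#include <stdio.h>
#include <nvMatmulHeuristics/nvMatmulHeuristics.h> /* assumed include path */

int main(void) {
    nvmmhHandle_t handle = NULL;
    nvmmhStatus_t status = nvMatmulHeuristicsCreate(&handle);
    if (status != NVMMH_STATUS_SUCCESS) {
        fprintf(stderr, "nvMatmulHeuristicsCreate failed: %s\n",
                nvMatmulHeuristicsGetStatusString(status));
        return 1;
    }
    /* ... issue heuristics queries, possibly from several host threads ... */
    nvMatmulHeuristicsDestroy(&handle); /* single thread, after all other calls */
    return 0;
}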
- const char *nvMatmulHeuristicsGetStatusString(
- const nvmmhStatus_t status
)#
Returns a pointer to a null-terminated string constant describing the status.
- Parameters:
status – The status to describe.
- Returns:
A pointer to a constant null-terminated string. The string should not be freed by the caller.
- unsigned nvMatmulHeuristicsGetLastErrorLog(
- nvmmhHandle_t handle,
- char *buffer,
- unsigned bufferSize
)#
Returns the last error log for the current handle. Writes up to bufferSize bytes including the NULL terminator. If buffer == NULL and bufferSize == 0, then the required buffer size is returned.
- Parameters:
handle – A handle output from nvMatmulHeuristicsCreate.
buffer – output buffer.
bufferSize – output buffer size.
- Returns:
bytes written or required buffer size including NULL terminator.
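A sketch of the two-call sizing pattern, assuming handle was created with nvMatmulHeuristicsCreate and the nvMatmulHeuristics header is included:

#include <stdio.h>
#include <stdlib.h>

static void printLastErrorLog(nvmmhHandle_t handle) {
    unsigned needed = nvMatmulHeuristicsGetLastErrorLog(handle, NULL, 0); /* query size */
    if (needed == 0) return;                 /* nothing to report */
    char *log = (char *)malloc(needed);
    if (!log) return;
    nvMatmulHeuristicsGetLastErrorLog(handle, log, needed); /* fetch the message */
    fprintf(stderr, "nvMatmulHeuristics error log: %s\n", log);
    free(log);
}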
-
unsigned nvMatmulHeuristicsGetVersionMajor()#
Returns the shared library version major
- Returns:
Version major.
-
unsigned nvMatmulHeuristicsGetVersionMinor()#
Returns the shared library version minor
- Returns:
Version minor.
-
unsigned nvMatmulHeuristicsGetVersionPatch()#
Returns the shared library version patch.
- Returns:
Version patch.
- nvmmhStatus_t nvMatmulHeuristicsGetDependencyConfiguration(
- nvmmhDependencyConfiguration_t *hasCUDA
)#
Returns build and runtime info.
- Parameters:
hasCUDA – Indicates whether nvMatmulHeuristics was built with CUDA support and, if so, how nvMatmulHeuristics uses libcuda.so: either dynamic linking or runtime loading.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
- nvmmhStatus_t nvMatmulHeuristicsSetCudaDriverPath(
- nvmmhHandle_t handle,
- const char *path
)#
Specifies the path to the NVIDIA CUDA driver (libcuda.so/nvcuda.dll) that nvMatmulHeuristics will use.
nvMatmulHeuristics looks for the driver in the following order:
1. Path explicitly set using this API
2. Path set using the nvMatmulHeuristics_CUDA_PATH environment variable
3. Default system lookup (LD_PRELOAD, LD_LIBRARY_PATH, …)
Must be called before any other API call, or it will fail with NVMMH_STATUS_DRIVER_ALREADY_INITIALIZED.
- Parameters:
handle – A handle output from nvMatmulHeuristicsCreate.
path – The path to the CUDA driver library, libcuda.so or nvcuda.dll.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
- unsigned nvMatmulHeuristicsGetVersionString(
- char *buffer,
- unsigned bufferSize
)#
Writes the version string into a buffer. Writes up to bufferSize bytes including the NULL terminator. If buffer == NULL and bufferSize == 0, then the required buffer size is returned.
- Parameters:
buffer – output buffer.
bufferSize – output buffer size.
- Returns:
bytes written or required buffer size including NULL terminator.
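For example, the library version can be logged at startup using the string form together with the numeric accessors (assuming the nvMatmulHeuristics header is included):

#include <stdio.h>

static void printNvmmhVersion(void) {
    char version[64]; /* assumed to be large enough for the short version string */
    if (nvMatmulHeuristicsGetVersionString(version, (unsigned)sizeof version) > 0) {
        printf("nvMatmulHeuristics %s (%u.%u.%u)\n", version,
               nvMatmulHeuristicsGetVersionMajor(),
               nvMatmulHeuristicsGetVersionMinor(),
               nvMatmulHeuristicsGetVersionPatch());
    }
}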
-
nvmmhStatus_t nvMatmulHeuristicsClearInternalState()#
Clears nvMatmulHeuristics internal caches and states. This is not thread-safe and must be called by a single host thread.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
- nvmmhStatus_t nvMatmulHeuristicsHardwareDescriptorCreate(
- nvmmhHardwareDescriptor_t *descr
)#
Creates a hardware descriptor and allocates memory. Handles created using this API must be destroyed using nvMatmulHeuristicsHardwareDescriptorDestroy.
- Parameters:
descr – Pointer to the descriptor to create.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
- nvmmhStatus_t nvMatmulHeuristicsHardwareDescriptorDestroy(
- nvmmhHardwareDescriptor_t *descr
)#
Destroys the descriptor and frees memory. The handle is set to NULL to prevent a crash if the API is called twice.
- Parameters:
descr – Pointer to the descriptor to destroy. Set to NULL on output.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
- nvmmhStatus_t nvMatmulHeuristicsHardwareDescriptorSetPredefinedGpu(
- nvmmhHardwareDescriptor_t descr,
- nvmmhNvidiaGpu_t gpu
)#
Sets the hardware descriptor to a predefined GPU configuration.
- Parameters:
descr – Descriptor to set.
gpu – GPU to set the descriptor to.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
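A sketch of the predefined-GPU path; recall that passing NULL instead of a descriptor (automatic detection) is the recommended default. Assumes the nvMatmulHeuristics header is included.

static nvmmhStatus_t describeH100Sxm(nvmmhHardwareDescriptor_t *outDescr) {
    nvmmhStatus_t status = nvMatmulHeuristicsHardwareDescriptorCreate(outDescr);
    if (status != NVMMH_STATUS_SUCCESS) return status;
    status = nvMatmulHeuristicsHardwareDescriptorSetPredefinedGpu(*outDescr,
                                                                  NVMMH_NVGPU_H100_SXM);
    if (status != NVMMH_STATUS_SUCCESS) {
        /* Destroy on failure; the destroy call also resets *outDescr to NULL. */
        nvMatmulHeuristicsHardwareDescriptorDestroy(outDescr);
    }
    return status;
}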
- nvmmhStatus_t nvMatmulHeuristicsBackendCreate(
- nvmmhBackend_t *backend,
- nvmmhTarget_t target
)#
Creates a backend handle for the given target.
- Parameters:
backend – Pointer to the backend handle to create. Must be destroyed using nvMatmulHeuristicsBackendDestroy.
target – nvmmhTarget_t target to create the backend for.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
- nvmmhStatus_t nvMatmulHeuristicsBackendDestroy(
- nvmmhBackend_t *backend
)#
Destroys the backend handle and frees associated memory. Sets the opaque pointer to NULL to prevent double free.
- Parameters:
backend – Pointer to the backend handle to destroy. Set to NULL on output.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
- nvmmhStatus_t nvMatmulHeuristicsBackendSetValueProperty(
- nvmmhBackend_t backend,
- nvmmhBackendProperty_t property,
- const void *inputBuffer,
- unsigned bufferSize
)#
Sets a backend property.
- Parameters:
backend – Pointer to the backend configuration.
property – nvmmhBackendProperty_t The property to set.
inputBuffer – Pointer to the input value that we want to set.
bufferSize – Size of the input value, in bytes.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
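Scalar properties are passed through a typed buffer. A sketch that declares an in-place split-k backend, assuming backend was created with nvMatmulHeuristicsBackendCreate and the nvMatmulHeuristics header is included:

#include <stdint.h>

static nvmmhStatus_t declareInPlaceSplitK(nvmmhBackend_t backend) {
    int32_t kind = NVMMH_SPLIT_K_IN_PLACE; /* this property is documented as int32_t */
    return nvMatmulHeuristicsBackendSetValueProperty(
        backend, NVMMH_BACKEND_PROP_SPLIT_K_KIND, &kind, (unsigned)sizeof kind);
}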
- nvmmhStatus_t nvMatmulHeuristicsBackendGetProperty(
- nvmmhBackend_t backend,
- nvmmhBackendProperty_t property,
- void *inputBuffer,
- unsigned bufferSize
)#
Reads a backend property.
- Parameters:
backend – Pointer to the backend configuration.
property – nvmmhBackendProperty_t The property to read.
inputBuffer – Pointer to the buffer where to store the read value.
bufferSize – Size of the buffer, in bytes.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
- nvmmhStatus_t nvMatmulHeuristicsBackendSetCallbackProperty(
- nvmmhBackend_t backend,
- nvmmhBackendPropertyCallbackKind_t property,
- nvmmhPropertyCallback_t callback
)#
Sets a callback property for the backend. This can be used for example to supply custom methods such as validity checks.
- Parameters:
backend – Pointer to the backend configuration.
property – nvmmhBackendPropertyCallbackKind_t Callback that we want to define.
callback – Function pointer of type nvmmhPropertyCallback_t.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
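For instance, a callback such as the exampleSharedMemoryUsage sketch shown earlier (near nvmmhPropertyCallback_t) could be registered like this, assuming backend was created with nvMatmulHeuristicsBackendCreate:

static nvmmhStatus_t installSmemCallback(nvmmhBackend_t backend) {
    /* exampleSharedMemoryUsage is the illustrative nvmmhPropertyCallback_t sketched above. */
    return nvMatmulHeuristicsBackendSetCallbackProperty(
        backend, NVMMH_CALLBACK_SHARED_MEMORY_USAGE, exampleSharedMemoryUsage);
}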
- nvmmhStatus_t nvMatmulHeuristicsBackendSetSwizzler(
- nvmmhBackend_t backend,
- nvmmhSwizzler_t swizzler,
- nvmmhGridLauncher_t gridLauncher,
- const nvmmhSwizzling_t *supportedConfigs,
- unsigned supportedConfigsCount
)#
Sets a callback to override the default swizzling method or method defined using NVMMH_BACKEND_PROP_CTA_SWIZZLER_BUILTIN_KIND.
- Parameters:
backend – Pointer to the backend configuration.
swizzler – Pointer to the swizzler callback.
gridLauncher – Pointer to the grid launcher callback.
supportedConfigs – Pointer to an array of nvmmhSwizzling_t.
supportedConfigsCount – Number of elements in the supportedConfigs array.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
- nvmmhStatus_t nvMatmulHeuristicsBackendSetPerformanceModel(
- nvmmhBackend_t backend,
- nvmmhPerformanceModel_t model_ptr
)#
Sets a performance model callback on the current backend.
- Parameters:
backend – The backend to set the performance model callback on.
model_ptr – A ptr to the performance model callback or NULL to disable the callback.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
- unsigned nvMatmulHeuristicsGetGemmConfig(
- nvmmhHandle_t handle,
- const char *precision,
- unsigned flags,
- nvmmhTarget_t target,
- const nvmmhMatmulProblem_t *problemIn,
- nvmmhKernelConfiguration_t *kernelConfigOut,
- unsigned requestedConfigurations,
- nvmmhHardwareDescriptor_t hardwareDescriptor
)#
Heuristics entry point. If CUDA is present, the current device is used to gather device info.
- Supported precisions.
nvMatmulHeuristics APIs take as input a string describing the problem precision. Each letter in the precision string corresponds to a specific data type:
H: 16-bit real half precision floating-point
T: 16-bit real bfloat16 floating-point
S: 32-bit real single precision floating-point
C: 64-bit complex number (two single precision floats)
D: 64-bit real double precision floating-point
Z: 128-bit complex number (two double precision floats)
B: 8-bit real unsigned integer
I: 32-bit real signed integer
Q: 8-bit real floating point in E4M3 format
R: 8-bit real floating point in E5M2 format
O: 4-bit floating data type (FP4)
F: Tensor Cores with TF32 compute precision
Three-letter precision strings for D += A * B + C:
First letter: Precision of A & B matrix
Second letter: Compute precision
Third letter: Precision of C & D matrix
Five-letter precision strings for D += A * B + C:
First letter: Precision of A matrix
Second letter: Precision of B matrix
Third letter: Precision of C matrix
Fourth letter: Compute precision
Fifth letter: Precision of D matrix
The following list shows some examples of supported precisions:
”BSS”
”BSB”
”BII”
”HSS”
”HSH”
”HHH”
”TST”
”TSS”
”SFS”
”SSS”
”DDD”
”CCC”
”ZZZ”
”QQTST”
”QQTSQ”
”QQSSS”
”QRTST”
”QRTSQ”
”QRTSR”
”RQTSR”
”RQSSS”
”QQHSH”
”QRHSH”
”RQHSH”
”QQHSQ”
”QRHSQ”
”QRHSR”
”RQHSQ”
”RQHSR”
”QRSSS”
”RQTST”
”RQTSQ”
”OOTST”
”OOTSO”
”OOHSH”
”OOHSO”
”OOSSS”
Warning
This might initialize the CUDA context on the default device, if nvMatmulHeuristics is called before CUDA is initialized.
- Parameters:
handle – A handle output from nvMatmulHeuristicsCreate
precision – Gemm Precision. See Supported precisions.
flags – One or more flags from nvmmhFlags_t or-ed together.
target – Gemm implementation targeted by the heuristic.
problemIn – Input problem.
kernelConfigOut – Buffer where the heuristic will write its results.
requestedConfigurations – size of the output buffer. This is also used to request a certain number of kernel candidates from the heuristic (for auto-tuning, etc.).
hardwareDescriptor – Set to NULL to ignore.
- Returns:
Number of kernel configurations written into the buffer.
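An end-to-end sketch requesting up to 8 HSH (FP16 inputs, FP32 compute, FP16 output) candidates for a 4096x4096x4096 TN problem and printing them. The include path and the M/N/K/layout member names of nvmmhMatmulProblem_t are assumptions for illustration; check the header for the exact definitions.

#include <stdio.h>
#include <string.h>
#include <nvMatmulHeuristics/nvMatmulHeuristics.h> /* assumed include path */

int main(void) {
    nvmmhHandle_t handle = NULL;
    if (nvMatmulHeuristicsCreate(&handle) != NVMMH_STATUS_SUCCESS) return 1;

    nvmmhMatmulProblem_t problem;
    memset(&problem, 0, sizeof problem);
    problem.M = 4096;                                  /* assumed member names */
    problem.N = 4096;
    problem.K = 4096;
    problem.layout = NVMMH_MATMUL_LAYOUT_TN_ROW_MAJOR;

    nvmmhKernelConfiguration_t configs[8];
    unsigned count = nvMatmulHeuristicsGetGemmConfig(
        handle, "HSH", NVMMH_FLAG_NONE, NVMMH_TARGET_CUTLASS3,
        &problem, configs, 8, NULL /* NULL: auto-detect the current GPU */);

    for (unsigned i = 0; i < count; ++i) {
        char text[256];
        nvMatmulHeuristicsKernelConfigurationGetString(&configs[i], text,
                                                       (unsigned)sizeof text);
        printf("candidate %u: %s\n", i, text);
    }

    nvMatmulHeuristicsDestroy(&handle);
    return 0;
}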
- unsigned nvMatmulHeuristicsGetGemmConfigEx(
- nvmmhHandle_t handle,
- const char *precision,
- unsigned flags,
- nvmmhBackend_t backend,
- const nvmmhMatmulProblem_t *problemIn,
- nvmmhKernelConfiguration_t *kernelConfigOut,
- unsigned requestedConfigurations,
- nvmmhHardwareDescriptor_t hardwareDescriptor
)#
Heuristics entry point. If CUDA is present, the current device is used to gather device info.
Warning
This might initialize the CUDA context on the default device, if nvMatmulHeuristics is called before CUDA is initialized
- Parameters:
handle – A handle output from nvMatmulHeuristicsCreate
precision – Gemm Precision. See Supported precisions.
flags – One or more flags from nvmmhFlags_t or-ed together.
backend – User-defined backend.
problemIn – Input problem.
kernelConfigOut – Buffer where the heuristic will write its results.
requestedConfigurations – size of the output buffer. This is also used to request a certain number of kernel candidates from the heuristic (for auto-tuning, etc.).
hardwareDescriptor – Set to NULL to ignore.
- Returns:
Number of kernel configurations written into the buffer.
- nvmmhStatus_t nvMatmulHeuristicsGetGemmConfigWithBounds(
- nvmmhHandle_t handle,
- const char *precision,
- unsigned flags,
- nvmmhTarget_t target,
- const nvmmhMatmulProblem_t *problemsIn,
- const nvmmhKernelConfiguration_t *kernelConfigAllowListIn,
- unsigned allowListEltCount,
- int *selectedKernelConfigIndicesOut,
- unsigned reqConfigCount,
- nvmmhHardwareDescriptor_t hardwareDescriptor
)#
Runs the heuristic and finds the closest kernel configurations from the allow list.
Warning
Experimental.
- Parameters:
handle – A handle output from nvMatmulHeuristicsCreate
precision – Gemm Precision. See Supported precisions.
flags – One or more flags from nvmmhFlags_t or-ed together.
target – Gemm implementation targeted by the heuristic.
problemsIn – Input problems.
kernelConfigAllowListIn – Pointer to an array of nvmmhKernelConfiguration_t.
allowListEltCount – Number of elements in the kernelConfigAllowListIn array.
selectedKernelConfigIndicesOut – Pointer to an array of indices of the selected kernel configurations.
reqConfigCount – Number of kernel configurations to select.
hardwareDescriptor – Hardware descriptor. Set to NULL to use the current device.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
- nvmmhStatus_t nvMatmulHeuristicsOptimizeGemmConfig(
- nvmmhHandle_t handle,
- const char *precision,
- unsigned flags,
- nvmmhTarget_t target,
- const nvmmhMatmulProblem_t *problemIn,
- nvmmhKernelConfiguration_t *kernelConfigInOut,
- unsigned propertyAllowMask,
- nvmmhHardwareDescriptor_t hardwareDescriptor
)#
Starts the heuristics with the user-supplied kernel configuration.
Warning
Experimental.
- Parameters:
handle – A handle output from nvMatmulHeuristicsCreate.
precision – Gemm Precision. See Supported precisions.
flags – One or more flags from nvmmhFlags_t or-ed together.
target – Gemm implementation targeted by the heuristic.
problemIn – Input problem.
kernelConfigInOut – Buffer where the heuristic will write its results.
propertyAllowMask – OR-ed nvmmhKernelConfigurationPropertyMask_t values that indicate which kernel configuration settings can be optimized.
hardwareDescriptor – Hardware descriptor. Set to NULL to use the current device.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
- nvmmhStatus_t nvMatmulHeuristicsOptimizeGemmConfigEx(
- nvmmhHandle_t handle,
- const char *precisionStr,
- unsigned flags,
- nvmmhBackend_t backend,
- const nvmmhMatmulProblem_t *problemIn,
- nvmmhKernelConfiguration_t *kernelConfigInOut,
- unsigned propertyAllowMask,
- nvmmhHardwareDescriptor_t hardwareDescriptor
)#
Starts the heuristics with the user-supplied kernel configuration.
Warning
Experimental.
- Parameters:
handle – A handle output from nvMatmulHeuristicsCreate.
precisionStr – Gemm Precision. See Supported precisions.
flags – One or more flags from nvmmhFlags_t or-ed together.
backend – Backend.
problemIn – Input problem.
kernelConfigInOut – Buffer where the heuristic will write its results.
propertyAllowMask – OR-ed nvmmhKernelConfigurationPropertyMask_t values that indicate which kernel configuration settings can be optimized.
hardwareDescriptor – Hardware descriptor. Set to NULL to use the current device.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
- unsigned nvMatmulHeuristicsKernelConfigurationGetString(
- const nvmmhKernelConfiguration_t *kernelConfiguration,
- char *outputBuffer,
- unsigned bufferSize
)#
Formats the kernel configuration into a null-terminated string and writes it into the output buffer. Writes up to bufferSize bytes (null terminator included). If buffer is NULL and bufferSize == 0, the function returns the required buffer size including the null terminator.
- Parameters:
kernelConfiguration – The kernel configuration to convert.
outputBuffer – Output buffer.
bufferSize – Buffer size, in bytes.
- Returns:
Number of bytes written into buffer or the required buffer length if buffer and bufferSize are NULL.
- unsigned nvMatmulHeuristicsMatmulProblemGetString(
- const nvmmhMatmulProblem_t *matmulProblem,
- char *outputBuffer,
- unsigned bufferSize
)#
Formats the matmul problem into a null-terminated string and writes it into the output buffer. Writes up to bufferSize bytes (null terminator included). If buffer is NULL and bufferSize == 0, the function returns the required buffer size including the null terminator.
- Parameters:
matmulProblem – The matmul problem to convert.
outputBuffer – Output buffer.
bufferSize – Buffer size, in bytes.
- Returns:
number of bytes written into buffer OR the required buffer length if buffer and bufferSize are NULL.
- unsigned nvMatmulHeuristicsKernelConfigurationToConstructorString(
- const nvmmhKernelConfiguration_t *kernelConfiguration,
- char *outputBuffer,
- unsigned bufferSize
)#
Generates a “Constructor String” that can be used to emit nvMatmulHeuristics API calls. The output is a null-terminated string written into the output buffer. Writes up to bufferSize bytes (null terminator included). If buffer is NULL and bufferSize == 0, the function returns the required buffer size including the null terminator.
- Parameters:
kernelConfiguration – The kernel configuration to convert.
outputBuffer – Output buffer.
bufferSize – Buffer size, in bytes.
- Returns:
Number of bytes written into buffer or the required buffer length if buffer and bufferSize are NULL.
- unsigned nvMatmulHeuristicsMatmulProblemToConstructorString(
- const nvmmhMatmulProblem_t *matmulProblem,
- char *outputBuffer,
- unsigned bufferSize
)#
Generates a “Constructor String” that can be used to emit nvMatmulHeuristics API calls. The output is a null-terminated string written into the output buffer. Writes up to bufferSize bytes (null terminator included). If buffer is NULL and bufferSize == 0, the function returns the required buffer size including the null terminator.
- Parameters:
matmulProblem – The matmul problem to convert.
outputBuffer – Output buffer.
bufferSize – Buffer size, in bytes.
- Returns:
Number of bytes written into buffer or the required buffer length if buffer and bufferSize are NULL.
- unsigned nvMatmulHeuristicsGetDiscoverySet(
- nvmmhHandle_t handle,
- const char *precisionStr,
- nvmmhTarget_t target,
- nvmmhMatmulLayout_t matmulLayout,
- nvmmhMatmulProblem_t *problemsOut,
- nvmmhKernelConfiguration_t *kernelConfigsOut,
- unsigned bufferSize,
- nvmmhHardwareDescriptor_t hardwareDescriptor
)#
Returns a set of problems and kernels to be executed by the target implementation. Each problem must be executed with the matching kernel. The benchmark must be redone for each kernel implementation, backend, GPU, and clock setting. This is optional.
On multi-device systems, this needs to be executed on each device. The device used is the one returned by cudaGetDevice().
If problemsOut and kernelConfigsOut are NULL and bufferSize is 0, then the required buffer size is returned.
- Parameters:
handle – A handle output from nvMatmulHeuristicsCreate.
precisionStr – Gemm Precision. See Supported precisions.
target – Gemm implementation targeted by the heuristic.
matmulLayout – Layout of the matmul problem.
problemsOut – Pointer to an array of nvmmhMatmulProblem_t.
kernelConfigsOut – Pointer to an array of nvmmhKernelConfiguration_t.
bufferSize – Number of elements in the output arrays.
hardwareDescriptor – Hardware descriptor. Set to NULL to use the current device.
- Returns:
The size of the discovery set.
- unsigned nvMatmulHeuristicsGetEnergyDiscoverySet(
- nvmmhHandle_t handle,
- const char *precisionStr,
- nvmmhTarget_t target,
- nvmmhMatmulLayout_t matmulLayout,
- nvmmhMatmulProblem_t *problemsOut,
- nvmmhKernelConfiguration_t *kernelConfigsOut,
- unsigned bufferSize,
- nvmmhHardwareDescriptor_t hardwareDescriptor
)#
Returns a discovery set for energy estimation.
If problemsOut and kernelConfigsOut are NULL and bufferSize is 0, then the required buffer size is returned.
- Parameters:
handle – A handle output from nvMatmulHeuristicsCreate.
precisionStr – Gemm Precision. See Supported precisions.
target – Gemm implementation targeted by the heuristic.
matmulLayout – Layout of the matmul problem.
problemsOut – Pointer to an array of nvmmhMatmulProblem_t.
kernelConfigsOut – Pointer to an array of nvmmhKernelConfiguration_t.
bufferSize – Number of elements in the output arrays.
hardwareDescriptor – Hardware descriptor. Set to NULL to use the current device.
- Returns:
The size of the discovery set.
- nvmmhStatus_t nvMatmulHeuristicsCommitDiscoveryResults(
- nvmmhHandle_t handle,
- const char *precisionStr,
- nvmmhTarget_t target,
- nvmmhMatmulLayout_t matmulLayout,
- const nvmmhMatmulProblem_t *problemsIn,
- const nvmmhKernelConfiguration_t *kernelConfigsIn,
- const float *runtimesIn,
- unsigned bufferSize,
- nvmmhHardwareDescriptor_t hardwareDescriptor
)#
Sends the discovery results back to nvMatmulHeuristics for processing. On multi-device systems, this needs to be executed on each device. The device used is the one returned by cudaGetDevice().
- Parameters:
handle – A handle output from nvMatmulHeuristicsCreate.
precisionStr – Gemm Precision. See Supported precisions.
target – will set the profile for the given target globally.
matmulLayout – Layout of the matmul problem.
problemsIn – Pointer to an array of nvmmhMatmulProblem_t.
kernelConfigsIn – Pointer to an array of nvmmhKernelConfiguration_t.
runtimesIn – Config runtimes. Cases where the runtime <=0 are skipped.
bufferSize – Number of elements in the input arrays.
hardwareDescriptor – Hardware descriptor. Set to NULL to use the current device.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
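A sketch of the full discovery loop: size the set, benchmark each (problem, kernel) pair with the target implementation, and commit the measured runtimes. runBenchmarkSeconds is a hypothetical user-provided function; entries with runtime <= 0 are skipped by the commit call. Assumes the nvMatmulHeuristics header is included.

#include <stdlib.h>

/* Hypothetical user-provided benchmark returning a runtime in seconds. */
extern float runBenchmarkSeconds(const nvmmhMatmulProblem_t *problem,
                                 const nvmmhKernelConfiguration_t *config);

static void runDiscovery(nvmmhHandle_t handle) {
    const char *precision = "HSH";
    const nvmmhTarget_t target = NVMMH_TARGET_CUTLASS3;
    const nvmmhMatmulLayout_t layout = NVMMH_MATMUL_LAYOUT_TN_ROW_MAJOR;

    /* First call with NULL buffers and size 0 returns the required element count. */
    unsigned n = nvMatmulHeuristicsGetDiscoverySet(handle, precision, target, layout,
                                                   NULL, NULL, 0, NULL);
    if (n == 0) return;

    nvmmhMatmulProblem_t *problems = malloc(n * sizeof *problems);
    nvmmhKernelConfiguration_t *configs = malloc(n * sizeof *configs);
    float *runtimes = malloc(n * sizeof *runtimes);
    if (problems && configs && runtimes) {
        nvMatmulHeuristicsGetDiscoverySet(handle, precision, target, layout,
                                          problems, configs, n, NULL);
        for (unsigned i = 0; i < n; ++i)
            runtimes[i] = runBenchmarkSeconds(&problems[i], &configs[i]);
        nvMatmulHeuristicsCommitDiscoveryResults(handle, precision, target, layout,
                                                 problems, configs, runtimes, n, NULL);
    }
    free(problems); free(configs); free(runtimes);
}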
- nvmmhStatus_t nvMatmulHeuristicsCommitEnergyDiscoveryResults(
- nvmmhHandle_t handle,
- const char *precision,
- nvmmhTarget_t target,
- nvmmhMatmulLayout_t matmulLayout,
- const nvmmhMatmulProblem_t *problemsIn,
- const nvmmhKernelConfiguration_t *kernelConfigsIn,
- const float *energy_joules,
- unsigned bufferSize,
- nvmmhHardwareDescriptor_t hardwareDescriptor
)#
Sends the energy discovery results back to nvMatmulHeuristics for processing and internal weight tuning. On multi-device systems, this needs to be executed on each device. The device used is the one returned by cudaGetDevice().
- Parameters:
handle – A handle output from nvMatmulHeuristicsCreate.
precision – Gemm Precision. See Supported precisions.
target – will set the profile for the given target globally.
matmulLayout – Layout of the matmul problem.
problemsIn – Pointer to an array of nvmmhMatmulProblem_t.
kernelConfigsIn – Pointer to an array of nvmmhKernelConfiguration_t.
energy_joules – Config energy in Joules. Cases where the energy <=0 are skipped.
bufferSize – Number of elements in the input arrays.
hardwareDescriptor – Hardware descriptor. Set to NULL to use the current device.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
- nvmmhStatus_t nvMatmulHeuristicsLoadInternalDiscoverySet(
- nvmmhHandle_t handle,
- const char *precisionStr,
- nvmmhTarget_t target,
- nvmmhMatmulLayout_t matmulLayout,
- nvmmhHardwareDescriptor_t hardwareDescriptor
)#
Loads internal discovery results to improve heuristics results. This needs to be called once, manually, for each configuration used. Behavior is equivalent to nvMatmulHeuristicsGetDiscoverySet followed by nvMatmulHeuristicsCommitDiscoveryResults, but this variant uses internal data to avoid having to run silicon benchmarks. On multi-device systems, this needs to be executed on each device. The device used is the one returned by cudaGetDevice().
- Parameters:
handle – A handle output from nvMatmulHeuristicsCreate.
precisionStr – Gemm Precision. See Supported precisions.
target – will load and set the profile for the given target globally
matmulLayout – Layout of the matmul problem.
hardwareDescriptor – Set to NULL to ignore.
- Returns:
NVMMH_STATUS_SUCCESS on success. Otherwise NVMMH_STATUS_ERROR or NVMMH_STATUS_PROFILE_NOT_ENTIRELY_LOADED.
- double nvMatmulHeuristicsEstimateRuntime(
- nvmmhHandle_t handle,
- const char *precision,
- nvmmhTarget_t target,
- const nvmmhMatmulProblem_t *problemIn,
- const nvmmhKernelConfiguration_t *kernelConfigIn,
- nvmmhHardwareDescriptor_t hardwareDescriptor
)#
Returns the expected runtime in seconds. Uses an internal performance model that relies on:
1. The discovery process
2. Runtime CUDA info
3. Internal defaults
4. Target/backend information
- Parameters:
handle – A handle output from nvMatmulHeuristicsCreate.
precision – Gemm Precision. See Supported precisions.
target – Required so nvMatmulHeuristics can match the request to one of the internal profiles if the Discovery Process was used.
problemIn – Input problem.
kernelConfigIn – Input kernel configuration.
hardwareDescriptor – Hardware descriptor. Set to NULL to use the current device.
- Returns:
Expected runtime in seconds.
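For example, two candidate configurations can be ranked with the model (assuming handle, the problem, and the configurations come from the nvMatmulHeuristicsGetGemmConfig sketch above):

static const nvmmhKernelConfiguration_t *pickFaster(nvmmhHandle_t handle,
                                                    const nvmmhMatmulProblem_t *problem,
                                                    const nvmmhKernelConfiguration_t *a,
                                                    const nvmmhKernelConfiguration_t *b) {
    double ta = nvMatmulHeuristicsEstimateRuntime(handle, "HSH", NVMMH_TARGET_CUTLASS3,
                                                  problem, a, NULL);
    double tb = nvMatmulHeuristicsEstimateRuntime(handle, "HSH", NVMMH_TARGET_CUTLASS3,
                                                  problem, b, NULL);
    return (ta <= tb) ? a : b; /* lower estimated runtime wins */
}

For ranking kernels against each other, estimating NVMMH_METRIC_RUNTIME_RELATIVE_FAST_S via nvMatmulHeuristicsEstimateSiliconMetricEx is recommended, as noted in the nvmmhSiliconMetric_t description above.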
- double nvMatmulHeuristicsEstimateSiliconMetricEx(
- nvmmhHandle_t handle,
- const char *precisionStr,
- nvmmhBackend_t backend,
- const nvmmhMatmulProblem_t *problemIn,
- const nvmmhKernelConfiguration_t *kernelConfigIn,
- nvmmhSiliconMetric_t metric,
- nvmmhHardwareDescriptor_t hardwareDescriptor
)#
Estimates a silicon metric.
- Parameters:
handle – A handle output from nvMatmulHeuristicsCreate.
precisionStr – Gemm Precision. See Supported precisions.
backend – The backend.
problemIn – Input matmul problem.
kernelConfigIn – Input kernel configuration.
metric – What metric to estimate.
hardwareDescriptor – Hardware descriptor. Set to NULL to use the current device.
- Returns:
The estimated metric. 0 on error.
- nvmmhStatus_t nvMatmulHeuristicsGetSymbolPointer(
- const char *symbolName,
- void **pointer
)#
Get a pointer to a symbol in the nvMatmulHeuristics shared library.
- Parameters:
symbolName – Name of the symbol to get.
pointer – Output pointer that receives the address of the symbol.
- Returns:
NVMMH_STATUS_SUCCESS if the operation was successful, an error code otherwise.
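A sketch of looking up an entry point by name; casting the returned object pointer to a function pointer is the usual pattern for this kind of lookup. Assumes the nvMatmulHeuristics header is included.

#include <stdio.h>

static void printMajorViaSymbolLookup(void) {
    void *sym = NULL;
    if (nvMatmulHeuristicsGetSymbolPointer("nvMatmulHeuristicsGetVersionMajor", &sym) ==
            NVMMH_STATUS_SUCCESS && sym != NULL) {
        unsigned (*getMajor)(void) = (unsigned (*)(void))sym;
        printf("nvMatmulHeuristics major version: %u\n", getMajor());
    }
}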