5.19. NvLink Methods
This chapter describes methods that NVML can perform on NVLINK enabled devices.
Classes
Defines
- #define NVML_NVLINK_BER_EXP_SHIFT 0
- Shift for NVLink BER exponent.
- #define NVML_NVLINK_BER_EXP_WIDTH 0xff
- Width for NVLink BER exponent.
- #define NVML_NVLINK_BER_MANTISSA_SHIFT 8
- Shift for NVLink BER mantissa.
- #define NVML_NVLINK_BER_MANTISSA_WIDTH 0xf
- Width for NVLink BER mantissa.
- #define NVML_NVLINK_ERROR_COUNTER_BER_GET ( var, type )
- #define NVML_NVLINK_FIRMWARE_UCODE_TYPE_MSE 0x1
- MSE ucode type.
- #define NVML_NVLINK_FIRMWARE_UCODE_TYPE_NETIR 0x2
- NETIR ucode type.
- #define NVML_NVLINK_FIRMWARE_UCODE_TYPE_NETIR_CLN 0x4
- NETIR CLN ucode type.
- #define NVML_NVLINK_FIRMWARE_UCODE_TYPE_NETIR_DLN 0x5
- NETIR DLN ucode type.
- #define NVML_NVLINK_FIRMWARE_UCODE_TYPE_NETIR_UPHY 0x3
- NETIR UPHY ucode type.
- #define NVML_NVLINK_FIRMWARE_VERSION_LENGTH 100
- Length of firmware version string.
- #define NVML_NVLINK_STATE_ACTIVE 0x1
- NVLink is active.
- #define NVML_NVLINK_STATE_INACTIVE 0x0
- NVLink is inactive.
- #define NVML_NVLINK_STATE_SLEEP 0x2
- NVLink is in sleep state.
- #define NVML_NVLINK_TOTAL_SUPPORTED_BW_MODES 23
- Total supported NVLink bandwidth modes.
- #define nvmlNvLinkInfo_v1
- Version macro for nvmlNvLinkInfo_v1_t.
- #define nvmlNvLinkInfo_v2
- Version macro for nvmlNvLinkInfo_v2_t.
- #define nvmlNvlinkGetBwMode_v1
- Version macro for nvmlNvlinkGetBwMode_v1_t.
- #define nvmlNvlinkSetBwMode_v1
- Version macro for nvmlNvlinkSetBwMode_v1_t.
- #define nvmlNvlinkSupportedBwModes_v1
- Version macro for nvmlNvlinkSupportedBwModes_v1_t.
Functions
- nvmlReturn_t nvmlDeviceFreezeNvLinkUtilizationCounter ( nvmlDevice_t device, unsigned int link, unsigned int counter, nvmlEnableState_t freeze )
- nvmlReturn_t nvmlDeviceGetNvLinkCapability ( nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult )
- nvmlReturn_t nvmlDeviceGetNvLinkErrorCounter ( nvmlDevice_t device, unsigned int link, nvmlNvLinkErrorCounter_t counter, unsigned long long* counterValue )
- nvmlReturn_t nvmlDeviceGetNvLinkInfo ( nvmlDevice_t device, nvmlNvLinkInfo_t* info )
- nvmlReturn_t nvmlDeviceGetNvLinkRemoteDeviceType ( nvmlDevice_t device, unsigned int link, nvmlIntNvLinkDeviceType_t* pNvLinkDeviceType )
- nvmlReturn_t nvmlDeviceGetNvLinkRemotePciInfo_v2 ( nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci )
- nvmlReturn_t nvmlDeviceGetNvLinkState ( nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive )
- nvmlReturn_t nvmlDeviceGetNvLinkUtilizationControl ( nvmlDevice_t device, unsigned int link, unsigned int counter, nvmlNvLinkUtilizationControl_t* control )
- nvmlReturn_t nvmlDeviceGetNvLinkUtilizationCounter ( nvmlDevice_t device, unsigned int link, unsigned int counter, unsigned long long* rxcounter, unsigned long long* txcounter )
- nvmlReturn_t nvmlDeviceGetNvLinkVersion ( nvmlDevice_t device, unsigned int link, unsigned int* version )
- nvmlReturn_t nvmlDeviceGetNvlinkBwMode ( nvmlDevice_t device, nvmlNvlinkGetBwMode_t* getBwMode )
- nvmlReturn_t nvmlDeviceGetNvlinkSupportedBwModes ( nvmlDevice_t device, nvmlNvlinkSupportedBwModes_t* supportedBwMode )
- nvmlReturn_t nvmlDeviceResetNvLinkErrorCounters ( nvmlDevice_t device, unsigned int link )
- nvmlReturn_t nvmlDeviceResetNvLinkUtilizationCounter ( nvmlDevice_t device, unsigned int link, unsigned int counter )
- nvmlReturn_t nvmlDeviceSetNvLinkDeviceLowPowerThreshold ( nvmlDevice_t device, nvmlNvLinkPowerThres_t* info )
- nvmlReturn_t nvmlDeviceSetNvLinkUtilizationControl ( nvmlDevice_t device, unsigned int link, unsigned int counter, nvmlNvLinkUtilizationControl_t* control, unsigned int reset )
- nvmlReturn_t nvmlDeviceSetNvlinkBwMode ( nvmlDevice_t device, nvmlNvlinkSetBwMode_t* setBwMode )
- nvmlReturn_t nvmlSystemGetNvlinkBwMode ( unsigned int* nvlinkBwMode )
- nvmlReturn_t nvmlSystemSetNvlinkBwMode ( unsigned int nvlinkBwMode )
Defines
- #define NVML_NVLINK_BER_EXP_SHIFT 0
-
- #define NVML_NVLINK_BER_EXP_WIDTH 0xff
-
- #define NVML_NVLINK_BER_MANTISSA_SHIFT 8
-
- #define NVML_NVLINK_BER_MANTISSA_WIDTH 0xf
-
- #define NVML_NVLINK_ERROR_COUNTER_BER_GET ( var, type )
-
The NVLink error counter BER fields can be extracted using the macros below. Example: NVML_NVLINK_ERROR_COUNTER_BER_GET(var, BER_MANTISSA)
Value
(((var) >> NVML_NVLINK_##type##_SHIFT) & (NVML_NVLINK_##type##_WIDTH))
- #define NVML_NVLINK_FIRMWARE_UCODE_TYPE_MSE 0x1
-
- #define NVML_NVLINK_FIRMWARE_UCODE_TYPE_NETIR 0x2
-
- #define NVML_NVLINK_FIRMWARE_UCODE_TYPE_NETIR_CLN 0x4
-
- #define NVML_NVLINK_FIRMWARE_UCODE_TYPE_NETIR_DLN 0x5
-
- #define NVML_NVLINK_FIRMWARE_UCODE_TYPE_NETIR_UPHY 0x3
-
- #define NVML_NVLINK_FIRMWARE_VERSION_LENGTH 100
-
- #define NVML_NVLINK_STATE_ACTIVE 0x1
-
- #define NVML_NVLINK_STATE_INACTIVE 0x0
-
- #define NVML_NVLINK_STATE_SLEEP 0x2
-
- #define NVML_NVLINK_TOTAL_SUPPORTED_BW_MODES 23
-
- #define nvmlNvLinkInfo_v1
-
Value
NVML_STRUCT_VERSION(NvLinkInfo, 1)
- #define nvmlNvLinkInfo_v2
-
Value
NVML_STRUCT_VERSION(NvLinkInfo, 2)
- #define nvmlNvlinkGetBwMode_v1
-
Value
NVML_STRUCT_VERSION(NvlinkGetBwMode, 1)
- #define nvmlNvlinkSetBwMode_v1
-
Value
NVML_STRUCT_VERSION(NvlinkSetBwMode, 1)
- #define nvmlNvlinkSupportedBwModes_v1
-
Value
NVML_STRUCT_VERSION(NvlinkSupportedBwModes, 1)
Functions
- nvmlReturn_t nvmlDeviceFreezeNvLinkUtilizationCounter ( nvmlDevice_t device, unsigned int link, unsigned int counter, nvmlEnableState_t freeze )
-
Parameters
- device
- The identifier of the target device
- link
- Specifies the NvLink link to be queried
- counter
- Specifies the counter that should be frozen (0 or 1).
- freeze
- NVML_FEATURE_ENABLED = freeze the receive and transmit counters; NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters
Returns
- NVML_SUCCESS if counters were successfully frozen or unfrozen
- NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- NVML_ERROR_INVALID_ARGUMENT if device, link, counter, or freeze is invalid
- NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- NVML_ERROR_UNKNOWN on any unexpected error
Deprecated
Freezing NVLINK utilization counters is no longer supported.
Description
Freeze the NVLINK utilization counters. Both the receive and transmit counters are operated on by this function.
For Pascal or newer fully supported devices.
- nvmlReturn_t nvmlDeviceGetNvLinkCapability ( nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int* capResult )
-
Parameters
- device
- The identifier of the target device
- link
- Specifies the NvLink link to be queried
- capability
- Specifies the nvmlNvLinkCapability_t to be queried
- capResult
- A boolean for the queried capability indicating that feature is available
Returns
- NVML_SUCCESS if capResult has been set
- NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- NVML_ERROR_INVALID_ARGUMENT if device, link, or capability is invalid or capResult is NULL
- NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- NVML_ERROR_UNKNOWN on any unexpected error
Description
Retrieves the requested capability from the device's NvLink for the link specified. Please refer to the nvmlNvLinkCapability_t structure for the specific caps that can be queried. The return value should be treated as a boolean.
For Pascal or newer fully supported devices.
- nvmlReturn_t nvmlDeviceGetNvLinkErrorCounter ( nvmlDevice_t device, unsigned int link, nvmlNvLinkErrorCounter_t counter, unsigned long long* counterValue )
-
Parameters
- device
- The identifier of the target device
- link
- Specifies the NvLink link to be queried
- counter
- Specifies the NvLink counter to be queried
- counterValue
- Returned counter value
Returns
- NVML_SUCCESS if counter has been set
- NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- NVML_ERROR_INVALID_ARGUMENT if device, link, or counter is invalid or counterValue is NULL
- NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- NVML_ERROR_UNKNOWN on any unexpected error
Description
Retrieves the specified error counter value. Please refer to nvmlNvLinkErrorCounter_t for the error counters that are available.
For Pascal or newer fully supported devices.
- nvmlReturn_t nvmlDeviceGetNvLinkInfo ( nvmlDevice_t device, nvmlNvLinkInfo_t* info )
-
Parameters
- device
- The identifier of the target device
- info
- Reference to nvmlNvLinkInfo_t
Returns
- NVML_SUCCESS if the query is successful
- NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- NVML_ERROR_INVALID_ARGUMENT if device is invalid, or info is NULL
- NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version is invalid/unsupported
- NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
- NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- NVML_ERROR_UNKNOWN on any unexpected error
Description
Query NVLINK information associated with this device.
- nvmlReturn_t nvmlDeviceGetNvLinkRemoteDeviceType ( nvmlDevice_t device, unsigned int link, nvmlIntNvLinkDeviceType_t* pNvLinkDeviceType )
-
Parameters
- device
- The device handle of the target GPU
- link
- The NVLink link index on the target GPU
- pNvLinkDeviceType
- Pointer in which the output remote device type is returned
Returns
- NVML_SUCCESS if pNvLinkDeviceType has been set
- NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- NVML_ERROR_NOT_SUPPORTED if NVLink is not supported
- NVML_ERROR_INVALID_ARGUMENT if device or link is invalid, or pNvLinkDeviceType is NULL
- NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
- NVML_ERROR_UNKNOWN on any unexpected error
Description
Get the NVLink device type of the remote device connected over the given link.
- nvmlReturn_t nvmlDeviceGetNvLinkRemotePciInfo_v2 ( nvmlDevice_t device, unsigned int link, nvmlPciInfo_t* pci )
-
Parameters
- device
- The identifier of the target device
- link
- Specifies the NvLink link to be queried
- pci
- nvmlPciInfo_t of the remote node for the specified link
Returns
- NVML_SUCCESS if pci has been set
- NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- NVML_ERROR_INVALID_ARGUMENT if device or link is invalid or pci is NULL
- NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- NVML_ERROR_UNKNOWN on any unexpected error
Description
Retrieves the PCI information for the remote node on an NvLink link. Note: pciSubSystemId is not filled in by this function and is indeterminate.
For Pascal or newer fully supported devices.
- nvmlReturn_t nvmlDeviceGetNvLinkState ( nvmlDevice_t device, unsigned int link, nvmlEnableState_t* isActive )
-
Parameters
- device
- The identifier of the target device
- link
- Specifies the NvLink link to be queried
- isActive
- nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that the link is active and NVML_FEATURE_DISABLED indicates it is inactive
Returns
- NVML_SUCCESS if isActive has been set
- NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- NVML_ERROR_INVALID_ARGUMENT if device or link is invalid or isActive is NULL
- NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- NVML_ERROR_UNKNOWN on any unexpected error
Description
Retrieves the state of the device's NvLink for the link specified
For Pascal or newer fully supported devices.
- nvmlReturn_t nvmlDeviceGetNvLinkUtilizationControl ( nvmlDevice_t device, unsigned int link, unsigned int counter, nvmlNvLinkUtilizationControl_t* control )
-
Parameters
- device
- The identifier of the target device
- link
- Specifies the NvLink link to be queried
- counter
- Specifies the counter that should be queried (0 or 1).
- control
- A reference to the nvmlNvLinkUtilizationControl_t to place information
Returns
- NVML_SUCCESS if the control has been set successfully
- NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- NVML_ERROR_INVALID_ARGUMENT if device, counter, link, or control is invalid
- NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- NVML_ERROR_UNKNOWN on any unexpected error
Deprecated
Getting utilization counter control is no longer supported.
Description
Get the NVLINK utilization counter control information for the specified counter, 0 or 1. Please refer to nvmlNvLinkUtilizationControl_t for the structure definition
For Pascal or newer fully supported devices.
- nvmlReturn_t nvmlDeviceGetNvLinkUtilizationCounter ( nvmlDevice_t device, unsigned int link, unsigned int counter, unsigned long long* rxcounter, unsigned long long* txcounter )
-
Parameters
- device
- The identifier of the target device
- link
- Specifies the NvLink link to be queried
- counter
- Specifies the counter that should be read (0 or 1).
- rxcounter
- Receive counter return value
- txcounter
- Transmit counter return value
Returns
- NVML_SUCCESS if rxcounter and txcounter have been successfully set
- NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- NVML_ERROR_INVALID_ARGUMENT if device, counter, or link is invalid or rxcounter or txcounter are NULL
- NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- NVML_ERROR_UNKNOWN on any unexpected error
Deprecated
Use nvmlDeviceGetFieldValues with NVML_FI_DEV_NVLINK_THROUGHPUT_* as field values instead.
Description
Retrieve the NVLINK utilization counter based on the current control for a specified counter. In general it is good practice to use nvmlDeviceSetNvLinkUtilizationControl before reading the utilization counters, as they have no default state.
For Pascal or newer fully supported devices.
- nvmlReturn_t nvmlDeviceGetNvLinkVersion ( nvmlDevice_t device, unsigned int link, unsigned int* version )
-
Parameters
- device
- The identifier of the target device
- link
- Specifies the NvLink link to be queried
- version
- Requested NvLink version from nvmlNvlinkVersion_t
Returns
- NVML_SUCCESS if version has been set
- NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- NVML_ERROR_INVALID_ARGUMENT if device or link is invalid or version is NULL
- NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- NVML_ERROR_UNKNOWN on any unexpected error
Description
Retrieves the version of the device's NvLink for the link specified
For Pascal or newer fully supported devices.
- nvmlReturn_t nvmlDeviceGetNvlinkBwMode ( nvmlDevice_t device, nvmlNvlinkGetBwMode_t* getBwMode )
-
Parameters
- device
- The identifier of the target device
- getBwMode
- Reference to nvmlNvlinkGetBwMode_t
Returns
- NVML_SUCCESS if the query was successful
- NVML_ERROR_INVALID_ARGUMENT if device is invalid or getBwMode is NULL
- NVML_ERROR_NOT_SUPPORTED if this feature is not supported by the device
- NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version specified is not supported
Description
Get the NvLink Reduced Bandwidth Mode for the device
For Blackwell or newer fully supported devices.
- nvmlReturn_t nvmlDeviceGetNvlinkSupportedBwModes ( nvmlDevice_t device, nvmlNvlinkSupportedBwModes_t* supportedBwMode )
-
Parameters
- device
- The identifier of the target device
- supportedBwMode
- Reference to nvmlNvlinkSupportedBwModes_t
Returns
- NVML_SUCCESS if the query was successful
- NVML_ERROR_INVALID_ARGUMENT if device is invalid or supportedBwMode is NULL
- NVML_ERROR_NOT_SUPPORTED if this feature is not supported by the device
- NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version specified is not supported
Description
Get the supported NvLink Reduced Bandwidth Modes of the device
For Blackwell or newer fully supported devices.
- nvmlReturn_t nvmlDeviceResetNvLinkErrorCounters ( nvmlDevice_t device, unsigned int link )
-
Parameters
- device
- The identifier of the target device
- link
- Specifies the NvLink link to be queried
Returns
- NVML_SUCCESS if the reset is successful
- NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- NVML_ERROR_INVALID_ARGUMENT if device or link is invalid
- NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- NVML_ERROR_UNKNOWN on any unexpected error
Description
Resets all error counters to zero. Please refer to nvmlNvLinkErrorCounter_t for the list of error counters that are reset.
For Pascal or newer fully supported devices.
- nvmlReturn_t nvmlDeviceResetNvLinkUtilizationCounter ( nvmlDevice_t device, unsigned int link, unsigned int counter )
-
Parameters
- device
- The identifier of the target device
- link
- Specifies the NvLink link to be reset
- counter
- Specifies the counter that should be reset (0 or 1)
Returns
- NVML_SUCCESS if counters were successfully reset
- NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- NVML_ERROR_INVALID_ARGUMENT if device, link, or counter is invalid
- NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- NVML_ERROR_UNKNOWN on any unexpected error
Deprecated
Resetting NVLINK utilization counters is no longer supported.
Description
Reset the NVLINK utilization counters. Both the receive and transmit counters are operated on by this function.
For Pascal or newer fully supported devices.
- nvmlReturn_t nvmlDeviceSetNvLinkDeviceLowPowerThreshold ( nvmlDevice_t device, nvmlNvLinkPowerThres_t* info )
-
Parameters
- device
- The identifier of the target device
- info
- Reference to nvmlNvLinkPowerThres_t struct input parameters
Returns
- NVML_SUCCESS if the Threshold is successfully set
- NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- NVML_ERROR_INVALID_ARGUMENT if device is invalid or Threshold is not within range
- NVML_ERROR_NOT_READY if an internal driver setting prevents the threshold from being used
- NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device
Description
Set NvLink Low Power Threshold for device.
For Hopper or newer fully supported devices.
- nvmlReturn_t nvmlDeviceSetNvLinkUtilizationControl ( nvmlDevice_t device, unsigned int link, unsigned int counter, nvmlNvLinkUtilizationControl_t* control, unsigned int reset )
-
Parameters
- device
- The identifier of the target device
- link
- Specifies the NvLink link to be queried
- counter
- Specifies the counter that should be set (0 or 1).
- control
- A reference to the nvmlNvLinkUtilizationControl_t to set
- reset
- Resets the counters on set if non-zero
Returns
- NVML_SUCCESS if the control has been set successfully
- NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- NVML_ERROR_INVALID_ARGUMENT if device, counter, link, or control is invalid
- NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
- NVML_ERROR_UNKNOWN on any unexpected error
Deprecated
Setting utilization counter control is no longer supported.
Description
Set the NVLINK utilization counter control information for the specified counter, 0 or 1. Please refer to nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset of the counters if the reset parameter is non-zero.
For Pascal or newer fully supported devices.
- nvmlReturn_t nvmlDeviceSetNvlinkBwMode ( nvmlDevice_t device, nvmlNvlinkSetBwMode_t* setBwMode )
-
Parameters
- device
- The identifier of the target device
- setBwMode
- Reference to nvmlNvlinkSetBwMode_t
Returns
- NVML_SUCCESS if the Bandwidth mode was successfully set
- NVML_ERROR_INVALID_ARGUMENT if device is invalid or setBwMode is NULL
- NVML_ERROR_NO_PERMISSION if user does not have permission to change Bandwidth mode
- NVML_ERROR_NOT_SUPPORTED if this feature is not supported by the device
- NVML_ERROR_ARGUMENT_VERSION_MISMATCH if the version specified is not supported
Description
Set the NvLink Reduced Bandwidth Mode for the device
For Blackwell or newer fully supported devices.
- nvmlReturn_t nvmlSystemGetNvlinkBwMode ( unsigned int* nvlinkBwMode )
-
Parameters
- nvlinkBwMode
- Reference to the NVLink bandwidth mode
Returns
- NVML_SUCCESS on success
- NVML_ERROR_INVALID_ARGUMENT if an invalid pointer is provided
- NVML_ERROR_NOT_SUPPORTED if GPU is not Hopper or newer architecture.
- NVML_ERROR_NO_PERMISSION if not root user
Description
Get the global NVLink bandwidth mode.
- nvmlReturn_t nvmlSystemSetNvlinkBwMode ( unsigned int nvlinkBwMode )
-
Parameters
- nvlinkBwMode
- nvlink bandwidth mode
Returns
- NVML_SUCCESS on success
- NVML_ERROR_INVALID_ARGUMENT if an invalid argument is provided
- NVML_ERROR_IN_USE if P2P object exists
- NVML_ERROR_NOT_SUPPORTED if GPU is not Hopper or newer architecture.
- NVML_ERROR_NO_PERMISSION if not root user
Description
Set the global NVLink bandwidth mode.