1.14. Enums and Macros

Defines

#define DCGM_CONFIG_COMPUTEMODE_DEFAULT 0
#define DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS 2
#define DCGM_CONFIG_COMPUTEMODE_PROHIBITED 1
#define DCGM_FP64_BLANK 140737488355328.0
#define DCGM_FP64_IS_BLANK ( val )
#define DCGM_FP64_NOT_FOUND (DCGM_FP64_BLANK + 1.0)
#define DCGM_FP64_NOT_PERMISSIONED (DCGM_FP64_BLANK + 3.0)
#define DCGM_FP64_NOT_SUPPORTED (DCGM_FP64_BLANK + 2.0)
#define DCGM_GRID_LICENSE_BUFFER_SIZE 128
#define DCGM_GROUP_ALL_GPUS 0x7fffffff
#define DCGM_GROUP_MAX_ENTITIES 64
#define DCGM_HE_PORT_NUMBER 5555
#define DCGM_INT32_BLANK 0x7ffffff0
#define DCGM_INT32_IS_BLANK ( val )
#define DCGM_INT32_NOT_FOUND (DCGM_INT32_BLANK + 1)
#define DCGM_INT32_NOT_PERMISSIONED (DCGM_INT32_BLANK + 3)
#define DCGM_INT32_NOT_SUPPORTED (DCGM_INT32_BLANK + 2)
#define DCGM_INT64_BLANK 0x7ffffffffffffff0
#define DCGM_INT64_IS_BLANK ( val )
#define DCGM_INT64_NOT_FOUND (DCGM_INT64_BLANK + 1)
#define DCGM_INT64_NOT_PERMISSIONED (DCGM_INT64_BLANK + 3)
#define DCGM_INT64_NOT_SUPPORTED (DCGM_INT64_BLANK + 2)
#define DCGM_MAX_CLOCKS 256
#define DCGM_MAX_FBC_SESSIONS 256
#define DCGM_MAX_NUM_DEVICES 32
#define DCGM_MAX_NUM_GROUPS 64
#define DCGM_MAX_NUM_SWITCHES 12
#define DCGM_MAX_STR_LENGTH 256
#define DCGM_MAX_VGPU_INSTANCES_PER_PGPU 32
#define DCGM_NVLINK_MAX_LINKS_PER_GPU 12
#define DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1 6
#define DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH 36
#define DCGM_STR_BLANK "<<<NULL>>>"
#define DCGM_STR_IS_BLANK ( val )
#define DCGM_STR_NOT_FOUND "<<<NOT_FOUND>>>"
#define DCGM_STR_NOT_PERMISSIONED "<<<NOT_PERM>>>"
#define DCGM_STR_NOT_SUPPORTED "<<<NOT_SUPPORTED>>>"
#define DCGM_VGPU_NAME_BUFFER_SIZE 64
#define MAKE_DCGM_VERSION ( typeName, ver )

Enumerations

enum dcgmChipArchitecture_t
enum dcgmConfigPowerLimitType_t
enum dcgmConfigType_t
enum dcgmGroupType_t
enum dcgmOperationMode_t
enum dcgmOrder_t
enum dcgmReturn_t

Defines

#define DCGM_CONFIG_COMPUTEMODE_DEFAULT 0

Default compute mode -- multiple contexts per device

#define DCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS 2

Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time

#define DCGM_CONFIG_COMPUTEMODE_PROHIBITED 1

Compute-prohibited mode -- no contexts per device

#define DCGM_FP64_BLANK 140737488355328.0

Base value for double blank: 2 ** 47. FP64 has 52 bits of mantissa, so values starting at 2 ** 47 can still be incremented by 1 exactly, leaving room to represent each blank code from 0-15.

#define DCGM_FP64_IS_BLANK ( val )

Macro to check whether an FP64 value is blank

Value

(((val) >= DCGM_FP64_BLANK ? 1 : 0))

#define DCGM_FP64_NOT_FOUND (DCGM_FP64_BLANK + 1.0)

Represents an error where FP64 data was not found

#define DCGM_FP64_NOT_PERMISSIONED (DCGM_FP64_BLANK + 3.0)

Represents an error where fetching the FP64 value is not allowed with our current credentials

#define DCGM_FP64_NOT_SUPPORTED (DCGM_FP64_BLANK + 2.0)

Represents an error where fetching the FP64 value is not supported

#define DCGM_GRID_LICENSE_BUFFER_SIZE 128

Represents the size of a buffer that holds a vGPU license string

#define DCGM_GROUP_ALL_GPUS 0x7fffffff

Identifiers for special DCGM groups

#define DCGM_GROUP_MAX_ENTITIES 64

Maximum number of entities per entity group

#define DCGM_HE_PORT_NUMBER 5555

Default Port Number for DCGM Host Engine

#define DCGM_INT32_BLANK 0x7ffffff0

Represents the value of a field that can be returned by the Host Engine when the operation is not successful. Base value for a 32-bit integer blank; can be used as an unspecified blank.

#define DCGM_INT32_IS_BLANK ( val )

Macro to check whether an INT32 value is blank

Value

(((val) >= DCGM_INT32_BLANK) ? 1 : 0)

#define DCGM_INT32_NOT_FOUND (DCGM_INT32_BLANK + 1)

Represents an error where INT32 data was not found

#define DCGM_INT32_NOT_PERMISSIONED (DCGM_INT32_BLANK + 3)

Represents an error where fetching the INT32 value is not allowed with our current credentials

#define DCGM_INT32_NOT_SUPPORTED (DCGM_INT32_BLANK + 2)

Represents an error where fetching the INT32 value is not supported

#define DCGM_INT64_BLANK 0x7ffffffffffffff0

Base value for a 64-bit integer blank; can be used as an unspecified blank.

#define DCGM_INT64_IS_BLANK ( val )

Macro to check whether an INT64 value is blank

Value

(((val) >= DCGM_INT64_BLANK) ? 1 : 0)

#define DCGM_INT64_NOT_FOUND (DCGM_INT64_BLANK + 1)

Represents an error where INT64 data was not found

#define DCGM_INT64_NOT_PERMISSIONED (DCGM_INT64_BLANK + 3)

Represents an error where fetching the INT64 value is not allowed with our current credentials

#define DCGM_INT64_NOT_SUPPORTED (DCGM_INT64_BLANK + 2)

Represents an error where fetching the INT64 value is not supported

#define DCGM_MAX_CLOCKS 256

Max number of clocks supported for a device

#define DCGM_MAX_FBC_SESSIONS 256

Max number of active FBC sessions

#define DCGM_MAX_NUM_DEVICES 32

Max number of GPUs supported by DCGM

#define DCGM_MAX_NUM_GROUPS 64

Max limit on the number of groups supported by DCGM

#define DCGM_MAX_NUM_SWITCHES 12

Max number of NvSwitches supported by DCGM

#define DCGM_MAX_STR_LENGTH 256

Max length of the DCGM string field

#define DCGM_MAX_VGPU_INSTANCES_PER_PGPU 32

Maximum number of vGPU instances per physical GPU

#define DCGM_NVLINK_MAX_LINKS_PER_GPU 12

Number of NvLink links per GPU supported by DCGM. This is 12 for Ampere, 6 for Volta, and 4 for Pascal.

#define DCGM_NVLINK_MAX_LINKS_PER_GPU_LEGACY1 6

Maximum NvLink links pre-Ampere

#define DCGM_NVLINK_MAX_LINKS_PER_NVSWITCH 36

Number of NvLink links per NvSwitch supported by DCGM

#define DCGM_STR_BLANK "<<<NULL>>>"

Base value for string blank.

#define DCGM_STR_IS_BLANK ( val )

Macro to check whether a string value is blank. Works on (char *). Looks for <<< at the first position and >>> inside the string.

Value

(val == strstr(val, "<<<") && strstr(val, ">>>"))

#define DCGM_STR_NOT_FOUND "<<<NOT_FOUND>>>"

Represents an error where STR data was not found

#define DCGM_STR_NOT_PERMISSIONED "<<<NOT_PERM>>>"

Represents an error where fetching the STR value is not allowed with our current credentials

#define DCGM_STR_NOT_SUPPORTED "<<<NOT_SUPPORTED>>>"

Represents an error where fetching the STR value is not supported

#define DCGM_VGPU_NAME_BUFFER_SIZE 64

Represents the size of a buffer that holds a vGPU type name, a vGPU class type, or the name of a process running on a vGPU instance.

#define MAKE_DCGM_VERSION ( typeName, ver )

Creates a unique version number for each struct

Value

(unsigned int)(sizeof(typeName) | ((unsigned long)(ver) << 24U))

Enumerations

enum dcgmChipArchitecture_t

Simplified chip architecture. Note that these are made to match nvmlChipArchitecture_t and thus do not start at 0.

Values
DCGM_CHIP_ARCH_OLDER = 1
All GPUs older than Kepler.
DCGM_CHIP_ARCH_KEPLER = 2
All Kepler-architecture parts.
DCGM_CHIP_ARCH_MAXWELL = 3
All Maxwell-architecture parts.
DCGM_CHIP_ARCH_PASCAL = 4
All Pascal-architecture parts.
DCGM_CHIP_ARCH_VOLTA = 5
All Volta-architecture parts.
DCGM_CHIP_ARCH_TURING = 6
All Turing-architecture parts.
DCGM_CHIP_ARCH_AMPERE = 7
All Ampere-architecture parts.
DCGM_CHIP_ARCH_COUNT
Keep this second to last, exclude unknown.
DCGM_CHIP_ARCH_UNKNOWN = 0xffffffff
Anything else, presumably something newer.
enum dcgmConfigPowerLimitType_t

Represents the power cap for each member of the group.

Values
DCGM_CONFIG_POWER_CAP_INDIVIDUAL = 0
Represents the power cap to be applied for each member of the group.
DCGM_CONFIG_POWER_BUDGET_GROUP = 1
Represents the power budget for the entire group.
enum dcgmConfigType_t

Represents the type of configuration to be fetched from the GPUs

Values
DCGM_CONFIG_TARGET_STATE = 0
The target configuration values to be applied.
DCGM_CONFIG_CURRENT_STATE = 1
The current configuration state.
enum dcgmGroupType_t

Type of GPU groups

Values
DCGM_GROUP_DEFAULT = 0
All the GPUs on the node are added to the group.
DCGM_GROUP_EMPTY = 1
Creates an empty group.
DCGM_GROUP_DEFAULT_NVSWITCHES = 2
All NvSwitches of the node are added to the group.
DCGM_GROUP_DEFAULT_INSTANCES = 3
All GPU instances of the node are added to the group.
DCGM_GROUP_DEFAULT_COMPUTE_INSTANCES = 4
All compute instances of the node are added to the group.
DCGM_GROUP_DEFAULT_EVERYTHING = 5
All entities are added to this default group.
enum dcgmOperationMode_t

Operation mode for DCGM

DCGM can run in auto-mode where it runs additional threads in the background to collect any metrics of interest and auto manages any operations needed for policy management.

DCGM can also operate in manual-mode where its execution is controlled by the user. In this mode, the user has to periodically call APIs such as dcgmPolicyTrigger and dcgmUpdateAllFields, which tell DCGM to wake up and perform data collection and operations needed for policy management.

Values
DCGM_OPERATION_MODE_AUTO = 1
DCGM_OPERATION_MODE_MANUAL = 2
enum dcgmOrder_t

When more than one value is returned from a query, which order should it be returned in?

Values
DCGM_ORDER_ASCENDING = 1
Data with earliest (lowest) timestamps returned first.
DCGM_ORDER_DESCENDING = 2
Data with latest (highest) timestamps returned first.
enum dcgmReturn_t

Return values for DCGM API calls.

Values
DCGM_ST_OK = 0
Success.
DCGM_ST_BADPARAM = -1
A bad parameter was passed to a function.
DCGM_ST_GENERIC_ERROR = -3
A generic, unspecified error.
DCGM_ST_MEMORY = -4
An out of memory error occurred.
DCGM_ST_NOT_CONFIGURED = -5
Setting not configured.
DCGM_ST_NOT_SUPPORTED = -6
Feature not supported.
DCGM_ST_INIT_ERROR = -7
DCGM Init error.
DCGM_ST_NVML_ERROR = -8
When NVML returns error.
DCGM_ST_PENDING = -9
Object is in pending state of something else.
DCGM_ST_UNINITIALIZED = -10
Object is in undefined state.
DCGM_ST_TIMEOUT = -11
Requested operation timed out.
DCGM_ST_VER_MISMATCH = -12
Version mismatch between received and understood API.
DCGM_ST_UNKNOWN_FIELD = -13
Unknown field id.
DCGM_ST_NO_DATA = -14
No data is available.
DCGM_ST_STALE_DATA = -15
Data is considered stale.
DCGM_ST_NOT_WATCHED = -16
The given field id is not being updated by the cache manager.
DCGM_ST_NO_PERMISSION = -17
Do not have permission to perform the desired action.
DCGM_ST_GPU_IS_LOST = -18
GPU is no longer reachable.
DCGM_ST_RESET_REQUIRED = -19
GPU requires a reset.
DCGM_ST_FUNCTION_NOT_FOUND = -20
The function that was requested was not found (bindings only error).
DCGM_ST_CONNECTION_NOT_VALID = -21
The connection to the host engine is not valid any longer.
DCGM_ST_GPU_NOT_SUPPORTED = -22
This GPU is not supported by DCGM.
DCGM_ST_GROUP_INCOMPATIBLE = -23
The GPUs of the provided group are not compatible with each other for the requested operation.
DCGM_ST_MAX_LIMIT = -24
Max limit reached for the object.
DCGM_ST_LIBRARY_NOT_FOUND = -25
DCGM library could not be found.
DCGM_ST_DUPLICATE_KEY = -26
Duplicate key passed to a function.
DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27
GPU is already a part of a sync boost group.
DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28
GPU is not a part of a sync boost group.
DCGM_ST_REQUIRES_ROOT = -29
This operation cannot be performed when the host engine is running as non-root.
DCGM_ST_NVVS_ERROR = -30
DCGM GPU Diagnostic was successfully executed, but reported an error.
DCGM_ST_INSUFFICIENT_SIZE = -31
An input argument is not large enough.
DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32
The given field ID is not supported by the API being called.
DCGM_ST_MODULE_NOT_LOADED = -33
This request is serviced by a module of DCGM that is not currently loaded.
DCGM_ST_IN_USE = -34
The requested operation could not be completed because the affected resource is in use.
DCGM_ST_GROUP_IS_EMPTY = -35
This group is empty and the requested operation is not valid on an empty group.
DCGM_ST_PROFILING_NOT_SUPPORTED = -36
Profiling is not supported for this group of GPUs or GPU.
DCGM_ST_PROFILING_LIBRARY_ERROR = -37
The third-party Profiling module returned an unrecoverable error.
DCGM_ST_PROFILING_MULTI_PASS = -38
The requested profiling metrics cannot be collected in a single pass.
DCGM_ST_DIAG_ALREADY_RUNNING = -39
A diag instance is already running, cannot run a new diag until the current one finishes.
DCGM_ST_DIAG_BAD_JSON = -40
The DCGM GPU Diagnostic returned JSON that cannot be parsed.
DCGM_ST_DIAG_BAD_LAUNCH = -41
Error while launching the DCGM GPU Diagnostic.
DCGM_ST_DIAG_VARIANCE = -42
There is too much variance while training the diagnostic.
DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43
A field value met or exceeded the error threshold.
DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44
The installed driver version is insufficient for this API.
DCGM_ST_INSTANCE_NOT_FOUND = -45
The specified GPU instance does not exist.
DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46
The specified GPU compute instance does not exist.
DCGM_ST_CHILD_NOT_KILLED = -47
Couldn't kill a child process within the retries.
DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48
Detected an error in a 3rd-party library.
DCGM_ST_INSUFFICIENT_RESOURCES = -49
Not enough resources available.
DCGM_ST_PLUGIN_EXCEPTION = -50
Exception thrown from a diagnostic plugin.
DCGM_ST_NVVS_ISOLATE_ERROR = -51
The diagnostic returned an error that indicates the need for isolation.