cudnn_graph Library#
Data Type References#
These are the data type references for the cudnn_graph
library.
Struct Types#
These are the struct types for the cudnn_graph
library.
cudnnDebug_t#
cudnnDebug_t
is a structure used by cudnnSetCallback() and cudnnGetCallback() containing the metadata, such as time, time since start, stream ID, process and thread ID, that the user may choose to print or store in customized callbacks.
cudnnFraction_t#
cudnnFraction_t
is a structure that allows a user to define int64_t
fractions.
typedef struct cudnnFractionStruct { int64_t numerator; int64_t denominator; } cudnnFraction_t;
Enumeration Types#
These are the enumeration types for the cudnn_graph
library.
cudnnActivationMode_t#
This enumerated type is deprecated and is currently only used by deprecated APIs. Consider using replacements for the deprecated APIs that use this enumerated type.
cudnnActivationMode_t
is an enumerated type used to select the neuron activation function used in cudnnActivationForward(), cudnnActivationBackward(), and cudnnConvolutionBiasActivationForward().
Values
CUDNN_ACTIVATION_SIGMOID
Selects the sigmoid function.
CUDNN_ACTIVATION_RELU
Selects the rectified linear function.
CUDNN_ACTIVATION_TANH
Selects the hyperbolic tangent function.
CUDNN_ACTIVATION_CLIPPED_RELU
Selects the clipped rectified linear function.
CUDNN_ACTIVATION_ELU
Selects the exponential linear function.
CUDNN_ACTIVATION_IDENTITY
Selects the identity function, intended for bypassing the activation step in cudnnConvolutionBiasActivationForward(). (The cudnnConvolutionBiasActivationForward() function must use
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
.) Does not work with cudnnActivationForward() or cudnnActivationBackward().
CUDNN_ACTIVATION_SWISH
Selects the swish function.
cudnnBackendAttributeName_t#
cudnnBackendAttributeName_t
is an enumerated type that indicates the backend descriptor attributes that can be set or get using cudnnBackendSetAttribute() and cudnnBackendGetAttribute() functions. The backend descriptor to which an attribute belongs is identified by the prefix of the attribute name.
typedef enum { CUDNN_ATTR_POINTWISE_MODE = 0, CUDNN_ATTR_POINTWISE_MATH_PREC = 1, CUDNN_ATTR_POINTWISE_NAN_PROPAGATION = 2, CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP = 3, CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP = 4, CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5, CUDNN_ATTR_POINTWISE_ELU_ALPHA = 6, CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA = 7, CUDNN_ATTR_POINTWISE_SWISH_BETA = 8, CUDNN_ATTR_POINTWISE_AXIS = 9, CUDNN_ATTR_CONVOLUTION_COMP_TYPE = 100, CUDNN_ATTR_CONVOLUTION_CONV_MODE = 101, CUDNN_ATTR_CONVOLUTION_DILATIONS = 102, CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103, CUDNN_ATTR_CONVOLUTION_POST_PADDINGS = 104, CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS = 105, CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS = 106, CUDNN_ATTR_ENGINEHEUR_MODE = 200, CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201, CUDNN_ATTR_ENGINEHEUR_RESULTS = 202, CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGET = 203, CUDNN_ATTR_ENGINECFG_ENGINE = 300, CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301, CUDNN_ATTR_ENGINECFG_KNOB_CHOICES = 302, CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE = 303, CUDNN_ATTR_EXECUTION_PLAN_HANDLE = 400, CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG = 401, CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402, CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403, CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404, CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405, CUDNN_ATTR_EXECUTION_PLAN_KERNEL_CACHE = 406, CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500, CUDNN_ATTR_INTERMEDIATE_INFO_SIZE = 501, CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS = 502, CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503, CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE = 600, CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA = 700, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA = 701, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC = 702, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W = 703, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X = 704, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y = 705, 
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA = 706, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA = 707, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC = 708, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W = 709, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX = 710, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY = 711, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA = 712, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA = 713, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW = 715, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X = 716, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY = 717, CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750, CUDNN_ATTR_OPERATION_POINTWISE_XDESC = 751, CUDNN_ATTR_OPERATION_POINTWISE_BDESC = 752, CUDNN_ATTR_OPERATION_POINTWISE_YDESC = 753, CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 = 754, CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 = 755, CUDNN_ATTR_OPERATION_POINTWISE_DXDESC = 756, CUDNN_ATTR_OPERATION_POINTWISE_DYDESC = 757, CUDNN_ATTR_OPERATION_POINTWISE_TDESC = 758, CUDNN_ATTR_OPERATION_GENSTATS_MODE = 770, CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771, CUDNN_ATTR_OPERATION_GENSTATS_XDESC = 772, CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC = 773, CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774, CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE = 780, CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC = 781, CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC = 782, CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC = 783, CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC = 784, CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC = 785, CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC = 786, CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC = 787, CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788, CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC = 789, CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC = 790, CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC = 791, 
CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC = 792, CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC = 793, CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC = 794, CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC = 795, CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC = 796, CUDNN_ATTR_OPERATIONGRAPH_HANDLE = 800, CUDNN_ATTR_OPERATIONGRAPH_OPS = 801, CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802, CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED = 803, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900, CUDNN_ATTR_TENSOR_DATA_TYPE = 901, CUDNN_ATTR_TENSOR_DIMENSIONS = 902, CUDNN_ATTR_TENSOR_STRIDES = 903, CUDNN_ATTR_TENSOR_VECTOR_COUNT = 904, CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905, CUDNN_ATTR_TENSOR_UNIQUE_ID = 906, CUDNN_ATTR_TENSOR_IS_VIRTUAL = 907, CUDNN_ATTR_TENSOR_IS_BY_VALUE = 908, CUDNN_ATTR_TENSOR_REORDERING_MODE = 909, CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC = 910, CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS = 1000, CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001, CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002, CUDNN_ATTR_VARIANT_PACK_WORKSPACE = 1003, CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100, CUDNN_ATTR_LAYOUT_INFO_TYPES = 1101, CUDNN_ATTR_KNOB_INFO_TYPE = 1200, CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201, CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202, CUDNN_ATTR_KNOB_INFO_STRIDE = 1203, CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300, CUDNN_ATTR_ENGINE_GLOBAL_INDEX = 1301, CUDNN_ATTR_ENGINE_KNOB_INFO = 1302, CUDNN_ATTR_ENGINE_NUMERICAL_NOTE = 1303, CUDNN_ATTR_ENGINE_LAYOUT_INFO = 1304, CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE = 1305, CUDNN_ATTR_ENGINE_SM_COUNT_TARGET = 1306, CUDNN_ATTR_MATMUL_COMP_TYPE = 1500, CUDNN_ATTR_MATMUL_PADDING_VALUE = 1501, CUDNN_ATTR_OPERATION_MATMUL_ADESC = 1520, CUDNN_ATTR_OPERATION_MATMUL_BDESC = 1521, CUDNN_ATTR_OPERATION_MATMUL_CDESC = 1522, CUDNN_ATTR_OPERATION_MATMUL_DESC = 1523, CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT = 1524, CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC = 1525, CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC = 
1526, CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC = 1527, CUDNN_ATTR_REDUCTION_OPERATOR = 1600, CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601, CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610, CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611, CUDNN_ATTR_OPERATION_REDUCTION_DESC = 1612, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC = 1620, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC = 1621, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC = 1622, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC = 1623, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC = 1624, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC = 1625, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC = 1626, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC = 1627, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC = 1629, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS = 1630, CUDNN_ATTR_RESAMPLE_MODE = 1700, CUDNN_ATTR_RESAMPLE_COMP_TYPE = 1701, CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS = 1702, CUDNN_ATTR_RESAMPLE_POST_PADDINGS = 1703, CUDNN_ATTR_RESAMPLE_PRE_PADDINGS = 1704, CUDNN_ATTR_RESAMPLE_STRIDES = 1705, CUDNN_ATTR_RESAMPLE_WINDOW_DIMS = 1706, CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707, CUDNN_ATTR_RESAMPLE_PADDING_MODE = 1708, CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC = 1710, CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC = 1711, CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712, CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA = 1713, CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA = 1714, CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC = 1716, CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC = 1720, CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC = 1721, CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722, CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA = 1723, CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA = 1724, CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC = 1725, CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC = 1726, CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC = 1727, CUDNN_ATTR_OPERATION_CONCAT_AXIS = 1800, CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS 
= 1801, CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802, CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC = 1803, CUDNN_ATTR_OPERATION_SIGNAL_MODE = 1900, CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901, CUDNN_ATTR_OPERATION_SIGNAL_VALUE = 1902, CUDNN_ATTR_OPERATION_SIGNAL_XDESC = 1903, CUDNN_ATTR_OPERATION_SIGNAL_YDESC = 1904, CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_CONTAINER_DESC = 1950, CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_YDESC = 1951, CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_SEQUENCE_DESC = 1952, CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_PAGE_TABLE_DESC = 1953, CUDNN_ATTR_OPERATION_NORM_FWD_MODE = 2000, CUDNN_ATTR_OPERATION_NORM_FWD_PHASE = 2001, CUDNN_ATTR_OPERATION_NORM_FWD_XDESC = 2002, CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC = 2003, CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC = 2004, CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC = 2005, CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC = 2006, CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC = 2007, CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC = 2008, CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC = 2009, CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC = 2010, CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011, CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC = 2012, CUDNN_ATTR_OPERATION_NORM_FWD_YDESC = 2013, CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS = 2014, CUDNN_ATTR_OPERATION_NORM_BWD_MODE = 2100, CUDNN_ATTR_OPERATION_NORM_BWD_XDESC = 2101, CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC = 2102, CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103, CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC = 2104, CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC = 2105, CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC = 2106, CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC = 2107, CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC = 2108, CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC = 2109, CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS = 2110, CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200, CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201, CUDNN_ATTR_RNG_DISTRIBUTION = 2300, 
CUDNN_ATTR_RNG_NORMAL_DIST_MEAN = 2301, CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302, CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM = 2303, CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM = 2304, CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY = 2305, CUDNN_ATTR_OPERATION_RNG_YDESC = 2310, CUDNN_ATTR_OPERATION_RNG_SEED = 2311, CUDNN_ATTR_OPERATION_RNG_DESC = 2312, CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC = 2313, CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED = 2400 } cudnnBackendAttributeName_t;
cudnnBackendAttributeType_t#
The enumeration type cudnnBackendAttributeType_t
specifies the data type of an attribute of a cuDNN backend descriptor. It is used to specify the type of data pointed to by the void *arrayOfElements
argument of cudnnBackendSetAttribute() and cudnnBackendGetAttribute().
typedef enum { CUDNN_TYPE_HANDLE = 0, CUDNN_TYPE_DATA_TYPE, CUDNN_TYPE_BOOLEAN, CUDNN_TYPE_INT64, CUDNN_TYPE_FLOAT, CUDNN_TYPE_DOUBLE, CUDNN_TYPE_VOID_PTR, CUDNN_TYPE_CONVOLUTION_MODE, CUDNN_TYPE_HEUR_MODE, CUDNN_TYPE_KNOB_TYPE, CUDNN_TYPE_NAN_PROPOGATION, CUDNN_TYPE_NUMERICAL_NOTE, CUDNN_TYPE_LAYOUT_TYPE, CUDNN_TYPE_ATTRIB_NAME, CUDNN_TYPE_POINTWISE_MODE, CUDNN_TYPE_BACKEND_DESCRIPTOR, CUDNN_TYPE_GENSTATS_MODE, CUDNN_TYPE_BN_FINALIZE_STATS_MODE, CUDNN_TYPE_REDUCTION_OPERATOR_TYPE, CUDNN_TYPE_BEHAVIOR_NOTE, CUDNN_TYPE_TENSOR_REORDERING_MODE, CUDNN_TYPE_RESAMPLE_MODE, CUDNN_TYPE_PADDING_MODE, CUDNN_TYPE_INT32, CUDNN_TYPE_CHAR, CUDNN_TYPE_SIGNAL_MODE, CUDNN_TYPE_FRACTION, CUDNN_TYPE_NORM_MODE, CUDNN_TYPE_NORM_FWD_PHASE, CUDNN_TYPE_RNG_DISTRIBUTION } cudnnBackendAttributeType_t;
(Table: mapping of each backend attribute name to its cudnnBackendAttributeType_t attribute type. The table body was lost during document extraction; refer to the official cuDNN Backend API reference for the complete attribute-name/attribute-type table.)
cudnnBackendBehaviorNote_t#
cudnnBackendBehaviorNote_t
is an enumerated type that indicates queryable behavior notes of an engine. Users can query for an array of behavior notes from a CUDNN_BACKEND_ENGINE_DESCRIPTOR using the cudnnBackendGetAttribute() function.
typedef enum { CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0, CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1, CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2, CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API = 3, CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, } cudnnBackendBehaviorNote_t;
cudnnBackendDescriptorType_t#
cudnnBackendDescriptorType_t
is an enumerated type that indicates the type of backend descriptors. Users create a backend descriptor of a particular type by passing a value from this enumerate to the cudnnBackendCreateDescriptor() function.
typedef enum { CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0, CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR, CUDNN_BACKEND_ENGINE_DESCRIPTOR, CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR, CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR, CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR, CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR, CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR, CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR, CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR, CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR, CUDNN_BACKEND_TENSOR_DESCRIPTOR, CUDNN_BACKEND_MATMUL_DESCRIPTOR, CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR, CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR, CUDNN_BACKEND_REDUCTION_DESCRIPTOR, CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR, CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR, CUDNN_BACKEND_RESAMPLE_DESCRIPTOR, CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR, CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR, CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR, CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR, CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR, CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR, CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR, CUDNN_BACKEND_RNG_DESCRIPTOR, CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR, CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR, CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR } cudnnBackendDescriptorType_t;
cudnnBackendHeurMode_t#
cudnnBackendHeurMode_t
is an enumerated type that indicates the operation mode of a CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR.
typedef enum { CUDNN_HEUR_MODE_INSTANT = 0, CUDNN_HEUR_MODE_B = 1, CUDNN_HEUR_MODE_FALLBACK = 2, CUDNN_HEUR_MODE_A = 3 } cudnnBackendHeurMode_t;
Values
CUDNN_HEUR_MODE_A and CUDNN_HEUR_MODE_INSTANT
CUDNN_HEUR_MODE_A provides the exact same functionality as CUDNN_HEUR_MODE_INSTANT. The purpose of this renaming is to better match the naming of CUDNN_HEUR_MODE_B. Consider the use of CUDNN_HEUR_MODE_INSTANT as deprecated; instead, use CUDNN_HEUR_MODE_A.
CUDNN_HEUR_MODE_A utilizes a decision tree heuristic which provides optimal inference time on the CPU in comparison to CUDNN_HEUR_MODE_B.
CUDNN_HEUR_MODE_A and CUDNN_HEUR_MODE_INSTANT support the following operation node or operation graph: (the list of supported operation graphs was lost during extraction — consult the official reference). All other operation graphs are not supported.
CUDNN_HEUR_MODE_B
Can utilize the neural net based heuristics to improve generalization performance compared to CUDNN_HEUR_MODE_INSTANT. In cases where the neural net is utilized, inference time on the CPU will be increased by 10-100x compared to CUDNN_HEUR_MODE_INSTANT. These neural net heuristics are not supported for any of the following cases:
- 3-D convolutions
- Grouped convolutions (groupCount larger than 1)
- Dilated convolutions (any dilation for any spatial dimension larger than 1)
Further, the neural net is only enabled on x86 platforms when cuDNN is run on an A100 GPU. In cases where the neural net is not supported, CUDNN_HEUR_MODE_B will fall back to CUDNN_HEUR_MODE_INSTANT. CUDNN_HEUR_MODE_B will also fall back to CUDNN_HEUR_MODE_INSTANT in cases where the overhead of CUDNN_HEUR_MODE_B is projected to reduce overall network performance.
CUDNN_HEUR_MODE_FALLBACK
This heuristic mode is intended to be used for finding fallback options which provide functional support (without any expectation of providing optimal GPU performance).
cudnnBackendKnobType_t#
cudnnBackendKnobType_t
is an enumerated type that indicates the type of performance knobs. Performance knobs are runtime settings to an engine that will affect its performance. Users can query for an array of performance knobs and their valid value range from a CUDNN_BACKEND_ENGINE_DESCRIPTOR using the cudnnBackendGetAttribute() function. Users can set the choice for each knob using the cudnnBackendSetAttribute() function with a CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR.
typedef enum { CUDNN_KNOB_TYPE_SPLIT_K = 0, CUDNN_KNOB_TYPE_SWIZZLE = 1, CUDNN_KNOB_TYPE_TILE_SIZE = 2, CUDNN_KNOB_TYPE_USE_TEX = 3, CUDNN_KNOB_TYPE_EDGE = 4, CUDNN_KNOB_TYPE_KBLOCK = 5, CUDNN_KNOB_TYPE_LDGA = 6, CUDNN_KNOB_TYPE_LDGB = 7, CUDNN_KNOB_TYPE_CHUNK_K = 8, CUDNN_KNOB_TYPE_SPLIT_H = 9, CUDNN_KNOB_TYPE_WINO_TILE = 10, CUDNN_KNOB_TYPE_MULTIPLY = 11, CUDNN_KNOB_TYPE_SPLIT_K_BUF = 12, CUDNN_KNOB_TYPE_TILEK = 13, CUDNN_KNOB_TYPE_STAGES = 14, CUDNN_KNOB_TYPE_REDUCTION_MODE = 15, CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE = 16, CUDNN_KNOB_TYPE_SPLIT_K_SLC = 17, CUDNN_KNOB_TYPE_IDX_MODE = 18, CUDNN_KNOB_TYPE_SLICED = 19, CUDNN_KNOB_TYPE_SPLIT_RS = 20, CUDNN_KNOB_TYPE_SINGLEBUFFER = 21, CUDNN_KNOB_TYPE_LDGC = 22, CUDNN_KNOB_TYPE_SPECFILT = 23, CUDNN_KNOB_TYPE_KERNEL_CFG = 24, CUDNN_KNOB_TYPE_WORKSPACE = 25, CUDNN_KNOB_TYPE_TILE_CGA CUDNN_DEPRECATED_ENUM = 26, CUDNN_KNOB_TYPE_TILE_CGA_M = 27, CUDNN_KNOB_TYPE_TILE_CGA_N = 28, CUDNN_KNOB_TYPE_BLOCK_SIZE = 29, CUDNN_KNOB_TYPE_OCCUPANCY = 30, CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD = 31, CUDNN_KNOB_TYPE_NUM_C_PER_BLOCK CUDNN_DEPRECATED_ENUM = 32, CUDNN_KNOB_TYPE_SPLIT_COLS = 33, CUDNN_KNOB_TYPE_TILE_ROWS = 34, CUDNN_KNOB_TYPE_TILE_COLS = 35, } cudnnBackendKnobType_t;
cudnnBackendLayoutType_t#
cudnnBackendLayoutType_t
is an enumerated type that indicates queryable layout requirements of an engine. Users can query for layout requirements from a CUDNN_BACKEND_ENGINE_DESCRIPTOR using the cudnnBackendGetAttribute() function.
typedef enum { CUDNN_LAYOUT_TYPE_PREFERRED_NCHW = 0, CUDNN_LAYOUT_TYPE_PREFERRED_NHWC = 1, CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2, CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3, CUDNN_LAYOUT_TYPE_COUNT = 4, } cudnnBackendLayoutType_t;
cudnnBackendNormFwdPhase_t#
cudnnBackendNormFwdPhase_t
is an enumerated type used to distinguish the inference and training phase of the normalization forward operation.
typedef enum { CUDNN_NORM_FWD_INFERENCE = 0, CUDNN_NORM_FWD_TRAINING = 1, } cudnnBackendNormFwdPhase_t;
cudnnBackendNormMode_t#
cudnnBackendNormMode_t
is an enumerated type to indicate the normalization mode in the backend normalization forward and normalization backward operations.
For reference:
The definition of layer normalization can be found in the Layer Normalization paper.
The definition of instance normalization can be found in the Instance Normalization: The Missing Ingredient for Fast Stylization paper.
The definition of batch normalization can be found in the Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift paper.
The definition of root mean square normalization can be found in the Root Mean Square Layer Normalization paper.
CUDNN_GROUP_NORM
is not yet supported. If you try to use it, cuDNN returns a CUDNN_STATUS_INTERNAL_ERROR
error.
typedef enum { CUDNN_LAYER_NORM = 0, CUDNN_INSTANCE_NORM = 1, CUDNN_BATCH_NORM = 2, CUDNN_GROUP_NORM = 3, CUDNN_RMS_NORM = 4, } cudnnBackendNormMode_t;
cudnnBackendNumericalNote_t#
cudnnBackendNumericalNote_t
is an enumerated type that indicates queryable numerical properties of an engine. Users can query for an array of numerical notes from a CUDNN_BACKEND_ENGINE_DESCRIPTOR using the cudnnBackendGetAttribute() function.
typedef enum { CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0, CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS, CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION, CUDNN_NUMERICAL_NOTE_FFT, CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC, CUDNN_NUMERICAL_NOTE_WINOGRAD, CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4, CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6, CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13, CUDNN_NUMERICAL_NOTE_STRICT_NAN_PROP, CUDNN_NUMERICAL_NOTE_TYPE_COUNT, } cudnnBackendNumericalNote_t;
cudnnBackendTensorReordering_t#
cudnnBackendTensorReordering_t
is an enumerated type that indicates tensor reordering as a property of the tensor descriptor. Users can get and set this property in a CUDNN_BACKEND_TENSOR_DESCRIPTOR using the cudnnBackendSetAttribute() and cudnnBackendGetAttribute() functions.
typedef enum { CUDNN_TENSOR_REORDERING_NONE = 0, CUDNN_TENSOR_REORDERING_INT8x32 = 1, CUDNN_TENSOR_REORDERING_F16x16 = 2, } cudnnBackendTensorReordering_t;
cudnnBnFinalizeStatsMode_t#
cudnnBnFinalizeStatsMode_t
is an enumerated type that exposes the different mathematical operation modes that convert batch norm statistics and the trained scale and bias to the equivalent scale and bias to be applied in the next normalization stage for inference and training use cases.
typedef enum { CUDNN_BN_FINALIZE_STATISTICS_TRAINING = 0, CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1, } cudnnBnFinalizeStatsMode_t;
cudnnConvolutionMode_t#
This enumerated type is deprecated and is currently only used by deprecated APIs. Consider using replacements for the deprecated APIs that use this enumerated type.
cudnnConvolutionMode_t
is an enumerated type used by cudnnSetConvolution2dDescriptor() to configure a convolution descriptor
. The filter used for the convolution can be applied in two different ways, corresponding mathematically to a convolution or to a cross-correlation. (A cross-correlation is equivalent to a convolution with its filter rotated by 180 degrees.)
Values
CUDNN_CONVOLUTION
In this mode, a convolution operation will be done when applying the filter to the images.
CUDNN_CROSS_CORRELATION
In this mode, a cross-correlation operation will be done when applying the filter to the images.
cudnnDataType_t#
cudnnDataType_t
is an enumerated type indicating the data type to which a tensor descriptor or filter descriptor refers.
Values
CUDNN_DATA_FLOAT
The data is a 32-bit single-precision floating-point (
float
).CUDNN_DATA_DOUBLE
The data is a 64-bit double-precision floating-point (
double
).CUDNN_DATA_HALF
The data is a 16-bit floating-point.
CUDNN_DATA_INT8
The data is an 8-bit signed integer.
CUDNN_DATA_INT32
The data is a 32-bit signed integer.
CUDNN_DATA_INT8x4
The data is 32-bit elements each composed of 4 8-bit signed integers. This data type is only supported with the tensor format
CUDNN_TENSOR_NCHW_VECT_C
.CUDNN_DATA_UINT8
The data is an 8-bit unsigned integer.
CUDNN_DATA_UINT8x4
The data is 32-bit elements each composed of 4 8-bit unsigned integers. This data type is only supported with the tensor format
CUDNN_TENSOR_NCHW_VECT_C
.CUDNN_DATA_INT8x32
The data is 32-element vectors, each element being an 8-bit signed integer. This data type is only supported with the tensor format
CUDNN_TENSOR_NCHW_VECT_C
. Moreover, this data type can only be used with algo 1, meaning, CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM. For more information, refer to cudnnConvolutionFwdAlgo_t.
CUDNN_DATA_BFLOAT16
The data is a 16-bit quantity, with 7 mantissa bits, 8 exponent bits, and 1 sign bit.
CUDNN_DATA_INT64
The data is a 64-bit signed integer.
CUDNN_DATA_BOOLEAN
The data is a boolean (
bool
).Note that for type
CUDNN_TYPE_BOOLEAN
, elements are expected to be “packed”: that is, one byte contains 8 elements of typeCUDNN_TYPE_BOOLEAN
. Further, within each byte, elements are indexed from the least significant bit to the most significant bit. For example, a 1 dimensional tensor of 8 elements containing 01001111 has value 1 for elements 0 through 3, 0 for elements 4 and 5, 1 for element 6 and 0 for element 7. Tensors with more than 8 elements simply use more bytes, where the order is also from least significant to most significant byte. Note, CUDA is little-endian, meaning that the least significant byte has the lower memory address. For example, in the case of 16 elements, 01001111 11111100 has value 1 for elements 0 through 3, 0 for elements 4 and 5, 1 for element 6 and 0 for element 7, value 0 for elements 8 and 9, 1 for elements 10 through 15.
CUDNN_DATA_FP8_E4M3
The data is an 8-bit quantity, with 3 mantissa bits, 4 exponent bits, and 1 sign bit.
CUDNN_DATA_FP8_E5M2
The data is an 8-bit quantity, with 2 mantissa bits, 5 exponent bits, and 1 sign bit.
CUDNN_DATA_FAST_FLOAT_FOR_FP8
The data type is a higher throughput but lower precision compute type (compared to
CUDNN_DATA_FLOAT
) used for FP8 tensor core operations.
cudnnErrQueryMode_t#
cudnnErrQueryMode_t
is an enumerated type passed to cudnnQueryRuntimeError() to select the remote kernel error query mode.
Values
CUDNN_ERRQUERY_RAWCODE
Read the error storage location regardless of the kernel completion status.
CUDNN_ERRQUERY_NONBLOCKING
Report if all tasks in the user stream of the cuDNN handle were completed. If that is the case, report the remote kernel error code.
CUDNN_ERRQUERY_BLOCKING
Wait for all tasks to complete in the user stream before reporting the remote kernel error code.
cudnnGenStatsMode_t#
cudnnGenStatsMode_t
is an enumerated type to indicate the statistics mode in the backend statistics generation operation.
Values
CUDNN_GENSTATS_SUM_SQSUM
In this mode, the sum and sum of squares of the input tensor along the specified dimensions are computed and written out. The reduction dimensions currently supported are limited per channel, however additional support may be added upon request.
cudnnHandle_t#
cudnnHandle_t
is a pointer to an opaque structure holding the cuDNN library context. The cuDNN library context must be created using cudnnCreate() and the returned handle must be passed to all subsequent library function calls. The context should be destroyed at the end using cudnnDestroy(). The context is associated with only one GPU device, the current device at the time of the call to cudnnCreate(). However, multiple contexts can be created on the same GPU device.
cudnnMathType_t#
cudnnMathType_t
is an enumerated type used to indicate if the use of Tensor Core operations is permitted in a given library routine.
Values
CUDNN_DEFAULT_MATH
Tensor Core operations are not used on pre-NVIDIA A100 GPU devices. On A100 GPU architecture devices, Tensor Core TF32 operation is permitted.
CUDNN_TENSOR_OP_MATH
The use of Tensor Core operations is permitted but will not actively perform datatype down conversion on tensors in order to utilize Tensor Cores.
CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION
The use of Tensor Core operations is permitted and will actively perform datatype down conversion on tensors in order to utilize Tensor Cores.
CUDNN_FMA_MATH
Restricted to only kernels that use FMA instructions.
On pre-NVIDIA A100 GPU devices,
CUDNN_DEFAULT_MATH
andCUDNN_FMA_MATH
have the same behavior: Tensor Core kernels will not be selected. With NVIDIA Ampere architecture and CUDA toolkit 11,CUDNN_DEFAULT_MATH
permits TF32 Tensor Core operation andCUDNN_FMA_MATH
does not. The TF32 behavior forCUDNN_DEFAULT_MATH
and the other Tensor Core math types can be explicitly disabled by the environment variableNVIDIA_TF32_OVERRIDE=0
.
cudnnNanPropagation_t#
This enumerated type is deprecated and is currently only used by deprecated APIs. Consider using replacements for the deprecated APIs that use this enumerated type.
cudnnNanPropagation_t
is an enumerated type used to indicate if a given routine should propagate Nan numbers. This enumerated type is used as a field for the cudnnActivationDescriptor_t descriptor and cudnnPoolingDescriptor_t descriptor.
Values
CUDNN_NOT_PROPAGATE_NAN
NAN
numbers are not propagated.CUDNN_PROPAGATE_NAN
NAN
numbers are propagated.
cudnnCTCGradMode_t#
Enumerated type used by cudnnSetCTCLossDescriptor_v9
and cudnnGetCTCLossDescriptor_v9
to indicate the behavior for out of boundary (OOB) samples. OOB samples are samples where L+R > T is encountered during the gradient calculation.
If ctcGradMode is set to CUDNN_CTC_SKIP_OOB_GRADIENTS, then the CTC loss function does not write to the gradient buffer for that sample. Instead, the current values, even if not finite, are retained.
If ctcGradMode is set to CUDNN_CTC_ZERO_OOB_GRADIENTS, then the gradient for that sample is set to zero. This guarantees a finite gradient.
cudnnPaddingMode_t#
cudnnPaddingMode_t
is an enumerated type to indicate the padding mode in the backend resample operations.
typedef enum { CUDNN_ZERO_PAD = 0, CUDNN_NEG_INF_PAD = 1, CUDNN_EDGE_VAL_PAD = 2, } cudnnPaddingMode_t;
cudnnPointwiseMode_t#
cudnnPointwiseMode_t
is an enumerated type to indicate the intended pointwise math operation in the backend pointwise operation descriptor.
Values
CUDNN_POINTWISE_ADD
A pointwise addition between two tensors is computed.
CUDNN_POINTWISE_ADD_SQUARE
A pointwise addition between the first tensor and the square of the second tensor is computed.
CUDNN_POINTWISE_DIV
A pointwise true division of the first tensor by second tensor is computed.
CUDNN_POINTWISE_MAX
A pointwise maximum is taken between two tensors.
CUDNN_POINTWISE_MIN
A pointwise minimum is taken between two tensors.
CUDNN_POINTWISE_MOD
A pointwise floating-point remainder of the first tensor’s division by the second tensor is computed.
CUDNN_POINTWISE_MUL
A pointwise multiplication between two tensors is computed.
CUDNN_POINTWISE_POW
A pointwise value from the first tensor to the power of the second tensor is computed.
CUDNN_POINTWISE_SUB
A pointwise subtraction between two tensors is computed.
CUDNN_POINTWISE_ABS
A pointwise absolute value of the input tensor is computed.
CUDNN_POINTWISE_CEIL
A pointwise ceiling of the input tensor is computed.
CUDNN_POINTWISE_COS
A pointwise trigonometric cosine of the input tensor is computed.
CUDNN_POINTWISE_EXP
A pointwise exponential of the input tensor is computed.
CUDNN_POINTWISE_FLOOR
A pointwise floor of the input tensor is computed.
CUDNN_POINTWISE_LOG
A pointwise natural logarithm of the input tensor is computed.
CUDNN_POINTWISE_NEG
A pointwise numerical negative of the input tensor is computed.
CUDNN_POINTWISE_RSQRT
A pointwise reciprocal of the square root of the input tensor is computed.
CUDNN_POINTWISE_SIN
A pointwise trigonometric sine of the input tensor is computed.
CUDNN_POINTWISE_SQRT
A pointwise square root of the input tensor is computed.
CUDNN_POINTWISE_TAN
A pointwise trigonometric tangent of the input tensor is computed.
CUDNN_POINTWISE_ERF
A pointwise Error Function is computed.
CUDNN_POINTWISE_IDENTITY
No computation is performed. As with other pointwise modes, this mode provides implicit conversions by specifying the data type of the input tensor as one type, and the data type of the output tensor as another.
CUDNN_POINTWISE_RELU_FWD
A pointwise rectified linear activation function of the input tensor is computed.
CUDNN_POINTWISE_TANH_FWD
A pointwise tanh activation function of the input tensor is computed.
CUDNN_POINTWISE_SIGMOID_FWD
A pointwise sigmoid activation function of the input tensor is computed.
CUDNN_POINTWISE_ELU_FWD
A pointwise Exponential Linear Unit activation function of the input tensor is computed.
CUDNN_POINTWISE_GELU_FWD
A pointwise Gaussian Error Linear Unit activation function of the input tensor is computed.
CUDNN_POINTWISE_SOFTPLUS_FWD
A pointwise softplus activation function of the input tensor is computed.
CUDNN_POINTWISE_SWISH_FWD
A pointwise swish activation function of the input tensor is computed.
CUDNN_POINTWISE_GELU_APPROX_TANH_FWD
A pointwise tanh approximation of the Gaussian Error Linear Unit activation function of the input tensor is computed. The tanh GELU approximation is computed as \(0.5x\left( 1+\tanh\left[ \sqrt{2/\pi}\left( x+0.044715x^{3} \right) \right] \right)\). For more information, refer to the GAUSSIAN ERROR LINEAR UNIT (GELUS) paper.
CUDNN_POINTWISE_RELU_BWD
A pointwise first derivative of rectified linear activation of the input tensor is computed.
CUDNN_POINTWISE_TANH_BWD
A pointwise first derivative of tanh activation of the input tensor is computed.
CUDNN_POINTWISE_SIGMOID_BWD
A pointwise first derivative of sigmoid activation of the input tensor is computed.
CUDNN_POINTWISE_ELU_BWD
A pointwise first derivative of Exponential Linear Unit activation of the input tensor is computed.
CUDNN_POINTWISE_GELU_BWD
A pointwise first derivative of Gaussian Error Linear Unit activation of the input tensor is computed.
CUDNN_POINTWISE_SOFTPLUS_BWD
A pointwise first derivative of softplus activation of the input tensor is computed.
CUDNN_POINTWISE_SWISH_BWD
A pointwise first derivative of swish activation of the input tensor is computed.
CUDNN_POINTWISE_GELU_APPROX_TANH_BWD
A pointwise first derivative of the tanh approximation of the Gaussian Error Linear Unit activation of the input tensor is computed. This is computed as \(0.5\left( 1+\tanh\left( b\left( x+cx^{3} \right) \right)+bxsech^{2}\left( b\left( cx^{3}+x \right) \right)\left( 3cx^{2}+1 \right)dy \right)\) where \(b\) is \(\sqrt{2/\pi}\) and \(c\) is \(0.044715\).
CUDNN_POINTWISE_CMP_EQ
A pointwise truth value of the first tensor equal to the second tensor is computed.
CUDNN_POINTWISE_CMP_NEQ
A pointwise truth value of the first tensor not equal to the second tensor is computed.
CUDNN_POINTWISE_CMP_GT
A pointwise truth value of the first tensor greater than the second tensor is computed.
CUDNN_POINTWISE_CMP_GE
A pointwise truth value of the first tensor greater than equal to the second tensor is computed.
CUDNN_POINTWISE_CMP_LT
A pointwise truth value of the first tensor less than the second tensor is computed.
CUDNN_POINTWISE_CMP_LE
A pointwise truth value of the first tensor less than equal to the second tensor is computed.
CUDNN_POINTWISE_LOGICAL_AND
A pointwise truth value of the first tensor logical
AND
second tensor is computed.CUDNN_POINTWISE_LOGICAL_OR
A pointwise truth value of the first tensor logical
OR
second tensor is computed.CUDNN_POINTWISE_LOGICAL_NOT
A pointwise truth value of input tensors logical
NOT
is computed.CUDNN_POINTWISE_GEN_INDEX
A pointwise index value of the input tensor is generated along a given axis.
CUDNN_POINTWISE_BINARY_SELECT
A pointwise value is selected amongst two input tensors based on a given predicate tensor.
CUDNN_POINTWISE_RECIPROCAL
A pointwise reciprocal of the input tensor is computed. In other words, for every element x in the input tensor, 1/x is computed.
cudnnReduceTensorOp_t#
This enumerated type is deprecated and is currently only used by deprecated APIs. Consider using replacements for the deprecated APIs that use this enumerated type.
cudnnReduceTensorOp_t
is an enumerated type used to indicate the Tensor Core operation to be used by the cudnnReduceTensor() routine. This enumerated type is used as a field for the cudnnReduceTensorDescriptor_t descriptor.
Values
CUDNN_REDUCE_TENSOR_ADD
The operation to be performed is addition.
CUDNN_REDUCE_TENSOR_MUL
The operation to be performed is multiplication.
CUDNN_REDUCE_TENSOR_MIN
The operation to be performed is a minimum comparison.
CUDNN_REDUCE_TENSOR_MAX
The operation to be performed is a maximum comparison.
CUDNN_REDUCE_TENSOR_AMAX
The operation to be performed is a maximum comparison of absolute values.
CUDNN_REDUCE_TENSOR_AVG
The operation to be performed is averaging.
CUDNN_REDUCE_TENSOR_NORM1
The operation to be performed is addition of absolute values.
CUDNN_REDUCE_TENSOR_NORM2
The operation to be performed is a square root of the sum of squares.
CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS
The operation to be performed is multiplication, not including elements of value zero.
cudnnReorderType_t#
This enumerated type is deprecated and is currently only used by deprecated APIs. Consider using replacements for the deprecated APIs that use this enumerated type.
cudnnReorderType_t
is an enumerated type to set the convolution reordering type. The reordering type can be set by cudnnSetConvolutionReorderType() and its status can be read by cudnnGetConvolutionReorderType().
typedef enum { CUDNN_DEFAULT_REORDER = 0, CUDNN_NO_REORDER = 1, } cudnnReorderType_t;
cudnnResampleMode_t#
cudnnResampleMode_t
is an enumerated type to indicate the resample mode in the backend resample operations.
typedef enum { CUDNN_RESAMPLE_NEAREST = 0, CUDNN_RESAMPLE_BILINEAR = 1, CUDNN_RESAMPLE_AVGPOOL = 2, CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2, CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4, CUDNN_RESAMPLE_MAXPOOL = 3, } cudnnResampleMode_t;
cudnnRngDistribution_t#
cudnnRngDistribution_t
is an enumerated type to indicate the distribution to be used in the backend Rng
(random number generator) operation.
typedef enum { CUDNN_RNG_DISTRIBUTION_BERNOULLI, CUDNN_RNG_DISTRIBUTION_UNIFORM, CUDNN_RNG_DISTRIBUTION_NORMAL, } cudnnRngDistribution_t;
Values
CUDNN_RNG_DISTRIBUTION_BERNOULLI
The bernoulli distribution is used for the random number generation. The attribute
CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY
can be used to specify the probability of generating 1’s.CUDNN_RNG_DISTRIBUTION_UNIFORM
The uniform distribution is used for the random number generation. The attribute
CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM
andCUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM
can be used to specify the minimum and maximum value between which the random numbers should be uniformly generated.CUDNN_RNG_DISTRIBUTION_NORMAL
The normal distribution is used for the random number generation. The attribute
CUDNN_ATTR_RNG_NORMAL_DIST_MEAN
andCUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION
can be used to specify the mean and standard deviation of the random number generator.
cudnnSeverity_t#
cudnnSeverity_t
is an enumerated type passed to the customized callback function for logging that users may set. This enumerated type describes the severity level of the item, so the customized logging callback may react differently. The numerical values are the same as those used for setting the CUDNN_LOGLEVEL_DBG
environment variable.
Values
CUDNN_SEV_FATAL = 0
This value indicates a fatal error emitted by cuDNN.
CUDNN_SEV_ERROR = 1
This value indicates a normal error emitted by cuDNN.
CUDNN_SEV_WARNING = 2
This value indicates a warning emitted by cuDNN.
CUDNN_SEV_INFO = 3
This value indicates a piece of information (for example, API log) emitted by cuDNN.
cudnnSignalMode_t#
cudnnSignalMode_t
is an enumerated type to indicate the signaling mode in the backend signal operation.
typedef enum { CUDNN_SIGNAL_SET = 0, CUDNN_SIGNAL_WAIT = 1, } cudnnSignalMode_t;
Values
CUDNN_SIGNAL_SET
The flag variable is updated with the provided signal value atomically.
CUDNN_SIGNAL_WAIT
The operation blocks until the flag variable compares equal to the provided signal value.
cudnnStatus_t#
cudnnStatus_t
is an enumerated type used for function status returns. All cuDNN library functions return their status, which can be one of the following values:
Values
CUDNN_STATUS_SUCCESS
The operation was completed successfully.
CUDNN_STATUS_NOT_INITIALIZED
The cuDNN library was not initialized properly. This error is usually returned when a call to cudnnCreate() fails or when cudnnCreate() has not been called prior to calling another cuDNN routine. In the former case, it is usually due to an error in the CUDA Runtime API called by cudnnCreate() or by an error in the hardware setup.
CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH
Some cuDNN sub libraries have different versions, indicative of an installation issue.
CUDNN_STATUS_SERIALIZATION_VERSION_MISMATCH
The schema used for serialization is not what the current cuDNN library expects, thus the serialized artifact is no longer valid and needs to be re-serialized.
CUDNN_STATUS_DEPRECATED
This error code may be reported in the warning logging level as a reminder that some functionality is under deprecation and will be removed in the next major version update.
CUDNN_STATUS_LICENSE_ERROR
The functionality requested requires some license and an error was detected when trying to check the current licensing. This error can happen if the license is not present or is expired or if the environment variable
NVIDIA_LICENSE_FILE
is not set properly.CUDNN_STATUS_RUNTIME_IN_PROGRESS
Some tasks in the user stream are not completed.
CUDNN_STATUS_RUNTIME_FP_OVERFLOW
Numerical overflow occurred during the GPU kernel execution.
CUDNN_STATUS_BAD_PARAM
This is an error category code. An incorrect value or parameter was passed to the function.
CUDNN_STATUS_BAD_PARAM_NULL_POINTER
The cuDNN API has unexpectedly received a null pointer from the user.
CUDNN_STATUS_BAD_PARAM_MISALIGNED_POINTER
The cuDNN API has received a misaligned pointer from the user.
CUDNN_STATUS_BAD_PARAM_NOT_FINALIZED
The backend descriptor has not been finalized.
CUDNN_STATUS_BAD_PARAM_OUT_OF_BOUND
The cuDNN API has received an out-of-bound value.
CUDNN_STATUS_BAD_PARAM_SIZE_INSUFFICIENT
The cuDNN API has received a memory buffer with insufficient space.
CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCH
The cuDNN API has received an unexpected stream.
CUDNN_STATUS_BAD_PARAM_SHAPE_MISMATCH
The cuDNN API has received inconsistent tensor shapes.
CUDNN_STATUS_BAD_PARAM_DUPLICATED_ENTRIES
The cuDNN API has received duplicated entries.
CUDNN_STATUS_BAD_PARAM_ATTRIBUTE_TYPE
The cuDNN API has received an invalid or unsupported attribute type.
CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCH
The cuDNN API has received an unexpected CUDA graph.
CUDNN_STATUS_NOT_SUPPORTED
This is an error category code. The functionality requested is not currently supported by cuDNN.
CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN
cuDNN does not currently support such an operation graph pattern.
CUDNN_STATUS_NOT_SUPPORTED_SHAPE
cuDNN does not currently support the tensor shapes used in some specific operation or graph pattern.
CUDNN_STATUS_NOT_SUPPORTED_DATA_TYPE
cuDNN does not currently support the tensor data type.
CUDNN_STATUS_NOT_SUPPORTED_LAYOUT
cuDNN does not currently support the tensor layout.
CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDA_DRIVER
The requested functionality is not compatible with the current CUDA driver.
CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART
The requested functionality is not compatible with the current CUDA runtime.
CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH
The function requires a feature absent from the current GPU device.
CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING
A runtime library required by cuDNN cannot be found in the predefined search paths. These libraries are
libcuda.so
(nvcuda.dll
) andlibnvrtc.so
(nvrtc64_<Major Release Version><Minor Release Version>_0.dll
andnvrtc-builtins64_<Major Release Version><Minor Release Version>.dll
).CUDNN_STATUS_NOT_SUPPORTED_SUBLIBRARY_UNAVAILABLE
The requested functionality is not available due to missing a sublibrary.
CUDNN_STATUS_NOT_SUPPORTED_SHARED_MEMORY_INSUFFICIENT
The requested functionality is not available due to the insufficient shared memory size on the GPU.
CUDNN_STATUS_NOT_SUPPORTED_PADDING
The requested functionality is not available due to padding requirements.
CUDNN_STATUS_NOT_SUPPORTED_BAD_LAUNCH_PARAM
The requested functionality is not available because they lead to invalid kernel launch parameters.
CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API
The requested functionality is not available because this particular engine does not support the native CUDA graph API. (The engines that do support that API have the behavior note
CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API
.)CUDNN_STATUS_INTERNAL_ERROR
This is an error category code. An internal cuDNN operation failed.
CUDNN_STATUS_INTERNAL_ERROR_COMPILATION_FAILED
A runtime kernel has failed to be compiled.
CUDNN_STATUS_INTERNAL_ERROR_UNEXPECTED_VALUE
An unexpected internal inconsistency has been detected.
CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED
An internal host memory allocation failed inside the cuDNN library.
CUDNN_STATUS_INTERNAL_ERROR_DEVICE_ALLOCATION_FAILED
Resource allocation failed inside the cuDNN library.
CUDNN_STATUS_INTERNAL_ERROR_BAD_LAUNCH_PARAM
Invalid kernel launch parameters are unexpectedly detected.
CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED
Access to GPU memory space failed, which is usually caused by a failure to bind a texture. To correct, prior to the function call, unbind any previously bound textures. Otherwise, this may indicate an internal error/bug in the library.
CUDNN_STATUS_EXECUTION_FAILED
This is an error category code. The GPU program failed to execute. This is usually caused by a failure to launch a kernel on the GPU, which can be caused by another library that cuDNN depends on.
CUDNN_STATUS_EXECUTION_FAILED_CUDA_DRIVER
The GPU program failed to execute due to an error reported by the CUDA driver.
CUDNN_STATUS_EXECUTION_FAILED_CUBLAS
The GPU program failed to execute due to an error reported by cuBLAS.
CUDNN_STATUS_EXECUTION_FAILED_CUDART
The GPU program failed to execute due to an error reported by the CUDA runtime.
CUDNN_STATUS_EXECUTION_FAILED_CURAND
The GPU program failed to execute due to an error reported by cuRAND.
Additionally, the following macros can be used on cudnnStatus_t
error codes.
CUDNN_STATUS_CATEGORY(full_error_code)
Extract the category error code from a
cudnnStatus_t
error codefull_error_code
. This is useful for checking if an error belongs to a certain category. For example,CUDNN_STATUS_CATEGORY(CUDNN_STATUS_BAD_PARAM_NULL_POINTER)
will outputCUDNN_STATUS_BAD_PARAM
.CUDNN_STATUS_SPECIFIC_ERROR(full_error_code)
Extract the specific error code from a
cudnnStatus_t
error codefull_error_code
, that is, removing the category.CUDNN_STATUS_FULL_ERROR_CODE(category, specific_err)
Recombine a category error code and a specific code in the same category into a full
cudnnStatus_t
error code, such thatCUDNN_STATUS_FULL_ERROR_CODE(CUDNN_STATUS_CATEGORY(e), CUDNN_STATUS_SPECIFIC_ERROR(e)) == e
, for any validcudnnStatus_t
error code.
cudnnTensorFormat_t#
cudnnTensorFormat_t
is an enumerated type used by cudnnSetTensor4dDescriptor() to create a tensor with a pre-defined layout. For a detailed explanation of how these tensors are arranged in memory, refer to Data Layout Formats.
Values
CUDNN_TENSOR_NCHW
This tensor format specifies that the data is laid out in the following order: batch size, feature maps, rows, columns. The strides are implicitly defined in such a way that the data are contiguous in memory with no padding between images, feature maps, rows, and columns; the columns are the inner dimension and the images are the outermost dimension.
CUDNN_TENSOR_NHWC
This tensor format specifies that the data is laid out in the following order: batch size, rows, columns, feature maps. The strides are implicitly defined in such a way that the data are contiguous in memory with no padding between images, rows, columns, and feature maps; the feature maps are the inner dimension and the images are the outermost dimension.
CUDNN_TENSOR_NCHW_VECT_C
This tensor format specifies that the data is laid out in the following order: batch size, feature maps, rows, columns. However, each element of the tensor is a vector of multiple feature maps. The length of the vector is carried by the data type of the tensor. The strides are implicitly defined in such a way that the data are contiguous in memory with no padding between images, feature maps, rows, and columns; the columns are the inner dimension and the images are the outermost dimension. This format is only supported with tensor data types
CUDNN_DATA_INT8x4
,CUDNN_DATA_INT8x32
, andCUDNN_DATA_UINT8x4
.The
CUDNN_TENSOR_NCHW_VECT_C
can also be interpreted in the following way: The NCHW INT8x32 format is really N x (C/32) x H x W x 32 (32 Cs for every W), just as the NCHW INT8x4 format is N x (C/4) x H x W x 4 (4 Cs for every W). Hence, the VECT_C name - each W is a vector (4 or 32) of Cs.
Uncategorized#
These are the uncategorized references found in the cudnn_graph
library.
cudnnBackendDescriptor_t#
cudnnBackendDescriptor_t
is a typedef void pointer to one of many opaque descriptor
structures. The type of structure that it points to is determined by the argument when allocating the memory for the opaque structure using cudnnBackendCreateDescriptor().
Attributes of a descriptor
can be set using cudnnBackendSetAttribute(). After all required attributes of a descriptor
are set, the descriptor
can be finalized by cudnnBackendFinalize(). From a finalized descriptor
, one can query its queryable attributes using cudnnBackendGetAttribute(). Finally, the memory allocated for a descriptor
can be freed using cudnnBackendDestroyDescriptor().
API Functions#
These are the API functions in the cudnn_graph
library.
cudnnBackendPopulateCudaGraph()#
This method, part of the new Native CUDA Graph API, directly builds a CUDA graph (not to be confused with a cuDNN graph) representing the given engine. When the caller instantiates and executes this CUDA graph, the graph will execute the engine configuration plan on the VariantPack
and the finalized ExecutionPlan
on the data.
The resulting CUDA graph captures the pointers (data and working space) in the VariantPack
at the time this API is called, but it can be run arbitrarily many times with different data at the same pointers. The graph can also can later be modified in place with different VariantPack
pointers by using cudnnBackendUpdateCudaGraph().
The initial CUDA graph passed in to this API must be empty (having no nodes), and the caller should not append additional nodes to the resulting graph. However, the graph can be embedded as a child node of a larger CUDA graph, for example, by cudaGraphAddChildGraphNode
. (This is typical usage.)
Only a limited number of engines currently support this API (with more to be added in future releases of cuDNN). Those supporting it have the behavior note CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API
.
Note
This API is only supported in versions of cuDNN compiled against CUDA runtime 12.x and above.
cudnnStatus_t cudnnBackendPopulateCudaGraph(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack, cudaGraph_t cudaGraph)
Parameters
executionPlan
Input. Pointer to the finalized
ExecutionPlan
.variantPack
Input. Pointer to the finalized VariantPack consisting of:
Data pointer for each non-virtual pointer of the operation set in the execution plan.
Pointer to user-allocated workspace in global memory at least as large as the size queried from
CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE
.
cudaGraph
Input/Output. A CUDA graph handle, representing an already created, empty CUDA graph to be populated by the API.
Returns
CUDNN_STATUS_SUCCESS
The CUDA graph was generated successfully.
CUDNN_STATUS_BAD_PARAM
An incorrect or inconsistent value is encountered. For example, a required data pointer is invalid, or the given
cudaGraph
isn’t initially empty.CUDNN_STATUS_INTERNAL_ERROR
Some internal errors were encountered.
CUDNN_STATUS_EXECUTION_FAILED
An error was encountered creating a CUDA graph for the plan with the variant pack.
CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API
This particular engine does not support the native CUDA graph API.
CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART
This cuDNN was built to a CUDA runtime less than 12.0, and doesn’t support the native CUDA graph API.
cudnnBackendCreateDescriptor()#
This function allocates memory in the descriptor
for a given descriptor
type and at the location pointed by the descriptor
.
cudnnStatus_t cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor)
Note
The cudnnBackendDescriptor_t is a pointer to
void *
.
Parameters
descriptorType
Input. One among the enumerated cudnnBackendDescriptorType_t.
descriptor
Input. Pointer to an instance of cudnnBackendDescriptor_t to be created.
Returns
CUDNN_STATUS_SUCCESS
The creation was successful.
CUDNN_STATUS_NOT_SUPPORTED
Creating a descriptor of a given type is not supported.
CUDNN_STATUS_ALLOC_FAILED
The memory allocation failed.
cudnnBackendDestroyDescriptor()#
This function destroys instances of cudnnBackendDescriptor_t that were previously created using cudnnBackendCreateDescriptor().
cudnnStatus_t cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor)
Parameters
descriptor
Input. Instance of cudnnBackendDescriptor_t previously created by cudnnBackendCreateDescriptor().
Returns
CUDNN_STATUS_SUCCESS
The memory was destroyed successfully.
CUDNN_STATUS_ALLOC_FAILED
The destruction of memory failed.
Undefined Behavior
The
descriptor
was altered between theCreate
andDestroy Descriptor
.Undefined
The value pointed by the
descriptor
will beUndefined
after the memory is free and done.
cudnnBackendExecute()#
This function executes the given Engine Configuration Plan
on the VariantPack
and the finalized ExecutionPlan
on the data. The data and the working space are encapsulated in the VariantPack
.
cudnnStatus_t cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack)
Parameters
executionPlan
Input. Pointer to the finalized
ExecutionPlan
.variantPack
Input. Pointer to the finalized
VariantPack
consisting of:Data pointer for each non-virtual pointer of the operation set in the execution plan.
Pointer to user-allocated workspace in global memory at least as large as the size queried from
CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE
.
Returns
CUDNN_STATUS_SUCCESS
The
ExecutionPlan
was executed successfully.CUDNN_STATUS_BAD_PARAM
An incorrect or inconsistent value is encountered. For example, a required data pointer is invalid.
CUDNN_STATUS_INTERNAL_ERROR
Some internal errors were encountered.
CUDNN_STATUS_EXECUTION_FAILED
An error was encountered executing the plan with the variant pack.
cudnnBackendFinalize()#
This function finalizes the memory pointed to by the descriptor
. The type of finalization is done depending on the descriptorType
argument with which the descriptor
was created using cudnnBackendCreateDescriptor() or initialized using cudnnBackendInitialize().
cudnnStatus_t cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor)
cudnnBackendFinalize()
also checks all the attributes set between the create/initialization and finalize phase. If successful, cudnnBackendFinalize()
returns CUDNN_STATUS_SUCCESS
and the finalized state of the descriptor
is set to true
. In this state, setting attributes using cudnnBackendSetAttribute() is not allowed. Getting attributes using cudnnBackendGetAttribute() is only allowed when the finalized state of the descriptor
is true
.
Parameters
descriptor
Input. Instance of cudnnBackendDescriptor_t to finalize.
Returns
CUDNN_STATUS_SUCCESS
The
descriptor
was finalized successfully.CUDNN_STATUS_BAD_PARAM
Invalid
descriptor
attribute values or combination thereof is encountered.CUDNN_STATUS_NOT_SUPPORTED
Descriptor attribute values or combinations thereof not supported by the current version of cuDNN are encountered.
CUDNN_STATUS_INTERNAL_ERROR
Some internal errors are encountered.
cudnnBackendGetAttribute()#
This function retrieves the values of an attribute of a descriptor
. attributeName
is the name of the attribute whose value is requested. attributeType
is the type of attribute. requestedElementCount
is the number of elements to be potentially retrieved. The number of elements for the requested attribute is stored in elementCount
. The retrieved values are stored in arrayOfElements
. When the attribute is expected to have a single value, arrayOfElements
can be a pointer to the output value. This function will return CUDNN_STATUS_NOT_INITIALIZED
if the descriptor has not been successfully finalized using cudnnBackendFinalize().
cudnnStatus_t cudnnBackendGetAttribute( cudnnBackendDescriptor_t descriptor, cudnnBackendAttributeName_t attributeName, cudnnBackendAttributeType_t attributeType, int64_t requestedElementCount, int64_t *elementCount, void *arrayOfElements);
Parameters
descriptor
Input. Instance of cudnnBackendDescriptor_t whose attribute the user wants to retrieve.
attributeName
Input. The name of the attribute being retrieved from the
descriptor
.attributeType
Input. The type of attribute.
requestedElementCount
Input. Number of elements to output to
arrayOfElements
.elementCount
Output. Pointer for the number of elements the
descriptor
attribute has. Note thatcudnnBackendGetAttribute()
will only write the least of this andrequestedElementCount
elements toarrayOfElements
.arrayOfElements
Output. Array of elements of the datatype of the
attributeType
. The data type of theattributeType
is listed in the mapping table of cudnnBackendAttributeType_t.
Returns
CUDNN_STATUS_SUCCESS
The
attributeName
was retrieved from the descriptor
successfully.CUDNN_STATUS_BAD_PARAM
One or more invalid or inconsistent argument values were encountered. Some examples include:
attributeName
is not a valid attribute for the descriptor.attributeType
is not one of the valid types for the attribute.
CUDNN_STATUS_NOT_INITIALIZED
The
descriptor
has not been successfully finalized using cudnnBackendFinalize().
cudnnBackendInitialize()#
This function has been deprecated in cuDNN 9.2.
This function repurposes a pre-allocated memory pointed to by a descriptor
of size sizeInByte
to a backend descriptor
of type descriptorType
. The finalized state of the descriptor
is set to false
.
cudnnStatus_t cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor, cudnnBackendDescriptorType_t descriptorType, size_t sizeInBytes)
Parameters
descriptor
Input. Instance of cudnnBackendDescriptor_t to be initialized.
descriptorType
Input. Enumerated value for the type of cuDNN backend
descriptor
.sizeInBytes
Input. Size of memory pointed to by
descriptor
.
Returns
CUDNN_STATUS_SUCCESS
The memory was initialized successfully.
CUDNN_STATUS_BAD_PARAM
An invalid or inconsistent argument value is encountered. Some examples include:
descriptor
is anullptr
sizeInBytes
is less than the size required by thedescriptor
type
cudnnBackendSetAttribute()#
This function sets an attribute of a descriptor
to values provided as a pointer. descriptor
is the descriptor
to be set. attributeName
is the name of the attribute to be set. attributeType
is the type of attribute. The value to which the attribute is set, is pointed by the arrayOfElements
. The number of elements is given by elementCount
. This function will return CUDNN_STATUS_NOT_INITIALIZED
if the descriptor
is already successfully finalized using cudnnBackendFinalize().
cudnnStatus_t cudnnBackendSetAttribute( cudnnBackendDescriptor_t descriptor, cudnnBackendAttributeName_t attributeName, cudnnBackendAttributeType_t attributeType, int64_t elementCount, void *arrayOfElements);
Parameters
descriptor
Input. Instance of cudnnBackendDescriptor_t whose attribute is being set.
attributeName
Input. The name of the attribute being set on the
descriptor
.attributeType
Input. The type of attribute.
elementCount
Input. Number of elements being set.
arrayOfElements
Input. The starting location for an array from where to read the values from. The elements of the array are expected to be of the datatype of the
attributeType
. The datatype of theattributeType
is listed in the mapping table of cudnnBackendAttributeType_t.
Returns
CUDNN_STATUS_SUCCESS
The
attributeName
was set to thedescriptor
.CUDNN_STATUS_NOT_INITIALIZED
The backend
descriptor
pointed to by thedescriptor
is already in the finalized state.CUDNN_STATUS_BAD_PARAM
The function is called with arguments that correspond to invalid values. Some examples include:
attributeName
is not a settable attribute ofdescriptor
.attributeType
is incorrect for thisattributeName
.elementCount
value is unexpected.arrayOfElements
contains values invalid for theattributeType
.
CUDNN_STATUS_NOT_SUPPORTED
The values to which the attributes are being set are not supported by the current version of cuDNN.
cudnnBackendUpdateCudaGraph()#
This method, part of the new Native CUDA Graph API, updates an existing CUDA graph previously populated by cudnnBackendPopulateCudaGraph() (or a clone thereof) with a new variantPack
.
Only a limited number of engines currently support this API (with more to be added in future releases of cuDNN). Those supporting it have the behavior note CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API
.
Note
This API is only supported in versions of cuDNN compiled against CUDA runtime 12.x and above.
cudnnStatus_t cudnnBackendUpdateCudaGraph(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack, cudaGraph_t cudaGraph)
Parameters
executionPlan
Input. Pointer to the finalized
ExecutionPlan
. This must match theExecutionPlan
originally passed to cudnnBackendPopulateCudaGraph().variantPack
Input. Pointer to a finalized
VariantPack
consisting of the following pointers, which replace theVariantPack
pointers captured in the CUDA graph:Data pointer for each non-virtual pointer of the operation set in the execution plan.
Pointer to a user-allocated workspace in global memory at least as large as the size queried from
CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE
.
cudaGraph
Input. Pointer to an existing CUDA graph handle. This graph must have been populated by cudnnBackendPopulateCudaGraph(), or be a clone thereof (for example, as created by
cudaGraphClone
, or as embedded in a larger graph by usingcudaGraphAddChildGraphNode
).
Returns
CUDNN_STATUS_SUCCESS
The CUDA graph was updated successfully.
CUDNN_STATUS_BAD_PARAM
An incorrect or inconsistent value is encountered. For example, a required data pointer is invalid.
CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCH
The CUDA graph doesn’t appear to have been populated by cudnnBackendPopulateCudaGraph(), (or a clone thereof) for this execution plan, or contains unexpected additional nodes.
CUDNN_STATUS_INTERNAL_ERROR
Some internal errors were encountered.
CUDNN_STATUS_EXECUTION_FAILED
An error was encountered updating a CUDA graph for the plan with the variant pack.
CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API
This particular engine does not support the native CUDA graph API.
CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART
This cuDNN was built to a CUDA runtime less than 12.0, and doesn’t support the native CUDA graph API.
cudnnCreate()#
This function initializes the cuDNN library and creates a handle to an opaque structure holding the cuDNN library context. It allocates hardware resources on the host and device and must be called prior to making any other cuDNN library calls.
cudnnStatus_t cudnnCreate(cudnnHandle_t *handle)
The cuDNN library handle is tied to the current CUDA device (context). To use the library on multiple devices, one cuDNN handle needs to be created for each device.
For a given device, multiple cuDNN handles with different configurations (for example, different current CUDA streams) may be created. Because cudnnCreate()
allocates some internal resources, the release of those resources by calling cudnnDestroy() will implicitly call cudaDeviceSynchronize; therefore, the recommended best practice is to call cudnnCreate()
or cudnnDestroy() outside of performance-critical code paths.
For multithreaded applications that use the same device from different threads, the recommended programming model is to create one (or a few, as is convenient) cuDNN handles per thread and use that cuDNN handle for the entire life of the thread.
Parameters
handle
Output. Pointer to pointer where to store the address to the allocated cuDNN handle. For more information, refer to cudnnHandle_t.
Returns
CUDNN_STATUS_BAD_PARAM
Invalid (
NULL
) input pointer supplied.CUDNN_STATUS_NOT_INITIALIZED
No compatible GPU found, CUDA driver not installed or disabled, CUDA runtime API initialization failed.
CUDNN_STATUS_ARCH_MISMATCH
NVIDIA GPU architecture is too old.
CUDNN_STATUS_ALLOC_FAILED
Host memory allocation failed.
CUDNN_STATUS_INTERNAL_ERROR
CUDA resource allocation failed.
CUDNN_STATUS_LICENSE_ERROR
cuDNN license validation failed (only when the feature is enabled).
CUDNN_STATUS_SUCCESS
cuDNN handle was created successfully.
cudnnDestroy()#
This function releases the resources used by the cuDNN handle. This function is usually the last call made to cuDNN with a particular handle. Because cudnnCreate() allocates internal resources, the release of those resources by calling cudnnDestroy()
will implicitly call cudaDeviceSynchronize; therefore, the recommended best practice is to call cudnnCreate() or cudnnDestroy()
outside of performance-critical code paths.
cudnnStatus_t cudnnDestroy(cudnnHandle_t handle)
Parameters
handle
Input. The cuDNN handle to be destroyed.
Returns
CUDNN_STATUS_SUCCESS
The cuDNN context destruction was successful.
cudnnGetCallback()#
This function queries the internal states of cuDNN error reporting functionality.
cudnnStatus_t cudnnGetCallback( unsigned *mask, void **udata, cudnnCallback_t *fptr)
Parameters
mask
Output. Pointer to the address where the current internal error reporting message bit mask will be outputted.
udata
Output. Pointer to the address where the current internally stored
udata
address will be stored.fptr
Output. Pointer to the address where the current internally stored
callback
function pointer will be stored. When the built-in default callback function is used,NULL
will be outputted.
Returns
CUDNN_STATUS_SUCCESS
The function launched successfully.
CUDNN_STATUS_BAD_PARAM
If any of the input parameters are
NULL
.
cudnnGetCudartVersion()#
The same version of a given cuDNN library can be compiled against different CUDA toolkit versions. This routine returns the CUDA toolkit version that the currently used cuDNN library has been compiled against.
size_t cudnnGetCudartVersion()
cudnnGetErrorString()#
This function converts the cuDNN status code to a NULL
terminated (ASCIIZ) static string. For example, when the input argument is CUDNN_STATUS_SUCCESS
, the returned string is CUDNN_STATUS_SUCCESS
. When an invalid status value is passed to the function, the returned string is CUDNN_UNKNOWN_STATUS
.
const char * cudnnGetErrorString(cudnnStatus_t status)
Parameters
status
Input. cuDNN enumerant status code.
Returns
Pointer to a static, NULL terminated string with the status name.
cudnnGetLastErrorString()#
This function retrieves the last encountered cuDNN error message in the current thread to a NULL
terminated (ASCIIZ) string. Inside the cuDNN library, the messages are stored in thread local buffers. The error is cleared after the user calls this API to retrieve it.
void cudnnGetLastErrorString(char *message, size_t max_size);
Parameters
message
Output. Pointer to a character buffer that can store the error message. As we do not manage the thread-safety of the pre-allocated output buffer “message” to avoid unnecessary overhead, we ask that the user ensure it is thread-safe themselves on their own need-basis.
max_size
Input. Maximum size that can be stored in the location pointed to by message. The output is strictly limited by the size limit
max_size
, also counting the terminator character\0
, which will be automatically appended to the message if there is space.
cudnnGetMaxDeviceVersion()#
This function returns the maximum SM version that the cuDNN library is aware of and supports natively. Any SM version higher than this would be supported in forward compatibility mode. For more information about forward compatibility, refer to the cuDNN Developer Guide.
size_t cudnnGetMaxDeviceVersion(void);
Returns
A size_t
type value indicating the latest known SM number for the current version of the library. For example, if NVIDIA Hopper (GH100) is the latest known SM that the library is aware of, the value returned would be 900
.
cudnnGetProperty()#
This function writes a specific part of the cuDNN library version number into the provided host storage.
cudnnStatus_t cudnnGetProperty( libraryPropertyType type, int *value)
Parameters
type
Input. Enumerant type that instructs the function to report the numerical value of the cuDNN major version, minor version, or the patch level depending on whether
type
is set toMAJOR_VERSION
,MINOR_VERSION
, orPATCH_LEVEL
.value
Output. Host pointer where the version information should be written.
Returns
CUDNN_STATUS_INVALID_VALUE
Invalid value of the
type
argument.CUDNN_STATUS_SUCCESS
Version information was stored successfully at the provided address.
cudnnGetStream()#
This function retrieves the user CUDA stream programmed in the cuDNN handle. When the user’s CUDA stream is not set in the cuDNN handle, this function reports the null-stream.
cudnnStatus_t cudnnGetStream( cudnnHandle_t handle, cudaStream_t *streamId)
Parameters
handle
Input. Pointer to the cuDNN handle.
streamId
Output. Pointer where the current CUDA stream from the cuDNN handle should be stored.
Returns
CUDNN_STATUS_BAD_PARAM
Invalid (
NULL
) handle.CUDNN_STATUS_SUCCESS
The stream identifier was retrieved successfully.
cudnnGetVersion()#
This function returns the version number of the cuDNN library. It returns the CUDNN_VERSION
defined present in the cudnn.h
header file. Starting with release R2, the routine can be used to identify dynamically the current cuDNN library used by the application. The defined CUDNN_VERSION
can be used to have the same application linked against different cuDNN versions using conditional compilation statements.
size_t cudnnGetVersion()
cudnnGraphVersionCheck()#
Cross-library version checker. Each sub-library has a version checker that checks whether its own version matches that of its dependencies.
cudnnStatus_t cudnnGraphVersionCheck(void);
Returns
CUDNN_STATUS_SUCCESS
The version check passed.
CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH
The versions are inconsistent.
cudnnQueryRuntimeError()#
cuDNN library functions perform extensive input argument checking before launching GPU kernels. The last step is to verify that the GPU kernel actually started. When a kernel fails to start, CUDNN_STATUS_EXECUTION_FAILED
is returned by the corresponding API call. Typically, after a GPU kernel starts, no runtime checks are performed by the kernel itself - numerical results are simply written to output buffers.
cudnnStatus_t cudnnQueryRuntimeError( cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag)
When the CUDNN_BATCHNORM_SPATIAL_PERSISTENT
mode is selected in cudnnBatchNormalizationForwardTraining() or cudnnBatchNormalizationBackward(), the algorithm may encounter numerical overflows where CUDNN_BATCHNORM_SPATIAL
performs just fine albeit at a slower speed. The user can invoke cudnnQueryRuntimeError() to make sure numerical overflows did not occur during the kernel execution. Those issues are reported by the kernel that performs computations.
cudnnQueryRuntimeError() can be used in polling and blocking software control flows. There are two polling modes (CUDNN_ERRQUERY_RAWCODE
and CUDNN_ERRQUERY_NONBLOCKING
) and one blocking mode CUDNN_ERRQUERY_BLOCKING
.
CUDNN_ERRQUERY_RAWCODE
reads the error storage location regardless of the kernel completion status. The kernel might not even start and the error storage (allocated per cuDNN handle) might be used by an earlier call.
CUDNN_ERRQUERY_NONBLOCKING
checks if all tasks in the user stream are completed. The cudnnQueryRuntimeError() function will return immediately and report CUDNN_STATUS_RUNTIME_IN_PROGRESS
in rstatus
if some tasks in the user stream are pending. Otherwise, the function will copy the remote kernel error code to rstatus
.
In the blocking mode (CUDNN_ERRQUERY_BLOCKING
), the function waits for all tasks to drain in the user stream before reporting the remote kernel error code. The blocking flavor can be further adjusted by calling cudaSetDeviceFlags
with the cudaDeviceScheduleSpin
, cudaDeviceScheduleYield
, or cudaDeviceScheduleBlockingSync
flag.
CUDNN_ERRQUERY_NONBLOCKING
and CUDNN_ERRQUERY_BLOCKING
modes should not be used when the user stream is changed in the cuDNN handle, meaning, cudnnSetStream() is invoked between functions that report runtime kernel errors and the cudnnQueryRuntimeError() function.
The remote error status reported in rstatus
can be set to: CUDNN_STATUS_SUCCESS
, CUDNN_STATUS_RUNTIME_IN_PROGRESS
, or CUDNN_STATUS_RUNTIME_FP_OVERFLOW
. The remote kernel error is automatically cleared by cudnnQueryRuntimeError().
The cudnnQueryRuntimeError() function should be used in conjunction with cudnnBatchNormalizationForwardTraining() and cudnnBatchNormalizationBackward() when the cudnnBatchNormMode_t argument is CUDNN_BATCHNORM_SPATIAL_PERSISTENT
.
Parameters
handle
Input. Handle to a previously created cuDNN context.
rstatus
Output. Pointer to the user’s error code storage.
mode
Input. Remote error query mode.
tag
Input/Output. Currently, this argument should be
NULL
.
Returns
CUDNN_STATUS_SUCCESS
No errors detected (
rstatus
holds a valid value).CUDNN_STATUS_BAD_PARAM
Invalid input argument.
CUDNN_STATUS_INTERNAL_ERROR
A stream blocking synchronization or a non-blocking stream query failed.
CUDNN_STATUS_MAPPING_ERROR
The device cannot access zero-copy memory to report kernel errors.
cudnnSetCallback()#
This function sets the internal states of cuDNN error reporting functionality.
cudnnStatus_t cudnnSetCallback( unsigned mask, void *udata, cudnnCallback_t fptr)
Parameters
mask
Input. An unsigned integer. The four least significant bits (LSBs) of this unsigned integer are used for switching on and off the different levels of error reporting messages. This applies for both the default callbacks, and for the customized callbacks. The bit position is in correspondence with the enum of cudnnSeverity_t. The user may utilize the predefined macros
CUDNN_SEV_ERROR_EN
,CUDNN_SEV_WARNING_EN
, andCUDNN_SEV_INFO_EN
to form the bit mask. When a bit is set to 1, the corresponding message channel is enabled.For example, when bit 3 is set to 1, the API logging is enabled. Currently, only the log output of level
CUDNN_SEV_INFO
is functional; the others are not yet implemented. When used for turning on and off the logging with the default callback, the user may passNULL
toudata
andfptr
. In addition, the environment variableCUDNN_LOGDEST_DBG
must be set. For more information, refer to the Deprecation Policy.CUDNN_SEV_INFO_EN = 0b1000
(functional).CUDNN_SEV_ERROR_EN = 0b0010
(functional).CUDNN_SEV_WARNING_EN = 0b0100
(functional).The output of
CUDNN_SEV_FATAL
is always enabled and cannot be disabled.udata
Input. A pointer provided by the user. This pointer will be passed to the user’s custom logging callback function. The data it points to will not be read, nor be changed by cuDNN. This pointer may be used in many ways, such as in a mutex or in a communication socket for the user’s callback function for logging. If the user is utilizing the default callback function, or doesn’t want to use this input in the customized callback function, they may pass in
NULL
.fptr
Input. A pointer to a user-supplied callback function. When
NULL
is passed to this pointer, then cuDNN switches back to the built-in default callback function. The user-supplied callback function prototype must be similar to the following (also defined in the header file):void customizedLoggingCallback (cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg);
The structure cudnnDebug_t is defined in the header file. It provides the metadata, such as time, time since start, stream ID, process and thread ID, that the user may choose to print or store in their customized callback.
The variable
msg
is the logging message generated by cuDNN. Each line of this message is terminated by\0
, and the end of the message is terminated by\0\0
. Users may select what is necessary to show in the log, and may reformat the string.
Returns
CUDNN_STATUS_SUCCESS
The function launched successfully.
cudnnSetStream()#
This function sets the user’s CUDA stream in the cuDNN handle. The new stream will be used to launch cuDNN GPU kernels or to synchronize to this stream when cuDNN kernels are launched in the internal streams. If the cuDNN library stream is not set, all kernels use the default (NULL
) stream. Setting the user stream in the cuDNN handle guarantees the issue-order execution of cuDNN calls and other GPU kernels launched in the same stream.
cudnnStatus_t cudnnSetStream( cudnnHandle_t handle, cudaStream_t streamId)
With CUDA 11.x or later, internal streams have the same priority as the stream set by the last call to this function. In CUDA graph capture mode, CUDA 11.8 or later is required in order for the stream priorities to match.
Parameters
handle
Input. Pointer to the cuDNN handle.
streamId
Input. New CUDA stream to be written to the cuDNN handle.
Returns
CUDNN_STATUS_BAD_PARAM
Invalid (
NULL
) handle.CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCH
Mismatch between the user stream and the cuDNN handle context.
CUDNN_STATUS_NOT_SUPPORTED
The stream priority is out of range.
CUDNN_STATUS_INTERNAL_ERROR
CUDA stream APIs reported further errors inside cuDNN.
CUDNN_STATUS_SUCCESS
The new stream was set successfully.
Backend Descriptor Types#
This section enumerates all valid attributes of various descriptors.
CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR, &desc)
; the cuDNN backend convolution descriptor specifies the parameters for a convolution operator for both forward and backward propagation: compute data type, convolution mode, filter dilation and stride, and padding on both sides.
Attributes
Attributes of a cuDNN backend convolution descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_CONVOLUTION_
:
CUDNN_ATTR_CONVOLUTION_COMP_TYPE
The compute type of the convolution operator.
CUDNN_TYPE_DATA_TYPE
; one element.Required attribute.
CUDNN_ATTR_CONVOLUTION_MODE
Convolution or cross-correlation mode.
CUDNN_TYPE_CONVOLUTION_MODE
; one element.Required attribute.
CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS
The number of spatial dimensions, expected array length for each of dilations, filter strides, and padding arrays.
CUDNN_TYPE_INT64
; one element.Required attribute.
CUDNN_ATTR_CONVOLUTION_DILATIONS
Filter dilation.
CUDNN_TYPE_INT64
; one or more, but at mostCUDNN_MAX_DIMS
elements.Required attribute.
CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES
Filter stride.
CUDNN_TYPE_INT64
; one or more, but at mostCUDNN_MAX_DIMS
elements.Required attribute.
CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS
Padding at the beginning of each spatial dimension.
CUDNN_TYPE_INT64
; one or more, but at mostCUDNN_MAX_DIMS
elements.Required attribute.
CUDNN_ATTR_CONVOLUTION_POST_PADDINGS
Padding at the end of each spatial dimension.
CUDNN_TYPE_INT64
; one or more, but at mostCUDNN_MAX_DIMS
elements.Required attribute.
Finalization
cudnnBackendFinalize() with a CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR
can have the following return values:
CUDNN_STATUS_BAD_PARAM
An
elemCount
argument for settingCUDNN_ATTR_CONVOLUTION_DILATIONS
,CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES
,CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS
, andCUDNN_ATTR_CONVOLUTION_POST_PADDINGS
is not equal to the value set forCUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS
.CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_ENGINE_DESCRIPTOR#
Created with descriptor type value CUDNN_BACKEND_ENGINE_DESCRIPTOR
, cuDNN backend engine descriptor describes an engine to compute an operation graph. An engine is a grouping of kernels with similar compute and numerical attributes.
Attributes
Attributes of a cuDNN backend engine descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_ENGINE_
:
CUDNN_ATTR_ENGINE_OPERATION_GRAPH
The operation graph to compute.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR.Required attribute.
CUDNN_ATTR_ENGINE_GLOBAL_INDEX
The index for the engine.
CUDNN_TYPE_INT64
; one element.Valid values are between
0
andCUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT-1
.Required attribute.
CUDNN_ATTR_ENGINE_KNOB_INFO
The descriptors of performance knobs of the engine.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR.Read-only attribute.
CUDNN_ATTR_ENGINE_NUMERICAL_NOTE
The numerical attributes of the engine.
CUDNN_TYPE_NUMERICAL_NOTE
; zero or more elements.Read-only attribute.
CUDNN_ATTR_ENGINE_LAYOUT_INFO
The preferred tensor layouts of the engine.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR.Read-only attribute.
CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE
The behavior attributes of the engine.
CUDNN_TYPE_BEHAVIOR_NOTE
; zero or more elements.Read-only attribute.
CUDNN_ATTR_ENGINE_SM_COUNT_TARGET
The number of SMs to target.
CUDNN_TYPE_INT32
; one element.Valid values are between
0
and the number of SMs on the device, where0
is default meaning all the SMs will be used.Optional attribute.
Finalization
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_STATUS_NOT_SUPPORTED
The descriptor attribute set is not supported by the current version of cuDNN. For example, the value of
CUDNN_ATTR_ENGINE_GLOBAL_INDEX
is not in a valid range.CUDNN_STATUS_BAD_PARAM
The descriptor attribute set is inconsistent or in an unexpected state. For example, the operation graph descriptor set is not already finalized.
CUDNN_BACKEND_ENGINECFG_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &desc)
; the cuDNN backend engine configuration descriptor consists of an engine descriptor and an array of knob choice descriptors. Users can query from engine config information about intermediates: computational intermediate results that can be reused between executions.
Attributes
CUDNN_ATTR_ENGINECFG_ENGINE
The backend engine.
CUDNN_TYPE_BACKEND_DESCRIPTOR
: one element, a backend descriptor of type CUDNN_BACKEND_ENGINE_DESCRIPTOR.Required attribute.
CUDNN_ATTR_ENGINECFG_KNOB_CHOICES
The engine tuning knobs and choices.
CUDNN_TYPE_BACKEND_DESCRIPTOR
: zero or more elements, backend descriptors of type CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR.
CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO
Information of the computational intermediate of this engine config.
CUDNN_TYPE_BACKEND_DESCRIPTOR
: one element, a backend descriptor of type CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR.Read-only attribute.
Currently unsupported. Placeholder for future implementation.
CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE
The size of the workspace buffer required to execute this engine config.
CUDNN_TYPE_INT64
; one element.Read-only attribute.
Finalization
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_STATUS_NOT_SUPPORTED
The descriptor attribute set is not supported by the current version of cuDNN. For example, the value of a knob is not supported.
CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR, &desc)
; the cuDNN backend engine heuristics descriptor allows users to obtain, for a given operation graph, engine configuration descriptors ranked by performance according to cuDNN’s heuristics.
Attributes
CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH
The operation graph for which heuristics result in a query.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element.Required attribute.
CUDNN_ATTR_ENGINEHEUR_MODE
The heuristic mode to query the result.
CUDNN_TYPE_HEUR_MODE
; one element.Required attribute.
CUDNN_ATTR_ENGINEHEUR_RESULTS
The result of the heuristics query.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; zero or more elements of descriptor type CUDNN_BACKEND_ENGINECFG_DESCRIPTOR.Get-only attribute.
CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGET
The number of SMs to target.
CUDNN_TYPE_INT32
; one element.Valid values are between
0
and the number of SMs on the device, where0
is default meaning all the SMs will be used.Optional attribute.
Finalization
Return values of cudnnBackendFinalize(desc)
where desc
is a cuDNN backend engine heuristics descriptor:
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, &desc)
; the cuDNN backend execution plan descriptor allows the user to specify an execution plan, which consists of a cuDNN handle, an engine configuration, and optionally an array of intermediates to compute.
Attributes
CUDNN_ATTR_EXECUTION_PLAN_HANDLE
A cuDNN handle.
CUDNN_TYPE_HANDLE
; one element.Required attribute.
CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG
An engine configuration to execute.
CUDNN_BACKEND_ENGINECFG_DESCRIPTOR
; one element.Required attribute.
CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS
Unique identifiers of intermediates to compute.
CUDNN_TYPE_INT64
; zero or more elements.Optional attribute. If set, the execution plan will only compute the specified intermediate and not any of the output tensors on the operation graph in the engine configuration.
CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS
Unique identifiers of precomputed intermediates.
CUDNN_TYPE_INT64
; zero or more elements.Optional attribute. If set, the plan will expect and use pointers for each intermediate in the variant pack descriptor during execution.
Currently unsupported. Placeholder for future implementation.
CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE
The size of the workspace buffer required to execute this plan.
CUDNN_TYPE_INT64
; one element.Read-only attribute.
CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION
The JSON representation of the serialized execution plan. Serialization and deserialization can be done by getting and setting this attribute, respectively.
CUDNN_TYPE_CHAR
; many elements, the same amount as the size of a null-terminated string of the json representation of the execution plan.
Finalization
Return values of cudnnBackendFinalize(desc)
where desc
is a cuDNN backend execution plan descriptor:
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR, &desc)
; the cuDNN backend intermediate descriptor is a read-only descriptor that contains information about an execution intermediate. An execution intermediate is some intermediate computation for an engine config in device memory that can be reused between plan executions to amortize the kernel cost. Each intermediate is identified by a unique ID. Users can query for the device memory size of the intermediate. An intermediate can depend on the data of one or more tensors identified by the tensor UIDs, or on one or more attributes of the operation graph.
This is a read-only descriptor. Users cannot set the descriptor attributes or finalize the descriptor. Users query for a finalized descriptor from an engine config descriptor.
Attributes
CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID
A unique identifier of the intermediate.
CUDNN_TYPE_INT64
; one element.Read-only attribute.
CUDNN_ATTR_INTERMEDIATE_INFO_SIZE
The required device memory size for the intermediate.
CUDNN_TYPE_INT64
; one element.Read-only attribute.
CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS
UID of tensors on which the intermediate depends.
CUDNN_TYPE_INT64
; zero or more elements.Read-only attribute.
CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES
Currently unsupported. Placeholder for future implementation.
Finalization
User does not finalize this descriptor. cudnnBackendFinalize(desc)
with a backend intermediate descriptor returns CUDNN_STATUS_NOT_SUPPORTED
.
CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR, &desc)
; the cuDNN backend knob choice descriptor consists of the type of knobs to be set and the value to which the knob is set.
Attributes
CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE
The type of knobs to be set.
CUDNN_TYPE_KNOB_TYPE
: one element.Required attribute.
CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE
The value of the knobs to be set.
CUDNN_TYPE_INT64
: one element.Required attribute.
Finalization
Return values of cudnnBackendFinalize(desc)
where desc
is a cuDNN backend knob choice descriptor:
CUDNN_STATUS_SUCCESS
The knob choice descriptor was finalized successfully.
CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR, &desc)
; the cuDNN backend knob info descriptor consists of the type and valid value range of an engine performance knob. Valid value range is given in terms of minimum, maximum, and stride of valid values. This is a purely informative descriptor type. Setting descriptor attributes is not supported. User obtains an array of finalized descriptors, one for each knob type, from a finalized backend descriptor.
Attributes
CUDNN_ATTR_KNOB_INFO_TYPE
The type of the performance knob.
CUDNN_TYPE_KNOB_TYPE
: one element.Read-only attribute.
CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE
The largest valid choice value for this knob.
CUDNN_TYPE_INT64
: one element.Read-only attribute.
CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE
The smallest valid choice value for this knob.
CUDNN_TYPE_INT64
: one element.Read-only attribute.
CUDNN_ATTR_KNOB_INFO_STRIDE
The stride of valid choice values for this knob.
CUDNN_TYPE_INT64
: one element.Read-only attribute.
Finalization
This descriptor is read-only; it is retrieved and finalized from a cuDNN backend engine configuration descriptor. Users cannot set or finalize.
CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR#
Created with descriptor type value CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR
, cuDNN backend layout info descriptor provides information on the preferred layout for a tensor.
Attributes
CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID
The UID of the tensor.
CUDNN_TYPE_INT64
; one element.Read-only attribute.
CUDNN_ATTR_LAYOUT_INFO_TYPES
The preferred layout of the tensor.
CUDNN_TYPE_LAYOUT_TYPE
: zero or more element cudnnBackendLayoutType_t.Read-only attribute.
Finalization
This descriptor is read-only; it is retrieved and finalized from a cuDNN backend engine configuration descriptor. Users cannot set its attribute or finalize it.
CUDNN_BACKEND_MATMUL_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_MATMUL_DESCRIPTOR, &desc)
; the cuDNN backend matmul
descriptor specifies any metadata needed for the matmul
operation.
Attributes
CUDNN_ATTR_MATMUL_COMP_TYPE
The compute precision used for the
matmul
operation.CUDNN_TYPE_DATA_TYPE
; one element.Required attribute.
Finalization
Return values of cudnnBackendFinalize(desc)
where desc
is a cuDNN backend matmul
descriptor:
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR, &desc)
; the cuDNN backend concatenation operation descriptor specifies an operation node for concatenating a given vector of tensors along a given concatenation axis.
This operation also supports an in-place mode, where one of the input tensors is already assumed to be at the correct location in the output tensor, that is, they share the same device buffer.
Attributes
Attributes of a cuDNN backend concat operation descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_CONCAT_
:
CUDNN_ATTR_OPERATION_CONCAT_AXIS
The dimension which tensors are being concatenated over.
Type:
CUDNN_TYPE_INT64
Required attribute.
CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS
A vector of input tensor descriptors, which are concatenated in the same order as provided in this vector.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one or more elements of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX
The index of input tensor in the vector of input tensor descriptors that is already present in-place in the output tensor.
Type:
CUDNN_TYPE_INT64
Optional attribute.
CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC
The output tensor descriptor for the result from concatenation of input tensors.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
Finalization
cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR()
can have the following return values:
CUDNN_STATUS_BAD_PARAM
Invalid or inconsistent attribute values are encountered. Some examples include:
The tensors involved in the operation should have the same shape in all dimensions except the dimension that they are being concatenated over.
The output tensor shape in the concatenating dimension should equal the sum of tensor shape of all input tensors in that same dimension.
Concatenation axis should be a valid tensor dimension.
If provided, the in-place input tensor index should be a valid index in the vector of input tensor descriptors.
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR, &desc)
; the cuDNN backend convolution backward data operation descriptor specifies an operation node for convolution backward data to compute the gradient of input data dx
with filter tensor w
and gradient of response dy
with output \(\alpha\) scaling and residue add with \(\beta\) scaling. That is, the equation: \(dx=\alpha\left( w\bar{*}dy \right)+\beta dx\) where \(\bar{*}\) denotes the convolution backward data operator.
Attributes
Attributes of a cuDNN backend convolution descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_
:
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA
The alpha value.
CUDNN_TYPE_FLOAT
orCUDNN_TYPE_DOUBLE
; one or more elements.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA
The beta value.
CUDNN_TYPE_FLOAT
orCUDNN_TYPE_DOUBLE
; one or more elements.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC
The convolution operator descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W
The convolution filter tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX
The image gradient tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY
The response gradient tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
Finalization
In finalizing the convolution operation, the tensor dimensions of the tensor DX
, W
, and DY
are bound based on the same interpretations as the X
, W
, and Y
tensor dimensions described in the CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR section.
cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR
can have the following return values:
CUDNN_STATUS_BAD_PARAM
Invalid or inconsistent attribute values are encountered. For example, the
DX
,W
, andDY
tensors do not constitute a valid convolution operation under the convolution operator.CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR, &desc)
; the cuDNN backend convolution backward filter operation descriptor specifies an operation node for convolution backward filter to compute the gradient of filter dw
with image tensor x
and gradient of response dy
with output \(\alpha\) scaling and residue add with \(\beta\) scaling. That is, the equation: \(dw=\alpha\left( x\tilde{*}dy \right)+\beta dw\) where \(\tilde{*}\) denotes the convolution backward filter operator.
Attributes
Attributes of a cuDNN backend convolution descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_
:
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA
The alpha value.
CUDNN_TYPE_FLOAT
orCUDNN_TYPE_DOUBLE
; one or more elements.Required attribute. Required to be set before finalization.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA
The beta value.
CUDNN_TYPE_FLOAT
orCUDNN_TYPE_DOUBLE
; one or more elements.Required attribute. Required to be set before finalization.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC
The convolution operator descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR.Required attribute. Required to be set before finalization.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW
The convolution filter gradient tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute. Required to be set before finalization.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X
The image tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute. Required to be set before finalization.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY
The response gradient tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute. Required to be set before finalization.
Finalization
In finalizing the convolution operation, the tensor dimensions of the tensor X
, DW
, and DY
are bound based on the same interpretations as the X
, W
, and Y
tensor dimensions described in the CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR section.
cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR
can have the following return values:
CUDNN_STATUS_BAD_PARAM
Invalid or inconsistent attribute values are encountered. For example, the
X
,DW
, andDY
tensors do not constitute a valid convolution operation under the convolution operator.CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, &desc)
; the cuDNN backend convolution forward operation descriptor specifies an operation node for forward convolution to compute the response tensor y
of image tensor x
convoluted with filter tensor w
with output scaling \(\alpha\) and residual add with \(\beta\) scaling. That is, the equation: \(y=\alpha\left( w*x \right)+\beta y\) where \(*\) is the convolution operator in the forward direction.
Attributes
Attributes of a cuDNN backend convolution descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_
:
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA
The alpha value.
CUDNN_TYPE_FLOAT
orCUDNN_TYPE_DOUBLE
; one or more elements.Required to be set before finalization.
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA
The beta value.
CUDNN_TYPE_FLOAT
orCUDNN_TYPE_DOUBLE
; one or more elements.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC
The convolution operator descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W
The convolution filter tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X
The image tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y
The response tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
Finalization
In finalizing the convolution operation, the tensor dimensions of the tensor X
, W
, and Y
are bound based on the following interpretations:
The CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS
attribute of CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC
is the number of spatial dimensions of the convolution. The number of dimensions for tensor X
, W
, and Y
must be larger than the number of spatial dimensions by 2 or 3 depending on how users choose to specify the convolution tensors.
If the number of tensor dimensions is the number of spatial dimensions plus 2:
X
tensor dimension and stride arrays are[N, GC, …]
W
tensor dimension and stride arrays are[GK, C, …]
Y
tensor dimension and stride arrays are[N, GK, …]
Where the ellipsis …
are shorthand for spatial dimensions of each tensor, G
is the number of convolution groups, and C
and K
are the number of input and output feature maps per group. In this interpretation, it is assumed that the memory layout for each group is packed. cudnnBackendFinalize() asserts the tensors dimensions and strides are consistent with this interpretation or it returns CUDNN_STATUS_BAD_PARAM
.
cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR
can have the following return values:
CUDNN_STATUS_BAD_PARAM
Invalid or inconsistent attribute values are encountered. For example, the
X
,W
, andY
tensors do not constitute a valid convolution operation under the convolution operator.CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR#
Represents an operation that will generate per-channel statistics. The specific statistics that will be generated depends on the CUDNN_ATTR_OPERATION_GENSTATS_MODE
attribute in the descriptor. Currently, only CUDNN_GENSTATS_SUM_SQSUM
is supported for the CUDNN_ATTR_OPERATION_GENSTATS_MODE
. It will generate the sum and quadratic sum of per-channel elements of the input tensor x
. The output dimension should be all 1 except the C
dimension. Also, the C
dimension of outputs should equal the C
dimension of the input. This opaque struct can be created with cudnnBackendCreateDescriptor() (CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR)
.
Attributes
CUDNN_ATTR_OPERATION_GENSTATS_MODE
Sets the
CUDNN_TYPE_GENSTATS_MODE
of the operation. This attribute is required.CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC
The math precision of the computation. This attribute is required.
CUDNN_ATTR_OPERATION_GENSTATS_XDESC
Sets the descriptor for the input tensor
X
. This attribute is required.CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC
Sets the descriptor for the output tensor
sum
. This attribute is required.CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC
Sets the descriptor for the output tensor
quadratic
sum. This attribute is required.
Finalization
In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:
CUDNN_STATUS_BAD_PARAM
Invalid or inconsistent attribute values are encountered. Some examples include:
The number of dimensions do not match between the input and output tensors.
The input/output tensor dimensions do not agree with the above description.
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR, &desc)
; the cuDNN backend matmul
operation descriptor specifies an operation node for matmul
to compute the matrix product C by multiplying Matrix A and Matrix B, as shown in the following equation: C=AB
When using the matmul
operation, the matrices are expected to be at least rank-2 tensors. The last two dimensions are expected to correspond to either M, K or N. All the preceding dimensions are interpreted as batch dimensions. If there are zero batch dimensions then the requirements are as follows:
Case |
Matrix A |
Matrix B |
Matrix C |
---|---|---|---|
Single Matmul |
M x K |
K x N |
M x N |
For a single batch dimension we have the following requirements:
Case |
Matrix A |
Matrix B |
Matrix C |
---|---|---|---|
Single Matmul |
1 x M x K |
1 x K x N |
1 x M x N |
Batch Matmul |
B x M x K |
B x K x N |
B x M x N |
Broadcast A |
(B/c) x M x K |
B x K x N |
B x M x N |
Broadcast B |
B x M x K |
(B/c) x K x N |
B x M x N |
Where:
B indicates the batch size
M is the number of rows of the Matrix A
K is the number of columns of the input Matrix A (which is the same as the number of rows of the input Matrix B)
N is the number of columns of the input Matrix B
c is a constant integer and a factor of B
If either the batch size of Matrix A or B is set to B/c, this indicates that the matrix will be broadcasted in the batch matmul. The resulting output Matrix C will be a tensor of B x M x N.
The above broadcasting convention is extended to all the batch dimensions. Concretely, for tensors with three batch dimensions:
Case |
Matrix A |
Matrix B |
Matrix C |
---|---|---|---|
Multiple Batched Matmul |
B1 x 1 x B3 x M x K |
1 x B2 x (B3/c) x K x N |
B1 x B2 x B3 x M x N |
The functionality of having multiple batch dimensions allows you to have layouts where the batch is not packed at a single stride. This case is especially seen in multihead attention. c is only allowed to be B (leading to a batch dimension of 1) for matmul and matmul fusions. The other possible values of c are supported for Grouped Query Attention in the cuDNN Fused Flash Attention.
The addressing of the matrix elements from a given tensor can be specified using strides in the tensor descriptor. The strides represent the spacing between elements for each tensor dimension. Considering a matrix tensor A (B x M x N) with strides [BS, MS, NS], it indicates that the actual matrix element A[x, y, z] is found at (A_base_address + x * BS + y * MS + z * NS) from the linear memory space allocated for tensor A. With our current support, the innermost dimension must be packed, which requires either MS=1 or NS=1. Otherwise, there are no other technical constraints with regard to how the strides can be specified in a tensor descriptor as it should follow the aforementioned addressing formula and the strides as specified by the user.
This representation provides support for some common usages, such as leading dimension and matrix transpose as we will explain through the following examples.
The most basic case is a fully packed row-major batch matrix, without any consideration of leading dimension or transpose. In this case, BS = M*N, MS = N, and NS = 1.
Matrix transpose can be achieved by exchanging the inner and outer dimensions using strides. Namely:
To specify a non-transposed matrix: BS = M*N, MS = N, and NS = 1.
To specify matrix transpose: BS = M*N, MS = 1, and NS = M.
Leading dimension, a widely used concept in BLAS-like APIs, describes the inner dimension of the 2D array memory allocation (as opposed to the conceptual matrix dimension). It resembles the stride in a way that it defines the spacing between elements in the outer dimension. The most typical use cases where it shows difference from the matrix inner dimension is when the matrix is only part of the data in the allocated memory, addressing submatrices, or addressing matrices from an aligned memory allocation. Therefore, the leading dimension LDA in a column-major matrix A must satisfy LDA >= M, whereas in a row-major matrix A, it must satisfy LDA >= N. To transition from the leading dimension concept to using strides, this entails MS >= N and NS = 1 or MS = 1 and NS >= M. Keep in mind that, while these are some practical use cases, these inequalities do not impose technical constraints with respect to an acceptable specification of the strides.
Other commonly used GEMM features, such as alpha/beta output blending, can also be achieved using this matmul
operation along with other pointwise operations.
Attributes
Attributes of a cuDNN backend matmul
descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_MATMUL_
:
CUDNN_ATTR_OPERATION_MATMUL_ADESC
The Matrix A descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_MATMUL_BDESC
The Matrix B descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_MATMUL_CDESC
The Matrix C descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT
Number of
matmul
operations to perform in the batch on matrix.CUDNN_TYPE_INT64
; one element.Default value is
1
.
CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC
The tensor
gemm_m_override
descriptor. Allows you to override the M dimension of a batch matmul through this tensor. It is only supported as documented in the Fused Attention fprop, Fused Attention bprop, Fused Flash Attention fprop, and Fused Flash Attention bprop sections.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC
The tensor
gemm_n_override
descriptor. Allows you to override the N dimension of a batch matmul through this tensor. It is only supported as documented in the Fused Attention fprop, Fused Attention bprop, Fused Flash Attention fprop, and Fused Flash Attention bprop sections.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC
The tensor
gemm_k_override
descriptor. Allows you to override the K dimension of a batch matmul through this tensor. It is only supported as documented in the Fused Attention fprop, Fused Attention bprop, Fused Flash Attention fprop, and Fused Flash Attention bprop sections.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_MATMUL_DESC
The
matmul
operation descriptor.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_MATMUL_DESCRIPTOR.Required attribute.
Finalization
In the finalization of the matmul
operation, the tensor dimensions of the Matrices A, B, and C will be checked to ensure that they satisfy the requirements of matmul:
cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR
can have the following return values:
CUDNN_STATUS_NOT_SUPPORTED
An unsupported attribute value was encountered. For example, if not all of the Matrices A, B, and C are at least rank-2 tensors.
CUDNN_STATUS_BAD_PARAM
Invalid or inconsistent attribute values are encountered. Some examples include:
The
CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT
specified is a negative value.The
CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT
and one or more of the batch sizes of the Matrices A, B, and C are not equal to one. That is to say there is a conflict where both irregularly and regularly strided batched matmul are specified, which is not a valid use case.The dimensions of the Matrices A, B, and C do not satisfy the matmul requirements.
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR, &desc)
; the cuDNN backend normalization backward operation specifies a node for a backward normalization that takes as input the gradient tensor dY
and outputs the gradient tensor dX
and weight gradients dScale
and dBias
. The normalization mode is set using the CUDNN_ATTR_OPERATION_NORM_BWD_MODE
attribute.
Limitations
Does not support
CUDNN_GROUP_NORM
mode.
|
|
|
|
|
---|---|---|---|---|
Yes |
Yes |
Yes |
No |
Yes |
Note
In addition to single GPU, CUDNN_BATCH_NORM
also supports single node multi-GPU batch norm, while other normalization modes only support running on a single GPU. For more information, refer to the DReluForkDBn pattern.
Attributes
CUDNN_ATTR_OPERATION_NORM_BWD_MODE
Chooses the normalization mode for the norm backward operation.
CUDNN_TYPE_NORM_MODE
; one element.Required attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_XDESC
Input tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC
Saved mean input tensor descriptor for reusing the mean computed during the forward computation of the training phase.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC
Saved inverse variance input tensor descriptor for reusing the inverse variance computed during the forward computation of the training phase.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC
Gradient tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC
Normalization scale descriptor. Note that the bias descriptor is not necessary for the backward pass.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC
Scalar input tensor descriptor for the epsilon value. The epsilon values are needed only if the saved mean and variances are not passed as inputs to the operation. Note that the attribute
CUDNN_ATTR_TENSOR_IS_BY_VALUE
of this descriptor should be set totrue
.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC
Scale gradient tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC
Bias gradient tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC
Input gradient tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS
Vector of tensor descriptors for the communication buffers used in multi-GPU normalization. Typically, one buffer is provided for every GPU in the node. This is an optional attribute only used for multi-GPU tensor stats reduction.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one or more elements of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
Finalization
In the finalization stage, the attributes are checked to ensure there are no conflicts.
CUDNN_STATUS_BAD_PARAM
Invalid or inconsistent attribute values are encountered. Some examples include:
The tensor dimensions of the gradient tensors
dY
,dX
, and input tensorX
, do not match.The channel count C for the
mean
,scale
, andinv_variance
tensors do not match.
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR, &desc)
; the cuDNN backend normalization forward operation specifies a node for a forward normalization that takes as input a tensor X
and produces a normalized output Y
with the normalization mode set by the CUDNN_ATTR_OPERATION_NORM_FWD_MODE
attribute. The operation supports optional running stats computation and allows for storing the computed means and variances for reuse in the backwards calculation depending on the setting of the CUDNN_ATTR_OPERATION_NORM_FWD_PHASE
attribute.
Limitations
Does not support
CUDNN_GROUP_NORM
mode.Batch norm only supports forward training and not forward inference.
|
|
|
|
|
|
---|---|---|---|---|---|
|
Yes |
Yes |
Yes |
No |
Yes |
|
Yes |
Yes |
No |
No |
Yes |
Note
In addition to single-GPU, batch normalization supports running on single node multi-GPUs, while other normalization modes only support running on a single GPU. For more information, refer to the BNAddRelu pattern.
Attributes
CUDNN_ATTR_OPERATION_NORM_FWD_MODE
Chooses the normalization mode for the norm forward operation.
CUDNN_TYPE_NORM_MODE
; one element.Required attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_PHASE
Selects the training or inference phase for the norm forward operation.
CUDNN_TYPE_NORM_FWD_PHASE
; one element.Required attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_XDESC
Input tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC
Estimated mean input tensor descriptor for the inference phase and the computed mean output tensor descriptor for the training phase.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC
Estimated inverse variance input tensor descriptor for the inference phase and the computed inverse variance output tensor descriptor for the training phase.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC
Normalization scale input tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC
Normalization bias input tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC
Scalar input tensor descriptor for the epsilon value used in normalization calculation. Note that the attribute
CUDNN_ATTR_TENSOR_IS_BY_VALUE
of this descriptor should be set totrue
.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC
Scalar input tensor descriptor for the exponential average factor value used in running stats computation.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC
Input running mean tensor descriptor for the running stats computation in the training phase.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC
Input running variance tensor descriptor for the running stats computation in the training phase.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC
Output running
mean
tensor descriptor for the running stats computation in the training phase.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC
Output running variance tensor descriptor for the running stats computation in the training phase.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_YDESC
Tensor descriptor for the output of the normalization operation.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS
Vector of tensor descriptors for the communication buffers used in multi-GPU normalization. Typically, one buffer is provided for every GPU in the node. This is an optional attribute only used for multi-GPU tensor stats reduction.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one or more elements of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
Finalization
In the finalization stage, the attributes are checked to ensure there are no conflicts.
CUDNN_STATUS_BAD_PARAM
Invalid or inconsistent attribute values are encountered. Some examples include:
The output tensor dimensions do not match the input tensor dimensions.
The channel count C for the
mean
,scale
,bias
, andinv_variance
tensors do not match.
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR#
Represents a pointwise operation that implements the equation Y = op(alpha1 * X)
or Y = op(alpha1 * X, alpha2 * B)
depending on the operation type. The actual type of operation represented by op()
above depends on the CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR
attribute in the descriptor. This operation descriptor supports operations with single-input single-output.
For a list of supported operations, refer to the cudnnPointwiseMode_t section.
For dual-input pointwise operations, broadcasting is assumed when a tensor dimension in one of the tensors is 1
while the other tensors corresponding dimension is not 1
.
For three-input single-output pointwise operations, we do not support broadcasting in any tensor.
This opaque struct can be created with cudnnBackendCreateDescriptor() (CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR).
Attributes
CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR
Sets the descriptor containing the mathematical settings of the pointwise operation. This attribute is required.
CUDNN_ATTR_OPERATION_POINTWISE_XDESC
Sets the descriptor for the input tensor
X
. This attribute is required for pointwise mathematical functions or activation forward propagation computations.CUDNN_ATTR_OPERATION_POINTWISE_BDESC
If the operation requires two inputs, such as
add
ormultiply
, this attribute sets the second input tensorB
. If the operation requires only 1 input, this field is not used and should not be set.CUDNN_ATTR_OPERATION_POINTWISE_YDESC
Sets the descriptor for the output tensor
Y
. This attribute is required for pointwise mathematical functions or activation forward propagation computations.CUDNN_ATTR_OPERATION_POINTWISE_TDESC
Sets the descriptor for the tensor
T
. This attribute is required forCUDNN_ATTR_POINTWISE_MODE
set toCUDNN_POINTWISE_BINARY_SELECT
and acts as the mask based on which the selection is done.CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1
Sets the scalar
alpha1
value in the equation. Can be infloat
orhalf
. This attribute is optional, if not set, the default value is1.0
.CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2
If the operation requires 2 inputs, such as
add
ormultiply
. This attribute sets the scalaralpha2
value in the equation. Can be infloat
orhalf
. This attribute is optional, if not set, the default value is1.0
. If the operation requires only 1 input, this field is not used and should not be set.CUDNN_ATTR_OPERATION_POINTWISE_DXDESC
Sets the descriptor for the output tensor
dX
. This attribute is required for pointwise activation back propagation computations.CUDNN_ATTR_OPERATION_POINTWISE_DYDESC
Sets the descriptor for the input tensor
dY
. This attribute is required for pointwise activation back propagation computations.
Finalization
In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:
CUDNN_STATUS_BAD_PARAM
Invalid or inconsistent attribute values are encountered. Some examples include:
The number of dimensions do not match between the input and output tensors.
The input/output tensor dimensions do not agree with the above described automatic broadcasting rules.
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR#
The cuDNN backend reduction
operation descriptor represents an operation node that implements reducing values of an input tensor X
in one or more dimensions to get an output tensor Y
. The math operation and compute data type used for reducing tensor values is specified via CUDNN_ATTR_OPERATION_REDUCTION_DESC
.
This operation descriptor can be created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR, &desc)
.
The output tensor Y
should be the same size as that of input tensor X
, except dimensions where its size is 1
.
There is a special use case for Grouped Query Attention and Multi Query Attention in cuDNN Fused Flash Attention where some dimensions in the output tensor Y
can also be factors of the corresponding dimensions in the input tensor X
.
Attributes
Attributes of a cuDNN backend reduction
descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_REDUCTION_
:
CUDNN_ATTR_OPERATION_REDUCTION_XDESC
The matrix
X
descriptor.CUDNN_TYPE_BACKEND_DESCRIPTOR
one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_REDUCTION_YDESC
The matrix
Y
descriptor.CUDNN_TYPE_BACKEND_DESCRIPTOR
one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_REDUCTION_DESC
The
reduction
operation descriptor.CUDNN_TYPE_BACKEND_DESCRIPTOR
one element of descriptor type CUDNN_BACKEND_REDUCTION_DESCRIPTOR.Required attribute.
Finalization
In the finalization of the reduction
operation, the dimensions of tensors X
and Y
are checked to ensure that they satisfy the requirements of the reduction operation.
cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR
can have the following return values:
CUDNN_STATUS_BAD_PARAM
Invalid or inconsistent attribute values are encountered. For example, the dimensions of the tensors
X
andY
do not satisfy the requirements of the reduction operation.CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR, &desc)
; the cuDNN backend resample backward operation descriptor specifies an operation node for backward resampling. It computes the input tensor gradient dx
from output tensor gradient dy
with backward resampling done according to CUDNN_ATTR_RESAMPLE_MODE
with output scaling \(\alpha\) and residual add with \(\beta\) scaling.
Attributes
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC
Resample operation descriptor (CUDNN_BACKEND_RESAMPLE_DESCRIPTOR) instance contains metadata about the operation.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_RESAMPLE_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC
Input tensor gradient descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC
Output tensor gradient descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC
Tensor containing maxpool or nearest neighbor resampling indices to be used in backprop.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA
Sets the alpha parameter used in blending.
CUDNN_TYPE_DOUBLE
orCUDNN_TYPE_FLOAT
; one element.Optional attribute.
Default value is
1.0
.
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA
Sets the beta parameter used in blending.
CUDNN_TYPE_DOUBLE
orCUDNN_TYPE_FLOAT
; one element.Optional attribute.
Default value is
0.0
.
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC
Input tensor
X
descriptor.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
Required for NCHW layout.
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC
Input tensor
Y
descriptor.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
Required for NCHW layout.
Finalization
In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:
CUDNN_STATUS_BAD_PARAM
Invalid or inconsistent attribute values are encountered. Some examples include:
The output shape calculated based on the padding and strides does not match the given output tensor dimensions.
The shape of
YDESC
andIDXDESC
(if given) do not match.
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR, &desc)
; the cuDNN backend resample forward operation descriptor specifies an operation node for forward resampling. It computes the output tensor y
of image tensor x
resampled according to CUDNN_ATTR_RESAMPLE_MODE
, with output scaling \(\alpha\) and residual add with \(\beta\) scaling.
The resampling mode acts independently on each spatial dimension. For spatial dimension i
, the output spatial dimension size y_i
can be calculated by combining input image’s spatial dimension size x_i
, post padding post_i
, pre padding pre_i
, stride s_i
, and window size w_i
as: y_i = 1+(x_i + post_i + pre_i - w_i) / s_i
Attributes
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC
Resample operation descriptor (CUDNN_BACKEND_RESAMPLE_DESCRIPTOR) instance contains metadata about the operation.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_RESAMPLE_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC
Input tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC
Output tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC
Tensor containing maxpool or nearest neighbor resampling indices to be used in backprop.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute (primarily used for use cases involving training).
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA
Sets the alpha parameter used in blending.
CUDNN_TYPE_DOUBLE
orCUDNN_TYPE_FLOAT
; one element.Optional attribute.
Default value is
1.0
.
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA
Sets the beta parameter used in blending.
CUDNN_TYPE_DOUBLE
orCUDNN_TYPE_FLOAT
; one element.Optional attribute.
Default value is
0.0
.
Finalization
In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:
CUDNN_STATUS_BAD_PARAM
Invalid or inconsistent attribute values are encountered. Some examples include:
The output shape calculated based on the padding and strides does not match the given output tensor dimensions.
The shape of the
YDESC
andIDXDESC
(if given) do not match.
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR, &desc)
; the cuDNN backend Rng
operation descriptor specifies an operation node for generating a tensor with random numbers based on the probability distribution specified in the Rng
descriptor.
The random numbers are generated using a Philox random number generator (RNG) as described in PyTorch. The Philox object takes a seed value, a subsequence for starting the generation, and an offset for the subsequence. Seed and offset can be set by using the attributes. The subsequence is internally set, to ensure independent random numbers.
Attributes
CUDNN_ATTR_OPERATION_RNG_DESC
Rng
descriptor (CUDNN_BACKEND_RNG_DESCRIPTOR) instance containing metadata about the operation.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_RNG_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_RNG_YDESC
Output tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_RNG_SEED
Sets the seed for the random number generator which creates the
Y
tensor. It can be a host INT64 value or a backend descriptor bound to a value on the device. Only supports a tensor with all dimensions set to1
and all strides set to1
.CUDNN_TYPE_INT64
; one element orCUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
Default value is
0
.
CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC
Tensor descriptor for the offset used in the RNG Philox object. Only supports a tensor with all dimensions set to
1
and all strides set to1
.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
Finalization
In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:
CUDNN_STATUS_BAD_PARAM
CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC
orCUDNN_ATTR_OPERATION_RNG_SEED
do not have all dimensions and strides set to1
.CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR, &desc)
; the cuDNN backend paged cache load operation descriptor is used with a fused flash attention fprop graph, and specifies an operation node for reconstructing the k- or v-cache.
The k/v-cache is reconstructed by using a page table tensor to look up the location of a specific sequence ID in a non-contiguous container tensor. Storing a k/v-cache non-contiguously enables efficient memory management by avoiding fragmentation. For more information, refer to the Paged Attention paper.
Attributes
CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_YDESC
Virtual output tensor descriptor, containing the reconstructed k/v-cache.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_CONTAINER_DESC
A non-virtual tensor descriptor with dimensions [num_blocks,H,block_size,D] containing the k/v-cache. The k/v-cache is divided into
num_blocks
of [H,block_size,D] tensors, whereblock_size
is a parameter chosen by the user. A smallerblock_size
leads to less fragmentation, but also less parallelism.num_blocks
is arbitrary and depends on the size of the allocated k/v-cache.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_PAGE_TABLE_DESC
A non-virtual tensor descriptor of dimensions [B,1,ceil(max_seq_size/block_size),1] pointing to the lookup table.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Datatype: INT32
Required attribute.
CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_SEQUENCE_DESC
A non-virtual [B,1,1,1] tensor descriptor indicating which sequence numbers from the k/v-cache are requested. For each batch, all items from the container will be copied, from sequence 0 to sequence number - 1.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Datatype: INT32 or INT64
Sequence numbers are in the interval [1, max_seq_size]
Required attribute.
Finalization
In the finalization stage, the attributes are cross-checked to make sure there are no conflicts. The status below may be returned:
CUDNN_STATUS_BAD_PARAM
Types or dimensions of one or more of the input/output tensors are invalid.
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR, &desc)
; the cuDNN backend signal operation descriptor specifies an operation node for updating or waiting on a flag variable. Signaling operations can be used to communicate between cuDNN operation graphs, even with operation graphs in another GPU.
This operation, to connect to other nodes in the graph, also has a pass-through input tensor, which is not operated on and is just passed along to the output tensor. This mandatory pass-through input tensor helps in determining the predecessor node after which the signal operation should be executed. The optional output tensor helps in determining the successor node before which the signal execution should have completed. It is also guaranteed that for a non-virtual tensor as the output tensor, all writes for the tensor will have taken place before the signal value is updated by the operation.
Attributes
CUDNN_ATTR_OPERATION_SIGNAL_MODE
The signaling mode to use.
CUDNN_TYPE_SIGNAL_MODE
Required attribute.
CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC
Flag tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_SIGNAL_VALUE
The scalar value to compare or update the flag variable with.
CUDNN_TYPE_INT64
Required attribute.
CUDNN_ATTR_OPERATION_SIGNAL_XDESC
A pass-through input tensor to enable connecting this signal operation to other nodes in the graph.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_SIGNAL_YDESC
The output tensor for the pass-through input tensor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
Finalization
In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:
CUDNN_STATUS_BAD_PARAM
Invalid or inconsistent attribute values are encountered.
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR, &desc)
; the cuDNN backend bn_finalize
statistics operation descriptor specifies an operation node for the batch norm finalize operation.
In ResNet like models, a common technique to fuse batch norm with convolutions would involve splitting the batch norm operation into three parts - genStats
, finalize
, and apply
(pointwise, scale, and bias). The genStats
operation is usually fused with the convolution operation that precedes the batch norm while the apply
is fused with the ReLU and convolution that follows the batch norm op. The batch norm finalize
operation is a buffer op between the two fusions that takes the batch norm scale
, bias
, sum
, and sqsum
produced by the genStats
operation as inputs and produces an equivalent scale
and bias
as output. The equivalent scale
and bias
are then consumed in the apply
phase. Additionally, the bn_finalize
operation also produces the running stats, mean, and inverse standard deviation as outputs.
Attributes
CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE
Sets inference or training mode for the
bn_finalize
operation.CUDNN_TYPE_BN_FINALIZE_STATS_MODE
; one element.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC
Math precision of the computation.
Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC
Input sum tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC
Input square sum tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC
Batch norm input scale tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC
Batch norm input bias tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC
Batch norm input running mean descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC
Batch norm input running variance descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC
Batch norm output running mean descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC
Batch norm output running variance descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC
Batch norm output saved mean tensor descriptor. This is computed from the sum input that’s fed in from the preceding
genStats
operation. Storing out the saved mean helps avoid recomputation in the backpropagation phase.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC
Batch norm output inverse standard deviation tensor descriptor. This is computed from the sum and square sum inputs that are fed in from the preceding
genStats
operation. Storing out the saved inv standard deviations helps avoid recomputation in the backpropagation phase.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC
Output tensor descriptor for the equivalent scale tensor. The equivalent scale tensor is typically fed as input to the batch norm
apply
computation (pointwise, scale, and bias) that follows the batch normfinalize
operation.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC
Output tensor descriptor for the equivalent bias tensor. The equivalent bias tensor is typically fed as input to the batch norm
apply
computation (pointwise, scale, and bias) that follows the batch normfinalize
operation.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC
Scalar input tensor descriptor representing the number of elements accumulated over while calculating the
sum
andsqsum
inputs. The count usually equals N*H*W in case of batch norm.CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC
Scalar input tensor descriptor for the epsilon value used in batch norm variance calculation.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERAGE_FACTOR_DESC
Scalar input tensor descriptor for the exponential average value used in batch norm running stats calculation.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR#
Created with descriptor type value CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR
, cuDNN backend operation graph descriptor describes an operation graph, a small network of one or more operations connected by virtual tensors. Operation graph defines users’ computation case or mathematical expression that they wish to compute.
Attributes
Attributes of a cuDNN backend operation graph descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATIONGRAPH_
:
CUDNN_ATTR_OPERATIONGRAPH_HANDLE
A cuDNN handle.
CUDNN_TYPE_HANDLE
; one element.Required attribute.
CUDNN_ATTR_OPERATIONGRAPH_OPS
Operation nodes to form the operation graph.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one or more elements of descriptor typeCUDNN_BACKEND_OPERATION_*_DESCRIPTOR
.Required attribute.
CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT
The number of engines to support the operation graph.
CUDNN_TYPE_INT64
; one element.Read-only attribute.
CUDNN_ATTR_OPERATIONGRAPH_ENGINE_SUPPORTED_COUNT
The number of engines that support the operation graph.
CUDNN_TYPE_INT64
; one element.Read-only attribute.
Currently unsupported. Placeholder for future implementation.
CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED
Whether dynamic shape is enabled for the operation graph. The rest of the backend API will treat the graph as a dynamic shape graph and enable this feature.
CUDNN_TYPE_BOOLEAN
; one element.
Finalization
CUDNN_STATUS_BAD_PARAM
An invalid attribute value was encountered. Some examples include:
One of the backend descriptors in
CUDNN_ATTR_OPERATIONGRAPH_OPS
is not finalized.The value
CUDNN_ATTR_OPERATIONGRAPH_HANDLE
is not a valid cuDNN handle.
CUDNN_STATUS_NOT_SUPPORTED
An unsupported attribute value was encountered. For example, the combination of operations of attribute
CUDNN_ATTR_OPERATIONGRAPH_OPS
is not supported.CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_POINTWISE_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_POINTWISE_DESCRIPTOR, &desc)
; the cuDNN backend pointwise descriptor specifies the parameters for a pointwise operator like mode, math precision, nan propagation, and so on.
Attributes
Attributes of a cuDNN backend pointwise descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_POINTWISE_
:
CUDNN_ATTR_POINTWISE_MODE
Mode of the pointwise operation.
CUDNN_TYPE_POINTWISE_MODE
; one element.Required attribute.
CUDNN_ATTR_POINTWISE_MATH_PREC
The math precision of the computation.
CUDNN_TYPE_DATA_TYPE
; one element.Required attribute.
CUDNN_ATTR_POINTWISE_NAN_PROPAGATION
Specifies a method by which to propagate NaNs.
CUDNN_TYPE_NAN_PROPOGATION
; one element.Required only for comparison based pointwise modes, like ReLU.
Current support only includes enum value
CUDNN_PROPAGATE_NAN
.Default value is
CUDNN_NOT_PROPAGATE_NAN
.
CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP
Sets the lower clip value for ReLU. If
(value < lower_clip) value = lower_clip + lower_clip_slope * (value - lower_clip)
:CUDNN_TYPE_DOUBLE
/CUDNN_TYPE_FLOAT
; one element.Default value is
0.0f
.
CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP
Sets the upper clip value for ReLU. If
(value > upper_clip) value = upper_clip
:CUDNN_TYPE_DOUBLE
/CUDNN_TYPE_FLOAT
; one element.Default value is
Numeric limit max
.
CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE
Sets the lower clip slope value for ReLU. If
(value < lower_clip) value = lower_clip + lower_clip_slope * (value - lower_clip)
:CUDNN_TYPE_DOUBLE
/CUDNN_TYPE_FLOAT
; one element.Default value is
0.0f
.
CUDNN_ATTR_POINTWISE_ELU_ALPHA
Sets the alpha value for ELU. If
(value < 0.0) value = alpha * (e^value - 1.0)
:CUDNN_TYPE_DOUBLE
/CUDNN_TYPE_FLOAT
; one element.Default value is
1.0f
.
CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA
Sets the beta value for softplus. If
value = log (1 + e^(beta * value)) / beta
:CUDNN_TYPE_DOUBLE
/CUDNN_TYPE_FLOAT
; one element.Default value is
1.0f
.
CUDNN_ATTR_POINTWISE_SWISH_BETA
Sets the beta value for swish. If
value = value / (1 + e^(-beta * value))
:CUDNN_TYPE_DOUBLE
/CUDNN_TYPE_FLOAT
; one element.Default value is
1.0f
.
CUDNN_ATTR_POINTWISE_AXIS
Sets the axis value for
GEN_INDEX
. The index will be generated for this axis.CUDNN_TYPE_INT64
; one element.Default value is
-1
.Needs to lie between
[0,input_dim_size-1]
. For example, if your input has dimensions [N,C,H,W], the axis can be set to anything in[0,3]
.
Finalization
cudnnBackendFinalize() with a CUDNN_BACKEND_POINTWISE_DESCRIPTOR
can have the following return values:
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_REDUCTION_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_REDUCTION_DESCRIPTOR, &desc)
; the cuDNN backend reduction
descriptor specifies any metadata, including the math operation and compute data type, needed for the reduction
operation.
Attributes
CUDNN_ATTR_REDUCTION_OPERATOR
The math operation used for the
reduction
operation.CUDNN_TYPE_REDUCTION_OPERATOR_TYPE
; one element.Required attribute.
CUDNN_ATTR_REDUCTION_COMP_TYPE
The compute precision used for the
reduction
operation.CUDNN_TYPE_DATA_TYPE
; one element.Required attribute.
Finalization
Return values of cudnnBackendFinalize(desc)
where desc
is CUDNN_BACKEND_REDUCTION_DESCRIPTOR
are:
CUDNN_STATUS_NOT_SUPPORTED
An unsupported attribute value was encountered. For example,
CUDNN_ATTR_REDUCTION_OPERATOR
is not set to either ofCUDNN_REDUCE_TENSOR_ADD
,CUDNN_REDUCE_TENSOR_MUL
,CUDNN_REDUCE_TENSOR_MIN
, orCUDNN_REDUCE_TENSOR_MAX
.CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_RESAMPLE_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_RESAMPLE_DESCRIPTOR, &desc)
; the cuDNN backend resample descriptor specifies the parameters for a resample operation (upsampling or downsampling) in both forward and backward propagation.
Attributes
CUDNN_ATTR_RESAMPLE_MODE
Specifies mode of resampling, for example, average pool, nearest-neighbor, and so on.
CUDNN_TYPE_RESAMPLE_MODE
; one element.Default value is
CUDNN_RESAMPLE_NEAREST
.
CUDNN_ATTR_RESAMPLE_COMP_TYPE
Compute data type for the resampling operator.
CUDNN_TYPE_DATA_TYPE
; one element.Default value is
CUDNN_DATA_FLOAT
.
CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION
Specifies a method by which to propagate NaNs.
CUDNN_TYPE_NAN_PROPAGATION
; one element.Default value is
CUDNN_NOT_PROPAGATE_NAN
.
CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS
Specifies the number of spatial dimensions to perform the resampling over.
CUDNN_TYPE_INT64
; one element.Required attribute.
CUDNN_ATTR_RESAMPLE_PADDING_MODE
Specifies which values to use for padding.
CUDNN_TYPE_PADDING_MODE
; one element.Default value is
CUDNN_ZERO_PAD
.
CUDNN_ATTR_RESAMPLE_STRIDES
Stride in each dimension for the kernel or filter.
CUDNN_TYPE_INT64
orCUDNN_TYPE_FRACTION
; at mostCUDNN_MAX_DIMS - 2
.Required attribute.
CUDNN_ATTR_RESAMPLE_PRE_PADDINGS
Padding added to the beginning of the input tensor in each dimension.
CUDNN_TYPE_INT64
orCUDNN_TYPE_FRACTION
; at mostCUDNN_MAX_DIMS - 2
.Required attribute.
CUDNN_ATTR_RESAMPLE_POST_PADDINGS
Padding added to the end of the input tensor in each dimension.
CUDNN_TYPE_INT64
orCUDNN_TYPE_FRACTION
; at mostCUDNN_MAX_DIMS - 2
.Required attribute.
CUDNN_ATTR_RESAMPLE_WINDOW_DIMS
Spatial dimensions of filter.
CUDNN_TYPE_INT64
orCUDNN_TYPE_FRACTION
; at mostCUDNN_MAX_DIMS - 2
.Required attribute.
Finalization
The return values for cudnnBackendFinalize() when called with a CUDNN_BACKEND_RESAMPLE_DESCRIPTOR
are:
CUDNN_STATUS_NOT_SUPPORTED
An unsupported attribute value was encountered. Some examples include:
An
elemCount
argument for settingCUDNN_ATTR_RESAMPLE_WINDOW_DIMS
,CUDNN_ATTR_RESAMPLE_STRIDES
,CUDNN_ATTR_RESAMPLE_PRE_PADDINGS
, andCUDNN_ATTR_RESAMPLE_POST_PADDINGS
is not equal to the value set forCUDNN_ATTR_RESAMPLE_SPATIAL_DIMS
.CUDNN_ATTR_RESAMPLE_MODE
is set toCUDNN_RESAMPLE_BILINEAR
and any of theCUDNN_ATTR_RESAMPLE_WINDOW_DIMS
are not set to2
.
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_RNG_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_RNG_DESCRIPTOR, &desc)
; the cuDNN backend Rng
descriptor specifies any metadata, including the probability distribution that will be used to generate the tensor and the distribution’s corresponding parameters.
Attributes
CUDNN_ATTR_RNG_DISTRIBUTION
The probability distribution used for the
rng
operation.CUDNN_TYPE_RNG_DISTRIBUTION
; one element.Default value is
CUDNN_RNG_DISTRIBUTION_BERNOULLI
.
CUDNN_ATTR_RNG_NORMAL_DIST_MEAN
The mean value for the normal distribution, used if
CUDNN_ATTR_RNG_DISTRIBUTION = CUDNN_RNG_DISTRIBUTION_NORMAL
.CUDNN_TYPE_DOUBLE
; one element.Default value is
-1
.
CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION
The standard deviation value for the normal distribution, used if
CUDNN_ATTR_RNG_DISTRIBUTION = CUDNN_RNG_DISTRIBUTION_NORMAL
.CUDNN_TYPE_DOUBLE
; one element.Default value is
-1
.
Finalization
Return values of cudnnBackendFinalize(desc)
where desc
is CUDNN_BACKEND_RNG_DESCRIPTOR
are:
CUDNN_STATUS_BAD_PARAM
An invalid attribute value was encountered. Some examples include:
If
CUDNN_ATTR_RNG_DISTRIBUTION = CUDNN_RNG_DISTRIBUTION_NORMAL
and the standard deviation supplied is negative.If
CUDNN_ATTR_RNG_DISTRIBUTION = CUDNN_RNG_DISTRIBUTION_UNIFORM
and the maximum value of the range is lower than minimum value.If
CUDNN_ATTR_RNG_DISTRIBUTION = CUDNN_RNG_DISTRIBUTION_BERNOULLI
and the probability supplied is negative.
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_TENSOR_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &desc)
; the cuDNN backend tensor allows users to specify the memory storage of a generic tensor. A tensor is identified by a unique identifier and described by its data type, its data byte-alignment requirements, and the extents and strides of its dimensions. Optionally, a tensor element can be vector in one of its dimensions. A tensor can also be set to be virtual when it is an intermediate variable in a computation graph and not mapped to physical global memory storage.
Attributes
Attributes of a cuDNN backend tensor descriptors are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_TENSOR_
:
CUDNN_ATTR_TENSOR_UNIQUE_ID
An integer that uniquely identifies the tensor.
CUDNN_TYPE_INT64
; one element.Required attribute.
CUDNN_ATTR_TENSOR_DATA_TYPE
Data type of tensor.
CUDNN_TYPE_DATA_TYPE
; one element.Required attribute.
CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT
Byte alignment of pointers for this tensor.
CUDNN_TYPE_INT64
; one element.Required attribute.
CUDNN_ATTR_TENSOR_DIMENSIONS
Tensor dimensions.
CUDNN_TYPE_INT64
; at mostCUDNN_MAX_DIMS
elements.Required attribute.
CUDNN_ATTR_TENSOR_STRIDES
Tensor strides.
CUDNN_TYPE_INT64
; at mostCUDNN_MAX_DIMS
elements.Required attribute.
CUDNN_ATTR_TENSOR_VECTOR_COUNT
Size of vectorization.
CUDNN_TYPE_INT64
; one element.Default value is
1
.
CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION
Index of the vectorized dimension.
CUDNN_TYPE_INT64
; one element.Required to be set before finalization if
CUDNN_ATTR_TENSOR_VECTOR_COUNT
is set to a value different than its default; otherwise it’s ignored.
CUDNN_ATTR_TENSOR_IS_VIRTUAL
Indicates whether the tensor is virtual. A virtual tensor is an intermediate tensor in the operation graph that exists in transient and not read from or written to in global device memory.
CUDNN_TYPE_BOOLEAN
; one element.Default value is
false
.
CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC
A ragged tensor, that is, a tensor with nested variable length lists as inner dimensions, will have another tensor called the ragged offset descriptor that contains offsets in memory to the next variable length list.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element.Default value is
None
.
Finalization
cudnnBackendFinalize() with a CUDNN_BACKEND_TENSOR_DESCRIPTOR
can have the following return values:
CUDNN_STATUS_BAD_PARAM
An invalid attribute value was encountered. Some examples include:
Any of the tensor dimensions or strides is not positive.
The value of the tensor alignment attribute is not divisible by the size of the data type.
CUDNN_STATUS_NOT_SUPPORTED
An unsupported attribute value was encountered. Some examples include:
The data type attribute is
CUDNN_DATA_INT8x4
,CUDNN_DATA_UINT8x4
, orCUDNN_DATA_INT8x32
.The data type attribute is
CUDNN_DATA_INT8
andCUDNN_ATTR_TENSOR_VECTOR_COUNT
value is not1
,4
, or32
.
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR, &desc)
; the cuDNN backend variant pack descriptor allows users to set up pointers to device buffers to various non-virtual tensors, identified by unique identifiers, of the operation graph, workspace, and computation intermediates.
Attributes
CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS
A unique identifier of tensor for each data pointer.
CUDNN_TYPE_INT64
; zero or more elements.Required attribute.
CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS
Tensor data device pointers.
CUDNN_TYPE_VOID_PTR
; zero or more elements.Required attribute.
CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES
Intermediate device pointers.
CUDNN_TYPE_VOID_PTR
; zero or more elements.Currently unsupported. Placeholder for future implementation.
CUDNN_ATTR_VARIANT_PACK_WORKSPACE
Workspace to device pointer.
CUDNN_TYPE_VOID_PTR
; one element.Required attribute.
Finalization
The return values for cudnnBackendFinalize() when called with a cuDNN backend variant pack descriptor are:
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR, &desc)
; the cuDNN backend kernel cache helps significantly reduce execution plan finalizing time for use cases that have same-topology dynamic shape operation graph by binding the previously compiled applicable kernel to the execution plan instead of re-compiling a new one from scratch. This is used with execution plans containing a graph with CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED
enabled.
Attributes
CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED
An attribute used to query whether a given engine config is cached.
CUDNN_TYPE_BACKEND_DESCRIPTOR
; one element.Read-only attribute using the cudnnBackendGetAttribute() API. The engine config in question is to be passed into this attribute as a constant input through
arrayOfElements
and theelementCount
will serve as the resulting output, a value of zero meaning not cached and a positive number meaning that it is cached.Required attribute.
Finalization
The return values for cudnnBackendFinalize() when called with a cuDNN backend kernel cache descriptor are:
CUDNN_STATUS_SUCCESS
The descriptor was finalized successfully.
Use Cases#
This section describes some typical use cases of the cuDNN backend API; for example, setting up a simple operation graph, setting up an engine config for that operation graph, and finally setting up an execution plan and executing it with data pointers set in a variant pack descriptor. An example of cuDNN’s native CUDA graph API is given as well.
Setting Up An Operation Graph For A Grouped Convolution#
This use case creates an operation graph with a single grouped 3D convolution forward operation. It starts by setting up the input and output tensors, binding them to a convolution forward operation, and finally setting up an operation graph with a single node.
Create tensor descriptors.
cudnnBackendDescriptor_t xDesc; cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &xDesc); cudnnDataType_t dtype = CUDNN_DATA_FLOAT; cudnnBackendSetAttribute(xDesc, CUDNN_ATTR_TENSOR_DATA_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &dtype); int64_t xDim[] = {n, g, c, d, h, w}; int64_t xStr[] = {g * c * d * h * w, c * d * h * w, d * h * w, h * w, w, 1}; int64_t xUi = 'x'; int64_t alignment = 4; cudnnBackendSetAttribute(xDesc, CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, 6, xDim); cudnnBackendSetAttribute(xDesc, CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64, 6, xStr); cudnnBackendSetAttribute(xDesc, CUDNN_ATTR_TENSOR_UNIQUE_ID, CUDNN_TYPE_INT64, 1, &xUi); cudnnBackendSetAttribute(xDesc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, CUDNN_TYPE_INT64, 1, &alignment); cudnnBackendFinalize(xDesc);
Repeat the above step for the convolution filter and output tensor descriptor. The six filter tensor dimensions are
[g, k, c, t, r, s]
and the six output tensor dimensions are [n, g, k, o, p, q], respectively. Below, when finalizing a convolution operator to which the tensors are bound, dimension consistency is checked, meaning all n, g, c, k values shared among the three tensors are required to be the same. Otherwise, CUDNN_STATUS_BAD_PARAM
status is returned. For backward compatibility with how tensors are specified in cudnnTensorDescriptor_t and used in convolution API, it is also possible to specify a 5D tensor with the following dimension:
image:
[n, g*c, d, h, w]
filter:
[g*k, c, t, r, s]
response:
[n, g*k, o, p, q]
In this format, a similar consistency check is performed when finalizing a convolution operator descriptor to which the tensors are bound.
Create, set, and finalize a convolution operator descriptor.
cudnnBackendDescriptor_t cDesc; int64_t nbDims = 3; cudnnDataType_t compType = CUDNN_DATA_FLOAT; cudnnConvolutionMode_t mode = CUDNN_CONVOLUTION; int64_t pad[] = {0, 0, 0}; int64_t filterStr[] = {1, 1, 1}; int64_t dilation[] = {1, 1, 1}; cudnnBackendCreateDescriptor(CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR, &cDesc); cudnnBackendSetAttribute(cDesc, CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS, CUDNN_TYPE_INT64, 1, &nbDims); cudnnBackendSetAttribute(cDesc, CUDNN_ATTR_CONVOLUTION_COMP_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &compType); cudnnBackendSetAttribute(cDesc, CUDNN_ATTR_CONVOLUTION_CONV_MODE, CUDNN_TYPE_CONVOLUTION_MODE, 1, &mode); cudnnBackendSetAttribute(cDesc, CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS, CUDNN_TYPE_INT64, nbDims, pad); cudnnBackendSetAttribute(cDesc, CUDNN_ATTR_CONVOLUTION_POST_PADDINGS, CUDNN_TYPE_INT64, nbDims, pad); cudnnBackendSetAttribute(cDesc, CUDNN_ATTR_CONVOLUTION_DILATIONS, CUDNN_TYPE_INT64, nbDims, dilation); cudnnBackendSetAttribute(cDesc, CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES, CUDNN_TYPE_INT64, nbDims, filterStr); cudnnBackendFinalize(cDesc);
Create, set, and finalize a convolution forward operation descriptor.
cudnnBackendDescriptor_t fprop; float alpha = 1.0; float beta = 0.5; cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, &fprop); cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &xDesc); cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &wDesc); cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &yDesc); cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &cDesc); cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA, CUDNN_TYPE_FLOAT, 1, &alpha); cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA, CUDNN_TYPE_FLOAT, 1, &beta); cudnnBackendFinalize(fprop);
Create, set, and finalize an operation graph descriptor.
cudnnBackendDescriptor_t op_graph; cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, &op_graph); cudnnBackendSetAttribute(op_graph, CUDNN_ATTR_OPERATIONGRAPH_OPS, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &fprop); cudnnBackendSetAttribute(op_graph, CUDNN_ATTR_OPERATIONGRAPH_HANDLE, CUDNN_TYPE_HANDLE, 1, &handle); cudnnBackendFinalize(op_graph);
Setting Up An Engine Configuration#
This use case describes the steps with which users can set up an engine config from a previously finalized operation graph. This is an example in which users would like to use the engine with CUDNN_ATTR_ENGINE_GLOBAL_INDEX 0
for this operation graph and do not set any performance knobs.
Create, set, and finalize an engine descriptor.
cudnnBackendDescriptor_t engine; cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINE_DESCRIPTOR, &engine); cudnnBackendSetAttribute(engine, CUDNN_ATTR_ENGINE_OPERATION_GRAPH, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &op_graph); int64_t gidx = 0; cudnnBackendSetAttribute(engine, CUDNN_ATTR_ENGINE_GLOBAL_INDEX, CUDNN_TYPE_INT64, 1, &gidx); cudnnBackendFinalize(engine);
The user can query a finalized engine descriptor with cudnnBackendGetAttribute() API call for its attributes, including the performance knobs that it has. For simplicity, this use case skips this step and assumes the user is setting up an engine config descriptor below without making any changes to performance knobs.
Create, set, and finalize an engine config descriptor. Obtain the workspace size from the engine config.
cudnnBackendDescriptor_t engcfg; cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &engcfg); cudnnBackendSetAttribute(engcfg, CUDNN_ATTR_ENGINECFG_ENGINE, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engine); cudnnBackendFinalize(engcfg); int64_t workspaceSize; cudnnBackendGetAttribute(engcfg, CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE, CUDNN_TYPE_INT64, 1, NULL, &workspaceSize);
Setting Up And Executing A Plan#
This use case describes the steps with which users set up an execution plan with a previously finalized engine config descriptor, set up the data pointer variant pack, and finally execute the plan.
Create, set, and finalize an execution plan descriptor. Obtain workspace size to allocate.
cudnnBackendDescriptor_t plan; cudnnBackendCreateDescriptor(CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, &plan); cudnnBackendSetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_HANDLE, CUDNN_TYPE_HANDLE, 1, &handle); cudnnBackendSetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engcfg); cudnnBackendFinalize(plan); int64_t workspaceSize; cudnnBackendGetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE, CUDNN_TYPE_INT64, 1, NULL, &workspaceSize);
Create, set and finalize a variant pack descriptor.
void *dev_ptrs[3] = {xData, wData, yData}; // device pointer int64_t uids[3] = {'x', 'w', 'y'}; void *workspace; cudnnBackendDescriptor_t varpack; cudnnBackendCreateDescriptor(CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR, &varpack); cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS, CUDNN_TYPE_VOID_PTR, 3, dev_ptrs); cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS, CUDNN_TYPE_INT64, 3, uids); cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_WORKSPACE, CUDNN_TYPE_VOID_PTR, 1, &workspace); cudnnBackendFinalize(varpack);
Execute the plan with a variant pack.
cudnnBackendExecute(handle, plan, varpack);
Creating and Updating a CUDA Graph from a Plan#
This use case describes the steps with which users create a CUDA graph from a previously finalized execution plan and variant pack, execute the graph on a desired stream, and update the graph with a new variant pack. Note that this use case currently only works with a limited selection of cuDNN engines.
Create a CUDA graph.
cudaGraph_t cuda_graph; cudaGraphCreate(&cuda_graph, 0); cudnnBackendPopulateCudaGraph(handle, plan, varpack, cuda_graph);
Instantiate and execute the CUDA graph.
cudaGraphExec_t cuda_graph_exec; cudaGraphInstantiate(&cuda_graph_exec, cuda_graph); cudaGraphLaunch(cuda_graph_exec, stream); // stream is a cudaStream_t cudaStreamSynchronize(stream);
Update the CUDA graph, update the instantiated graph and re-execute the graph. (This can be performed any number of times.)
cudnnBackendUpdateCudaGraph(handle, plan, new_varpack, cuda_graph); cudaGraphExecUpdateResultInfo result_info; cudaGraphExecUpdate(cuda_graph_exec, cuda_graph, &result_info); cudaGraphLaunch(cuda_graph_exec, stream); cudaStreamSynchronize(stream);
Destroy the CUDA graph when we’re all done.
cudaGraphDestroy(cuda_graph);