cudnn_graph Library#
Data Type References#
These are the data type references for the cudnn_graph library.
Struct Types#
These are the struct types for the cudnn_graph library.
cudnnDebug_t#
cudnnDebug_t is a structure used by cudnnSetCallback() and cudnnGetCallback() containing the metadata, such as time, time since start, stream ID, process and thread ID, that the user may choose to print or store in customized callbacks.
cudnnFraction_t#
cudnnFraction_t is a structure that allows a user to define int64_t fractions.
typedef struct cudnnFractionStruct { int64_t numerator; int64_t denominator; } cudnnFraction_t;
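For illustration only (the variable name is not part of the API), a fraction such as one third is expressed by filling the two fields:

#include <cudnn.h>

/* The fraction 1/3 expressed as a cudnnFraction_t. */
static const cudnnFraction_t oneThird = { 1 /* numerator */, 3 /* denominator */ };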
Enumeration Types#
These are the enumeration types for the cudnn_graph library.
cudnnActivationMode_t#
This enumerated type is deprecated and is currently only used by deprecated APIs. Consider using replacements for the deprecated APIs that use this enumerated type.
cudnnActivationMode_t is an enumerated type used to select the neuron activation function used in cudnnActivationForward(), cudnnActivationBackward(), and cudnnConvolutionBiasActivationForward().
Values
CUDNN_ACTIVATION_SIGMOID: Selects the sigmoid function.
CUDNN_ACTIVATION_RELU: Selects the rectified linear function.
CUDNN_ACTIVATION_TANH: Selects the hyperbolic tangent function.
CUDNN_ACTIVATION_CLIPPED_RELU: Selects the clipped rectified linear function.
CUDNN_ACTIVATION_ELU: Selects the exponential linear function.
CUDNN_ACTIVATION_IDENTITY: Selects the identity function, intended for bypassing the activation step in cudnnConvolutionBiasActivationForward(). (The cudnnConvolutionBiasActivationForward() function must use CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM.) Does not work with cudnnActivationForward() or cudnnActivationBackward().
CUDNN_ACTIVATION_SWISH: Selects the swish function.
cudnnBackendAttributeName_t#
cudnnBackendAttributeName_t is an enumerated type that indicates the backend descriptor attributes that can be set or retrieved using the cudnnBackendSetAttribute() and cudnnBackendGetAttribute() functions. The backend descriptor to which an attribute belongs is identified by the prefix of the attribute name.
typedef enum { CUDNN_ATTR_POINTWISE_MODE = 0, CUDNN_ATTR_POINTWISE_MATH_PREC = 1, CUDNN_ATTR_POINTWISE_NAN_PROPAGATION = 2, CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP = 3, CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP = 4, CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5, CUDNN_ATTR_POINTWISE_ELU_ALPHA = 6, CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA = 7, CUDNN_ATTR_POINTWISE_SWISH_BETA = 8, CUDNN_ATTR_POINTWISE_AXIS = 9, CUDNN_ATTR_CONVOLUTION_COMP_TYPE = 100, CUDNN_ATTR_CONVOLUTION_CONV_MODE = 101, CUDNN_ATTR_CONVOLUTION_DILATIONS = 102, CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103, CUDNN_ATTR_CONVOLUTION_POST_PADDINGS = 104, CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS = 105, CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS = 106, CUDNN_ATTR_ENGINEHEUR_MODE = 200, CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201, CUDNN_ATTR_ENGINEHEUR_RESULTS = 202, CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGET = 203, CUDNN_ATTR_ENGINEHEUR_DEVICEPROP = 204, CUDNN_ATTR_ENGINECFG_ENGINE = 300, CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301, CUDNN_ATTR_ENGINECFG_KNOB_CHOICES = 302, CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE = 303, CUDNN_ATTR_EXECUTION_PLAN_HANDLE = 400, CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG = 401, CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402, CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403, CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404, CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405, CUDNN_ATTR_EXECUTION_PLAN_KERNEL_CACHE = 406, CUDNN_ATTR_EXECUTION_PLAN_DEVICEPROP = 407, CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500, CUDNN_ATTR_INTERMEDIATE_INFO_SIZE = 501, CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS = 502, CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503, CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE = 600, CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA = 700, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA = 701, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC = 702, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W = 703, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X = 704, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y = 705, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA = 706, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA = 707, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC = 708, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W = 709, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX = 710, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY = 711, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA = 712, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA = 713, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW = 715, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X = 716, CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY = 717, CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750, CUDNN_ATTR_OPERATION_POINTWISE_XDESC = 751, CUDNN_ATTR_OPERATION_POINTWISE_BDESC = 752, CUDNN_ATTR_OPERATION_POINTWISE_YDESC = 753, CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 = 754, CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 = 755, CUDNN_ATTR_OPERATION_POINTWISE_DXDESC = 756, CUDNN_ATTR_OPERATION_POINTWISE_DYDESC = 757, CUDNN_ATTR_OPERATION_POINTWISE_TDESC = 758, CUDNN_ATTR_OPERATION_GENSTATS_MODE = 770, CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771, CUDNN_ATTR_OPERATION_GENSTATS_XDESC = 772, CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC = 773, CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774, CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE = 780, CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC = 781, CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC = 782, CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC = 783, 
CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC = 784, CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC = 785, CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC = 786, CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC = 787, CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788, CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC = 789, CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC = 790, CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC = 791, CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC = 792, CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC = 793, CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC = 794, CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC = 795, CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC = 796, CUDNN_ATTR_OPERATIONGRAPH_HANDLE = 800, CUDNN_ATTR_OPERATIONGRAPH_OPS = 801, CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802, CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED = 803, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900, CUDNN_ATTR_TENSOR_DATA_TYPE = 901, CUDNN_ATTR_TENSOR_DIMENSIONS = 902, CUDNN_ATTR_TENSOR_STRIDES = 903, CUDNN_ATTR_TENSOR_VECTOR_COUNT = 904, CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905, CUDNN_ATTR_TENSOR_UNIQUE_ID = 906, CUDNN_ATTR_TENSOR_IS_VIRTUAL = 907, CUDNN_ATTR_TENSOR_IS_BY_VALUE = 908, CUDNN_ATTR_TENSOR_REORDERING_MODE = 909, CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC = 910, CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS = 1000, CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001, CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002, CUDNN_ATTR_VARIANT_PACK_WORKSPACE = 1003, CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100, CUDNN_ATTR_LAYOUT_INFO_TYPES = 1101, CUDNN_ATTR_KNOB_INFO_TYPE = 1200, CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201, CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202, CUDNN_ATTR_KNOB_INFO_STRIDE = 1203, CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300, CUDNN_ATTR_ENGINE_GLOBAL_INDEX = 1301, CUDNN_ATTR_ENGINE_KNOB_INFO = 1302, CUDNN_ATTR_ENGINE_NUMERICAL_NOTE = 1303, CUDNN_ATTR_ENGINE_LAYOUT_INFO = 1304, CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE = 1305, CUDNN_ATTR_ENGINE_SM_COUNT_TARGET = 1306, CUDNN_ATTR_ENGINE_DEVICEPROP = 1307, CUDNN_ATTR_MATMUL_COMP_TYPE = 1500, CUDNN_ATTR_MATMUL_PADDING_VALUE = 1501, CUDNN_ATTR_OPERATION_MATMUL_ADESC = 1520, CUDNN_ATTR_OPERATION_MATMUL_BDESC = 1521, CUDNN_ATTR_OPERATION_MATMUL_CDESC = 1522, CUDNN_ATTR_OPERATION_MATMUL_DESC = 1523, CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT = 1524, CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC = 1525, CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC = 1526, CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC = 1527, CUDNN_ATTR_REDUCTION_OPERATOR = 1600, CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601, CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610, CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611, CUDNN_ATTR_OPERATION_REDUCTION_DESC = 1612, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC = 1620, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC = 1621, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC = 1622, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC = 1623, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC = 1624, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC = 1625, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC = 1626, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC = 1627, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC = 1629, CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS = 1630, CUDNN_ATTR_RESAMPLE_MODE = 1700, CUDNN_ATTR_RESAMPLE_COMP_TYPE = 1701, CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS = 1702, CUDNN_ATTR_RESAMPLE_POST_PADDINGS = 1703,
CUDNN_ATTR_RESAMPLE_PRE_PADDINGS = 1704, CUDNN_ATTR_RESAMPLE_STRIDES = 1705, CUDNN_ATTR_RESAMPLE_WINDOW_DIMS = 1706, CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707, CUDNN_ATTR_RESAMPLE_PADDING_MODE = 1708, CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC = 1710, CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC = 1711, CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712, CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA = 1713, CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA = 1714, CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC = 1716, CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC = 1720, CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC = 1721, CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722, CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA = 1723, CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA = 1724, CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC = 1725, CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC = 1726, CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC = 1727, CUDNN_ATTR_OPERATION_CONCAT_AXIS = 1800, CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS = 1801, CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802, CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC = 1803, CUDNN_ATTR_OPERATION_SIGNAL_MODE = 1900, CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901, CUDNN_ATTR_OPERATION_SIGNAL_VALUE = 1902, CUDNN_ATTR_OPERATION_SIGNAL_XDESC = 1903, CUDNN_ATTR_OPERATION_SIGNAL_YDESC = 1904, CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_CONTAINER_DESC = 1950, CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_YDESC = 1951, CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_SEQUENCE_DESC = 1952, CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_PAGE_TABLE_DESC = 1953, CUDNN_ATTR_OPERATION_NORM_FWD_MODE = 2000, CUDNN_ATTR_OPERATION_NORM_FWD_PHASE = 2001, CUDNN_ATTR_OPERATION_NORM_FWD_XDESC = 2002, CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC = 2003, CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC = 2004, CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC = 2005, CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC = 2006, CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC = 2007, CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC = 2008, CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC = 2009, CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC = 2010, CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011, CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC = 2012, CUDNN_ATTR_OPERATION_NORM_FWD_YDESC = 2013, CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS = 2014, CUDNN_ATTR_OPERATION_NORM_BWD_MODE = 2100, CUDNN_ATTR_OPERATION_NORM_BWD_XDESC = 2101, CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC = 2102, CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103, CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC = 2104, CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC = 2105, CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC = 2106, CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC = 2107, CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC = 2108, CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC = 2109, CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS = 2110, CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200, CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201, CUDNN_ATTR_RNG_DISTRIBUTION = 2300, CUDNN_ATTR_RNG_NORMAL_DIST_MEAN = 2301, CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302, CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM = 2303, CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM = 2304, CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY = 2305, CUDNN_ATTR_OPERATION_RNG_YDESC = 2310, CUDNN_ATTR_OPERATION_RNG_SEED = 2311, CUDNN_ATTR_OPERATION_RNG_DESC = 2312, CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC = 2313, CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED = 2400, CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_XDESC = 2500, CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_YDESC = 2501, 
CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_SCALE_DESC = 2502, CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_MATH_PREC = 2503, CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_BLOCK_SIZE = 2504, CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_XDESC = 2600, CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_SCALE_DESC = 2601, CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_YDESC = 2602, CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_MATH_PREC = 2603, CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_BLOCK_SIZE = 2604, CUDNN_ATTR_DEVICEPROP_DEVICE_ID = 2700, CUDNN_ATTR_DEVICEPROP_HANDLE = 2701, CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION = 2702, } cudnnBackendAttributeName_t;
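For illustration (a sketch not taken from the cuDNN documentation; error handling is omitted and the shape, UID, and alignment values are arbitrary), the prefix convention is visible when configuring a CUDNN_BACKEND_TENSOR_DESCRIPTOR, whose attributes all carry the CUDNN_ATTR_TENSOR_ prefix:

#include <cudnn.h>

/* Sketch: describe a packed 1x8x64x64 NCHW float tensor through backend attributes. */
static cudnnBackendDescriptor_t make_tensor_descriptor(void)
{
    cudnnBackendDescriptor_t tensorDesc;
    cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &tensorDesc);

    cudnnDataType_t dtype = CUDNN_DATA_FLOAT;
    int64_t dims[]    = {1, 8, 64, 64};
    int64_t strides[] = {8 * 64 * 64, 64 * 64, 64, 1};  /* fully packed NCHW */
    int64_t uid       = 1;                              /* arbitrary unique id */
    int64_t alignment = 4;                              /* bytes */

    cudnnBackendSetAttribute(tensorDesc, CUDNN_ATTR_TENSOR_DATA_TYPE,
                             CUDNN_TYPE_DATA_TYPE, 1, &dtype);
    cudnnBackendSetAttribute(tensorDesc, CUDNN_ATTR_TENSOR_DIMENSIONS,
                             CUDNN_TYPE_INT64, 4, dims);
    cudnnBackendSetAttribute(tensorDesc, CUDNN_ATTR_TENSOR_STRIDES,
                             CUDNN_TYPE_INT64, 4, strides);
    cudnnBackendSetAttribute(tensorDesc, CUDNN_ATTR_TENSOR_UNIQUE_ID,
                             CUDNN_TYPE_INT64, 1, &uid);
    cudnnBackendSetAttribute(tensorDesc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT,
                             CUDNN_TYPE_INT64, 1, &alignment);

    cudnnBackendFinalize(tensorDesc);
    return tensorDesc;
}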
cudnnBackendAttributeType_t#
The enumeration type cudnnBackendAttributeType_t specifies the data type of an attribute of a cuDNN backend descriptor. It is used to specify the type of data pointed to by the void *arrayOfElements argument of cudnnBackendSetAttribute() and cudnnBackendGetAttribute().
typedef enum { CUDNN_TYPE_HANDLE = 0, CUDNN_TYPE_DATA_TYPE, CUDNN_TYPE_BOOLEAN, CUDNN_TYPE_INT64, CUDNN_TYPE_FLOAT, CUDNN_TYPE_DOUBLE, CUDNN_TYPE_VOID_PTR, CUDNN_TYPE_CONVOLUTION_MODE, CUDNN_TYPE_HEUR_MODE, CUDNN_TYPE_KNOB_TYPE, CUDNN_TYPE_NAN_PROPOGATION, CUDNN_TYPE_NUMERICAL_NOTE, CUDNN_TYPE_LAYOUT_TYPE, CUDNN_TYPE_ATTRIB_NAME, CUDNN_TYPE_POINTWISE_MODE, CUDNN_TYPE_BACKEND_DESCRIPTOR, CUDNN_TYPE_GENSTATS_MODE, CUDNN_TYPE_BN_FINALIZE_STATS_MODE, CUDNN_TYPE_REDUCTION_OPERATOR_TYPE, CUDNN_TYPE_BEHAVIOR_NOTE, CUDNN_TYPE_TENSOR_REORDERING_MODE, CUDNN_TYPE_RESAMPLE_MODE, CUDNN_TYPE_PADDING_MODE, CUDNN_TYPE_INT32, CUDNN_TYPE_CHAR, CUDNN_TYPE_SIGNAL_MODE, CUDNN_TYPE_FRACTION, CUDNN_TYPE_NORM_MODE, CUDNN_TYPE_NORM_FWD_PHASE, CUDNN_TYPE_RNG_DISTRIBUTION } cudnnBackendAttributeType_t;
[Table: attribute data type associated with each cudnnBackendAttributeType_t enumerator]
cudnnBackendBehaviorNote_t#
cudnnBackendBehaviorNote_t is an enumerated type that indicates queryable behavior notes of an engine. Users can query for an array of behavior notes from a CUDNN_BACKEND_ENGINE_DESCRIPTOR using the cudnnBackendGetAttribute() function.
typedef enum { CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0, CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1, CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2, CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API = 3, CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, } cudnnBackendBehaviorNote_t;
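As an illustrative sketch (assuming eng is a finalized CUDNN_BACKEND_ENGINE_DESCRIPTOR; error handling omitted), the notes are retrieved as an array and scanned for a particular behavior:

#include <cudnn.h>

/* Sketch: returns nonzero if the engine is flagged as runtime-compiled. */
static int engine_uses_runtime_compilation(cudnnBackendDescriptor_t eng)
{
    cudnnBackendBehaviorNote_t notes[CUDNN_BEHAVIOR_NOTE_TYPE_COUNT];
    int64_t noteCount = 0;
    cudnnBackendGetAttribute(eng, CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE,
                             CUDNN_TYPE_BEHAVIOR_NOTE,
                             CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, &noteCount, notes);
    for (int64_t i = 0; i < noteCount; i++) {
        if (notes[i] == CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION) {
            return 1;
        }
    }
    return 0;
}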
cudnnBackendDescriptorType_t#
cudnnBackendDescriptorType_t is an enumerated type that indicates the type of backend descriptors. Users create a backend descriptor of a particular type by passing a value from this enumerated type to the cudnnBackendCreateDescriptor() function.
typedef enum { CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0, CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR, CUDNN_BACKEND_ENGINE_DESCRIPTOR, CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR, CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR, CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR, CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR, CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR, CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR, CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR, CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR, CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR, CUDNN_BACKEND_TENSOR_DESCRIPTOR, CUDNN_BACKEND_MATMUL_DESCRIPTOR, CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR, CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR, CUDNN_BACKEND_REDUCTION_DESCRIPTOR, CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR, CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR, CUDNN_BACKEND_RESAMPLE_DESCRIPTOR, CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR, CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR, CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR, CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR, CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR, CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR, CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR, CUDNN_BACKEND_RNG_DESCRIPTOR, CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR, CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR, CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR } cudnnBackendDescriptorType_t;
cudnnBackendHeurMode_t#
cudnnBackendHeurMode_t is an enumerated type that indicates the operation mode of a CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR.
typedef enum { CUDNN_HEUR_MODE_INSTANT = 0, CUDNN_HEUR_MODE_B = 1, CUDNN_HEUR_MODE_FALLBACK = 2, CUDNN_HEUR_MODE_A = 3 } cudnnBackendHeurMode_t;
Values
CUDNN_HEUR_MODE_A and CUDNN_HEUR_MODE_INSTANT: CUDNN_HEUR_MODE_A provides the exact same functionality as CUDNN_HEUR_MODE_INSTANT. The purpose of this renaming is to better match the naming of CUDNN_HEUR_MODE_B. Consider the use of CUDNN_HEUR_MODE_INSTANT as deprecated; instead, use CUDNN_HEUR_MODE_A. CUDNN_HEUR_MODE_A utilizes a decision tree heuristic which provides optimal inference time on the CPU in comparison to CUDNN_HEUR_MODE_B. CUDNN_HEUR_MODE_A and CUDNN_HEUR_MODE_INSTANT support only a limited set of operation nodes and operation graphs; all other operation graphs are not supported.
CUDNN_HEUR_MODE_B: Can utilize the neural net based heuristics to improve generalization performance compared to CUDNN_HEUR_MODE_INSTANT. In cases where the neural net is utilized, inference time on the CPU will be increased by 10-100x compared to CUDNN_HEUR_MODE_INSTANT. These neural net heuristics are not supported for any of the following cases:
- 3-D convolutions
- Grouped convolutions (groupCount larger than 1)
- Dilated convolutions (any dilation for any spatial dimension larger than 1)
Further, the neural net is only enabled on x86 platforms when cuDNN is run on an A100 GPU. In cases where the neural net is not supported, CUDNN_HEUR_MODE_B will fall back to CUDNN_HEUR_MODE_INSTANT. CUDNN_HEUR_MODE_B will also fall back to CUDNN_HEUR_MODE_INSTANT in cases where the overhead of CUDNN_HEUR_MODE_B is projected to reduce overall network performance.
CUDNN_HEUR_MODE_FALLBACK: This heuristic mode is intended to be used for finding fallback options which provide functional support (without any expectation of providing optimal GPU performance).
cudnnBackendKnobType_t#
cudnnBackendKnobType_t is an enumerated type that indicates the type of performance knobs. Performance knobs are runtime settings to an engine that will affect its performance. Users can query for an array of performance knobs and their valid value range from a CUDNN_BACKEND_ENGINE_DESCRIPTOR using the cudnnBackendGetAttribute() function. Users can set the choice for each knob using the cudnnBackendSetAttribute() function with a CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR.
typedef enum { CUDNN_KNOB_TYPE_SPLIT_K = 0, CUDNN_KNOB_TYPE_SWIZZLE = 1, CUDNN_KNOB_TYPE_TILE_SIZE = 2, CUDNN_KNOB_TYPE_USE_TEX = 3, CUDNN_KNOB_TYPE_EDGE = 4, CUDNN_KNOB_TYPE_KBLOCK = 5, CUDNN_KNOB_TYPE_LDGA = 6, CUDNN_KNOB_TYPE_LDGB = 7, CUDNN_KNOB_TYPE_CHUNK_K = 8, CUDNN_KNOB_TYPE_SPLIT_H = 9, CUDNN_KNOB_TYPE_WINO_TILE = 10, CUDNN_KNOB_TYPE_MULTIPLY = 11, CUDNN_KNOB_TYPE_SPLIT_K_BUF = 12, CUDNN_KNOB_TYPE_TILEK = 13, CUDNN_KNOB_TYPE_STAGES = 14, CUDNN_KNOB_TYPE_REDUCTION_MODE = 15, CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE = 16, CUDNN_KNOB_TYPE_SPLIT_K_SLC = 17, CUDNN_KNOB_TYPE_IDX_MODE = 18, CUDNN_KNOB_TYPE_SLICED = 19, CUDNN_KNOB_TYPE_SPLIT_RS = 20, CUDNN_KNOB_TYPE_SINGLEBUFFER = 21, CUDNN_KNOB_TYPE_LDGC = 22, CUDNN_KNOB_TYPE_SPECFILT = 23, CUDNN_KNOB_TYPE_KERNEL_CFG = 24, CUDNN_KNOB_TYPE_WORKSPACE = 25, CUDNN_KNOB_TYPE_TILE_CGA CUDNN_DEPRECATED_ENUM = 26, CUDNN_KNOB_TYPE_TILE_CGA_M = 27, CUDNN_KNOB_TYPE_TILE_CGA_N = 28, CUDNN_KNOB_TYPE_BLOCK_SIZE = 29, CUDNN_KNOB_TYPE_OCCUPANCY = 30, CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD = 31, CUDNN_KNOB_TYPE_NUM_C_PER_BLOCK CUDNN_DEPRECATED_ENUM = 32, CUDNN_KNOB_TYPE_SPLIT_COLS = 33, CUDNN_KNOB_TYPE_TILE_ROWS = 34, CUDNN_KNOB_TYPE_TILE_COLS = 35, } cudnnBackendKnobType_t;
cudnnBackendLayoutType_t#
cudnnBackendLayoutType_t is an enumerated type that indicates queryable layout requirements of an engine. Users can query for layout requirements from a CUDNN_BACKEND_ENGINE_DESCRIPTOR using the cudnnBackendGetAttribute() function.
typedef enum { CUDNN_LAYOUT_TYPE_PREFERRED_NCHW = 0, CUDNN_LAYOUT_TYPE_PREFERRED_NHWC = 1, CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2, CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3, CUDNN_LAYOUT_TYPE_COUNT = 4, } cudnnBackendLayoutType_t;
cudnnBackendNormFwdPhase_t#
cudnnBackendNormFwdPhase_t is an enumerated type used to distinguish the inference and training phase of the normalization forward operation.
typedef enum { CUDNN_NORM_FWD_INFERENCE = 0, CUDNN_NORM_FWD_TRAINING = 1, } cudnnBackendNormFwdPhase_t;
cudnnBackendNormMode_t#
cudnnBackendNormMode_t is an enumerated type to indicate the normalization mode in the backend normalization forward and normalization backward operations.
For reference:
The definition of layer normalization can be found in the Layer Normalization paper.
The definition of instance normalization can be found in the Instance Normalization: The Missing Ingredient for Fast Stylization paper.
The definition of batch normalization can be found in the Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift paper.
The definition of root mean square normalization can be found in the Root Mean Square Layer Normalization paper.
The definition of adaptive layer normalization can be found in the Understanding and Improving Layer Normalization paper.
CUDNN_GROUP_NORM is not yet supported. If you try to use it, cuDNN returns a CUDNN_STATUS_INTERNAL_ERROR error.
typedef enum { CUDNN_LAYER_NORM = 0, CUDNN_INSTANCE_NORM = 1, CUDNN_BATCH_NORM = 2, CUDNN_GROUP_NORM = 3, CUDNN_RMS_NORM = 4, CUDNN_ADA_LAYER_NORM = 5, } cudnnBackendNormMode_t;
cudnnBackendNumericalNote_t#
cudnnBackendNumericalNote_t is an enumerated type that indicates queryable numerical properties of an engine. Users can query for an array of numerical notes from a CUDNN_BACKEND_ENGINE_DESCRIPTOR using the cudnnBackendGetAttribute() function.
typedef enum { CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0, CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS, CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION, CUDNN_NUMERICAL_NOTE_FFT, CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC, CUDNN_NUMERICAL_NOTE_WINOGRAD, CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4, CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6, CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13, CUDNN_NUMERICAL_NOTE_STRICT_NAN_PROP, CUDNN_NUMERICAL_NOTE_TYPE_COUNT, } cudnnBackendNumericalNote_t;
cudnnBackendTensorReordering_t#
cudnnBackendTensorReordering_t is an enumerated type that indicates tensor reordering as a property of the tensor descriptor. Users can get and set this property in a CUDNN_BACKEND_TENSOR_DESCRIPTOR using the cudnnBackendSetAttribute() and cudnnBackendGetAttribute() functions.
typedef enum { CUDNN_TENSOR_REORDERING_NONE = 0, CUDNN_TENSOR_REORDERING_INT8x32 = 1, CUDNN_TENSOR_REORDERING_F16x16 = 2, CUDNN_TENSOR_REORDERING_F8_128x4 = 3, } cudnnBackendTensorReordering_t;
cudnnBnFinalizeStatsMode_t#
cudnnBnFinalizeStatsMode_t is an enumerated type that exposes the different mathematical operation modes that convert batch norm statistics and the trained scale and bias to the equivalent scale and bias to be applied in the next normalization stage for inference and training use cases.
typedef enum { CUDNN_BN_FINALIZE_STATISTICS_TRAINING = 0, CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1, } cudnnBnFinalizeStatsMode_t;
cudnnConvolutionMode_t#
This enumerated type is deprecated and is currently only used by deprecated APIs. Consider using replacements for the deprecated APIs that use this enumerated type.
cudnnConvolutionMode_t is an enumerated type used by cudnnSetConvolution2dDescriptor() to configure a convolution descriptor. The filter used for the convolution can be applied in two different ways, corresponding mathematically to a convolution or to a cross-correlation. (A cross-correlation is equivalent to a convolution with its filter rotated by 180 degrees.)
Values
CUDNN_CONVOLUTION: In this mode, a convolution operation will be done when applying the filter to the images.
CUDNN_CROSS_CORRELATION: In this mode, a cross-correlation operation will be done when applying the filter to the images.
cudnnDataType_t#
cudnnDataType_t is an enumerated type indicating the data type to which a tensor descriptor or filter descriptor refers.
Values
CUDNN_DATA_FLOAT: The data is a 32-bit single-precision floating-point (float).
CUDNN_DATA_DOUBLE: The data is a 64-bit double-precision floating-point (double).
CUDNN_DATA_HALF: The data is a 16-bit floating-point.
CUDNN_DATA_INT8: The data is an 8-bit signed integer.
CUDNN_DATA_INT32: The data is a 32-bit signed integer.
CUDNN_DATA_INT8x4: The data is 32-bit elements each composed of 4 8-bit signed integers. This data type is only supported with the tensor format CUDNN_TENSOR_NCHW_VECT_C.
CUDNN_DATA_UINT8: The data is an 8-bit unsigned integer.
CUDNN_DATA_UINT8x4: The data is 32-bit elements each composed of 4 8-bit unsigned integers. This data type is only supported with the tensor format CUDNN_TENSOR_NCHW_VECT_C.
CUDNN_DATA_INT8x32: The data is 32-element vectors, each element being an 8-bit signed integer. This data type is only supported with the tensor format CUDNN_TENSOR_NCHW_VECT_C. Moreover, this data type can only be used with algo 1, meaning, CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM. For more information, refer to cudnnConvolutionFwdAlgo_t.
CUDNN_DATA_BFLOAT16: The data is a 16-bit quantity, with 7 mantissa bits, 8 exponent bits, and 1 sign bit.
CUDNN_DATA_INT64: The data is a 64-bit signed integer.
CUDNN_DATA_BOOLEAN: The data is a boolean (bool). Note that for type CUDNN_TYPE_BOOLEAN, elements are expected to be "packed": that is, one byte contains 8 elements of type CUDNN_TYPE_BOOLEAN. Further, within each byte, elements are indexed from the least significant bit to the most significant bit. For example, a 1-dimensional tensor of 8 elements containing 01001111 has value 1 for elements 0 through 3, 0 for elements 4 and 5, 1 for element 6, and 0 for element 7. Tensors with more than 8 elements simply use more bytes, where the order is also from least significant to most significant byte. Note, CUDA is little-endian, meaning that the least significant byte has the lower memory address. For example, in the case of 16 elements, 01001111 11111100 has value 1 for elements 0 through 3, 0 for elements 4 and 5, 1 for element 6, 0 for element 7, value 0 for elements 8 and 9, and 1 for elements 10 through 15. (A small indexing sketch follows this list.)
CUDNN_DATA_FP8_E4M3: The data is an 8-bit quantity, with 3 mantissa bits, 4 exponent bits, and 1 sign bit.
CUDNN_DATA_FP8_E5M2: The data is an 8-bit quantity, with 2 mantissa bits, 5 exponent bits, and 1 sign bit.
CUDNN_DATA_FAST_FLOAT_FOR_FP8: The data type is a higher throughput but lower precision compute type (compared to CUDNN_DATA_FLOAT) used for FP8 tensor core operations.
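A minimal sketch of the packing rule described for CUDNN_DATA_BOOLEAN (this helper is illustrative and not part of the cuDNN API):

#include <stdint.h>

/* Returns element i (0 or 1) of a packed boolean tensor stored in bytes:
 * bit 0 of byte 0 is element 0, bit 7 of byte 0 is element 7, and element 8
 * starts at bit 0 of byte 1, matching the least-significant-first ordering. */
static int packed_bool_element(const uint8_t *bytes, int64_t i)
{
    return (bytes[i / 8] >> (i % 8)) & 1;
}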
cudnnErrQueryMode_t#
cudnnErrQueryMode_t is an enumerated type passed to cudnnQueryRuntimeError() to select the remote kernel error query mode.
Values
CUDNN_ERRQUERY_RAWCODE: Read the error storage location regardless of the kernel completion status.
CUDNN_ERRQUERY_NONBLOCKING: Report if all tasks in the user stream of the cuDNN handle were completed. If that is the case, report the remote kernel error code.
CUDNN_ERRQUERY_BLOCKING: Wait for all tasks to complete in the user stream before reporting the remote kernel error code.
cudnnGenStatsMode_t#
cudnnGenStatsMode_t is an enumerated type to indicate the statistics mode in the backend statistics generation operation.
Values
CUDNN_GENSTATS_SUM_SQSUM: In this mode, the sum and sum of squares of the input tensor along the specified dimensions are computed and written out. The reduction dimensions currently supported are limited to per-channel; however, additional support may be added upon request.
cudnnHandle_t#
cudnnHandle_t is a pointer to an opaque structure holding the cuDNN library context. The cuDNN library context must be created using cudnnCreate() and the returned handle must be passed to all subsequent library function calls. The context should be destroyed at the end using cudnnDestroy(). The context is associated with only one GPU device, the current device at the time of the call to cudnnCreate(). However, multiple contexts can be created on the same GPU device.
cudnnMathType_t#
cudnnMathType_t is an enumerated type used to indicate if the use of Tensor Core operations is permitted in a given library routine.
Values
CUDNN_DEFAULT_MATH: Tensor Core operations are not used on pre-NVIDIA A100 GPU devices. On A100 GPU architecture devices, Tensor Core TF32 operation is permitted.
CUDNN_TENSOR_OP_MATH: The use of Tensor Core operations is permitted but will not actively perform datatype down conversion on tensors in order to utilize Tensor Cores.
CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION: The use of Tensor Core operations is permitted and will actively perform datatype down conversion on tensors in order to utilize Tensor Cores.
CUDNN_FMA_MATH: Restricted to only kernels that use FMA instructions.
On pre-NVIDIA A100 GPU devices, CUDNN_DEFAULT_MATH and CUDNN_FMA_MATH have the same behavior: Tensor Core kernels will not be selected. With NVIDIA Ampere architecture and CUDA toolkit 11, CUDNN_DEFAULT_MATH permits TF32 Tensor Core operation and CUDNN_FMA_MATH does not. The TF32 behavior for CUDNN_DEFAULT_MATH and the other Tensor Core math types can be explicitly disabled by the environment variable NVIDIA_TF32_OVERRIDE=0.
cudnnNanPropagation_t#
This enumerated type is deprecated and is currently only used by deprecated APIs. Consider using replacements for the deprecated APIs that use this enumerated type.
cudnnNanPropagation_t is an enumerated type used to indicate if a given routine should propagate NaN numbers. This enumerated type is used as a field for the cudnnActivationDescriptor_t descriptor and cudnnPoolingDescriptor_t descriptor.
Values
CUDNN_NOT_PROPAGATE_NAN: NaN numbers are not propagated.
CUDNN_PROPAGATE_NAN: NaN numbers are propagated.
cudnnCTCGradMode_t#
Enumerated type used by cudnnSetCTCLossDescriptor_v9 and cudnnGetCTCLossDescriptor_v9 to indicate the behavior for out of boundary (OOB) samples. OOB samples are samples where L+R > T is encountered during the gradient calculation.
If ctcGradMode is set to CUDNN_CTC_SKIP_OOB_GRADIENTS, then the CTC loss function does not write to the gradient buffer for that sample. Instead, the current values, even if not finite, are retained.
If ctcGradMode is set to CUDNN_CTC_ZERO_OOB_GRADIENTS, then the gradient for that sample is set to zero. This guarantees a finite gradient.
cudnnPaddingMode_t#
cudnnPaddingMode_t is an enumerated type to indicate the padding mode in the backend resample operations.
typedef enum { CUDNN_ZERO_PAD = 0, CUDNN_NEG_INF_PAD = 1, CUDNN_EDGE_VAL_PAD = 2, } cudnnPaddingMode_t;
cudnnPointwiseMode_t#
cudnnPointwiseMode_t is an enumerated type to indicate the intended pointwise math operation in the backend pointwise operation descriptor.
Values
CUDNN_POINTWISE_ADD: A pointwise addition between two tensors is computed.
CUDNN_POINTWISE_ADD_SQUARE: A pointwise addition between the first tensor and the square of the second tensor is computed.
CUDNN_POINTWISE_DIV: A pointwise true division of the first tensor by the second tensor is computed.
CUDNN_POINTWISE_MAX: A pointwise maximum is taken between two tensors.
CUDNN_POINTWISE_MIN: A pointwise minimum is taken between two tensors.
CUDNN_POINTWISE_MOD: A pointwise floating-point remainder of the first tensor's division by the second tensor is computed.
CUDNN_POINTWISE_MUL: A pointwise multiplication between two tensors is computed.
CUDNN_POINTWISE_POW: A pointwise value of the first tensor to the power of the second tensor is computed.
CUDNN_POINTWISE_SUB: A pointwise subtraction between two tensors is computed.
CUDNN_POINTWISE_ABS: A pointwise absolute value of the input tensor is computed.
CUDNN_POINTWISE_CEIL: A pointwise ceiling of the input tensor is computed.
CUDNN_POINTWISE_COS: A pointwise trigonometric cosine of the input tensor is computed.
CUDNN_POINTWISE_EXP: A pointwise exponential of the input tensor is computed.
CUDNN_POINTWISE_FLOOR: A pointwise floor of the input tensor is computed.
CUDNN_POINTWISE_LOG: A pointwise natural logarithm of the input tensor is computed.
CUDNN_POINTWISE_NEG: A pointwise numerical negative of the input tensor is computed.
CUDNN_POINTWISE_RSQRT: A pointwise reciprocal of the square root of the input tensor is computed.
CUDNN_POINTWISE_SIN: A pointwise trigonometric sine of the input tensor is computed.
CUDNN_POINTWISE_SQRT: A pointwise square root of the input tensor is computed.
CUDNN_POINTWISE_TAN: A pointwise trigonometric tangent of the input tensor is computed.
CUDNN_POINTWISE_ERF: A pointwise Error Function is computed.
CUDNN_POINTWISE_IDENTITY: No computation is performed. As with other pointwise modes, this mode provides implicit conversions by specifying the data type of the input tensor as one type, and the data type of the output tensor as another.
CUDNN_POINTWISE_RELU_FWD: A pointwise rectified linear activation function of the input tensor is computed.
CUDNN_POINTWISE_TANH_FWD: A pointwise tanh activation function of the input tensor is computed.
CUDNN_POINTWISE_SIGMOID_FWD: A pointwise sigmoid activation function of the input tensor is computed.
CUDNN_POINTWISE_ELU_FWD: A pointwise Exponential Linear Unit activation function of the input tensor is computed.
CUDNN_POINTWISE_GELU_FWD: A pointwise Gaussian Error Linear Unit activation function of the input tensor is computed.
CUDNN_POINTWISE_SOFTPLUS_FWD: A pointwise softplus activation function of the input tensor is computed.
CUDNN_POINTWISE_SWISH_FWD: A pointwise swish activation function of the input tensor is computed.
CUDNN_POINTWISE_GELU_APPROX_TANH_FWD: A pointwise tanh approximation of the Gaussian Error Linear Unit activation function of the input tensor is computed. The tanh GELU approximation is computed as \(0.5x\left( 1+\tanh\left[ \sqrt{2/\pi}\left( x+0.044715x^{3} \right) \right] \right)\). For more information, refer to the GAUSSIAN ERROR LINEAR UNIT (GELUS) paper.
CUDNN_POINTWISE_RELU_BWD: A pointwise first derivative of rectified linear activation of the input tensor is computed.
CUDNN_POINTWISE_TANH_BWD: A pointwise first derivative of tanh activation of the input tensor is computed.
CUDNN_POINTWISE_SIGMOID_BWD: A pointwise first derivative of sigmoid activation of the input tensor is computed.
CUDNN_POINTWISE_ELU_BWD: A pointwise first derivative of Exponential Linear Unit activation of the input tensor is computed.
CUDNN_POINTWISE_GELU_BWD: A pointwise first derivative of Gaussian Error Linear Unit activation of the input tensor is computed.
CUDNN_POINTWISE_SOFTPLUS_BWD: A pointwise first derivative of softplus activation of the input tensor is computed.
CUDNN_POINTWISE_SWISH_BWD: A pointwise first derivative of swish activation of the input tensor is computed.
CUDNN_POINTWISE_GELU_APPROX_TANH_BWD: A pointwise first derivative of the tanh approximation of the Gaussian Error Linear Unit activation of the input tensor is computed. This is computed as \(0.5\left( 1+\tanh\left( b\left( x+cx^{3} \right) \right)+bx\,\mathrm{sech}^{2}\left( b\left( cx^{3}+x \right) \right)\left( 3cx^{2}+1 \right) \right)dy\) where \(b\) is \(\sqrt{2/\pi}\) and \(c\) is \(0.044715\).
CUDNN_POINTWISE_CMP_EQ: A pointwise truth value of the first tensor equal to the second tensor is computed.
CUDNN_POINTWISE_CMP_NEQ: A pointwise truth value of the first tensor not equal to the second tensor is computed.
CUDNN_POINTWISE_CMP_GT: A pointwise truth value of the first tensor greater than the second tensor is computed.
CUDNN_POINTWISE_CMP_GE: A pointwise truth value of the first tensor greater than or equal to the second tensor is computed.
CUDNN_POINTWISE_CMP_LT: A pointwise truth value of the first tensor less than the second tensor is computed.
CUDNN_POINTWISE_CMP_LE: A pointwise truth value of the first tensor less than or equal to the second tensor is computed.
CUDNN_POINTWISE_LOGICAL_AND: A pointwise truth value of the first tensor logical AND second tensor is computed.
CUDNN_POINTWISE_LOGICAL_OR: A pointwise truth value of the first tensor logical OR second tensor is computed.
CUDNN_POINTWISE_LOGICAL_NOT: A pointwise truth value of the input tensor's logical NOT is computed.
CUDNN_POINTWISE_GEN_INDEX: A pointwise index value of the input tensor is generated along a given axis.
CUDNN_POINTWISE_BINARY_SELECT: A pointwise value is selected amongst two input tensors based on a given predicate tensor.
CUDNN_POINTWISE_RECIPROCAL: A pointwise reciprocal of the input tensor is computed. In other words, for every element x in the input tensor, 1/x is computed.
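As an illustrative sketch connecting these modes to the backend descriptors described earlier (error handling omitted; the attribute names are those listed under cudnnBackendAttributeName_t):

#include <cudnn.h>

/* Sketch: a pointwise descriptor that performs an FP32 addition. */
static cudnnBackendDescriptor_t make_add_pointwise_descriptor(void)
{
    cudnnBackendDescriptor_t pwDesc;
    cudnnBackendCreateDescriptor(CUDNN_BACKEND_POINTWISE_DESCRIPTOR, &pwDesc);

    cudnnPointwiseMode_t mode = CUDNN_POINTWISE_ADD;
    cudnnDataType_t prec = CUDNN_DATA_FLOAT;
    cudnnBackendSetAttribute(pwDesc, CUDNN_ATTR_POINTWISE_MODE,
                             CUDNN_TYPE_POINTWISE_MODE, 1, &mode);
    cudnnBackendSetAttribute(pwDesc, CUDNN_ATTR_POINTWISE_MATH_PREC,
                             CUDNN_TYPE_DATA_TYPE, 1, &prec);

    cudnnBackendFinalize(pwDesc);
    /* pwDesc is then attached to a CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR
     * through CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR. */
    return pwDesc;
}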
cudnnReduceTensorOp_t#
This enumerated type is deprecated and is currently only used by deprecated APIs. Consider using replacements for the deprecated APIs that use this enumerated type.
cudnnReduceTensorOp_t is an enumerated type used to indicate the Tensor Core operation to be used by the cudnnReduceTensor() routine. This enumerated type is used as a field for the cudnnReduceTensorDescriptor_t descriptor.
Values
CUDNN_REDUCE_TENSOR_ADD: The operation to be performed is addition.
CUDNN_REDUCE_TENSOR_MUL: The operation to be performed is multiplication.
CUDNN_REDUCE_TENSOR_MIN: The operation to be performed is a minimum comparison.
CUDNN_REDUCE_TENSOR_MAX: The operation to be performed is a maximum comparison.
CUDNN_REDUCE_TENSOR_AMAX: The operation to be performed is a maximum comparison of absolute values.
CUDNN_REDUCE_TENSOR_AVG: The operation to be performed is averaging.
CUDNN_REDUCE_TENSOR_NORM1: The operation to be performed is addition of absolute values.
CUDNN_REDUCE_TENSOR_NORM2: The operation to be performed is a square root of the sum of squares.
CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS: The operation to be performed is multiplication, not including elements of value zero.
cudnnReorderType_t#
This enumerated type is deprecated and is currently only used by deprecated APIs. Consider using replacements for the deprecated APIs that use this enumerated type.
cudnnReorderType_t is an enumerated type to set the convolution reordering type. The reordering type can be set by cudnnSetConvolutionReorderType() and its status can be read by cudnnGetConvolutionReorderType().
typedef enum { CUDNN_DEFAULT_REORDER = 0, CUDNN_NO_REORDER = 1, } cudnnReorderType_t;
cudnnResampleMode_t#
cudnnResampleMode_t is an enumerated type to indicate the resample mode in the backend resample operations.
typedef enum { CUDNN_RESAMPLE_NEAREST = 0, CUDNN_RESAMPLE_BILINEAR = 1, CUDNN_RESAMPLE_AVGPOOL = 2, CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2, CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4, CUDNN_RESAMPLE_MAXPOOL = 3, } cudnnResampleMode_t;
cudnnRngDistribution_t#
cudnnRngDistribution_t is an enumerated type to indicate the distribution to be used in the backend Rng (random number generator) operation.
typedef enum { CUDNN_RNG_DISTRIBUTION_BERNOULLI, CUDNN_RNG_DISTRIBUTION_UNIFORM, CUDNN_RNG_DISTRIBUTION_NORMAL, } cudnnRngDistribution_t;
Values
CUDNN_RNG_DISTRIBUTION_BERNOULLI: The Bernoulli distribution is used for the random number generation. The attribute CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY can be used to specify the probability of generating 1's.
CUDNN_RNG_DISTRIBUTION_UNIFORM: The uniform distribution is used for the random number generation. The attributes CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM and CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM can be used to specify the minimum and maximum value between which the random numbers should be uniformly generated.
CUDNN_RNG_DISTRIBUTION_NORMAL: The normal distribution is used for the random number generation. The attributes CUDNN_ATTR_RNG_NORMAL_DIST_MEAN and CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION can be used to specify the mean and standard deviation of the random number generator.
cudnnSeverity_t#
cudnnSeverity_t is an enumerated type passed to the customized callback function for logging that users may set. This enumerated type describes the severity level of the item, so the customized logging callback may react differently. The numerical values are the same as those used for setting the CUDNN_LOGLEVEL_DBG environment variable.
Values
CUDNN_SEV_FATAL = 0: This value indicates a fatal error emitted by cuDNN.
CUDNN_SEV_ERROR = 1: This value indicates a normal error emitted by cuDNN.
CUDNN_SEV_WARNING = 2: This value indicates a warning emitted by cuDNN.
CUDNN_SEV_INFO = 3: This value indicates a piece of information (for example, API log) emitted by cuDNN.
cudnnSignalMode_t#
cudnnSignalMode_t is an enumerated type to indicate the signaling mode in the backend signal operation.
typedef enum { CUDNN_SIGNAL_SET = 0, CUDNN_SIGNAL_WAIT = 1, } cudnnSignalMode_t;
Values
CUDNN_SIGNAL_SET: The flag variable is updated with the provided signal value atomically.
CUDNN_SIGNAL_WAIT: The operation blocks until the flag variable compares equal to the provided signal value.
cudnnStatus_t#
cudnnStatus_t is an enumerated type used for function status returns. All cuDNN library functions return their status, which can be one of the following values:
Values
CUDNN_STATUS_SUCCESS: The operation was completed successfully.
CUDNN_STATUS_NOT_INITIALIZED: The cuDNN library was not initialized properly. This error is usually returned when a call to cudnnCreate() fails or when cudnnCreate() has not been called prior to calling another cuDNN routine. In the former case, it is usually due to an error in the CUDA Runtime API called by cudnnCreate() or by an error in the hardware setup.
CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH: Some cuDNN sub-libraries have different versions, indicative of an installation issue.
CUDNN_STATUS_SERIALIZATION_VERSION_MISMATCH: The schema used for serialization is not what the current cuDNN library expects, thus the serialized artifact is no longer valid and needs to be re-serialized.
CUDNN_STATUS_DEPRECATED: This error code may be reported in the warning logging level as a reminder that some functionality is under deprecation and will be removed in the next major version update.
CUDNN_STATUS_LICENSE_ERROR: The functionality requested requires some license and an error was detected when trying to check the current licensing. This error can happen if the license is not present or is expired, or if the environment variable NVIDIA_LICENSE_FILE is not set properly.
CUDNN_STATUS_RUNTIME_IN_PROGRESS: Some tasks in the user stream are not completed.
CUDNN_STATUS_RUNTIME_FP_OVERFLOW: Numerical overflow occurred during the GPU kernel execution.
CUDNN_STATUS_BAD_PARAM: This is an error category code. An incorrect value or parameter was passed to the function.
CUDNN_STATUS_BAD_PARAM_NULL_POINTER: The cuDNN API has unexpectedly received a null pointer from the user.
CUDNN_STATUS_BAD_PARAM_MISALIGNED_POINTER: The cuDNN API has received a misaligned pointer from the user.
CUDNN_STATUS_BAD_PARAM_NOT_FINALIZED: The backend descriptor has not been finalized.
CUDNN_STATUS_BAD_PARAM_OUT_OF_BOUND: The cuDNN API has received an out-of-bound value.
CUDNN_STATUS_BAD_PARAM_SIZE_INSUFFICIENT: The cuDNN API has received a memory buffer with insufficient space.
CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCH: The cuDNN API has received an unexpected stream.
CUDNN_STATUS_BAD_PARAM_SHAPE_MISMATCH: The cuDNN API has received inconsistent tensor shapes.
CUDNN_STATUS_BAD_PARAM_DUPLICATED_ENTRIES: The cuDNN API has received duplicated entries.
CUDNN_STATUS_BAD_PARAM_ATTRIBUTE_TYPE: The cuDNN API has received an invalid or unsupported attribute type.
CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCH: The cuDNN API has received an unexpected CUDA graph.
CUDNN_STATUS_NOT_SUPPORTED: This is an error category code. The functionality requested is not currently supported by cuDNN.
CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN: cuDNN does not currently support such an operation graph pattern.
CUDNN_STATUS_NOT_SUPPORTED_SHAPE: cuDNN does not currently support the tensor shapes used in some specific operation or graph pattern.
CUDNN_STATUS_NOT_SUPPORTED_DATA_TYPE: cuDNN does not currently support the tensor data type.
CUDNN_STATUS_NOT_SUPPORTED_LAYOUT: cuDNN does not currently support the tensor layout.
CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDA_DRIVER: The requested functionality is not compatible with the current CUDA driver.
CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART: The requested functionality is not compatible with the current CUDA runtime.
CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH: The function requires a feature absent from the current GPU device.
CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING: A runtime library required by cuDNN cannot be found in the predefined search paths. These libraries are libcuda.so (nvcuda.dll) and libnvrtc.so (nvrtc64_<Major Release Version><Minor Release Version>_0.dll and nvrtc-builtins64_<Major Release Version><Minor Release Version>.dll).
CUDNN_STATUS_NOT_SUPPORTED_SUBLIBRARY_UNAVAILABLE: The requested functionality is not available due to a missing sub-library.
CUDNN_STATUS_NOT_SUPPORTED_SHARED_MEMORY_INSUFFICIENT: The requested functionality is not available due to insufficient shared memory on the GPU.
CUDNN_STATUS_NOT_SUPPORTED_PADDING: The requested functionality is not available due to padding requirements.
CUDNN_STATUS_NOT_SUPPORTED_BAD_LAUNCH_PARAM: The requested functionality is not available because it would lead to invalid kernel launch parameters.
CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API: The requested functionality is not available because this particular engine does not support the native CUDA graph API. (The engines that do support that API have the behavior note CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API.)
CUDNN_STATUS_INTERNAL_ERROR: This is an error category code. An internal cuDNN operation failed.
CUDNN_STATUS_INTERNAL_ERROR_COMPILATION_FAILED: A runtime kernel has failed to be compiled.
CUDNN_STATUS_INTERNAL_ERROR_UNEXPECTED_VALUE: An unexpected internal inconsistency has been detected.
CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED: An internal host memory allocation failed inside the cuDNN library.
CUDNN_STATUS_INTERNAL_ERROR_DEVICE_ALLOCATION_FAILED: Resource allocation failed inside the cuDNN library.
CUDNN_STATUS_INTERNAL_ERROR_BAD_LAUNCH_PARAM: Invalid kernel launch parameters were unexpectedly detected.
CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED: Access to GPU memory space failed, which is usually caused by a failure to bind a texture. To correct, prior to the function call, unbind any previously bound textures. Otherwise, this may indicate an internal error/bug in the library.
CUDNN_STATUS_EXECUTION_FAILED: This is an error category code. The GPU program failed to execute. This is usually caused by a failure to launch a kernel on the GPU, which can be caused by another library that cuDNN depends on.
CUDNN_STATUS_EXECUTION_FAILED_CUDA_DRIVER: The GPU program failed to execute due to an error reported by the CUDA driver.
CUDNN_STATUS_EXECUTION_FAILED_CUBLAS: The GPU program failed to execute due to an error reported by cuBLAS.
CUDNN_STATUS_EXECUTION_FAILED_CUDART: The GPU program failed to execute due to an error reported by the CUDA runtime.
CUDNN_STATUS_EXECUTION_FAILED_CURAND: The GPU program failed to execute due to an error reported by cuRAND.
Additionally, the following macros can be used on cudnnStatus_t error codes.
CUDNN_STATUS_CATEGORY(full_error_code): Extracts the category error code from a cudnnStatus_t error code full_error_code. This is useful for checking whether an error belongs to a certain category. For example, CUDNN_STATUS_CATEGORY(CUDNN_STATUS_BAD_PARAM_NULL_POINTER) will output CUDNN_STATUS_BAD_PARAM.
CUDNN_STATUS_SPECIFIC_ERROR(full_error_code): Extracts the specific error code from a cudnnStatus_t error code full_error_code, that is, removing the category.
CUDNN_STATUS_FULL_ERROR_CODE(category, specific_err): Recombines a category error code and a specific code in the same category into a full cudnnStatus_t error code, such that CUDNN_STATUS_FULL_ERROR_CODE(CUDNN_STATUS_CATEGORY(e), CUDNN_STATUS_SPECIFIC_ERROR(e)) == e, for any valid cudnnStatus_t error code e.
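For example, a caller can branch on the error category rather than on each specific code. A hedged sketch (status is whatever a cuDNN call returned):

#include <stdio.h>
#include <cudnn.h>

/* Sketch: treat every CUDNN_STATUS_BAD_PARAM_* code uniformly. */
static void report_bad_param(cudnnStatus_t status)
{
    if (CUDNN_STATUS_CATEGORY(status) == CUDNN_STATUS_BAD_PARAM) {
        fprintf(stderr, "bad parameter, specific code %d\n",
                (int)CUDNN_STATUS_SPECIFIC_ERROR(status));
    }
}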
cudnnTensorFormat_t#
cudnnTensorFormat_t is an enumerated type used by cudnnSetTensor4dDescriptor() to create a tensor with a pre-defined layout. For a detailed explanation of how these tensors are arranged in memory, refer to Data Layout Formats.
Values
CUDNN_TENSOR_NCHW: This tensor format specifies that the data is laid out in the following order: batch size, feature maps, rows, columns. The strides are implicitly defined in such a way that the data are contiguous in memory with no padding between images, feature maps, rows, and columns; the columns are the inner dimension and the images are the outermost dimension.
CUDNN_TENSOR_NHWC: This tensor format specifies that the data is laid out in the following order: batch size, rows, columns, feature maps. The strides are implicitly defined in such a way that the data are contiguous in memory with no padding between images, rows, columns, and feature maps; the feature maps are the inner dimension and the images are the outermost dimension.
CUDNN_TENSOR_NCHW_VECT_C: This tensor format specifies that the data is laid out in the following order: batch size, feature maps, rows, columns. However, each element of the tensor is a vector of multiple feature maps. The length of the vector is carried by the data type of the tensor. The strides are implicitly defined in such a way that the data are contiguous in memory with no padding between images, feature maps, rows, and columns; the columns are the inner dimension and the images are the outermost dimension. This format is only supported with tensor data types CUDNN_DATA_INT8x4, CUDNN_DATA_INT8x32, and CUDNN_DATA_UINT8x4. The CUDNN_TENSOR_NCHW_VECT_C format can also be interpreted in the following way: the NCHW INT8x32 format is really N x (C/32) x H x W x 32 (32 Cs for every W), just as the NCHW INT8x4 format is N x (C/4) x H x W x 4 (4 Cs for every W). Hence the VECT_C name: each W is a vector (4 or 32) of Cs.
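A minimal sketch of the resulting element addressing (an illustrative helper, not part of the cuDNN API), where vect is 4 for INT8x4/UINT8x4 and 32 for INT8x32:

#include <stdint.h>

/* Offset (in elements) of scalar element (n, c, h, w) in an NCHW_VECT_C tensor:
 * the layout is N x (C/vect) x H x W x vect, with C assumed divisible by vect. */
static int64_t nchw_vect_c_offset(int64_t n, int64_t c, int64_t h, int64_t w,
                                  int64_t C, int64_t H, int64_t W, int64_t vect)
{
    int64_t cOuter = c / vect;   /* which channel block */
    int64_t cInner = c % vect;   /* position inside the per-element vector */
    return (((n * (C / vect) + cOuter) * H + h) * W + w) * vect + cInner;
}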
Uncategorized#
These are the uncategorized references found in the cudnn_graph library.
cudnnBackendDescriptor_t#
cudnnBackendDescriptor_t is a typedef void pointer to one of many opaque descriptor structures. The type of structure that it points to is determined by the argument when allocating the memory for the opaque structure using cudnnBackendCreateDescriptor().
Attributes of a descriptor can be set using cudnnBackendSetAttribute(). After all required attributes of a descriptor are set, the descriptor can be finalized by cudnnBackendFinalize(). From a finalized descriptor, one can query its queryable attributes using cudnnBackendGetAttribute(). Finally, the memory allocated for a descriptor can be freed using cudnnBackendDestroyDescriptor().
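The following sketch (error handling omitted; the attribute values are illustrative) walks that lifecycle for a 2D convolution descriptor: create, set attributes, finalize, query an attribute back, and destroy.

#include <cudnn.h>

static void convolution_descriptor_lifecycle(void)
{
    /* 1. Create. */
    cudnnBackendDescriptor_t convDesc;
    cudnnBackendCreateDescriptor(CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR, &convDesc);

    /* 2. Set attributes: a 3x3 cross-correlation with unit stride and no dilation. */
    int64_t spatialDims = 2;
    cudnnDataType_t compType = CUDNN_DATA_FLOAT;
    cudnnConvolutionMode_t convMode = CUDNN_CROSS_CORRELATION;
    int64_t pad[]      = {1, 1};
    int64_t stride[]   = {1, 1};
    int64_t dilation[] = {1, 1};

    cudnnBackendSetAttribute(convDesc, CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS,
                             CUDNN_TYPE_INT64, 1, &spatialDims);
    cudnnBackendSetAttribute(convDesc, CUDNN_ATTR_CONVOLUTION_COMP_TYPE,
                             CUDNN_TYPE_DATA_TYPE, 1, &compType);
    cudnnBackendSetAttribute(convDesc, CUDNN_ATTR_CONVOLUTION_CONV_MODE,
                             CUDNN_TYPE_CONVOLUTION_MODE, 1, &convMode);
    cudnnBackendSetAttribute(convDesc, CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS,
                             CUDNN_TYPE_INT64, 2, pad);
    cudnnBackendSetAttribute(convDesc, CUDNN_ATTR_CONVOLUTION_POST_PADDINGS,
                             CUDNN_TYPE_INT64, 2, pad);
    cudnnBackendSetAttribute(convDesc, CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES,
                             CUDNN_TYPE_INT64, 2, stride);
    cudnnBackendSetAttribute(convDesc, CUDNN_ATTR_CONVOLUTION_DILATIONS,
                             CUDNN_TYPE_INT64, 2, dilation);

    /* 3. Finalize; setting attributes is no longer allowed after this point. */
    cudnnBackendFinalize(convDesc);

    /* 4. Query an attribute back from the finalized descriptor. */
    int64_t queriedDims = 0, count = 0;
    cudnnBackendGetAttribute(convDesc, CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS,
                             CUDNN_TYPE_INT64, 1, &count, &queriedDims);

    /* 5. Destroy. */
    cudnnBackendDestroyDescriptor(convDesc);
}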
API Functions#
These are the API functions in the cudnn_graph library.
cudnnBackendPopulateCudaGraph()#
This method, part of the new Native CUDA Graph API, directly builds a CUDA graph (not to be confused with a cuDNN graph) representing the given engine. When the caller instantiates and executes this CUDA graph, the graph will execute the engine configuration plan on the VariantPack and the finalized ExecutionPlan on the data.
The resulting CUDA graph captures the pointers (data and working space) in the VariantPack at the time this API is called, but it can be run arbitrarily many times with different data at the same pointers. The graph can also later be modified in place with different VariantPack pointers by using cudnnBackendUpdateCudaGraph().
The initial CUDA graph passed in to this API must be empty (having no nodes), and the caller should not append additional nodes to the resulting graph. However, the graph can be embedded as a child node of a larger CUDA graph, for example, by cudaGraphAddChildGraphNode. (This is typical usage.)
Only a limited number of engines currently support this API (with more to be added in future releases of cuDNN). Those supporting it have the behavior note CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API.
Note
This API is only supported in versions of cuDNN compiled against CUDA runtime 12.x and above.
cudnnStatus_t cudnnBackendPopulateCudaGraph(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack, cudaGraph_t cudaGraph)
Parameters
executionPlan: Input. Pointer to the finalized ExecutionPlan.
variantPack: Input. Pointer to the finalized VariantPack consisting of:
- Data pointer for each non-virtual pointer of the operation set in the execution plan.
- Pointer to user-allocated workspace in global memory at least as large as the size queried from CUDNN_BACKEND_.
cudaGraph: Input/Output. A CUDA graph handle, representing an already created, empty CUDA graph to be populated by the API.
Returns
CUDNN_STATUS_SUCCESS: The CUDA graph was generated successfully.
CUDNN_STATUS_BAD_PARAM: An incorrect or inconsistent value is encountered. For example, a required data pointer is invalid, or the given cudaGraph isn't initially empty.
CUDNN_STATUS_INTERNAL_ERROR: Some internal errors were encountered.
CUDNN_STATUS_EXECUTION_FAILED: An error was encountered creating a CUDA graph for the plan with the variant pack.
CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API: This particular engine does not support the native CUDA graph API.
CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART: This cuDNN was built against a CUDA runtime earlier than 12.0 and doesn't support the native CUDA graph API.
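A hedged sketch of typical usage, assuming handle, plan, varPack, and stream already exist and that CUDA runtime 12.x is in use; error handling is omitted:

#include <cuda_runtime.h>
#include <cudnn.h>

/* Sketch: build, instantiate, and launch a CUDA graph for a finalized plan. */
static void run_plan_via_cuda_graph(cudnnHandle_t handle,
                                    cudnnBackendDescriptor_t plan,
                                    cudnnBackendDescriptor_t varPack,
                                    cudaStream_t stream)
{
    cudaGraph_t graph;
    cudaGraphCreate(&graph, 0);                       /* must start out empty */
    cudnnBackendPopulateCudaGraph(handle, plan, varPack, graph);

    cudaGraphExec_t graphExec;
    cudaGraphInstantiateWithFlags(&graphExec, graph, 0);
    cudaGraphLaunch(graphExec, stream);               /* may be launched repeatedly */

    /* To re-point at different buffers later, cudnnBackendUpdateCudaGraph()
     * can modify `graph` in place, as described above. */

    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
}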
cudnnBackendCreateDescriptor()#
This function allocates memory for a backend descriptor of the given descriptor type and stores it at the location pointed to by descriptor.
cudnnStatus_t cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor)
Note
The cudnnBackendDescriptor_t is a pointer to void *.
Parameters
descriptorType: Input. One among the enumerated cudnnBackendDescriptorType_t.
descriptor: Input. Pointer to an instance of cudnnBackendDescriptor_t to be created.
Returns
CUDNN_STATUS_SUCCESS: The creation was successful.
CUDNN_STATUS_NOT_SUPPORTED: Creating a descriptor of a given type is not supported.
CUDNN_STATUS_ALLOC_FAILED: The memory allocation failed.
cudnnBackendDestroyDescriptor()#
This function destroys instances of cudnnBackendDescriptor_t that were previously created using cudnnBackendCreateDescriptor().
cudnnStatus_t cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor)
Parameters
descriptor: Input. Instance of cudnnBackendDescriptor_t previously created by cudnnBackendCreateDescriptor().
Returns
CUDNN_STATUS_SUCCESS: The memory was destroyed successfully.
CUDNN_STATUS_ALLOC_FAILED: The destruction of memory failed.
Undefined Behavior: The descriptor was altered between the Create and Destroy Descriptor calls.
Undefined: The value pointed to by the descriptor will be Undefined after the memory is freed.
cudnnBackendExecute()#
This function executes the given Engine Configuration Plan on the VariantPack and the finalized ExecutionPlan on the data. The data and the working space are encapsulated in the VariantPack.
cudnnStatus_t cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack)
Parameters
executionPlan: Input. Pointer to the finalized ExecutionPlan.
variantPack: Input. Pointer to the finalized VariantPack consisting of:
- Data pointer for each non-virtual pointer of the operation set in the execution plan.
- Pointer to user-allocated workspace in global memory at least as large as the size queried from CUDNN_BACKEND_.
Returns
CUDNN_STATUS_SUCCESS: The ExecutionPlan was executed successfully.
CUDNN_STATUS_BAD_PARAM: An incorrect or inconsistent value is encountered. For example, a required data pointer is invalid.
CUDNN_STATUS_INTERNAL_ERROR: Some internal errors were encountered.
CUDNN_STATUS_EXECUTION_FAILED: An error was encountered executing the plan with the variant pack.
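A hedged sketch of preparing a VariantPack and executing a finalized plan (the device buffers and their UIDs must match the tensors of the operation graph; names here are illustrative and error handling is omitted):

#include <cuda_runtime.h>
#include <cudnn.h>

/* Sketch: bind data pointers and workspace, then execute the plan. */
static void execute_plan(cudnnHandle_t handle, cudnnBackendDescriptor_t plan,
                         void *xDev, void *wDev, void *yDev)
{
    /* Query how much workspace the plan needs. */
    int64_t workspaceSize = 0, count = 0;
    cudnnBackendGetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE,
                             CUDNN_TYPE_INT64, 1, &count, &workspaceSize);
    void *workspace = NULL;
    cudaMalloc(&workspace, (size_t)workspaceSize);

    /* Bind device pointers to the tensor UIDs used in the operation graph. */
    int64_t uids[] = {1, 2, 3};              /* illustrative UIDs */
    void *devPtrs[] = {xDev, wDev, yDev};

    cudnnBackendDescriptor_t varPack;
    cudnnBackendCreateDescriptor(CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR, &varPack);
    cudnnBackendSetAttribute(varPack, CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS,
                             CUDNN_TYPE_INT64, 3, uids);
    cudnnBackendSetAttribute(varPack, CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS,
                             CUDNN_TYPE_VOID_PTR, 3, devPtrs);
    cudnnBackendSetAttribute(varPack, CUDNN_ATTR_VARIANT_PACK_WORKSPACE,
                             CUDNN_TYPE_VOID_PTR, 1, &workspace);
    cudnnBackendFinalize(varPack);

    cudnnBackendExecute(handle, plan, varPack);

    cudnnBackendDestroyDescriptor(varPack);
    cudaFree(workspace);
}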
cudnnBackendFinalize()#
This function finalizes the memory pointed to by the descriptor. The finalization performed depends on the descriptorType argument with which the descriptor was created using cudnnBackendCreateDescriptor() or initialized using cudnnBackendInitialize().
cudnnStatus_t cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor)
cudnnBackendFinalize() also checks all the attributes set between the create/initialization and finalize phase. If successful, cudnnBackendFinalize() returns CUDNN_STATUS_SUCCESS and the finalized state of the descriptor is set to true. In this state, setting attributes using cudnnBackendSetAttribute() is not allowed. Getting attributes using cudnnBackendGetAttribute() is only allowed when the finalized state of the descriptor is true.
Parameters
descriptorInput. Instance of cudnnBackendDescriptor_t to finalize.
Returns
CUDNN_STATUS_SUCCESSThe
descriptorwas finalized successfully.CUDNN_STATUS_BAD_PARAMInvalid
descriptorattribute values or combination thereof is encountered.CUDNN_STATUS_NOT_SUPPORTEDDescriptor attribute values or combinations thereof not supported by the current version of cuDNN are encountered.
CUDNN_STATUS_INTERNAL_ERRORSome internal errors are encountered.
cudnnBackendGetAttribute()#
This function retrieves the values of an attribute of a descriptor. attributeName is the name of the attribute whose value is requested. attributeType is the type of attribute. requestedElementCount is the number of elements to be potentially retrieved. The number of elements for the requested attribute is stored in elementCount. The retrieved values are stored in arrayOfElements. When the attribute is expected to have a single value, arrayOfElements can be a pointer to the output value. This function will return CUDNN_STATUS_NOT_INITIALIZED if the descriptor has not been successfully finalized using cudnnBackendFinalize().
cudnnStatus_t cudnnBackendGetAttribute( cudnnBackendDescriptor_t descriptor, cudnnBackendAttributeName_t attributeName, cudnnBackendAttributeType_t attributeType, int64_t requestedElementCount, int64_t *elementCount, void *arrayOfElements);
Parameters
descriptorInput. Instance of cudnnBackendDescriptor_t whose attribute the user wants to retrieve.
attributeNameInput. The name of the attribute to retrieve from the
descriptor.attributeTypeInput. The type of attribute.
requestedElementCountInput. Number of elements to output to
arrayOfElements.elementCountInput. Output pointer for the number of elements the
descriptorattribute has. Note thatcudnnBackendGetAttribute()will only write the least of this andrequestedElementCountelements toarrayOfElements.arrayOfElementsInput. Array of elements of the datatype of the
attributeType. The data type of theattributeTypeis listed in the mapping table of cudnnBackendAttributeType_t.
Returns
CUDNN_STATUS_SUCCESSThe
attributeNamewas retrieved from thedescriptorsuccessfully.CUDNN_STATUS_BAD_PARAMOne or more invalid or inconsistent argument values were encountered. Some examples include:
attributeNameis not a valid attribute for the descriptor.attributeTypeis not one of the valid types for the attribute.
CUDNN_STATUS_NOT_INITIALIZEDThe
descriptorhas not been successfully finalized using cudnnBackendFinalize().
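For example, a common single-element query is reading the workspace size from a finalized execution plan; the sketch below (a hypothetical helper) uses the CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE attribute and CUDNN_TYPE_INT64 type documented under CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR later in this section:

#include <cudnn.h>
#include <stdint.h>

/* Sketch: read CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE from a finalized plan. */
int64_t query_workspace_size(cudnnBackendDescriptor_t executionPlan)
{
    int64_t workspaceSize = 0;
    int64_t elementCount  = 0;
    cudnnStatus_t status = cudnnBackendGetAttribute(executionPlan,
                                                    CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE,
                                                    CUDNN_TYPE_INT64,
                                                    1,              /* requestedElementCount */
                                                    &elementCount,  /* number of elements the attribute has */
                                                    &workspaceSize);
    return (status == CUDNN_STATUS_SUCCESS) ? workspaceSize : -1;
}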
cudnnBackendInitialize()#
This function has been deprecated in cuDNN 9.2.
This function repurposes a pre-allocated memory pointed to by a descriptor of size sizeInBytes to a backend descriptor of type descriptorType. The finalized state of the descriptor is set to false.
cudnnStatus_t cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor, cudnnBackendDescriptorType_t descriptorType, size_t sizeInBytes)
Parameters
descriptorInput. Instance of cudnnBackendDescriptor_t to be initialized.
descriptorTypeInput. Enumerated value for the type of cuDNN backend
descriptor.sizeInBytesInput. Size of memory pointed to by
descriptor.
Returns
CUDNN_STATUS_SUCCESSThe memory was initialized successfully.
CUDNN_STATUS_BAD_PARAMAn invalid or inconsistent argument value is encountered. Some examples include:
descriptoris anullptrsizeInBytesis less than the size required by thedescriptortype
cudnnBackendSetAttribute()#
This function sets an attribute of a descriptor to values provided as a pointer. descriptor is the descriptor to be set. attributeName is the name of the attribute to be set. attributeType is the type of attribute. The value to which the attribute is set is pointed to by arrayOfElements. The number of elements is given by elementCount. This function will return CUDNN_STATUS_NOT_INITIALIZED if the descriptor is already successfully finalized using cudnnBackendFinalize().
cudnnStatus_t cudnnBackendSetAttribute( cudnnBackendDescriptor_t descriptor, cudnnBackendAttributeName_t attributeName, cudnnBackendAttributeType_t attributeType, int64_t elementCount, void *arrayOfElements);
Parameters
descriptorInput. Instance of cudnnBackendDescriptor_t whose attribute is being set.
attributeNameInput. The name of the attribute being set on the
descriptor.attributeTypeInput. The type of attribute.
elementCountInput. Number of elements being set.
arrayOfElementsInput. The starting location of the array from which the values are read. The elements of the array are expected to be of the datatype of the
attributeType. The datatype of theattributeTypeis listed in the mapping table of cudnnBackendAttributeType_t.
Returns
CUDNN_STATUS_SUCCESSThe
attributeNamewas set to thedescriptor.CUDNN_STATUS_NOT_INITIALIZEDThe backend
descriptorpointed to by thedescriptoris already in the finalized state.CUDNN_STATUS_BAD_PARAMThe function is called with arguments that correspond to invalid values. Some examples include:
attributeNameis not a settable attribute ofdescriptor.attributeTypeis incorrect for thisattributeName.elemCountvalue is unexpected.arrayOfElementscontains values invalid for theattributeType.
CUDNN_STATUS_NOT_SUPPORTEDThe values to which the attributes are being set are not supported by the current version of cuDNN.
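As an illustration, a sketch (a hypothetical helper) that sets a single 64-bit attribute, the number of spatial dimensions, on a not-yet-finalized convolution descriptor; elementCount is 1 and arrayOfElements points at the value:

#include <cudnn.h>
#include <stdint.h>

/* Sketch: set CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS on a convolution descriptor. */
cudnnStatus_t set_spatial_dims(cudnnBackendDescriptor_t convDesc)
{
    int64_t spatialDims = 2;  /* for example, a 2D convolution */
    return cudnnBackendSetAttribute(convDesc,
                                    CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS,
                                    CUDNN_TYPE_INT64,
                                    1,             /* elementCount */
                                    &spatialDims); /* arrayOfElements */
}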
cudnnBackendUpdateCudaGraph()#
This method, part of the new Native CUDA Graph API, updates an existing CUDA graph previously populated by cudnnBackendPopulateCudaGraph() (or a clone thereof) with a new variantPack.
Only a limited number of engines currently support this API (with more to be added in future releases of cuDNN). Those supporting it have the behavior note CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API.
Note
This API is only supported in versions of cuDNN compiled against CUDA runtime 12.x and above.
cudnnStatus_t cudnnBackendUpdateCudaGraph(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack, cudaGraph_t cudaGraph)
Parameters
executionPlanInput. Pointer to the finalized
ExecutionPlan. This must match theExecutionPlanoriginally passed to cudnnBackendPopulateCudaGraph().variantPackInput. Pointer to a finalized
VariantPackconsisting of the following pointers, which replace theVariantPackpointers captured in the CUDA graph:Data pointer for each non-virtual pointer of the operation set in the execution plan.
Pointer to a user-allocated workspace in global memory at least as large as the size queried from
CUDNN_BACKEND_.
cudaGraphInput. Pointer to an existing CUDA graph handle. This graph must have been populated by cudnnBackendPopulateCudaGraph(), or be a clone thereof (for example, as created by
cudaGraphClone, or as embedded in a larger graph by usingcudaGraphAddChildGraphNode).
Returns
CUDNN_STATUS_SUCCESSThe CUDA graph was updated successfully.
CUDNN_STATUS_BAD_PARAMAn incorrect or inconsistent value is encountered. For example, a required data pointer is invalid.
CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCHThe CUDA graph doesn’t appear to have been populated by cudnnBackendPopulateCudaGraph() (or a clone thereof) for this execution plan, or contains unexpected additional nodes.
CUDNN_STATUS_INTERNAL_ERRORSome internal errors were encountered.
CUDNN_STATUS_EXECUTION_FAILEDAn error was encountered updating a CUDA graph for the plan with the variant pack.
CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_APIThis particular engine does not support the native CUDA graph API.
CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDARTThis cuDNN was built to a CUDA runtime less than 12.0, and doesn’t support the native CUDA graph API.
cudnnCreate()#
This function initializes the cuDNN library and creates a handle to an opaque structure holding the cuDNN library context. It allocates hardware resources on the host and device and must be called prior to making any other cuDNN library calls.
cudnnStatus_t cudnnCreate(cudnnHandle_t *handle)
The cuDNN library handle is tied to the current CUDA device (context). To use the library on multiple devices, one cuDNN handle needs to be created for each device.
For a given device, multiple cuDNN handles with different configurations (for example, different current CUDA streams) may be created. Because cudnnCreate() allocates some internal resources, the release of those resources by calling cudnnDestroy() will implicitly call cudaDeviceSynchronize; therefore, the recommended best practice is to call cudnnCreate() or cudnnDestroy() outside of performance-critical code paths.
For multithreaded applications that use the same device from different threads, the recommended programming model is to create one (or a few, as is convenient) cuDNN handles per thread and use that cuDNN handle for the entire life of the thread.
Parameters
handleOutput. Pointer to the location where the address of the allocated cuDNN handle should be stored. For more information, refer to cudnnHandle_t.
Returns
CUDNN_STATUS_BAD_PARAMInvalid (
NULL) input pointer supplied.CUDNN_STATUS_NOT_INITIALIZEDNo compatible GPU found, CUDA driver not installed or disabled, CUDA runtime API initialization failed.
CUDNN_STATUS_ARCH_MISMATCHNVIDIA GPU architecture is too old.
CUDNN_STATUS_ALLOC_FAILEDHost memory allocation failed.
CUDNN_STATUS_INTERNAL_ERRORCUDA resource allocation failed.
CUDNN_STATUS_LICENSE_ERRORcuDNN license validation failed (only when the feature is enabled).
CUDNN_STATUS_SUCCESScuDNN handle was created successfully.
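A minimal sketch of the recommended usage: create the handle once, outside of performance-critical paths, run work with it, and destroy it when done (stream setup is shown under cudnnSetStream()):

#include <cudnn.h>
#include <stdio.h>

int create_handle_example(void)
{
    cudnnHandle_t handle;
    cudnnStatus_t status = cudnnCreate(&handle);   /* allocates host and device resources */
    if (status != CUDNN_STATUS_SUCCESS) {
        printf("cudnnCreate failed: %s\n", cudnnGetErrorString(status));
        return -1;
    }
    /* ... launch cuDNN work with this handle ... */
    cudnnDestroy(handle);                          /* implicitly synchronizes the device */
    return 0;
}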
cudnnDestroy()#
This function releases the resources used by the cuDNN handle. This function is usually the last call made to cuDNN with a particular handle. Because cudnnCreate() allocates internal resources, the release of those resources by calling cudnnDestroy() will implicitly call cudaDeviceSynchronize; therefore, the recommended best practice is to call cudnnCreate() or cudnnDestroy() outside of performance-critical code paths.
cudnnStatus_t cudnnDestroy(cudnnHandle_t handle)
Parameters
handleInput. The cuDNN handle to be destroyed.
Returns
CUDNN_STATUS_SUCCESSThe cuDNN context destruction was successful.
cudnnGetCallback()#
This function queries the internal states of cuDNN error reporting functionality.
cudnnStatus_t cudnnGetCallback( unsigned *mask, void **udata, cudnnCallback_t *fptr)
Parameters
maskOutput. Pointer to the address where the current internal error reporting message bit mask will be outputted.
udataOutput. Pointer to the address where the current internally stored
udataaddress will be stored.fptrOutput. Pointer to the address where the current internally stored
callbackfunction pointer will be stored. When the built-in default callback function is used,NULLwill be outputted.
Returns
CUDNN_STATUS_SUCCESSThe function launched successfully.
CUDNN_STATUS_BAD_PARAMIf any of the input parameters are
NULL.
cudnnGetCudartVersion()#
The same version of a given cuDNN library can be compiled against different CUDA toolkit versions. This routine returns the CUDA toolkit version that the currently used cuDNN library has been compiled against.
size_t cudnnGetCudartVersion()
cudnnGetErrorString()#
This function converts the cuDNN status code to a NULL terminated (ASCIIZ) static string. For example, when the input argument is CUDNN_STATUS_SUCCESS, the returned string is CUDNN_STATUS_SUCCESS. When an invalid status value is passed to the function, the returned string is CUDNN_UNKNOWN_STATUS.
const char * cudnnGetErrorString(cudnnStatus_t status)
Parameters
statusInput. cuDNN enumerant status code.
Returns
Pointer to a static, NULL terminated string with the status name.
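A common pattern is to wrap cuDNN calls in a small status-checking helper. The sketch below defines a hypothetical CHECK_CUDNN macro (not part of the library) that prints the status name returned by cudnnGetErrorString():

#include <cudnn.h>
#include <stdio.h>

/* Hypothetical helper macro: print the status name when a cuDNN call fails. */
#define CHECK_CUDNN(call)                                                        \
    do {                                                                         \
        cudnnStatus_t s_ = (call);                                               \
        if (s_ != CUDNN_STATUS_SUCCESS) {                                        \
            fprintf(stderr, "%s failed: %s\n", #call, cudnnGetErrorString(s_));  \
        }                                                                        \
    } while (0)

It can then be used as, for example, CHECK_CUDNN(cudnnSetStream(handle, stream));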
cudnnGetLastErrorString()#
This function retrieves the last encountered cuDNN error message in the current thread to a NULL terminated (ASCIIZ) string. Inside the cuDNN library, the messages are stored in thread local buffers. The error is cleared after the user calls this API to retrieve it.
void cudnnGetLastErrorString(char *message, size_t max_size);
Parameters
messageOutput. Pointer to a character buffer that can store the error message. To avoid unnecessary overhead, cuDNN does not manage the thread safety of this pre-allocated output buffer; users are responsible for ensuring thread-safe use of the buffer where needed.
max_sizeInput. Maximum size that can be stored in the location pointed to by message. The output is strictly limited by the size limit
max_size, also counting the terminator character\0, which will be automatically appended to the message if there is space.
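A sketch of retrieving the most recent error message for the current thread into a caller-owned buffer:

#include <cudnn.h>
#include <stdio.h>

void print_last_cudnn_error(void)
{
    char message[1024];                                /* caller-owned buffer */
    cudnnGetLastErrorString(message, sizeof(message)); /* the stored error is cleared after retrieval */
    printf("last cuDNN error: %s\n", message);
}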
cudnnGetMaxDeviceVersion()#
This function returns the maximum SM version that the cuDNN library is aware of and supports natively. Any SM version higher than this would be supported in forward compatibility mode. For more information about forward compatibility, refer to the cuDNN Developer Guide.
size_t cudnnGetMaxDeviceVersion(void);
Returns
A size_t type value indicating the latest known SM number for the current version of the library. For example, if NVIDIA Hopper (GH100) is the latest known SM that the library is aware of, the value returned would be 900.
cudnnGetProperty()#
This function writes a specific part of the cuDNN library version number into the provided host storage.
cudnnStatus_t cudnnGetProperty( libraryPropertyType type, int *value)
Parameters
typeInput. Enumerant type that instructs the function to report the numerical value of the cuDNN major version, minor version, or the patch level depending on whether
typeis set toMAJOR_VERSION,MINOR_VERSION, orPATCH_LEVEL.valueOutput. Host pointer where the version information should be written.
Returns
CUDNN_STATUS_INVALID_VALUEInvalid value of the
typeargument.CUDNN_STATUS_SUCCESSVersion information was stored successfully at the provided address.
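For example, the following sketch assembles the full version number from the three properties named above (MAJOR_VERSION, MINOR_VERSION, PATCH_LEVEL are libraryPropertyType enumerators from the CUDA library_types.h header):

#include <cudnn.h>
#include <library_types.h>
#include <stdio.h>

void print_cudnn_version(void)
{
    int major = 0, minor = 0, patch = 0;
    cudnnGetProperty(MAJOR_VERSION, &major);
    cudnnGetProperty(MINOR_VERSION, &minor);
    cudnnGetProperty(PATCH_LEVEL,  &patch);
    printf("cuDNN %d.%d.%d (CUDNN_VERSION = %zu)\n", major, minor, patch, cudnnGetVersion());
}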
cudnnGetStream()#
This function retrieves the user CUDA stream programmed in the cuDNN handle. When the user’s CUDA stream is not set in the cuDNN handle, this function reports the null-stream.
cudnnStatus_t cudnnGetStream( cudnnHandle_t handle, cudaStream_t *streamId)
Parameters
handleInput. Pointer to the cuDNN handle.
streamIDOutput. Pointer where the current CUDA stream from the cuDNN handle should be stored.
Returns
CUDNN_STATUS_BAD_PARAMInvalid (
NULL) handle.CUDNN_STATUS_SUCCESSThe stream identifier was retrieved successfully.
cudnnGetVersion()#
This function returns the version number of the cuDNN library. It returns the CUDNN_VERSION value defined in the cudnn.h header file. Starting with release R2, the routine can be used to dynamically identify the cuDNN library currently used by the application. The defined CUDNN_VERSION can be used to have the same application linked against different cuDNN versions using conditional compilation statements.
size_t cudnnGetVersion()
cudnnGraphVersionCheck()#
Cross-library version checker. Each sub-library has a version checker that checks whether its own version matches that of its dependencies.
cudnnStatus_t cudnnGraphVersionCheck(void);
Returns
CUDNN_STATUS_SUCCESSThe version check passed.
CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCHThe versions are inconsistent.
cudnnQueryRuntimeError()#
cuDNN library functions perform extensive input argument checking before launching GPU kernels. The last step is to verify that the GPU kernel actually started. When a kernel fails to start, CUDNN_STATUS_EXECUTION_FAILED is returned by the corresponding API call. Typically, after a GPU kernel starts, no runtime checks are performed by the kernel itself - numerical results are simply written to output buffers.
cudnnStatus_t cudnnQueryRuntimeError( cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag)
When the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode is selected in cudnnBatchNormalizationForwardTraining() or cudnnBatchNormalizationBackward(), the algorithm may encounter numerical overflows where CUDNN_BATCHNORM_SPATIAL performs just fine albeit at a slower speed. The user can invoke cudnnQueryRuntimeError() to make sure numerical overflows did not occur during the kernel execution. Those issues are reported by the kernel that performs computations.
cudnnQueryRuntimeError() can be used in polling and blocking software control flows. There are two polling modes (CUDNN_ERRQUERY_RAWCODE and CUDNN_ERRQUERY_NONBLOCKING) and one blocking mode CUDNN_ERRQUERY_BLOCKING.
CUDNN_ERRQUERY_RAWCODE reads the error storage location regardless of the kernel completion status. The kernel might not even start and the error storage (allocated per cuDNN handle) might be used by an earlier call.
CUDNN_ERRQUERY_NONBLOCKING checks if all tasks in the user stream are completed. The cudnnQueryRuntimeError() function will return immediately and report CUDNN_STATUS_RUNTIME_IN_PROGRESS in rstatus if some tasks in the user stream are pending. Otherwise, the function will copy the remote kernel error code to rstatus.
In the blocking mode (CUDNN_ERRQUERY_BLOCKING), the function waits for all tasks to drain in the user stream before reporting the remote kernel error code. The blocking flavor can be further adjusted by calling cudaSetDeviceFlags with the cudaDeviceScheduleSpin, cudaDeviceScheduleYield, or cudaDeviceScheduleBlockingSync flag.
CUDNN_ERRQUERY_NONBLOCKING and CUDNN_ERRQUERY_BLOCKING modes should not be used when the user stream is changed in the cuDNN handle, meaning, cudnnSetStream() is invoked between functions that report runtime kernel errors and the cudnnQueryRuntimeError() function.
The remote error status reported in rstatus can be set to: CUDNN_STATUS_SUCCESS, CUDNN_STATUS_RUNTIME_IN_PROGRESS, or CUDNN_STATUS_RUNTIME_FP_OVERFLOW. The remote kernel error is automatically cleared by cudnnQueryRuntimeError().
The cudnnQueryRuntimeError() function should be used in conjunction with cudnnBatchNormalizationForwardTraining() and cudnnBatchNormalizationBackward() when the cudnnBatchNormMode_t argument is CUDNN_BATCHNORM_SPATIAL_PERSISTENT.
Parameters
handleInput. Handle to a previously created cuDNN context.
rstatusOutput. Pointer to the user’s error code storage.
modeInput. Remote error query mode.
tagInput/Output. Currently, this argument should be
NULL.
Returns
CUDNN_STATUS_SUCCESSNo errors detected (
rstatusholds a valid value).CUDNN_STATUS_BAD_PARAMInvalid input argument.
CUDNN_STATUS_INTERNAL_ERRORA stream blocking synchronization or a non-blocking stream query failed.
CUDNN_STATUS_MAPPING_ERRORThe device cannot access zero-copy memory to report kernel errors.
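A sketch of the blocking flavor, checking for a floating-point overflow reported by a persistent batch normalization kernel; the tag argument is passed as NULL as required above:

#include <cudnn.h>
#include <stdio.h>

void check_remote_kernel_status(cudnnHandle_t handle)
{
    cudnnStatus_t rstatus = CUDNN_STATUS_SUCCESS;
    /* Wait for all work in the handle's stream, then read the remote kernel error code. */
    cudnnStatus_t status = cudnnQueryRuntimeError(handle, &rstatus, CUDNN_ERRQUERY_BLOCKING, NULL);
    if (status == CUDNN_STATUS_SUCCESS && rstatus == CUDNN_STATUS_RUNTIME_FP_OVERFLOW) {
        printf("numerical overflow detected in a persistent batch norm kernel\n");
    }
}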
cudnnSetCallback()#
This function sets the internal states of cuDNN error reporting functionality.
cudnnStatus_t cudnnSetCallback( unsigned mask, void *udata, cudnnCallback_t fptr)
Parameters
maskInput. An unsigned integer. The four least significant bits (LSBs) of this unsigned integer are used for switching on and off the different levels of error reporting messages. This applies for both the default callbacks, and for the customized callbacks. The bit position is in correspondence with the enum of cudnnSeverity_t. The user may utilize the predefined macros
CUDNN_SEV_ERROR_EN,CUDNN_SEV_WARNING_EN, andCUDNN_SEV_INFO_ENto form the bit mask. When a bit is set to 1, the corresponding message channel is enabled.For example, when bit 3 is set to 1, the API logging is enabled. Currently, only the log output of level
CUDNN_SEV_INFOis functional; the others are not yet implemented. When used for turning on and off the logging with the default callback, the user may passNULLtoudataandfptr. In addition, the environment variableCUDNN_LOGDEST_DBGmust be set. For more information, refer to the Deprecation Policy.CUDNN_SEV_INFO_EN = 0b1000(functional).CUDNN_SEV_ERROR_EN = 0b0010(functional).CUDNN_SEV_WARNING_EN = 0b0100(functional).The output of
CUDNN_SEV_FATALis always enabled and cannot be disabled.udataInput. A pointer provided by the user. This pointer will be passed to the user’s custom logging callback function. The data it points to will not be read, nor be changed by cuDNN. This pointer may be used in many ways, such as in a mutex or in a communication socket for the user’s callback function for logging. If the user is utilizing the default callback function, or doesn’t want to use this input in the customized callback function, they may pass in
NULL.fptrInput. A pointer to a user-supplied callback function. When
NULLis passed to this pointer, then cuDNN switches back to the built-in default callback function. The user-supplied callback function prototype must be similar to the following (also defined in the header file):void customizedLoggingCallback (cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg);
The structure cudnnDebug_t is defined in the header file. It provides the metadata, such as time, time since start, stream ID, process and thread ID, that the user may choose to print or store in their customized callback.
The variable
msgis the logging message generated by cuDNN. Each line of this message is terminated by\0, and the end of the message is terminated by\0\0. Users may select what is necessary to show in the log, and may reformat the string.
Returns
CUDNN_STATUS_SUCCESSThe function launched successfully.
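A sketch of registering a user-supplied callback that follows the prototype shown above, enabling the error and warning channels through the predefined mask macros (the callback and helper names here are hypothetical):

#include <cudnn.h>
#include <stdio.h>

/* User-supplied callback following the documented prototype. */
static void myLoggingCallback(cudnnSeverity_t sev, void *udata,
                              const cudnnDebug_t *dbg, const char *msg)
{
    (void)udata; (void)dbg;                                   /* unused in this sketch */
    fprintf(stderr, "[cuDNN severity %d] %s\n", (int)sev, msg);
}

void install_callback(void)
{
    /* Enable error and warning channels; pass no user data. */
    cudnnSetCallback(CUDNN_SEV_ERROR_EN | CUDNN_SEV_WARNING_EN, NULL, myLoggingCallback);
}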
cudnnSetStream()#
This function sets the user’s CUDA stream in the cuDNN handle. The new stream will be used to launch cuDNN GPU kernels or to synchronize to this stream when cuDNN kernels are launched in the internal streams. If the cuDNN library stream is not set, all kernels use the default (NULL) stream. Setting the user stream in the cuDNN handle guarantees the issue-order execution of cuDNN calls and other GPU kernels launched in the same stream.
cudnnStatus_t cudnnSetStream( cudnnHandle_t handle, cudaStream_t streamId)
With CUDA 11.x or later, internal streams have the same priority as the stream set by the last call to this function. In CUDA graph capture mode, CUDA 11.8 or later is required in order for the stream priorities to match.
Parameters
handleInput. Pointer to the cuDNN handle.
streamIDInput. New CUDA stream to be written to the cuDNN handle.
Returns
CUDNN_STATUS_BAD_PARAMInvalid (
NULL) handle.CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCHMismatch between the user stream and the cuDNN handle context.
CUDNN_STATUS_NOT_SUPPORTEDThe stream priority is out of range.
CUDNN_STATUS_INTERNAL_ERRORCUDA stream APIs reported further errors inside cuDNN.
CUDNN_STATUS_SUCCESSThe new stream was set successfully.
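A sketch that creates a non-default CUDA stream, attaches it to the handle, and reads it back with cudnnGetStream():

#include <cudnn.h>
#include <cuda_runtime.h>

void attach_stream_example(cudnnHandle_t handle)
{
    cudaStream_t stream = NULL;
    cudaStreamCreate(&stream);            /* user-owned, non-default stream */
    cudnnSetStream(handle, stream);       /* subsequent cuDNN kernels are issued to this stream */

    cudaStream_t current = NULL;
    cudnnGetStream(handle, &current);     /* current now equals stream */

    /* ... enqueue cuDNN work ... */
    cudaStreamDestroy(stream);            /* only after all work using the stream has completed */
}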
Backend Descriptor Types#
This section enumerates all valid attributes of various descriptors.
CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR, &desc); the cuDNN backend convolution descriptor specifies the parameters for a convolution operator for both forward and backward propagation: compute data type, convolution mode, filter dilation and stride, and padding on both sides.
Attributes
Attributes of a cuDNN backend convolution descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_CONVOLUTION_:
CUDNN_ATTR_CONVOLUTION_COMP_TYPEThe compute type of the convolution operator.
CUDNN_TYPE_DATA_TYPE; one element.Required attribute.
CUDNN_ATTR_CONVOLUTION_CONV_MODEConvolution or cross-correlation mode.
CUDNN_TYPE_CONVOLUTION_MODE; one element.Required attribute.
CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMSThe number of spatial dimensions, expected array length for each of dilations, filter strides, and padding arrays.
CUDNN_TYPE_INT64; one element.Required attribute.
CUDNN_ATTR_CONVOLUTION_DILATIONSFilter dilation.
CUDNN_TYPE_INT64; one or more, but at mostCUDNN_MAX_DIMSelements.Required attribute.
CUDNN_ATTR_CONVOLUTION_FILTER_STRIDESFilter stride.
CUDNN_TYPE_INT64; one or more, but at mostCUDNN_MAX_DIMSelements.Required attribute.
CUDNN_ATTR_CONVOLUTION_PRE_PADDINGSPadding at the beginning of each spatial dimension.
CUDNN_TYPE_INT64; one or more, but at mostCUDNN_MAX_DIMSelements.Required attribute.
CUDNN_ATTR_CONVOLUTION_POST_PADDINGSPadding at the end of each spatial dimension.
CUDNN_TYPE_INT64; one or more, but at mostCUDNN_MAX_DIMSelements.Required attribute.
Finalization
cudnnBackendFinalize() with a CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR can have the following return values:
CUDNN_STATUS_BAD_PARAMAn
elemCountargument for settingCUDNN_ATTR_CONVOLUTION_DILATIONS,CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES,CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS, andCUDNN_ATTR_CONVOLUTION_POST_PADDINGSis not equal to the value set forCUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS.CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
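Putting the required attributes together, a sketch (a hypothetical helper, assuming a 2D cross-correlation with FP32 accumulation via the cudnnDataType_t and cudnnConvolutionMode_t enums) that builds and finalizes a convolution descriptor:

#include <cudnn.h>
#include <stdint.h>

/* Sketch: build a 2D convolution descriptor with all required attributes set. */
cudnnStatus_t build_conv_descriptor(cudnnBackendDescriptor_t *outDesc)
{
    cudnnBackendDescriptor_t conv;
    cudnnStatus_t s = cudnnBackendCreateDescriptor(CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR, &conv);
    if (s != CUDNN_STATUS_SUCCESS) return s;

    cudnnDataType_t compType = CUDNN_DATA_FLOAT;             /* accumulation precision */
    cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
    int64_t spatialDims = 2;
    int64_t pad[]      = {1, 1};
    int64_t stride[]   = {1, 1};
    int64_t dilation[] = {1, 1};

    cudnnBackendSetAttribute(conv, CUDNN_ATTR_CONVOLUTION_COMP_TYPE,      CUDNN_TYPE_DATA_TYPE,        1, &compType);
    cudnnBackendSetAttribute(conv, CUDNN_ATTR_CONVOLUTION_CONV_MODE,      CUDNN_TYPE_CONVOLUTION_MODE, 1, &mode);
    cudnnBackendSetAttribute(conv, CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS,   CUDNN_TYPE_INT64,            1, &spatialDims);
    cudnnBackendSetAttribute(conv, CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS,   CUDNN_TYPE_INT64,            2, pad);
    cudnnBackendSetAttribute(conv, CUDNN_ATTR_CONVOLUTION_POST_PADDINGS,  CUDNN_TYPE_INT64,            2, pad);
    cudnnBackendSetAttribute(conv, CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES, CUDNN_TYPE_INT64,            2, stride);
    cudnnBackendSetAttribute(conv, CUDNN_ATTR_CONVOLUTION_DILATIONS,      CUDNN_TYPE_INT64,            2, dilation);

    s = cudnnBackendFinalize(conv);                          /* cross-checks the attribute set */
    if (s != CUDNN_STATUS_SUCCESS) { cudnnBackendDestroyDescriptor(conv); return s; }
    *outDesc = conv;
    return CUDNN_STATUS_SUCCESS;
}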
CUDNN_BACKEND_ENGINE_DESCRIPTOR#
Created with descriptor type value CUDNN_BACKEND_ENGINE_DESCRIPTOR, cuDNN backend engine descriptor describes an engine to compute an operation graph. An engine is a grouping of kernels with similar compute and numerical attributes.
Attributes
Attributes of a cuDNN backend engine descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_ENGINE_:
CUDNN_ATTR_ENGINE_OPERATION_GRAPHThe operation graph to compute.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR.Required attribute.
CUDNN_ATTR_ENGINE_GLOBAL_INDEXThe index for the engine.
CUDNN_TYPE_INT64; one element.Valid values are between
0andCUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT-1.Required attribute.
CUDNN_ATTR_ENGINE_KNOB_INFOThe descriptors of performance knobs of the engine.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR.Read-only attribute.
CUDNN_ATTR_ENGINE_NUMERICAL_NOTEThe numerical attributes of the engine.
CUDNN_TYPE_NUMERICAL_NOTE; zero or more elements.Read-only attribute.
CUDNN_ATTR_ENGINE_LAYOUT_INFOThe preferred tensor layouts of the engine.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR.Read-only attribute.
CUDNN_ATTR_ENGINE_BEHAVIOR_NOTEThe behavior attributes of the engine.
CUDNN_TYPE_BEHAVIOR_NOTE; zero or more elements.Read-only attribute.
CUDNN_ATTR_ENGINE_SM_COUNT_TARGETThe number of SMs to target.
CUDNN_TYPE_INT32; one element.Valid values are between
0and the number of SMs on the device, where0is default meaning all the SMs will be used.Optional attribute.
CUDNN_ATTR_ENGINE_DEVICEPROPThe descriptor of the device that this engine descriptor targets.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR.Optional attribute.
Finalization
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_STATUS_NOT_SUPPORTEDThe descriptor attribute set is not supported by the current version of cuDNN. For example, the value of
CUDNN_ATTR_ENGINE_GLOBAL_INDEXis not in a valid range.CUDNN_STATUS_BAD_PARAMThe descriptor attribute set is inconsistent or in an unexpected state. For example, the operation graph descriptor set is not already finalized.
CUDNN_BACKEND_ENGINECFG_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &desc); the cuDNN backend engine configuration descriptor consists of an engine descriptor and an array of knob choice descriptors. Users can query the engine config for information about intermediates: computational intermediate results that can be reused between executions.
Attributes
CUDNN_ATTR_ENGINECFG_ENGINEThe backend engine.
CUDNN_TYPE_BACKEND_DESCRIPTOR: one element, a backend descriptor of type CUDNN_BACKEND_ENGINE_DESCRIPTOR.Required attribute.
CUDNN_ATTR_ENGINECFG_KNOB_CHOICESThe engine tuning knobs and choices.
CUDNN_TYPE_BACKEND_DESCRIPTOR: zero or more elements, backend descriptors of type CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR.
CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFOInformation of the computational intermediate of this engine config.
CUDNN_TYPE_BACKEND_DESCRIPTOR: one element, a backend descriptor of type CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR.Read-only attribute.
Currently unsupported. Placeholder for future implementation.
CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZEThe size of the workspace buffer required to execute this engine config.
CUDNN_TYPE_INT64; one element.Read-only attribute.
Finalization
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_STATUS_NOT_SUPPORTEDThe descriptor attribute set is not supported by the current version of cuDNN. For example, an unsupported knob choice value is encountered.
CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR, &desc); the cuDNN backend engine heuristics descriptor allows users to obtain, for a given operation graph, engine configuration descriptors ranked by performance according to cuDNN’s heuristics.
Attributes
CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPHThe operation graph for which heuristics results are queried.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element.Required attribute.
CUDNN_ATTR_ENGINEHEUR_MODEThe heuristic mode to query the result.
CUDNN_TYPE_HEUR_MODE; one element.Required attribute.
CUDNN_ATTR_ENGINEHEUR_RESULTSThe result of the heuristics query.
CUDNN_TYPE_BACKEND_DESCRIPTOR; zero or more elements of descriptor type CUDNN_BACKEND_ENGINECFG_DESCRIPTOR.Get-only attribute.
CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGETThe number of SMs to target.
CUDNN_TYPE_INT32; one element.Valid values are between
0and the number of SMs on the device, where0is default meaning all the SMs will be used.Optional attribute.
CUDNN_ATTR_ENGINEHEUR_DEVICEPROPThe descriptor of the device that this engine heuristics descriptor targets.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR.Optional attribute.
Finalization
Return values of cudnnBackendFinalize(desc) where desc is a cuDNN backend engine heuristics descriptor:
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
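For example, a sketch (a hypothetical helper) of querying heuristics for an already-finalized operation graph. The heuristic mode value CUDNN_HEUR_MODE_A and the type name cudnnBackendHeurMode_t are assumed here; the caller creates the engine config descriptor before cudnnBackendGetAttribute() fills it in with the top-ranked result:

#include <cudnn.h>
#include <stdint.h>

/* Sketch: ask heuristics for the top-ranked engine config of a finalized operation graph. */
cudnnStatus_t pick_engine_config(cudnnBackendDescriptor_t opGraph,
                                 cudnnBackendDescriptor_t *engineConfigOut)
{
    cudnnBackendDescriptor_t heur;
    cudnnStatus_t s = cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR, &heur);
    if (s != CUDNN_STATUS_SUCCESS) return s;

    cudnnBackendHeurMode_t mode = CUDNN_HEUR_MODE_A;   /* assumed heuristic mode */
    cudnnBackendSetAttribute(heur, CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &opGraph);
    cudnnBackendSetAttribute(heur, CUDNN_ATTR_ENGINEHEUR_MODE,            CUDNN_TYPE_HEUR_MODE,          1, &mode);
    s = cudnnBackendFinalize(heur);
    if (s != CUDNN_STATUS_SUCCESS) { cudnnBackendDestroyDescriptor(heur); return s; }

    /* Caller-created descriptor receives the top-ranked engine config. */
    cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, engineConfigOut);
    int64_t returned = 0;
    s = cudnnBackendGetAttribute(heur, CUDNN_ATTR_ENGINEHEUR_RESULTS, CUDNN_TYPE_BACKEND_DESCRIPTOR,
                                 1, &returned, engineConfigOut);
    cudnnBackendDestroyDescriptor(heur);
    return s;
}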
CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, &desc); the cuDNN backend execution plan descriptor allows the user to specify an execution plan, which consists of a cuDNN handle, an engine configuration, and optionally an array of intermediates to compute.
Attributes
CUDNN_ATTR_EXECUTION_PLAN_HANDLEA cuDNN handle.
CUDNN_TYPE_HANDLE; one element.Required attribute.
CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIGAn engine configuration to execute.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor typeCUDNN_BACKEND_ENGINECFG_DESCRIPTOR.Required attribute.
CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDSUnique identifiers of intermediates to compute.
CUDNN_TYPE_INT64; zero or more elements.Optional attribute. If set, the execution plan will only compute the specified intermediate and not any of the output tensors on the operation graph in the engine configuration.
CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDSUnique identifiers of precomputed intermediates.
CUDNN_TYPE_INT64; zero or more elements.Optional attribute. If set, the plan will expect and use pointers for each intermediate in the variant pack descriptor during execution.
Currently unsupported. Placeholder for future implementation.
CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZEThe size of the workspace buffer required to execute this plan.
CUDNN_TYPE_INT64; one element.Read-only attribute.
CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATIONThe JSON representation of the serialized execution plan. Serialization and deserialization can be done by getting and setting this attribute, respectively.
CUDNN_TYPE_CHAR; many elements, the same amount as the size of a null-terminated string of the json representation of the execution plan.
CUDNN_ATTR_EXECUTION_PLAN_KERNEL_CACHEThe kernel cache that the execution plan can refer to in order to accelerate the finalization for runtime fusion engines by reusing a previously compiled identical kernel implementation.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor typeCUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR.
CUDNN_ATTR_EXECUTION_PLAN_DEVICEPROPThe descriptor of the device that this execution plan targets.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR.Optional attribute.
Finalization
Return values of cudnnBackendFinalize(desc) where desc is a cuDNN backend execution plan descriptor:
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
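As an illustration of the serialization attribute, a sketch (a hypothetical helper) that extracts the JSON representation of a finalized plan into a caller-allocated buffer; it assumes the attribute size can first be queried by passing a requestedElementCount of 0 and a null arrayOfElements:

#include <cudnn.h>
#include <stdint.h>
#include <stdlib.h>

/* Sketch: serialize a finalized execution plan to its JSON representation. */
char *serialize_plan(cudnnBackendDescriptor_t plan)
{
    int64_t size = 0;
    /* First call: only ask how many CUDNN_TYPE_CHAR elements the attribute holds. */
    if (cudnnBackendGetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION,
                                 CUDNN_TYPE_CHAR, 0, &size, NULL) != CUDNN_STATUS_SUCCESS || size <= 0) {
        return NULL;
    }
    char *json = (char *)malloc((size_t)size);
    if (!json) return NULL;
    /* Second call: copy the null-terminated JSON string into the buffer. */
    if (cudnnBackendGetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION,
                                 CUDNN_TYPE_CHAR, size, &size, json) != CUDNN_STATUS_SUCCESS) {
        free(json);
        return NULL;
    }
    return json; /* caller frees */
}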
CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR, &desc); the cuDNN backend intermediate descriptor is a read-only descriptor that contains information about an execution intermediate. An execution intermediate is some intermediate computation for an engine config in device memory that can be reused between plan executions to amortize the kernel cost. Each intermediate is identified by a unique ID. Users can query for the device memory size of the intermediate. An intermediate can depend on the data of one or more tensors identified by the tensor UIDs or on one or more attributes of the operation graph.
This is a read-only descriptor. Users cannot set the descriptor attributes or finalize the descriptor. Users query for a finalized descriptor from an engine config descriptor.
Attributes
CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_IDA unique identifier of the intermediate.
CUDNN_TYPE_INT64; one element.Read-only attribute.
CUDNN_ATTR_INTERMEDIATE_INFO_SIZEThe required device memory size for the intermediate.
CUDNN_TYPE_INT64; one element.Read-only attribute.
CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDSUID of tensors on which the intermediate depends.
CUDNN_TYPE_INT64; zero or more elements.Read-only attribute.
CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTESCurrently unsupported. Placeholder for future implementation.
Finalization
Users do not finalize this descriptor. cudnnBackendFinalize(desc) with a backend intermediate descriptor returns CUDNN_STATUS_NOT_SUPPORTED.
CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR, &desc); the cuDNN backend knob choice descriptor consists of the type of knobs to be set and the value to which the knob is set.
Attributes
CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPEThe type of knobs to be set.
CUDNN_TYPE_KNOB_TYPE: one element.Required attribute.
CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUEThe value of the knobs to be set.
CUDNN_TYPE_INT64: one element.Required attribute.
Finalization
Return values of cudnnBackendFinalize(desc) where desc is a cuDNN backend knob choice descriptor:
CUDNN_STATUS_SUCCESSThe knob choice descriptor was finalized successfully.
CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR, &desc); the cuDNN backend knob info descriptor consists of the type and valid value range of an engine performance knob. The valid value range is given in terms of the minimum, maximum, and stride of valid values. This is a purely informative descriptor type. Setting descriptor attributes is not supported. Users obtain an array of finalized descriptors, one for each knob type, from a finalized backend descriptor.
Attributes
CUDNN_ATTR_KNOB_INFO_TYPEThe type of the performance knob.
CUDNN_TYPE_KNOB_TYPE: one element.Read-only attribute.
CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUEThe largest valid choice value for this knob.
CUDNN_TYPE_INT64: one element.Read-only attribute.
CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUEThe smallest valid choice value for this knob.
CUDNN_TYPE_INT64: one element.Read-only attribute.
CUDNN_ATTR_KNOB_INFO_STRIDEThe stride of valid choice values for this knob.
CUDNN_TYPE_INT64: one element.Read-only attribute.
Finalization
This descriptor is read-only; it is retrieved and finalized from a cuDNN backend engine configuration descriptor. Users cannot set or finalize.
CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR#
Created with descriptor type value CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR, cuDNN backend layout info descriptor provides information on the preferred layout for a tensor.
Attributes
CUDNN_ATTR_LAYOUT_INFO_TENSOR_UIDThe UID of the tensor.
CUDNN_TYPE_INT64; one element.Read-only attribute.
CUDNN_ATTR_LAYOUT_INFO_TYPESThe preferred layout of the tensor.
CUDNN_TYPE_LAYOUT_TYPE: zero or more elements of type cudnnBackendLayoutType_t.Read-only attribute.
Finalization
This descriptor is read-only; it is retrieved and finalized from a cuDNN backend engine configuration descriptor. Users cannot set its attribute or finalize it.
CUDNN_BACKEND_MATMUL_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_MATMUL_DESCRIPTOR, &desc); the cuDNN backend matmul descriptor specifies any metadata needed for the matmul operation.
Attributes
CUDNN_ATTR_MATMUL_COMP_TYPEThe compute precision used for the
matmuloperation.CUDNN_TYPE_DATA_TYPE; one element.Required attribute.
Finalization
Return values of cudnnBackendFinalize(desc) where desc is a cuDNN backend matmul descriptor:
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR, &desc); the cuDNN backend concatenation operation descriptor specifies an operation node for concatenating a given vector of tensors along a given concatenation axis.
This operation also supports an in-place mode, where one of the input tensors is already assumed to be at the correct location in the output tensor, that is, they share the same device buffer.
Attributes
Attributes of a cuDNN backend concat operation descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_CONCAT_:
CUDNN_ATTR_OPERATION_CONCAT_AXISThe dimension over which the tensors are concatenated.
Type:
CUDNN_TYPE_INT64Required attribute.
CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCSA vector of input tensor descriptors, which are concatenated in the same order as provided in this vector.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one or more elements of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEXThe index of input tensor in the vector of input tensor descriptors that is already present in-place in the output tensor.
Type:
CUDNN_TYPE_INT64Optional attribute.
CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESCThe output tensor descriptor for the result from concatenation of input tensors.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
Finalization
cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR can have the following return values:
CUDNN_STATUS_BAD_PARAMInvalid or inconsistent attribute values are encountered. Some examples include:
The tensors involved in the operation should have the same shape in all dimensions except the dimension that they are being concatenated over.
The output tensor shape in the concatenating dimension should equal the sum of tensor shape of all input tensors in that same dimension.
Concatenation axis should be a valid tensor dimension.
If provided, the in-place input tensor index should be a valid index in the vector of input tensor descriptors.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR, &desc); the cuDNN backend convolution backward data operation descriptor specifies an operation node for convolution backward data to compute the gradient of input data dx with filter tensor w and gradient of response dy with output \(\alpha\) scaling and residue add with \(\beta\) scaling. That is, the equation: \(dx=\alpha\left( w\bar{*}dy \right)+\beta dx\) where \(\bar{*}\) denotes the convolution backward data operator.
Attributes
Attributes of a cuDNN backend convolution descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_:
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHAThe alpha value.
CUDNN_TYPE_FLOATorCUDNN_TYPE_DOUBLE; one or more elements.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETAThe beta value.
CUDNN_TYPE_FLOATorCUDNN_TYPE_DOUBLE; one or more elements.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESCThe convolution operator descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_WThe convolution filter tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DXThe image gradient tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DYThe response gradient tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
Finalization
In finalizing the convolution operation, the tensor dimensions of the tensor DX, W, and DY are bound based on the same interpretations as the X, W, and Y tensor dimensions described in the CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR section.
cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR can have the following return values:
CUDNN_STATUS_BAD_PARAMInvalid or inconsistent attribute values are encountered. For example, the
DX,W, andDYtensors do not constitute a valid convolution operation under the convolution operator.CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR, &desc); the cuDNN backend convolution backward filter operation descriptor specifies an operation node for convolution backward filter to compute the gradient of filter dw with image tensor x and gradient of response dy with output \(\alpha\) scaling and residue add with \(\beta\) scaling. That is, the equation: \(dw=\alpha\left( x\tilde{*}dy \right)+\beta dw\) where \(\tilde{*}\) denotes the convolution backward filter operator.
Attributes
Attributes of a cuDNN backend convolution descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_:
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHAThe alpha value.
CUDNN_TYPE_FLOATorCUDNN_TYPE_DOUBLE; one or more elements.Required attribute. Required to be set before finalization.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETAThe beta value.
CUDNN_TYPE_FLOATorCUDNN_TYPE_DOUBLE; one or more elements.Required attribute. Required to be set before finalization.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESCThe convolution operator descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR.Required attribute. Required to be set before finalization.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DWThe convolution filter tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute. Required to be set before finalization.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_XThe image tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute. Required to be set before finalization.
CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DYThe response gradient tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute. Required to be set before finalization.
Finalization
In finalizing the convolution operation, the tensor dimensions of the tensor X, DW, and DY are bound based on the same interpretations as the X, W, and Y tensor dimensions described in the CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR section.
cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR can have the following return values:
CUDNN_STATUS_BAD_PARAMInvalid or inconsistent attribute values are encountered. For example, the
X,DW, andDYtensors do not constitute a valid convolution operation under the convolution operator.CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, &desc); the cuDNN backend convolution forward operation descriptor specifies an operation node for forward convolution to compute the response tensor y of image tensor x convoluted with filter tensor w with output scaling \(\alpha\) and residual add with \(\beta\) scaling. That is, the equation: \(y=\alpha\left( w*x \right)+\beta y\) where \(*\) is the convolution operator in the forward direction.
Attributes
Attributes of a cuDNN backend convolution descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_:
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHAThe alpha value.
CUDNN_TYPE_FLOATorCUDNN_TYPE_DOUBLE; one or more elements.Required to be set before finalization.
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETAThe beta value.
CUDNN_TYPE_FLOATorCUDNN_TYPE_DOUBLE; one or more elements.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESCThe convolution operator descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_WThe convolution filter tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_XThe image tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_YThe response tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
Finalization
In finalizing the convolution operation, the tensor dimensions of the tensor X, W, and Y are bound based on the following interpretations:
The CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS attribute of CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC is the number of spatial dimensions of the convolution. The number of dimensions for tensors X, W, and Y must be larger than the number of spatial dimensions by 2 or 3 depending on how users choose to specify the convolution tensors.
If the number of tensor dimensions is the number of spatial dimensions plus 2:
Xtensor dimension and stride arrays are[N, GC, …]
Wtensor dimension and stride arrays are[GK, C, …]
Ytensor dimension and stride arrays are[N, GK, …]
Where the ellipsis … are shorthand for the spatial dimensions of each tensor, G is the number of convolution groups, and C and K are the number of input and output feature maps per group. In this interpretation, it is assumed that the memory layout for each group is packed. cudnnBackendFinalize() asserts that the tensor dimensions and strides are consistent with this interpretation or it returns CUDNN_STATUS_BAD_PARAM.
cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR can have the following return values:
CUDNN_STATUS_BAD_PARAMInvalid or inconsistent attribute values are encountered. For example, the
X,W, andYtensors do not constitute a valid convolution operation under the convolution operator.CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
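Assuming finalized tensor descriptors xDesc, wDesc, yDesc and a finalized convolution descriptor convDesc already exist, a sketch (a hypothetical helper) of assembling the forward operation node:

#include <cudnn.h>

/* Sketch: build a convolution forward operation node from pre-finalized descriptors. */
cudnnStatus_t build_conv_fwd_op(cudnnBackendDescriptor_t convDesc,
                                cudnnBackendDescriptor_t xDesc,
                                cudnnBackendDescriptor_t wDesc,
                                cudnnBackendDescriptor_t yDesc,
                                cudnnBackendDescriptor_t *opOut)
{
    cudnnBackendDescriptor_t op;
    cudnnStatus_t s = cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, &op);
    if (s != CUDNN_STATUS_SUCCESS) return s;

    float alpha = 1.0f, beta = 0.0f;   /* y = alpha * (w * x) + beta * y */
    cudnnBackendSetAttribute(op, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA,     CUDNN_TYPE_FLOAT,              1, &alpha);
    cudnnBackendSetAttribute(op, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA,      CUDNN_TYPE_FLOAT,              1, &beta);
    cudnnBackendSetAttribute(op, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &convDesc);
    cudnnBackendSetAttribute(op, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X,         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &xDesc);
    cudnnBackendSetAttribute(op, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W,         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &wDesc);
    cudnnBackendSetAttribute(op, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y,         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &yDesc);

    s = cudnnBackendFinalize(op);      /* checks that X, W, and Y form a valid convolution */
    if (s != CUDNN_STATUS_SUCCESS) { cudnnBackendDestroyDescriptor(op); return s; }
    *opOut = op;
    return CUDNN_STATUS_SUCCESS;
}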
CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR#
Represents an operation that will generate per-channel statistics. The specific statistics that will be generated depend on the CUDNN_ATTR_OPERATION_GENSTATS_MODE attribute in the descriptor. Currently, only CUDNN_GENSTATS_SUM_SQSUM is supported for CUDNN_ATTR_OPERATION_GENSTATS_MODE. It will generate the sum and quadratic sum of per-channel elements of the input tensor x. The output dimensions should all be 1 except the C dimension. Also, the C dimension of the outputs should equal the C dimension of the input. This opaque struct can be created with cudnnBackendCreateDescriptor() (CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR).
Attributes
CUDNN_ATTR_OPERATION_GENSTATS_MODESets the
CUDNN_TYPE_GENSTATS_MODEof the operation. This attribute is required.CUDNN_ATTR_OPERATION_GENSTATS_MATH_PRECThe math precision of the computation. This attribute is required.
CUDNN_ATTR_OPERATION_GENSTATS_XDESCSets the descriptor for the input tensor
X. This attribute is required.CUDNN_ATTR_OPERATION_GENSTATS_SUMDESCSets the descriptor for the output tensor
sum. This attribute is required.CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESCSets the descriptor for the output tensor
quadraticsum. This attribute is required.
Finalization
In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:
CUDNN_STATUS_BAD_PARAMInvalid or inconsistent attribute values are encountered. Some examples include:
The number of dimensions does not match between the input and output tensors.
The input/output tensor dimensions do not agree with the above description.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR, &desc); the cuDNN backend matmul operation descriptor specifies an operation node for matmul to compute the matrix product C by multiplying Matrix A and Matrix B, as shown in the following equation: C=AB
When using the matmul operation, the matrices are expected to be at least rank-2 tensors. The last two dimensions correspond to M x K for Matrix A, K x N for Matrix B, and M x N for Matrix C. All the preceding dimensions are interpreted as batch dimensions. If there are zero batch dimensions then the requirements are as follows:
| Case | Matrix A | Matrix B | Matrix C |
|---|---|---|---|
| Single Matmul | M x K | K x N | M x N |
For a single batch dimension we have the following requirements:
| Case | Matrix A | Matrix B | Matrix C |
|---|---|---|---|
| Single Matmul | 1 x M x K | 1 x K x N | 1 x M x N |
| Batch Matmul | B x M x K | B x K x N | B x M x N |
| Broadcast A | (B/c) x M x K | B x K x N | B x M x N |
| Broadcast B | B x M x K | (B/c) x K x N | B x M x N |
Where:
B indicates the batch size
M is the number of rows of the Matrix A
K is the number of columns of the input Matrix A (which is the same as the number of rows of the input Matrix B)
N is the number of columns of the input Matrix B
c is a constant integer and a factor of B
If either the batch size of Matrix A or B is set to B/c, this indicates that the matrix will be broadcasted in the batch matmul. The resulting output Matrix C will be a tensor of B x M x N.
The above broadcasting convention is extended to all the batch dimensions. Concretely, for tensors with three batch dimensions:
| Case | Matrix A | Matrix B | Matrix C |
|---|---|---|---|
| Multiple Batched Matmul | B1 x 1 x B3 x M x K | 1 x B2 x (B3/c) x K x N | B1 x B2 x B3 x M x N |
The functionality of having multiple batch dimensions allows you to have layouts where the batch is not packed at a single stride. This case is especially seen in multihead attention. c is only allowed to be B (leading to a batch dimension of 1) for matmul and matmul fusions. The other possible values of c are supported for Grouped Query Attention in the cuDNN Fused Flash Attention.
The addressing of the matrix elements from a given tensor can be specified using strides in the tensor descriptor. The strides represent the spacing between elements for each tensor dimension. Considering a matrix tensor A (B x M x N) with strides [BS, MS, NS], it indicates that the actual matrix element A[x, y, z] is found at (A_base_address + x * BS + y * MS + z * NS) from the linear memory space allocated for tensor A. With our current support, the innermost dimension must be packed, which requires either MS=1 or NS=1. Otherwise, there are no other technical constraints with regard to how the strides can be specified in a tensor descriptor as it should follow the aforementioned addressing formula and the strides as specified by the user.
This representation provides support for some common usages, such as leading dimension and matrix transpose as we will explain through the following examples.
The most basic case is a fully packed row-major batch matrix, without any consideration of leading dimension or transpose. In this case, BS = M*N, MS = N, and NS = 1.
Matrix transpose can be achieved by exchanging the inner and outer dimensions using strides. Namely:
To specify a non-transposed matrix: BS = M*N, MS = N, and NS = 1.
To specify matrix transpose: BS = M*N, MS = 1, and NS = M.
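As a concrete illustration, the dimension and stride arrays for a single fully packed row-major B x M x N matrix and for the same memory interpreted as its transpose would look like this (a sketch with hypothetical sizes):

#include <stdint.h>

/* Hypothetical sizes for illustration. */
enum { B = 4, M = 64, N = 32 };

/* Fully packed row-major matrix: element [b, m, n] lives at b*M*N + m*N + n. */
int64_t dims[3]               = {B, M, N};
int64_t strides_row_major[3]  = {M * N, N, 1};   /* BS = M*N, MS = N, NS = 1 */

/* Same memory addressed as the transposed matrix: swap the inner strides. */
int64_t strides_transposed[3] = {M * N, 1, M};   /* BS = M*N, MS = 1, NS = M */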
Leading dimension, a widely used concept in BLAS-like APIs, describes the inner dimension of the 2D array memory allocation (as opposed to the conceptual matrix dimension). It resembles the stride in that it defines the spacing between elements in the outer dimension. The most typical use cases where it differs from the matrix inner dimension are when the matrix is only part of the data in the allocated memory, when addressing submatrices, or when addressing matrices from an aligned memory allocation. Therefore, the leading dimension LDA in a column-major matrix A must satisfy LDA >= M, whereas in a row-major matrix A, it must satisfy LDA >= N. To transition from the leading dimension concept to using strides, this entails MS >= N and NS = 1, or MS = 1 and NS >= M. Keep in mind that, while these are some practical use cases, these inequalities do not impose technical constraints with respect to an acceptable specification of the strides.
Other commonly used GEMM features, such as alpha/beta output blending, can also be achieved using this matmul operation along with other pointwise operations.
Attributes
Attributes of a cuDNN backend matmul descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_MATMUL_:
CUDNN_ATTR_OPERATION_MATMUL_ADESCThe Matrix A descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_MATMUL_BDESCThe Matrix B descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_MATMUL_CDESCThe Matrix C descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNTNumber of
matmuloperations to perform in the batch on matrix.CUDNN_TYPE_INT64; one element.Default value is
1.
CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESCThe tensor
gemm_m_overridedescriptor. Allows you to override the M dimension of a batch matmul through this tensor. It is only supported as documented in the Fused Attention fprop, Fused Attention bprop, Fused Flash Attention fprop, and Fused Flash Attention bprop sections.CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESCThe tensor
gemm_n_overridedescriptor. Allows you to override the N dimension of a batch matmul through this tensor. It is only supported as documented in the Fused Attention fprop, Fused Attention bprop, Fused Flash Attention fprop, and Fused Flash Attention bprop sections.CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESCThe tensor
gemm_k_overridedescriptor. Allows you to override the K dimension of a batch matmul through this tensor. It is only supported as documented in the Fused Attention fprop, Fused Attention bprop, Fused Flash Attention fprop, and Fused Flash Attention bprop sections.CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_MATMUL_DESCThe
matmuloperation descriptor.CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_MATMUL_DESCRIPTOR.Required attribute.
Finalization
In the finalization of the matmul operation, the tensor dimensions of the Matrices A, B, and C will be checked to ensure that they satisfy the requirements of matmul:
cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR can have the following return values:
CUDNN_STATUS_NOT_SUPPORTEDAn unsupported attribute value was encountered. For example, if not all of the Matrices A, B, and C are at least rank-2 tensors.
CUDNN_STATUS_BAD_PARAMInvalid or inconsistent attribute values are encountered. Some examples include:
The
CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNTspecified is a negative value.The
CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNTand one or more of the batch sizes of the Matrices A, B, and C are not equal to one. That is to say there is a conflict where both irregularly and regularly strided batched matmul are specified, which is not a valid use case.The dimensions of the Matrices A, B, and C do not satisfy the matmul requirements.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
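As a rough, non-authoritative sketch of how a matmul node might be assembled with the backend API, assuming aDesc, bDesc, and cDesc are already finalized CUDNN_BACKEND_TENSOR_DESCRIPTORs, that the matmul descriptor exposes a compute-type attribute named CUDNN_ATTR_MATMUL_COMP_TYPE, and with status checking omitted:
cudnnBackendDescriptor_t matmulDesc, matmulOp;
cudnnDataType_t compType = CUDNN_DATA_FLOAT;

/* The matmul descriptor carries the compute precision
 * (CUDNN_ATTR_MATMUL_COMP_TYPE is assumed here). */
cudnnBackendCreateDescriptor(CUDNN_BACKEND_MATMUL_DESCRIPTOR, &matmulDesc);
cudnnBackendSetAttribute(matmulDesc, CUDNN_ATTR_MATMUL_COMP_TYPE,
                         CUDNN_TYPE_DATA_TYPE, 1, &compType);
cudnnBackendFinalize(matmulDesc);

/* The operation node binds A, B, C and the matmul descriptor together. */
cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR, &matmulOp);
cudnnBackendSetAttribute(matmulOp, CUDNN_ATTR_OPERATION_MATMUL_ADESC,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &aDesc);
cudnnBackendSetAttribute(matmulOp, CUDNN_ATTR_OPERATION_MATMUL_BDESC,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &bDesc);
cudnnBackendSetAttribute(matmulOp, CUDNN_ATTR_OPERATION_MATMUL_CDESC,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &cDesc);
cudnnBackendSetAttribute(matmulOp, CUDNN_ATTR_OPERATION_MATMUL_DESC,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &matmulDesc);
cudnnBackendFinalize(matmulOp);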
CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR, &desc); the cuDNN backend normalization backward operation specifies a node for a backward normalization that takes as input the gradient tensor dY and outputs the gradient tensor dX and weight gradients dScale and dBias. The normalization mode is set using the CUDNN_ATTR_OPERATION_NORM_BWD_MODE attribute.
Limitations
Does not support
CUDNN_GROUP_NORMmode.
| CUDNN_LAYER_NORM | CUDNN_INSTANCE_NORM | CUDNN_BATCH_NORM | CUDNN_GROUP_NORM | CUDNN_RMS_NORM |
|---|---|---|---|---|
| Yes | Yes | Yes | No | Yes |
Note
In addition to single GPU, CUDNN_BATCH_NORM also supports single node multi-GPU batch norm, while other normalization modes only support running on a single GPU. For more information, refer to the DReluForkDNorm pattern.
Attributes
CUDNN_ATTR_OPERATION_NORM_BWD_MODEChooses the normalization mode for the norm backward operation.
CUDNN_TYPE_NORM_MODE; one element.Required attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_XDESCInput tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESCSaved mean input tensor descriptor for reusing the mean computed during the forward computation of the training phase.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC Saved inverse variance input tensor descriptor for reusing the inverse variance computed during the forward computation of the training phase.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_DYDESCGradient tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESCNormalization scale descriptor. Note that the bias descriptor is not necessary for the backward pass.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESCScalar input tensor descriptor for the epsilon value. The epsilon values are needed only if the saved mean and variances are not passed as inputs to the operation. Note that the attribute
CUDNN_ATTR_TENSOR_IS_BY_VALUEof this descriptor should be set totrue.CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESCScale gradient tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESCBias gradient tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_DXDESCInput gradient tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCSVector of tensor descriptors for the communication buffers used in multi-GPU normalization. Typically, one buffer is provided for every GPU in the node. This is an optional attribute only used for multi-GPU tensor stats reduction.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one or more elements of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
Finalization
In the finalization stage, the attributes are checked to ensure there are no conflicts.
CUDNN_STATUS_BAD_PARAMInvalid or inconsistent attribute values are encountered. Some examples include:
The tensor dimensions of the gradient tensors
dY, dX, and the input tensor X do not match.
The channel count C for the mean, scale, and inv_variance tensors does not match.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR, &desc); the cuDNN backend normalization forward operation specifies a node for a forward normalization that takes as input a tensor X and produces a normalized output Y with the normalization mode set by the CUDNN_ATTR_OPERATION_NORM_FWD_MODE attribute. The operation supports optional running stats computation and allows for storing the computed means and variances for reuse in the backwards calculation depending on the setting of the CUDNN_ATTR_OPERATION_NORM_FWD_PHASE attribute.
Limitations
Does not support
CUDNN_GROUP_NORM mode.
Batch norm only supports forward training and not forward inference.
| Phase | CUDNN_LAYER_NORM | CUDNN_INSTANCE_NORM | CUDNN_BATCH_NORM | CUDNN_GROUP_NORM | CUDNN_RMS_NORM |
|---|---|---|---|---|---|
| Training | Yes | Yes | Yes | No | Yes |
| Inference | Yes | Yes | No | No | Yes |
Note
In addition to single-GPU, batch normalization supports running on single node multi-GPUs, while other normalization modes only support running on a single GPU. For more information, refer to the NormAddRelu pattern.
Attributes
CUDNN_ATTR_OPERATION_NORM_FWD_MODEChooses the normalization mode for the norm forward operation.
CUDNN_TYPE_NORM_MODE; one element.Required attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_PHASESelects the training or inference phase for the norm forward operation.
CUDNN_TYPE_NORM_FWD_PHASE; one element.Required attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_XDESCInput tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESCEstimated mean input tensor descriptor for the inference phase and the computed mean output tensor descriptor for the training phase.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESCEstimated inverse variance input tensor descriptor for the inference phase and the computed inverse variance output tensor descriptor for the training phase.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESCNormalization scale input tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESCNormalization bias input tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESCScalar input tensor descriptor for the epsilon value used in normalization calculation. Note that the attribute
CUDNN_ATTR_TENSOR_IS_BY_VALUEof this descriptor should be set totrue.CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESCScalar input tensor descriptor for the exponential average factor value used in running stats computation.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESCInput running mean tensor descriptor for the running stats computation in the training phase.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESCInput running variance tensor descriptor for the running stats computation in the training phase.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESCOutput running
meantensor descriptor for the running stats computation in the training phase.CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESCOutput running variance tensor descriptor for the running stats computation in the training phase.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_YDESCTensor descriptor for the output of the normalization operation.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCSVector of tensor descriptors for the communication buffers used in multi-GPU normalization. Typically, one buffer is provided for every GPU in the node. This is an optional attribute only used for multi-GPU tensor stats reduction.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one or more elements of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
Finalization
In the finalization stage, the attributes are checked to ensure there are no conflicts.
CUDNN_STATUS_BAD_PARAMInvalid or inconsistent attribute values are encountered. Some examples include:
The output tensor dimensions do not match the input tensor dimensions.
The channel count C for the mean, scale, bias, and inv_variance tensors does not match.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
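As a minimal sketch (not a definitive recipe), a layer normalization forward inference node might be assembled as follows, assuming xDesc, scaleDesc, biasDesc, epsDesc, and yDesc are already finalized tensor descriptors, that the enum types cudnnBackendNormMode_t and cudnnBackendNormFwdPhase_t provide CUDNN_LAYER_NORM and CUDNN_NORM_FWD_INFERENCE, and with status checking omitted:
cudnnBackendDescriptor_t normFwd;
cudnnBackendNormMode_t mode = CUDNN_LAYER_NORM;               /* assumed enum value */
cudnnBackendNormFwdPhase_t phase = CUDNN_NORM_FWD_INFERENCE;  /* assumed enum value */

cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR, &normFwd);
cudnnBackendSetAttribute(normFwd, CUDNN_ATTR_OPERATION_NORM_FWD_MODE,
                         CUDNN_TYPE_NORM_MODE, 1, &mode);
cudnnBackendSetAttribute(normFwd, CUDNN_ATTR_OPERATION_NORM_FWD_PHASE,
                         CUDNN_TYPE_NORM_FWD_PHASE, 1, &phase);
cudnnBackendSetAttribute(normFwd, CUDNN_ATTR_OPERATION_NORM_FWD_XDESC,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &xDesc);
cudnnBackendSetAttribute(normFwd, CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &scaleDesc);
cudnnBackendSetAttribute(normFwd, CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &biasDesc);
cudnnBackendSetAttribute(normFwd, CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &epsDesc); /* by-value scalar tensor */
cudnnBackendSetAttribute(normFwd, CUDNN_ATTR_OPERATION_NORM_FWD_YDESC,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &yDesc);
cudnnBackendFinalize(normFwd);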
CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR#
Represents a pointwise operation that implements the equation Y = op(alpha1 * X) or Y = op(alpha1 * X, alpha2 * B) depending on the operation type. The actual type of operation represented by op() above depends on the CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR attribute in the descriptor. This operation descriptor supports operations with single-input single-output.
For a list of supported operations, refer to the cudnnPointwiseMode_t section.
For dual-input pointwise operations, broadcasting is assumed when a tensor dimension in one of the tensors is 1 while the other tensor's corresponding dimension is not 1.
For three-input single-output pointwise operations, we do not support broadcasting in any tensor.
This opaque struct can be created with cudnnBackendCreateDescriptor() (CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR).
Attributes
CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR Sets the descriptor containing the mathematical settings of the pointwise operation. This attribute is required.
CUDNN_ATTR_OPERATION_POINTWISE_XDESC Sets the descriptor for the input tensor X. This attribute is required for pointwise mathematical functions or activation forward propagation computations.
CUDNN_ATTR_OPERATION_POINTWISE_BDESC If the operation requires two inputs, such as add or multiply, this attribute sets the second input tensor B. If the operation requires only one input, this field is not used and should not be set.
CUDNN_ATTR_OPERATION_POINTWISE_YDESC Sets the descriptor for the output tensor Y. This attribute is required for pointwise mathematical functions or activation forward propagation computations.
CUDNN_ATTR_OPERATION_POINTWISE_TDESC Sets the descriptor for the tensor T. This attribute is required when CUDNN_ATTR_POINTWISE_MODE is set to CUDNN_POINTWISE_BINARY_SELECT and acts as the mask based on which the selection is done.
CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 Sets the scalar alpha1 value in the equation. Can be a float or half. This attribute is optional; if not set, the default value is 1.0.
CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 If the operation requires two inputs, such as add or multiply, this attribute sets the scalar alpha2 value in the equation. Can be a float or half. This attribute is optional; if not set, the default value is 1.0. If the operation requires only one input, this field is not used and should not be set.
CUDNN_ATTR_OPERATION_POINTWISE_DXDESC Sets the descriptor for the output tensor dX. This attribute is required for pointwise activation backpropagation computations.
CUDNN_ATTR_OPERATION_POINTWISE_DYDESC Sets the descriptor for the input tensor dY. This attribute is required for pointwise activation backpropagation computations.
Finalization
In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:
CUDNN_STATUS_BAD_PARAMInvalid or inconsistent attribute values are encountered. Some examples include:
The number of dimensions do not match between the input and output tensors.
The input/output tensor dimensions do not agree with the above described automatic broadcasting rules.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
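For example, a minimal sketch of a two-input pointwise ADD node, assuming pwAddDesc is a finalized CUDNN_BACKEND_POINTWISE_DESCRIPTOR with mode CUDNN_POINTWISE_ADD and xDesc, bDesc, and yDesc are finalized tensor descriptors (status checking omitted):
cudnnBackendDescriptor_t addOp;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR, &addOp);
cudnnBackendSetAttribute(addOp, CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &pwAddDesc);
cudnnBackendSetAttribute(addOp, CUDNN_ATTR_OPERATION_POINTWISE_XDESC,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &xDesc);
cudnnBackendSetAttribute(addOp, CUDNN_ATTR_OPERATION_POINTWISE_BDESC,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &bDesc);
cudnnBackendSetAttribute(addOp, CUDNN_ATTR_OPERATION_POINTWISE_YDESC,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &yDesc);
cudnnBackendFinalize(addOp);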
CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR#
The cuDNN backend reduction operation descriptor represents an operation node that implements reducing values of an input tensor X in one or more dimensions to get an output tensor Y. The math operation and compute data type used for reducing tensor values is specified via CUDNN_ATTR_OPERATION_REDUCTION_DESC.
This operation descriptor can be created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR, &desc).
The output tensor Y should be the same size as the input tensor X, except in the dimensions being reduced, where its size is 1.
There is a special use case for Grouped Query Attention and Multi Query Attention in cuDNN Fused Flash Attention where some dimensions in the output tensor Y can also be factors of the corresponding dimensions in the input tensor X.
Attributes
Attributes of a cuDNN backend reduction descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_REDUCTION_:
CUDNN_ATTR_OPERATION_REDUCTION_XDESC The tensor X descriptor. CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR. Required attribute.
CUDNN_ATTR_OPERATION_REDUCTION_YDESC The tensor Y descriptor. CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR. Required attribute.
CUDNN_ATTR_OPERATION_REDUCTION_DESC The reduction operation descriptor. CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_REDUCTION_DESCRIPTOR. Required attribute.
Finalization
In the finalization of the reduction operation, the dimensions of tensors X and Y are checked to ensure that they satisfy the requirements of the reduction operation.
cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR can have the following return values:
CUDNN_STATUS_BAD_PARAMInvalid or inconsistent attribute values are encountered. For example, the dimensions of the tensors
X and Y do not satisfy the requirements of the reduction operation.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR, &desc); the cuDNN backend resample backward operation descriptor specifies an operation node for backward resampling. It computes the input tensor gradient dx from output tensor gradient dy with backward resampling done according to CUDNN_ATTR_RESAMPLE_MODE with output scaling \(\alpha\) and residual add with \(\beta\) scaling.
Attributes
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESCResample operation descriptor (CUDNN_BACKEND_RESAMPLE_DESCRIPTOR) instance contains metadata about the operation.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_RESAMPLE_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESCInput tensor gradient descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESCOutput tensor gradient descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESCTensor containing maxpool or nearest neighbor resampling indices to be used in backprop.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHASets the alpha parameter used in blending.
CUDNN_TYPE_DOUBLEorCUDNN_TYPE_FLOAT; one element.Optional attribute.
Default value is
1.0.
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETASets the beta parameter used in blending.
CUDNN_TYPE_DOUBLEorCUDNN_TYPE_FLOAT; one element.Optional attribute.
Default value is
0.0.
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESCInput tensor
Xdescriptor.CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
Required for NCHW layout.
CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESCInput tensor
Ydescriptor.CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
Required for NCHW layout.
Finalization
In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:
CUDNN_STATUS_BAD_PARAMInvalid or inconsistent attribute values are encountered. Some examples include:
The output shape calculated based on the padding and strides does not match the given output tensor dimensions.
The shape of
YDESC and IDXDESC (if given) do not match.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR, &desc); the cuDNN backend resample forward operation descriptor specifies an operation node for forward resampling. It computes the output tensor y of image tensor x resampled according to CUDNN_ATTR_RESAMPLE_MODE, with output scaling \(\alpha\) and residual add with \(\beta\) scaling.
The resampling mode acts independently on each spatial dimension. For spatial dimension i, the output spatial dimension size y_i can be calculated by combining input image’s spatial dimension size x_i, post padding post_i, pre padding pre_i, stride s_i, and window size w_i as: y_i = 1+(x_i + post_i + pre_i - w_i) / s_i
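For example, for a spatial dimension with \(x_i = 32\), window \(w_i = 2\), stride \(s_i = 2\), and zero pre and post padding, the output size is \(y_i = 1 + (32 + 0 + 0 - 2)/2 = 16\).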
Attributes
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESCResample operation descriptor (CUDNN_BACKEND_RESAMPLE_DESCRIPTOR) instance contains metadata about the operation.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_RESAMPLE_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESCInput tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESCOutput tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESCTensor containing maxpool or nearest neighbor resampling indices to be used in backprop.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute (primarily used for use cases involving training).
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHASets the alpha parameter used in blending.
CUDNN_TYPE_DOUBLEorCUDNN_TYPE_FLOAT; one element.Optional attribute.
Default value is
1.0.
CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETASets the beta parameter used in blending.
CUDNN_TYPE_DOUBLEorCUDNN_TYPE_FLOAT; one element.Optional attribute.
Default value is
0.0.
Finalization
In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:
CUDNN_STATUS_BAD_PARAMInvalid or inconsistent attribute values are encountered. Some examples include:
The output shape calculated based on the padding and strides does not match the given output tensor dimensions.
The shape of the
YDESC and IDXDESC (if given) do not match.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR, &desc); the cuDNN backend Rng operation descriptor specifies an operation node for generating a tensor with random numbers based on the probability distribution specified in the Rng descriptor.
The random numbers are generated using a Philox random number generator (RNG), as described in PyTorch. The Philox generator takes a seed value, a subsequence for starting the generation, and an offset into the subsequence. The seed and offset can be set using the attributes below; the subsequence is set internally to ensure independent random numbers.
Attributes
CUDNN_ATTR_OPERATION_RNG_DESC The Rng descriptor (CUDNN_BACKEND_RNG_DESCRIPTOR) instance containing metadata about the operation. CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_RNG_DESCRIPTOR. Required attribute.
CUDNN_ATTR_OPERATION_RNG_YDESCOutput tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_RNG_SEED Sets the seed for the random number generator that creates the Y tensor. It can be a host INT64 value or a backend descriptor bound to a value on the device. Only supports a tensor with all dimensions set to 1 and all strides set to 1. CUDNN_TYPE_INT64; one element, or CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR. Optional attribute.
Default value is
0.
CUDNN_ATTR_OPERATION_RNG_OFFSET_DESCTensor descriptor for the offset used in the RNG Philox object. Only supports a tensor with all dimensions set to
1and all strides set to1.CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
Finalization
In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:
CUDNN_STATUS_BAD_PARAM CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC or CUDNN_ATTR_OPERATION_RNG_SEED do not have all dimensions and strides set to 1.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR, &desc); the cuDNN backend paged cache load operation descriptor is used with a fused flash attention fprop graph, and specifies an operation node for reconstructing the k- or v-cache.
The k/v-cache is reconstructed by using a page table tensor to look up the location of a specific sequence ID in a non-contiguous container tensor. Storing a k/v-cache non-contiguously enables efficient memory management by avoiding fragmentation. For more information, refer to the Paged Attention paper.
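To make the lookup concrete, here is a hypothetical host-side sketch (not a cuDNN API; the row-major layout and helper names are assumptions) of how a page table maps a batch index and sequence position to a location in the container tensor described below:
#include <stdint.h>

/* Hypothetical illustration of paged k/v-cache addressing. The container is
 * viewed as [num_blocks, H, block_size, D] and the page table as
 * [B, ceil(max_seq_size / block_size)], matching the attribute descriptions
 * below; the packed row-major layout here is an assumption. */
static const float *lookup_kv(const float *container,    /* [num_blocks, H, block_size, D] */
                              const int32_t *page_table, /* [B, pages_per_seq]             */
                              int64_t pages_per_seq,
                              int64_t H, int64_t block_size, int64_t D,
                              int64_t b, int64_t h, int64_t seq_pos)
{
    int64_t page  = seq_pos / block_size;                 /* which logical page      */
    int64_t row   = seq_pos % block_size;                 /* position within a block */
    int64_t block = page_table[b * pages_per_seq + page]; /* physical block index    */
    return container + ((block * H + h) * block_size + row) * D;
}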
Attributes
CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_YDESCVirtual output tensor descriptor, containing the reconstructed k/v-cache.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Datatype: FP16, BF16, or FP8
Required attribute.
CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_CONTAINER_DESCA non-virtual tensor descriptor with dimensions [num_blocks,H,block_size,D] containing the k/v-cache. The k/v-cache is divided into
num_blocks tensors of shape [H,block_size,D], where block_size is a parameter chosen by the user. A smaller block_size leads to less fragmentation, but also less parallelism. num_blocks is arbitrary and depends on the size of the allocated k/v-cache. CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR. Datatype: FP16, BF16, or FP8
Required attribute.
CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_PAGE_TABLE_DESCA non-virtual tensor descriptor of dimensions [B,1,ceil(max_seq_size/block_size),1] pointing to the lookup table.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Datatype: INT32
Required attribute.
CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_SEQUENCE_DESC A non-virtual [B,1,1,1] tensor descriptor that indicates which sequence numbers from the k/v-cache are requested. For each batch, all items from the container will be copied, from sequence 0 up to the requested sequence number.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Datatype: INT32 or INT64
Sequence numbers are in the interval [1, max_seq_size]
Required attribute.
Finalization
In the finalization stage, the attributes are cross-checked to make sure there are no conflicts. The status below may be returned:
CUDNN_STATUS_BAD_PARAMTypes or dimensions of one or more of the input/output tensors are invalid.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR, &desc); the cuDNN backend signal operation descriptor specifies an operation node for updating or waiting on a flag variable. Signaling operations can be used to communicate between cuDNN operation graphs, even with operation graphs in another GPU.
This operation, to connect to other nodes in the graph, also has a pass-through input tensor, which is not operated on and is just passed along to the output tensor. This mandatory pass-through input tensor helps in determining the predecessor node after which the signal operation should be executed. The optional output tensor helps in determining the successor node before which the signal execution should have completed. It is also guaranteed that for a non-virtual tensor as the output tensor, all writes for the tensor will have taken place before the signal value is updated by the operation.
Attributes
CUDNN_ATTR_OPERATION_SIGNAL_MODEThe signaling mode to use.
CUDNN_TYPE_SIGNAL_MODERequired attribute.
CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESCFlag tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_SIGNAL_VALUEThe scalar value to compare or update the flag variable with.
CUDNN_TYPE_INT64Required attribute.
CUDNN_ATTR_OPERATION_SIGNAL_XDESCA pass-through input tensor to enable connecting this signal operation to other nodes in the graph.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_SIGNAL_YDESCThe output tensor for the pass-through input tensor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
Finalization
In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:
CUDNN_STATUS_BAD_PARAMInvalid or inconsistent attribute values are encountered.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR, &desc); the cuDNN backend bn_finalize statistics operation descriptor specifies an operation node for the batch norm finalize operation.
In ResNet like models, a common technique to fuse batch norm with convolutions would involve splitting the batch norm operation into three parts - genStats, finalize, and apply (pointwise, scale, and bias). The genStats operation is usually fused with the convolution operation that precedes the batch norm while the apply is fused with the ReLU and convolution that follows the batch norm op. The batch norm finalize operation is a buffer op between the two fusions that takes the batch norm scale, bias, sum, and sqsum produced by the genStats operation as inputs and produces an equivalent scale and bias as output. The equivalent scale and bias are then consumed in the apply phase. Additionally, the bn_finalize operation also produces the running stats, mean, and inverse standard deviation as outputs.
Attributes
CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODESets inference or training mode for the
bn_finalizeoperation.CUDNN_TYPE_BN_FINALIZE_STATS_MODE; one element.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PRECMath precision of the computation.
Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESCInput sum tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESCInput square sum tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESCBatch norm input scale tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESCBatch norm input bias tensor descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESCBatch norm input running mean descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESCBatch norm input running variance descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESCBatch norm output running mean descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESCBatch norm output running variance descriptor.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Optional attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESCBatch norm output saved mean tensor descriptor. This is computed from the sum input that’s fed in from the preceding
genStatsoperation. Storing out the saved mean helps avoid recomputation in the backpropagation phase.CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC Batch norm output inverse standard deviation tensor descriptor. This is computed from the sum and square sum (sqsum) inputs that are fed in from the preceding genStats operation. Storing out the saved inverse standard deviations helps avoid recomputation in the backpropagation phase. CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR. Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESCOutput tensor descriptor for the equivalent scale tensor. The equivalent scale tensor is typically fed as input to the batch norm
applycomputation (pointwise, scale, and bias) that follows the batch normfinalizeoperation.CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESCOutput tensor descriptor for the equivalent bias tensor. The equivalent bias tensor is typically fed as input to the batch norm
applycomputation (pointwise, scale, and bias) that follows the batch normfinalizeoperation.CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESCScalar input tensor descriptor representing the number of elements accumulated over while calculating the
sumandsqsuminputs. The count usually equals N*H*W in case of batch norm.CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESCScalar input tensor descriptor for the epsilon value used in batch norm variance calculation.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERAGE_FACTOR_DESCScalar input tensor descriptor for the exponential average value used in batch norm running stats calculation.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.Required attribute.
CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR#
Created with descriptor type value CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, the cuDNN backend operation graph descriptor describes an operation graph, a small network of one or more operations connected by virtual tensors. The operation graph defines the computation case or mathematical expression that the user wishes to compute.
Attributes
Attributes of a cuDNN backend operation graph descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATIONGRAPH_:
CUDNN_ATTR_OPERATIONGRAPH_HANDLEA cuDNN handle.
CUDNN_TYPE_HANDLE; one element.Required attribute.
CUDNN_ATTR_OPERATIONGRAPH_OPSOperation nodes to form the operation graph.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one or more elements of descriptor typeCUDNN_BACKEND_OPERATION_*_DESCRIPTOR.Required attribute.
CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNTThe number of engines to support the operation graph.
CUDNN_TYPE_INT64; one element.Read-only attribute.
CUDNN_ATTR_OPERATIONGRAPH_ENGINE_SUPPORTED_COUNTThe number of engines that support the operation graph.
CUDNN_TYPE_INT64; one element.Read-only attribute.
Currently unsupported. Placeholder for future implementation.
CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED Whether dynamic shape support is enabled for the operation graph. When enabled, the rest of the backend API treats the graph as a dynamic shape graph.
CUDNN_TYPE_BOOLEAN; one element.
Finalization
CUDNN_STATUS_BAD_PARAMAn invalid attribute value was encountered. Some examples include:
One of the backend descriptors in
CUDNN_ATTR_OPERATIONGRAPH_OPS is not finalized.
The value CUDNN_ATTR_OPERATIONGRAPH_HANDLE is not a valid cuDNN handle.
CUDNN_STATUS_NOT_SUPPORTEDAn unsupported attribute value was encountered. For example, the combination of operations of attribute
CUDNN_ATTR_OPERATIONGRAPH_OPS is not supported.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_POINTWISE_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_POINTWISE_DESCRIPTOR, &desc); the cuDNN backend pointwise descriptor specifies the parameters for a pointwise operator like mode, math precision, nan propagation, and so on.
Attributes
Attributes of a cuDNN backend pointwise descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_POINTWISE_:
CUDNN_ATTR_POINTWISE_MODEMode of the pointwise operation.
CUDNN_TYPE_POINTWISE_MODE; one element.Required attribute.
CUDNN_ATTR_POINTWISE_MATH_PRECThe math precision of the computation.
CUDNN_TYPE_DATA_TYPE; one element.Required attribute.
CUDNN_ATTR_POINTWISE_NAN_PROPAGATIONSpecifies a method by which to propagate NaNs.
CUDNN_TYPE_NAN_PROPAGATION; one element. Required only for comparison-based pointwise modes, like ReLU.
Current support only includes the enum value CUDNN_PROPAGATE_NAN.
Default value is CUDNN_NOT_PROPAGATE_NAN.
CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIPSets the lower clip value for ReLU. If
(value < lower_clip) value = lower_clip + lower_clip_slope * (value - lower_clip):CUDNN_TYPE_DOUBLE/CUDNN_TYPE_FLOAT; one element.Default value is
0.0f.
CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIPSets the upper clip value for ReLU. If
(value > upper_clip) value = upper_clip:CUDNN_TYPE_DOUBLE/CUDNN_TYPE_FLOAT; one element.Default value is
Numeric limit max.
CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPESets the lower clip slope value for ReLU. If
(value < lower_clip) value = lower_clip + lower_clip_slope * (value - lower_clip):CUDNN_TYPE_DOUBLE/CUDNN_TYPE_FLOAT; one element.Default value is
0.0f.
CUDNN_ATTR_POINTWISE_ELU_ALPHASets the alpha value for ELU. If
(value < 0.0) value = alpha * (e^value - 1.0):CUDNN_TYPE_DOUBLE/CUDNN_TYPE_FLOAT; one element.Default value is
1.0f.
CUDNN_ATTR_POINTWISE_SOFTPLUS_BETASets the beta value for softplus. If
value = log (1 + e^(beta * value)) / beta:CUDNN_TYPE_DOUBLE/CUDNN_TYPE_FLOAT; one element.Default value is
1.0f.
CUDNN_ATTR_POINTWISE_SWISH_BETASets the beta value for swish. If
value = value / (1 + e^(-beta * value)):CUDNN_TYPE_DOUBLE/CUDNN_TYPE_FLOAT; one element.Default value is
1.0f.
CUDNN_ATTR_POINTWISE_AXISSets the axis value for
GEN_INDEX. The index will be generated for this axis. CUDNN_TYPE_INT64; one element. Default value is -1.
Needs to lie in [0, input_dim_size - 1]. For example, if your input has dimensions [N,C,H,W], the axis can be set to anything in [0,3].
Finalization
cudnnBackendFinalize() with a CUDNN_BACKEND_POINTWISE_DESCRIPTOR can have the following return values:
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
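As a minimal sketch (status checking omitted), a plain ReLU pointwise descriptor, which a pointwise operation node could then reference, might be set up as follows; CUDNN_POINTWISE_RELU_FWD is assumed from cudnnPointwiseMode_t:
cudnnBackendDescriptor_t pwDesc;
cudnnPointwiseMode_t mode = CUDNN_POINTWISE_RELU_FWD; /* assumed pointwise mode */
cudnnDataType_t prec = CUDNN_DATA_FLOAT;

cudnnBackendCreateDescriptor(CUDNN_BACKEND_POINTWISE_DESCRIPTOR, &pwDesc);
cudnnBackendSetAttribute(pwDesc, CUDNN_ATTR_POINTWISE_MODE,
                         CUDNN_TYPE_POINTWISE_MODE, 1, &mode);
cudnnBackendSetAttribute(pwDesc, CUDNN_ATTR_POINTWISE_MATH_PREC,
                         CUDNN_TYPE_DATA_TYPE, 1, &prec);
cudnnBackendFinalize(pwDesc);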
CUDNN_BACKEND_REDUCTION_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_REDUCTION_DESCRIPTOR, &desc); the cuDNN backend reduction descriptor specifies any metadata, including the math operation and compute data type, needed for the reduction operation.
Attributes
CUDNN_ATTR_REDUCTION_OPERATORThe math operation used for the
reductionoperation.CUDNN_TYPE_REDUCTION_OPERATOR_TYPE; one element.Required attribute.
CUDNN_ATTR_REDUCTION_COMP_TYPEThe compute precision used for the
reductionoperation.CUDNN_TYPE_DATA_TYPE; one element.Required attribute.
Finalization
Return values of cudnnBackendFinalize(desc) where desc is CUDNN_BACKEND_REDUCTION_DESCRIPTOR are:
CUDNN_STATUS_NOT_SUPPORTEDAn unsupported attribute value was encountered. For example,
CUDNN_ATTR_REDUCTION_OPERATOR is not set to one of CUDNN_REDUCE_TENSOR_ADD, CUDNN_REDUCE_TENSOR_MUL, CUDNN_REDUCE_TENSOR_MIN, or CUDNN_REDUCE_TENSOR_MAX.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
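Putting the reduction descriptor and the reduction operation together, a rough sketch of an ADD reduction node (assuming xDesc and yDesc are finalized tensor descriptors, with yDesc of size 1 in the reduced dimensions; status checking omitted):
cudnnBackendDescriptor_t redDesc, redOp;
cudnnReduceTensorOp_t op = CUDNN_REDUCE_TENSOR_ADD; /* cudnnReduceTensorOp_t assumed as the element type */
cudnnDataType_t compType = CUDNN_DATA_FLOAT;

/* Reduction descriptor: math operator and compute precision. */
cudnnBackendCreateDescriptor(CUDNN_BACKEND_REDUCTION_DESCRIPTOR, &redDesc);
cudnnBackendSetAttribute(redDesc, CUDNN_ATTR_REDUCTION_OPERATOR,
                         CUDNN_TYPE_REDUCTION_OPERATOR_TYPE, 1, &op);
cudnnBackendSetAttribute(redDesc, CUDNN_ATTR_REDUCTION_COMP_TYPE,
                         CUDNN_TYPE_DATA_TYPE, 1, &compType);
cudnnBackendFinalize(redDesc);

/* Reduction operation node: X is reduced into Y along the dimensions
 * where yDesc has size 1. */
cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR, &redOp);
cudnnBackendSetAttribute(redOp, CUDNN_ATTR_OPERATION_REDUCTION_XDESC,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &xDesc);
cudnnBackendSetAttribute(redOp, CUDNN_ATTR_OPERATION_REDUCTION_YDESC,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &yDesc);
cudnnBackendSetAttribute(redOp, CUDNN_ATTR_OPERATION_REDUCTION_DESC,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &redDesc);
cudnnBackendFinalize(redOp);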
CUDNN_BACKEND_RESAMPLE_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_RESAMPLE_DESCRIPTOR, &desc); the cuDNN backend resample descriptor specifies the parameters for a resample operation (upsampling or downsampling) in both forward and backward propagation.
Attributes
CUDNN_ATTR_RESAMPLE_MODESpecifies mode of resampling, for example, average pool, nearest-neighbor, and so on.
CUDNN_TYPE_RESAMPLE_MODE; one element.Default value is
CUDNN_RESAMPLE_NEAREST.
CUDNN_ATTR_RESAMPLE_COMP_TYPECompute data type for the resampling operator.
CUDNN_TYPE_DATA_TYPE; one element.Default value is
CUDNN_DATA_FLOAT.
CUDNN_ATTR_RESAMPLE_NAN_PROPAGATIONSpecifies a method by which to propagate NaNs.
CUDNN_TYPE_NAN_PROPAGATION; one element.Default value is
CUDNN_NOT_PROPAGATE_NAN.
CUDNN_ATTR_RESAMPLE_SPATIAL_DIMSSpecifies the number of spatial dimensions to perform the resampling over.
CUDNN_TYPE_INT64; one element.Required attribute.
CUDNN_ATTR_RESAMPLE_PADDING_MODESpecifies which values to use for padding.
CUDNN_TYPE_PADDING_MODE; one element.Default value is
CUDNN_ZERO_PAD.
CUDNN_ATTR_RESAMPLE_STRIDESStride in each dimension for the kernel or filter.
CUDNN_TYPE_INT64orCUDNN_TYPE_FRACTION; at mostCUDNN_MAX_DIMS - 2.Required attribute.
CUDNN_ATTR_RESAMPLE_PRE_PADDINGSPadding added to the beginning of the input tensor in each dimension.
CUDNN_TYPE_INT64orCUDNN_TYPE_FRACTION; at mostCUDNN_MAX_DIMS - 2.Required attribute.
CUDNN_ATTR_RESAMPLE_POST_PADDINGSPadding added to the end of the input tensor in each dimension.
CUDNN_TYPE_INT64orCUDNN_TYPE_FRACTION; at mostCUDNN_MAX_DIMS - 2.Required attribute.
CUDNN_ATTR_RESAMPLE_WINDOW_DIMSSpatial dimensions of filter.
CUDNN_TYPE_INT64orCUDNN_TYPE_FRACTION; at mostCUDNN_MAX_DIMS - 2.Required attribute.
Finalization
The return values for cudnnBackendFinalize() when called with a CUDNN_BACKEND_RESAMPLE_DESCRIPTOR are:
CUDNN_STATUS_NOT_SUPPORTEDAn unsupported attribute value was encountered. Some examples include:
An
elemCount argument for setting CUDNN_ATTR_RESAMPLE_WINDOW_DIMS, CUDNN_ATTR_RESAMPLE_STRIDES, CUDNN_ATTR_RESAMPLE_PRE_PADDINGS, and CUDNN_ATTR_RESAMPLE_POST_PADDINGS is not equal to the value set for CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS.
CUDNN_ATTR_RESAMPLE_MODE is set to CUDNN_RESAMPLE_BILINEAR and any of the CUDNN_ATTR_RESAMPLE_WINDOW_DIMS are not set to 2.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_RNG_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_RNG_DESCRIPTOR, &desc); the cuDNN backend Rng descriptor specifies any metadata, including the probability distribution that will be used to generate the tensor and the distribution’s corresponding parameters.
Attributes
CUDNN_ATTR_RNG_DISTRIBUTIONThe probability distribution used for the
rngoperation.CUDNN_TYPE_RNG_DISTRIBUTION; one element.Default value is
CUDNN_RNG_DISTRIBUTION_BERNOULLI.
CUDNN_ATTR_RNG_NORMAL_DIST_MEANThe mean value for the normal distribution, used if
CUDNN_ATTR_RNG_DISTRIBUTION = CUDNN_RNG_DISTRIBUTION_NORMAL.CUDNN_TYPE_DOUBLE; one element.Default value is
-1.
CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATIONThe standard deviation value for the normal distribution, used if
CUDNN_ATTR_RNG_DISTRIBUTION = CUDNN_RNG_DISTRIBUTION_NORMAL.CUDNN_TYPE_DOUBLE; one element.Default value is
-1.
Finalization
Return values of cudnnBackendFinalize(desc) where desc is CUDNN_BACKEND_RNG_DESCRIPTOR are:
CUDNN_STATUS_BAD_PARAMAn invalid attribute value was encountered. Some examples include:
If
CUDNN_ATTR_RNG_DISTRIBUTION = CUDNN_RNG_DISTRIBUTION_NORMALand the standard deviation supplied is negative.If
CUDNN_ATTR_RNG_DISTRIBUTION = CUDNN_RNG_DISTRIBUTION_UNIFORMand the maximum value of the range is lower than minimum value.If
CUDNN_ATTR_RNG_DISTRIBUTION = CUDNN_RNG_DISTRIBUTION_BERNOULLIand the probability supplied is negative.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_TENSOR_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &desc); the cuDNN backend tensor descriptor allows users to specify the memory storage of a generic tensor. A tensor is identified by a unique identifier and described by its data type, its data byte-alignment requirements, and the extents and strides of its dimensions. Optionally, a tensor element can be a vector in one of its dimensions. A tensor can also be set to be virtual when it is an intermediate variable in a computation graph and not mapped to physical global memory storage.
Attributes
Attributes of a cuDNN backend tensor descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_TENSOR_:
CUDNN_ATTR_TENSOR_UNIQUE_IDAn integer that uniquely identifies the tensor.
CUDNN_TYPE_INT64; one element.Required attribute.
CUDNN_ATTR_TENSOR_DATA_TYPEData type of tensor.
CUDNN_TYPE_DATA_TYPE; one element.Required attribute.
CUDNN_ATTR_TENSOR_BYTE_ALIGNMENTByte alignment of pointers for this tensor.
CUDNN_TYPE_INT64; one element.Required attribute.
CUDNN_ATTR_TENSOR_DIMENSIONSTensor dimensions.
CUDNN_TYPE_INT64; at mostCUDNN_MAX_DIMSelements.Required attribute.
CUDNN_ATTR_TENSOR_STRIDESTensor strides.
CUDNN_TYPE_INT64; at mostCUDNN_MAX_DIMSelements.Required attribute.
CUDNN_ATTR_TENSOR_VECTOR_COUNTSize of vectorization.
CUDNN_TYPE_INT64; one element.Default value is
1.
CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSIONIndex of the vectorized dimension.
CUDNN_TYPE_INT64; one element.Required to be set before finalization if
CUDNN_ATTR_TENSOR_VECTOR_COUNTis set to a value different than its default; otherwise it’s ignored.
CUDNN_ATTR_TENSOR_IS_VIRTUAL Indicates whether the tensor is virtual. A virtual tensor is an intermediate tensor in the operation graph that exists only transiently and is not read from or written to in global device memory.
CUDNN_TYPE_BOOLEAN; one element.Default value is
false.
CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESCA ragged tensor, that is, a tensor with nested variable length lists as inner dimensions, will have another tensor called the ragged offset descriptor that contains offsets in memory to the next variable length list.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element.Default value is
None.
Finalization
cudnnBackendFinalize() with a CUDNN_BACKEND_TENSOR_DESCRIPTOR can have the following return values:
CUDNN_STATUS_BAD_PARAMAn invalid attribute value was encountered. Some examples include:
Any of the tensor dimensions or strides is not positive.
The value of the tensor alignment attribute is not divisible by the size of the data type.
CUDNN_STATUS_NOT_SUPPORTEDAn unsupported attribute value was encountered. Some examples include:
The data type attribute is
CUDNN_DATA_INT8x4, CUDNN_DATA_UINT8x4, or CUDNN_DATA_INT8x32.
The data type attribute is CUDNN_DATA_INT8 and the CUDNN_ATTR_TENSOR_VECTOR_COUNT value is not 1, 4, or 32.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
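Referring back to CUDNN_ATTR_TENSOR_IS_VIRTUAL above, a minimal sketch of a virtual intermediate tensor (the dimension, stride, and UID values are placeholders; status checking omitted):
cudnnBackendDescriptor_t vDesc;
cudnnDataType_t dtype = CUDNN_DATA_FLOAT;
int64_t dims[4]    = {8, 64, 56, 56};                /* placeholder NCHW dimensions   */
int64_t strides[4] = {64 * 56 * 56, 56 * 56, 56, 1}; /* fully packed layout           */
int64_t uid = 100;                                   /* unique within the graph       */
int64_t alignment = 4;
bool isVirtual = true;

cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &vDesc);
cudnnBackendSetAttribute(vDesc, CUDNN_ATTR_TENSOR_DATA_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &dtype);
cudnnBackendSetAttribute(vDesc, CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, 4, dims);
cudnnBackendSetAttribute(vDesc, CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64, 4, strides);
cudnnBackendSetAttribute(vDesc, CUDNN_ATTR_TENSOR_UNIQUE_ID, CUDNN_TYPE_INT64, 1, &uid);
cudnnBackendSetAttribute(vDesc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, CUDNN_TYPE_INT64, 1, &alignment);
/* Mark the tensor virtual: no device buffer is bound to it at execution time. */
cudnnBackendSetAttribute(vDesc, CUDNN_ATTR_TENSOR_IS_VIRTUAL, CUDNN_TYPE_BOOLEAN, 1, &isVirtual);
cudnnBackendFinalize(vDesc);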
CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR, &desc); the cuDNN backend variant pack descriptor allows users to set up pointers to device buffers for the non-virtual tensors of the operation graph (identified by their unique identifiers), the workspace, and computation intermediates.
Attributes
CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDSA unique identifier of tensor for each data pointer.
CUDNN_TYPE_INT64; zero or more elements. Required attribute.
CUDNN_ATTR_VARIANT_PACK_DATA_POINTERSTensor data device pointers.
CUDNN_TYPE_VOID_PTR; zero or more elements.Required attribute.
CUDNN_ATTR_VARIANT_PACK_INTERMEDIATESIntermediate device pointers.
CUDNN_TYPE_VOID_PTR; zero or more elements.Currently unsupported. Placeholder for future implementation.
CUDNN_ATTR_VARIANT_PACK_WORKSPACE Workspace device pointer.
CUDNN_TYPE_VOID_PTR; one element.Required attribute.
Finalization
The return values for cudnnBackendFinalize() when called with a cuDNN backend variant pack descriptor are:
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR, &desc); the cuDNN backend kernel cache helps significantly reduce execution plan finalization time for use cases that repeatedly finalize plans for a same-topology dynamic shape operation graph, by binding a previously compiled applicable kernel to the execution plan instead of recompiling a new one from scratch. It is used with execution plans containing a graph with CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED enabled.
Attributes
CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED An attribute used to query whether a given engine config is cached.
CUDNN_TYPE_BACKEND_DESCRIPTOR; one element. Read-only attribute, queried using the cudnnBackendGetAttribute() API. The engine config in question is passed into this attribute as a constant input through arrayOfElements, and elementCount serves as the resulting output: a value of zero means not cached, and a positive number means it is cached (see the sketch below). Required attribute.
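A rough sketch of that query pattern, assuming kcache is a finalized kernel cache descriptor and engcfg a finalized engine config (the exact calling convention should be verified against the cudnnBackendGetAttribute() documentation):
int64_t cachedCount = 0;
/* The engine config is passed as the (constant) element array; the returned
 * element count indicates whether that config is present in the cache. */
cudnnBackendGetAttribute(kcache,
                         CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED,
                         CUDNN_TYPE_BACKEND_DESCRIPTOR,
                         1,              /* requestedElementCount */
                         &cachedCount,   /* elementCount: 0 = not cached, >0 = cached */
                         &engcfg);       /* arrayOfElements: engine config as input   */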
Finalization
The return values for cudnnBackendFinalize() when called with a cuDNN backend kernel cache descriptor are:
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR, &desc); the cuDNN block scale quantize descriptor specifies the parameters for the block scale quantize operation to output block scaled tensors.
Attributes
CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_MATH_PRECThe math precision of the computation.
CUDNN_TYPE_DATA_TYPE; one element.Required attribute.
CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_BLOCK_SIZEThe number of elements per block to perform block scaling.
CUDNN_TYPE_INT32; one element.Required attribute.
Finalization
The return values for cudnnBackendFinalize() when called with a cuDNN backend block scale quantize descriptor are:
CUDNN_STATUS_BAD_PARAMAn invalid attribute value was encountered. Some examples include:
Tensor shape mismatch between x, y, and scale tensors.
Data type mismatch between y and scale tensors.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR#
Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR, &desc); the cuDNN block scale dequantize descriptor specifies the parameters for the block scale dequantize operation to take in block scaled tensors.
Attributes
CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_MATH_PRECThe math precision of the computation.
CUDNN_TYPE_DATA_TYPE; one element.Required attribute.
CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_BLOCK_SIZEThe number of elements per block to perform block scaling.
CUDNN_TYPE_INT32; one element.Required attribute.
Finalization
The return values for cudnnBackendFinalize() when called with a cuDNN backend block scale dequantize descriptor are:
CUDNN_STATUS_BAD_PARAMAn invalid attribute value was encountered. Some examples include:
Tensor shape mismatch between x, scale, and y tensors.
Data type mismatch between x and scale tensors.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR#
Created with
cudnnBackendCreateDescriptor(CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR, &desc); the cuDNN device property descriptor specifies the properties of a device.
Attributes
CUDNN_ATTR_DEVICEPROP_DEVICE_IDThe CUDA device ID of the device that the descriptor targets.
CUDNN_TYPE_INT32; one element.Optional attribute.
CUDNN_ATTR_DEVICEPROP_HANDLEThe cuDNN handle of the device that the descriptor targets.
CUDNN_TYPE_HANDLE; one element.Optional attribute.
CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATIONThe JSON representation of the device that the descriptor targets.
CUDNN_TYPE_CHAR; one element.Optional attribute.
Finalization
The return values for cudnnBackendFinalize() when called with a cuDNN device property descriptor are:
CUDNN_STATUS_BAD_PARAMAn invalid attribute value was encountered, for example, the provided JSON representation is invalid.
CUDNN_STATUS_NOT_INITIALIZEDQuerying the device properties failed.
CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCHThe target device is not supported.
CUDNN_STATUS_SUCCESSThe descriptor was finalized successfully.
Use Cases#
This section describes some typical use cases of the cuDNN backend API; for example, setting up a simple operation graph, setting up an engine config for that operation graph, and finally setting up an execution plan and executing it with data pointers set in a variant pack descriptor. An example of cuDNN’s native CUDA graph API is given as well.
Setting Up An Operation Graph For A Grouped Convolution#
This use case creates an operation graph with a single grouped 3D convolution forward operation. It starts by setting up the input and output tensors, binding them to a convolution forward operation, and finally setting up an operation graph with a single node.
Create tensor descriptors.
cudnnBackendDescriptor_t xDesc;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &xDesc);

cudnnDataType_t dtype = CUDNN_DATA_FLOAT;
cudnnBackendSetAttribute(xDesc, CUDNN_ATTR_TENSOR_DATA_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &dtype);

int64_t xDim[] = {n, g, c, d, h, w};
int64_t xStr[] = {g * c * d * h * w, c * d * h * w, d * h * w, h * w, w, 1};
int64_t xUi = 'x';
int64_t alignment = 4;
cudnnBackendSetAttribute(xDesc, CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, 6, xDim);
cudnnBackendSetAttribute(xDesc, CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64, 6, xStr);
cudnnBackendSetAttribute(xDesc, CUDNN_ATTR_TENSOR_UNIQUE_ID, CUDNN_TYPE_INT64, 1, &xUi);
cudnnBackendSetAttribute(xDesc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, CUDNN_TYPE_INT64, 1, &alignment);
cudnnBackendFinalize(xDesc);
Repeat the above step for the convolution filter and output tensor descriptor. The six filter tensor dimensions are
[g, k, c, t, r, s] and the six output tensor dimensions are [n, g, k, o, p, q], respectively. Below, when finalizing a convolution operator to which the tensors are bound, dimension consistency is checked, meaning all n, g, c, k values shared among the three tensors are required to be the same. Otherwise, a CUDNN_STATUS_BAD_PARAM status is returned. For backward compatibility with how tensors are specified in cudnnTensorDescriptor_t and used in the convolution API, it is also possible to specify 5D tensors with the following dimensions:
image: [n, g*c, d, h, w]
filter: [g*k, c, t, r, s]
response: [n, g*k, o, p, q]
In this format, a similar consistency check is performed when finalizing a convolution operator descriptor to which the tensors are bound.
Create, set, and finalize a convolution operator descriptor.
cudnnBackendDescriptor_t cDesc;
int64_t nbDims = 3;
cudnnDataType_t compType = CUDNN_DATA_FLOAT;
cudnnConvolutionMode_t mode = CUDNN_CONVOLUTION;
int64_t pad[] = {0, 0, 0};
int64_t filterStr[] = {1, 1, 1};
int64_t dilation[] = {1, 1, 1};

cudnnBackendCreateDescriptor(CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR, &cDesc);
cudnnBackendSetAttribute(cDesc, CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS, CUDNN_TYPE_INT64, 1, &nbDims);
cudnnBackendSetAttribute(cDesc, CUDNN_ATTR_CONVOLUTION_COMP_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &compType);
cudnnBackendSetAttribute(cDesc, CUDNN_ATTR_CONVOLUTION_CONV_MODE, CUDNN_TYPE_CONVOLUTION_MODE, 1, &mode);
cudnnBackendSetAttribute(cDesc, CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS, CUDNN_TYPE_INT64, nbDims, pad);
cudnnBackendSetAttribute(cDesc, CUDNN_ATTR_CONVOLUTION_POST_PADDINGS, CUDNN_TYPE_INT64, nbDims, pad);
cudnnBackendSetAttribute(cDesc, CUDNN_ATTR_CONVOLUTION_DILATIONS, CUDNN_TYPE_INT64, nbDims, dilation);
cudnnBackendSetAttribute(cDesc, CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES, CUDNN_TYPE_INT64, nbDims, filterStr);
cudnnBackendFinalize(cDesc);
Create, set, and finalize a convolution forward operation descriptor.
cudnnBackendDescriptor_t fprop;
float alpha = 1.0;
float beta = 0.5;

cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, &fprop);
cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &xDesc);
cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &wDesc);
cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &yDesc);
cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &cDesc);
cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA, CUDNN_TYPE_FLOAT, 1, &alpha);
cudnnBackendSetAttribute(fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA, CUDNN_TYPE_FLOAT, 1, &beta);
cudnnBackendFinalize(fprop);
Create, set, and finalize an operation graph descriptor.
cudnnBackendDescriptor_t op_graph;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, &op_graph);
cudnnBackendSetAttribute(op_graph, CUDNN_ATTR_OPERATIONGRAPH_OPS, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &fprop);
cudnnBackendSetAttribute(op_graph, CUDNN_ATTR_OPERATIONGRAPH_HANDLE, CUDNN_TYPE_HANDLE, 1, &handle);
cudnnBackendFinalize(op_graph);
Setting Up An Engine Configuration#
This use case describes the steps with which users can set up an engine config from a previously finalized operation graph. In this example, the user would like to use the engine with CUDNN_ATTR_ENGINE_GLOBAL_INDEX 0 for this operation graph and does not set any performance knobs.
Create, set, and finalize an engine descriptor.
cudnnBackendDescriptor_t engine;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINE_DESCRIPTOR, &engine);
cudnnBackendSetAttribute(engine, CUDNN_ATTR_ENGINE_OPERATION_GRAPH, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &op_graph);

int64_t gidx = 0;
cudnnBackendSetAttribute(engine, CUDNN_ATTR_ENGINE_GLOBAL_INDEX, CUDNN_TYPE_INT64, 1, &gidx);
cudnnBackendFinalize(engine);
The user can query a finalized engine descriptor with cudnnBackendGetAttribute() API call for its attributes, including the performance knobs that it has. For simplicity, this use case skips this step and assumes the user is setting up an engine config descriptor below without making any changes to performance knobs.
Create, set, and finalize an engine config descriptor. Obtain the workspace size from the engine config.
cudnnBackendDescriptor_t engcfg;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &engcfg);
cudnnBackendSetAttribute(engcfg, CUDNN_ATTR_ENGINECFG_ENGINE, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engine);
cudnnBackendFinalize(engcfg);

int64_t workspaceSize;
cudnnBackendGetAttribute(engcfg, CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE, CUDNN_TYPE_INT64, 1, NULL, &workspaceSize);
Setting Up And Executing A Plan#
This use case describes the steps with which users set up an execution plan with a previously finalized engine config descriptor, set up the data pointer variant pack, and finally execute the plan.
Create, set, and finalize an execution plan descriptor. Obtain workspace size to allocate.
cudnnBackendDescriptor_t plan;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, &plan);
cudnnBackendSetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_HANDLE, CUDNN_TYPE_HANDLE, 1, &handle);
cudnnBackendSetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engcfg);
cudnnBackendFinalize(plan);

int64_t workspaceSize;
cudnnBackendGetAttribute(plan, CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE, CUDNN_TYPE_INT64, 1, NULL, &workspaceSize);
Create, set and finalize a variant pack descriptor.
void *dev_ptrs[3] = {xData, wData, yData}; // device pointers
int64_t uids[3] = {'x', 'w', 'y'};
void *workspace;

cudnnBackendDescriptor_t varpack;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR, &varpack);
cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS, CUDNN_TYPE_VOID_PTR, 3, dev_ptrs);
cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS, CUDNN_TYPE_INT64, 3, uids);
cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_WORKSPACE, CUDNN_TYPE_VOID_PTR, 1, &workspace);
cudnnBackendFinalize(varpack);
Execute the plan with a variant pack.
cudnnBackendExecute(handle, plan, varpack);
Creating and Updating a CUDA Graph from a Plan#
This use case describes the steps with which users create a CUDA graph from a previously finalized execution plan and variant pack, execute the graph on a desired stream, and update the graph with a new variant pack. Note that this use case currently only works with a limited selection of cuDNN engines.
Create a CUDA graph.
cudaGraph_t cuda_graph;
cudaGraphCreate(&cuda_graph, 0);
cudnnBackendPopulateCudaGraph(handle, plan, varpack, cuda_graph);
Instantiate and execute the CUDA graph.
cudaGraphExec_t cuda_graph_exec;
cudaGraphInstantiate(&cuda_graph_exec, cuda_graph, 0);
cudaGraphLaunch(cuda_graph_exec, stream); // stream is a cudaStream_t
cudaStreamSynchronize(stream);
Update the CUDA graph with a new variant pack, update the instantiated graph, and re-execute. (This can be performed any number of times.)
cudnnBackendUpdateCudaGraph(handle, plan, new_varpack, cuda_graph);
cudaGraphExecUpdateResultInfo result_info;
cudaGraphExecUpdate(cuda_graph_exec, cuda_graph, &result_info);
cudaGraphLaunch(cuda_graph_exec, stream);
cudaStreamSynchronize(stream);
Destroy the CUDA graph when we’re all done.
cudaGraphDestroy(cuda_graph);