cudnn_graph Library #

Data Type References #

These are the data type references for the cudnn_graph library.

Struct Types #

These are the struct types for the cudnn_graph library.

cudnnDebug_t is a structure used by cudnnSetCallback() and cudnnGetCallback() containing the metadata, such as time, time since start, stream ID, process and thread ID, that the user may choose to print or store in customized callbacks.

cudnnFraction_t #

cudnnFraction_t is a structure that allows a user to define int64_t fractions.

/* A fraction expressed as a pair of int64_t values. */
typedef struct cudnnFractionStruct {
    int64_t numerator;   /* numerator of the fraction */
    int64_t denominator; /* denominator of the fraction */
} cudnnFraction_t;

Enumeration Types #

These are the enumeration types for the cudnn_graph library.

cudnnActivationMode_t #

This enumerated type is deprecated and is currently only used by deprecated APIs. Consider using replacements for the deprecated APIs that use this enumerated type.

cudnnActivationMode_t is an enumerated type used to select the neuron activation function used in cudnnActivationForward(), cudnnActivationBackward(), and cudnnConvolutionBiasActivationForward().

Values

CUDNN_ACTIVATION_SIGMOID: Selects the sigmoid function.
CUDNN_ACTIVATION_RELU: Selects the rectified linear function.
CUDNN_ACTIVATION_TANH: Selects the hyperbolic tangent function.
CUDNN_ACTIVATION_CLIPPED_RELU: Selects the clipped rectified linear function.
CUDNN_ACTIVATION_ELU: Selects the exponential linear function.
CUDNN_ACTIVATION_IDENTITY: Selects the identity function, intended for bypassing the activation step in cudnnConvolutionBiasActivationForward(). (The cudnnConvolutionBiasActivationForward() function must use CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM.) Does not work with cudnnActivationForward() or cudnnActivationBackward().
CUDNN_ACTIVATION_SWISH: Selects the swish function.

cudnnBackendAttributeName_t #

cudnnBackendAttributeName_t is an enumerated type that indicates the backend descriptor attributes that can be set or retrieved using the cudnnBackendSetAttribute() and cudnnBackendGetAttribute() functions. The backend descriptor to which an attribute belongs is identified by the prefix of the attribute name.

/*
 * Names of the backend descriptor attributes that can be set or read with
 * cudnnBackendSetAttribute() and cudnnBackendGetAttribute(). The prefix of
 * each attribute name identifies the backend descriptor it belongs to.
 */
typedef enum {
    CUDNN_ATTR_POINTWISE_MODE                  = 0,
    CUDNN_ATTR_POINTWISE_MATH_PREC             = 1,
    CUDNN_ATTR_POINTWISE_NAN_PROPAGATION       = 2,
    CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP       = 3,
    CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP       = 4,
    CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5,
    CUDNN_ATTR_POINTWISE_ELU_ALPHA             = 6,
    CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA         = 7,
    CUDNN_ATTR_POINTWISE_SWISH_BETA            = 8,
    CUDNN_ATTR_POINTWISE_AXIS                  = 9,

    CUDNN_ATTR_CONVOLUTION_COMP_TYPE      = 100,
    CUDNN_ATTR_CONVOLUTION_CONV_MODE      = 101,
    CUDNN_ATTR_CONVOLUTION_DILATIONS      = 102,
    CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103,
    CUDNN_ATTR_CONVOLUTION_POST_PADDINGS  = 104,
    CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS   = 105,
    CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS   = 106,

    CUDNN_ATTR_ENGINEHEUR_MODE            = 200,
    CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201,
    CUDNN_ATTR_ENGINEHEUR_RESULTS         = 202,
    CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGET = 203,
    CUDNN_ATTR_ENGINEHEUR_DEVICEPROP      = 204,

    CUDNN_ATTR_ENGINECFG_ENGINE            = 300,
    CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301,
    CUDNN_ATTR_ENGINECFG_KNOB_CHOICES      = 302,
    CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE    = 303,

    CUDNN_ATTR_EXECUTION_PLAN_HANDLE                     = 400,
    CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG              = 401,
    CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE             = 402,
    CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403,
    CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404,
    CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION        = 405,
    CUDNN_ATTR_EXECUTION_PLAN_KERNEL_CACHE               = 406,
    CUDNN_ATTR_EXECUTION_PLAN_DEVICEPROP                 = 407,

    CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID            = 500,
    CUDNN_ATTR_INTERMEDIATE_INFO_SIZE                 = 501,
    CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS  = 502,
    CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503,

    CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE  = 600,
    CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601,

    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA        = 700,
    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA         = 701,
    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC    = 702,
    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W            = 703,
    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X            = 704,
    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y            = 705,
    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA       = 706,
    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA        = 707,
    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC   = 708,
    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W           = 709,
    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX          = 710,
    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY          = 711,
    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA     = 712,
    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA      = 713,
    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714,
    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW        = 715,
    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X         = 716,
    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY        = 717,
    CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR          = 750,
    CUDNN_ATTR_OPERATION_POINTWISE_XDESC                  = 751,
    CUDNN_ATTR_OPERATION_POINTWISE_BDESC                  = 752,
    CUDNN_ATTR_OPERATION_POINTWISE_YDESC                  = 753,
    CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1                 = 754,
    CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2                 = 755,
    CUDNN_ATTR_OPERATION_POINTWISE_DXDESC                 = 756,
    CUDNN_ATTR_OPERATION_POINTWISE_DYDESC                 = 757,
    CUDNN_ATTR_OPERATION_POINTWISE_TDESC                  = 758,

    CUDNN_ATTR_OPERATION_GENSTATS_MODE                    = 770,
    CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC               = 771,
    CUDNN_ATTR_OPERATION_GENSTATS_XDESC                   = 772,
    CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC                 = 773,
    CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC               = 774,

    CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE                = 780,
    CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC                 = 781,
    CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC                = 782,
    CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC             = 783,
    CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC                = 784,
    CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC                 = 785,
    CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC    = 786,
    CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC     = 787,
    CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788,
    CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC  = 789,
    CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC           = 790,
    CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC        = 791,
    CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC             = 792,
    CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC              = 793,
    CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC          = 794,
    CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC              = 795,
    CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC   = 796,

    CUDNN_ATTR_OPERATIONGRAPH_HANDLE                   = 800,
    CUDNN_ATTR_OPERATIONGRAPH_OPS                      = 801,
    CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT      = 802,
    CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED = 803,

    CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT       = 900,
    CUDNN_ATTR_TENSOR_DATA_TYPE            = 901,
    CUDNN_ATTR_TENSOR_DIMENSIONS           = 902,
    CUDNN_ATTR_TENSOR_STRIDES              = 903,
    CUDNN_ATTR_TENSOR_VECTOR_COUNT         = 904,
    CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905,
    CUDNN_ATTR_TENSOR_UNIQUE_ID            = 906,
    CUDNN_ATTR_TENSOR_IS_VIRTUAL           = 907,
    CUDNN_ATTR_TENSOR_IS_BY_VALUE          = 908,
    CUDNN_ATTR_TENSOR_REORDERING_MODE      = 909,
    CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC   = 910,

    CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS    = 1000,
    CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001,
    CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002,
    CUDNN_ATTR_VARIANT_PACK_WORKSPACE     = 1003,

    CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100,
    CUDNN_ATTR_LAYOUT_INFO_TYPES      = 1101,

    CUDNN_ATTR_KNOB_INFO_TYPE          = 1200,
    CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201,
    CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202,
    CUDNN_ATTR_KNOB_INFO_STRIDE        = 1203,

    CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300,
    CUDNN_ATTR_ENGINE_GLOBAL_INDEX    = 1301,
    CUDNN_ATTR_ENGINE_KNOB_INFO       = 1302,
    CUDNN_ATTR_ENGINE_NUMERICAL_NOTE  = 1303,
    CUDNN_ATTR_ENGINE_LAYOUT_INFO     = 1304,
    CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE   = 1305,
    CUDNN_ATTR_ENGINE_SM_COUNT_TARGET = 1306,
    CUDNN_ATTR_ENGINE_DEVICEPROP      = 1307,

    CUDNN_ATTR_MATMUL_COMP_TYPE       = 1500,
    CUDNN_ATTR_MATMUL_PADDING_VALUE   = 1501,

    CUDNN_ATTR_OPERATION_MATMUL_ADESC                           = 1520,
    CUDNN_ATTR_OPERATION_MATMUL_BDESC                           = 1521,
    CUDNN_ATTR_OPERATION_MATMUL_CDESC                           = 1522,
    CUDNN_ATTR_OPERATION_MATMUL_DESC                            = 1523,
    CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT = 1524,
    CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC            = 1525,
    CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC            = 1526,
    CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC            = 1527,

    CUDNN_ATTR_REDUCTION_OPERATOR  = 1600,
    CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601,

    CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610,
    CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611,
    CUDNN_ATTR_OPERATION_REDUCTION_DESC  = 1612,

    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC        = 1620,
    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC        = 1621,
    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC      = 1622,
    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC    = 1623,
    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC           = 1624,
    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC          = 1625,
    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC   = 1626,
    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC    = 1627,
    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628,
    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC  = 1629,
    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS          = 1630,

    CUDNN_ATTR_RESAMPLE_MODE            = 1700,
    CUDNN_ATTR_RESAMPLE_COMP_TYPE       = 1701,
    CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS    = 1702,
    CUDNN_ATTR_RESAMPLE_POST_PADDINGS   = 1703,
    CUDNN_ATTR_RESAMPLE_PRE_PADDINGS    = 1704,
    CUDNN_ATTR_RESAMPLE_STRIDES         = 1705,
    CUDNN_ATTR_RESAMPLE_WINDOW_DIMS     = 1706,
    CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707,
    CUDNN_ATTR_RESAMPLE_PADDING_MODE    = 1708,

    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC   = 1710,
    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC   = 1711,
    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712,
    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA   = 1713,
    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA    = 1714,
    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC    = 1716,

    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC  = 1720,
    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC  = 1721,
    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722,
    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA   = 1723,
    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA    = 1724,
    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC    = 1725,
    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC   = 1726,
    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC   = 1727,

    CUDNN_ATTR_OPERATION_CONCAT_AXIS          = 1800,
    CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS   = 1801,
    CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802,
    CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC   = 1803,

    CUDNN_ATTR_OPERATION_SIGNAL_MODE     = 1900,
    CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901,
    CUDNN_ATTR_OPERATION_SIGNAL_VALUE    = 1902,
    CUDNN_ATTR_OPERATION_SIGNAL_XDESC    = 1903,
    CUDNN_ATTR_OPERATION_SIGNAL_YDESC    = 1904,

    CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_CONTAINER_DESC  = 1950,
    CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_YDESC           = 1951,
    CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_SEQUENCE_DESC   = 1952,
    CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_PAGE_TABLE_DESC = 1953,

    CUDNN_ATTR_OPERATION_NORM_FWD_MODE                     = 2000,
    CUDNN_ATTR_OPERATION_NORM_FWD_PHASE                    = 2001,
    CUDNN_ATTR_OPERATION_NORM_FWD_XDESC                    = 2002,
    CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC                = 2003,
    CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC        = 2004,
    CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC               = 2005,
    CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC                = 2006,
    CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC             = 2007,
    CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC      = 2008,
    CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC  = 2009,
    CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC   = 2010,
    CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011,
    CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC  = 2012,
    CUDNN_ATTR_OPERATION_NORM_FWD_YDESC                    = 2013,
    CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS          = 2014,

    CUDNN_ATTR_OPERATION_NORM_BWD_MODE              = 2100,
    CUDNN_ATTR_OPERATION_NORM_BWD_XDESC             = 2101,
    CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC         = 2102,
    CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103,
    CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC            = 2104,
    CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC        = 2105,
    CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC      = 2106,
    CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC       = 2107,
    CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC        = 2108,
    CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC            = 2109,
    CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS   = 2110,

    CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200,
    CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201,

    CUDNN_ATTR_RNG_DISTRIBUTION                   = 2300,
    CUDNN_ATTR_RNG_NORMAL_DIST_MEAN               = 2301,
    CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302,
    CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM           = 2303,
    CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM           = 2304,
    CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY     = 2305,

    CUDNN_ATTR_OPERATION_RNG_YDESC       = 2310,
    CUDNN_ATTR_OPERATION_RNG_SEED        = 2311,
    CUDNN_ATTR_OPERATION_RNG_DESC        = 2312,
    CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC = 2313,

    CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED = 2400,

    CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_XDESC      = 2500,
    CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_YDESC      = 2501,
    CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_SCALE_DESC = 2502,
    CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_MATH_PREC  = 2503,
    CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_BLOCK_SIZE = 2504,

    CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_XDESC      = 2600,
    CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_SCALE_DESC = 2601,
    CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_YDESC      = 2602,
    CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_MATH_PREC  = 2603,
    CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_BLOCK_SIZE = 2604,

    CUDNN_ATTR_DEVICEPROP_DEVICE_ID           = 2700,
    CUDNN_ATTR_DEVICEPROP_HANDLE              = 2701,
    CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION = 2702,

    CUDNN_ATTR_OPERATION_SDPA_FWD_QDESC                 = 2800,
    CUDNN_ATTR_OPERATION_SDPA_FWD_KDESC                 = 2801,
    CUDNN_ATTR_OPERATION_SDPA_FWD_VDESC                 = 2802,
    CUDNN_ATTR_OPERATION_SDPA_FWD_ODESC                 = 2803,
    CUDNN_ATTR_OPERATION_SDPA_FWD_STATSDESC             = 2804,
    CUDNN_ATTR_OPERATION_SDPA_FWD_SCALEDESC             = 2805,
    CUDNN_ATTR_OPERATION_SDPA_FWD_BLOCK_MASK_DESC       = 2806,
    CUDNN_ATTR_OPERATION_SDPA_FWD_PAGE_TABLE_KDESC      = 2807,
    CUDNN_ATTR_OPERATION_SDPA_FWD_PAGE_TABLE_VDESC      = 2808,
    CUDNN_ATTR_OPERATION_SDPA_FWD_SEQ_LEN_QDESC         = 2809,
    CUDNN_ATTR_OPERATION_SDPA_FWD_SEQ_LEN_KVDESC        = 2810,
    CUDNN_ATTR_OPERATION_SDPA_FWD_SUBGRAPH              = 2811,
    CUDNN_ATTR_OPERATION_SDPA_FWD_SUBGRAPH_INPUT_UID    = 2812,
    CUDNN_ATTR_OPERATION_SDPA_FWD_SUBGRAPH_OUTPUT_UID   = 2813,
    CUDNN_ATTR_OPERATION_SDPA_FWD_SOFTMAX_DESC          = 2814,
    CUDNN_ATTR_OPERATION_SDPA_FWD_DROPOUT_SEED_DESC     = 2815,
    CUDNN_ATTR_OPERATION_SDPA_FWD_DROPOUT_OFFSET_DESC   = 2816,
    CUDNN_ATTR_OPERATION_SDPA_FWD_DROPOUT_RNG_DUMP_DESC = 2817,
    CUDNN_ATTR_OPERATION_SDPA_FWD_DROPOUT_PROBABILITY   = 2818,
    CUDNN_ATTR_OPERATION_SDPA_FWD_UNFUSE_FMA            = 2819,

    CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_XDESC                  = 3000,
    CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_SEQ_LEN_KVDESC         = 3001,
    CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_SEQ_LEN_QDESC          = 3002,
    CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_LEFT_BOUND_DESC        = 3003,
    CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_SHIFT_RIGHT_BOUND_DESC = 3004,
    CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_BDESC                  = 3005,
    CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_YDESC                  = 3006,
    CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_COMPARISON_MODE        = 3007,

    CUDNN_ATTR_OPERATION_SOFTMAX_XDESC        = 3100,
    CUDNN_ATTR_OPERATION_SOFTMAX_YDESC        = 3101,
    CUDNN_ATTR_OPERATION_SOFTMAX_STATS_DESC   = 3102,
    CUDNN_ATTR_OPERATION_SOFTMAX_MAX_DESC     = 3103,
    CUDNN_ATTR_OPERATION_SOFTMAX_SUM_EXP_DESC = 3104,
    CUDNN_ATTR_OPERATION_SOFTMAX_SINK_DESC    = 3105,
} cudnnBackendAttributeName_t;

cudnnBackendAttributeType_t #

The enumeration type cudnnBackendAttributeType_t specifies the data type of an attribute of a cuDNN backend descriptor. It is used to specify the type of data pointed to by the void *arrayOfElements argument of cudnnBackendSetAttribute() and cudnnBackendGetAttribute().

/*
 * Data type of a backend descriptor attribute; identifies the type of the
 * elements pointed to by the void *arrayOfElements argument of
 * cudnnBackendSetAttribute() and cudnnBackendGetAttribute().
 * Only the first enumerator has an explicit value; the rest follow in
 * declaration order. The trailing comment gives the element type for each value.
 */
typedef enum {
    CUDNN_TYPE_HANDLE = 0,               /* cudnnHandle_t */
    CUDNN_TYPE_DATA_TYPE,                /* cudnnDataType_t */
    CUDNN_TYPE_BOOLEAN,                  /* bool */
    CUDNN_TYPE_INT64,                    /* int64_t */
    CUDNN_TYPE_FLOAT,                    /* float */
    CUDNN_TYPE_DOUBLE,                   /* double */
    CUDNN_TYPE_VOID_PTR,                 /* void * */
    CUDNN_TYPE_CONVOLUTION_MODE,         /* cudnnConvolutionMode_t */
    CUDNN_TYPE_HEUR_MODE,                /* cudnnBackendHeurMode_t */
    CUDNN_TYPE_KNOB_TYPE,                /* cudnnBackendKnobType_t */
    CUDNN_TYPE_NAN_PROPOGATION,          /* cudnnNanPropagation_t */
    CUDNN_TYPE_NUMERICAL_NOTE,           /* cudnnBackendNumericalNote_t */
    CUDNN_TYPE_LAYOUT_TYPE,              /* cudnnBackendLayoutType_t */
    CUDNN_TYPE_ATTRIB_NAME,              /* cudnnBackendAttributeName_t */
    CUDNN_TYPE_POINTWISE_MODE,           /* cudnnPointwiseMode_t */
    CUDNN_TYPE_BACKEND_DESCRIPTOR,       /* cudnnBackendDescriptor_t */
    CUDNN_TYPE_GENSTATS_MODE,            /* cudnnGenStatsMode_t */
    CUDNN_TYPE_BN_FINALIZE_STATS_MODE,   /* cudnnBnFinalizeStatsMode_t */
    CUDNN_TYPE_REDUCTION_OPERATOR_TYPE,  /* cudnnReduceTensorOp_t */
    CUDNN_TYPE_BEHAVIOR_NOTE,            /* cudnnBackendBehaviorNote_t */
    CUDNN_TYPE_TENSOR_REORDERING_MODE,   /* cudnnBackendTensorReordering_t */
    CUDNN_TYPE_RESAMPLE_MODE,            /* cudnnResampleMode_t */
    CUDNN_TYPE_PADDING_MODE,             /* cudnnPaddingMode_t */
    CUDNN_TYPE_INT32,                    /* int32_t */
    CUDNN_TYPE_CHAR,                     /* char */
    CUDNN_TYPE_SIGNAL_MODE,              /* cudnnSignalMode_t */
    CUDNN_TYPE_FRACTION,                 /* cudnnFraction_t */
    CUDNN_TYPE_NORM_MODE,                /* cudnnBackendNormMode_t */
    CUDNN_TYPE_NORM_FWD_PHASE,           /* cudnnBackendNormFwdPhase_t */
    CUDNN_TYPE_RNG_DISTRIBUTION          /* cudnnRngDistribution_t */
} cudnnBackendAttributeType_t;

Attribute Types for `cudnnBackendAttributeType_t`#
`cudnnBackendAttributeType_t`	Attribute Type
`CUDNN_TYPE_HANDLE`	cudnnHandle_t
`CUDNN_TYPE_DATA_TYPE`	cudnnDataType_t
`CUDNN_TYPE_BOOLEAN`	`bool`
`CUDNN_TYPE_INT64`	`int64_t`
`CUDNN_TYPE_FLOAT`	`float`
`CUDNN_TYPE_DOUBLE`	`double`
`CUDNN_TYPE_VOID_PTR`	`void *`
`CUDNN_TYPE_CONVOLUTION_MODE`	cudnnConvolutionMode_t
`CUDNN_TYPE_HEUR_MODE`	cudnnBackendHeurMode_t
`CUDNN_TYPE_KNOB_TYPE`	cudnnBackendKnobType_t
`CUDNN_TYPE_NAN_PROPOGATION`	cudnnNanPropagation_t
`CUDNN_TYPE_NUMERICAL_NOTE`	cudnnBackendNumericalNote_t
`CUDNN_TYPE_LAYOUT_TYPE`	cudnnBackendLayoutType_t
`CUDNN_TYPE_ATTRIB_NAME`	cudnnBackendAttributeName_t
`CUDNN_TYPE_POINTWISE_MODE`	cudnnPointwiseMode_t
`CUDNN_TYPE_BACKEND_DESCRIPTOR`	cudnnBackendDescriptor_t
`CUDNN_TYPE_GENSTATS_MODE`	cudnnGenStatsMode_t
`CUDNN_TYPE_BN_FINALIZE_STATS_MODE`	cudnnBnFinalizeStatsMode_t
`CUDNN_TYPE_REDUCTION_OPERATOR_TYPE`	cudnnReduceTensorOp_t
`CUDNN_TYPE_BEHAVIOR_NOTE`	cudnnBackendBehaviorNote_t
`CUDNN_TYPE_TENSOR_REORDERING_MODE`	cudnnBackendTensorReordering_t
`CUDNN_TYPE_RESAMPLE_MODE`	cudnnResampleMode_t
`CUDNN_TYPE_PADDING_MODE`	cudnnPaddingMode_t
`CUDNN_TYPE_INT32`	`int32_t`
`CUDNN_TYPE_CHAR`	`char`
`CUDNN_TYPE_SIGNAL_MODE`	cudnnSignalMode_t
`CUDNN_TYPE_FRACTION`	cudnnFraction_t
`CUDNN_TYPE_NORM_MODE`	cudnnBackendNormMode_t
`CUDNN_TYPE_NORM_FWD_PHASE`	cudnnBackendNormFwdPhase_t
`CUDNN_TYPE_RNG_DISTRIBUTION`	cudnnRngDistribution_t

cudnnBackendBehaviorNote_t #

cudnnBackendBehaviorNote_t is an enumerated type that indicates queryable behavior notes of an engine. Users can query for an array of behavior notes from a CUDNN_BACKEND_ENGINE_DESCRIPTOR using the cudnnBackendGetAttribute() function.

/*
 * Queryable behavior notes of an engine. An array of these can be read from
 * a CUDNN_BACKEND_ENGINE_DESCRIPTOR with cudnnBackendGetAttribute().
 */
typedef enum {
    CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION             = 0,
    CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1,
    CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER   = 2,
    CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API  = 3,
    /* No explicit value: takes the next value after the entry above (4). */
    CUDNN_BEHAVIOR_NOTE_TYPE_COUNT,
} cudnnBackendBehaviorNote_t;

cudnnBackendDescriptorType_t #

cudnnBackendDescriptorType_t is an enumerated type that indicates the type of backend descriptors. Users create a backend descriptor of a particular type by passing a value from this enumeration to the cudnnBackendCreateDescriptor() function.

/*
 * Types of backend descriptors. A value from this enumeration is passed to
 * cudnnBackendCreateDescriptor() to create a descriptor of that type.
 * Only the first enumerator has an explicit value; the rest are numbered
 * consecutively in declaration order, so the order here is significant.
 */
typedef enum {
    CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0,
    CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR,
    CUDNN_BACKEND_ENGINE_DESCRIPTOR,
    CUDNN_BACKEND_ENGINECFG_DESCRIPTOR,
    CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR,
    CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR,
    CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR,
    CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR,
    CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR,
    CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR,
    CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR,
    CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR,
    CUDNN_BACKEND_TENSOR_DESCRIPTOR,
    CUDNN_BACKEND_MATMUL_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR,
    CUDNN_BACKEND_REDUCTION_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR,
    CUDNN_BACKEND_RESAMPLE_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR,
    CUDNN_BACKEND_RNG_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR,
    CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR,
    CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_EXPAND_BAND_MATRIX_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_CONTRACT_BAND_MATRIX_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_SDPA_FWD_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_MOE_GROUPED_MATMUL_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_SDPA_BWD_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_DIAGONAL_BAND_MASK_DESCRIPTOR,
    CUDNN_BACKEND_OPERATION_SOFTMAX_DESCRIPTOR,
} cudnnBackendDescriptorType_t;

cudnnBackendHeurMode_t #

cudnnBackendHeurMode_t is an enumerated type that indicates the operation mode of a CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR.

/* Operation mode of a CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR. */
typedef enum {
    /* Same functionality as CUDNN_HEUR_MODE_A; consider it deprecated and use CUDNN_HEUR_MODE_A instead. */
    CUDNN_HEUR_MODE_INSTANT  = 0,
    /* Can use neural-net-based heuristics; falls back to CUDNN_HEUR_MODE_INSTANT where those are unsupported. */
    CUDNN_HEUR_MODE_B        = 1,
    /* For finding fallback options that give functional support, with no expectation of optimal GPU performance. */
    CUDNN_HEUR_MODE_FALLBACK = 2,
    /* Decision-tree heuristic. */
    CUDNN_HEUR_MODE_A        = 3
} cudnnBackendHeurMode_t;

Values

CUDNN_HEUR_MODE_A and CUDNN_HEUR_MODE_INSTANT

CUDNN_HEUR_MODE_A provides the exact same functionality as CUDNN_HEUR_MODE_INSTANT. The purpose of this renaming is to better match the naming of CUDNN_HEUR_MODE_B.

Consider the use of CUDNN_HEUR_MODE_INSTANT as deprecated; instead, use CUDNN_HEUR_MODE_A.

CUDNN_HEUR_MODE_A utilizes a decision tree heuristic which provides optimal inference time on the CPU in comparison to CUDNN_HEUR_MODE_B.

CUDNN_HEUR_MODE_A and CUDNN_HEUR_MODE_INSTANT support the following operation node or operation graph:

ConvolutionFwd

ConvolutionBwdFilter

ConvolutionBwdData

ConvBNfprop

ConvBNwgrad

ConvBiasAct

ConvScaleBiasAct

DgradDreluBNBwdWeight

Patterns supported by the runtime fusion engine

All other operation graphs are not supported.

CUDNN_HEUR_MODE_B

Can utilize the neural net based heuristics to improve generalization performance compared to CUDNN_HEUR_MODE_INSTANT.

In cases where the neural net is utilized, inference time on the CPU will be increased by 10-100x compared to CUDNN_HEUR_MODE_INSTANT. These neural net heuristics are not supported for any of the following cases:

3-D convolutions

Grouped convolutions (groupCount larger than 1)

Dilated convolutions (any dilation for any spatial dimension larger than 1)

Further, the neural net is only enabled on x86 platforms when cuDNN is run on an A100 GPU. In cases where the neural net is not supported, CUDNN_HEUR_MODE_B will fall back to CUDNN_HEUR_MODE_INSTANT. CUDNN_HEUR_MODE_B will also fall back to CUDNN_HEUR_MODE_INSTANT in cases where the overhead of CUDNN_HEUR_MODE_B is projected to reduce overall network performance.

CUDNN_HEUR_MODE_FALLBACK

This heuristic mode is intended to be used for finding fallback options which provide functional support (without any expectation of providing optimal GPU performance).

cudnnBackendKnobType_t #

cudnnBackendKnobType_t is an enumerated type that indicates the type of performance knobs. Performance knobs are runtime settings to an engine that will affect its performance. Users can query for an array of performance knobs and their valid value range from a CUDNN_BACKEND_ENGINE_DESCRIPTOR using the cudnnBackendGetAttribute() function. Users can set the choice for each knob using the cudnnBackendSetAttribute() function with a CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR.

/*
 * Types of performance knobs: runtime settings to an engine that affect its
 * performance. Knobs and their valid value ranges are queried from a
 * CUDNN_BACKEND_ENGINE_DESCRIPTOR; a choice is set through a
 * CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR.
 * NOTE(review): CUDNN_DEPRECATED_ENUM is a macro defined elsewhere; it
 * presumably marks the tagged enumerator as deprecated -- confirm in the header.
 */
typedef enum {
    CUDNN_KNOB_TYPE_SPLIT_K                                = 0,
    CUDNN_KNOB_TYPE_SWIZZLE                                = 1,
    CUDNN_KNOB_TYPE_TILE_SIZE                              = 2,
    CUDNN_KNOB_TYPE_USE_TEX                                = 3,
    CUDNN_KNOB_TYPE_EDGE                                   = 4,
    CUDNN_KNOB_TYPE_KBLOCK                                 = 5,
    CUDNN_KNOB_TYPE_LDGA                                   = 6,
    CUDNN_KNOB_TYPE_LDGB                                   = 7,
    CUDNN_KNOB_TYPE_CHUNK_K                                = 8,
    CUDNN_KNOB_TYPE_SPLIT_H                                = 9,
    CUDNN_KNOB_TYPE_WINO_TILE                              = 10,
    CUDNN_KNOB_TYPE_MULTIPLY                               = 11,
    CUDNN_KNOB_TYPE_SPLIT_K_BUF                            = 12,
    CUDNN_KNOB_TYPE_TILEK                                  = 13,
    CUDNN_KNOB_TYPE_STAGES                                 = 14,
    CUDNN_KNOB_TYPE_REDUCTION_MODE                         = 15,
    CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE                       = 16,
    CUDNN_KNOB_TYPE_SPLIT_K_SLC                            = 17,
    CUDNN_KNOB_TYPE_IDX_MODE                               = 18,
    CUDNN_KNOB_TYPE_SLICED                                 = 19,
    CUDNN_KNOB_TYPE_SPLIT_RS                               = 20,
    CUDNN_KNOB_TYPE_SINGLEBUFFER                           = 21,
    CUDNN_KNOB_TYPE_LDGC                                   = 22,
    CUDNN_KNOB_TYPE_SPECFILT                               = 23,
    CUDNN_KNOB_TYPE_KERNEL_CFG                             = 24,
    CUDNN_KNOB_TYPE_WORKSPACE                              = 25,
    CUDNN_KNOB_TYPE_TILE_CGA CUDNN_DEPRECATED_ENUM         = 26,
    CUDNN_KNOB_TYPE_TILE_CGA_M                             = 27,
    CUDNN_KNOB_TYPE_TILE_CGA_N                             = 28,
    CUDNN_KNOB_TYPE_BLOCK_SIZE                             = 29,
    CUDNN_KNOB_TYPE_OCCUPANCY                              = 30,
    CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD                  = 31,
    CUDNN_KNOB_TYPE_NUM_C_PER_BLOCK CUDNN_DEPRECATED_ENUM  = 32,
    CUDNN_KNOB_TYPE_SPLIT_COLS                             = 33,
    CUDNN_KNOB_TYPE_TILE_ROWS                              = 34,
    CUDNN_KNOB_TYPE_TILE_COLS                              = 35,
} cudnnBackendKnobType_t;

cudnnBackendLayoutType_t #

cudnnBackendLayoutType_t is an enumerated type that indicates queryable layout requirements of an engine. Users can query for layout requirements from a CUDNN_BACKEND_ENGINE_DESCRIPTOR using the cudnnBackendGetAttribute() function.

/*
 * Queryable layout requirements of an engine, read from a
 * CUDNN_BACKEND_ENGINE_DESCRIPTOR with cudnnBackendGetAttribute().
 */
typedef enum {
    CUDNN_LAYOUT_TYPE_PREFERRED_NCHW   = 0,
    CUDNN_LAYOUT_TYPE_PREFERRED_NHWC   = 1,
    CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2,
    CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3,
    CUDNN_LAYOUT_TYPE_COUNT            = 4,
} cudnnBackendLayoutType_t;

cudnnBackendNormFwdPhase_t #

cudnnBackendNormFwdPhase_t is an enumerated type used to distinguish the inference and training phase of the normalization forward operation.

/* Distinguishes the inference and training phases of the normalization forward operation. */
typedef enum {
    CUDNN_NORM_FWD_INFERENCE = 0, /* inference phase */
    CUDNN_NORM_FWD_TRAINING  = 1, /* training phase */
} cudnnBackendNormFwdPhase_t;

cudnnBackendNormMode_t #

cudnnBackendNormMode_t is an enumerated type to indicate the normalization mode in the backend normalization forward and normalization backward operations.

For reference:

The definition of layer normalization can be found in the Layer Normalization paper.

The definition of instance normalization can be found in the Instance Normalization: The Missing Ingredient for Fast Stylization paper.

The definition of batch normalization can be found in the Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift paper.

The definition of root mean square normalization can be found in the Root Mean Square Layer Normalization paper.

The definition of adaptive layer normalization can be found in the Understanding and Improving Layer Normalization paper.

CUDNN_GROUP_NORM is not yet supported. If you try to use it, cuDNN returns a CUDNN_STATUS_INTERNAL_ERROR error.

/* Normalization mode used by the backend normalization forward and backward operations. */
typedef enum {
    CUDNN_LAYER_NORM     = 0, /* layer normalization */
    CUDNN_INSTANCE_NORM  = 1, /* instance normalization */
    CUDNN_BATCH_NORM     = 2, /* batch normalization */
    CUDNN_GROUP_NORM     = 3, /* not yet supported; using it returns CUDNN_STATUS_INTERNAL_ERROR */
    CUDNN_RMS_NORM       = 4, /* root mean square normalization */
    CUDNN_ADA_LAYER_NORM = 5, /* adaptive layer normalization */
} cudnnBackendNormMode_t;

cudnnBackendNumericalNote_t #

cudnnBackendNumericalNote_t is an enumerated type that indicates queryable numerical properties of an engine. Users can query for an array of numerical notes from a CUDNN_BACKEND_ENGINE_DESCRIPTOR using the cudnnBackendGetAttribute() function.

/*
 * Queryable numerical properties of an engine. An array of these can be read
 * from a CUDNN_BACKEND_ENGINE_DESCRIPTOR with cudnnBackendGetAttribute().
 * Only the first enumerator has an explicit value; the rest are numbered
 * consecutively in declaration order.
 */
typedef enum {
    CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0,
    CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS,
    CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION,
    CUDNN_NUMERICAL_NOTE_FFT,
    CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC,
    CUDNN_NUMERICAL_NOTE_WINOGRAD,
    CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4,
    CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6,
    CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13,
    CUDNN_NUMERICAL_NOTE_STRICT_NAN_PROP,
    /* Last entry; by implicit numbering it equals the number of entries above it (10). */
    CUDNN_NUMERICAL_NOTE_TYPE_COUNT,
} cudnnBackendNumericalNote_t;

cudnnBackendTensorReordering_t #

cudnnBackendTensorReordering_t is an enumerated type that indicates tensor reordering as a property of the tensor descriptor. Users can get and set this property in a CUDNN_BACKEND_TENSOR_DESCRIPTOR using the cudnnBackendSetAttribute() and cudnnBackendGetAttribute() functions.

typedef enum {
    CUDNN_TENSOR_REORDERING_NONE     = 0,
    CUDNN_TENSOR_REORDERING_INT8x32  = 1,
    CUDNN_TENSOR_REORDERING_F16x16   = 2,
    CUDNN_TENSOR_REORDERING_F8_128x4 = 3,

} cudnnBackendTensorReordering_t;

cudnnBnFinalizeStatsMode_t #

cudnnBnFinalizeStatsMode_t is an enumerated type that exposes the different mathematical operation modes that convert batch norm statistics and the trained scale and bias to the equivalent scale and bias to be applied in the next normalization stage for inference and training use cases.

typedef enum {
    CUDNN_BN_FINALIZE_STATISTICS_TRAINING  = 0,
    CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1,
} cudnnBnFinalizeStatsMode_t;

cudnnConvolutionMode_t #

This enumerated type is deprecated and is currently only used by deprecated APIs. Consider using replacements for the deprecated APIs that use this enumerated type.

cudnnConvolutionMode_t is an enumerated type used by cudnnSetConvolution2dDescriptor() to configure a convolution descriptor. The filter used for the convolution can be applied in two different ways, corresponding mathematically to a convolution or to a cross-correlation. (A cross-correlation is equivalent to a convolution with its filter rotated by 180 degrees.)

Values

CUDNN_CONVOLUTION: In this mode, a convolution operation will be done when applying the filter to the images.
CUDNN_CROSS_CORRELATION: In this mode, a cross-correlation operation will be done when applying the filter to the images.

cudnnDataType_t #

cudnnDataType_t is an enumerated type indicating the data type to which a tensor descriptor or filter descriptor refers.

Values

CUDNN_DATA_FLOAT

The data is a 32-bit single-precision floating-point (float).

CUDNN_DATA_DOUBLE

The data is a 64-bit double-precision floating-point (double).

CUDNN_DATA_HALF

The data is a 16-bit floating-point.

CUDNN_DATA_INT8

The data is an 8-bit signed integer.

CUDNN_DATA_INT32

The data is a 32-bit signed integer.

CUDNN_DATA_INT8x4

The data is 32-bit elements each composed of 4 8-bit signed integers. This data type is only supported with the tensor format CUDNN_TENSOR_NCHW_VECT_C.

CUDNN_DATA_UINT8

The data is an 8-bit unsigned integer.

CUDNN_DATA_UINT8x4

The data is 32-bit elements each composed of 4 8-bit unsigned integers. This data type is only supported with the tensor format CUDNN_TENSOR_NCHW_VECT_C.

CUDNN_DATA_INT8x32

The data is 32-element vectors, each element being an 8-bit signed integer. This data type is only supported with the tensor format CUDNN_TENSOR_NCHW_VECT_C. Moreover, this data type can only be used with algo 1, meaning, CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM. For more information, refer to cudnnConvolutionFwdAlgo_t.

CUDNN_DATA_BFLOAT16

The data is a 16-bit quantity, with 7 mantissa bits, 8 exponent bits, and 1 sign bit.

CUDNN_DATA_INT64

The data is a 64-bit signed integer.

CUDNN_DATA_BOOLEAN

The data is a boolean (bool).

Note that for type CUDNN_DATA_BOOLEAN, elements are expected to be “packed”: that is, one byte contains 8 elements of type CUDNN_DATA_BOOLEAN. Further, within each byte, elements are indexed from the least significant bit to the most significant bit. For example, a 1 dimensional tensor of 8 elements containing 01001111 has value 1 for elements 0 through 3, 0 for elements 4 and 5, 1 for element 6 and 0 for element 7.

Tensors with more than 8 elements simply use more bytes, where the order is also from least significant to most significant byte. Note, CUDA is little-endian, meaning that the least significant byte has the lower memory address. For example, in the case of 16 elements, 01001111 11111100 has value 1 for elements 0 through 3, 0 for elements 4 and 5, 1 for element 6 and 0 for element 7, value 0 for elements 8 and 9, 1 for elements 10 through 15.

CUDNN_DATA_FP8_E4M3

The data is an 8-bit quantity, with 3 mantissa bits, 4 exponent bits, and 1 sign bit.

CUDNN_DATA_FP8_E5M2

The data is an 8-bit quantity, with 2 mantissa bits, 5 exponent bits, and 1 sign bit.

CUDNN_DATA_FAST_FLOAT_FOR_FP8

The data type is a higher throughput but lower precision compute type (compared to CUDNN_DATA_FLOAT) used for FP8 tensor core operations.

cudnnErrQueryMode_t #

cudnnErrQueryMode_t is an enumerated type passed to cudnnQueryRuntimeError() to select the remote kernel error query mode.

Values

CUDNN_ERRQUERY_RAWCODE: Read the error storage location regardless of the kernel completion status.
CUDNN_ERRQUERY_NONBLOCKING: Report if all tasks in the user stream of the cuDNN handle were completed. If that is the case, report the remote kernel error code.
CUDNN_ERRQUERY_BLOCKING: Wait for all tasks to complete in the user stream before reporting the remote kernel error code.

cudnnGenStatsMode_t #

cudnnGenStatsMode_t is an enumerated type to indicate the statistics mode in the backend statistics generation operation.

Values

CUDNN_GENSTATS_SUM_SQSUM: In this mode, the sum and sum of squares of the input tensor along the specified dimensions are computed and written out. The reduction dimensions currently supported are limited to per-channel reductions; however, additional support may be added upon request.

cudnnHandle_t #

cudnnHandle_t is a pointer to an opaque structure holding the cuDNN library context. The cuDNN library context must be created using cudnnCreate() and the returned handle must be passed to all subsequent library function calls. The context should be destroyed at the end using cudnnDestroy(). The context is associated with only one GPU device, the current device at the time of the call to cudnnCreate(). However, multiple contexts can be created on the same GPU device.

cudnnMathType_t #

cudnnMathType_t is an enumerated type used to indicate if the use of Tensor Core operations is permitted in a given library routine.

Values

CUDNN_DEFAULT_MATH

Tensor Core operations are not used on pre-NVIDIA A100 GPU devices. On A100 GPU architecture devices, Tensor Core TF32 operation is permitted.

CUDNN_TENSOR_OP_MATH

The use of Tensor Core operations is permitted but will not actively perform datatype down conversion on tensors in order to utilize Tensor Cores.

CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION

The use of Tensor Core operations is permitted and will actively perform datatype down conversion on tensors in order to utilize Tensor Cores.

CUDNN_FMA_MATH

Restricted to only kernels that use FMA instructions.

On pre-NVIDIA A100 GPU devices, CUDNN_DEFAULT_MATH and CUDNN_FMA_MATH have the same behavior: Tensor Core kernels will not be selected. With NVIDIA Ampere architecture and CUDA toolkit 11, CUDNN_DEFAULT_MATH permits TF32 Tensor Core operation and CUDNN_FMA_MATH does not. The TF32 behavior for CUDNN_DEFAULT_MATH and the other Tensor Core math types can be explicitly disabled by the environment variable NVIDIA_TF32_OVERRIDE=0.

cudnnNanPropagation_t #

This enumerated type is deprecated and is currently only used by deprecated APIs. Consider using replacements for the deprecated APIs that use this enumerated type.

cudnnNanPropagation_t is an enumerated type used to indicate if a given routine should propagate NaN values. This enumerated type is used as a field for the cudnnActivationDescriptor_t descriptor and cudnnPoolingDescriptor_t descriptor.

Values

CUDNN_NOT_PROPAGATE_NAN: NaN values are not propagated.
CUDNN_PROPAGATE_NAN: NaN values are propagated.

cudnnCTCGradMode_t #

Enumerated type used by cudnnSetCTCLossDescriptor_v9 and cudnnGetCTCLossDescriptor_v9 to indicate the behavior for out of boundary (OOB) samples. OOB samples are samples where L+R > T is encountered during the gradient calculation.

If ctcGradMode is set to CUDNN_CTC_SKIP_OOB_GRADIENTS, then the CTC loss function does not write to the gradient buffer for that sample. Instead, the current values, even if they are not finite, are retained.

If ctcGradMode is set to CUDNN_CTC_ZERO_OOB_GRADIENTS, then the gradient for that sample is set to zero. This guarantees a finite gradient.

cudnnPaddingMode_t #

cudnnPaddingMode_t is an enumerated type to indicate the padding mode in the backend resample operations.

typedef enum {
    CUDNN_ZERO_PAD     = 0,
    CUDNN_NEG_INF_PAD  = 1,
    CUDNN_EDGE_VAL_PAD = 2,
} cudnnPaddingMode_t;

cudnnPointwiseMode_t #

cudnnPointwiseMode_t is an enumerated type to indicate the intended pointwise math operation in the backend pointwise operation descriptor.

Values

CUDNN_POINTWISE_ADD: A pointwise addition between two tensors is computed.
CUDNN_POINTWISE_ADD_SQUARE: A pointwise addition between the first tensor and the square of the second tensor is computed.
CUDNN_POINTWISE_DIV: A pointwise true division of the first tensor by the second tensor is computed.
CUDNN_POINTWISE_MAX: A pointwise maximum is taken between two tensors.
CUDNN_POINTWISE_MIN: A pointwise minimum is taken between two tensors.
CUDNN_POINTWISE_MOD: A pointwise floating-point remainder of the first tensor’s division by the second tensor is computed.
CUDNN_POINTWISE_MUL: A pointwise multiplication between two tensors is computed.
CUDNN_POINTWISE_POW: A pointwise value from the first tensor to the power of the second tensor is computed.
CUDNN_POINTWISE_SUB: A pointwise subtraction between two tensors is computed.
CUDNN_POINTWISE_ABS: A pointwise absolute value of the input tensor is computed.
CUDNN_POINTWISE_CEIL: A pointwise ceiling of the input tensor is computed.
CUDNN_POINTWISE_COS: A pointwise trigonometric cosine of the input tensor is computed.
CUDNN_POINTWISE_EXP: A pointwise exponential of the input tensor is computed.
CUDNN_POINTWISE_FLOOR: A pointwise floor of the input tensor is computed.
CUDNN_POINTWISE_LOG: A pointwise natural logarithm of the input tensor is computed.
CUDNN_POINTWISE_NEG: A pointwise numerical negative of the input tensor is computed.
CUDNN_POINTWISE_RSQRT: A pointwise reciprocal of the square root of the input tensor is computed.
CUDNN_POINTWISE_SIN: A pointwise trigonometric sine of the input tensor is computed.
CUDNN_POINTWISE_SQRT: A pointwise square root of the input tensor is computed.
CUDNN_POINTWISE_TAN: A pointwise trigonometric tangent of the input tensor is computed.
CUDNN_POINTWISE_ERF: A pointwise Error Function is computed.
CUDNN_POINTWISE_IDENTITY: No computation is performed. As with other pointwise modes, this mode provides implicit conversions by specifying the data type of the input tensor as one type, and the data type of the output tensor as another.
CUDNN_POINTWISE_RELU_FWD: A pointwise rectified linear activation function of the input tensor is computed.
CUDNN_POINTWISE_TANH_FWD: A pointwise tanh activation function of the input tensor is computed.
CUDNN_POINTWISE_SIGMOID_FWD: A pointwise sigmoid activation function of the input tensor is computed.
CUDNN_POINTWISE_ELU_FWD: A pointwise Exponential Linear Unit activation function of the input tensor is computed.
CUDNN_POINTWISE_GELU_FWD: A pointwise Gaussian Error Linear Unit activation function of the input tensor is computed.
CUDNN_POINTWISE_SOFTPLUS_FWD: A pointwise softplus activation function of the input tensor is computed.
CUDNN_POINTWISE_SWISH_FWD: A pointwise swish activation function of the input tensor is computed.
CUDNN_POINTWISE_GELU_APPROX_TANH_FWD: A pointwise tanh approximation of the Gaussian Error Linear Unit activation function of the input tensor is computed. The tanh GELU approximation is computed as \(0.5x\left( 1+\tanh\left[ \sqrt{2/\pi}\left( x+0.044715x^{3} \right) \right] \right)\). For more information, refer to the GAUSSIAN ERROR LINEAR UNIT (GELUS) paper.
CUDNN_POINTWISE_RELU_BWD: A pointwise first derivative of rectified linear activation of the input tensor is computed.
CUDNN_POINTWISE_TANH_BWD: A pointwise first derivative of tanh activation of the input tensor is computed.
CUDNN_POINTWISE_SIGMOID_BWD: A pointwise first derivative of sigmoid activation of the input tensor is computed.
CUDNN_POINTWISE_ELU_BWD: A pointwise first derivative of Exponential Linear Unit activation of the input tensor is computed.
CUDNN_POINTWISE_GELU_BWD: A pointwise first derivative of Gaussian Error Linear Unit activation of the input tensor is computed.
CUDNN_POINTWISE_SOFTPLUS_BWD: A pointwise first derivative of softplus activation of the input tensor is computed.
CUDNN_POINTWISE_SWISH_BWD: A pointwise first derivative of swish activation of the input tensor is computed.
CUDNN_POINTWISE_GELU_APPROX_TANH_BWD: A pointwise first derivative of the tanh approximation of the Gaussian Error Linear Unit activation of the input tensor is computed. This is computed as \(0.5\left( 1+\tanh\left( b\left( x+cx^{3} \right) \right)+bx\,\mathrm{sech}^{2}\left( b\left( cx^{3}+x \right) \right)\left( 3cx^{2}+1 \right) \right)dy\) where \(b\) is \(\sqrt{2/\pi}\) and \(c\) is \(0.044715\).
CUDNN_POINTWISE_CMP_EQ: A pointwise truth value of the first tensor equal to the second tensor is computed.
CUDNN_POINTWISE_CMP_NEQ: A pointwise truth value of the first tensor not equal to the second tensor is computed.
CUDNN_POINTWISE_CMP_GT: A pointwise truth value of the first tensor greater than the second tensor is computed.
CUDNN_POINTWISE_CMP_GE: A pointwise truth value of the first tensor greater than or equal to the second tensor is computed.
CUDNN_POINTWISE_CMP_LT: A pointwise truth value of the first tensor less than the second tensor is computed.
CUDNN_POINTWISE_CMP_LE: A pointwise truth value of the first tensor less than or equal to the second tensor is computed.
CUDNN_POINTWISE_LOGICAL_AND: A pointwise truth value of the first tensor logical AND second tensor is computed.
CUDNN_POINTWISE_LOGICAL_OR: A pointwise truth value of the first tensor logical OR second tensor is computed.
CUDNN_POINTWISE_LOGICAL_NOT: A pointwise truth value of the logical NOT of the input tensor is computed.
CUDNN_POINTWISE_GEN_INDEX: A pointwise index value of the input tensor is generated along a given axis.
CUDNN_POINTWISE_BINARY_SELECT: A pointwise value is selected amongst two input tensors based on a given predicate tensor.
CUDNN_POINTWISE_RECIPROCAL: A pointwise reciprocal of the input tensor is computed. In other words, for every element x in the input tensor, 1/x is computed.

cudnnReduceTensorOp_t #

This enumerated type is deprecated and is currently only used by deprecated APIs. Consider using replacements for the deprecated APIs that use this enumerated type.

cudnnReduceTensorOp_t is an enumerated type used to indicate the tensor reduction operation to be used by the cudnnReduceTensor() routine. This enumerated type is used as a field for the cudnnReduceTensorDescriptor_t descriptor.

Values

CUDNN_REDUCE_TENSOR_ADD: The operation to be performed is addition.
CUDNN_REDUCE_TENSOR_MUL: The operation to be performed is multiplication.
CUDNN_REDUCE_TENSOR_MIN: The operation to be performed is a minimum comparison.
CUDNN_REDUCE_TENSOR_MAX: The operation to be performed is a maximum comparison.
CUDNN_REDUCE_TENSOR_AMAX: The operation to be performed is a maximum comparison of absolute values.
CUDNN_REDUCE_TENSOR_AVG: The operation to be performed is averaging.
CUDNN_REDUCE_TENSOR_NORM1: The operation to be performed is addition of absolute values.
CUDNN_REDUCE_TENSOR_NORM2: The operation to be performed is a square root of the sum of squares.
CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS: The operation to be performed is multiplication, not including elements of value zero.

cudnnReorderType_t #

This enumerated type is deprecated and is currently only used by deprecated APIs. Consider using replacements for the deprecated APIs that use this enumerated type.

cudnnReorderType_t is an enumerated type to set the convolution reordering type. The reordering type can be set by cudnnSetConvolutionReorderType() and its status can be read by cudnnGetConvolutionReorderType().

typedef enum {
    CUDNN_DEFAULT_REORDER = 0,
    CUDNN_NO_REORDER      = 1,
    } cudnnReorderType_t;

cudnnResampleMode_t #

cudnnResampleMode_t is an enumerated type to indicate the resample mode in the backend resample operations.

typedef enum {
    CUDNN_RESAMPLE_NEAREST                 = 0,
    CUDNN_RESAMPLE_BILINEAR                = 1,
    CUDNN_RESAMPLE_AVGPOOL                 = 2,
    CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2,
    CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4,
    CUDNN_RESAMPLE_MAXPOOL                 = 3,
} cudnnResampleMode_t;

cudnnRngDistribution_t #

cudnnRngDistribution_t is an enumerated type to indicate the distribution to be used in the backend Rng (random number generator) operation.

typedef enum {
    CUDNN_RNG_DISTRIBUTION_BERNOULLI,
    CUDNN_RNG_DISTRIBUTION_UNIFORM,
    CUDNN_RNG_DISTRIBUTION_NORMAL,
} cudnnRngDistribution_t;

Values

CUDNN_RNG_DISTRIBUTION_BERNOULLI: The Bernoulli distribution is used for the random number generation. The attribute CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY can be used to specify the probability of generating 1’s.
CUDNN_RNG_DISTRIBUTION_UNIFORM: The uniform distribution is used for the random number generation. The attributes CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM and CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM can be used to specify the minimum and maximum value between which the random numbers should be uniformly generated.
CUDNN_RNG_DISTRIBUTION_NORMAL: The normal distribution is used for the random number generation. The attributes CUDNN_ATTR_RNG_NORMAL_DIST_MEAN and CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION can be used to specify the mean and standard deviation of the random number generator.

cudnnSeverity_t #

cudnnSeverity_t is an enumerated type passed to the customized callback function for logging that users may set. This enumerated type describes the severity level of the item, so the customized logging callback may react differently. The numerical values are the same that are to be used for setting the CUDNN_LOGLEVEL_DBG environment variable.

Values

CUDNN_SEV_FATAL = 0: This value indicates a fatal error emitted by cuDNN.
CUDNN_SEV_ERROR = 1: This value indicates a normal error emitted by cuDNN.
CUDNN_SEV_WARNING = 2: This value indicates a warning emitted by cuDNN.
CUDNN_SEV_INFO = 3: This value indicates a piece of information (for example, API log) emitted by cuDNN.

cudnnSignalMode_t #

cudnnSignalMode_t is an enumerated type to indicate the signaling mode in the backend signal operation.

typedef enum {
    CUDNN_SIGNAL_SET  = 0,
    CUDNN_SIGNAL_WAIT = 1,
} cudnnSignalMode_t;

Values

CUDNN_SIGNAL_SET: The flag variable is updated with the provided signal value atomically.
CUDNN_SIGNAL_WAIT: The operation blocks until the flag variable compares equal to the provided signal value.

cudnnStatus_t #

cudnnStatus_t is an enumerated type used for function status returns. All cuDNN library functions return their status, which can be one of the following values:

Values

CUDNN_STATUS_SUCCESS: The operation was completed successfully.
CUDNN_STATUS_NOT_INITIALIZED: The cuDNN library was not initialized properly. This error is usually returned when a call to cudnnCreate() fails or when cudnnCreate() has not been called prior to calling another cuDNN routine. In the former case, it is usually due to an error in the CUDA Runtime API called by cudnnCreate() or by an error in the hardware setup.
CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH: Some cuDNN sub libraries have different versions, indicative of an installation issue.
CUDNN_STATUS_SERIALIZATION_VERSION_MISMATCH: The schema used for serialization is not what the current cuDNN library expects, thus the serialized artifact is no longer valid and needs to be re-serialized.
CUDNN_STATUS_DEPRECATED: This error code may be reported in the warning logging level as a reminder that some functionality is under deprecation and will be removed in the next major version update.
CUDNN_STATUS_LICENSE_ERROR: The functionality requested requires some license and an error was detected when trying to check the current licensing. This error can happen if the license is not present or is expired or if the environment variable NVIDIA_LICENSE_FILE is not set properly.
CUDNN_STATUS_RUNTIME_IN_PROGRESS: Some tasks in the user stream are not completed.
CUDNN_STATUS_RUNTIME_FP_OVERFLOW: Numerical overflow occurred during the GPU kernel execution.
CUDNN_STATUS_BAD_PARAM: This is an error category code. An incorrect value or parameter was passed to the function.
CUDNN_STATUS_BAD_PARAM_NULL_POINTER: The cuDNN API has unexpectedly received a null pointer from the user.
CUDNN_STATUS_BAD_PARAM_MISALIGNED_POINTER: The cuDNN API has received a misaligned pointer from the user.
CUDNN_STATUS_BAD_PARAM_NOT_FINALIZED: The backend descriptor has not been finalized.
CUDNN_STATUS_BAD_PARAM_OUT_OF_BOUND: The cuDNN API has received an out-of-bound value.
CUDNN_STATUS_BAD_PARAM_SIZE_INSUFFICIENT: The cuDNN API has received a memory buffer with insufficient space.
CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCH: The cuDNN API has received an unexpected stream.
CUDNN_STATUS_BAD_PARAM_SHAPE_MISMATCH: The cuDNN API has received inconsistent tensor shapes.
CUDNN_STATUS_BAD_PARAM_DUPLICATED_ENTRIES: The cuDNN API has received duplicated entries.
CUDNN_STATUS_BAD_PARAM_ATTRIBUTE_TYPE: The cuDNN API has received an invalid or unsupported attribute type.
CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCH: The cuDNN API has received an unexpected CUDA graph.
CUDNN_STATUS_NOT_SUPPORTED: This is an error category code. The functionality requested is not currently supported by cuDNN.
CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN: cuDNN does not currently support such an operation graph pattern.
CUDNN_STATUS_NOT_SUPPORTED_SHAPE: cuDNN does not currently support the tensor shapes used in some specific operation or graph pattern.
CUDNN_STATUS_NOT_SUPPORTED_DATA_TYPE: cuDNN does not currently support the tensor data type.
CUDNN_STATUS_NOT_SUPPORTED_LAYOUT: cuDNN does not currently support the tensor layout.
CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDA_DRIVER: The requested functionality is not compatible with the current CUDA driver.
CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART: The requested functionality is not compatible with the current CUDA runtime.
CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH: The function requires a feature absent from the current GPU device.
CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING: A runtime library required by cuDNN cannot be found in the predefined search paths. These libraries are libcuda.so (nvcuda.dll) and libnvrtc.so (nvrtc64_<Major Release Version><Minor Release Version>_0.dll and nvrtc-builtins64_<Major Release Version><Minor Release Version>.dll).
CUDNN_STATUS_NOT_SUPPORTED_SUBLIBRARY_UNAVAILABLE: The requested functionality is not available due to missing a sublibrary.
CUDNN_STATUS_NOT_SUPPORTED_SHARED_MEMORY_INSUFFICIENT: The requested functionality is not available due to the insufficient shared memory size on the GPU.
CUDNN_STATUS_NOT_SUPPORTED_PADDING: The requested functionality is not available due to padding requirements.
CUDNN_STATUS_NOT_SUPPORTED_BAD_LAUNCH_PARAM: The requested functionality is not available because it would lead to invalid kernel launch parameters.
CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API: The requested functionality is not available because this particular engine does not support the native CUDA graph API. (The engines that do support that API have the behavior note CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API.)
CUDNN_STATUS_INTERNAL_ERROR: This is an error category code. An internal cuDNN operation failed.
CUDNN_STATUS_INTERNAL_ERROR_COMPILATION_FAILED: A runtime kernel has failed to be compiled.
CUDNN_STATUS_INTERNAL_ERROR_UNEXPECTED_VALUE: An unexpected internal inconsistency has been detected.
CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED: An internal host memory allocation failed inside the cuDNN library.
CUDNN_STATUS_INTERNAL_ERROR_DEVICE_ALLOCATION_FAILED: Resource allocation failed inside the cuDNN library.
CUDNN_STATUS_INTERNAL_ERROR_BAD_LAUNCH_PARAM: Invalid kernel launch parameters are unexpectedly detected.
CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED: Access to GPU memory space failed, which is usually caused by a failure to bind a texture. To correct, prior to the function call, unbind any previously bound textures. Otherwise, this may indicate an internal error/bug in the library.
CUDNN_STATUS_EXECUTION_FAILED: This is an error category code. The GPU program failed to execute. This is usually caused by a failure to launch a kernel on the GPU, which can be caused by another library that cuDNN depends on.
CUDNN_STATUS_EXECUTION_FAILED_CUDA_DRIVER: The GPU program failed to execute due to an error reported by the CUDA driver.
CUDNN_STATUS_EXECUTION_FAILED_CUBLAS: The GPU program failed to execute due to an error reported by cuBLAS.
CUDNN_STATUS_EXECUTION_FAILED_CUDART: The GPU program failed to execute due to an error reported by the CUDA runtime.
CUDNN_STATUS_EXECUTION_FAILED_CURAND: The GPU program failed to execute due to an error reported by cuRAND.

Additionally, the following macros can be used on cudnnStatus_t error codes.

CUDNN_STATUS_CATEGORY(full_error_code): Extract the category error code from a cudnnStatus_t error code full_error_code. This is useful for checking if an error belongs to a certain category. For example, CUDNN_STATUS_CATEGORY(CUDNN_STATUS_BAD_PARAM_NULL_POINTER) will output CUDNN_STATUS_BAD_PARAM.
CUDNN_STATUS_SPECIFIC_ERROR(full_error_code): Extract the specific error code from a cudnnStatus_t error code full_error_code, that is, removing the category.
CUDNN_STATUS_FULL_ERROR_CODE(category, specific_err): Recombine a category error code and a specific code in the same category into a full cudnnStatus_t error code, such that CUDNN_STATUS_FULL_ERROR_CODE(CUDNN_STATUS_CATEGORY(e), CUDNN_STATUS_SPECIFIC_ERROR(e)) == e, for any valid cudnnStatus_t error code.

cudnnTensorFormat_t #

cudnnTensorFormat_t is an enumerated type used by cudnnSetTensor4dDescriptor() to create a tensor with a pre-defined layout. For a detailed explanation of how these tensors are arranged in memory, refer to Data Layout Formats.

Values

CUDNN_TENSOR_NCHW

This tensor format specifies that the data is laid out in the following order: batch size, feature maps, rows, columns. The strides are implicitly defined in such a way that the data are contiguous in memory with no padding between images, feature maps, rows, and columns; the columns are the inner dimension and the images are the outermost dimension.

CUDNN_TENSOR_NHWC

This tensor format specifies that the data is laid out in the following order: batch size, rows, columns, feature maps. The strides are implicitly defined in such a way that the data are contiguous in memory with no padding between images, rows, columns, and feature maps; the feature maps are the inner dimension and the images are the outermost dimension.

CUDNN_TENSOR_NCHW_VECT_C

This tensor format specifies that the data is laid out in the following order: batch size, feature maps, rows, columns. However, each element of the tensor is a vector of multiple feature maps. The length of the vector is carried by the data type of the tensor. The strides are implicitly defined in such a way that the data are contiguous in memory with no padding between images, feature maps, rows, and columns; the columns are the inner dimension and the images are the outermost dimension. This format is only supported with tensor data types CUDNN_DATA_INT8x4, CUDNN_DATA_INT8x32, and CUDNN_DATA_UINT8x4.

The CUDNN_TENSOR_NCHW_VECT_C can also be interpreted in the following way: The NCHW INT8x32 format is really N x (C/32) x H x W x 32 (32 Cs for every W), just as the NCHW INT8x4 format is N x (C/4) x H x W x 4 (4 Cs for every W). Hence, the VECT_C name - each W is a vector (4 or 32) of Cs.

Uncategorized #

These are the uncategorized references found in the cudnn_graph library.

cudnnBackendDescriptor_t #

cudnnBackendDescriptor_t is a typedef void pointer to one of many opaque descriptor structures. The type of structure that it points to is determined by the argument when allocating the memory for the opaque structure using cudnnBackendCreateDescriptor().

Attributes of a descriptor can be set using cudnnBackendSetAttribute(). After all required attributes of a descriptor are set, the descriptor can be finalized by cudnnBackendFinalize(). From a finalized descriptor, one can query its queryable attributes using cudnnBackendGetAttribute(). Finally, the memory allocated for a descriptor can be freed using cudnnBackendDestroyDescriptor().

API Functions #

These are the API functions in the cudnn_graph library.

cudnnBackendPopulateCudaGraph()#

This method, part of the new Native CUDA Graph API, directly builds a CUDA graph (not to be confused with a cuDNN graph) representing the given engine. When the caller instantiates and executes this CUDA graph, the graph will execute the engine configuration plan on the VariantPack and the finalized ExecutionPlan on the data.

The resulting CUDA graph captures the pointers (data and working space) in the VariantPack at the time this API is called, but it can be run arbitrarily many times with different data at the same pointers. The graph can also later be modified in place with different VariantPack pointers by using cudnnBackendUpdateCudaGraph().

The initial CUDA graph passed in to this API must be empty (having no nodes), and the caller should not append additional nodes to the resulting graph. However, the graph can be embedded as a child node of a larger CUDA graph, for example, by cudaGraphAddChildGraphNode. (This is typical usage.)

Only a limited number of engines currently support this API (with more to be added in future releases of cuDNN). Those supporting it have the behavior note CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API.

Note

This API is only supported in versions of cuDNN compiled against CUDA runtime 12.x and above.

cudnnStatus_t cudnnBackendPopulateCudaGraph(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack, cudaGraph_t cudaGraph)

Parameters

executionPlan

Input. Pointer to the finalized ExecutionPlan.

variantPack

Input. Pointer to the finalized VariantPack consisting of:

Data pointer for each non-virtual pointer of the operation set in the execution plan.

Pointer to user-allocated workspace in global memory at least as large as the size queried from the execution plan through CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE.

cudaGraph

Input/Output. A CUDA graph handle, representing an already created, empty CUDA graph to be populated by the API.

Returns

CUDNN_STATUS_SUCCESS: The CUDA graph was generated successfully.
CUDNN_STATUS_BAD_PARAM: An incorrect or inconsistent value is encountered. For example, a required data pointer is invalid, or the given cudaGraph isn’t initially empty.
CUDNN_STATUS_INTERNAL_ERROR: Some internal errors were encountered.
CUDNN_STATUS_EXECUTION_FAILED: An error was encountered creating a CUDA graph for the plan with the variant pack.
CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API: This particular engine does not support the native CUDA graph API.
CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART: This cuDNN was built against a CUDA runtime older than 12.0, and doesn’t support the native CUDA graph API.

cudnnBackendCreateDescriptor()#

This function allocates memory for a descriptor of the given descriptor type at the location pointed to by descriptor.

cudnnStatus_t cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor)

Note

cudnnBackendDescriptor_t is a typedef of void *, so the descriptor argument of this function is a pointer to void *.

Parameters

descriptorType: Input. One among the enumerated cudnnBackendDescriptorType_t.
descriptor: Input. Pointer to an instance of cudnnBackendDescriptor_t to be created.

Returns

CUDNN_STATUS_SUCCESS: The creation was successful.
CUDNN_STATUS_NOT_SUPPORTED: Creating a descriptor of a given type is not supported.
CUDNN_STATUS_ALLOC_FAILED: The memory allocation failed.

cudnnBackendDestroyDescriptor()#

This function destroys instances of cudnnBackendDescriptor_t that were previously created using cudnnBackendCreateDescriptor().

cudnnStatus_t cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor)

Parameters

descriptor: Input. Instance of cudnnBackendDescriptor_t previously created by cudnnBackendCreateDescriptor().

Returns

CUDNN_STATUS_SUCCESS: The memory was destroyed successfully.
CUDNN_STATUS_ALLOC_FAILED: The destruction of memory failed.
Undefined Behavior: The descriptor was altered between the calls to cudnnBackendCreateDescriptor() and cudnnBackendDestroyDescriptor().
Undefined: The value pointed to by the descriptor will be undefined after the memory is freed.

cudnnBackendExecute()#

This function executes the given finalized ExecutionPlan on the data provided in the VariantPack. The data and the working space are encapsulated in the VariantPack.

cudnnStatus_t cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack)

Parameters

executionPlan

Input. Pointer to the finalized ExecutionPlan.

variantPack

Input. Pointer to the finalized VariantPack consisting of:

Data pointer for each non-virtual pointer of the operation set in the execution plan.

Pointer to user-allocated workspace in global memory at least as large as the workspace size queried from the execution plan (CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE).

Returns

CUDNN_STATUS_SUCCESS: The ExecutionPlan was executed successfully.
CUDNN_STATUS_BAD_PARAM: An incorrect or inconsistent value is encountered. For example, a required data pointer is invalid.
CUDNN_STATUS_INTERNAL_ERROR: Some internal errors were encountered.
CUDNN_STATUS_EXECUTION_FAILED: An error was encountered executing the plan with the variant pack.

cudnnBackendFinalize()#

This function finalizes the memory pointed to by the descriptor. The type of finalization is done depending on the descriptorType argument with which the descriptor was created using cudnnBackendCreateDescriptor() or initialized using cudnnBackendInitialize().

cudnnStatus_t cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor)

cudnnBackendFinalize() also checks all the attributes set between the create/initialization and finalize phase. If successful, cudnnBackendFinalize() returns CUDNN_STATUS_SUCCESS and the finalized state of the descriptor is set to true. In this state, setting attributes using cudnnBackendSetAttribute() is not allowed. Getting attributes using cudnnBackendGetAttribute() is only allowed when the finalized state of the descriptor is true.

Parameters

descriptor: Input. Instance of cudnnBackendDescriptor_t to finalize.

Returns

CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.
CUDNN_STATUS_BAD_PARAM: Invalid descriptor attribute values or combination thereof is encountered.
CUDNN_STATUS_NOT_SUPPORTED: Descriptor attribute values or combinations thereof not supported by the current version of cuDNN are encountered.
CUDNN_STATUS_INTERNAL_ERROR: Some internal errors are encountered.

cudnnBackendGetAttribute()#

This function retrieves the values of an attribute of a descriptor. attributeName is the name of the attribute whose value is requested. attributeType is the type of attribute. requestedElementCount is the number of elements to be potentially retrieved. The number of elements for the requested attribute is stored in elementCount. The retrieved values are stored in arrayOfElements. When the attribute is expected to have a single value, arrayOfElements can be a pointer to the output value. This function will return CUDNN_STATUS_NOT_INITIALIZED if the descriptor has not been successfully finalized using cudnnBackendFinalize().

cudnnStatus_t cudnnBackendGetAttribute(
    cudnnBackendDescriptor_t descriptor,
    cudnnBackendAttributeName_t attributeName,
    cudnnBackendAttributeType_t attributeType,
    int64_t requestedElementCount,
    int64_t *elementCount,
    void *arrayOfElements);

Parameters

descriptor: Input. Instance of cudnnBackendDescriptor_t whose attribute the user wants to retrieve.
attributeName: Input. The name of the attribute being retrieved from the descriptor.
attributeType: Input. The type of attribute.
requestedElementCount: Input. Number of elements to output to arrayOfElements.
elementCount: Output. Pointer to where the number of elements the descriptor attribute has is written. Note that cudnnBackendGetAttribute() will only write the lesser of this and requestedElementCount elements to arrayOfElements.
arrayOfElements: Output. Array of elements of the datatype of the attributeType, to which the retrieved values are written. The data type of the attributeType is listed in the mapping table of cudnnBackendAttributeType_t.

Returns

CUDNN_STATUS_SUCCESS

The value of attributeName was retrieved from the descriptor successfully.

CUDNN_STATUS_BAD_PARAM

One or more invalid or inconsistent argument values were encountered. Some examples include:

attributeName is not a valid attribute for the descriptor.

attributeType is not one of the valid types for the attribute.

CUDNN_STATUS_NOT_INITIALIZED

The descriptor has not been successfully finalized using cudnnBackendFinalize().

cudnnBackendInitialize()#

This function has been deprecated in cuDNN 9.2.

This function repurposes a pre-allocated memory pointed to by a descriptor of size sizeInBytes to a backend descriptor of type descriptorType. The finalized state of the descriptor is set to false.

cudnnStatus_t cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor, cudnnBackendDescriptorType_t descriptorType, size_t sizeInBytes)

Parameters

descriptor: Input. Instance of cudnnBackendDescriptor_t to be initialized.
descriptorType: Input. Enumerated value for the type of cuDNN backend descriptor.
sizeInBytes: Input. Size of memory pointed to by descriptor.

Returns

CUDNN_STATUS_SUCCESS

The memory was initialized successfully.

CUDNN_STATUS_BAD_PARAM

An invalid or inconsistent argument value is encountered. Some examples include:

descriptor is a nullptr

sizeInBytes is less than the size required by the descriptor type

cudnnBackendSetAttribute()#

This function sets an attribute of a descriptor to values provided as a pointer. descriptor is the descriptor to be set. attributeName is the name of the attribute to be set. attributeType is the type of attribute. The value to which the attribute is set is pointed to by arrayOfElements. The number of elements is given by elementCount. This function will return CUDNN_STATUS_NOT_INITIALIZED if the descriptor is already successfully finalized using cudnnBackendFinalize().

cudnnStatus_t cudnnBackendSetAttribute(
    cudnnBackendDescriptor_t descriptor,
    cudnnBackendAttributeName_t attributeName,
    cudnnBackendAttributeType_t attributeType,
    int64_t elementCount,
    void *arrayOfElements);

Parameters

descriptor: Input. Instance of cudnnBackendDescriptor_t whose attribute is being set.
attributeName: Input. The name of the attribute being set on the descriptor.
attributeType: Input. The type of attribute.
elementCount: Input. Number of elements being set.
arrayOfElements: Input. The starting location of an array from which to read the values. The elements of the array are expected to be of the datatype of the attributeType. The datatype of the attributeType is listed in the mapping table of cudnnBackendAttributeType_t.

Returns

CUDNN_STATUS_SUCCESS

The attributeName was set to the descriptor.

CUDNN_STATUS_NOT_INITIALIZED

The backend descriptor pointed to by the descriptor is already in the finalized state.

CUDNN_STATUS_BAD_PARAM

The function is called with arguments that correspond to invalid values. Some examples include:

attributeName is not a settable attribute of descriptor.

attributeType is incorrect for this attributeName.

elementCount value is unexpected.

arrayOfElements contains values invalid for the attributeType.

CUDNN_STATUS_NOT_SUPPORTED

The values to which the attributes are being set are not supported by the current version of cuDNN.

cudnnBackendUpdateCudaGraph()#

This method, part of the new Native CUDA Graph API, updates an existing CUDA graph previously populated by cudnnBackendPopulateCudaGraph() (or a clone thereof) with a new variantPack.

Only a limited number of engines currently support this API (with more to be added in future releases of cuDNN). Those supporting it have the behavior note CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API.

Note

This API is only supported in versions of cuDNN compiled against CUDA runtime 12.x and above.

cudnnStatus_t cudnnBackendUpdateCudaGraph(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack, cudaGraph_t cudaGraph)

Parameters

executionPlan

Input. Pointer to the finalized ExecutionPlan. This must match the ExecutionPlan originally passed to cudnnBackendPopulateCudaGraph().

variantPack

Input. Pointer to a finalized VariantPack consisting of the following pointers, which replace the VariantPack pointers captured in the CUDA graph:

Data pointer for each non-virtual pointer of the operation set in the execution plan.

Pointer to a user-allocated workspace in global memory at least as large as the workspace size queried from the execution plan (CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE).

cudaGraph

Input. Pointer to an existing CUDA graph handle. This graph must have been populated by cudnnBackendPopulateCudaGraph(), or be a clone thereof (for example, as created by cudaGraphClone, or as embedded in a larger graph by using cudaGraphAddChildGraphNode).

Returns

CUDNN_STATUS_SUCCESS: The CUDA graph was updated successfully.
CUDNN_STATUS_BAD_PARAM: An incorrect or inconsistent value is encountered. For example, a required data pointer is invalid.
CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCH: The CUDA graph doesn’t appear to have been populated by cudnnBackendPopulateCudaGraph(), (or a clone thereof) for this execution plan, or contains unexpected additional nodes.
CUDNN_STATUS_INTERNAL_ERROR: Some internal errors were encountered.
CUDNN_STATUS_EXECUTION_FAILED: An error was encountered updating a CUDA graph for the plan with the variant pack.
CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API: This particular engine does not support the native CUDA graph API.
CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART: This cuDNN was built against a CUDA runtime older than 12.0, and doesn’t support the native CUDA graph API.

cudnnCreate()#

This function initializes the cuDNN library and creates a handle to an opaque structure holding the cuDNN library context. It allocates hardware resources on the host and device and must be called prior to making any other cuDNN library calls.

cudnnStatus_t cudnnCreate(cudnnHandle_t *handle)

The cuDNN library handle is tied to the current CUDA device (context). To use the library on multiple devices, one cuDNN handle needs to be created for each device.

For a given device, multiple cuDNN handles with different configurations (for example, different current CUDA streams) may be created. Because cudnnCreate() allocates some internal resources, the release of those resources by calling cudnnDestroy() will implicitly call cudaDeviceSynchronize; therefore, the recommended best practice is to call cudnnCreate() or cudnnDestroy() outside of performance-critical code paths.

For multithreaded applications that use the same device from different threads, the recommended programming model is to create one (or a few, as is convenient) cuDNN handles per thread and use that cuDNN handle for the entire life of the thread.

Parameters

handle: Output. Pointer to the location where the address of the allocated cuDNN handle is stored. For more information, refer to cudnnHandle_t.

Returns

CUDNN_STATUS_BAD_PARAM: Invalid (NULL) input pointer supplied.
CUDNN_STATUS_NOT_INITIALIZED: No compatible GPU found, CUDA driver not installed or disabled, CUDA runtime API initialization failed.
CUDNN_STATUS_ARCH_MISMATCH: NVIDIA GPU architecture is too old.
CUDNN_STATUS_ALLOC_FAILED: Host memory allocation failed.
CUDNN_STATUS_INTERNAL_ERROR: CUDA resource allocation failed.
CUDNN_STATUS_LICENSE_ERROR: cuDNN license validation failed (only when the feature is enabled).
CUDNN_STATUS_SUCCESS: cuDNN handle was created successfully.

cudnnDestroy()#

This function releases the resources used by the cuDNN handle. This function is usually the last call made to cuDNN with a particular handle. Because cudnnCreate() allocates internal resources, the release of those resources by calling cudnnDestroy() will implicitly call cudaDeviceSynchronize; therefore, the recommended best practice is to call cudnnCreate() or cudnnDestroy() outside of performance-critical code paths.

cudnnStatus_t cudnnDestroy(cudnnHandle_t handle)

Parameters

handle: Input. The cuDNN handle to be destroyed.

Returns

CUDNN_STATUS_SUCCESS: The cuDNN context destruction was successful.

cudnnGetCallback()#

This function queries the internal states of cuDNN error reporting functionality.

cudnnStatus_t cudnnGetCallback(
        unsigned            *mask,
        void                **udata,
        cudnnCallback_t     *fptr)

Parameters

mask: Output. Pointer to the address where the current internal error reporting message bit mask will be outputted.
udata: Output. Pointer to the address where the current internally stored udata address will be stored.
fptr: Output. Pointer to the address where the current internally stored callback function pointer will be stored. When the built-in default callback function is used, NULL will be outputted.

Returns

CUDNN_STATUS_SUCCESS: The function launched successfully.
CUDNN_STATUS_BAD_PARAM: If any of the input parameters are NULL.

cudnnGetCudartVersion()#

The same version of a given cuDNN library can be compiled against different CUDA toolkit versions. This routine returns the CUDA toolkit version that the currently used cuDNN library has been compiled against.

size_t cudnnGetCudartVersion()

cudnnGetErrorString()#

This function converts the cuDNN status code to a NULL terminated (ASCIIZ) static string. For example, when the input argument is CUDNN_STATUS_SUCCESS, the returned string is CUDNN_STATUS_SUCCESS. When an invalid status value is passed to the function, the returned string is CUDNN_UNKNOWN_STATUS.

const char * cudnnGetErrorString(cudnnStatus_t status)

Parameters

status: Input. cuDNN enumerant status code.

Returns

Pointer to a static, NULL terminated string with the status name.

cudnnGetLastErrorString()#

This function retrieves the last encountered cuDNN error message in the current thread to a NULL terminated (ASCIIZ) string. Inside the cuDNN library, the messages are stored in thread local buffers. The error is cleared after the user calls this API to retrieve it.

void cudnnGetLastErrorString(char *message, size_t max_size);

Parameters

message: Output. Pointer to a character buffer that can store the error message. As we do not manage the thread-safety of the pre-allocated output buffer “message” to avoid unnecessary overhead, we ask that the user ensure it is thread-safe themselves on their own need-basis.
max_size: Input. Maximum size that can be stored in the location pointed to by message. The output is strictly limited by the size limit max_size, also counting the terminator character \0, which will be automatically appended to the message if there is space.

cudnnGetMaxDeviceVersion()#

This function returns the maximum SM version that the cuDNN library is aware of and supports natively. Any SM version higher than this would be supported in forward compatibility mode. For more information about forward compatibility, refer to Hardware Forward Compatibility in the Frontend Developer Guide.

size_t cudnnGetMaxDeviceVersion(void);

Returns

A size_t type value indicating the latest known SM number for the current version of the library. For example, if NVIDIA Hopper (GH100) is the latest known SM that the library is aware of, the value returned would be 900.

cudnnGetProperty()#

This function writes a specific part of the cuDNN library version number into the provided host storage.

cudnnStatus_t cudnnGetProperty(
    libraryPropertyType     type,
    int                    *value)

Parameters

type: Input. Enumerant type that instructs the function to report the numerical value of the cuDNN major version, minor version, or the patch level depending on whether type is set to MAJOR_VERSION, MINOR_VERSION, or PATCH_LEVEL.
value: Output. Host pointer where the version information should be written.

Returns

CUDNN_STATUS_INVALID_VALUE: Invalid value of the type argument.
CUDNN_STATUS_SUCCESS: Version information was stored successfully at the provided address.

cudnnGetStream()#

This function retrieves the user CUDA stream programmed in the cuDNN handle. When the user’s CUDA stream is not set in the cuDNN handle, this function reports the null-stream.

cudnnStatus_t cudnnGetStream(
    cudnnHandle_t   handle,
    cudaStream_t   *streamId)

Parameters

handle: Input. Pointer to the cuDNN handle.
streamId: Output. Pointer where the current CUDA stream from the cuDNN handle should be stored.

Returns

CUDNN_STATUS_BAD_PARAM: Invalid (NULL) handle.
CUDNN_STATUS_SUCCESS: The stream identifier was retrieved successfully.

cudnnGetVersion()#

This function returns the version number of the cuDNN library. It returns the CUDNN_VERSION macro defined in the cudnn.h header file. Starting with release R2, the routine can be used to dynamically identify the current cuDNN library used by the application. The defined CUDNN_VERSION can be used to have the same application linked against different cuDNN versions using conditional compilation statements.

size_t cudnnGetVersion()

cudnnGraphVersionCheck()#

Cross-library version checker. Each sub-library has a version checker that checks whether its own version matches that of its dependencies.

cudnnStatus_t cudnnGraphVersionCheck(void);

Returns

CUDNN_STATUS_SUCCESS: The version check passed.
CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH: The versions are inconsistent.

cudnnQueryRuntimeError()#

cuDNN library functions perform extensive input argument checking before launching GPU kernels. The last step is to verify that the GPU kernel actually started. When a kernel fails to start, CUDNN_STATUS_EXECUTION_FAILED is returned by the corresponding API call. Typically, after a GPU kernel starts, no runtime checks are performed by the kernel itself - numerical results are simply written to output buffers.

cudnnStatus_t cudnnQueryRuntimeError(
    cudnnHandle_t            handle,
    cudnnStatus_t           *rstatus,
    cudnnErrQueryMode_t      mode,
    cudnnRuntimeTag_t       *tag)

When the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode is selected in cudnnBatchNormalizationForwardTraining() or cudnnBatchNormalizationBackward(), the algorithm may encounter numerical overflows where CUDNN_BATCHNORM_SPATIAL performs just fine albeit at a slower speed. The user can invoke cudnnQueryRuntimeError() to make sure numerical overflows did not occur during the kernel execution. Those issues are reported by the kernel that performs computations.

cudnnQueryRuntimeError() can be used in polling and blocking software control flows. There are two polling modes (CUDNN_ERRQUERY_RAWCODE and CUDNN_ERRQUERY_NONBLOCKING) and one blocking mode CUDNN_ERRQUERY_BLOCKING.

CUDNN_ERRQUERY_RAWCODE reads the error storage location regardless of the kernel completion status. The kernel might not even start and the error storage (allocated per cuDNN handle) might be used by an earlier call.

CUDNN_ERRQUERY_NONBLOCKING checks if all tasks in the user stream are completed. The cudnnQueryRuntimeError() function will return immediately and report CUDNN_STATUS_RUNTIME_IN_PROGRESS in rstatus if some tasks in the user stream are pending. Otherwise, the function will copy the remote kernel error code to rstatus.

In the blocking mode (CUDNN_ERRQUERY_BLOCKING), the function waits for all tasks to drain in the user stream before reporting the remote kernel error code. The blocking flavor can be further adjusted by calling cudaSetDeviceFlags with the cudaDeviceScheduleSpin, cudaDeviceScheduleYield, or cudaDeviceScheduleBlockingSync flag.

CUDNN_ERRQUERY_NONBLOCKING and CUDNN_ERRQUERY_BLOCKING modes should not be used when the user stream is changed in the cuDNN handle, meaning, cudnnSetStream() is invoked between functions that report runtime kernel errors and the cudnnQueryRuntimeError() function.

The remote error status reported in rstatus can be set to: CUDNN_STATUS_SUCCESS, CUDNN_STATUS_RUNTIME_IN_PROGRESS, or CUDNN_STATUS_RUNTIME_FP_OVERFLOW. The remote kernel error is automatically cleared by cudnnQueryRuntimeError().

The cudnnQueryRuntimeError() function should be used in conjunction with cudnnBatchNormalizationForwardTraining() and cudnnBatchNormalizationBackward() when the cudnnBatchNormMode_t argument is CUDNN_BATCHNORM_SPATIAL_PERSISTENT.

Parameters

handle: Input. Handle to a previously created cuDNN context.
rstatus: Output. Pointer to the user’s error code storage.
mode: Input. Remote error query mode.
tag: Input/Output. Currently, this argument should be NULL.

Returns

CUDNN_STATUS_SUCCESS: No errors detected (rstatus holds a valid value).
CUDNN_STATUS_BAD_PARAM: Invalid input argument.
CUDNN_STATUS_INTERNAL_ERROR: A stream blocking synchronization or a non-blocking stream query failed.
CUDNN_STATUS_MAPPING_ERROR: The device cannot access zero-copy memory to report kernel errors.

cudnnSetCallback()#

This function sets the internal states of cuDNN error reporting functionality.

cudnnStatus_t cudnnSetCallback(
        unsigned            mask,
        void                *udata,
        cudnnCallback_t     fptr)

Parameters

mask

Input. An unsigned integer. The four least significant bits (LSBs) of this unsigned integer are used for switching on and off the different levels of error reporting messages. This applies for both the default callbacks, and for the customized callbacks. The bit position is in correspondence with the enum of cudnnSeverity_t. The user may utilize the predefined macros CUDNN_SEV_ERROR_EN, CUDNN_SEV_WARNING_EN, and CUDNN_SEV_INFO_EN to form the bit mask. When a bit is set to 1, the corresponding message channel is enabled.

For example, when bit 3 is set to 1, the API logging is enabled. Currently, only the log output of level CUDNN_SEV_INFO is functional; the others are not yet implemented. When used for turning on and off the logging with the default callback, the user may pass NULL to udata and fptr. In addition, the environment variable CUDNN_LOGDEST_DBG must be set. For more information, refer to the Deprecation Policy.

CUDNN_SEV_INFO_EN = 0b1000 (functional).

CUDNN_SEV_ERROR_EN = 0b0010 (functional).

CUDNN_SEV_WARNING_EN = 0b0100 (functional).

The output of CUDNN_SEV_FATAL is always enabled and cannot be disabled.

udata

Input. A pointer provided by the user. This pointer will be passed to the user’s custom logging callback function. The data it points to will not be read, nor be changed by cuDNN. This pointer may be used in many ways, such as in a mutex or in a communication socket for the user’s callback function for logging. If the user is utilizing the default callback function, or doesn’t want to use this input in the customized callback function, they may pass in NULL.

fptr

Input. A pointer to a user-supplied callback function. When NULL is passed to this pointer, then cuDNN switches back to the built-in default callback function. The user-supplied callback function prototype must be similar to the following (also defined in the header file):

void customizedLoggingCallback (cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg);

The structure cudnnDebug_t is defined in the header file. It provides the metadata, such as time, time since start, stream ID, process and thread ID, that the user may choose to print or store in their customized callback.

The variable msg is the logging message generated by cuDNN. Each line of this message is terminated by \0, and the end of the message is terminated by \0\0. Users may select what is necessary to show in the log, and may reformat the string.

Returns

CUDNN_STATUS_SUCCESS: The function launched successfully.

cudnnSetStream()#

This function sets the user’s CUDA stream in the cuDNN handle. The new stream will be used to launch cuDNN GPU kernels or to synchronize to this stream when cuDNN kernels are launched in the internal streams. If the cuDNN library stream is not set, all kernels use the default (NULL) stream. Setting the user stream in the cuDNN handle guarantees the issue-order execution of cuDNN calls and other GPU kernels launched in the same stream.

cudnnStatus_t cudnnSetStream(
    cudnnHandle_t   handle,
    cudaStream_t    streamId)

With CUDA 11.x or later, internal streams have the same priority as the stream set by the last call to this function. In CUDA graph capture mode, CUDA 11.8 or later is required in order for the stream priorities to match.

Parameters

handle: Input. Pointer to the cuDNN handle.
streamId: Input. New CUDA stream to be written to the cuDNN handle.

Returns

CUDNN_STATUS_BAD_PARAM: Invalid (NULL) handle.
CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCH: Mismatch between the user stream and the cuDNN handle context.
CUDNN_STATUS_NOT_SUPPORTED: The stream priority is out of range.
CUDNN_STATUS_INTERNAL_ERROR: CUDA stream APIs reported further errors inside cuDNN.
CUDNN_STATUS_SUCCESS: The new stream was set successfully.

Backend Descriptor Types #

This section enumerates all valid attributes of various descriptors.

CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR, &desc); the cuDNN backend convolution descriptor specifies the parameters for a convolution operator for both forward and backward propagation: compute data type, convolution mode, filter dilation and stride, and padding on both sides.

Attributes

Attributes of a cuDNN backend convolution descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_CONVOLUTION_:

CUDNN_ATTR_CONVOLUTION_COMP_TYPE

The compute type of the convolution operator.

CUDNN_TYPE_DATA_TYPE; one element.

Required attribute.

CUDNN_ATTR_CONVOLUTION_MODE

Convolution or cross-correlation mode.

CUDNN_TYPE_CONVOLUTION_MODE; one element.

Required attribute.

CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS

The number of spatial dimensions, expected array length for each of dilations, filter strides, and padding arrays.

CUDNN_TYPE_INT64; one element.

Required attribute.

CUDNN_ATTR_CONVOLUTION_DILATIONS

Filter dilation.

CUDNN_TYPE_INT64; one or more, but at most CUDNN_MAX_DIMS elements.

Required attribute.

CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES

Filter stride.

CUDNN_TYPE_INT64; one or more, but at most CUDNN_MAX_DIMS elements.

Required attribute.

CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS

Padding at the beginning of each spatial dimension.

CUDNN_TYPE_INT64; one or more, but at most CUDNN_MAX_DIMS elements.

Required attribute.

CUDNN_ATTR_CONVOLUTION_POST_PADDINGS

Padding at the end of each spatial dimension.

CUDNN_TYPE_INT64; one or more, but at most CUDNN_MAX_DIMS elements.

Required attribute.

Finalization

cudnnBackendFinalize() with a CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR can have the following return values:

CUDNN_STATUS_BAD_PARAM: An elementCount argument for setting CUDNN_ATTR_CONVOLUTION_DILATIONS, CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES, CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS, or CUDNN_ATTR_CONVOLUTION_POST_PADDINGS is not equal to the value set for CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS.
CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_ENGINE_DESCRIPTOR #

Created with descriptor type value CUDNN_BACKEND_ENGINE_DESCRIPTOR, cuDNN backend engine descriptor describes an engine to compute an operation graph. An engine is a grouping of kernels with similar compute and numerical attributes.

Attributes

Attributes of a cuDNN backend engine descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_ENGINE_:

CUDNN_ATTR_ENGINE_OPERATION_GRAPH

The operation graph to compute.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_ENGINE_GLOBAL_INDEX

The index for the engine.

CUDNN_TYPE_INT64; one element.

Valid values are between 0 and CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT-1.

Required attribute.

CUDNN_ATTR_ENGINE_KNOB_INFO

The descriptors of performance knobs of the engine.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR.

Read-only attribute.

CUDNN_ATTR_ENGINE_NUMERICAL_NOTE

The numerical attributes of the engine.

CUDNN_TYPE_NUMERICAL_NOTE; zero or more elements.

Read-only attribute.

CUDNN_ATTR_ENGINE_LAYOUT_INFO

The preferred tensor layouts of the engine.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR.

Read-only attribute.

CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE

The behavior attributes of the engine.

CUDNN_TYPE_BEHAVIOR_NOTE; zero or more elements.

Read-only attribute.

CUDNN_ATTR_ENGINE_SM_COUNT_TARGET

The number of SMs to target.

CUDNN_TYPE_INT32; one element.

Valid values are between 0 and the number of SMs on the device, where 0 is default meaning all the SMs will be used.

Optional attribute.

CUDNN_ATTR_ENGINE_DEVICEPROP

The descriptor of the device that this engine descriptor targets.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR.

Optional attribute.

Finalization

CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.
CUDNN_STATUS_NOT_SUPPORTED: The descriptor attribute set is not supported by the current version of cuDNN. For example, the value of CUDNN_ATTR_ENGINE_GLOBAL_INDEX is not in a valid range.
CUDNN_STATUS_BAD_PARAM: The descriptor attribute set is inconsistent or in an unexpected state. For example, the operation graph descriptor set is not already finalized.

CUDNN_BACKEND_ENGINECFG_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &desc); the cuDNN backend engine configuration descriptor consists of an engine descriptor and an array of knob choice descriptors. Users can query from engine config information about intermediates: computational intermediate results that can be reused between executions.

Attributes

CUDNN_ATTR_ENGINECFG_ENGINE

The backend engine.

CUDNN_TYPE_BACKEND_DESCRIPTOR: one element, a backend descriptor of type CUDNN_BACKEND_ENGINE_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_ENGINECFG_KNOB_CHOICES

The engine tuning knobs and choices.

CUDNN_TYPE_BACKEND_DESCRIPTOR: zero or more elements, backend descriptors of type CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR.

CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO

Information of the computational intermediate of this engine config.

CUDNN_TYPE_BACKEND_DESCRIPTOR: one element, a backend descriptor of type CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR.

Read-only attribute.

Currently unsupported. Placeholder for future implementation.

CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE

The size of the workspace buffer required to execute this engine config.

CUDNN_TYPE_INT64; one element.

Read-only attribute.

Finalization

CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.
CUDNN_STATUS_NOT_SUPPORTED: The descriptor attribute set is not supported by the current version of cuDNN. For example, the value of a knob choice is not supported.

CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR, &desc); the cuDNN backend engine heuristics descriptor allows users to obtain, for an operation graph, engine configuration descriptors ranked by performance according to cuDNN’s heuristics.

Attributes

CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH

The operation graph for which the heuristics result is queried.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element.

Required attribute.

CUDNN_ATTR_ENGINEHEUR_MODE

The heuristic mode to query the result.

CUDNN_TYPE_HEUR_MODE; one element.

Required attribute.

CUDNN_ATTR_ENGINEHEUR_RESULTS

The result of the heuristics query.

CUDNN_TYPE_BACKEND_DESCRIPTOR; zero or more elements of descriptor type CUDNN_BACKEND_ENGINECFG_DESCRIPTOR.

Get-only attribute.

CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGET

The number of SMs to target.

CUDNN_TYPE_INT32; one element.

Valid values are between 0 and the number of SMs on the device, where 0 is default meaning all the SMs will be used.

Optional attribute.

CUDNN_ATTR_ENGINEHEUR_DEVICEPROP

The descriptor of the device that this engine heuristics descriptor targets.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR.

Optional attribute.

Finalization

Return values of cudnnBackendFinalize(desc) where desc is a cuDNN backend engine heuristics descriptor:

CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, &desc); the cuDNN backend execution plan descriptor allows the user to specify an execution plan, which consists of a cuDNN handle, an engine configuration, and optionally an array of intermediates to compute.

Attributes

CUDNN_ATTR_EXECUTION_PLAN_HANDLE

A cuDNN handle.

CUDNN_TYPE_HANDLE; one element.

Required attribute.

CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG

An engine configuration to execute.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_ENGINECFG_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS

Unique identifiers of intermediates to compute.

CUDNN_TYPE_INT64; zero or more elements.

Optional attribute. If set, the execution plan will only compute the specified intermediate and not any of the output tensors on the operation graph in the engine configuration.

CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS

Unique identifiers of precomputed intermediates.

CUDNN_TYPE_INT64; zero or more elements.

Optional attribute. If set, the plan will expect and use pointers for each intermediate in the variant pack descriptor during execution.

Currently unsupported. Placeholder for future implementation.

CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE

The size of the workspace buffer required to execute this plan.

CUDNN_TYPE_INT64; one element.

Read-only attribute.

CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION

The JSON representation of the serialized execution plan. Serialization and deserialization can be done by getting and setting this attribute, respectively.

CUDNN_TYPE_CHAR; many elements, the same amount as the size of a null-terminated string of the json representation of the execution plan.

CUDNN_ATTR_EXECUTION_PLAN_KERNEL_CACHE

The kernel cache that the execution plan can refer to in order to accelerate the finalization for runtime fusion engines by reusing a previously compiled identical kernel implementation.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR.

CUDNN_ATTR_EXECUTION_PLAN_DEVICEPROP

The descriptor of the device that this execution plan targets.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR.

Optional attribute.

Finalization

Return values of cudnnBackendFinalize(desc) where desc is a cuDNN backend execution plan descriptor:

CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR, &desc); the cuDNN backend intermediate descriptor is a read-only descriptor that contains information about an execution intermediate. An execution intermediate is some intermediate computation for an engine config in device memory that can be reused between plan executions to amortize the kernel. Each intermediate is identified by a unique ID. Users can query for the device memory size of the intermediate. An intermediate can depend on the data of one or more tensors identified by the tensor UIDs or on one or more attributes of the operation graph.

This is a read-only descriptor. Users cannot set the descriptor attributes or finalize the descriptor. Users query for a finalized descriptor from an engine config descriptor.

Attributes

CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID

A unique identifier of the intermediate.

CUDNN_TYPE_INT64; one element.

Read-only attribute.

CUDNN_ATTR_INTERMEDIATE_INFO_SIZE

The required device memory size for the intermediate.

CUDNN_TYPE_INT64; one element.

Read-only attribute.

CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS

UID of tensors on which the intermediate depends.

CUDNN_TYPE_INT64; zero or more elements.

Read-only attribute.

CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES

Currently unsupported. Placeholder for future implementation.

Finalization

User does not finalize this descriptor. cudnnBackendFinalize(desc) with a backend intermediate descriptor returns CUDNN_STATUS_NOT_SUPPORTED.

CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR, &desc); the cuDNN backend knob choice descriptor consists of the type of knobs to be set and the value to which the knob is set.

Attributes

CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE

The type of knobs to be set.

CUDNN_TYPE_KNOB_TYPE: one element.

Required attribute.

CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE

The value of the knobs to be set.

CUDNN_TYPE_INT64: one element.

Required attribute.

Finalization

Return values of cudnnBackendFinalize(desc) where desc is a cuDNN backend knob choice descriptor:

CUDNN_STATUS_SUCCESS: The knob choice descriptor was finalized successfully.

CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR, &desc); the cuDNN backend knob info descriptor consists of the type and valid value range of an engine performance knob. Valid value range is given in terms of minimum, maximum, and stride of valid values. This is a purely informative descriptor type. Setting descriptor attributes is not supported. User obtains an array of finalized descriptors, one for each knob type, from a finalized backend descriptor.

Attributes

CUDNN_ATTR_KNOB_INFO_TYPE

The type of the performance knob.

CUDNN_TYPE_KNOB_TYPE: one element.

Read-only attribute.

CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE

The largest valid choice value for this knob.

CUDNN_TYPE_INT64: one element.

Read-only attribute.

CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE

The smallest valid choice value for this knob.

CUDNN_TYPE_INT64: one element.

Read-only attribute.

CUDNN_ATTR_KNOB_INFO_STRIDE

The stride of valid choice values for this knob.

CUDNN_TYPE_INT64: one element.

Read-only attribute.

Finalization

This descriptor is read-only; it is retrieved and finalized from a cuDNN backend engine configuration descriptor. Users cannot set its attributes or finalize it.

CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR #

Created with descriptor type value CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR, cuDNN backend layout info descriptor provides information on the preferred layout for a tensor.

Attributes

CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID

The UID of the tensor.

CUDNN_TYPE_INT64; one element.

Read-only attribute.

CUDNN_ATTR_LAYOUT_INFO_TYPES

The preferred layout of the tensor.

CUDNN_TYPE_LAYOUT_TYPE: zero or more elements of type cudnnBackendLayoutType_t.

Read-only attribute.

Finalization

This descriptor is read-only; it is retrieved and finalized from a cuDNN backend engine configuration descriptor. Users cannot set its attribute or finalize it.

CUDNN_BACKEND_MATMUL_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_MATMUL_DESCRIPTOR, &desc); the cuDNN backend matmul descriptor specifies any metadata needed for the matmul operation.

Attributes

CUDNN_ATTR_MATMUL_COMP_TYPE

The compute precision used for the matmul operation.

CUDNN_TYPE_DATA_TYPE; one element.

Required attribute.

Finalization

Return values of cudnnBackendFinalize(desc) where desc is a cuDNN backend matmul descriptor:

CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR, &desc); the cuDNN backend concatenation operation descriptor specifies an operation node for concatenating a given vector of tensors along a given concatenation axis.

This operation also supports an in-place mode, where one of the input tensors is already assumed to be at the correct location in the output tensor, that is, they share the same device buffer.

Attributes

Attributes of a cuDNN backend concat operation descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_CONCAT_:

CUDNN_ATTR_OPERATION_CONCAT_AXIS

The dimension which tensors are being concatenated over.

Type: CUDNN_TYPE_INT64

Required attribute.

CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS

A vector of input tensor descriptors, which are concatenated in the same order as provided in this vector.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one or more elements of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX

The index of input tensor in the vector of input tensor descriptors that is already present in-place in the output tensor.

Type: CUDNN_TYPE_INT64

Optional attribute.

CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC

The output tensor descriptor for the result from concatenation of input tensors.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

Finalization

cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR can have the following return values:

CUDNN_STATUS_BAD_PARAM

Invalid or inconsistent attribute values are encountered. Some examples include:

The tensors involved in the operation should have the same shape in all dimensions except the dimension that they are being concatenated over.

The output tensor shape in the concatenating dimension should equal the sum of tensor shape of all input tensors in that same dimension.

Concatenation axis should be a valid tensor dimension.

If provided, the in-place input tensor index should be a valid index in the vector of input tensor descriptors.

CUDNN_STATUS_SUCCESS

The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR, &desc); the cuDNN backend convolution backward data operation descriptor specifies an operation node for convolution backward data to compute the gradient of input data dx with filter tensor w and gradient of response dy with output \(\alpha\) scaling and residue add with \(\beta\) scaling. That is, the equation: \(dx=\alpha\left( w\bar{*}dy \right)+\beta dx\) where \(\bar{*}\) denotes the convolution backward data operator.

Attributes

Attributes of a cuDNN backend convolution descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_:

CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA

The alpha value.

CUDNN_TYPE_FLOAT or CUDNN_TYPE_DOUBLE; one or more elements.

Required attribute.

CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA

The beta value.

CUDNN_TYPE_FLOAT or CUDNN_TYPE_DOUBLE; one or more elements.

Required attribute.

CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC

The convolution operator descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W

The convolution filter tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX

The image gradient tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY

The response gradient tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

Finalization

In finalizing the convolution operation, the tensor dimensions of the tensor DX, W, and DY are bound based on the same interpretations as the X, W, and Y tensor dimensions described in the CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR section.

cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR can have the following return values:

CUDNN_STATUS_BAD_PARAM: Invalid or inconsistent attribute values are encountered. For example, the DX, W, and DY tensors do not constitute a valid convolution operation under the convolution operator.
CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR, &desc); the cuDNN backend convolution backward filter operation descriptor specifies an operation node for convolution backward filter to compute the gradient of filter dw with image tensor x and gradient of response dy with output \(\alpha\) scaling and residue add with \(\beta\) scaling. That is, the equation: \(dw=\alpha\left( x\tilde{*}dy \right)+\beta dw\) where \(\tilde{*}\) denotes the convolution backward filter operator.

Attributes

Attributes of a cuDNN backend convolution descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_:

CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA

The alpha value.

CUDNN_TYPE_FLOAT or CUDNN_TYPE_DOUBLE; one or more elements.

Required attribute. Required to be set before finalization.

CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA

The beta value.

CUDNN_TYPE_FLOAT or CUDNN_TYPE_DOUBLE; one or more elements.

Required attribute. Required to be set before finalization.

CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC

The convolution operator descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR.

Required attribute. Required to be set before finalization.

CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW

The convolution filter gradient tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute. Required to be set before finalization.

CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X

The image tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute. Required to be set before finalization.

CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY

The response gradient tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute. Required to be set before finalization.

Finalization

In finalizing the convolution operation, the tensor dimensions of the tensor X, DW, and DY are bound based on the same interpretations as the X, W, and Y tensor dimensions described in the CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR section.

cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR can have the following return values:

CUDNN_STATUS_BAD_PARAM: Invalid or inconsistent attribute values are encountered. For example, the X, DW, and DY tensors do not constitute a valid convolution operation under the convolution operator.
CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, &desc); the cuDNN backend convolution forward operation descriptor specifies an operation node for forward convolution to compute the response tensor y of image tensor x convoluted with filter tensor w with output scaling \(\alpha\) and residual add with \(\beta\) scaling. That is, the equation: \(y=\alpha\left( w*x \right)+\beta y\) where \(*\) is the convolution operator in the forward direction.

Attributes

Attributes of a cuDNN backend convolution descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_:

CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA

The alpha value.

CUDNN_TYPE_FLOAT or CUDNN_TYPE_DOUBLE; one or more elements.

Required to be set before finalization.

CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA

The beta value.

CUDNN_TYPE_FLOAT or CUDNN_TYPE_DOUBLE; one or more elements.

Required attribute.

CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC

The convolution operator descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W

The convolution filter tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X

The image tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y

The response tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

Finalization

In finalizing the convolution operation, the tensor dimensions of the tensor X, W, and Y are bound based on the following interpretations:

The CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS attribute of CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC is the number of spatial dimensions of the convolution. The number of dimensions for tensor X, W, and Y must be larger than the number of spatial dimensions by 2 or 3 depending on how users choose to specify the convolution tensors.

If the number of tensor dimensions is the number of spatial dimensions plus 2:

X tensor dimension and stride arrays are [N, GC, …]

W tensor dimension and stride arrays are [GK, C, …]

Y tensor dimension and stride arrays are [N, GK, …]

Where the ellipsis … is shorthand for the spatial dimensions of each tensor, G is the number of convolution groups, and C and K are the number of input and output feature maps per group. In this interpretation, it is assumed that the memory layout for each group is packed. cudnnBackendFinalize() asserts that the tensor dimensions and strides are consistent with this interpretation or it returns CUDNN_STATUS_BAD_PARAM.

cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR can have the following return values:

CUDNN_STATUS_BAD_PARAM: Invalid or inconsistent attribute values are encountered. For example, the X, W, and Y tensors do not constitute a valid convolution operation under the convolution operator.
CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR #

Represents an operation that will generate per-channel statistics. The specific statistics that will be generated depends on the CUDNN_ATTR_OPERATION_GENSTATS_MODE attribute in the descriptor. Currently, only CUDNN_GENSTATS_SUM_SQSUM is supported for the CUDNN_ATTR_OPERATION_GENSTATS_MODE. It will generate the sum and quadratic sum of per-channel elements of the input tensor x. The output dimension should be all 1 except the C dimension. Also, the C dimension of outputs should equal the C dimension of the input. This opaque struct can be created with cudnnBackendCreateDescriptor() (CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR).

Attributes

CUDNN_ATTR_OPERATION_GENSTATS_MODE: Sets the CUDNN_TYPE_GENSTATS_MODE of the operation. This attribute is required.
CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC: The math precision of the computation. This attribute is required.
CUDNN_ATTR_OPERATION_GENSTATS_XDESC: Sets the descriptor for the input tensor X. This attribute is required.
CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC: Sets the descriptor for the output tensor sum. This attribute is required.
CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC: Sets the descriptor for the output tensor quadratic sum. This attribute is required.

Finalization

In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:

CUDNN_STATUS_BAD_PARAM

Invalid or inconsistent attribute values are encountered. Some examples include:

The number of dimensions do not match between the input and output tensors.

The input/output tensor dimensions do not agree with the above description.

CUDNN_STATUS_SUCCESS

The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR, &desc); the cuDNN backend matmul operation descriptor specifies an operation node for matmul to compute the matrix product C by multiplying Matrix A and Matrix B, as shown in the following equation: C=AB

When using the matmul operation, the matrices are expected to be at least rank-2 tensors. The last two dimensions are expected to correspond to either M, K or N. All the preceding dimensions are interpreted as batch dimensions. If there are zero batch dimensions then the requirements are as follows:

`CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR` for Zero Batch Dimensions#
Case	Matrix A	Matrix B	Matrix C
Single Matmul	M x K	K x N	M x N

For a single batch dimension we have the following requirements:

`CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR` for a Single Batch Dimension#
Case	Matrix A	Matrix B	Matrix C
Single Matmul	1 x M x K	1 x K x N	1 x M x N
Batch Matmul	B x M x K	B x K x N	B x M x N
Broadcast A	(B/c) x M x K	B x K x N	B x M x N
Broadcast B	B x M x K	(B/c) x K x N	B x M x N

Where:

B indicates the batch size

M is the number of rows of the Matrix A

K is the number of columns of the input Matrix A (which is the same as the number of rows of the input Matrix B)

N is the number of columns of the input Matrix B

c is a constant integer and a factor of B

If either the batch size of Matrix A or B is set to B/c, this indicates that the matrix will be broadcasted in the batch matmul. The resulting output Matrix C will be a tensor of B x M x N.

The above broadcasting convention is extended to all the batch dimensions. Concretely, for tensors with three batch dimensions:

`CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR` for a Three Batch Dimension#
Case	Matrix A	Matrix B	Matrix C
Multiple Batched Matmul	B1 x 1 x B3 x M x K	1 x B2 x (B3/c) x K x N	B1 x B2 x B3 x M x N

The functionality of having multiple batch dimensions allows you to have layouts where the batch is not packed at a single stride. This case is especially seen in multihead attention. c is only allowed to be B (leading to a batch dimension of 1) for matmul and matmul fusions. The other possible values of c are supported for Grouped Query Attention in the cuDNN Fused Flash Attention.

The addressing of the matrix elements from a given tensor can be specified using strides in the tensor descriptor. The strides represent the spacing between elements for each tensor dimension. Considering a matrix tensor A (B x M x N) with strides [BS, MS, NS], it indicates that the actual matrix element A[x, y, z] is found at (A_base_address + x * BS + y * MS + z * NS) from the linear memory space allocated for tensor A. With our current support, the innermost dimension must be packed, which requires either MS=1 or NS=1. Otherwise, there are no other technical constraints with regard to how the strides can be specified in a tensor descriptor as it should follow the aforementioned addressing formula and the strides as specified by the user.

This representation provides support for some common usages, such as leading dimension and matrix transpose as we will explain through the following examples.

The most basic case is a fully packed row-major batch matrix, without any consideration of leading dimension or transpose. In this case, BS = M*N, MS = N, and NS = 1.
Matrix transpose can be achieved by exchanging the inner and outer dimensions using strides. Namely:
1. To specify a non-transposed matrix: BS = M*N, MS = N, and NS = 1.
2. To specify matrix transpose: BS = M*N, MS = 1, and NS = M.
Leading dimension, a widely used concept in BLAS-like APIs, describes the inner dimension of the 2D array memory allocation (as opposed to the conceptual matrix dimension). It resembles the stride in that it defines the spacing between elements in the outer dimension. The most typical use cases where it differs from the matrix inner dimension are when the matrix is only part of the data in the allocated memory, when addressing submatrices, or when addressing matrices from an aligned memory allocation. Therefore, the leading dimension LDA in a column-major matrix A must satisfy LDA >= M, whereas in a row-major matrix A, it must satisfy LDA >= N. Transitioning from the leading dimension concept to using strides entails MS >= N and NS = 1, or MS = 1 and NS >= M. Keep in mind that, while these are some practical use cases, these inequalities do not impose technical constraints with respect to an acceptable specification of the strides.

Other commonly used GEMM features, such as alpha/beta output blending, can also be achieved using this matmul operation along with other pointwise operations.

Attributes

Attributes of a cuDNN backend matmul descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_MATMUL_:

CUDNN_ATTR_OPERATION_MATMUL_ADESC

The Matrix A descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_MATMUL_BDESC

The Matrix B descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_MATMUL_CDESC

The Matrix C descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT

Number of matmul operations to perform in the batch on matrix.

CUDNN_TYPE_INT64; one element.

Default value is 1.

CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC

The tensor gemm_m_override descriptor. Allows you to override the M dimension of a batch matmul through this tensor. It is only supported as documented in the Fused Attention fprop, Fused Attention bprop, Fused Flash Attention fprop, and Fused Flash Attention bprop sections.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC

The tensor gemm_n_override descriptor. Allows you to override the N dimension of a batch matmul through this tensor. It is only supported as documented in the Fused Attention fprop, Fused Attention bprop, Fused Flash Attention fprop, and Fused Flash Attention bprop sections.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC

The tensor gemm_k_override descriptor. Allows you to override the K dimension of a batch matmul through this tensor. It is only supported as documented in the Fused Attention fprop, Fused Attention bprop, Fused Flash Attention fprop, and Fused Flash Attention bprop sections.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_MATMUL_DESC

The matmul operation descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_MATMUL_DESCRIPTOR.

Required attribute.

Finalization

In the finalization of the matmul operation, the tensor dimensions of the Matrices A, B, and C will be checked to ensure that they satisfy the requirements of matmul.

cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR can have the following return values:

CUDNN_STATUS_NOT_SUPPORTED

An unsupported attribute value was encountered. For example, if not all of the Matrices A, B, and C are at least rank-2 tensors.

CUDNN_STATUS_BAD_PARAM

Invalid or inconsistent attribute values are encountered. Some examples include:

The CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT specified is a negative value.

The CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT and one or more of the batch sizes of the Matrices A, B, and C are not equal to one. That is to say there is a conflict where both irregularly and regularly strided batched matmul are specified, which is not a valid use case.

The dimensions of the Matrices A, B, and C do not satisfy the matmul requirements.

CUDNN_STATUS_SUCCESS

The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR, &desc); the cuDNN backend normalization backward operation specifies a node for a backward normalization that takes as input the gradient tensor dY and outputs the gradient tensor dX and weight gradients dScale and dBias. The normalization mode is set using the CUDNN_ATTR_OPERATION_NORM_BWD_MODE attribute.

Limitations

Does not support CUDNN_GROUP_NORM mode.

Supported Configurations for `CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR`#
`CUDNN_LAYER_NORM`	`CUDNN_INSTANCE_NORM`	`CUDNN_BATCH_NORM`	`CUDNN_GROUP_NORM`	`CUDNN_RMS_NORM`
Yes	Yes	Yes	No	Yes

Note

In addition to single GPU, CUDNN_BATCH_NORM also supports single node multi-GPU batch norm, while other normalization modes only support running on a single GPU. For more information, refer to the DReluForkDNorm pattern.

Attributes

CUDNN_ATTR_OPERATION_NORM_BWD_MODE

Chooses the normalization mode for the norm backward operation.

CUDNN_TYPE_NORM_MODE; one element.

Required attribute.

CUDNN_ATTR_OPERATION_NORM_BWD_XDESC

Input tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC

Saved mean input tensor descriptor for reusing the mean computed during the forward computation of the training phase.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC

Saved inverse variance input tensor descriptor for reusing the inverse variance computed during the forward computation of the training phase.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC

Gradient tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC

Normalization scale descriptor. Note that the bias descriptor is not necessary for the backward pass.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC

Scalar input tensor descriptor for the epsilon value. The epsilon values are needed only if the saved mean and variances are not passed as inputs to the operation. Note that the attribute CUDNN_ATTR_TENSOR_IS_BY_VALUE of this descriptor should be set to true.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC

Scale gradient tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC

Bias gradient tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC

Input gradient tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS

Vector of tensor descriptors for the communication buffers used in multi-GPU normalization. Typically, one buffer is provided for every GPU in the node. This is an optional attribute only used for multi-GPU tensor stats reduction.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one or more elements of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

Finalization

In the finalization stage, the attributes are checked to ensure there are no conflicts.

CUDNN_STATUS_BAD_PARAM

Invalid or inconsistent attribute values are encountered. Some examples include:

The tensor dimensions of the gradient tensors dY, dX, and input tensor X, do not match.

The channel count C for the mean, scale, and inv_variance tensors does not match.

CUDNN_STATUS_SUCCESS

The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR, &desc); the cuDNN backend normalization forward operation specifies a node for a forward normalization that takes as input a tensor X and produces a normalized output Y with the normalization mode set by the CUDNN_ATTR_OPERATION_NORM_FWD_MODE attribute. The operation supports optional running stats computation and allows for storing the computed means and variances for reuse in the backwards calculation depending on the setting of the CUDNN_ATTR_OPERATION_NORM_FWD_PHASE attribute.

Limitations

Does not support CUDNN_GROUP_NORM mode.
Batch norm only supports forward training and not forward inference.

Supported Configurations for `CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR`#
`CUDNN_ATTR_OPERATION_NORM_FWD_PHASE`	`CUDNN_LAYER_NORM`	`CUDNN_INSTANCE_NORM`	`CUDNN_BATCH_NORM`	`CUDNN_GROUP_NORM`	`CUDNN_RMS_NORM`
`CUDNN_NORM_FWD_TRAINING`	Yes	Yes	Yes	No	Yes
`CUDNN_NORM_FWD_INFERENCE`	Yes	Yes	No	No	Yes

Note

In addition to single-GPU, batch normalization supports running on single node multi-GPUs, while other normalization modes only support running on a single GPU. For more information, refer to the NormAddRelu pattern.

Attributes

CUDNN_ATTR_OPERATION_NORM_FWD_MODE

Chooses the normalization mode for the norm forward operation.

CUDNN_TYPE_NORM_MODE; one element.

Required attribute.

CUDNN_ATTR_OPERATION_NORM_FWD_PHASE

Selects the training or inference phase for the norm forward operation.

CUDNN_TYPE_NORM_FWD_PHASE; one element.

Required attribute.

CUDNN_ATTR_OPERATION_NORM_FWD_XDESC

Input tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC

Estimated mean input tensor descriptor for the inference phase and the computed mean output tensor descriptor for the training phase.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC

Estimated inverse variance input tensor descriptor for the inference phase and the computed inverse variance output tensor descriptor for the training phase.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC

Normalization scale input tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC

Normalization bias input tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC

Scalar input tensor descriptor for the epsilon value used in normalization calculation. Note that the attribute CUDNN_ATTR_TENSOR_IS_BY_VALUE of this descriptor should be set to true.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC

Scalar input tensor descriptor for the exponential average factor value used in running stats computation.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC

Input running mean tensor descriptor for the running stats computation in the training phase.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC

Input running variance tensor descriptor for the running stats computation in the training phase.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC

Output running mean tensor descriptor for the running stats computation in the training phase.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC

Output running variance tensor descriptor for the running stats computation in the training phase.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_NORM_FWD_YDESC

Tensor descriptor for the output of the normalization operation.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS

Vector of tensor descriptors for the communication buffers used in multi-GPU normalization. Typically, one buffer is provided for every GPU in the node. This is an optional attribute only used for multi-GPU tensor stats reduction.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one or more elements of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

Finalization

In the finalization stage, the attributes are checked to ensure there are no conflicts.

CUDNN_STATUS_BAD_PARAM

Invalid or inconsistent attribute values are encountered. Some examples include:

The output tensor dimensions do not match the input tensor dimensions.

The channel count C for the mean, scale, bias, and inv_variance tensors does not match.

CUDNN_STATUS_SUCCESS

The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR #

Represents a pointwise operation that implements the equation Y = op(alpha1 * X) or Y = op(alpha1 * X, alpha2 * B) depending on the operation type. The actual type of operation represented by op() above depends on the CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR attribute in the descriptor. This operation descriptor supports single-output operations with one, two, or three inputs, depending on the pointwise mode.

For a list of supported operations, refer to the cudnnPointwiseMode_t section.

For dual-input pointwise operations, broadcasting is assumed when a tensor dimension in one of the tensors is 1 while the other tensor's corresponding dimension is not 1.

For three-input single-output pointwise operations, we do not support broadcasting in any tensor.

This opaque struct can be created with cudnnBackendCreateDescriptor() (CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR).

Attributes

CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR: Sets the descriptor containing the mathematical settings of the pointwise operation. This attribute is required.
CUDNN_ATTR_OPERATION_POINTWISE_XDESC: Sets the descriptor for the input tensor X. This attribute is required for pointwise mathematical functions or activation forward propagation computations.
CUDNN_ATTR_OPERATION_POINTWISE_BDESC: If the operation requires two inputs, such as add or multiply, this attribute sets the second input tensor B. If the operation requires only 1 input, this field is not used and should not be set.
CUDNN_ATTR_OPERATION_POINTWISE_YDESC: Sets the descriptor for the output tensor Y. This attribute is required for pointwise mathematical functions or activation forward propagation computations.
CUDNN_ATTR_OPERATION_POINTWISE_TDESC: Sets the descriptor for the tensor T. This attribute is required for CUDNN_ATTR_POINTWISE_MODE set to CUDNN_POINTWISE_BINARY_SELECT and acts as the mask based on which the selection is done.
CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1: Sets the scalar alpha1 value in the equation. Can be in float or half. This attribute is optional, if not set, the default value is 1.0.
CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2: If the operation requires 2 inputs, such as add or multiply, this attribute sets the scalar alpha2 value in the equation. Can be in float or half. This attribute is optional; if not set, the default value is 1.0. If the operation requires only 1 input, this field is not used and should not be set.
CUDNN_ATTR_OPERATION_POINTWISE_DXDESC: Sets the descriptor for the output tensor dX. This attribute is required for pointwise activation back propagation computations.
CUDNN_ATTR_OPERATION_POINTWISE_DYDESC: Sets the descriptor for the input tensor dY. This attribute is required for pointwise activation back propagation computations.

Finalization

In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:

CUDNN_STATUS_BAD_PARAM

Invalid or inconsistent attribute values are encountered. Some examples include:

The number of dimensions do not match between the input and output tensors.

The input/output tensor dimensions do not agree with the above described automatic broadcasting rules.

CUDNN_STATUS_SUCCESS

The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR #

The cuDNN backend reduction operation descriptor represents an operation node that implements reducing values of an input tensor X in one or more dimensions to get an output tensor Y. The math operation and compute data type used for reducing tensor values is specified via CUDNN_ATTR_OPERATION_REDUCTION_DESC.

This operation descriptor can be created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR, &desc).

The output tensor Y should be the same size as the input tensor X, except in dimensions where its size is 1.

There is a special use case for Grouped Query Attention and Multi Query Attention in cuDNN Fused Flash Attention where some dimensions in the output tensor Y can also be factors of the corresponding dimensions in the input tensor X.

Attributes

Attributes of a cuDNN backend reduction descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATION_REDUCTION_:

CUDNN_ATTR_OPERATION_REDUCTION_XDESC

The matrix X descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_REDUCTION_YDESC

The matrix Y descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_REDUCTION_DESC

The reduction operation descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_REDUCTION_DESCRIPTOR.

Required attribute.

Finalization

In the finalization of the reduction operation, the dimensions of tensors X and Y are checked to ensure that they satisfy the requirements of the reduction operation.

cudnnBackendFinalize() with a CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR can have the following return values:

CUDNN_STATUS_BAD_PARAM: Invalid or inconsistent attribute values are encountered. For example, the dimensions of the tensors X and Y do not satisfy the requirements of the reduction operation.
CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR, &desc); the cuDNN backend resample backward operation descriptor specifies an operation node for backward resampling. It computes the input tensor gradient dx from output tensor gradient dy with backward resampling done according to CUDNN_ATTR_RESAMPLE_MODE with output scaling \(\alpha\) and residual add with \(\beta\) scaling.

Attributes

CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC

Resample operation descriptor (CUDNN_BACKEND_RESAMPLE_DESCRIPTOR) instance contains metadata about the operation.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_RESAMPLE_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC

Input tensor gradient descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC

Output tensor gradient descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC

Tensor containing maxpool or nearest neighbor resampling indices to be used in backprop.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA

Sets the alpha parameter used in blending.

CUDNN_TYPE_DOUBLE or CUDNN_TYPE_FLOAT; one element.

Optional attribute.

Default value is 1.0.

CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA

Sets the beta parameter used in blending.

CUDNN_TYPE_DOUBLE or CUDNN_TYPE_FLOAT; one element.

Optional attribute.

Default value is 0.0.

CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC

Input tensor X descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

Required for NCHW layout.

CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC

Input tensor Y descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

Required for NCHW layout.

Finalization

In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:

CUDNN_STATUS_BAD_PARAM

Invalid or inconsistent attribute values are encountered. Some examples include:

The output shape calculated based on the padding and strides does not match the given output tensor dimensions.

The shape of YDESC and IDXDESC (if given) do not match.

CUDNN_STATUS_SUCCESS

The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR, &desc); the cuDNN backend resample forward operation descriptor specifies an operation node for forward resampling. It computes the output tensor y of image tensor x resampled according to CUDNN_ATTR_RESAMPLE_MODE, with output scaling \(\alpha\) and residual add with \(\beta\) scaling.

The resampling mode acts independently on each spatial dimension. For spatial dimension i, the output spatial dimension size y_i can be calculated by combining input image’s spatial dimension size x_i, post padding post_i, pre padding pre_i, stride s_i, and window size w_i as: y_i = 1+(x_i + post_i + pre_i - w_i) / s_i

Attributes

CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC

Resample operation descriptor (CUDNN_BACKEND_RESAMPLE_DESCRIPTOR) instance contains metadata about the operation.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_RESAMPLE_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC

Input tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC

Output tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC

Tensor containing maxpool or nearest neighbor resampling indices to be used in backprop.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute (primarily used for use cases involving training).

CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA

Sets the alpha parameter used in blending.

CUDNN_TYPE_DOUBLE or CUDNN_TYPE_FLOAT; one element.

Optional attribute.

Default value is 1.0.

CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA

Sets the beta parameter used in blending.

CUDNN_TYPE_DOUBLE or CUDNN_TYPE_FLOAT; one element.

Optional attribute.

Default value is 0.0.

Finalization

In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:

CUDNN_STATUS_BAD_PARAM

Invalid or inconsistent attribute values are encountered. Some examples include:

The output shape calculated based on the padding and strides does not match the given output tensor dimensions.

The shape of the YDESC and IDXDESC (if given) do not match.

CUDNN_STATUS_SUCCESS

The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR, &desc); the cuDNN backend Rng operation descriptor specifies an operation node for generating a tensor with random numbers based on the probability distribution specified in the Rng descriptor.

The random numbers are generated using a Philox random number generator (RNG) as described in PyTorch. The Philox object takes a seed value, a subsequence for starting the generation, and an offset for the subsequence. Seed and offset can be set by using the attributes. The subsequence is set internally to ensure independent random numbers.

Attributes

CUDNN_ATTR_OPERATION_RNG_DESC

Rng descriptor (CUDNN_BACKEND_RNG_DESCRIPTOR) instance containing metadata about the operation.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_RNG_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_RNG_YDESC

Output tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_RNG_SEED

Sets the seed for the random number generator which creates the Y tensor. It can be a host INT64 value or a backend descriptor bound to a value on the device. Only supports a tensor with all dimensions set to 1 and all strides set to 1.

CUDNN_TYPE_INT64; one element or CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

Default value is 0.

CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC

Tensor descriptor for the offset used in the RNG Philox object. Only supports a tensor with all dimensions set to 1 and all strides set to 1.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

Finalization

In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:

CUDNN_STATUS_BAD_PARAM: CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC or CUDNN_ATTR_OPERATION_RNG_SEED do not have all dimensions and strides set to 1.
CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR, &desc); the cuDNN backend paged cache load operation descriptor is used with a fused flash attention fprop graph, and specifies an operation node for reconstructing the k- or v-cache.

The k/v-cache is reconstructed by using a page table tensor to look up the location of a specific sequence ID in a non-contiguous container tensor. Storing a k/v-cache non-contiguously enables efficient memory management by avoiding fragmentation. For more information, refer to the Paged Attention paper.

Attributes

CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_YDESC

Virtual output tensor descriptor, containing the reconstructed k/v-cache.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Datatype: FP16, BF16, or FP8

Required attribute.

CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_CONTAINER_DESC

A non-virtual tensor descriptor with dimensions [num_blocks,H,block_size,D] containing the k/v-cache. The k/v-cache is divided into num_blocks of [H,block_size,D] tensors, where block_size is a parameter chosen by the user. A smaller block_size leads to less fragmentation, but also less parallelism. num_blocks is arbitrary and depends on the size of the allocated k/v-cache.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Datatype: FP16, BF16, or FP8

Required attribute.

CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_PAGE_TABLE_DESC

A non-virtual tensor descriptor of dimensions [B,1,ceil(max_seq_size/block_size),1] pointing to the lookup table.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Datatype: INT32

Required attribute.

CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_SEQUENCE_DESC

A non-virtual [B,1,1,1] tensor descriptor indicating which sequence numbers from the k/v-cache are requested. For each batch, all items from the container will be copied from sequence 0 to sequence number - 1.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Datatype: INT32 or INT64

Sequence numbers are in the interval [1, max_seq_size]

Required attribute.

Finalization

In the finalization stage, the attributes are cross-checked to make sure there are no conflicts. The status below may be returned:

CUDNN_STATUS_BAD_PARAM: Types or dimensions of one or more of the input/output tensors are invalid.
CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR, &desc); the cuDNN backend signal operation descriptor specifies an operation node for updating or waiting on a flag variable. Signaling operations can be used to communicate between cuDNN operation graphs, even with operation graphs in another GPU.

This operation, to connect to other nodes in the graph, also has a pass-through input tensor, which is not operated on and is just passed along to the output tensor. This mandatory pass-through input tensor helps in determining the predecessor node after which the signal operation should be executed. The optional output tensor helps in determining the successor node before which the signal execution should have completed. It is also guaranteed that for a non-virtual tensor as the output tensor, all writes for the tensor will have taken place before the signal value is updated by the operation.

Attributes

CUDNN_ATTR_OPERATION_SIGNAL_MODE

The signaling mode to use.

CUDNN_TYPE_SIGNAL_MODE

Required attribute.

CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC

Flag tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_SIGNAL_VALUE

The scalar value to compare or update the flag variable with.

CUDNN_TYPE_INT64

Required attribute.

CUDNN_ATTR_OPERATION_SIGNAL_XDESC

A pass-through input tensor to enable connecting this signal operation to other nodes in the graph.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_SIGNAL_YDESC

The output tensor for the pass-through input tensor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

Finalization

In the finalization stage, the attributes are cross checked to make sure there are no conflicts. The status below may be returned:

CUDNN_STATUS_BAD_PARAM: Invalid or inconsistent attribute values are encountered.
CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR, &desc); the cuDNN backend bn_finalize statistics operation descriptor specifies an operation node for the batch norm finalize operation.

In ResNet like models, a common technique to fuse batch norm with convolutions would involve splitting the batch norm operation into three parts - genStats, finalize, and apply (pointwise, scale, and bias). The genStats operation is usually fused with the convolution operation that precedes the batch norm while the apply is fused with the ReLU and convolution that follows the batch norm op. The batch norm finalize operation is a buffer op between the two fusions that takes the batch norm scale, bias, sum, and sqsum produced by the genStats operation as inputs and produces an equivalent scale and bias as output. The equivalent scale and bias are then consumed in the apply phase. Additionally, the bn_finalize operation also produces the running stats, mean, and inverse standard deviation as outputs.

Attributes

CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE

Sets inference or training mode for the bn_finalize operation.

CUDNN_TYPE_BN_FINALIZE_STATS_MODE; one element.

Required attribute.

CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC

Math precision of the computation.

Required attribute.

CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC

Input sum tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC

Input square sum tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC

Batch norm input scale tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC

Batch norm input bias tensor descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC

Batch norm input running mean descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC

Batch norm input running variance descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC

Batch norm output running mean descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC

Batch norm output running variance descriptor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC

Batch norm output saved mean tensor descriptor. This is computed from the sum input that’s fed in from the preceding genStats operation. Storing out the saved mean helps avoid recomputation in the backpropagation phase.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC

Batch norm output inverse standard deviation tensor descriptor. This is computed from the sum and sqsum inputs that are fed in from the preceding genStats operation. Storing out the saved inverse standard deviations helps avoid recomputation in the backpropagation phase.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC

Output tensor descriptor for the equivalent scale tensor. The equivalent scale tensor is typically fed as input to the batch norm apply computation (pointwise, scale, and bias) that follows the batch norm finalize operation.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC

Output tensor descriptor for the equivalent bias tensor. The equivalent bias tensor is typically fed as input to the batch norm apply computation (pointwise, scale, and bias) that follows the batch norm finalize operation.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC

Scalar input tensor descriptor representing the number of elements accumulated over while calculating the sum and sqsum inputs. The count usually equals N*H*W in case of batch norm.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC

Scalar input tensor descriptor for the epsilon value used in batch norm variance calculation.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERAGE_FACTOR_DESC

Scalar input tensor descriptor for the exponential average value used in batch norm running stats calculation.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Required attribute.

CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR #

Created with descriptor type value CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, the cuDNN backend operation graph descriptor describes an operation graph, a small network of one or more operations connected by virtual tensors. The operation graph defines the computation case or mathematical expression that the user wishes to compute.

Attributes

Attributes of a cuDNN backend operation graph descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_OPERATIONGRAPH_:

CUDNN_ATTR_OPERATIONGRAPH_HANDLE

A cuDNN handle.

CUDNN_TYPE_HANDLE; one element.

Required attribute.

CUDNN_ATTR_OPERATIONGRAPH_OPS

Operation nodes to form the operation graph.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one or more elements of descriptor type CUDNN_BACKEND_OPERATION_*_DESCRIPTOR.

Required attribute.

CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT

The number of engines to support the operation graph.

CUDNN_TYPE_INT64; one element.

Read-only attribute.

CUDNN_ATTR_OPERATIONGRAPH_ENGINE_SUPPORTED_COUNT

The number of engines that support the operation graph.

CUDNN_TYPE_INT64; one element.

Read-only attribute.

Currently unsupported. Placeholder for future implementation.

CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED

Whether dynamic shape is enabled for the operation graph. The rest of the backend API will treat the graph as a dynamic shape graph and enable this feature.

CUDNN_TYPE_BOOLEAN; one element.

Finalization

CUDNN_STATUS_BAD_PARAM

An invalid attribute value was encountered. Some examples include:

One of the backend descriptors in CUDNN_ATTR_OPERATIONGRAPH_OPS is not finalized.

The value CUDNN_ATTR_OPERATIONGRAPH_HANDLE is not a valid cuDNN handle.

CUDNN_STATUS_NOT_SUPPORTED

An unsupported attribute value was encountered. For example, the combination of operations of attribute CUDNN_ATTR_OPERATIONGRAPH_OPS is not supported.

CUDNN_STATUS_SUCCESS

The descriptor was finalized successfully.

CUDNN_BACKEND_POINTWISE_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_POINTWISE_DESCRIPTOR, &desc); the cuDNN backend pointwise descriptor specifies the parameters for a pointwise operator like mode, math precision, nan propagation, and so on.

Attributes

Attributes of a cuDNN backend pointwise descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_POINTWISE_:

CUDNN_ATTR_POINTWISE_MODE

Mode of the pointwise operation.

CUDNN_TYPE_POINTWISE_MODE; one element.

Required attribute.

CUDNN_ATTR_POINTWISE_MATH_PREC

The math precision of the computation.

CUDNN_TYPE_DATA_TYPE; one element.

Required attribute.

CUDNN_ATTR_POINTWISE_NAN_PROPAGATION

Specifies a method by which to propagate NaNs.

CUDNN_TYPE_NAN_PROPOGATION; one element.

Required only for comparison based pointwise modes, like ReLU.

Current support only includes enum value CUDNN_PROPAGATE_NAN.

Default value is CUDNN_NOT_PROPAGATE_NAN.

CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP

Sets the lower clip value for ReLU. If (value < lower_clip) value = lower_clip + lower_clip_slope * (value - lower_clip):

CUDNN_TYPE_DOUBLE/ CUDNN_TYPE_FLOAT; one element.

Default value is 0.0f.

CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP

Sets the upper clip value for ReLU. If (value > upper_clip) value = upper_clip:

CUDNN_TYPE_DOUBLE/ CUDNN_TYPE_FLOAT; one element.

Default value is Numeric limit max.

CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE

Sets the lower clip slope value for ReLU. If (value < lower_clip) value = lower_clip + lower_clip_slope * (value - lower_clip):

CUDNN_TYPE_DOUBLE/ CUDNN_TYPE_FLOAT; one element.

Default value is 0.0f.

CUDNN_ATTR_POINTWISE_ELU_ALPHA

Sets the alpha value for ELU. If (value < 0.0) value = alpha * (e^value - 1.0):

CUDNN_TYPE_DOUBLE/ CUDNN_TYPE_FLOAT; one element.

Default value is 1.0f.

CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA

Sets the beta value for softplus, which computes value = log (1 + e^(beta * value)) / beta:

CUDNN_TYPE_DOUBLE/ CUDNN_TYPE_FLOAT; one element.

Default value is 1.0f.

CUDNN_ATTR_POINTWISE_SWISH_BETA

Sets the beta value for swish, which computes value = value / (1 + e^(-beta * value)):

CUDNN_TYPE_DOUBLE/ CUDNN_TYPE_FLOAT; one element.

Default value is 1.0f.

CUDNN_ATTR_POINTWISE_AXIS

Sets the axis value for GEN_INDEX. The index will be generated for this axis.

CUDNN_TYPE_INT64; one element.

Default value is -1.

Needs to lie between [0,input_dim_size-1]. For example, if your input has dimensions [N,C,H,W], the axis can be set to anything in [0,3].

Finalization

cudnnBackendFinalize() with a CUDNN_BACKEND_POINTWISE_DESCRIPTOR can have the following return values:

CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_REDUCTION_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_REDUCTION_DESCRIPTOR, &desc); the cuDNN backend reduction descriptor specifies any metadata, including the math operation and compute data type, needed for the reduction operation.

Attributes

CUDNN_ATTR_REDUCTION_OPERATOR

The math operation used for the reduction operation.

CUDNN_TYPE_REDUCTION_OPERATOR_TYPE; one element.

Required attribute.

CUDNN_ATTR_REDUCTION_COMP_TYPE

The compute precision used for the reduction operation.

CUDNN_TYPE_DATA_TYPE; one element.

Required attribute.

Finalization

Return values of cudnnBackendFinalize(desc) where desc is CUDNN_BACKEND_REDUCTION_DESCRIPTOR are:

CUDNN_STATUS_NOT_SUPPORTED: An unsupported attribute value was encountered. For example, CUDNN_ATTR_REDUCTION_OPERATOR is not set to any of CUDNN_REDUCE_TENSOR_ADD, CUDNN_REDUCE_TENSOR_MUL, CUDNN_REDUCE_TENSOR_MIN, or CUDNN_REDUCE_TENSOR_MAX.
CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_RESAMPLE_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_RESAMPLE_DESCRIPTOR, &desc); the cuDNN backend resample descriptor specifies the parameters for a resample operation (upsampling or downsampling) in both forward and backward propagation.

Attributes

CUDNN_ATTR_RESAMPLE_MODE

Specifies mode of resampling, for example, average pool, nearest-neighbor, and so on.

CUDNN_TYPE_RESAMPLE_MODE; one element.

Default value is CUDNN_RESAMPLE_NEAREST.

CUDNN_ATTR_RESAMPLE_COMP_TYPE

Compute data type for the resampling operator.

CUDNN_TYPE_DATA_TYPE; one element.

Default value is CUDNN_DATA_FLOAT.

CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION

Specifies a method by which to propagate NaNs.

CUDNN_TYPE_NAN_PROPAGATION; one element.

Default value is CUDNN_NOT_PROPAGATE_NAN.

CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS

Specifies the number of spatial dimensions to perform the resampling over.

CUDNN_TYPE_INT64; one element.

Required attribute.

CUDNN_ATTR_RESAMPLE_PADDING_MODE

Specifies which values to use for padding.

CUDNN_TYPE_PADDING_MODE; one element.

Default value is CUDNN_ZERO_PAD.

CUDNN_ATTR_RESAMPLE_STRIDES

Stride in each dimension for the kernel or filter.

CUDNN_TYPE_INT64 or CUDNN_TYPE_FRACTION; at most CUDNN_MAX_DIMS - 2.

Required attribute.

CUDNN_ATTR_RESAMPLE_PRE_PADDINGS

Padding added to the beginning of the input tensor in each dimension.

CUDNN_TYPE_INT64 or CUDNN_TYPE_FRACTION; at most CUDNN_MAX_DIMS - 2.

Required attribute.

CUDNN_ATTR_RESAMPLE_POST_PADDINGS

Padding added to the end of the input tensor in each dimension.

CUDNN_TYPE_INT64 or CUDNN_TYPE_FRACTION; at most CUDNN_MAX_DIMS - 2.

Required attribute.

CUDNN_ATTR_RESAMPLE_WINDOW_DIMS

Spatial dimensions of filter.

CUDNN_TYPE_INT64 or CUDNN_TYPE_FRACTION; at most CUDNN_MAX_DIMS - 2.

Required attribute.

Finalization

The return values for cudnnBackendFinalize() when called with a CUDNN_BACKEND_RESAMPLE_DESCRIPTOR are:

CUDNN_STATUS_NOT_SUPPORTED

An unsupported attribute value was encountered. Some examples include:

An elemCount argument for setting CUDNN_ATTR_RESAMPLE_WINDOW_DIMS, CUDNN_ATTR_RESAMPLE_STRIDES, CUDNN_ATTR_RESAMPLE_PRE_PADDINGS, and CUDNN_ATTR_RESAMPLE_POST_PADDINGS is not equal to the value set for CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS.

CUDNN_ATTR_RESAMPLE_MODE is set to CUDNN_RESAMPLE_BILINEAR and any of the CUDNN_ATTR_RESAMPLE_WINDOW_DIMS are not set to 2.

CUDNN_STATUS_SUCCESS

The descriptor was finalized successfully.

CUDNN_BACKEND_RNG_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_RNG_DESCRIPTOR, &desc); the cuDNN backend Rng descriptor specifies any metadata, including the probability distribution that will be used to generate the tensor and the distribution’s corresponding parameters.

Attributes

CUDNN_ATTR_RNG_DISTRIBUTION

The probability distribution used for the rng operation.

CUDNN_TYPE_RNG_DISTRIBUTION; one element.

Default value is CUDNN_RNG_DISTRIBUTION_BERNOULLI.

CUDNN_ATTR_RNG_NORMAL_DIST_MEAN

The mean value for the normal distribution, used if CUDNN_ATTR_RNG_DISTRIBUTION = CUDNN_RNG_DISTRIBUTION_NORMAL.

CUDNN_TYPE_DOUBLE; one element.

Default value is -1.

CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION

The standard deviation value for the normal distribution, used if CUDNN_ATTR_RNG_DISTRIBUTION = CUDNN_RNG_DISTRIBUTION_NORMAL.

CUDNN_TYPE_DOUBLE; one element.

Default value is -1.

Finalization

Return values of cudnnBackendFinalize(desc) where desc is CUDNN_BACKEND_RNG_DESCRIPTOR are:

CUDNN_STATUS_BAD_PARAM

An invalid attribute value was encountered. Some examples include:

If CUDNN_ATTR_RNG_DISTRIBUTION = CUDNN_RNG_DISTRIBUTION_NORMAL and the standard deviation supplied is negative.

If CUDNN_ATTR_RNG_DISTRIBUTION = CUDNN_RNG_DISTRIBUTION_UNIFORM and the maximum value of the range is lower than minimum value.

If CUDNN_ATTR_RNG_DISTRIBUTION = CUDNN_RNG_DISTRIBUTION_BERNOULLI and the probability supplied is negative.

CUDNN_STATUS_SUCCESS

The descriptor was finalized successfully.

CUDNN_BACKEND_TENSOR_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &desc); the cuDNN backend tensor allows users to specify the memory storage of a generic tensor. A tensor is identified by a unique identifier and described by its data type, its data byte-alignment requirements, and the extents and strides of its dimensions. Optionally, a tensor element can be vector in one of its dimensions. A tensor can also be set to be virtual when it is an intermediate variable in a computation graph and not mapped to physical global memory storage.

Attributes

Attributes of a cuDNN backend tensor descriptor are values of enumeration type cudnnBackendAttributeName_t with prefix CUDNN_ATTR_TENSOR_:

CUDNN_ATTR_TENSOR_UNIQUE_ID

An integer that uniquely identifies the tensor.

CUDNN_TYPE_INT64; one element.

Required attribute.

CUDNN_ATTR_TENSOR_DATA_TYPE

Data type of tensor.

CUDNN_TYPE_DATA_TYPE; one element.

Required attribute.

CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT

Byte alignment of pointers for this tensor.

CUDNN_TYPE_INT64; one element.

Required attribute.

CUDNN_ATTR_TENSOR_DIMENSIONS

Tensor dimensions.

CUDNN_TYPE_INT64; at most CUDNN_MAX_DIMS elements.

Required attribute.

CUDNN_ATTR_TENSOR_STRIDES

Tensor strides.

CUDNN_TYPE_INT64; at most CUDNN_MAX_DIMS elements.

Required attribute.

CUDNN_ATTR_TENSOR_VECTOR_COUNT

Size of vectorization.

CUDNN_TYPE_INT64; one element.

Default value is 1.

CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION

Index of the vectorized dimension.

CUDNN_TYPE_INT64; one element.

Required to be set before finalization if CUDNN_ATTR_TENSOR_VECTOR_COUNT is set to a value different than its default; otherwise it’s ignored.

CUDNN_ATTR_TENSOR_IS_VIRTUAL

Indicates whether the tensor is virtual. A virtual tensor is an intermediate tensor in the operation graph that exists only transiently and is not read from or written to global device memory.

CUDNN_TYPE_BOOLEAN; one element.

Default value is false.

CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC

A ragged tensor, that is, a tensor with nested variable length lists as inner dimensions, will have another tensor called the ragged offset descriptor that contains offsets in memory to the next variable length list.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element.

Default value is None.

Finalization

cudnnBackendFinalize() with a CUDNN_BACKEND_TENSOR_DESCRIPTOR can have the following return values:

CUDNN_STATUS_BAD_PARAM

An invalid attribute value was encountered. Some examples include:

Any of the tensor dimensions or strides is not positive.

The value of the tensor alignment attribute is not divisible by the size of the data type.

CUDNN_STATUS_NOT_SUPPORTED

An unsupported attribute value was encountered. Some examples include:

The data type attribute is CUDNN_DATA_INT8x4, CUDNN_DATA_UINT8x4, or CUDNN_DATA_INT8x32.

The data type attribute is CUDNN_DATA_INT8 and CUDNN_ATTR_TENSOR_VECTOR_COUNT value is not 1, 4, or 32.

CUDNN_STATUS_SUCCESS

The descriptor was finalized successfully.

CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR, &desc); the cuDNN backend variant pack plan allows users to set up pointers to device buffers to various non-virtual tensors, identified by unique identifiers, of the operation graph, workspace, and computation intermediates.

Attributes

CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS

A unique identifier of tensor for each data pointer.

CUDNN_TYPE_INT64; zero or more elements.

Required attribute.

CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS

Tensor data device pointers.

CUDNN_TYPE_VOID_PTR; zero or more elements.

Required attribute.

CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES

Intermediate device pointers.

CUDNN_TYPE_VOID_PTR; zero or more elements.

Currently unsupported. Placeholder for future implementation.

CUDNN_ATTR_VARIANT_PACK_WORKSPACE

Workspace to device pointer.

CUDNN_TYPE_VOID_PTR; one element.

Required attribute.

Finalization

The return values for cudnnBackendFinalize() when called with a cuDNN backend variant pack descriptor are:

CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR, &desc); the cuDNN backend kernel cache helps significantly reduce execution plan finalizing time for use cases that have same-topology dynamic shape operation graph by binding the previously compiled applicable kernel to the execution plan instead of re-compiling a new one from scratch. This is used with execution plans containing a graph with CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED enabled.

Attributes

CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED

An attribute used to query whether a given engine config is cached.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element.

Read-only attribute using the cudnnBackendGetAttribute() API. The engine config in question is to be passed into this attribute as a constant input through arrayOfElements and the elementCount will serve as the resulting output, a value of zero meaning not cached and a positive number meaning that it is cached.

Required attribute.

Finalization

The return values for cudnnBackendFinalize() when called with a cuDNN backend kernel cache descriptor are:

CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR, &desc); the cuDNN block scale quantize descriptor specifies the parameters for the block scale quantize operation to output block scaled tensors.

Attributes

CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_MATH_PREC

The math precision of the computation.

CUDNN_TYPE_DATA_TYPE; one element.

Required attribute.

CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_BLOCK_SIZE

The number of elements per block to perform block scaling.

CUDNN_TYPE_INT32; one element.

Required attribute.

Finalization

The return values for cudnnBackendFinalize() when called with a cuDNN backend block scale quantize descriptor are:

CUDNN_STATUS_BAD_PARAM

An invalid attribute value was encountered. Some examples include:

Tensor shape mismatch between x, y, and scale tensors.

Data type mismatch between y and scale tensors.

CUDNN_STATUS_SUCCESS

The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR, &desc); the cuDNN block scale dequantize descriptor specifies the parameters for the block scale dequantize operation to take in block scaled tensors.

Attributes

CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_MATH_PREC

The math precision of the computation.

CUDNN_TYPE_DATA_TYPE; one element.

Required attribute.

CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_BLOCK_SIZE

The number of elements per block to perform block scaling.

CUDNN_TYPE_INT32; one element.

Required attribute.

Finalization

The return values for cudnnBackendFinalize() when called with a cuDNN backend block scale dequantize descriptor are:

CUDNN_STATUS_BAD_PARAM

An invalid attribute value was encountered. Some examples include:

Tensor shape mismatch between x, scale, and y tensors.

Data type mismatch between x and scale tensors.

CUDNN_STATUS_SUCCESS

The descriptor was finalized successfully.

CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR, &desc); the cuDNN device property descriptor specifies the properties of a device.

Attributes

CUDNN_ATTR_DEVICEPROP_DEVICE_ID

The CUDA device ID of the device that the descriptor targets.

CUDNN_TYPE_INT32; one element.

Optional attribute.

CUDNN_ATTR_DEVICEPROP_HANDLE

The cuDNN handle of the device that the descriptor targets.

CUDNN_TYPE_HANDLE; one element.

Optional attribute.

CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION

The JSON representation of the device that the descriptor targets.

CUDNN_TYPE_CHAR; one element.

Optional attribute.

Finalization

The return values for cudnnBackendFinalize() when called with a cuDNN device property descriptor are:

CUDNN_STATUS_BAD_PARAM: An invalid attribute value was encountered, for example, the provided JSON representation is invalid.
CUDNN_STATUS_NOT_INITIALIZED: For some reason, querying the device properties failed.
CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH: The target device is not supported.
CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_SDPA_FWD_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_SDPA_FWD_DESCRIPTOR, &desc); the cuDNN scaled dot-product attention forward operation descriptor represents a flash attention forward computation as a single operation, obviating in some cases the need to represent flash attention as many separate operations. The basic flash attention operation is:

\[O = softmax\left(\left(QK^{T}\right) \times scale\right) V\]

An optional padding mask is also supported. The padding mask does not have its own attribute, but is implicitly enabled if CUDNN_ATTR_OPERATION_SDPA_FWD_SEQ_LEN_QDESC and CUDNN_ATTR_OPERATION_SDPA_FWD_SEQ_LEN_KVDESC are both specified. In this case, the formula becomes:

\[O = softmax\left(padding\_mask\left(\left(QK^{T}\right) \times scale\right)\right) V\]

Furthermore, a block mask is also supported. A block mask can be applied to the scores to mask out entire blocks. In this case, the formula becomes:

\[O = softmax\left(padding\_mask\left(block\_mask\left(QK^{T}\right) \times scale\right)\right) V\]

A subgraph of custom operations (either pointwise or diagonal band mask) can also be fused into the computation. When CUDNN_ATTR_OPERATION_SDPA_FWD_SUBGRAPH is provided, the subgraph is applied to the scores after scaling but before softmax. For example, with both masking and a subgraph, the formula becomes:

\[O = softmax\left(subgraph\left(padding\_mask\left(block\_mask\left(QK^{T}\right) \times scale\right)\right)\right) V\]

Optional dropout is also supported. When CUDNN_ATTR_OPERATION_SDPA_FWD_DROPOUT_PROBABILITY is set to a non-zero value, dropout is applied to the attention weights after softmax, before the multiplication with \(V\):

\[O = dropout\left(softmax\left(\ldots\right)\right) V\]

For ease of use, access this operation through the frontend’s attention API, which can automatically select this operation depending on the features requested and the cuDNN backend version. The details of this backend descriptor type are provided here for completeness.

Attributes

CUDNN_ATTR_OPERATION_SDPA_FWD_QDESC

Input tensor descriptor representing the \(Q\) input to the above formula.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Tensor must be of dimensions \(\left(B, H\_q, S\_q, D\_qk\right)\).

Required attribute.

CUDNN_ATTR_OPERATION_SDPA_FWD_KDESC

Input tensor descriptor representing the \(K\) input to the above formula, or the \(K\) page container if paged attention for \(K\) is enabled (if CUDNN_ATTR_OPERATION_SDPA_FWD_PAGE_TABLE_KDESC is given).

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Tensor must be of dimensions:

If paged attention is disabled: \(\left(B, H\_k, S\_kv, D\_qk\right)\). Note that this attribute represents a literal \(K\) tensor and not its transpose \(K^{T}\).

If paged attention is enabled: \(\left(num\_blocks\_k, H\_k, bs\_k, D\_qk\right)\) (where \(bs\_k\) is the \(K\) block size).

Required attribute.

CUDNN_ATTR_OPERATION_SDPA_FWD_VDESC

Input tensor descriptor representing the \(V\) input to the above formula, or the \(V\) page container if paged attention for \(V\) is enabled (if CUDNN_ATTR_OPERATION_SDPA_FWD_PAGE_TABLE_VDESC is given).

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Tensor must be of dimensions:

If paged attention is disabled: \(\left(B, H\_v, S\_kv, D\_v\right)\).

If paged attention is enabled: \(\left(num\_blocks\_v, H\_v, bs\_v, D\_v\right)\) (where \(bs\_v\) is the \(V\) block size).

Required attribute.

CUDNN_ATTR_OPERATION_SDPA_FWD_ODESC

Output tensor descriptor representing the \(O\) output of the above formula.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Tensor must be of dimensions \(\left(B, H\_q, S\_q, D\_v\right)\).

Required attribute.

CUDNN_ATTR_OPERATION_SDPA_FWD_STATSDESC

Output tensor descriptor representing the stats output of the \(softmax\) in the above formula, used during training.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Tensor must be of dimensions \(\left(B, H\_q, S\_q, 1\right)\).

Optional attribute.

DEPRECATED: Use the newer CUDNN_ATTR_OPERATION_SDPA_FWD_SOFTMAX_DESC attribute instead.

CUDNN_ATTR_OPERATION_SDPA_FWD_SCALEDESC

Input tensor descriptor representing the scale factor in the above formula.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Single element must be of type CUDNN_DATA_FLOAT.

Tensor must be of dimensions \(\left(1, 1, 1, 1\right)\).

Tensor may be a device-side tensor, or a “by value” (host side) tensor.

Optional attribute. If not given, a scale of 1.0 is used.

CUDNN_ATTR_OPERATION_SDPA_FWD_BLOCK_MASK_DESC

Input tensor descriptor representing the block mask of the scores assuming block size 128×128.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Elements must be of type CUDNN_DATA_INT8.

Tensor must be of dimensions \(\left(B, H\_q, ceil(S\_q / 128), ceil(S\_kv / 128)\right)\).

Optional attribute.

CUDNN_ATTR_OPERATION_SDPA_FWD_PAGE_TABLE_KDESC

Input tensor descriptor representing the page table for \(K\).

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Elements must be of type CUDNN_DATA_INT32.

Tensor must be of dimensions \(\left(B, 1, ceil(S\_kv / bs\_k), 1\right)\).

Optional attribute.

If not given, paged attention for \(K\) is disabled.

If given, paged attention for \(K\) is enabled, and the attribute CUDNN_ATTR_OPERATION_SDPA_FWD_SEQ_LEN_KVDESC must also be given.

CUDNN_ATTR_OPERATION_SDPA_FWD_PAGE_TABLE_VDESC

Input tensor descriptor representing the page table for \(V\).

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Elements must be of type CUDNN_DATA_INT32.

Tensor must be of dimensions \(\left(B, 1, ceil(S\_kv / bs\_v), 1\right)\).

Optional attribute.

If not given, paged attention for \(V\) is disabled.

If given, paged attention for \(V\) is enabled, and the attribute CUDNN_ATTR_OPERATION_SDPA_FWD_SEQ_LEN_KVDESC must also be given.

CUDNN_ATTR_OPERATION_SDPA_FWD_SEQ_LEN_QDESC

Input tensor descriptor representing sequence lengths for \(Q\).

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Elements must be of type CUDNN_DATA_INT32 or CUDNN_DATA_INT64.

Tensor must be of dimensions \(\left(B, 1, 1, 1\right)\).

Optional attribute.

If given and CUDNN_ATTR_OPERATION_SDPA_FWD_SEQ_LEN_KVDESC is also given, a padding mask is applied.

CUDNN_ATTR_OPERATION_SDPA_FWD_SEQ_LEN_KVDESC

Input tensor descriptor representing sequence lengths for \(K\) and \(V\). If specified and CUDNN_ATTR_OPERATION_SDPA_FWD_SEQ_LEN_QDESC is also specified, a padding mask is applied.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Elements must be of type CUDNN_DATA_INT32 or CUDNN_DATA_INT64.

Tensor must be of dimensions \(\left(B, 1, 1, 1\right)\).

Optional attribute, but required if paged attention is enabled.

If given and CUDNN_ATTR_OPERATION_SDPA_FWD_SEQ_LEN_QDESC is also given, a padding mask is applied.

CUDNN_ATTR_OPERATION_SDPA_FWD_SUBGRAPH

An operation graph descriptor representing a subgraph to fuse into the forward SDPA operation. The subgraph can express custom computations (for example, custom masking or bias logic) that are applied after the scaling operation but before the softmax.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR.

Currently, all operations in the subgraph must be of type CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR or CUDNN_BACKEND_OPERATION_DIAGONAL_BAND_MASK_DESCRIPTOR.

Optional attribute.

CUDNN_ATTR_OPERATION_SDPA_FWD_SUBGRAPH_INPUT_UID

The unique identifier of the tensor within the subgraph that receives the intermediate attention scores as input. This connects the main SDPA computation to the subgraph.

CUDNN_TYPE_INT64; one element.

The corresponding tensor must be virtual and of dimensions \(\left(B, H\_q, S\_q, S\_kv\right)\).

Required if CUDNN_ATTR_OPERATION_SDPA_FWD_SUBGRAPH is given.

CUDNN_ATTR_OPERATION_SDPA_FWD_SUBGRAPH_OUTPUT_UID

The unique identifier of the tensor within the subgraph that produces the modified attention scores as output. This feeds the subgraph result back into the main SDPA computation, as the input to the softmax.

CUDNN_TYPE_INT64; one element.

The corresponding tensor must be virtual and of dimensions \(\left(B, H\_q, S\_q, S\_kv\right)\).

Required if CUDNN_ATTR_OPERATION_SDPA_FWD_SUBGRAPH is given.

CUDNN_ATTR_OPERATION_SDPA_FWD_SOFTMAX_DESC

A softmax operation descriptor to use within the SDPA computation. When provided, this controls the softmax behavior and can enable additional inputs and outputs.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_OPERATION_SOFTMAX_DESCRIPTOR.

Optional attribute. If not given, a default softmax operation is used with no additional inputs or outputs.

Cannot be used in conjunction with the older CUDNN_ATTR_OPERATION_SDPA_FWD_STATSDESC attribute (which it supersedes).

CUDNN_ATTR_OPERATION_SDPA_FWD_DROPOUT_SEED_DESC

Input tensor descriptor for the dropout random seed. Used together with the dropout offset to initialize the random number generator for dropout within the attention computation.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Single element must be of type CUDNN_DATA_INT32 or CUDNN_DATA_INT64.

Tensor must be of dimensions \(\left(1, 1, 1, 1\right)\).

Tensor may be a device-side tensor, or a “by value” (host side) tensor.

Optional attribute.

CUDNN_ATTR_OPERATION_SDPA_FWD_DROPOUT_OFFSET_DESC

Input tensor descriptor for the dropout random offset. Used together with the dropout seed to initialize the random number generator for dropout within the attention computation.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Single element must be of type CUDNN_DATA_INT32 or CUDNN_DATA_INT64.

Tensor must be of dimensions \(\left(1, 1, 1, 1\right)\).

Tensor may be a device-side tensor, or a “by value” (host side) tensor.

Optional attribute.

CUDNN_ATTR_OPERATION_SDPA_FWD_DROPOUT_RNG_DUMP_DESC

Output tensor descriptor for the dropout RNG state dump. When provided, the random number generator state used for dropout is written to this tensor, which can be useful for debugging or reproducibility.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Elements must be of type CUDNN_DATA_FLOAT.

Tensor must be of dimensions \(\left(B, H\_q, S\_q, S\_kv\right)\).

Optional attribute.

CUDNN_ATTR_OPERATION_SDPA_FWD_DROPOUT_PROBABILITY

The probability of dropping each attention weight element.

CUDNN_TYPE_FLOAT; one element.

Value must be in the range [0.0, 1.0].

Optional attribute. If not given, no dropout is applied.

CUDNN_ATTR_OPERATION_SDPA_FWD_UNFUSE_FMA

A boolean flag that, when set to true, unfuses the fused multiply-add (FMA) operation in the softmax computation. This may be required for certain SM100 (Blackwell) configurations to achieve numerical correctness or performance.

CUDNN_TYPE_BOOLEAN; one element.

Optional attribute.

Finalization

The return values for cudnnBackendFinalize() when called with a cuDNN scaled dot-product attention forward operation descriptor are:

CUDNN_STATUS_BAD_PARAM: An invalid attribute value was encountered, for example, the provided input tensor sizes don’t match along the necessary dimensions to make the scaled dot-product attention operation mathematically possible.
CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_SDPA_BWD_DESCRIPTOR #

This descriptor and its attributes are reserved for future use.

CUDNN_BACKEND_OPERATION_DIAGONAL_BAND_MASK_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_DIAGONAL_BAND_MASK_DESCRIPTOR, &desc); the cuDNN diagonal band mask operation descriptor generates a diagonal band attention mask. It produces a mask tensor based on diagonal band boundaries (left and right bounds) relative to the query and key-value sequence positions, commonly used to implement sliding window or local attention patterns. This descriptor can only be used within SDPA operations, notably as part of the CUDNN_ATTR_OPERATION_SDPA_FWD_SUBGRAPH of CUDNN_BACKEND_OPERATION_SDPA_FWD_DESCRIPTOR.

Attributes

CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_XDESC

Input tensor descriptor representing the attention scores to be masked.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Tensor must be of dimensions \(\left(B, H\_q, S\_q, S\_kv\right)\).

Required attribute.

CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_SEQ_LEN_KVDESC

Input tensor descriptor representing the key-value sequence lengths. When provided alongside CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_SEQ_LEN_QDESC, enables variable sequence length support for bottom-right alignment.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Elements must be of type CUDNN_DATA_INT32 or CUDNN_DATA_INT64.

Tensor must be of dimensions \(\left(B, 1, 1, 1\right)\).

Optional attribute.

CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_SEQ_LEN_QDESC

Input tensor descriptor representing the query sequence lengths. When provided alongside CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_SEQ_LEN_KVDESC, enables variable sequence length support for bottom-right alignment.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Elements must be of type CUDNN_DATA_INT32 or CUDNN_DATA_INT64.

Tensor must be of dimensions \(\left(B, 1, 1, 1\right)\).

Optional attribute.

CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_LEFT_BOUND_DESC

Input tensor descriptor representing the left bound offset of the diagonal band. Positions more than this many steps to the left of the diagonal are masked out. At most one of CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_LEFT_BOUND_DESC and CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_SHIFT_RIGHT_BOUND_DESC may be set.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Single element must be of type CUDNN_DATA_INT32.

Tensor must be of dimensions \(\left(1, 1, 1, 1\right)\).

Tensor may be a device-side tensor, or a “by value” (host side) tensor.

Optional attribute.

CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_SHIFT_RIGHT_BOUND_DESC

Input tensor descriptor representing the right bound shift of the diagonal band. Positions more than this many steps to the right of the diagonal are masked out. At most one of CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_LEFT_BOUND_DESC and CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_SHIFT_RIGHT_BOUND_DESC may be set.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Single element must be of type CUDNN_DATA_INT32.

Tensor must be of dimensions \(\left(1, 1, 1, 1\right)\).

Tensor may be a device-side tensor, or a “by value” (host side) tensor.

Optional attribute.

CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_BDESC

Input scalar tensor descriptor representing the value to fill masked-out positions (typically negative infinity).

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Single element must be of type CUDNN_DATA_FLOAT.

Tensor must be of dimensions \(\left(1, 1, 1, 1\right)\).

Tensor may be a device-side tensor, or a “by value” (host side) tensor.

Required attribute.

CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_YDESC

Output tensor descriptor for the generated mask. Must have the same dimensions as the input tensor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Tensor must be of dimensions \(\left(B, H_q, S_q, S_{kv}\right)\).

Required attribute.

CUDNN_ATTR_OPERATION_DIAGONAL_BAND_MASK_COMPARISON_MODE

The comparison mode used when generating the mask. Determines how positions are compared against the diagonal band boundaries.

CUDNN_TYPE_POINTWISE_MODE; one element of type cudnnPointwiseMode_t.

Required attribute.

Finalization

The return values for cudnnBackendFinalize() when called with a cuDNN diagonal band mask operation descriptor are:

CUDNN_STATUS_BAD_PARAM: An invalid attribute value was encountered, for example, a required attribute was not set or the provided tensor dimensions are inconsistent.
CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

CUDNN_BACKEND_OPERATION_SOFTMAX_DESCRIPTOR #

Created with cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATION_SOFTMAX_DESCRIPTOR, &desc); the cuDNN softmax operation descriptor computes the softmax function over the input tensor and optionally outputs intermediate statistics (row-wise max, sum of exponentials) for use in fused attention patterns. This descriptor can only be used within SDPA operations, notably as the CUDNN_ATTR_OPERATION_SDPA_FWD_SOFTMAX_DESC attribute of CUDNN_BACKEND_OPERATION_SDPA_FWD_DESCRIPTOR.

Attributes

CUDNN_ATTR_OPERATION_SOFTMAX_XDESC

Input tensor descriptor for the softmax operation.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Tensor must be of dimensions \(\left(B, H_q, S_q, S_{kv}\right)\).

Required attribute.

CUDNN_ATTR_OPERATION_SOFTMAX_YDESC

Output tensor descriptor for the softmax result. Must have the same dimensions as the input tensor.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Tensor must be of dimensions \(\left(B, H_q, S_q, S_{kv}\right)\).

Required attribute.

CUDNN_ATTR_OPERATION_SOFTMAX_STATS_DESC

Output tensor descriptor for the softmax statistics, combining information about the row-wise max and normalization factor. The last dimension is reduced to 1.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Tensor must be of dimensions \(\left(B, H_q, S_q, 1\right)\).

Optional attribute.

CUDNN_ATTR_OPERATION_SOFTMAX_MAX_DESC

Output tensor descriptor for the row-wise maximum values computed during softmax. The last dimension is reduced to 1.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Tensor must be of dimensions \(\left(B, H_q, S_q, 1\right)\).

Optional attribute.

CUDNN_ATTR_OPERATION_SOFTMAX_SUM_EXP_DESC

Output tensor descriptor for the row-wise sum of exponentials computed during softmax. The last dimension is reduced to 1.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Tensor must be of dimensions \(\left(B, H_q, S_q, 1\right)\).

Optional attribute.

CUDNN_ATTR_OPERATION_SOFTMAX_SINK_DESC

Tensor descriptor for the softmax sink, used in online softmax algorithms to track normalization state.

CUDNN_TYPE_BACKEND_DESCRIPTOR; one element of descriptor type CUDNN_BACKEND_TENSOR_DESCRIPTOR.

Tensor must be of dimensions \(\left(1, H_q, 1, 1\right)\).

Optional attribute.

Finalization

The return values for cudnnBackendFinalize() when called with a cuDNN softmax operation descriptor are:

CUDNN_STATUS_BAD_PARAM: An invalid attribute value was encountered, for example, a required attribute was not set or the provided tensor dimensions are inconsistent.
CUDNN_STATUS_SUCCESS: The descriptor was finalized successfully.

Use Cases #

This section describes some typical use cases of the cuDNN backend API; for example, setting up a simple operation graph, setting up an engine config for that operation graph, and finally setting up an execution plan and executing it with data pointers set in a variant pack descriptor. An example of cuDNN’s native CUDA graph API is given as well.

Setting Up An Operation Graph For A Grouped Convolution #

This use case creates an operation graph with a single grouped 3D convolution forward operation. It starts by setting up the input and output tensors, binding them to a convolution forward operation, and finally setting up an operation graph with a single node.

Create tensor descriptors.

// Tensor descriptor for the convolution input (image) tensor.
cudnnBackendDescriptor_t xDesc;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_TENSOR_DESCRIPTOR, &xDesc);

// Values describing the tensor: element type, 6D shape, fully packed strides
// (each stride is the product of all inner dimensions), unique ID, byte alignment.
cudnnDataType_t dtype = CUDNN_DATA_FLOAT;
int64_t xDim[] = {n, g, c, d, h, w};
int64_t xStr[] = {g * c * d * h * w, c * d * h * w, d * h * w, h * w, w, 1};
int64_t xUi = 'x';
int64_t alignment = 4;

cudnnBackendSetAttribute(
    xDesc, CUDNN_ATTR_TENSOR_DATA_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &dtype);
cudnnBackendSetAttribute(
    xDesc, CUDNN_ATTR_TENSOR_DIMENSIONS, CUDNN_TYPE_INT64, 6, xDim);
cudnnBackendSetAttribute(
    xDesc, CUDNN_ATTR_TENSOR_STRIDES, CUDNN_TYPE_INT64, 6, xStr);
cudnnBackendSetAttribute(
    xDesc, CUDNN_ATTR_TENSOR_UNIQUE_ID, CUDNN_TYPE_INT64, 1, &xUi);
cudnnBackendSetAttribute(
    xDesc, CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT, CUDNN_TYPE_INT64, 1, &alignment);

// Attributes can no longer be changed once the descriptor is finalized.
cudnnBackendFinalize(xDesc);

Repeat the above step for the convolution filter and output tensor descriptors (referred to below as wDesc and yDesc, with unique IDs 'w' and 'y'). The six filter tensor dimensions are [g, k, c, t, r, s] and the six output tensor dimensions are [n, g, k, o, p, q], respectively. Below, when finalizing a convolution operator to which the tensors are bound, dimension consistency is checked, meaning all n, g, c, k values shared among the three tensors are required to be the same. Otherwise, CUDNN_STATUS_BAD_PARAM status is returned.
For backward compatibility with how tensors are specified in cudnnTensorDescriptor_t and used in the convolution API, it is also possible to specify a 5D tensor with the following dimensions:
- image: [n, g*c, d, h, w]
- filter: [g*k, c, t, r, s]
- response: [n, g*k, o, p, q]
In this format, a similar consistency check is performed when finalizing a convolution operator descriptor to which the tensors are bound.

Create, set, and finalize a convolution operator descriptor.

// Convolution descriptor: 3 spatial dimensions, float compute type,
// zero padding, unit filter strides and unit dilations.
cudnnBackendDescriptor_t cDesc;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR, &cDesc);

int64_t nbDims = 3;
cudnnDataType_t compType = CUDNN_DATA_FLOAT;
cudnnConvolutionMode_t mode = CUDNN_CONVOLUTION;
int64_t pad[] = {0, 0, 0};        // used for both pre- and post-padding
int64_t filterStr[] = {1, 1, 1};
int64_t dilation[] = {1, 1, 1};

cudnnBackendSetAttribute(
    cDesc, CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS, CUDNN_TYPE_INT64, 1, &nbDims);
cudnnBackendSetAttribute(
    cDesc, CUDNN_ATTR_CONVOLUTION_COMP_TYPE, CUDNN_TYPE_DATA_TYPE, 1, &compType);
cudnnBackendSetAttribute(
    cDesc, CUDNN_ATTR_CONVOLUTION_CONV_MODE, CUDNN_TYPE_CONVOLUTION_MODE, 1, &mode);

// The per-dimension attributes each take nbDims elements.
cudnnBackendSetAttribute(
    cDesc, CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS, CUDNN_TYPE_INT64, nbDims, pad);
cudnnBackendSetAttribute(
    cDesc, CUDNN_ATTR_CONVOLUTION_POST_PADDINGS, CUDNN_TYPE_INT64, nbDims, pad);
cudnnBackendSetAttribute(
    cDesc, CUDNN_ATTR_CONVOLUTION_DILATIONS, CUDNN_TYPE_INT64, nbDims, dilation);
cudnnBackendSetAttribute(
    cDesc, CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES, CUDNN_TYPE_INT64, nbDims, filterStr);

cudnnBackendFinalize(cDesc);

Create, set, and finalize a convolution forward operation descriptor.

// Convolution forward operation: binds the x, w and y tensor descriptors and
// the convolution descriptor, together with the alpha and beta values.
cudnnBackendDescriptor_t fprop;
cudnnBackendCreateDescriptor(
    CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, &fprop);

float alpha = 1.0f;
float beta = 0.5f;

cudnnBackendSetAttribute(
    fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X,
    CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &xDesc);
cudnnBackendSetAttribute(
    fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W,
    CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &wDesc);
cudnnBackendSetAttribute(
    fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y,
    CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &yDesc);
cudnnBackendSetAttribute(
    fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC,
    CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &cDesc);

cudnnBackendSetAttribute(
    fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA,
    CUDNN_TYPE_FLOAT, 1, &alpha);
cudnnBackendSetAttribute(
    fprop, CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA,
    CUDNN_TYPE_FLOAT, 1, &beta);

cudnnBackendFinalize(fprop);

Create, set, and finalize an operation graph descriptor.

// Operation graph with a single node (the fprop operation), tied to a cuDNN handle.
cudnnBackendDescriptor_t op_graph;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR, &op_graph);

cudnnBackendSetAttribute(
    op_graph, CUDNN_ATTR_OPERATIONGRAPH_OPS, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &fprop);
cudnnBackendSetAttribute(
    op_graph, CUDNN_ATTR_OPERATIONGRAPH_HANDLE, CUDNN_TYPE_HANDLE, 1, &handle);

cudnnBackendFinalize(op_graph);

Setting Up An Engine Configuration #

This use case describes the steps with which users can set up an engine config from a previously finalized operation graph. In this example, the user wants to use the engine with CUDNN_ATTR_ENGINE_GLOBAL_INDEX 0 for this operation graph and does not set any performance knobs.

Create, set, and finalize an engine descriptor.

// Engine descriptor: selects the engine with global index 0 for op_graph.
int64_t gidx = 0;

cudnnBackendDescriptor_t engine;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINE_DESCRIPTOR, &engine);

cudnnBackendSetAttribute(
    engine, CUDNN_ATTR_ENGINE_OPERATION_GRAPH, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &op_graph);
cudnnBackendSetAttribute(
    engine, CUDNN_ATTR_ENGINE_GLOBAL_INDEX, CUDNN_TYPE_INT64, 1, &gidx);

cudnnBackendFinalize(engine);
The user can query a finalized engine descriptor with the cudnnBackendGetAttribute() API call for its attributes, including the performance knobs that it has. For simplicity, this use case skips this step and assumes the user is setting up an engine config descriptor below without making any changes to performance knobs.

Create, set, and finalize an engine config descriptor. Obtain the workspace size from the engine config.

// Engine config wrapping the finalized engine; no knob choices are set.
cudnnBackendDescriptor_t engcfg;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_ENGINECFG_DESCRIPTOR, &engcfg);
cudnnBackendSetAttribute(
    engcfg, CUDNN_ATTR_ENGINECFG_ENGINE, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engine);
cudnnBackendFinalize(engcfg);

// Query the workspace size from the finalized engine config. One element is
// requested; the element-count output pointer is passed as NULL.
int64_t workspaceSize;
cudnnBackendGetAttribute(
    engcfg, CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE, CUDNN_TYPE_INT64, 1, NULL, &workspaceSize);

Setting Up And Executing A Plan #

This use case describes the steps with which users set up an execution plan with a previously finalized engine config descriptor, set up the data pointer variant pack, and finally execute the plan.

Create, set, and finalize an execution plan descriptor. Obtain workspace size to allocate.

// Execution plan built from the cuDNN handle and the finalized engine config.
cudnnBackendDescriptor_t plan;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR, &plan);

cudnnBackendSetAttribute(
    plan, CUDNN_ATTR_EXECUTION_PLAN_HANDLE, CUDNN_TYPE_HANDLE, 1, &handle);
cudnnBackendSetAttribute(
    plan, CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG, CUDNN_TYPE_BACKEND_DESCRIPTOR, 1, &engcfg);

cudnnBackendFinalize(plan);

// Workspace size to allocate for this plan. One element is requested; the
// element-count output pointer is passed as NULL.
int64_t workspaceSize;
cudnnBackendGetAttribute(
    plan, CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE, CUDNN_TYPE_INT64, 1, NULL, &workspaceSize);

Create, set, and finalize a variant pack descriptor.

void *dev_ptrs[3] = {xData, wData, yData}; // device pointer
int64_t uids[3] = {'x', 'w', 'y'};
void *workspace;

cudnnBackendDescriptor_t varpack;
cudnnBackendCreateDescriptor(CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR, &varpack);
cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS,
                        CUDNN_TYPE_VOID_PTR, 3, dev_ptrs);
cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS,
                        CUDNN_TYPE_INT64, 3, uids);
cudnnBackendSetAttribute(varpack, CUDNN_ATTR_VARIANT_PACK_WORKSPACE,
                        CUDNN_TYPE_VOID_PTR, 1, &workspace);
cudnnBackendFinalize(varpack);

Execute the plan with a variant pack.

// Run the finalized plan with the data pointers and workspace bound in varpack.
cudnnBackendExecute(handle, plan, varpack);

Creating and Updating a CUDA Graph from a Plan #

This use case describes the steps with which users create a CUDA graph from a previously finalized execution plan and variant pack, execute the graph on a desired stream, and update the graph with a new variant pack. Note that this use case currently only works with a limited selection of cuDNN engines.

Create a CUDA graph.

// Start from an empty CUDA graph (the flags argument is 0) ...
cudaGraph_t cuda_graph;
cudaGraphCreate(&cuda_graph, 0);

// ... and have cuDNN populate it from the finalized plan and variant pack.
cudnnBackendPopulateCudaGraph(handle, plan, varpack, cuda_graph);

Instantiate and execute the CUDA graph.

// Instantiate the populated graph into an executable graph, then launch that
// executable graph (not the cudaGraph_t itself) on the chosen stream.
cudaGraphExec_t cuda_graph_exec;
cudaGraphInstantiate(&cuda_graph_exec, cuda_graph);
cudaGraphLaunch(cuda_graph_exec, stream);  // stream is a cudaStream_t
cudaStreamSynchronize(stream);

Update the CUDA graph, update the instantiated graph and re-execute the graph. (This can be performed any number of times.)

// Reuse the cuda_graph_exec instantiated in the previous step; declaring a new,
// uninitialized cudaGraphExec_t here would discard the instantiated graph.
cudnnBackendUpdateCudaGraph(handle, plan, new_varpack, cuda_graph);

// Apply the updated graph to the existing executable graph, then relaunch it.
cudaGraphExecUpdateResultInfo result_info;
cudaGraphExecUpdate(cuda_graph_exec, cuda_graph, &result_info);
cudaGraphLaunch(cuda_graph_exec, stream);
cudaStreamSynchronize(stream);

Destroy the CUDA graph when we’re all done.
cudaGraphExecDestroy(cuda_graph_exec);  // also release the instantiated (executable) graph
cudaGraphDestroy(cuda_graph);