C/C++ API
The C/C++ API allows you to access the custom kernels defined in the libtransformer_engine.so library directly from C/C++, without Python.
Headers
- transformer_engine.h
NVTETensor, NVTEQuantizationConfig, NVTEDType, NVTETensorParam, NVTEScalingMode, NVTEQuantizationConfigAttribute, nvte_create_tensor(), nvte_destroy_tensor(), nvte_tensor_data(), nvte_tensor_columnwise_data(), nvte_make_shape(), nvte_tensor_shape(), nvte_tensor_columnwise_shape(), nvte_tensor_ndims(), nvte_tensor_size(), nvte_tensor_numel(), nvte_tensor_element_size(), nvte_tensor_type(), nvte_tensor_amax(), nvte_tensor_scale(), nvte_tensor_scale_inv(), nvte_tensor_scale_inv_shape(), nvte_zero_tensor(), nvte_set_tensor_param(), nvte_get_tensor_param(), nvte_tensor_scaling_mode(), nvte_tensor_pack_create(), nvte_tensor_pack_destroy(), nvte_create_quantization_config(), nvte_get_quantization_config_attribute(), nvte_set_quantization_config_attribute(), nvte_destroy_quantization_config(), nvte_is_non_tn_fp8_gemm_supported(), nvte_memset(), NVTEShape, NVTEBasicTensor, NVTETensorPack, transformer_engine, transformer_engine::DType, transformer_engine::DType::kByte, transformer_engine::DType::kInt16, transformer_engine::DType::kInt32, transformer_engine::DType::kInt64, transformer_engine::DType::kFloat32, transformer_engine::DType::kFloat16, transformer_engine::DType::kBFloat16, transformer_engine::DType::kFloat8E4M3, transformer_engine::DType::kFloat8E5M2, transformer_engine::DType::kFloat8E8M0, transformer_engine::DType::kNumTypes,
transformer_engine::is_fp8_dtype(), transformer_engine::QuantizationConfigWrapper, transformer_engine::QuantizationConfigWrapper::QuantizationConfigWrapper(), transformer_engine::QuantizationConfigWrapper::QuantizationConfigWrapper(), transformer_engine::QuantizationConfigWrapper::operator=(), transformer_engine::QuantizationConfigWrapper::QuantizationConfigWrapper(), transformer_engine::QuantizationConfigWrapper::operator=(), transformer_engine::QuantizationConfigWrapper::~QuantizationConfigWrapper(), transformer_engine::QuantizationConfigWrapper::operator NVTEQuantizationConfig(), transformer_engine::QuantizationConfigWrapper::set_force_pow_2_scales(), transformer_engine::QuantizationConfigWrapper::set_amax_epsilon(), transformer_engine::QuantizationConfigWrapper::set_noop_tensor(), transformer_engine::QuantizationConfigWrapper::config_,
transformer_engine::TensorWrapper, transformer_engine::TensorWrapper::TensorWrapper(), transformer_engine::TensorWrapper::TensorWrapper(), transformer_engine::TensorWrapper::TensorWrapper(), transformer_engine::TensorWrapper::~TensorWrapper(), transformer_engine::TensorWrapper::operator=(), transformer_engine::TensorWrapper::TensorWrapper(), transformer_engine::TensorWrapper::TensorWrapper(), transformer_engine::TensorWrapper::operator=(), transformer_engine::TensorWrapper::set_parameter(), transformer_engine::TensorWrapper::set_rowwise_data(), transformer_engine::TensorWrapper::set_columnwise_data(), transformer_engine::TensorWrapper::set_scale(), transformer_engine::TensorWrapper::set_amax(), transformer_engine::TensorWrapper::set_rowwise_scale_inv(), transformer_engine::TensorWrapper::set_columnwise_scale_inv(), transformer_engine::TensorWrapper::get_parameter(), transformer_engine::TensorWrapper::get_rowwise_data(), transformer_engine::TensorWrapper::get_columnwise_data(), transformer_engine::TensorWrapper::get_scale(), transformer_engine::TensorWrapper::get_amax(), transformer_engine::TensorWrapper::get_rowwise_scale_inv(), transformer_engine::TensorWrapper::get_columnwise_scale_inv(), transformer_engine::TensorWrapper::data(), transformer_engine::TensorWrapper::shape(), transformer_engine::TensorWrapper::columnwise_shape(), transformer_engine::TensorWrapper::size(), transformer_engine::TensorWrapper::ndim(), transformer_engine::TensorWrapper::numel(), transformer_engine::TensorWrapper::element_size(), transformer_engine::TensorWrapper::bytes(), transformer_engine::TensorWrapper::dtype(), transformer_engine::TensorWrapper::dptr(), transformer_engine::TensorWrapper::columnwise_dptr(), transformer_engine::TensorWrapper::amax(), transformer_engine::TensorWrapper::scale(), transformer_engine::TensorWrapper::scale_inv(), transformer_engine::TensorWrapper::scale_inv_shape(), transformer_engine::TensorWrapper::scaling_mode(), transformer_engine::TensorWrapper::zero_(), transformer_engine::TensorWrapper::defaultData,
transformer_engine::TensorWrapper::defaultShape, transformer_engine::TensorWrapper::convertShape(), transformer_engine::TensorWrapper::convertShape(), transformer_engine::TensorWrapper::tensor_
- activation.h
- cast_transpose_noop.h
- cast.h
- cudnn.h
- fused_attn.h
NVTE_QKV_Layout, NVTE_QKV_Layout::NVTE_SB3HD, NVTE_QKV_Layout::NVTE_SBH3D, NVTE_QKV_Layout::NVTE_SBHD_SB2HD, NVTE_QKV_Layout::NVTE_SBHD_SBH2D, NVTE_QKV_Layout::NVTE_SBHD_SBHD_SBHD, NVTE_QKV_Layout::NVTE_BS3HD, NVTE_QKV_Layout::NVTE_BSH3D, NVTE_QKV_Layout::NVTE_BSHD_BS2HD, NVTE_QKV_Layout::NVTE_BSHD_BSH2D, NVTE_QKV_Layout::NVTE_BSHD_BSHD_BSHD, NVTE_QKV_Layout::NVTE_T3HD, NVTE_QKV_Layout::NVTE_TH3D, NVTE_QKV_Layout::NVTE_THD_T2HD, NVTE_QKV_Layout::NVTE_THD_TH2D, NVTE_QKV_Layout::NVTE_THD_THD_THD, NVTE_QKV_Layout::NVTE_SBHD_BSHD_BSHD, NVTE_QKV_Layout::NVTE_BSHD_SBHD_SBHD, NVTE_QKV_Layout::NVTE_THD_BSHD_BSHD, NVTE_QKV_Layout::NVTE_THD_SBHD_SBHD, NVTE_QKV_Layout::NVTE_Paged_KV_BSHD_BSHD_BSHD, NVTE_QKV_Layout::NVTE_Paged_KV_BSHD_SBHD_SBHD, NVTE_QKV_Layout::NVTE_Paged_KV_SBHD_BSHD_BSHD, NVTE_QKV_Layout::NVTE_Paged_KV_SBHD_SBHD_SBHD, NVTE_QKV_Layout::NVTE_Paged_KV_THD_BSHD_BSHD, NVTE_QKV_Layout::NVTE_Paged_KV_THD_SBHD_SBHD,
NVTE_QKV_Layout_Group, NVTE_QKV_Format, NVTE_Bias_Type, NVTE_Mask_Type, NVTE_Fused_Attn_Backend, nvte_get_qkv_layout_group(), nvte_get_qkv_format(), nvte_get_q_format(), nvte_get_kv_format(), nvte_get_fused_attn_backend(), nvte_fused_attn_fwd_qkvpacked(), nvte_fused_attn_bwd_qkvpacked(), nvte_fused_attn_fwd_kvpacked(), nvte_fused_attn_bwd_kvpacked(), nvte_fused_attn_fwd(), nvte_fused_attn_bwd(), nvte_populate_rng_state_async(), nvte_get_runtime_num_segments(), nvte_extract_seed_and_offset(), nvte_copy_to_kv_cache(), nvte_cp_thd_read_half_tensor(), nvte_cp_thd_second_half_lse_correction(), nvte_cp_thd_read_second_half_lse(), nvte_cp_thd_out_correction(), nvte_cp_thd_grad_correction(), nvte_cp_thd_get_partitioned_indices(), nvte_convert_thd_to_bshd(), nvte_convert_bshd_to_thd(), nvte_prepare_flash_attn_fwd(), nvte_prepare_flash_attn_bwd()
- fused_rope.h
- gemm.h
- multi_tensor.h
nvte_multi_tensor_l2norm_cuda(), nvte_multi_tensor_unscale_l2norm_cuda(), nvte_multi_tensor_adam_cuda(), nvte_multi_tensor_adam_param_remainder_cuda(), nvte_multi_tensor_adam_fp8_cuda(), nvte_multi_tensor_adam_capturable_cuda(), nvte_multi_tensor_adam_capturable_master_cuda(), nvte_multi_tensor_sgd_cuda(), nvte_multi_tensor_scale_cuda(), nvte_multi_tensor_compute_scale_and_scale_inv_cuda()
- normalization.h
- padding.h
- permutation.h
- recipe.h
- softmax.h
nvte_scaled_softmax_forward(), nvte_scaled_softmax_backward(), nvte_scaled_masked_softmax_forward(), nvte_scaled_masked_softmax_backward(), nvte_scaled_upper_triang_masked_softmax_forward(), nvte_scaled_upper_triang_masked_softmax_backward(), nvte_scaled_aligned_causal_masked_softmax_forward(), nvte_scaled_aligned_causal_masked_softmax_backward()
- swizzle.h
- transpose.h
nvte_cast_transpose(), nvte_transpose(), nvte_cast_transpose_dbias(), nvte_fp8_transpose_dbias(), nvte_multi_cast_transpose(), nvte_cast_transpose_dbias_dgelu(), nvte_cast_transpose_dbias_dsilu(), nvte_cast_transpose_dbias_drelu(), nvte_cast_transpose_dbias_dqgelu(), nvte_cast_transpose_dbias_dsrelu(), nvte_dgeglu_cast_transpose(), nvte_dswiglu_cast_transpose(), nvte_dreglu_cast_transpose(), nvte_dqgeglu_cast_transpose(), nvte_dsreglu_cast_transpose()