Index A | B | C | D | E | F | G | H | I | K | L | M | N | O | P | Q | R | S | T | U | V | W | Z A account_for_embedding_in_pipeline_split (core.transformer.transformer_config.TransformerConfig attribute) account_for_loss_in_pipeline_split (core.transformer.transformer_config.TransformerConfig attribute) activation_func (core.transformer.mlp.MLPSubmodules attribute) (core.transformer.transformer_config.TransformerConfig attribute) activation_func_clamp_value (core.transformer.transformer_config.TransformerConfig attribute) activation_func_fp8_input_store (core.transformer.transformer_config.TransformerConfig attribute) add() (core.tensor_parallel.random.CudaRNGStatesTracker method) add_bias_linear (core.transformer.transformer_config.TransformerConfig attribute) add_document() (core.datasets.indexed_dataset.IndexedDatasetBuilder method) add_extra_token_to_sequence (core.datasets.gpt_dataset.GPTDatasetConfig attribute) add_index() (core.datasets.indexed_dataset.IndexedDatasetBuilder method) add_item() (core.datasets.indexed_dataset.IndexedDatasetBuilder method) add_prefix_for_sharding() (in module core.dist_checkpointing.utils) add_qkv_bias (core.transformer.transformer_config.TransformerConfig attribute) all_gather_last_dim_from_tensor_parallel_region() (in module core.tensor_parallel.mappings) all_to_all() (in module core.tensor_parallel.mappings) all_to_all_hp2sp() (in module core.tensor_parallel.mappings) all_to_all_sp2hp() (in module core.tensor_parallel.mappings) allow_shape_mismatch (core.dist_checkpointing.mapping.ShardedTensor attribute) apply() (core.fusions.fused_bias_gelu.GeLUFunction class method) apply_factories() (in module core.dist_checkpointing.mapping) apply_factory_merges() (in module core.dist_checkpointing.mapping) apply_prefix_mapping() (in module core.dist_checkpointing.utils) apply_query_key_layer_scaling (core.transformer.transformer_config.TransformerConfig attribute) apply_residual_connection_post_layernorm (core.transformer.transformer_config.TransformerConfig attribute) apply_rope_fusion (core.transformer.transformer_config.TransformerConfig attribute) apply_swiglu_sharded_factory() (in module core.transformer.mlp) arbitrary (core.transformer.enums.AttnMaskType attribute) async_save() (core.dist_checkpointing.strategies.base.AsyncSaveShardedStrategy method) AsyncSaveShardedStrategy (class in core.dist_checkpointing.strategies.base) Attention (class in core.transformer.attention) attention_backend (core.transformer.transformer_config.TransformerConfig attribute) attention_dropout (core.transformer.transformer_config.TransformerConfig attribute) attention_mask_func() (in module core.transformer.utils) attention_softmax_in_fp32 (core.transformer.transformer_config.TransformerConfig attribute) AttnBackend (class in core.transformer.enums) AttnMaskType (class in core.transformer.enums) AttnType (class in core.transformer.enums) auto (core.transformer.enums.AttnBackend attribute) axis_fragmentations (core.dist_checkpointing.mapping.ShardedTensor attribute) B backward() (core.fusions.fused_bias_gelu.GeLUFunction static method) (core.fusions.fused_softmax.ScaledMaskedSoftmax static method) (core.fusions.fused_softmax.ScaledSoftmax static method) (core.fusions.fused_softmax.ScaledUpperTriangMaskedSoftmax static method) (core.tensor_parallel.layers.LinearWithFrozenWeight static method) (core.tensor_parallel.layers.LinearWithGradAccumulationAndAsyncCommunication static method) (core.tensor_parallel.random.CheckpointFunction static method) (core.tensor_parallel.random.CheckpointWithoutOutputFunction static method) backward_dw() (core.transformer.attention.SelfAttention method) (core.transformer.mlp.MLP method) backward_step() (in module core.pipeline_parallel.schedules) BaseTransformerLayer (class in core.transformer.transformer_layer) bert_extended_attention_mask() (core.models.bert.bert_model.BertModel method) bert_position_ids() (core.models.bert.bert_model.BertModel method) BERTMaskedWordPieceDataset (class in core.datasets.bert_dataset) BERTMaskedWordPieceDatasetConfig (class in core.datasets.bert_dataset) BertModel (class in core.models.bert.bert_model) beta_fast (core.transformer.transformer_config.MLATransformerConfig attribute) beta_slow (core.transformer.transformer_config.MLATransformerConfig attribute) bf16 (core.transformer.module.Float16Module attribute) bias_activation_fusion (core.transformer.transformer_config.TransformerConfig attribute) bias_dropout_add_fused_inference() (in module core.fusions.fused_bias_dropout) bias_dropout_add_fused_train() (in module core.fusions.fused_bias_dropout) bias_dropout_add_unfused() (in module core.fusions.fused_bias_dropout) bias_dropout_fusion (core.transformer.transformer_config.TransformerConfig attribute) bias_gelu() (in module core.fusions.fused_bias_gelu) bias_gelu_back() (in module core.fusions.fused_bias_gelu) bias_gelu_impl() (in module core.fusions.fused_bias_gelu) blend (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) blend_per_split (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) BlendedDataset (class in core.datasets.blended_dataset) BlendedMegatronDatasetBuilder (class in core.datasets.blended_megatron_dataset_builder) BlendedMegatronDatasetConfig (class in core.datasets.blended_megatron_dataset_config) bos (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) broadcast_data() (in module core.tensor_parallel) (in module core.tensor_parallel.data) broadcast_params() (core.distributed.distributed_data_parallel.DistributedDataParallel method) build() (core.datasets.blended_megatron_dataset_builder.BlendedMegatronDatasetBuilder method) (core.dist_checkpointing.mapping.ShardedTensorFactory method) build_fn (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) build_generic_dataset() (core.datasets.blended_megatron_dataset_builder.BlendedMegatronDatasetBuilder static method) build_low_level_dataset() (core.datasets.gpt_dataset.GPTDataset static method) (core.datasets.gpt_dataset.MockGPTDataset static method) (core.datasets.masked_dataset.MaskedWordPieceDataset static method) (core.datasets.megatron_dataset.MegatronDataset static method) build_schedule_plan() (core.models.gpt.gpt_model.GPTModel method) C cache_mla_latents (core.transformer.transformer_config.MLATransformerConfig attribute) calculate_cross_entropy_loss() (core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy static method) (in module core.fusions.fused_cross_entropy) calculate_gradients() (core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy static method) (in module core.fusions.fused_cross_entropy) calculate_logits_max() (core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy static method) (in module core.fusions.fused_cross_entropy) calculate_per_token_loss (core.transformer.transformer_config.TransformerConfig attribute) calculate_predicted_logits() (core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy static method) (in module core.fusions.fused_cross_entropy) can_handle_sharded_objects (core.dist_checkpointing.strategies.base.LoadStrategyBase property) (core.dist_checkpointing.strategies.base.SaveStrategyBase property) causal (core.transformer.enums.AttnMaskType attribute) causal_bottom_right (core.transformer.enums.AttnMaskType attribute) check_backend_compatibility() (core.dist_checkpointing.strategies.base.LoadStrategyBase method) (core.dist_checkpointing.strategies.tensorstore.TensorStoreLoadShardedStrategy method) (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) (core.dist_checkpointing.strategies.zarr.ZarrLoadShardedStrategy method) check_first_val_step() (in module core.pipeline_parallel.schedules) check_is_distributed_checkpoint() (in module core.dist_checkpointing.core) check_version_compatibility() (core.dist_checkpointing.strategies.base.LoadStrategyBase method) (core.dist_checkpointing.strategies.tensorstore.TensorStoreLoadShardedStrategy method) (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) (core.dist_checkpointing.strategies.zarr.ZarrLoadShardedStrategy method) checkpoint() (core.tensor_parallel.CheckpointWithoutOutput method) (core.tensor_parallel.random.CheckpointWithoutOutput method) (in module core.tensor_parallel) (in module core.tensor_parallel.random) CheckpointFunction (class in core.tensor_parallel.random) CheckpointingConfig (class in core.dist_checkpointing.core) CheckpointingException CheckpointWithoutOutput (class in core.tensor_parallel) (class in core.tensor_parallel.random) CheckpointWithoutOutputFunction (class in core.tensor_parallel.random) classification_head (core.datasets.bert_dataset.BERTMaskedWordPieceDatasetConfig attribute) clear_embedding_activation_buffer() (in module core.pipeline_parallel.schedules) clone_scatter_output_in_embedding (core.transformer.transformer_config.TransformerConfig attribute) cls (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) code_from_dtype() (core.datasets.indexed_dataset.DType class method) ColumnParallelLinear (class in core.tensor_parallel) (class in core.tensor_parallel.layers) common_backend (core.dist_checkpointing.core.CheckpointingConfig attribute) common_backend_version (core.dist_checkpointing.core.CheckpointingConfig attribute) compile_helpers() (in module core.datasets.utils) config (core.transformer.module.Float16Module attribute) config_attention_mask() (core.datasets.t5_dataset.T5MaskedWordPieceDataset static method) config_logger_dir (core.transformer.transformer_config.TransformerConfig attribute) ConstantNumMicroBatchesCalculator (class in core.num_microbatches_calculator) conversion_helper() (in module core.transformer.module) convert_schedule_table_to_order() (in module core.pipeline_parallel.schedules) convert_split_vector_to_split_matrix() (in module core.datasets.blended_megatron_dataset_config) copy_tensor_model_parallel_attributes() (in module core.tensor_parallel) (in module core.tensor_parallel.layers) copy_to_tensor_model_parallel_region() (in module core.tensor_parallel) (in module core.tensor_parallel.mappings) core.datasets module core.datasets.bert_dataset module core.datasets.blended_dataset module core.datasets.blended_megatron_dataset_builder module core.datasets.blended_megatron_dataset_config module core.datasets.gpt_dataset module core.datasets.indexed_dataset module core.datasets.masked_dataset module core.datasets.megatron_dataset module core.datasets.megatron_tokenizer module core.datasets.t5_dataset module core.datasets.utils module core.dist_checkpointing module core.dist_checkpointing.core module core.dist_checkpointing.dict_utils module core.dist_checkpointing.mapping module core.dist_checkpointing.optimizer module core.dist_checkpointing.serialization module core.dist_checkpointing.strategies module core.dist_checkpointing.strategies.base module core.dist_checkpointing.strategies.tensorstore module core.dist_checkpointing.strategies.two_stage module core.dist_checkpointing.strategies.zarr module core.dist_checkpointing.utils module core.distributed module core.distributed.distributed_data_parallel module core.distributed.finalize_model_grads module core.fusions.fused_bias_dropout module core.fusions.fused_bias_gelu module core.fusions.fused_cross_entropy module core.fusions.fused_layer_norm module core.fusions.fused_softmax module core.models module core.models.bert module core.models.bert.bert_model module core.models.gpt module core.models.gpt.gpt_model module core.models.T5 module core.models.T5.t5_model module core.num_microbatches_calculator module core.optimizer_param_scheduler module core.pipeline_parallel module core.pipeline_parallel.p2p_communication module core.pipeline_parallel.schedules module core.tensor_parallel module core.tensor_parallel.cross_entropy module core.tensor_parallel.data module core.tensor_parallel.layers module core.tensor_parallel.mappings module core.tensor_parallel.random module core.tensor_parallel.utils module core.transformer module core.transformer.attention module core.transformer.dot_product_attention module core.transformer.enums module core.transformer.identity_op module core.transformer.mlp module core.transformer.module module core.transformer.transformer_block module core.transformer.transformer_config module core.transformer.transformer_layer module core.transformer.utils module core_attention (core.transformer.attention.CrossAttentionSubmodules attribute) (core.transformer.attention.SelfAttentionSubmodules attribute) cp_comm_type (core.transformer.transformer_config.TransformerConfig attribute) create_attention_mask (core.datasets.gpt_dataset.GPTDatasetConfig attribute) cross_attn (core.transformer.enums.AttnType attribute) CrossAttention (class in core.transformer.attention) CrossAttentionSubmodules (class in core.transformer.attention) cuda_graph_retain_backward_graph (core.transformer.transformer_config.TransformerConfig attribute) cuda_graph_scope (core.transformer.transformer_config.TransformerConfig attribute) cuda_graph_use_single_mempool (core.transformer.transformer_config.TransformerConfig attribute) cuda_graph_warmup_steps (core.transformer.transformer_config.TransformerConfig attribute) CudaRNGStatesTracker (class in core.tensor_parallel.random) custom_backward() (in module core.pipeline_parallel.schedules) D data (core.dist_checkpointing.mapping.ShardedBase attribute) (core.dist_checkpointing.mapping.ShardedObject attribute) (core.dist_checkpointing.mapping.ShardedTensor attribute) (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) deallocate_output_tensor() (in module core.pipeline_parallel.schedules) debug_msg() (in module core.dist_checkpointing.utils) debug_time() (in module core.dist_checkpointing.utils) decoder (core.transformer.enums.LayerType attribute) deduplicate_chunks() (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) destroy_num_microbatches_calculator() (in module core.num_microbatches_calculator) detokenize() (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer method) dict_list_map_inplace() (in module core.dist_checkpointing.dict_utils) dict_list_map_outplace() (in module core.dist_checkpointing.dict_utils) dict_map() (in module core.dist_checkpointing.dict_utils) dict_map_with_key() (in module core.dist_checkpointing.dict_utils) diff() (in module core.dist_checkpointing.dict_utils) disable_bf16_reduced_precision_matmul (core.transformer.transformer_config.TransformerConfig attribute) disable_forward_pre_hook() (core.distributed.distributed_data_parallel.DistributedDataParallel method) disable_parameter_transpose_cache (core.transformer.transformer_config.TransformerConfig attribute) discard_output_and_register_recompute() (core.tensor_parallel.CheckpointWithoutOutput method) (core.tensor_parallel.random.CheckpointWithoutOutput method) distribute_saved_activations (core.transformer.transformer_config.TransformerConfig attribute) DistributedDataParallel (class in core.distributed.distributed_data_parallel) document_indices (core.datasets.indexed_dataset.IndexedDataset property) DotProductAttention (class in core.transformer.dot_product_attention) drop_last_partial_validation_sequence (core.datasets.gpt_dataset.GPTDatasetConfig attribute) DType (class in core.datasets.indexed_dataset) dtype (core.dist_checkpointing.mapping.ShardedTensor attribute) dtype_from_code() (core.datasets.indexed_dataset.DType class method) E embedding (core.transformer.enums.LayerType attribute) embedding_init_method (core.transformer.transformer_config.TransformerConfig attribute) embedding_init_method_std (core.transformer.transformer_config.TransformerConfig attribute) empty_from_unique_key() (core.dist_checkpointing.mapping.ShardedObject class method) enable_cuda_graph (core.transformer.transformer_config.TransformerConfig attribute) enable_forward_pre_hook() (core.distributed.distributed_data_parallel.DistributedDataParallel method) encoder (core.transformer.enums.LayerType attribute) encoder_and_decoder (core.transformer.enums.ModelType property) encoder_or_decoder (core.transformer.enums.ModelType attribute) end_document() (core.datasets.indexed_dataset.IndexedDatasetBuilder method) eod (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) eod_mask_loss (core.datasets.gpt_dataset.GPTDatasetConfig attribute) eos (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) erf_gelu() (in module core.transformer.utils) exists() (core.datasets.indexed_dataset.IndexedDataset static method) external_cuda_graph (core.transformer.transformer_config.TransformerConfig attribute) extract_matching_values() (in module core.dist_checkpointing.dict_utils) extract_nonpersistent() (in module core.dist_checkpointing.utils) extract_sharded_base() (in module core.dist_checkpointing.utils) extract_sharded_tensors() (in module core.dist_checkpointing.utils) extract_sharded_tensors_and_factories() (in module core.dist_checkpointing.utils) extract_sharded_tensors_or_nonpersistent() (in module core.dist_checkpointing.utils) F ffn_hidden_size (core.transformer.transformer_config.TransformerConfig attribute) finalize() (core.datasets.indexed_dataset.IndexedDatasetBuilder method) finalize_model_grads() (in module core.distributed.finalize_model_grads) finish_embedding_wgrad_compute() (in module core.pipeline_parallel.schedules) finish_grad_sync() (core.distributed.distributed_data_parallel.DistributedDataParallel method) first_last_layers_bf16 (core.transformer.transformer_config.TransformerConfig attribute) flash (core.transformer.enums.AttnBackend attribute) flash_decode (core.transformer.transformer_config.TransformerConfig attribute) flash_decode() (core.transformer.attention.Attention method) flash_decode_and_prefill() (core.transformer.attention.Attention method) flatten_range() (in module core.dist_checkpointing.strategies.zarr) flattened_range (core.dist_checkpointing.mapping.ShardedTensor attribute) (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) float16_to_fp32() (in module core.transformer.module) Float16Module (class in core.transformer.module) float32 (core.datasets.indexed_dataset.DType attribute) float64 (core.datasets.indexed_dataset.DType attribute) force_all_tensors_to_non_fp8() (in module core.dist_checkpointing.utils) fork() (core.tensor_parallel.random.CudaRNGStatesTracker method) forward() (core.fusions.fused_bias_gelu.GeLUFunction static method) (core.fusions.fused_layer_norm.FusedLayerNorm method) (core.fusions.fused_softmax.FusedScaleMaskSoftmax method) (core.fusions.fused_softmax.ScaledMaskedSoftmax static method) (core.fusions.fused_softmax.ScaledSoftmax static method) (core.fusions.fused_softmax.ScaledUpperTriangMaskedSoftmax static method) (core.fusions.fused_softmax.SoftmaxOne method) (core.models.bert.bert_model.BertModel method) (core.models.gpt.gpt_model.GPTModel method) (core.models.T5.t5_model.T5LMHead method) (core.models.T5.t5_model.T5Model method) (core.tensor_parallel.ColumnParallelLinear method) (core.tensor_parallel.layers.ColumnParallelLinear method) (core.tensor_parallel.layers.LinearWithFrozenWeight static method) (core.tensor_parallel.layers.LinearWithGradAccumulationAndAsyncCommunication static method) (core.tensor_parallel.layers.RowParallelLinear method) (core.tensor_parallel.layers.VocabParallelEmbedding method) (core.tensor_parallel.random.CheckpointFunction static method) (core.tensor_parallel.random.CheckpointWithoutOutputFunction static method) (core.tensor_parallel.RowParallelLinear method) (core.tensor_parallel.VocabParallelEmbedding method) (core.transformer.attention.Attention method) (core.transformer.dot_product_attention.DotProductAttention method) (core.transformer.identity_op.IdentityFuncOp method) (core.transformer.identity_op.IdentityOp method) (core.transformer.mlp.MLP method) (core.transformer.module.Float16Module method) (core.transformer.transformer_block.TransformerBlock method) (core.transformer.transformer_layer.TransformerLayer method) forward_backward_no_pipelining() (in module core.pipeline_parallel.schedules) forward_backward_pipelining_with_interleaving() (in module core.pipeline_parallel.schedules) forward_backward_pipelining_without_interleaving() (in module core.pipeline_parallel.schedules) forward_fused_softmax() (core.fusions.fused_softmax.FusedScaleMaskSoftmax method) forward_step() (in module core.pipeline_parallel.schedules) forward_step_calc_loss() (in module core.pipeline_parallel.schedules) forward_torch_softmax() (core.fusions.fused_softmax.FusedScaleMaskSoftmax method) fp16 (core.transformer.module.Float16Module attribute) fp32_residual_connection (core.transformer.transformer_config.TransformerConfig attribute) fp32_to_float16() (in module core.transformer.module) fp4 (core.transformer.transformer_config.TransformerConfig attribute) fp4_param (core.transformer.transformer_config.TransformerConfig attribute) fp4_recipe (core.transformer.transformer_config.TransformerConfig attribute) fp8 (core.transformer.transformer_config.TransformerConfig attribute) fp8_amax_compute_algo (core.transformer.transformer_config.TransformerConfig attribute) fp8_amax_history_len (core.transformer.transformer_config.TransformerConfig attribute) fp8_dot_product_attention (core.transformer.transformer_config.TransformerConfig attribute) fp8_interval (core.transformer.transformer_config.TransformerConfig attribute) fp8_margin (core.transformer.transformer_config.TransformerConfig attribute) fp8_multi_head_attention (core.transformer.transformer_config.TransformerConfig attribute) fp8_param (core.transformer.transformer_config.TransformerConfig attribute) fp8_recipe (core.transformer.transformer_config.TransformerConfig attribute) fp8_wgrad (core.transformer.transformer_config.TransformerConfig attribute) from_rank_offsets() (core.dist_checkpointing.mapping.ShardedTensor class method) from_rank_offsets_flat() (core.dist_checkpointing.mapping.ShardedTensor class method) full_validation (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) fused (core.transformer.enums.AttnBackend attribute) fused_vocab_parallel_cross_entropy() (in module core.fusions.fused_cross_entropy) FusedLayerNorm (class in core.fusions.fused_layer_norm) FusedScaleMaskSoftmax (class in core.fusions.fused_softmax) G gated_linear_unit (core.transformer.transformer_config.TransformerConfig attribute) gather_from_sequence_parallel_region() (in module core.tensor_parallel) (in module core.tensor_parallel.mappings) gather_from_tensor_model_parallel_region() (in module core.tensor_parallel) (in module core.tensor_parallel.mappings) gather_split_1d_tensor() (in module core.tensor_parallel) (in module core.tensor_parallel.utils) gelu_impl() (in module core.transformer.utils) GeLUFunction (class in core.fusions.fused_bias_gelu) get() (core.datasets.gpt_dataset.MockGPTLowLevelDataset method) (core.datasets.indexed_dataset.IndexedDataset method) (core.num_microbatches_calculator.NumMicroBatchesCalculator method) get_all_rng_states() (in module core.tensor_parallel.random) get_batch_per_block() (core.fusions.fused_softmax.FusedScaleMaskSoftmax static method) get_bias_dropout_add() (in module core.fusions.fused_bias_dropout) get_bin_path() (in module core.datasets.indexed_dataset) get_blend_from_list() (in module core.datasets.utils) get_cuda_rng_tracker() (in module core.tensor_parallel) (in module core.tensor_parallel.random) get_current_global_batch_size() (core.num_microbatches_calculator.NumMicroBatchesCalculator method) (in module core.num_microbatches_calculator) get_current_running_global_batch_size() (core.num_microbatches_calculator.NumMicroBatchesCalculator method) (in module core.num_microbatches_calculator) get_data_parallel_rng_tracker_name() (in module core.tensor_parallel.random) get_default_causal_mask() (in module core.transformer.utils) get_default_load_sharded_strategy() (in module core.dist_checkpointing.serialization) get_default_save_common_strategy() (in module core.dist_checkpointing.serialization) get_default_save_sharded_strategy() (in module core.dist_checkpointing.serialization) get_default_strategy() (in module core.dist_checkpointing.strategies.base) get_document_indices() (core.datasets.indexed_dataset.IndexedDataset method) get_expert_parallel_rng_tracker_name() (in module core.tensor_parallel) (in module core.tensor_parallel.random) get_extra_state() (core.tensor_parallel.ColumnParallelLinear method) (core.tensor_parallel.layers.ColumnParallelLinear method) (core.tensor_parallel.layers.RowParallelLinear method) (core.tensor_parallel.RowParallelLinear method) get_forward_backward_func() (in module core.pipeline_parallel.schedules) get_idx_path() (in module core.datasets.indexed_dataset) get_layer_static_inputs() (core.transformer.transformer_layer.TransformerLayer method) get_linear_layer() (in module core.transformer.utils) get_lr() (core.optimizer_param_scheduler.OptimizerParamScheduler method) get_micro_batch_size() (core.num_microbatches_calculator.NumMicroBatchesCalculator method) (in module core.num_microbatches_calculator) get_num_layers_to_build() (in module core.transformer.transformer_block) get_num_microbatches() (in module core.num_microbatches_calculator) get_optim_param_to_id_map() (in module core.dist_checkpointing.optimizer) get_param_id_to_sharded_param_map() (in module core.dist_checkpointing.optimizer) get_pp_rank_microbatches() (in module core.pipeline_parallel.schedules) get_query_key_value_tensors() (core.transformer.attention.Attention method) (core.transformer.attention.CrossAttention method) (core.transformer.attention.SelfAttention method) get_schedule_table() (in module core.pipeline_parallel.schedules) get_sliding_window_causal_mask() (in module core.transformer.utils) get_states() (core.tensor_parallel.random.CudaRNGStatesTracker method) get_te_version() (in module core.models.bert.bert_model) get_tensor_shapes() (in module core.pipeline_parallel.schedules) get_transformer_layer_offset() (in module core.transformer.transformer_layer) get_wd() (core.optimizer_param_scheduler.OptimizerParamScheduler method) global_coordinates() (core.dist_checkpointing.mapping.ShardedTensor method) global_offset (core.dist_checkpointing.mapping.ShardedObject attribute) (core.dist_checkpointing.mapping.ShardedTensor attribute) global_shape (core.dist_checkpointing.mapping.ShardedObject attribute) (core.dist_checkpointing.mapping.ShardedTensor attribute) global_slice() (core.dist_checkpointing.mapping.ShardedTensor method) glu_linear_offset (core.transformer.transformer_config.TransformerConfig attribute) GPTDataset (class in core.datasets.gpt_dataset) GPTDatasetConfig (class in core.datasets.gpt_dataset) GPTModel (class in core.models.gpt.gpt_model) H has_regular_grid (core.dist_checkpointing.mapping.ShardedTensor property) hetereogenous_dist_checkpoint (core.transformer.transformer_config.TransformerConfig attribute) heterogeneous_block_specs (core.transformer.transformer_config.TransformerConfig attribute) hidden_dropout (core.transformer.transformer_config.TransformerConfig attribute) hidden_size (core.transformer.transformer_config.TransformerConfig attribute) I IdentityFuncOp (class in core.transformer.identity_op) IdentityOp (class in core.transformer.identity_op) IndexedDataset (class in core.datasets.indexed_dataset) IndexedDatasetBuilder (class in core.datasets.indexed_dataset) inference_rng_tracker (core.transformer.transformer_config.TransformerConfig attribute) inference_sampling_seed (core.transformer.transformer_config.TransformerConfig attribute) init_cuda_graph_cache() (in module core.transformer.utils) init_data() (core.dist_checkpointing.mapping.ShardedTensor method) init_method (core.transformer.transformer_config.TransformerConfig attribute) init_method_std (core.transformer.transformer_config.TransformerConfig attribute) init_model_with_meta_device (core.transformer.transformer_config.TransformerConfig attribute) init_num_microbatches_calculator() (in module core.num_microbatches_calculator) initialize() (core.datasets.indexed_dataset.IndexedDataset method) initialize_rng_tracker() (in module core.tensor_parallel.random) inspect_types() (in module core.dist_checkpointing.dict_utils) int16 (core.datasets.indexed_dataset.DType attribute) int32 (core.datasets.indexed_dataset.DType attribute) int64 (core.datasets.indexed_dataset.DType attribute) int8 (core.datasets.indexed_dataset.DType attribute) inv_vocab (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) is_hybrid_model (core.transformer.transformer_config.TransformerConfig attribute) is_initialized() (core.tensor_parallel.random.CudaRNGStatesTracker method) is_kernel_available() (core.fusions.fused_softmax.FusedScaleMaskSoftmax method) is_layer_window_attention() (in module core.transformer.utils) is_main_replica() (in module core.dist_checkpointing.mapping) is_single_shape() (in module core.pipeline_parallel.p2p_communication) K k_layernorm (core.transformer.attention.SelfAttentionSubmodules attribute) key (core.dist_checkpointing.mapping.ShardedBase attribute) (core.dist_checkpointing.mapping.ShardedObject attribute) (core.dist_checkpointing.mapping.ShardedTensor attribute) (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) kv_channels (core.transformer.transformer_config.TransformerConfig attribute) kv_lora_rank (core.transformer.transformer_config.MLATransformerConfig attribute) L layer_norm (core.transformer.transformer_block.TransformerBlockSubmodules attribute) layer_specs (core.transformer.transformer_block.TransformerBlockSubmodules attribute) layernorm_epsilon (core.transformer.transformer_config.TransformerConfig attribute) layernorm_zero_centered_gamma (core.transformer.transformer_config.TransformerConfig attribute) LayerType (class in core.transformer.enums) linear_fc1 (core.transformer.mlp.MLPSubmodules attribute) linear_fc2 (core.transformer.mlp.MLPSubmodules attribute) linear_kv (core.transformer.attention.CrossAttentionSubmodules attribute) linear_proj (core.transformer.attention.CrossAttentionSubmodules attribute) (core.transformer.attention.SelfAttentionSubmodules attribute) linear_q (core.transformer.attention.CrossAttentionSubmodules attribute) linear_qkv (core.transformer.attention.SelfAttentionSubmodules attribute) linear_with_frozen_weight() (in module core.tensor_parallel.layers) linear_with_grad_accumulation_and_async_allreduce() (in module core.tensor_parallel) (in module core.tensor_parallel.layers) LinearWithFrozenWeight (class in core.tensor_parallel.layers) LinearWithGradAccumulationAndAsyncCommunication (class in core.tensor_parallel.layers) load() (core.dist_checkpointing.strategies.base.LoadShardedStrategy method) (core.dist_checkpointing.strategies.tensorstore.TensorStoreLoadShardedStrategy method) (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) (core.dist_checkpointing.strategies.zarr.ZarrLoadShardedStrategy method) (in module core.dist_checkpointing.serialization) LOAD_COMMON (core.dist_checkpointing.strategies.base.StrategyAction attribute) load_common() (core.dist_checkpointing.strategies.base.LoadCommonStrategy method) load_common_state_dict() (in module core.dist_checkpointing.serialization) load_content_metadata() (in module core.dist_checkpointing.serialization) load_plain_tensors() (in module core.dist_checkpointing.serialization) LOAD_SHARDED (core.dist_checkpointing.strategies.base.StrategyAction attribute) load_sharded_metadata() (core.dist_checkpointing.strategies.base.LoadCommonStrategy method) (core.dist_checkpointing.strategies.base.LoadShardedStrategy method) (in module core.dist_checkpointing.serialization) load_sharded_objects() (core.dist_checkpointing.strategies.base.LoadCommonStrategy method) load_state_dict() (core.optimizer_param_scheduler.OptimizerParamScheduler method) (core.transformer.module.Float16Module method) load_tensor_from_storage() (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) load_tensors_metadata() (core.dist_checkpointing.strategies.base.LoadShardedStrategy method) (core.dist_checkpointing.strategies.tensorstore.TensorStoreLoadShardedStrategy method) (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) (core.dist_checkpointing.strategies.zarr.ZarrLoadShardedStrategy method) (in module core.dist_checkpointing.serialization) load_zarr_based_sharded_metadata() (in module core.dist_checkpointing.strategies.zarr) LoadCommonStrategy (class in core.dist_checkpointing.strategies.base) LoadShardedStrategy (class in core.dist_checkpointing.strategies.base) LoadStrategyBase (class in core.dist_checkpointing.strategies.base) local (core.transformer.enums.AttnBackend attribute) local_chunk_offset_in_global() (core.dist_checkpointing.mapping.ShardedTensor method) local_coordinates() (core.dist_checkpointing.mapping.ShardedTensor method) local_shape (core.dist_checkpointing.mapping.ShardedTensor attribute) LocalNonpersistentObject (class in core.dist_checkpointing.mapping) logger_stack() (in module core.dist_checkpointing.utils) loss (core.transformer.enums.LayerType attribute) M make_sharded_object_for_checkpoint() (in module core.transformer.utils) make_sharded_optimizer_tensor() (in module core.dist_checkpointing.optimizer) make_sharded_tensors_for_checkpoint() (in module core.transformer.utils) mamba_head_dim (core.transformer.transformer_config.TransformerConfig attribute) mamba_num_groups (core.transformer.transformer_config.TransformerConfig attribute) mamba_num_heads (core.transformer.transformer_config.TransformerConfig attribute) mamba_state_dim (core.transformer.transformer_config.TransformerConfig attribute) map_reduce() (in module core.dist_checkpointing.dict_utils) mask (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) masked_softmax_fusion (core.transformer.transformer_config.TransformerConfig attribute) MaskedWordPieceDataset (class in core.datasets.masked_dataset) MaskedWordPieceDatasetConfig (class in core.datasets.masked_dataset) masking_do_full_word (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) masking_do_permutation (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) masking_max_ngram (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) masking_probability (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) masking_use_geometric_distribution (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) masking_use_longer_ngrams (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) max_allowed_chunks() (core.dist_checkpointing.mapping.ShardedTensor method) max_position_embeddings (core.transformer.transformer_config.MLATransformerConfig attribute) max_sequence_length (core.datasets.gpt_dataset.MockGPTLowLevelDataset attribute) maybe_init_gloo_group() (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) maybe_load_config() (in module core.dist_checkpointing.core) MegatronDataset (class in core.datasets.megatron_dataset) MegatronLegacyTokenizer (class in core.datasets.megatron_tokenizer) MegatronModule (class in core.transformer.module) memory_efficient_layer_norm (core.transformer.transformer_config.TransformerConfig attribute) merge() (in module core.dist_checkpointing.dict_utils) merge_fn (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) merge_global_slice_with_shape() (in module core.dist_checkpointing.strategies.tensorstore) mid_level_dataset_surplus (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) MLATransformerConfig (class in core.transformer.transformer_config) MLP (class in core.transformer.mlp) mlp_chunks_for_prefill (core.transformer.transformer_config.TransformerConfig attribute) MLPSubmodules (class in core.transformer.mlp) mmap_bin_files (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) mock (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) MockGPTDataset (class in core.datasets.gpt_dataset) MockGPTLowLevelDataset (class in core.datasets.gpt_dataset) model_parallel_cuda_manual_seed() (in module core.tensor_parallel) (in module core.tensor_parallel.random) ModelType (class in core.transformer.enums) module core.datasets core.datasets.bert_dataset core.datasets.blended_dataset core.datasets.blended_megatron_dataset_builder core.datasets.blended_megatron_dataset_config core.datasets.gpt_dataset core.datasets.indexed_dataset core.datasets.masked_dataset core.datasets.megatron_dataset core.datasets.megatron_tokenizer core.datasets.t5_dataset core.datasets.utils core.dist_checkpointing core.dist_checkpointing.core core.dist_checkpointing.dict_utils core.dist_checkpointing.mapping core.dist_checkpointing.optimizer core.dist_checkpointing.serialization core.dist_checkpointing.strategies core.dist_checkpointing.strategies.base core.dist_checkpointing.strategies.tensorstore core.dist_checkpointing.strategies.two_stage core.dist_checkpointing.strategies.zarr core.dist_checkpointing.utils core.distributed core.distributed.distributed_data_parallel core.distributed.finalize_model_grads core.fusions.fused_bias_dropout core.fusions.fused_bias_gelu core.fusions.fused_cross_entropy core.fusions.fused_layer_norm core.fusions.fused_softmax core.models core.models.bert core.models.bert.bert_model core.models.gpt core.models.gpt.gpt_model core.models.T5 core.models.T5.t5_model core.num_microbatches_calculator core.optimizer_param_scheduler core.pipeline_parallel core.pipeline_parallel.p2p_communication core.pipeline_parallel.schedules core.tensor_parallel core.tensor_parallel.cross_entropy core.tensor_parallel.data core.tensor_parallel.layers core.tensor_parallel.mappings core.tensor_parallel.random core.tensor_parallel.utils core.transformer core.transformer.attention core.transformer.dot_product_attention core.transformer.enums core.transformer.identity_op core.transformer.mlp core.transformer.module core.transformer.transformer_block core.transformer.transformer_config core.transformer.transformer_layer core.transformer.utils moe_apply_probs_on_input (core.transformer.transformer_config.TransformerConfig attribute) moe_aux_loss_coeff (core.transformer.transformer_config.TransformerConfig attribute) moe_deepep_num_sms (core.transformer.transformer_config.TransformerConfig attribute) moe_enable_deepep (core.transformer.transformer_config.TransformerConfig attribute) moe_expert_capacity_factor (core.transformer.transformer_config.TransformerConfig attribute) moe_ffn_hidden_size (core.transformer.transformer_config.TransformerConfig attribute) moe_grouped_gemm (core.transformer.transformer_config.TransformerConfig attribute) moe_input_jitter_eps (core.transformer.transformer_config.TransformerConfig attribute) moe_layer_freq (core.transformer.transformer_config.TransformerConfig attribute) moe_layer_recompute (core.transformer.transformer_config.TransformerConfig attribute) moe_pad_expert_input_to_capacity (core.transformer.transformer_config.TransformerConfig attribute) moe_per_layer_logging (core.transformer.transformer_config.TransformerConfig attribute) moe_permute_fusion (core.transformer.transformer_config.TransformerConfig attribute) moe_router_bias_update_rate (core.transformer.transformer_config.TransformerConfig attribute) moe_router_dtype (core.transformer.transformer_config.TransformerConfig attribute) moe_router_enable_expert_bias (core.transformer.transformer_config.TransformerConfig attribute) moe_router_force_load_balancing (core.transformer.transformer_config.TransformerConfig attribute) moe_router_fusion (core.transformer.transformer_config.TransformerConfig attribute) moe_router_group_topk (core.transformer.transformer_config.TransformerConfig attribute) moe_router_load_balancing_type (core.transformer.transformer_config.TransformerConfig attribute) moe_router_num_groups (core.transformer.transformer_config.TransformerConfig attribute) moe_router_padding_for_fp8 (core.transformer.transformer_config.TransformerConfig attribute) moe_router_pre_softmax (core.transformer.transformer_config.TransformerConfig attribute) moe_router_score_function (core.transformer.transformer_config.TransformerConfig attribute) moe_router_topk (core.transformer.transformer_config.TransformerConfig attribute) moe_router_topk_limited_devices (core.transformer.transformer_config.TransformerConfig attribute) moe_router_topk_scaling_factor (core.transformer.transformer_config.TransformerConfig attribute) moe_shared_expert_intermediate_size (core.transformer.transformer_config.TransformerConfig attribute) moe_shared_expert_overlap (core.transformer.transformer_config.TransformerConfig attribute) moe_token_dispatcher_type (core.transformer.transformer_config.TransformerConfig attribute) moe_token_drop_policy (core.transformer.transformer_config.TransformerConfig attribute) moe_token_dropping (core.transformer.transformer_config.TransformerConfig attribute) moe_use_legacy_grouped_gemm (core.transformer.transformer_config.TransformerConfig attribute) moe_z_loss_coeff (core.transformer.transformer_config.TransformerConfig attribute) mrope_section (core.transformer.transformer_config.TransformerConfig attribute) mscale (core.transformer.transformer_config.MLATransformerConfig attribute) mscale_all_dim (core.transformer.transformer_config.MLATransformerConfig attribute) mtp (core.transformer.enums.LayerType attribute) mtp_loss_scaling_factor (core.transformer.transformer_config.TransformerConfig attribute) mtp_num_layers (core.transformer.transformer_config.TransformerConfig attribute) multi_latent_attention (core.transformer.transformer_config.MLATransformerConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) multiple_validation_sets (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) N narrow() (core.dist_checkpointing.mapping.ShardedTensor method) nested_items_iter() (in module core.dist_checkpointing.dict_utils) nested_values() (in module core.dist_checkpointing.dict_utils) no_mask (core.transformer.enums.AttnMaskType attribute) no_rope_freq (core.transformer.transformer_config.TransformerConfig attribute) no_sync() (core.distributed.distributed_data_parallel.DistributedDataParallel method) normalization (core.transformer.transformer_config.MLATransformerConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) normalize() (in module core.datasets.utils) num_attention_heads (core.transformer.transformer_config.TransformerConfig attribute) num_dataset_builder_threads (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) num_layers (core.transformer.transformer_config.TransformerConfig attribute) num_layers_at_end_in_bf16 (core.transformer.transformer_config.TransformerConfig attribute) num_layers_at_start_in_bf16 (core.transformer.transformer_config.TransformerConfig attribute) num_layers_in_first_pipeline_stage (core.transformer.transformer_config.TransformerConfig attribute) num_layers_in_last_pipeline_stage (core.transformer.transformer_config.TransformerConfig attribute) num_moe_experts (core.transformer.transformer_config.TransformerConfig attribute) num_query_groups (core.transformer.transformer_config.TransformerConfig attribute) numel_low_level_dataset() (core.datasets.gpt_dataset.GPTDataset static method) (core.datasets.gpt_dataset.MockGPTDataset static method) (core.datasets.masked_dataset.MaskedWordPieceDataset static method) (core.datasets.megatron_dataset.MegatronDataset static method) NumMicroBatchesCalculator (class in core.num_microbatches_calculator) O object_storage_cache_path (core.datasets.gpt_dataset.GPTDatasetConfig attribute) offsets() (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer method) open_ts_array() (in module core.dist_checkpointing.strategies.tensorstore) openai_gelu() (in module core.transformer.utils) optim_state_to_sharding_state() (in module core.dist_checkpointing.optimizer) optimal_dtype() (core.datasets.indexed_dataset.DType static method) OptimizerParamScheduler (class in core.optimizer_param_scheduler) original_max_position_embeddings (core.transformer.transformer_config.MLATransformerConfig attribute) output_layer_init_method (core.transformer.transformer_config.TransformerConfig attribute) P P2PCommunicator (class in core.pipeline_parallel.p2p_communication) pad (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) pad_to_expected_shape() (in module core.dist_checkpointing.strategies.zarr) padding (core.transformer.enums.AttnMaskType attribute) padding_causal (core.transformer.enums.AttnMaskType attribute) param_is_not_shared() (in module core.transformer.module) param_is_not_tensor_parallel_duplicate() (in module core.tensor_parallel) (in module core.tensor_parallel.layers) parse_and_normalize_split() (in module core.datasets.blended_megatron_dataset_config) path_to_cache (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) persist_layer_norm (core.transformer.transformer_config.TransformerConfig attribute) pipeline_model_parallel_layout (core.transformer.transformer_config.TransformerConfig attribute) postprocess_numpy_array() (in module core.dist_checkpointing.strategies.zarr) prepare_gradient_calculation_operands() (core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy static method) prepend_axis_num (core.dist_checkpointing.mapping.ShardedTensor attribute) Q q_layernorm (core.transformer.attention.SelfAttentionSubmodules attribute) q_lora_rank (core.transformer.transformer_config.MLATransformerConfig attribute) qk_head_dim (core.transformer.transformer_config.MLATransformerConfig attribute) qk_layernorm (core.transformer.transformer_config.TransformerConfig attribute) qk_pos_emb_head_dim (core.transformer.transformer_config.MLATransformerConfig attribute) quant_recipe (core.transformer.transformer_config.TransformerConfig attribute) R RampupBatchsizeNumMicroBatchesCalculator (class in core.num_microbatches_calculator) random_seed (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) recompute_granularity (core.transformer.transformer_config.TransformerConfig attribute) recompute_method (core.transformer.transformer_config.TransformerConfig attribute) recompute_modules (core.transformer.transformer_config.TransformerConfig attribute) recompute_num_layers (core.transformer.transformer_config.TransformerConfig attribute) reconfigure_num_microbatches_calculator() (in module core.num_microbatches_calculator) recv_backward() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) recv_forward() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) reduce_from_tensor_model_parallel_region() (in module core.tensor_parallel) (in module core.tensor_parallel.mappings) reduce_scatter_last_dim_to_tensor_parallel_region() (in module core.tensor_parallel.mappings) reduce_scatter_to_sequence_parallel_region() (in module core.tensor_parallel) (in module core.tensor_parallel.mappings) register_default_strategy() (in module core.dist_checkpointing.strategies.base) register_default_tensorstore_strategies() (in module core.dist_checkpointing.strategies.tensorstore) register_default_zarr_strategies() (in module core.dist_checkpointing.strategies.zarr) remove_sharded_tensors() (core.dist_checkpointing.strategies.base.LoadShardedStrategy method) (in module core.dist_checkpointing.serialization) replace_prefix_for_sharding() (in module core.dist_checkpointing.utils) replica_id (core.dist_checkpointing.mapping.ShardedBase attribute) (core.dist_checkpointing.mapping.ShardedObject attribute) (core.dist_checkpointing.mapping.ShardedTensor attribute) (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) reset() (core.tensor_parallel.random.CudaRNGStatesTracker method) reset_attention_mask (core.datasets.gpt_dataset.GPTDatasetConfig attribute) reset_parameters() (core.fusions.fused_layer_norm.FusedLayerNorm method) reset_position_ids (core.datasets.gpt_dataset.GPTDatasetConfig attribute) rope_type (core.transformer.transformer_config.MLATransformerConfig attribute) rotary_base (core.transformer.transformer_config.MLATransformerConfig attribute) rotary_interleaved (core.transformer.transformer_config.TransformerConfig attribute) rotary_percent (core.transformer.transformer_config.MLATransformerConfig attribute) rotary_scaling_factor (core.transformer.transformer_config.MLATransformerConfig attribute) RowParallelLinear (class in core.tensor_parallel) (class in core.tensor_parallel.layers) run_realtime_tests() (core.transformer.attention.SelfAttention method) S save() (core.dist_checkpointing.strategies.base.AsyncSaveShardedStrategy method) (core.dist_checkpointing.strategies.base.SaveShardedStrategy method) (core.dist_checkpointing.strategies.zarr.ZarrSaveShardedStrategy method) (in module core.dist_checkpointing.serialization) SAVE_COMMON (core.dist_checkpointing.strategies.base.StrategyAction attribute) save_common() (core.dist_checkpointing.strategies.base.SaveCommonStrategy method) save_config() (in module core.dist_checkpointing.core) SAVE_SHARDED (core.dist_checkpointing.strategies.base.StrategyAction attribute) save_sharded_objects() (core.dist_checkpointing.strategies.base.SaveCommonStrategy method) SaveCommonStrategy (class in core.dist_checkpointing.strategies.base) SaveShardedStrategy (class in core.dist_checkpointing.strategies.base) SaveStrategyBase (class in core.dist_checkpointing.strategies.base) scale_gradients() (core.distributed.distributed_data_parallel.DistributedDataParallel method) ScaledMaskedSoftmax (class in core.fusions.fused_softmax) ScaledSoftmax (class in core.fusions.fused_softmax) ScaledUpperTriangMaskedSoftmax (class in core.fusions.fused_softmax) scatter_to_sequence_parallel_region() (in module core.tensor_parallel) (in module core.tensor_parallel.mappings) scatter_to_tensor_model_parallel_region() (in module core.tensor_parallel) (in module core.tensor_parallel.mappings) seed (core.datasets.gpt_dataset.MockGPTLowLevelDataset attribute) self_attn (core.transformer.enums.AttnType attribute) SelfAttention (class in core.transformer.attention) SelfAttentionSubmodules (class in core.transformer.attention) send_backward() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_backward_recv_backward() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_backward_recv_forward() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_forward() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_forward_backward_recv_forward_backward() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_forward_recv_backward() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_forward_recv_forward() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) sep (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) sequence_length (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) sequence_length_decoder (core.datasets.t5_dataset.T5MaskedWordPieceDatasetConfig attribute) sequence_length_encoder (core.datasets.t5_dataset.T5MaskedWordPieceDatasetConfig attribute) sequence_lengths (core.datasets.indexed_dataset.IndexedDataset property) sequence_modes (core.datasets.indexed_dataset.IndexedDataset property) set_current_microbatch() (in module core.pipeline_parallel.schedules) set_defaults_if_not_set_tensor_model_parallel_attributes() (in module core.tensor_parallel) (in module core.tensor_parallel.layers) set_document_indices() (core.datasets.indexed_dataset.IndexedDataset method) set_extra_state() (core.tensor_parallel.ColumnParallelLinear method) (core.tensor_parallel.layers.ColumnParallelLinear method) (core.tensor_parallel.layers.RowParallelLinear method) (core.tensor_parallel.RowParallelLinear method) set_for_recompute_input_layernorm() (core.transformer.attention.Attention method) (core.transformer.attention.SelfAttention method) set_input_tensor() (core.models.bert.bert_model.BertModel method) (core.models.gpt.gpt_model.GPTModel method) (core.models.T5.t5_model.T5Model method) (core.transformer.module.Float16Module method) (core.transformer.transformer_block.TransformerBlock method) set_is_first_microbatch() (core.transformer.module.MegatronModule method) set_model_to_sequence_parallel() (in module core.transformer.utils) set_states() (core.tensor_parallel.random.CudaRNGStatesTracker method) set_symmetric_ar() (core.transformer.module.MegatronModule method) set_tensor_model_parallel_attributes() (in module core.tensor_parallel) (in module core.tensor_parallel.layers) setup_manual_hooks() (core.transformer.transformer_layer.TransformerLayer method) sharded_backend (core.dist_checkpointing.core.CheckpointingConfig attribute) sharded_backend_version (core.dist_checkpointing.core.CheckpointingConfig attribute) sharded_state_dict() (core.models.gpt.gpt_model.GPTModel method) (core.models.T5.t5_model.T5Model method) (core.tensor_parallel.ColumnParallelLinear method) (core.tensor_parallel.layers.ColumnParallelLinear method) (core.tensor_parallel.layers.RowParallelLinear method) (core.tensor_parallel.layers.VocabParallelEmbedding method) (core.tensor_parallel.RowParallelLinear method) (core.tensor_parallel.VocabParallelEmbedding method) (core.transformer.dot_product_attention.DotProductAttention method) (core.transformer.mlp.MLP method) (core.transformer.module.Float16Module method) (core.transformer.module.MegatronModule method) (core.transformer.transformer_block.TransformerBlock method) (core.transformer.transformer_layer.TransformerLayer method) sharded_state_dict_default() (in module core.transformer.utils) sharded_state_dict_keys_map (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) sharded_tensor_chunk_id() (in module core.dist_checkpointing.strategies.two_stage) ShardedBase (class in core.dist_checkpointing.mapping) ShardedObject (class in core.dist_checkpointing.mapping) ShardedTensor (class in core.dist_checkpointing.mapping) ShardedTensorFactory (class in core.dist_checkpointing.mapping) shared_embedding_or_output_weight() (core.models.gpt.gpt_model.GPTModel method) (core.models.T5.t5_model.T5Model method) short_sequence_probability (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) size (core.datasets.gpt_dataset.MockGPTLowLevelDataset attribute) size() (core.datasets.indexed_dataset.DType static method) softmax_scale (core.transformer.transformer_config.TransformerConfig attribute) softmax_type (core.transformer.transformer_config.TransformerConfig attribute) SoftmaxOne (class in core.fusions.fused_softmax) Split (class in core.datasets.utils) split (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) split_matrix (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) split_tensor_along_last_dim() (in module core.tensor_parallel) (in module core.tensor_parallel.utils) split_tensor_into_1d_equal_chunks() (in module core.tensor_parallel) (in module core.tensor_parallel.utils) start_grad_sync() (core.distributed.distributed_data_parallel.DistributedDataParallel method) start_param_sync() (core.distributed.distributed_data_parallel.DistributedDataParallel method) state_dict() (core.optimizer_param_scheduler.OptimizerParamScheduler method) (core.transformer.module.Float16Module method) state_dict_for_save_checkpoint() (core.transformer.module.Float16Module method) (core.transformer.module.MegatronModule method) step() (core.optimizer_param_scheduler.OptimizerParamScheduler method) StrategyAction (class in core.dist_checkpointing.strategies.base) summarize_load_times() (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) symmetric_ar_type (core.transformer.transformer_config.TransformerConfig attribute) T t5_extended_attention_mask() (in module core.models.T5.t5_model) t5_position_ids() (in module core.models.T5.t5_model) T5LMHead (class in core.models.T5.t5_model) T5MaskedWordPieceDataset (class in core.datasets.t5_dataset) T5MaskedWordPieceDatasetConfig (class in core.datasets.t5_dataset) T5Model (class in core.models.T5.t5_model) TensorStoreLoadShardedStrategy (class in core.dist_checkpointing.strategies.tensorstore) test (core.datasets.utils.Split attribute) test_mode (core.transformer.transformer_config.TransformerConfig attribute) timed() (in module core.dist_checkpointing.strategies.two_stage) toggle_cuda_graphs() (in module core.transformer.utils) tokenize() (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer method) tokenizer (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) tp_only_amax_red (core.transformer.transformer_config.TransformerConfig attribute) train (core.datasets.utils.Split attribute) transformer_impl (core.transformer.transformer_config.TransformerConfig attribute) TransformerBlock (class in core.transformer.transformer_block) TransformerBlockSubmodules (class in core.transformer.transformer_block) TransformerConfig (class in core.transformer.transformer_config) TransformerLayer (class in core.transformer.transformer_layer) TransformerLayerSubmodules (class in core.transformer.transformer_layer) TwoStageDataParallelLoadShardedStrategy (class in core.dist_checkpointing.strategies.two_stage) U uint16 (core.datasets.indexed_dataset.DType attribute) uint8 (core.datasets.indexed_dataset.DType attribute) unfused (core.transformer.enums.AttnBackend attribute) unique_key (core.dist_checkpointing.mapping.ShardedObject property) unset_num_microbatches_calculator() (in module core.num_microbatches_calculator) unwrap() (core.dist_checkpointing.mapping.LocalNonpersistentObject method) update() (core.num_microbatches_calculator.ConstantNumMicroBatchesCalculator method) (core.num_microbatches_calculator.NumMicroBatchesCalculator method) (core.num_microbatches_calculator.RampupBatchsizeNumMicroBatchesCalculator method) update_num_microbatches() (in module core.num_microbatches_calculator) use_fused_weighted_squared_relu (core.transformer.transformer_config.TransformerConfig attribute) use_kitchen (core.transformer.transformer_config.TransformerConfig attribute) use_mamba_mem_eff_path (core.transformer.transformer_config.TransformerConfig attribute) use_te_activation_func (core.transformer.transformer_config.TransformerConfig attribute) use_te_rng_tracker (core.transformer.transformer_config.TransformerConfig attribute) V v_head_dim (core.transformer.transformer_config.MLATransformerConfig attribute) valid (core.datasets.utils.Split attribute) validate_metadata_integrity() (core.dist_checkpointing.mapping.ShardedBase method) (core.dist_checkpointing.mapping.ShardedObject method) (core.dist_checkpointing.mapping.ShardedTensor method) (core.dist_checkpointing.mapping.ShardedTensorFactory method) vocab (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) vocab_parallel_cross_entropy() (in module core.tensor_parallel) (in module core.tensor_parallel.cross_entropy) vocab_range_from_global_vocab_size() (core.tensor_parallel.utils.VocabUtility static method) vocab_range_from_per_partition_vocab_size() (core.tensor_parallel.utils.VocabUtility static method) vocab_size (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) VocabParallelCrossEntropy (class in core.tensor_parallel.cross_entropy) VocabParallelEmbedding (class in core.tensor_parallel) (class in core.tensor_parallel.layers) VocabUtility (class in core.tensor_parallel.utils) W window_attn_skip_freq (core.transformer.transformer_config.TransformerConfig attribute) window_size (core.transformer.transformer_config.TransformerConfig attribute) without_data() (core.dist_checkpointing.mapping.ShardedBase method) (core.dist_checkpointing.mapping.ShardedObject method) (core.dist_checkpointing.mapping.ShardedTensor method) (core.dist_checkpointing.mapping.ShardedTensorFactory method) Z ZarrLoadShardedStrategy (class in core.dist_checkpointing.strategies.zarr) ZarrSaveShardedStrategy (class in core.dist_checkpointing.strategies.zarr) zero_grad_buffer() (core.distributed.distributed_data_parallel.DistributedDataParallel method) zip_strict() (in module core.dist_checkpointing.utils)