Index _ | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | Y | Z _ __all__ (in module core) (in module core.config_logger) (in module core.distributed.fsdp.src.megatron_fsdp) (in module core.models.common.embeddings.relative_pos_embedding) (in module core.models.common.embeddings.rope_utils) (in module core.models.common.embeddings.rotary_pos_embedding) (in module core.models.mimo) (in module core.models.mimo.config) (in module core.models.mimo.model) (in module core.msc_utils) (in module core.tensor_parallel) (in module core.transformer.custom_layers.batch_invariant_kernels) __call__() (core.full_cuda_graph.FullCudaGraphWrapper method) (core.full_cuda_graph.StaticBufferLoader method) (core.timers.Timers method) (core.transformer.cuda_graphs.CudaGraphManager method) (core.transformer.module.GraphableMegatronModule method) (core.transformer.transformer_block.TransformerBlock method) (core.transformer.transformer_layer.TransformerLayer method) (core.utils._ValueWithRank method) (core.utils.StragglerDetector method) __config_logger_path_counts (in module core.config_logger) __contact_emails__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __contact_names__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __create_chunk_list__() (core.dist_checkpointing.strategies.checkpointable.CheckpointableShardedTensor method) (core.dist_checkpointing.strategies.checkpointable.LocalShardsContainer method) __create_write_items__() (core.dist_checkpointing.strategies.checkpointable.CheckpointableShardedTensor method) (core.dist_checkpointing.strategies.checkpointable.LocalShardsContainer method) __del__() (core.datasets.indexed_dataset._IndexReader method) (core.datasets.indexed_dataset._MMapBinReader method) (core.datasets.indexed_dataset._S3BinReader method) (core.datasets.indexed_dataset.IndexedDataset method) (core.dist_checkpointing.strategies.async_utils.AsyncCaller method) (core.dist_checkpointing.strategies.async_utils.PersistentAsyncCaller method) (core.dist_checkpointing.strategies.async_utils.TemporalAsyncCaller method) __description__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __download_url__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __enter__() (core.datasets.indexed_dataset._IndexWriter method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.MultiGroupUBRAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ResetParametersContext method) (core.nccl_allocator.MultiGroupMemPoolAllocator method) (core.nccl_allocator.nccl_mem method) (core.utils.StragglerDetector method) __eq__() (core.inference.batch_dimensions_utils.InferenceBatchDimensions method) (core.inference.contexts.static_context.StaticInferenceContext method) __exit__() (core.datasets.indexed_dataset._IndexWriter method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.MultiGroupUBRAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ResetParametersContext method) (core.nccl_allocator.MultiGroupMemPoolAllocator method) (core.nccl_allocator.nccl_mem method) (core.utils.StragglerDetector method) __get_tensor_shard__() (core.dist_checkpointing.strategies.checkpointable.CheckpointableShardedTensor method) (core.dist_checkpointing.strategies.checkpointable.LocalShardsContainer method) __getattr__() (in module core.models.bert.bert_layer_specs) __getitem__() (core.datasets.bert_dataset.BERTMaskedWordPieceDataset method) (core.datasets.blended_dataset.BlendedDataset method) (core.datasets.gpt_dataset.GPTDataset method) (core.datasets.gpt_dataset.MockGPTLowLevelDataset method) (core.datasets.indexed_dataset._IndexReader method) (core.datasets.indexed_dataset.IndexedDataset method) (core.datasets.megatron_dataset.MegatronDataset method) (core.datasets.multimodal_dataset.MockMultimodalDataset method) (core.datasets.retro.db.dataset.DBDataset method) (core.datasets.retro.query.gpt_chunk_dataset.GPTChunkDataset method) (core.datasets.retro.query.multi_split_gpt_dataset.MultiSplitGPTDataset method) (core.datasets.retro.query.retro_dataset.RetroDataset method) (core.datasets.retro.utils.BlockPathMap method) (core.datasets.retro.utils.GPTToTextDataset method) (core.datasets.t5_dataset.T5MaskedWordPieceDataset method) (core.inference.inference_request.DynamicInferenceRequestRecord method) (core.optimizer.optimizer.ProxyDict method) __getstate_() (core.rerun_state_machine.QuickStats method) __getstate__() (core.datasets.indexed_dataset.IndexedDataset method) (core.msc_utils._FeatureFlag method) __gt__() (core.utils._ValueWithRank method) __hash__() (core.inference.batch_dimensions_utils.InferenceBatchDimensions method) __homepage__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __init_() (core.inference.headers.UnknownHeaderError method) __iter__() (core.optimizer.optimizer.ProxyDict method) __keywords__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __len__() (core.datasets.blended_dataset.BlendedDataset method) (core.datasets.gpt_dataset.GPTDataset method) (core.datasets.gpt_dataset.MockGPTLowLevelDataset method) (core.datasets.indexed_dataset._IndexReader method) (core.datasets.indexed_dataset.IndexedDataset method) (core.datasets.masked_dataset.MaskedWordPieceDataset method) (core.datasets.megatron_dataset.MegatronDataset method) (core.datasets.retro.db.dataset.DBDataset method) (core.datasets.retro.query.gpt_chunk_dataset.GPTChunkDataset method) (core.datasets.retro.query.retro_dataset.RetroDataset method) (core.datasets.retro.utils.GPTToTextDataset method) (core.optimizer.distrib_optimizer.Range method) (core.optimizer.optimizer.ProxyDict method) __license__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __LOGGER_NAME_STACK (in module core.dist_checkpointing.utils) __LOGGER_STACK (in module core.dist_checkpointing.utils) __lt__() (core.utils._ValueWithRank method) __new__() (core.dist_checkpointing.strategies.checkpointable.CheckpointableShardedTensor method) (core.dist_checkpointing.strategies.checkpointable.LocalShardsContainer method) (core.extensions.transformer_engine.TENorm method) (core.post_training.modelopt.layers.Norm method) (core.transformer.torch_norm.WrappedTorchNorm method) (core.utils.StragglerDetector method) __next__() (core.inference.utils.Counter method) (core.rerun_state_machine.RerunDataIterator method) __package_name__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __post_init__() (core.datasets.bert_dataset.BERTMaskedWordPieceDatasetConfig method) (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig method) (core.datasets.gpt_dataset.GPTDatasetConfig method) (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig method) (core.datasets.multimodal_dataset.MultimodalDatasetConfig method) (core.datasets.retro.config.config.RetroPreprocessingConfig method) (core.datasets.retro.query.multi_split_gpt_dataset.MultiSplitGPTDatasetConfig method) (core.datasets.t5_dataset.T5MaskedWordPieceDatasetConfig method) (core.dist_checkpointing.mapping.ShardedObject method) (core.dist_checkpointing.mapping.ShardedTensor method) (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig method) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig method) (core.export.export_config.ExportConfig method) (core.extensions.kitchen.CompoundParamsConfigSchema method) (core.extensions.kitchen.QAttentionParamsConfigSchema method) (core.extensions.kitchen.QFlashAttentionParamsConfigSchema method) (core.extensions.kitchen.QLinearParamsConfigSchema method) (core.inference.inference_request.DynamicInferenceEvent method) (core.inference.inference_request.DynamicInferenceRequest method) (core.inference.inference_request.InferenceRequest method) (core.inference.sampling_params.SamplingParams method) (core.model_parallel_config.ModelParallelConfig method) (core.models.retro.config.RetroConfig method) (core.optimizer.optimizer_config.OptimizerConfig method) (core.transformer.transformer_config.MLATransformerConfig method) (core.transformer.transformer_config.TransformerConfig method) __repository_url__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __repr__() (core.dist_checkpointing.strategies.checkpointable.CheckpointableShardedTensor method) (core.dist_checkpointing.strategies.checkpointable.LocalShardsContainer method) (core.extensions.kitchen.KitchenColumnParallelLinear method) (core.extensions.kitchen.KitchenLayerNormColumnParallelLinear method) (core.extensions.kitchen.KitchenRowParallelLinear method) (core.extensions.transformer_engine.TEColumnParallelLinear method) (core.extensions.transformer_engine.TELayerNormColumnParallelLinear method) (core.extensions.transformer_engine.TERowParallelLinear method) (core.optimizer.distrib_optimizer.Range method) (core.process_groups_config.ProcessGroupCollection method) (core.quantization.quant_config.GlobMatcher method) (core.quantization.quant_config.QuantizationConfig method) (core.quantization.quant_config.RecipeConfig method) (core.tensor_parallel.layers.ColumnParallelLinear method) (core.tensor_parallel.layers.RowParallelLinear method) (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout method) __setattr__() (core.models.huggingface.module.HuggingFaceModule method) (core.process_groups_config.ProcessGroupHelperMeta method) __setitem__() (core.optimizer.optimizer.ProxyDict method) __setstate() (core.rerun_state_machine.QuickStats method) __setstate__() (core.datasets.indexed_dataset.IndexedDataset method) (core.msc_utils._FeatureFlag method) __shortversion__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __str__() (core.datasets.retro.utils.BlockPathMap method) (core.dist_checkpointing.mapping.ShardedObject method) (core.dist_checkpointing.strategies.base.SaveStrategyBase method) (core.inference.batch_dimensions_utils.InferenceBatchDimensions method) (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) (core.inference.contexts.static_context.StaticInferenceContext method) (core.inference.inference_request.DynamicInferenceEvent method) (core.inference.inference_request.DynamicInferenceRequest method) (core.optimizer.distrib_optimizer.Range method) (core.transformer.cuda_graphs._CudaGraphRunner method) (core.utils._ValueWithRank method) __straggler__ (in module core.utils) __torch_dispatch__() (core.dist_checkpointing.strategies.checkpointable.CheckpointableShardedTensor class method) (core.dist_checkpointing.strategies.checkpointable.LocalShardsContainer class method) __version__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) _ActiveAsyncRequest (class in core.dist_checkpointing.strategies.async_utils) _add() (core.datasets.retro.index.indexes.faiss_base.FaissBaseIndex method) _add_request() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) _add_scales_to_converter() (core.export.trtllm.trtllm_helper.TRTLLMHelper method) _add_to_index() (in module core.datasets.retro.index.build) _add_to_trtllm_model_weights() (core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter.DistributedTRTLLMModelWeightsConverter method) _adjust_key_value_for_inference() (core.transformer.attention.Attention method) _all_gather() (core.tensor_parallel.inference_layers.InferenceLayerNormColumnParallelLinear method) _all_to_all_cp2hp() (in module core.ssm.mamba_context_parallel) _all_to_all_hp2cp() (in module core.ssm.mamba_context_parallel) _AllGatherFromTensorParallelRegion (class in core.tensor_parallel.mappings) _alloc (in module core.inference.unified_memory) _alloc_storage() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) _allocate() (core.utils.GlobalSymmetricMemoryBuffer method) _allocate_auto() (in module core.ssm.mamba_hybrid_layer_allocation) _allocate_memory() (core.transformer.attention.Attention method) _allocate_override() (in module core.ssm.mamba_hybrid_layer_allocation) _allocate_recv_buffer() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) _allocator (in module core.nccl_allocator) _allreduce_conditional_embedding_grads() (in module core.distributed.finalize_model_grads) _allreduce_embedding_grad() (in module core.distributed.finalize_model_grads) _allreduce_layernorm_grads (in module core.distributed.finalize_model_grads) _allreduce_non_tensor_model_parallel_grads() (in module core.distributed.finalize_model_grads) _allreduce_position_embedding_grads() (in module core.distributed.finalize_model_grads) _allreduce_word_embedding_grads() (in module core.distributed.finalize_model_grads) _AllToAll (class in core.tensor_parallel.mappings) _append_kv_cache_kernel() (in module core.inference.contexts.fused_kv_append_kernel) _apply_aux_loss() (core.transformer.moe.router.TopKRouter method) _apply_bias() (core.transformer.moe.experts.TEGroupedMLP static method) _apply_expert_bias() (core.transformer.moe.router.TopKRouter method) _apply_global_aux_loss() (core.transformer.moe.router.TopKRouter method) _apply_rotary_pos_emb_bshd() (in module core.models.common.embeddings.rope_utils) _apply_rotary_pos_emb_thd() (in module core.models.common.embeddings.rope_utils) _apply_scaling() (core.models.common.embeddings.rotary_pos_embedding.RotaryEmbedding method) _apply_seq_aux_loss() (core.transformer.moe.router.TopKRouter method) _apply_tile_tagging() (core.models.multimodal.llava_model.LLaVAModel method) _assemble_full_tensor_from_uneven_chunks() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) _ASYNC_IO_LOOP (in module core.utils) _ASYNC_TASK_STATS (in module core.utils) _backward() (core.pipeline_parallel.utils.ScheduleNode method) _backward_kv_proj() (core.transformer.multi_latent_attention.MLASelfAttention method) _backward_output_proj() (core.transformer.attention.SelfAttention method) (core.transformer.multi_latent_attention.MLASelfAttention method) _backward_q_proj() (core.transformer.multi_latent_attention.MLASelfAttention method) _backward_qkv_proj() (core.transformer.attention.SelfAttention method) _BaseDataParallel (class in core.distributed.data_parallel_base) _batch_invariant_LIB (in module core.transformer.custom_layers.batch_invariant_kernels) _batch_invariant_MODE (in module core.transformer.custom_layers.batch_invariant_kernels) _batched_p2p_ops() (in module core.pipeline_parallel.p2p_communication) _BF16_TYPES (in module core.transformer.module) _bias_dropout_add_func() (in module core.fusions.fused_bias_dropout) _BinReader (class in core.datasets.indexed_dataset) _bucket_group_gradient_reduce() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.GradReducePipeline method) _buffer (in module core.transformer.moe.fused_a2a) _build_attention_mask_and_position_ids() (core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper.GPTInferenceWrapper method) _build_b1ss_attention_mask() (core.datasets.t5_dataset.T5MaskedWordPieceDataset static method) _build_blended_dataset_splits() (core.datasets.blended_megatron_dataset_builder.BlendedMegatronDatasetBuilder method) _build_callable_nodes() (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan method) _build_document_index() (in module core.datasets.gpt_dataset) _build_document_sample_shuffle_indices() (core.datasets.gpt_dataset.GPTDataset method) _build_gbuf_range_map() (core.optimizer.distrib_optimizer.DistributedOptimizer class method) _build_indices() (core.datasets.blended_dataset.BlendedDataset method) _build_key_size_numel_dictionaries() (in module core.tensor_parallel.data) _build_layers() (core.transformer.multi_token_prediction.MultiTokenPredictionBlock method) (core.transformer.transformer_block.TransformerBlock method) _build_load_plan() (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) _build_matchers() (core.quantization.quant_config.RecipeConfig static method) _build_megatron_dataset_splits() (core.datasets.blended_megatron_dataset_builder.BlendedMegatronDatasetBuilder method) _build_megatron_datasets_parallel() (core.datasets.blended_megatron_dataset_builder.BlendedMegatronDatasetBuilder method) _build_model_and_main_param_groups() (core.optimizer.distrib_optimizer.DistributedOptimizer class method) _build_model_gbuf_param_range_map() (core.optimizer.distrib_optimizer.DistributedOptimizer class method) _build_model_gbuf_range() (core.optimizer.distrib_optimizer.DistributedOptimizer class method) _build_model_param_gbuf_map() (core.optimizer.distrib_optimizer.DistributedOptimizer class method) _build_model_param_to_state_dict_param_map() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _build_nccl_allocator() (in module core.nccl_allocator) _build_num_microbatches_calculator() (in module core.num_microbatches_calculator) _build_optimizer_group_ranges() (core.optimizer.distrib_optimizer.DistributedOptimizer class method) _build_sample_index() (core.datasets.masked_dataset.MaskedWordPieceDataset method) _build_shuffle_index() (in module core.datasets.gpt_dataset) _calculate_cuda_graph_token_counts() (core.inference.batch_dimensions_utils.CUDAGraphBatchDimensionBuilder static method) _can_allocate() (core.utils.GlobalSymmetricMemoryBuffer method) _cast_value() (core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.SingleDeviceTRTLLMModelWeightsConverter method) _causal_conv1d_version (in module core.utils) _check_and_set() (core.optimizer_param_scheduler.OptimizerParamScheduler method) _check_data_types() (in module core.tensor_parallel.data) _check_mesh_ranks_and_group_ranks_are_consistent() (in module core.distributed.fsdp.mcore_fsdp_adapter) _check_module_parameter_types() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) _check_supported_type() (in module core.transformer.cuda_graphs) _check_toggle() (core.utils.StragglerDetector method) _checkpointed_attention_forward() (core.transformer.attention.Attention method) _checkpointed_forward() (core.transformer.multi_token_prediction.MultiTokenPredictionLayer method) (core.transformer.transformer_block.TransformerBlock method) _clean_metadata_for_serialization() (in module core.dist_checkpointing.utils) _clip_kv_proj_weight() (core.transformer.multi_latent_attention.MLASelfAttention method) _clip_linear_qkv() (core.transformer.attention.SelfAttention method) _clip_q_proj_weight() (core.transformer.multi_latent_attention.MLASelfAttention method) _collect_main_grad_data_for_unscaling() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) _collect_original_tensor_info() (core.post_training.modelopt.layers.RealQuantTransformerLayer method) _COMM_STREAM (in module core.pipeline_parallel.utils) _communicate() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) _communicate_shapes() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) (core.pipeline_parallel.p2p_communication.P2PCommunicator method) _COMP_STREAM (in module core.pipeline_parallel.utils) _compare_dataclasses() (in module core.dist_checkpointing.strategies.state_dict_saver) _compare_floats() (in module core.rerun_state_machine) _compilation_state (in module core.inference.unified_memory) _compile_timeout() (in module core.inference.unified_memory) _compute_bias() (core.models.common.embeddings.relative_pos_embedding.RelativePositionEmbedding method) _compute_pid() (in module core.transformer.custom_layers.batch_invariant_kernels) _compute_shards_access() (in module core.dist_checkpointing.validation) _concat_embeddings() (core.transformer.multi_token_prediction.MultiTokenPredictionLayer method) _configure_global_num_microbatches_calculator() (in module core.num_microbatches_calculator) _configured (core.utils.StragglerDetector attribute) _connect_with_inference_coordinator() (core.inference.inference_client.InferenceClient method) _CONTENT_METADATA_KEY (in module core.dist_checkpointing.serialization) _CONTEXT_PARALLEL_GLOBAL_RANKS (in module core.parallel_state) _CONTEXT_PARALLEL_GROUP (in module core.parallel_state) _controller() (core.utils.StragglerDetector method) _convert_non_transformer_layer() (core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter.DistributedTRTLLMModelWeightsConverter method) (core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.SingleDeviceTRTLLMModelWeightsConverter method) _convert_to_moe_state_dict() (in module core.transformer.moe.upcycling_utils) _convert_transformer_layer() (core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter.DistributedTRTLLMModelWeightsConverter method) (core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.SingleDeviceTRTLLMModelWeightsConverter method) _copy_main_params_to_model_params() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) _copy_main_params_to_param_buffer() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _copy_model_grads_to_main_grads() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) _copy_model_params_to_main_params() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) _CopyToModelParallelRegion (class in core.tensor_parallel.mappings) _cpu_offloading_context (core.model_parallel_config.ModelParallelConfig attribute) _create_masked_lm_predictions() (core.datasets.masked_dataset.MaskedWordPieceDataset method) _create_or_open_zarr_arrays() (in module core.dist_checkpointing.strategies.zarr) _create_zarr_array() (in module core.dist_checkpointing.strategies.zarr) _CUDA_RNG_STATE_TRACKER (in module core.tensor_parallel.random) _CUDA_RNG_STATE_TRACKER_INITIALIZED (in module core.tensor_parallel.random) _CudagraphGlobalRecord (class in core.transformer.cuda_graphs) _CudagraphRecordNode (class in core.transformer.cuda_graphs) _CudagraphReplayNode (class in core.transformer.cuda_graphs) _CudaGraphRunner (class in core.transformer.cuda_graphs) _DATA_PARALLEL_GLOBAL_RANKS (in module core.parallel_state) _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP (in module core.parallel_state) _DATA_PARALLEL_GROUP (in module core.parallel_state) _DATA_PARALLEL_GROUP_GLOO (in module core.parallel_state) _DATA_PARALLEL_GROUP_WITH_CP (in module core.parallel_state) _DATA_PARALLEL_GROUP_WITH_CP_GLOO (in module core.parallel_state) _DATA_PARALLEL_RNG_TRACKER_NAME (in module core.tensor_parallel.random) _decode_extra_state() (core.extensions.kitchen.KitchenGroupedLinear method) _DeepepManager (class in core.transformer.moe.token_dispatcher) _defer_loading_sharded_items() (in module core.dist_checkpointing.strategies.fully_parallel) _defer_loading_sharded_objects() (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper static method) _defer_loading_sharded_tensors() (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper static method) _determine_if_first_last_layer_of_this_vp_chunk() (in module core.transformer.cuda_graphs) _determine_if_transformer_decoder_layer() (in module core.transformer.cuda_graphs) _determine_missing_and_unexpected_keys() (in module core.dist_checkpointing.validation) _detokenize() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _disable_gc() (in module core.dist_checkpointing.strategies.async_utils) _DispatchManager (class in core.transformer.moe.token_dispatcher) _distribute_data_to_state_dict() (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) _download() (core.tokenizers.text.libraries.megatron_hf_tokenizer.MegatronHFTokenizer method) _dtype_size() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) _dynamic_step_calculate_log_probs() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _dynamic_step_calculate_top_n_logprobs() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _dynamic_step_context_bookkeeping() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _dynamic_step_context_init() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _dynamic_step_forward_logits() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _dynamic_step_log_probs_bookkeeping() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _dynamic_step_sample_bookkeeping() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _dynamic_step_sample_logits() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _EMBEDDING_GLOBAL_RANKS (in module core.parallel_state) _EMBEDDING_GROUP (in module core.parallel_state) _encode_extra_state() (core.extensions.kitchen.KitchenGroupedLinear method) _enforce_double_buffer_limit() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.GradReducePipeline method) _ep_group_has_work() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) _ERROR_NAMES (core.rerun_state_machine.RerunErrorInjector attribute) _exchange_loaded_tensors() (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) _EXPERT_DATA_PARALLEL_GROUP (in module core.parallel_state) _EXPERT_DATA_PARALLEL_GROUP_GLOO (in module core.parallel_state) _EXPERT_MODEL_PARALLEL_GROUP (in module core.parallel_state) _EXPERT_MODEL_PARALLEL_RANKS (in module core.parallel_state) _EXPERT_PARALLEL_RNG_TRACKER_NAME (in module core.tensor_parallel.random) _EXPERT_TENSOR_AND_MODEL_PARALLEL_GROUP (in module core.parallel_state) _EXPERT_TENSOR_MODEL_PIPELINE_PARALLEL_GROUP (in module core.parallel_state) _EXPERT_TENSOR_PARALLEL_GROUP (in module core.parallel_state) _extract_common_per_param_step() (core.optimizer.optimizer.MegatronOptimizer static method) _extract_from_cache() (core.datasets.indexed_dataset._S3BinReader method) _extract_te_gemm_args() (in module core.transformer.custom_layers.batch_invariant_kernels) _fa_version (in module core.utils) _FeatureFlag (class in core.msc_utils) _FileBinReader (class in core.datasets.indexed_dataset) _fill_in_deferred_sharded_items() (in module core.dist_checkpointing.strategies.fully_parallel) _filter_and_reorder_param_groups() (core.optimizer.optimizer.MegatronOptimizer static method) _find_submodule() (in module core.transformer.moe.upcycling_utils) _finish_capturing() (core.transformer.cuda_graphs.TECudaGraphHelper method) _fix_tensor_parallel_attributes() (core.distributed.fsdp.mcore_fsdp_adapter.FullyShardedDataParallel method) _FLOAT_TYPES (in module core.transformer.module) _fork_rng() (in module core.tensor_parallel.random) _forward() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) (core.models.retro.decoder_attention.RetroDecoderBiasDropoutAdd class method) (core.models.retro.encoder_attention.RetroEncoderBiasDropoutAdd class method) (core.pipeline_parallel.utils.ScheduleNode method) _forward_attention() (core.transformer.transformer_layer.TransformerLayer method) _forward_impl() (core.tensor_parallel.layers.ColumnParallelLinear method) (core.tensor_parallel.layers.RowParallelLinear method) _forward_mlp() (core.transformer.transformer_layer.TransformerLayer method) _free_storage() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) _fsdp_modules (core.models.huggingface.clip_model.SiglipHuggingFaceModel attribute) (core.models.huggingface.qwen_model.QwenHuggingFaceModel attribute) _gather_along_first_dim() (in module core.tensor_parallel.mappings) _gather_along_last_dim() (in module core.tensor_parallel.mappings) _GatherFromModelParallelRegion (class in core.tensor_parallel.mappings) _GatherFromSequenceParallelRegion (class in core.tensor_parallel.mappings) _gen_rank_enum() (core.hyper_comm_grid.HyperCommGrid method) _get() (core.distributed.param_and_grad_buffer._ParamAndGradBuffer method) _get_all_ranks_time_string() (core.timers.Timers method) _get_all_rng_states() (in module core.tensor_parallel.random) _get_async_caller() (core.dist_checkpointing.strategies.async_utils.AsyncCallsQueue method) _get_available_models_list() (core.tokenizers.text.libraries.megatron_hf_tokenizer.MegatronHFTokenizer method) _get_batch_size_and_seq_len() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) _get_block_submodules() (in module core.transformer.transformer_block) _get_config() (in module core.transformer.moe.upcycling_utils) _get_cuda_graph_input_data() (core.transformer.cuda_graphs.TECudaGraphHelper method) _get_cuda_rng_state() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.tensor_parallel.random) _get_custom_recipe() (in module core.fp8_utils) _get_distribution() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict static method) _get_dp_buffer_shard_bucket_index() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) _get_dp_tp_mesh() (in module core.distributed.fsdp.mcore_fsdp_adapter) _get_elapsed_time_all_ranks() (core.timers.Timers method) _get_embeddings() (core.transformer.multi_token_prediction.MultiTokenPredictionLayer method) _get_empty_tensor_for_exchange() (in module core.dist_checkpointing.exchange_utils) _get_energy() (core.energy_monitor.EnergyMonitor method) _get_extra_kitchen_kwargs() (in module core.extensions.kitchen) _get_extra_state_offsets() (in module core.transformer.utils) _get_extra_te_kwargs() (in module core.extensions.transformer_engine) _get_filesystem_reader() (in module core.dist_checkpointing.strategies.torch) _get_fp8_autocast_for_quant_params() (in module core.extensions.transformer_engine) _get_fp8_autocast_for_quant_recipe() (in module core.extensions.transformer_engine) _get_fp8_params_and_shard_fp32_from_fp8() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _get_fsdp_tensor_spec() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) _get_gbuf_name() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.FixedPoolAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.RotaryBucketAllocator method) _get_global_min_max_time() (core.timers.Timers method) _get_global_min_max_time_string() (core.timers.Timers method) _get_hsdp_tp_mesh() (in module core.distributed.fsdp.mcore_fsdp_adapter) _get_item_local_index() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) _get_item_local_shard_index() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) _get_item_slice_in_shard() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) _get_keys_endswith() (in module core.transformer.moe.upcycling_utils) _get_layer() (core.transformer.transformer_block.TransformerBlock method) _get_layer_offset() (core.transformer.transformer_layer.TransformerLayer static method) _get_ltor_masks_and_position_ids() (in module core.datasets.gpt_dataset) _get_main_grad_attr() (in module core.distributed.finalize_model_grads) _get_main_param_and_optimizer_states() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _get_megatron_optimizer_based_on_param_groups() (in module core.optimizer) _get_merges_file() (core.tokenizers.text.libraries.megatron_hf_tokenizer.MegatronHFTokenizer method) _get_metadata_path() (in module core.tokenizers.megatron_tokenizer) _get_mlp_module_spec() (in module core.models.gpt.gpt_layer_specs) (in module core.models.vision.vit_layer_specs) _get_model_and_main_params_data_float16() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) _get_model_param_range_map() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _get_mtp_block_submodules() (in module core.transformer.multi_token_prediction) _get_num_epochs() (core.datasets.gpt_dataset.GPTDataset method) _get_num_tokens_per_epoch() (core.datasets.gpt_dataset.GPTDataset method) _get_param_groups() (core.optimizer.optimizer.MegatronOptimizer method) (in module core.optimizer) _get_param_groups_and_buffers() (in module core.optimizer) _get_parameter_groups() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) _get_pos_embeddings() (core.models.vision.radio.RADIOViTModel method) _get_position_embedding_weight() (in module core.distributed.finalize_model_grads) _get_pp_layer_offset_for_inference() (core.transformer.attention.Attention method) _get_remove_vocab_padding() (core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter.DistributedTRTLLMModelWeightsConverter method) _get_rng_state_dict() (in module core.distributed.fsdp.mcore_fsdp_adapter) _get_sample_arguments() (core.transformer.cuda_graphs.TECudaGraphHelper method) _get_save_and_finalize_callbacks() (core.dist_checkpointing.strategies.torch.TorchDistSaveShardedStrategy method) _get_shared_word_embedding_weight() (in module core.distributed.finalize_model_grads) _get_should_context_be_quantized_params() (in module core.extensions.transformer_engine) _get_should_context_be_quantized_recipe() (in module core.extensions.transformer_engine) _get_size_per_split_per_dataset() (in module core.datasets.blended_megatron_dataset_builder) _get_state() (core.optimizer.optimizer.MegatronOptimizer method) _get_states_from_cache() (core.ssm.mamba_mixer.MambaMixer method) _get_sub_optimizer_param_groups() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _get_submodules_under_cudagraphs() (core.transformer.module.GraphableMegatronModule method) (core.transformer.transformer_layer.TransformerLayer method) _get_te_cuda_graph_replay_args() (core.transformer.module.GraphableMegatronModule method) (core.transformer.transformer_layer.TransformerLayer method) _get_thd_freqs_on_this_cp_rank() (in module core.models.common.embeddings.rope_utils) _get_thd_token_idx() (in module core.fusions.fused_mla_yarn_rope_apply) _get_token_mask() (core.datasets.bert_dataset.BERTMaskedWordPieceDataset method) (core.datasets.masked_dataset.MaskedWordPieceDataset method) (core.datasets.t5_dataset.T5MaskedWordPieceDataset method) _get_trtllm_config() (core.export.trtllm.trtllm_helper.TRTLLMHelper method) _get_trtllm_pretrained_config_and_model_weights_in_distributed_setting() (core.export.trtllm.trtllm_helper.TRTLLMHelper method) _get_trtllm_pretrained_config_and_model_weights_list_on_single_device() (core.export.trtllm.trtllm_helper.TRTLLMHelper method) _get_validation_call_info() (core.rerun_state_machine.RerunStateMachine method) _get_varlen_generation_state() (core.ssm.mamba_mixer.MambaMixer method) _get_vocab_file() (core.tokenizers.text.libraries.megatron_hf_tokenizer.MegatronHFTokenizer method) _get_write_results_queue() (in module core.dist_checkpointing.strategies.filesystem_async) _GLOBAL_MEMORY_BUFFER (in module core.parallel_state) _GLOBAL_NUM_MICROBATCHES_CALCULATOR (in module core.num_microbatches_calculator) _global_process_group_list (in module core.parallel_state) _GLOBAL_RERUN_STATE_MACHINE (in module core.rerun_state_machine) _GLOBAL_SYMMETRIC_MEMORY_BUFFER (in module core.parallel_state) _GlobalMetadata (in module core.dist_checkpointing.validation) _grad_accum_fusion_available (in module core.tensor_parallel.layers) _GraphStatus (class in core.transformer.cuda_graphs) _HALF_TYPES (in module core.transformer.module) _handler() (core.utils.StragglerDetector method) _HIERARCHICAL_CONTEXT_PARALLEL_GROUPS (in module core.parallel_state) _hybrid_ep_buffer (in module core.transformer.moe.fused_a2a) _HybridEPManager (class in core.transformer.moe.token_dispatcher) _import_class_from_path() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) _import_module_if_available() (in module core.transformer.custom_layers.batch_invariant_kernels) _import_trigger (in module core.dist_checkpointing.strategies.two_stage) _INDEX_HEADER (in module core.datasets.indexed_dataset) _IndexReader (class in core.datasets.indexed_dataset) _IndexWriter (class in core.datasets.indexed_dataset) _indices_to_multihot() (core.transformer.moe.token_dispatcher._DeepepManager method) _indices_to_multihot_kernel() (in module core.fusions.fused_indices_converter) _init_dist_index() (core.distributed.fsdp.mcore_fsdp_adapter.FullyShardedDataParallel method) _init_distributed_params() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) _init_dynamic_sampling_tensors() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _init_each_parameter_group_buffers() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) _init_fsdp_param_and_grad_buffer() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) _init_moe_expert_cache() (in module core.inference.utils) _init_optimizer_named_parameters() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) _init_optimizer_states_with_dummy_values() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _init_sequence_parallel_cache() (in module core.transformer.utils) _init_sub_optimizers() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _initialize_affine_weight_cpu() (in module core.tensor_parallel.layers) _initialize_affine_weight_gpu() (in module core.tensor_parallel.layers) _initialize_language_model() (core.models.mimo.model.base.MimoModel method) _initialize_metadata() (core.transformer.moe.token_dispatcher.MoEFlexTokenDispatcher method) _initialize_submodules() (core.models.mimo.model.base.MimoModel method) _insert_sharded_data() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict method) _INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP (in module core.parallel_state) _intersection() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) _INTRA_DISTRIBUTED_OPTIMIZER_INSTANCE_GROUP (in module core.parallel_state) _INTRA_PARTIAL_DATA_PARALLEL_GROUP_WITH_CP (in module core.parallel_state) _INTRA_PARTIAL_DATA_PARALLEL_GROUP_WITH_CP_GLOO (in module core.parallel_state) _INTRA_PARTIAL_EXPERT_DATA_PARALLEL_GROUP (in module core.parallel_state) _INTRA_PARTIAL_EXPERT_DATA_PARALLEL_GROUP_GLOO (in module core.parallel_state) _is_cuda() (in module core.inference.communication_utils) _is_cuda_contiguous() (in module core.inference.communication_utils) _IS_GRAPH_CAPTURING (in module core.transformer.cuda_graphs) _is_hollow (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict attribute) _is_in_embd_group() (core.models.common.language_module.language_module.LanguageModule method) _is_msc_path() (in module core.datasets.object_storage_utils) _is_raisable() (core.inference.async_stream.AsyncStream static method) _is_s3_path() (in module core.datasets.object_storage_utils) _is_supported_dtype_for_bik() (in module core.transformer.custom_layers.batch_invariant_kernels) _is_two_bucket_group_equal() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.FixedPoolAllocator method) _item_size() (in module core.dist_checkpointing.strategies.filesystem_async) _kernel_make_viewless_tensor() (in module core.utils) _key_config_attributes() (core.datasets.bert_dataset.BERTMaskedWordPieceDataset static method) (core.datasets.masked_dataset.MaskedWordPieceDataset static method) (core.datasets.megatron_dataset.MegatronDataset static method) (core.datasets.retro.query.multi_split_gpt_dataset.MultiSplitGPTDataset static method) (core.datasets.t5_dataset.T5MaskedWordPieceDataset static method) _KITCHEN_CONFIG_TYPE_KEY (in module core.extensions.kitchen) _layer_counts_match() (in module core.ssm.mamba_hybrid_layer_allocation) _layer_is_graphable() (in module core.transformer.cuda_graphs) _load_from_array() (in module core.dist_checkpointing.strategies.tensorstore) (in module core.dist_checkpointing.strategies.zarr) _load_from_state_dict() (core.models.common.embeddings.rotary_pos_embedding.RotaryEmbedding method) (core.transformer.moe.router.TopKRouter method) _load_regular_chunk() (in module core.dist_checkpointing.strategies.tensorstore) _load_rng_state_dict() (in module core.distributed.fsdp.mcore_fsdp_adapter) _load_scaling_factors() (core.export.trtllm.trtllm_helper.TRTLLMHelper method) _load_state_dict_hook_ignore_extra_state() (in module core.models.multimodal.llava_model) _load_state_dict_hook_ignore_param_names() (in module core.models.multimodal.llava_model) _LocalMetadata (in module core.dist_checkpointing.validation) _log_parameter_groups() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) _log_softmax_batch_invariant() (in module core.transformer.custom_layers.batch_invariant_kernels) _log_softmax_kernel() (in module core.transformer.custom_layers.batch_invariant_kernels) _log_validation_error_to_file() (core.rerun_state_machine.RerunStateMachine method) _logged_deprecations (in module core.dist_checkpointing.mapping) _LOGGER (in module core.transformer.custom_layers.batch_invariant_kernels) _maintain_float32_expert_bias() (core.transformer.moe.router.TopKRouter method) _make_backward_post_hook() (core.distributed.distributed_data_parallel.DistributedDataParallel method) _make_forward_pre_hook() (core.distributed.distributed_data_parallel.DistributedDataParallel method) _mamba_ssm_version (in module core.utils) _matches() (in module core.optimizer) _matmul_launch_metadata() (in module core.transformer.custom_layers.batch_invariant_kernels) _matmul_reduce_scatter() (core.tensor_parallel.inference_layers.InferenceRowParallelLinear method) _MAX_DATA_DIM (in module core.tensor_parallel.data) _maybe_allocate_symmetric_buffer() (core.tensor_parallel.inference_layers.InferenceLayerNormColumnParallelLinear method) _maybe_dtoh_and_synchronize() (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) _maybe_report_stats() (core.rerun_state_machine.RerunStateMachine method) _maybe_update_cuda_sync_point() (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) _MEG_TE_GENERAL_GEMM_ORIG (in module core.transformer.custom_layers.batch_invariant_kernels) _metadata_fn (in module core.dist_checkpointing.strategies.torch) _min_max() (core.utils.StragglerDetector method) _MMapBinReader (class in core.datasets.indexed_dataset) _mod (in module core.inference.unified_memory) _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS (in module core.tensor_parallel.layers) _MODEL_PARALLEL_GLOBAL_RANKS (in module core.parallel_state) _MODEL_PARALLEL_GROUP (in module core.parallel_state) _MODEL_PARALLEL_RNG_TRACKER_NAME (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.tensor_parallel.random) _MOE_LAYER_WISE_LOGGING_TRACKER (in module core.transformer.moe.moe_utils) _move_book_keeping_tensors() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) _move_new_state_to_right_device() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _MPU_DATA_PARALLEL_RANK (in module core.parallel_state) _MPU_DATA_PARALLEL_WORLD_SIZE (in module core.parallel_state) _MPU_EXPERT_MODEL_PARALLEL_RANK (in module core.parallel_state) _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE (in module core.parallel_state) _MPU_EXPERT_TENSOR_PARALLEL_RANK (in module core.parallel_state) _MPU_EXPERT_TENSOR_PARALLEL_WORLD_SIZE (in module core.parallel_state) _MPU_PIPELINE_MODEL_PARALLEL_RANK (in module core.parallel_state) _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE (in module core.parallel_state) _MPU_TENSOR_MODEL_PARALLEL_RANK (in module core.parallel_state) _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE (in module core.parallel_state) _multi_tensor_copy_this_to_that() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.optimizer.optimizer) _multihot_to_indices_kernel() (in module core.fusions.fused_indices_converter) _MultiStorageClientBinReader (class in core.datasets.indexed_dataset) _new_bucket() (core.distributed.param_and_grad_buffer._ParamAndGradBuffer method) _norm() (core.transformer.torch_norm.L2Norm method) _notify_cond_for_new_request() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) _nvtx_decorator_get_func_path() (in module core.utils) _nvtx_enabled (in module core.utils) _nvtx_range_get_func_path() (in module core.utils) _nvtx_range_messages (in module core.utils) _offset_slice() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) _open_zarr_array_verbose() (in module core.dist_checkpointing.strategies.zarr) _order_dims() (core.hyper_comm_grid.HyperCommGrid method) _p2p_ops() (in module core.pipeline_parallel.p2p_communication) _p_assert() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) _pad() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) _pad_routing_map() (core.transformer.moe.token_dispatcher._DeepepManager method) _pad_routing_map_kernel() (in module core.fusions.fused_pad_routing_map) _pad_tensor_for_quantization() (core.transformer.moe.experts.SequentialMLP method) _PAD_TOKEN_ID (in module core.datasets.megatron_dataset) _param2group_meta_to_param_groups() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _param_generator() (in module core.optimizer.cpu_offloading.hybrid_optimizer) _param_groups_to_param2group_meta() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _param_name() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _ParamAndGradBucket (class in core.distributed.param_and_grad_buffer) _ParamAndGradBucketGroup (class in core.distributed.param_and_grad_buffer) _ParamAndGradBuffer (class in core.distributed.param_and_grad_buffer) _PIPELINE_GLOBAL_RANKS (in module core.parallel_state) _PIPELINE_MODEL_PARALLEL_GROUP (in module core.parallel_state) _POSITION_EMBEDDING_GLOBAL_RANKS (in module core.parallel_state) _POSITION_EMBEDDING_GROUP (in module core.parallel_state) _postprocess() (core.models.gpt.gpt_model.GPTModel method) (core.transformer.multi_token_prediction.MultiTokenPredictionLayer method) _preprocess() (core.models.gpt.gpt_model.GPTModel method) _preprocess_data() (core.models.multimodal.llava_model.LLaVAModel method) _process_embedding_token_parallel() (core.models.multimodal.llava_model.LLaVAModel method) _process_memory() (in module core.dist_checkpointing.strategies.filesystem_async) _proj_and_transformer_layer() (core.transformer.multi_token_prediction.MultiTokenPredictionLayer method) _query_document_sample_shuffle_indices() (core.datasets.gpt_dataset.GPTDataset method) _recompute() (core.tensor_parallel.random.CheckpointWithoutOutput method) _recv_task() (core.inference.inference_client.InferenceClient method) _redo_attention_load_balancing() (in module core.ssm.mamba_context_parallel) _reduce() (in module core.tensor_parallel.mappings) _reduce_any() (core.rerun_state_machine.RerunStateMachine method) _reduce_scatter_along_first_dim() (in module core.tensor_parallel.mappings) _reduce_scatter_along_last_dim() (in module core.tensor_parallel.mappings) _ReduceFromModelParallelRegion (class in core.tensor_parallel.mappings) _ReduceScatterToSequenceParallelRegion (class in core.tensor_parallel.mappings) _ReduceScatterToTensorParallelRegion (class in core.tensor_parallel.mappings) _ReduceScatterWithFP32AccumulationWorkHandle (class in core.distributed.reduce_scatter_with_fp32_accumulation) _reestablish_shared_weights() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) _register_fsdp_hooks() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) _register_load_state_dict_hooks() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _register_param_copy_back_gpu_hook() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _relative_position_bucket() (core.models.common.embeddings.relative_pos_embedding.RelativePositionEmbedding method) _release_state() (core.models.gpt.fine_grained_callables.TransformerLayerNode method) (core.pipeline_parallel.utils.ScheduleNode method) _remove_msc_prefix() (in module core.datasets.object_storage_utils) _remove_redundant_data() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict static method) _remove_s3_prefix() (in module core.datasets.object_storage_utils) _replace_module_parameter() (in module core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp) _replace_param_with_distributed_if_needed() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) _replace_param_with_raw_if_needed() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) _replace_sharded_keys_with_state_dict_keys() (in module core.dist_checkpointing.strategies.torch) _replace_state_dict_keys_with_sharded_keys() (in module core.dist_checkpointing.strategies.torch) _report_quantize_tensor_info() (core.post_training.modelopt.layers.RealQuantTransformerLayer method) _reset_parameters() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) _reshard_if_dtensor() (in module core.distributed.finalize_model_grads) _resolve_callable_from_python_import_path() (in module core.fp8_utils) _restore_common_per_param_step() (core.optimizer.optimizer.MegatronOptimizer static method) _restore_dict_types() (in module core.dist_checkpointing.strategies.torch) _restore_model() (core.tokenizers.text.text_tokenizer.MegatronTokenizerText method) _restore_state() (core.rerun_state_machine.RerunStateMachine method) _results_queue (in module core.dist_checkpointing.strategies.filesystem_async) _roll_tensor_packed_seq() (in module core.transformer.multi_token_prediction) _rotate_half() (in module core.models.common.embeddings.rope_utils) _round() (in module core.num_microbatches_calculator) _s3_download_file() (in module core.datasets.object_storage_utils) _s3_object_exists() (in module core.datasets.object_storage_utils) _S3BinReader (class in core.datasets.indexed_dataset) _safe_get_rank() (in module core.rerun_state_machine) _sanitize_data_iterators() (core.rerun_state_machine.RerunStateMachine method) _sanity_check_attention_and_get_attn_mask_dimension() (core.models.bert.bert_model.BertModel method) _save_state() (core.rerun_state_machine.RerunStateMachine method) _save_to_existing_array() (in module core.dist_checkpointing.strategies.zarr) _save_to_state_dict() (core.transformer.moe.router.TopKRouter method) _ScatterToModelParallelRegion (class in core.tensor_parallel.mappings) _ScatterToSequenceParallelRegion (class in core.tensor_parallel.mappings) _select_layers_for_pipeline_parallel() (core.ssm.mamba_block.MambaStack method) _send_signal_to_engines() (core.inference.inference_client.InferenceClient method) _sequence_parallel_attr_cache (in module core.transformer.utils) _sequence_pointers() (core.datasets.indexed_dataset._IndexWriter method) _set_all_rng_states() (in module core.tensor_parallel.random) _set_attention_backend() (core.models.common.language_module.language_module.LanguageModule method) _set_capture_end() (in module core.transformer.cuda_graphs) _set_capture_start() (in module core.transformer.cuda_graphs) _set_cos_sin_cache() (core.models.common.embeddings.yarn_rotary_pos_embedding.YarnRotaryEmbedding method) _set_cuda_rng_state() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.tensor_parallel.random) _set_fc2_next_layer_norm_weights() (core.transformer.transformer_layer.TransformerLayer method) _set_fc2_residual() (core.transformer.transformer_layer.TransformerLayer method) _set_global_memory_buffer() (in module core.parallel_state) _set_global_symmetric_memory_buffer() (in module core.parallel_state) _set_main_param_and_optimizer_states() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _set_next_layer_norm_weights() (core.tensor_parallel.inference_layers.InferenceRowParallelLinear method) _set_param_groups() (core.optimizer.optimizer.MegatronOptimizer method) _set_proj_next_layer_norm_weights() (core.transformer.transformer_layer.TransformerLayer method) _set_proj_residual() (core.transformer.transformer_layer.TransformerLayer method) _set_rerun_state_machine() (in module core.rerun_state_machine) _set_residual() (core.tensor_parallel.inference_layers.InferenceRowParallelLinear method) _set_state() (core.optimizer.optimizer.MegatronOptimizer method) _set_sub_optimizer_grads() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _setup_fused_tp_communication() (core.transformer.transformer_block.TransformerBlock method) _shard_size() (in module core.dist_checkpointing.exchange_utils) _sharded_object_id() (in module core.dist_checkpointing.utils) _sharded_state_dict_grouped() (core.extensions.kitchen.KitchenGroupedLinear method) _sharded_tensor_shard_id() (in module core.dist_checkpointing.utils) _sharded_tensors (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict property) _ShardedTensorMetadata (class in core.dist_checkpointing.strategies.two_stage) _ShardId (in module core.dist_checkpointing.utils) _should_call_local_cudagraph() (core.ssm.mamba_layer.MambaLayer method) (core.transformer.module.GraphableMegatronModule method) (core.transformer.transformer_block.TransformerBlock method) (core.transformer.transformer_layer.TransformerLayer method) _should_call_te_cudagraph() (core.transformer.module.GraphableMegatronModule method) _should_create_array() (in module core.dist_checkpointing.strategies.zarr) _slice_conv_param() (core.ssm.mamba_context_parallel.MambaContextParallel method) _slice_vector_param() (core.ssm.mamba_context_parallel.MambaContextParallel method) _split_along_first_dim() (in module core.tensor_parallel.mappings) _split_along_last_dim() (in module core.tensor_parallel.mappings) _split_by_separation_hint() (in module core.dist_checkpointing.strategies.filesystem_async) _split_by_size_and_type() (in module core.dist_checkpointing.strategies.filesystem_async) _split_extra_state() (core.extensions.kitchen.KitchenGroupedLinear method) _split_state_dict() (core.optimizer.optimizer.ChainedOptimizer method) _split_tensor_at_batch_dim() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) _split_tensor_factory() (in module core.ssm.mamba_mixer) _squared_relu_back() (in module core.fusions.fused_weighted_squared_relu) _start_capturing() (core.transformer.cuda_graphs.TECudaGraphHelper method) _StragglerData (class in core.utils) _swap_book_keeping_tensors() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) _sync_hdo_param_groups_to_sub_optimizers() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _sync_hdo_state_to_sub_optimizers() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _sync_prompt_logprobs_fields() (core.inference.sampling_params.SamplingParams method) _sync_sub_optimizers_state_to_hdo() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _synchronize_steps() (core.optimizer.optimizer.ChainedOptimizer method) _TE_CONFIG_TYPE_KEY (in module core.extensions.transformer_engine) _te_cuda_graph_capture() (core.transformer.module.GraphableMegatronModule method) (core.transformer.transformer_layer.TransformerLayer method) _te_cuda_graph_replay() (core.ssm.mamba_layer.MambaLayer method) (core.transformer.module.GraphableMegatronModule method) (core.transformer.transformer_layer.TransformerLayer method) _TE_GEMM_FUNC_ORIGS (in module core.transformer.custom_layers.batch_invariant_kernels) _TE_GENERAL_GEMM_ORIG (in module core.transformer.custom_layers.batch_invariant_kernels) _te_general_gemm_patched() (in module core.transformer.custom_layers.batch_invariant_kernels) _te_patch_for_batch_invariant() (in module core.transformer.custom_layers.batch_invariant_kernels) _te_rms_norm_kernel() (in module core.tensor_parallel.inference_layers) _te_rmsnorm_forward_patched() (in module core.transformer.custom_layers.batch_invariant_kernels) _TE_RMSNORM_FUNC_ORIGS (in module core.transformer.custom_layers.batch_invariant_kernels) _TE_RMSNORM_ORIG_FWD (in module core.transformer.custom_layers.batch_invariant_kernels) _te_unpatch_for_batch_invariant() (in module core.transformer.custom_layers.batch_invariant_kernels) _te_version (in module core.utils) _temporarily_bypass_shape_validation() (core.dist_checkpointing.strategies.torch.MCoreLoadPlanner method) _TENSOR_AND_CONTEXT_PARALLEL_GROUP (in module core.parallel_state) _TENSOR_AND_DATA_PARALLEL_GROUP (in module core.parallel_state) _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP (in module core.parallel_state) _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS (in module core.parallel_state) _TENSOR_MODEL_PARALLEL_GROUP (in module core.parallel_state) _text_to_ids() (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) _text_to_ids_extra_space() (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) _tokenize_prompts_and_batch() (in module core.inference.text_generation_server.tokenization) _torch_sampling_func() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _train() (core.datasets.retro.index.indexes.faiss_base.FaissBaseIndex method) _train_index() (in module core.datasets.retro.index.build) _undo_attention_load_balancing() (in module core.ssm.mamba_context_parallel) _unscale_main_grads_and_check_for_nan() (core.optimizer.optimizer.MixedPrecisionOptimizer method) _unshard_if_dtensor() (in module core.distributed.finalize_model_grads) _unwrap_pyt_sharded_tensor() (in module core.dist_checkpointing.strategies.torch) _update_fp32_params_by_new_state() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _update_legacy_world_tensors() (core.optimizer.distrib_optimizer.DistributedOptimizer class method) _update_router_expert_bias() (in module core.distributed.finalize_model_grads) _update_top_n_logprobs_dict() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _validate_common_state_dict() (in module core.dist_checkpointing.validation) _validate_global_shapes() (core.dist_checkpointing.strategies.torch.MCoreLoadPlanner method) _validate_objects_for_key() (in module core.dist_checkpointing.validation) _validate_params() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict static method) _validate_sharding_for_key() (in module core.dist_checkpointing.validation) _ValueWithRank (class in core.utils) _VERBOSE (in module core.datasets.blended_dataset) _version_no_greater_than() (in module core.ssm.triton_cache_manager) _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK (in module core.parallel_state) _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE (in module core.parallel_state) _VocabParallelCrossEntropy (class in core.fusions.fused_cross_entropy) (class in core.tensor_parallel.cross_entropy) _wrapped_run_engine() (core.inference.engines.static_engine.StaticInferenceEngine method) _yarn_find_correction_dim() (in module core.models.common.embeddings.yarn_rotary_pos_embedding) _yarn_find_correction_range() (in module core.models.common.embeddings.yarn_rotary_pos_embedding) _yarn_get_concentration_factor() (in module core.models.common.embeddings.yarn_rotary_pos_embedding) _yarn_get_concentration_factor_from_config() (in module core.models.common.embeddings.yarn_rotary_pos_embedding) _yarn_get_mscale() (in module core.models.common.embeddings.yarn_rotary_pos_embedding) _yarn_linear_ramp_mask() (in module core.models.common.embeddings.yarn_rotary_pos_embedding) _zero_grad_group_helper() (in module core.optimizer.optimizer) A abort_request() (core.inference.scheduler.Scheduler method) AbstractEngine (class in core.inference.engines.abstract_engine) AbstractModelInferenceWrapper (class in core.inference.model_inference_wrappers.abstract_model_inference_wrapper) AbstractSchedulePlan (class in core.pipeline_parallel.utils) account_for_embedding_in_pipeline_split (core.transformer.transformer_config.TransformerConfig attribute) account_for_loss_in_pipeline_split (core.transformer.transformer_config.TransformerConfig attribute) activation_func (core.transformer.mlp.MLPSubmodules attribute) (core.transformer.transformer_config.TransformerConfig attribute) activation_func() (core.extensions.kitchen.KitchenSpecProvider method) (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.BackendSpecProvider method) (core.models.backends.InferenceSpecProvider method) (core.models.backends.LocalSpecProvider method) activation_func_clamp_value (core.transformer.transformer_config.TransformerConfig attribute) activation_func_fp8_input_store (core.transformer.transformer_config.TransformerConfig attribute) ActivationFuncName (in module core.transformer.moe.upcycling_utils) ACTIVE_AND_GENERATING_TOKENS (core.inference.inference_request.Status attribute) ACTIVE_BUT_NOT_GENERATING_TOKENS (core.inference.inference_request.Status attribute) active_time() (core.timers.DummyTimer method) (core.timers.Timer method) ActiveRequestCountOverflowError adam_beta1 (core.optimizer.optimizer_config.AdamOptimizerConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) adam_beta2 (core.optimizer.optimizer_config.AdamOptimizerConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) adam_eps (core.optimizer.optimizer_config.AdamOptimizerConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) AdamOptimizerConfig (class in core.optimizer.optimizer_config) ADD (core.inference.inference_request.DynamicInferenceEventType attribute) add() (core.datasets.retro.index.index.Index method) (core.datasets.retro.index.indexes.faiss_base.FaissBaseIndex method) (core.datasets.retro.index.indexes.faiss_par_add.FaissParallelAddIndex method) (core.extensions.transformer_engine.TECudaRNGStatesTracker method) (core.tensor_parallel.random.CudaRNGStatesTracker method) add_attributes() (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig method) (core.inference.sampling_params.SamplingParams method) add_bias_linear (core.transformer.transformer_config.TransformerConfig attribute) add_BOS (core.inference.sampling_params.SamplingParams attribute) add_codes() (core.datasets.retro.index.indexes.faiss_par_add.FaissParallelAddIndex method) add_document() (core.datasets.indexed_dataset.IndexedDatasetBuilder method) add_dummy_requests_for_cudagraph_capture() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) add_dummy_requests_parallel() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) add_earliest_waiting_request_to_active_pool() (core.inference.scheduler.Scheduler method) add_event() (core.inference.inference_request.DynamicInferenceRequest method) add_event_add() (core.inference.inference_request.DynamicInferenceRequest method) add_event_error_nontransient() (core.inference.inference_request.DynamicInferenceRequest method) add_event_error_transient() (core.inference.inference_request.DynamicInferenceRequest method) add_event_fail() (core.inference.inference_request.DynamicInferenceRequest method) add_event_finish() (core.inference.inference_request.DynamicInferenceRequest method) add_event_pause() (core.inference.inference_request.DynamicInferenceRequest method) add_extra_token_to_sequence (core.datasets.gpt_dataset.GPTDatasetConfig attribute) add_finalize_fn() (core.dist_checkpointing.strategies.async_utils.AsyncRequest method) add_index() (core.datasets.indexed_dataset.IndexedDatasetBuilder method) add_item() (core.datasets.indexed_dataset.IndexedDatasetBuilder method) add_prefix_for_sharding() (in module core.dist_checkpointing.utils) add_qkv_bias (core.transformer.transformer_config.TransformerConfig attribute) add_request() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) (core.inference.engines.static_engine.StaticInferenceEngine method) (core.inference.inference_client.InferenceClient method) (core.inference.scheduler.Scheduler method) add_special_tokens() (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract method) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText method) add_to_index() (in module core.datasets.retro.index.build) additional_special_tokens_ids (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) addmm_batch_invariant() (in module core.transformer.custom_layers.batch_invariant_kernels) adjust_batch_dims_for_expert_parallelism() (core.inference.batch_dimensions_utils.InferenceBatchDimensions static method) adjust_non_strict_load() (in module core.dist_checkpointing.validation) advance() (core.rerun_state_machine.RerunDataIterator method) aflops (core.utils._StragglerData attribute) align_embeddings_by_token_positions() (core.models.mimo.model.base.MimoModel method) align_param_gather (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) all_gather_and_wait_parameters_ready() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) all_gather_last_dim_from_tensor_parallel_region() (in module core.tensor_parallel.mappings) all_gather_parameters() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) all_gather_params() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline method) all_ranks_for_shard (core.dist_checkpointing.exchange_utils.ShardDistribution attribute) all_reduce_gradients() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) all_reduce_max() (core.inference.engines.async_zmq_communicator.AsyncZMQCommunicator method) all_to_all() (in module core.tensor_parallel.mappings) all_to_all_hp2sp() (in module core.tensor_parallel.mappings) all_to_all_sp2hp() (in module core.tensor_parallel.mappings) AllGatherPipeline (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) allocate() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.FixedPoolAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.RotaryBucketAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.StorageResizeBasedBucketAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.TemporaryBucketAllocator method) allocate_all_tensors() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) allocate_layers() (in module core.ssm.mamba_hybrid_layer_allocation) allocate_memory_blocks() (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) allow_ambiguous_pad_tokens (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) allow_shape_mismatch (core.dist_checkpointing.mapping.ShardedTensor attribute) append_key_value_cache() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) apply() (core.fusions.fused_bias_gelu.GeLUFunction class method) apply_chat_template() (core.tokenizers.base_tokenizer.MegatronTokenizerBase method) (core.tokenizers.text.libraries.chat_template.MegatronTokenizerChatTemplate method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText method) apply_factories() (in module core.dist_checkpointing.mapping) apply_factory_merges() (in module core.dist_checkpointing.mapping) apply_fused_qk_rotary_emb() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) apply_input_jitter() (core.transformer.moe.router.TopKRouter method) apply_loading_parallelization() (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper method) apply_pos_enc() (core.models.vision.radio.RADIOViTModel method) apply_prefix_mapping() (in module core.dist_checkpointing.utils) apply_query_key_layer_scaling (core.transformer.transformer_config.TransformerConfig attribute) apply_random_logits() (in module core.transformer.moe.moe_utils) apply_residual_connection_post_layernorm (core.transformer.transformer_config.TransformerConfig attribute) apply_rope_fusion (core.transformer.transformer_config.TransformerConfig attribute) apply_rotary_emb_key() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) apply_rotary_emb_query() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) apply_rotary_pos_emb() (in module core.models.common.embeddings.rope_utils) apply_rotary_pos_emb_with_cos_sin() (in module core.models.common.embeddings.rope_utils) apply_router_token_dropping() (in module core.transformer.moe.moe_utils) apply_saving_parallelization() (core.dist_checkpointing.strategies.fully_parallel.FullyParallelSaveStrategyWrapper method) apply_swiglu_sharded_factory() (in module core.transformer.mlp) apply_z_loss() (core.transformer.moe.router.TopKRouter method) ApplyMLARotaryEmbKV (class in core.fusions.fused_mla_yarn_rope_apply) ApplyMLARotaryEmbQ (class in core.fusions.fused_mla_yarn_rope_apply) arbitrary (core.transformer.enums.AttnMaskType attribute) ArgMetadata (class in core.transformer.cuda_graphs) arrival_time (core.inference.inference_request.InferenceRequest attribute) assert_grouped_gemm_is_available() (in module core.transformer.moe.grouped_gemm_util) assert_viewless_tensor() (in module core.utils) ASSUME_OK_UNEXPECTED (core.dist_checkpointing.validation.StrictHandling attribute) async_bookkeep() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) async_bucket_gather() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline method) async_caller (core.dist_checkpointing.strategies.async_utils._ActiveAsyncRequest attribute) async_calls (in module core.dist_checkpointing.strategies.base) async_fn (core.dist_checkpointing.strategies.async_utils.AsyncRequest attribute) async_fn_args (core.dist_checkpointing.strategies.async_utils.AsyncRequest attribute) async_fn_kwargs (core.dist_checkpointing.strategies.async_utils.AsyncRequest attribute) async_forward() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) async_generate_output_tokens_dynamic_batch() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) async_loop() (core.dist_checkpointing.strategies.async_utils.PersistentAsyncCaller static method) async_request (core.dist_checkpointing.strategies.async_utils._ActiveAsyncRequest attribute) async_save() (core.dist_checkpointing.strategies.base.AsyncSaveShardedStrategy method) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelSaveStrategyWrapper method) (core.dist_checkpointing.strategies.torch.TorchDistSaveShardedStrategy method) async_step() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) async_tensor_model_parallel_allreduce (core.model_parallel_config.ModelParallelConfig attribute) AsyncCaller (class in core.dist_checkpointing.strategies.async_utils) AsyncCallsQueue (class in core.dist_checkpointing.strategies.async_utils) AsyncRequest (class in core.dist_checkpointing.strategies.async_utils) AsyncSaveShardedStrategy (class in core.dist_checkpointing.strategies.base) AsyncStream (class in core.inference.async_stream) AsyncZMQCommunicator (class in core.inference.engines.async_zmq_communicator) attach_and_log_load_balancing_loss() (core.transformer.moe.router.TopKRouter method) attach_grad_to_optimizer_state() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) Attention (class in core.transformer.attention) ATTENTION (core.ssm.mamba_hybrid_layer_allocation.Symbols attribute) attention_backend (core.transformer.transformer_config.TransformerConfig attribute) attention_dense_bias (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) attention_dense_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) attention_dropout (core.transformer.transformer_config.TransformerConfig attribute) attention_layer (core.ssm.mamba_block.MambaStackSubmodules attribute) attention_linear_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) attention_mask_func() (in module core.transformer.utils) attention_qkv_bias (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) attention_qkv_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) attention_softmax_in_fp32 (core.transformer.transformer_config.TransformerConfig attribute) AttentionBlockSize (in module core.transformer.custom_layers.batch_invariant_kernels) attn (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan attribute) AttnBackend (class in core.transformer.enums) AttnMaskType (class in core.transformer.enums) AttnType (class in core.transformer.enums) attr (core.optimizer.optimizer_config.ParamKey attribute) auto (core.transformer.enums.AttnBackend attribute) autocast_dtype (core.model_parallel_config.ModelParallelConfig attribute) AutoHuggingFaceModel (class in core.models.huggingface.module) average_in_collective (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) await_process_event() (in module core.inference.utils) axis_fragmentations (core.dist_checkpointing.mapping.ShardedTensor attribute) B BackendSpecProvider (class in core.models.backends) backward() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.RegisterFSDPBackwardFunction static method) (core.fusions.fused_bias_geglu.BiasGeGLUFunction static method) (core.fusions.fused_bias_geglu.GeGLUFunction static method) (core.fusions.fused_bias_geglu.WeightedBiasQuickGeGLUFunction static method) (core.fusions.fused_bias_geglu.WeightedQuickGeGLUFunction static method) (core.fusions.fused_bias_gelu.GeLUFunction static method) (core.fusions.fused_bias_swiglu.BiasSwiGLUFunction static method) (core.fusions.fused_bias_swiglu.SwiGLUFunction static method) (core.fusions.fused_bias_swiglu.WeightedSwiGLUFunction static method) (core.fusions.fused_cross_entropy._VocabParallelCrossEntropy static method) (core.fusions.fused_indices_converter.IndicesToMultihot static method) (core.fusions.fused_mla_yarn_rope_apply.ApplyMLARotaryEmbKV static method) (core.fusions.fused_mla_yarn_rope_apply.ApplyMLARotaryEmbQ static method) (core.fusions.fused_softmax.ScaledMaskedSoftmax static method) (core.fusions.fused_softmax.ScaledSoftmax static method) (core.fusions.fused_softmax.ScaledUpperTriangMaskedSoftmax static method) (core.fusions.fused_weighted_squared_relu.WeightedSquaredReLUFunction static method) (core.pipeline_parallel.utils.NoopScheduleNode method) (core.pipeline_parallel.utils.ScheduleNode method) (core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy static method) (core.tensor_parallel.layers.LinearWithFrozenWeight static method) (core.tensor_parallel.layers.LinearWithGradAccumulationAndAsyncCommunication static method) (core.tensor_parallel.mappings._AllGatherFromTensorParallelRegion static method) (core.tensor_parallel.mappings._AllToAll static method) (core.tensor_parallel.mappings._CopyToModelParallelRegion static method) (core.tensor_parallel.mappings._GatherFromModelParallelRegion static method) (core.tensor_parallel.mappings._GatherFromSequenceParallelRegion static method) (core.tensor_parallel.mappings._ReduceFromModelParallelRegion static method) (core.tensor_parallel.mappings._ReduceScatterToSequenceParallelRegion static method) (core.tensor_parallel.mappings._ReduceScatterToTensorParallelRegion static method) (core.tensor_parallel.mappings._ScatterToModelParallelRegion static method) (core.tensor_parallel.mappings._ScatterToSequenceParallelRegion static method) (core.tensor_parallel.random.CheckpointFunction static method) (core.tensor_parallel.random.CheckpointWithoutOutputFunction static method) (core.transformer.cuda_graphs._CudagraphRecordNode static method) (core.transformer.cuda_graphs._CudagraphReplayNode static method) (core.transformer.custom_layers.batch_invariant_kernels.BatchInvariantRMSNormFn static method) (core.transformer.custom_layers.batch_invariant_kernels.BatchInvariantTEGemmFn static method) (core.transformer.moe.fused_a2a.FusedCombine static method) (core.transformer.moe.fused_a2a.FusedDispatch static method) (core.transformer.moe.fused_a2a.HybridEPCombine static method) (core.transformer.moe.fused_a2a.HybridEPDispatch static method) (core.transformer.moe.moe_utils.MoEAuxLossAutoScaler static method) (core.transformer.moe.moe_utils.RandomSTE static method) (core.transformer.moe.moe_utils.RouterGatingLinearFunction static method) (core.transformer.multi_token_prediction.MTPLossAutoScaler static method) (core.utils.MakeViewlessTensor static method) backward_dw() (core.extensions.transformer_engine.TEColumnParallelLinear method) (core.extensions.transformer_engine.TELayerNormColumnParallelLinear method) (core.extensions.transformer_engine.TELinear method) (core.extensions.transformer_engine.TERowParallelLinear method) (core.models.gpt.fine_grained_callables.TransformerLayerNode method) (core.transformer.attention.SelfAttention method) (core.transformer.mlp.MLP method) (core.transformer.moe.experts.GroupedMLP method) (core.transformer.moe.experts.SequentialMLP method) (core.transformer.moe.experts.TEGroupedMLP method) (core.transformer.moe.moe_layer.MoELayer method) (core.transformer.multi_latent_attention.MLASelfAttention method) backward_impl() (core.models.gpt.fine_grained_callables.TransformerLayerNode method) BACKWARD_PASS_ORDER (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.PrefetchOrder attribute) backward_step() (in module core.pipeline_parallel.schedules) barrier_with_L1_time (core.model_parallel_config.ModelParallelConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) BaseInferenceContext (class in core.inference.contexts.base_context) BaseMoELayer (class in core.transformer.moe.moe_layer) BaseRetroCrossAttention (class in core.models.retro.base_attention) BaseTransformerLayer (class in core.transformer.transformer_layer) batch_invariant_mode (core.transformer.transformer_config.TransformerConfig attribute) batch_p2p_comm (core.model_parallel_config.ModelParallelConfig attribute) batch_p2p_sync (core.model_parallel_config.ModelParallelConfig attribute) BatchInvariantRMSNormFn (class in core.transformer.custom_layers.batch_invariant_kernels) BatchInvariantTEGemmFn (class in core.transformer.custom_layers.batch_invariant_kernels) bert (core.datasets.retro.config.tokenizers.RetroTokenizers attribute) bert_extended_attention_mask() (core.models.bert.bert_model.BertModel method) bert_layer_local_spec (in module core.models.bert.bert_layer_specs) bert_position_ids() (core.models.bert.bert_model.BertModel method) BertLMHead (class in core.models.bert.bert_lm_head) BERTMaskedWordPieceDataset (class in core.datasets.bert_dataset) BERTMaskedWordPieceDatasetConfig (class in core.datasets.bert_dataset) BertModel (class in core.models.bert.bert_model) BertTokenizer (class in core.tokenizers.text.models.bert_tokenizer) beta_fast (core.transformer.transformer_config.MLATransformerConfig attribute) beta_slow (core.transformer.transformer_config.MLATransformerConfig attribute) bf16 (core.model_parallel_config.ModelParallelConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) bias_activation_fusion (core.transformer.transformer_config.TransformerConfig attribute) bias_dropout_add_fused_inference() (in module core.fusions.fused_bias_dropout) bias_dropout_add_fused_train() (in module core.fusions.fused_bias_dropout) bias_dropout_add_unfused() (in module core.fusions.fused_bias_dropout) bias_dropout_fusion (core.transformer.transformer_config.TransformerConfig attribute) bias_geglu() (in module core.fusions.fused_bias_geglu) bias_geglu_back() (in module core.fusions.fused_bias_geglu) bias_geglu_impl() (in module core.fusions.fused_bias_geglu) bias_gelu() (in module core.fusions.fused_bias_gelu) bias_gelu_back() (in module core.fusions.fused_bias_gelu) bias_gelu_impl (in module core.fusions.fused_bias_gelu) bias_swiglu() (in module core.fusions.fused_bias_swiglu) bias_swiglu_back() (in module core.fusions.fused_bias_swiglu) bias_swiglu_impl() (in module core.fusions.fused_bias_swiglu) BiasGeGLUFunction (class in core.fusions.fused_bias_geglu) BiasSwiGLUFunction (class in core.fusions.fused_bias_swiglu) bin_chunk_nbytes (core.datasets.object_storage_utils.ObjectStorageConfig attribute) blend (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) blend_per_split (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) BlendedDataset (class in core.datasets.blended_dataset) BlendedMegatronDatasetBuilder (class in core.datasets.blended_megatron_dataset_builder) BlendedMegatronDatasetConfig (class in core.datasets.blended_megatron_dataset_config) Block (class in core.datasets.retro.utils) BlockAllocator (class in core.inference.contexts.dynamic_block_allocator) BlockOverflowError BlockPathMap (class in core.datasets.retro.utils) blockwise (core.enums.Fp8Recipe attribute) BlockwiseFP8WeightTransformerLayer (class in core.post_training.modelopt.layers) bos (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) bos_id (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract property) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) BridgeCommunicator (class in core.pipeline_parallel.bridge_communicator) broadcast_data() (in module core.tensor_parallel.data) broadcast_float_list() (in module core.inference.communication_utils) broadcast_from_last_pipeline_stage() (in module core.inference.communication_utils) broadcast_int_list() (in module core.inference.communication_utils) broadcast_list() (in module core.inference.communication_utils) broadcast_params() (core.distributed.data_parallel_base._BaseDataParallel method) (core.distributed.distributed_data_parallel.DistributedDataParallel method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) broadcast_tensor() (in module core.inference.communication_utils) Bucket (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) bucket_size (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) BucketIndex (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) BucketingPolicy (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) BucketStatus (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) BufferType (class in core.distributed.param_and_grad_buffer) build() (core.datasets.blended_megatron_dataset_builder.BlendedMegatronDatasetBuilder method) (core.dist_checkpointing.mapping.ShardedTensorFactory method) build_and_save_engine() (core.export.trtllm.engine_builder.trtllm_engine_builder.TRTLLMEngineBuilder static method) (core.export.trtllm.trtllm_helper.TRTLLMHelper method) build_block_db() (in module core.datasets.retro.db.build) build_comm_map() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) build_cpu_optimizer_list() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer static method) build_data_parallel_buffer_index() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) build_db() (in module core.datasets.retro.db.build) build_fn (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) build_generic_dataset() (core.datasets.blended_megatron_dataset_builder.BlendedMegatronDatasetBuilder static method) build_gpt_chunk_datasets_from_gpt_datasets() (in module core.datasets.retro.query.gpt_chunk_dataset) build_hf_model() (in module core.models.huggingface.module) build_index() (in module core.datasets.retro.index.build) build_individual_db() (in module core.datasets.retro.db.build) build_individual_dbs() (in module core.datasets.retro.db.build) build_layer_callables() (in module core.models.gpt.fine_grained_callables) build_low_level_dataset() (core.datasets.gpt_dataset.GPTDataset static method) (core.datasets.gpt_dataset.MockGPTDataset static method) (core.datasets.masked_dataset.MaskedWordPieceDataset static method) (core.datasets.megatron_dataset.MegatronDataset static method) build_merged_dbs() (in module core.datasets.retro.db.build) build_module() (in module core.transformer.spec_utils) build_mtp_layer_callables() (in module core.models.gpt.fine_grained_callables) build_partial_db() (in module core.datasets.retro.db.build) build_sample_idx() (in module core.datasets.helpers) build_schedule_plan() (core.models.gpt.gpt_model.GPTModel method) build_transformer_layer_callables() (in module core.models.gpt.fine_grained_callables) bwd_mempool (core.transformer.cuda_graphs.CudaGraphManager attribute) BWD_READY (core.transformer.cuda_graphs._GraphStatus attribute) ByteLevelTokenizer (class in core.tokenizers.text.libraries.bytelevel_tokenizer) C cache_index_file() (in module core.datasets.object_storage_utils) cache_mla_latents (core.transformer.transformer_config.MLATransformerConfig attribute) CachedMetadataFileSystemReader (class in core.dist_checkpointing.strategies.cached_metadata_filesystem_reader) calculate_cross_entropy_loss() (core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy static method) (in module core.fusions.fused_cross_entropy) calculate_gradients() (core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy static method) (in module core.fusions.fused_cross_entropy) calculate_log_probs() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) calculate_logits_max() (core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy static method) (in module core.fusions.fused_cross_entropy) calculate_per_token_loss (core.transformer.transformer_config.TransformerConfig attribute) calculate_predicted_logits() (core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy static method) (in module core.fusions.fused_cross_entropy) Call (class in core.rerun_state_machine) call_ddp_preforward_hook() (core.transformer.cuda_graphs.CudaGraphManager method) call_idx (core.dist_checkpointing.strategies.async_utils.AsyncRequest attribute) Caller (class in core.rerun_state_machine) caller (core.rerun_state_machine.Call attribute) can_handle_sharded_objects (core.dist_checkpointing.strategies.base.LoadStrategyBase property) (core.dist_checkpointing.strategies.base.SaveStrategyBase property) (core.dist_checkpointing.strategies.common.TorchCommonLoadStrategy property) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper property) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelSaveStrategyWrapper property) can_handle_sharded_objects() (core.dist_checkpointing.strategies.common.TorchCommonSaveStrategy method) (core.dist_checkpointing.strategies.torch.TorchDistLoadShardedStrategy method) (core.dist_checkpointing.strategies.torch.TorchDistSaveShardedStrategy method) causal (core.transformer.enums.AttnMaskType attribute) causal_bottom_right (core.transformer.enums.AttnMaskType attribute) ChainedOptimizer (class in core.optimizer.optimizer) check_availability() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) check_backend_compatibility() (core.dist_checkpointing.strategies.base.LoadStrategyBase method) (core.dist_checkpointing.strategies.common.TorchCommonLoadStrategy method) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper method) (core.dist_checkpointing.strategies.tensorstore.TensorStoreLoadShardedStrategy method) (core.dist_checkpointing.strategies.torch.TorchDistLoadShardedStrategy method) (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) (core.dist_checkpointing.strategies.zarr.ZarrLoadShardedStrategy method) check_first_val_step() (in module core.pipeline_parallel.schedules) check_for_large_grads (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) check_for_nan_in_grad (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) check_gpu_memory() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) check_grads() (core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup method) check_is_distributed_checkpoint() (in module core.dist_checkpointing.core) check_mamba_sequence_packing_support() (in module core.utils) check_param_hashes_across_dp_replicas() (in module core.utils) check_version_compatibility() (core.dist_checkpointing.strategies.base.LoadStrategyBase method) (core.dist_checkpointing.strategies.common.TorchCommonLoadStrategy method) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper method) (core.dist_checkpointing.strategies.tensorstore.TensorStoreLoadShardedStrategy method) (core.dist_checkpointing.strategies.torch.TorchDistLoadShardedStrategy method) (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) (core.dist_checkpointing.strategies.zarr.ZarrLoadShardedStrategy method) checkpoint() (core.tensor_parallel.random.CheckpointWithoutOutput method) (in module core.tensor_parallel.random) checkpoint_fully_reshardable_formats (core.optimizer.distrib_optimizer.DistributedOptimizer attribute) checkpoint_id (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync property) CheckpointableShardedTensor (class in core.dist_checkpointing.strategies.checkpointable) CheckpointFunction (class in core.tensor_parallel.random) CheckpointingConfig (class in core.dist_checkpointing.core) CheckpointingException CheckpointWithoutOutput (class in core.tensor_parallel.random) CheckpointWithoutOutputFunction (class in core.tensor_parallel.random) chunk_size_factor (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) CkptShardedMetadata (in module core.dist_checkpointing.serialization) classification_head (core.datasets.bert_dataset.BERTMaskedWordPieceDatasetConfig attribute) clean_loss_in_tracker() (core.transformer.multi_token_prediction.MTPLossLoggingHelper method) clear_aux_losses_tracker() (in module core.transformer.moe.moe_utils) clear_embedding_activation_buffer() (in module core.pipeline_parallel.schedules) clip_grad (core.optimizer.optimizer_config.OptimizerConfig attribute) clip_grad_by_total_norm_fp32() (in module core.optimizer.clip_grads) clip_grad_norm() (core.optimizer.optimizer.MegatronOptimizer method) clip_qk() (core.transformer.attention.Attention method) (core.transformer.attention.SelfAttention method) (core.transformer.multi_latent_attention.MLASelfAttention method) (in module core.optimizer.qk_clip) CLIPViTModel (class in core.models.vision.clip_vit_model) clone_scatter_output_in_embedding (core.transformer.transformer_config.TransformerConfig attribute) clone_tensors_in_struct() (in module core.full_cuda_graph) close() (core.datasets.object_storage_utils.S3Client method) (core.dist_checkpointing.strategies.async_utils.AsyncCaller method) (core.dist_checkpointing.strategies.async_utils.AsyncCallsQueue method) (core.dist_checkpointing.strategies.async_utils.PersistentAsyncCaller method) (core.dist_checkpointing.strategies.async_utils.TemporalAsyncCaller method) (core.inference.engines.async_zmq_communicator.AsyncZMQCommunicator method) cls (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) cls_id (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) code_from_dtype() (core.datasets.indexed_dataset.DType class method) column_parallel_layer_norm_linear() (core.extensions.kitchen.KitchenSpecProvider method) (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.BackendSpecProvider method) (core.models.backends.InferenceSpecProvider method) (core.models.backends.LocalSpecProvider method) column_parallel_linear() (core.extensions.kitchen.KitchenSpecProvider method) (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.BackendSpecProvider method) (core.models.backends.InferenceSpecProvider method) (core.models.backends.LocalSpecProvider method) ColumnParallelLinear (class in core.tensor_parallel.layers) combine() (core.rerun_state_machine.QuickStats method) (core.transformer.moe.moe_layer.MoELayer method) (core.transformer.moe.token_dispatcher._DeepepManager method) (core.transformer.moe.token_dispatcher._DispatchManager method) (core.transformer.moe.token_dispatcher._HybridEPManager method) combine_postprocess() (core.transformer.moe.token_dispatcher.MoEAllGatherTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEFlexTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoETokenDispatcher method) combine_preprocess() (core.transformer.moe.token_dispatcher.MoEAllGatherTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEFlexTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoETokenDispatcher method) combined_1f1b_schedule_for_interleaved_pipelining() (in module core.pipeline_parallel.combined_1f1b) combined_1f1b_schedule_for_no_pipelining() (in module core.pipeline_parallel.combined_1f1b) combined_forward_backward_step() (in module core.pipeline_parallel.combined_1f1b) commit_tensor() (core.dist_checkpointing.strategies.torch.MCoreLoadPlanner method) common (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict attribute) common_backend (core.dist_checkpointing.core.CheckpointingConfig attribute) common_backend_version (core.dist_checkpointing.core.CheckpointingConfig attribute) common_state_dict (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict property) COMMON_STATE_FNAME (in module core.dist_checkpointing.strategies.common) CommonStateDict (in module core.dist_checkpointing.mapping) CommRole (class in core.pipeline_parallel.bridge_communicator) COMMUNICATING (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.BucketStatus attribute) COMPARISON_MATCH (in module core.rerun_state_machine) COMPARISON_MISMATCH (in module core.rerun_state_machine) CompilationState (class in core.inference.unified_memory) compile_allocator() (in module core.inference.unified_memory) compile_helpers() (in module core.datasets.utils) COMPLETED (core.inference.inference_request.Status attribute) COMPOUND_PARAMS (core.extensions.kitchen.KitchenConfigType attribute) CompoundParamsConfigSchema (class in core.extensions.kitchen) compute_language_model_loss() (core.models.common.language_module.language_module.LanguageModule method) compute_routing_scores_for_aux_loss() (in module core.transformer.moe.moe_utils) condition_init_method() (in module core.extensions.transformer_engine) config_attention_mask() (core.datasets.t5_dataset.T5MaskedWordPieceDataset static method) CONFIG_FNAME (in module core.dist_checkpointing.core) config_logger_dir (core.optimizer.optimizer_config.OptimizerConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) configs (core.extensions.kitchen.CompoundParamsConfigSchema attribute) configure() (core.utils.StragglerDetector method) configure_fused_tp_inference() (core.transformer.transformer_layer.TransformerLayer method) configure_nvtx_profiling() (in module core.utils) configured (core.utils.StragglerDetector property) CONNECT (core.inference.headers.Headers attribute) CONNECT_ACK (core.inference.headers.Headers attribute) ConstantGradScaler (class in core.optimizer.grad_scaler) ConstantNumMicroBatchesCalculator (class in core.num_microbatches_calculator) contains_submesh() (in module core.distributed.fsdp.src.megatron_fsdp.utils) context_parallel_size (core.model_parallel_config.ModelParallelConfig attribute) ContextErrorFactory (class in core.inference.contexts.dynamic_context) ContextOverflowError conv1d() (core.ssm.mamba_context_parallel.MambaContextParallel method) conv1d_channels() (core.ssm.mamba_context_parallel.MambaContextParallel method) conversion_helper() (in module core.transformer.module) convert() (core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter.DistributedTRTLLMModelWeightsConverter method) (core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.SingleDeviceTRTLLMModelWeightsConverter method) convert_schedule_table_to_order() (in module core.pipeline_parallel.schedules) convert_split_vector_to_split_matrix() (in module core.datasets.blended_megatron_dataset_config) copy_main_weights_to_model_weights() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) copy_model_weights_to_main_weights() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) copy_tensor_model_parallel_attributes() (in module core.tensor_parallel.layers) copy_tensors_in_struct() (in module core.full_cuda_graph) copy_tensors_to_cpu() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict method) copy_to_tensor_model_parallel_region() (in module core.tensor_parallel.mappings) core module core.activations module core.config module core.config_logger module core.datasets module core.datasets.bert_dataset module core.datasets.blended_dataset module core.datasets.blended_megatron_dataset_builder module core.datasets.blended_megatron_dataset_config module core.datasets.gpt_dataset module core.datasets.helpers module core.datasets.indexed_dataset module core.datasets.masked_dataset module core.datasets.megatron_dataset module core.datasets.megatron_tokenizer module core.datasets.multimodal_dataset module core.datasets.object_storage_utils module core.datasets.retro module core.datasets.retro.config module core.datasets.retro.config.bert_embedders module core.datasets.retro.config.config module core.datasets.retro.config.gpt_chunk_datasets module core.datasets.retro.config.tokenizers module core.datasets.retro.db module core.datasets.retro.db.build module core.datasets.retro.db.dataset module core.datasets.retro.db.utils module core.datasets.retro.external_libs module core.datasets.retro.index module core.datasets.retro.index.build module core.datasets.retro.index.factory module core.datasets.retro.index.index module core.datasets.retro.index.indexes module core.datasets.retro.index.indexes.faiss_base module core.datasets.retro.index.indexes.faiss_par_add module core.datasets.retro.index.utils module core.datasets.retro.index.validate module core.datasets.retro.query module core.datasets.retro.query.gpt_chunk_dataset module core.datasets.retro.query.multi_split_gpt_dataset module core.datasets.retro.query.query module core.datasets.retro.query.retro_dataset module core.datasets.retro.query.utils module core.datasets.retro.utils module core.datasets.t5_dataset module core.datasets.utils module core.datasets.utils_s3 module core.dist_checkpointing module core.dist_checkpointing.core module core.dist_checkpointing.dict_utils module core.dist_checkpointing.exchange_utils module core.dist_checkpointing.mapping module core.dist_checkpointing.optimizer module core.dist_checkpointing.serialization module core.dist_checkpointing.state_dict_utils module core.dist_checkpointing.strategies module core.dist_checkpointing.strategies.async_utils module core.dist_checkpointing.strategies.base module core.dist_checkpointing.strategies.cached_metadata_filesystem_reader module core.dist_checkpointing.strategies.checkpointable module core.dist_checkpointing.strategies.common module core.dist_checkpointing.strategies.filesystem_async module core.dist_checkpointing.strategies.fully_parallel module core.dist_checkpointing.strategies.state_dict_saver module core.dist_checkpointing.strategies.tensorstore module core.dist_checkpointing.strategies.torch module core.dist_checkpointing.strategies.two_stage module core.dist_checkpointing.strategies.zarr module core.dist_checkpointing.tensor_aware_state_dict module core.dist_checkpointing.utils module core.dist_checkpointing.validation module core.distributed module core.distributed.data_parallel_base module core.distributed.distributed_data_parallel module core.distributed.distributed_data_parallel_config module core.distributed.finalize_model_grads module core.distributed.fsdp module core.distributed.fsdp.mcore_fsdp_adapter module core.distributed.fsdp.src module core.distributed.fsdp.src.megatron_fsdp module core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config module core.distributed.fsdp.src.megatron_fsdp.fully_shard module core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp module core.distributed.fsdp.src.megatron_fsdp.package_info module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor module core.distributed.fsdp.src.megatron_fsdp.utils module core.distributed.param_and_grad_buffer module core.distributed.reduce_scatter_with_fp32_accumulation module core.distributed.torch_fully_sharded_data_parallel module core.distributed.torch_fully_sharded_data_parallel_config module core.energy_monitor module core.enums module core.export module core.export.data_type module core.export.export_config module core.export.model_type module core.export.trtllm module core.export.trtllm.engine_builder module core.export.trtllm.engine_builder.trtllm_engine_builder module core.export.trtllm.model_to_trllm_mapping module core.export.trtllm.model_to_trllm_mapping.default_conversion_dict module core.export.trtllm.trt_model_config module core.export.trtllm.trt_model_type module core.export.trtllm.trtllm_helper module core.export.trtllm.trtllm_layers module core.export.trtllm.trtllm_weights_converter module core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter module core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter module core.export.trtllm.trtllm_weights_converter.utils module core.extensions module core.extensions.kitchen module core.extensions.transformer_engine module core.extensions.transformer_engine_spec_provider module core.fp4_utils module core.fp8_utils module core.full_cuda_graph module core.fusions module core.fusions.fused_bias_dropout module core.fusions.fused_bias_geglu module core.fusions.fused_bias_gelu module core.fusions.fused_bias_swiglu module core.fusions.fused_cross_entropy module core.fusions.fused_indices_converter module core.fusions.fused_layer_norm module core.fusions.fused_mla_yarn_rope_apply module core.fusions.fused_pad_routing_map module core.fusions.fused_softmax module core.fusions.fused_weighted_squared_relu module core.hyper_comm_grid module core.inference module core.inference.async_stream module core.inference.batch_dimensions_utils module core.inference.common_inference_params module core.inference.communication_utils module core.inference.contexts module core.inference.contexts.base_context module core.inference.contexts.dynamic_block_allocator module core.inference.contexts.dynamic_context module core.inference.contexts.fused_kv_append_kernel module core.inference.contexts.static_context module core.inference.data_parallel_inference_coordinator module core.inference.engines module core.inference.engines.abstract_engine module core.inference.engines.async_zmq_communicator module core.inference.engines.dynamic_engine module core.inference.engines.mcore_engine module core.inference.engines.static_engine module core.inference.headers module core.inference.inference_client module core.inference.inference_request module core.inference.model_inference_wrappers module core.inference.model_inference_wrappers.abstract_model_inference_wrapper module core.inference.model_inference_wrappers.gpt module core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper module core.inference.model_inference_wrappers.inference_wrapper_config module core.inference.model_inference_wrappers.t5 module core.inference.model_inference_wrappers.t5.t5_inference_wrapper module core.inference.sampling_params module core.inference.scheduler module core.inference.text_generation_controllers module core.inference.text_generation_controllers.encoder_decoder_text_generation_controller module core.inference.text_generation_controllers.simple_text_generation_controller module core.inference.text_generation_controllers.text_generation_controller module core.inference.text_generation_controllers.vlm_text_generation_controller module core.inference.text_generation_server module core.inference.text_generation_server.run_mcore_engine module core.inference.text_generation_server.text_generation_server module core.inference.text_generation_server.tokenization module core.inference.unified_memory module core.inference.utils module core.inference_params module core.jit module core.model_parallel_config module core.models module core.models.backends module core.models.bert module core.models.bert.bert_layer_specs module core.models.bert.bert_lm_head module core.models.bert.bert_model module core.models.bert.pooler module core.models.common module core.models.common.embeddings module core.models.common.embeddings.language_model_embedding module core.models.common.embeddings.relative_pos_embedding module core.models.common.embeddings.rope_utils module core.models.common.embeddings.rotary_pos_embedding module core.models.common.embeddings.yarn_rotary_pos_embedding module core.models.common.language_module module core.models.common.language_module.language_module module core.models.common.model_chunk_schedule_plan module core.models.common.vision_module module core.models.common.vision_module.vision_module module core.models.gpt module core.models.gpt.fine_grained_callables module core.models.gpt.gpt_layer_specs module core.models.gpt.gpt_model module core.models.gpt.moe_module_specs module core.models.huggingface module core.models.huggingface.clip_model module core.models.huggingface.module module core.models.huggingface.qwen_model module core.models.mamba module core.models.mamba.mamba_layer_specs module core.models.mamba.mamba_model module core.models.mimo module core.models.mimo.config module core.models.mimo.config.base_configs module core.models.mimo.model module core.models.mimo.model.base module core.models.multimodal module core.models.multimodal.context_parallel module core.models.multimodal.llava_model module core.models.multimodal.llava_spec module core.models.retro module core.models.retro.base_attention module core.models.retro.config module core.models.retro.decoder_attention module core.models.retro.decoder_spec module core.models.retro.encoder_attention module core.models.retro.encoder_spec module core.models.retro.model module core.models.retro.utils module core.models.T5 module core.models.T5.t5_model module core.models.T5.t5_spec module core.models.vision module core.models.vision.clip_vit_model module core.models.vision.multimodal_projector module core.models.vision.radio module core.models.vision.vit_layer_specs module core.msc_utils module core.nccl_allocator module core.num_microbatches_calculator module core.optimizer module core.optimizer.clip_grads module core.optimizer.cpu_offloading module core.optimizer.cpu_offloading.hybrid_optimizer module core.optimizer.distrib_optimizer module core.optimizer.grad_scaler module core.optimizer.optimizer module core.optimizer.optimizer_config module core.optimizer.qk_clip module core.optimizer_param_scheduler module core.package_info module core.packed_seq_params module core.parallel_state module core.pipeline_parallel module core.pipeline_parallel.bridge_communicator module core.pipeline_parallel.combined_1f1b module core.pipeline_parallel.p2p_communication module core.pipeline_parallel.schedules module core.pipeline_parallel.utils module core.post_training module core.post_training.modelopt module core.post_training.modelopt.gpt module core.post_training.modelopt.gpt.model_specs module core.post_training.modelopt.gpt.state_dict_hooks module core.post_training.modelopt.layers module core.post_training.modelopt.mamba module core.post_training.modelopt.mamba.model_specs module core.process_groups_config module core.quantization module core.quantization.quant_config module core.quantization.utils module core.rerun_state_machine module core.safe_globals module core.ssm module core.ssm.mamba_block module core.ssm.mamba_context_parallel module core.ssm.mamba_hybrid_layer_allocation module core.ssm.mamba_layer module core.ssm.mamba_mixer module core.ssm.mlp_layer module core.ssm.triton_cache_manager module core.tensor_parallel module core.tensor_parallel.cross_entropy module core.tensor_parallel.data module core.tensor_parallel.inference_layers module core.tensor_parallel.layers module core.tensor_parallel.mappings module core.tensor_parallel.random module core.tensor_parallel.utils module core.timers module core.tokenizers module core.tokenizers.base_tokenizer module core.tokenizers.megatron_tokenizer module core.tokenizers.text module core.tokenizers.text.libraries module core.tokenizers.text.libraries.abstract_tokenizer module core.tokenizers.text.libraries.bytelevel_tokenizer module core.tokenizers.text.libraries.chat_template module core.tokenizers.text.libraries.huggingface_tokenizer module core.tokenizers.text.libraries.megatron_hf_tokenizer module core.tokenizers.text.libraries.null_tokenizer module core.tokenizers.text.libraries.sentencepiece_tokenizer module core.tokenizers.text.libraries.tiktoken_tokenizer module core.tokenizers.text.models module core.tokenizers.text.models.bert_tokenizer module core.tokenizers.text.models.default_tokenizer module core.tokenizers.text.models.gpt_tokenizer module core.tokenizers.text.models.mamba_tokenizer module core.tokenizers.text.models.retro_tokenizer module core.tokenizers.text.models.t5_tokenizer module core.tokenizers.text.text_tokenizer module core.transformer module core.transformer.attention module core.transformer.cuda_graphs module core.transformer.custom_layers module core.transformer.custom_layers.batch_invariant_kernels module core.transformer.custom_layers.transformer_engine module core.transformer.dot_product_attention module core.transformer.enums module core.transformer.fsdp_dtensor_checkpoint module core.transformer.identity_op module core.transformer.mlp module core.transformer.module module core.transformer.moe module core.transformer.moe.experts module core.transformer.moe.fused_a2a module core.transformer.moe.grouped_gemm_util module core.transformer.moe.moe_layer module core.transformer.moe.moe_utils module core.transformer.moe.router module core.transformer.moe.shared_experts module core.transformer.moe.token_dispatcher module core.transformer.moe.upcycling_utils module core.transformer.multi_latent_attention module core.transformer.multi_token_prediction module core.transformer.pipeline_parallel_layer_layout module core.transformer.spec_utils module core.transformer.torch_layer_norm module core.transformer.torch_norm module core.transformer.transformer_block module core.transformer.transformer_config module core.transformer.transformer_layer module core.transformer.utils module core.utils module core_attention (core.transformer.attention.CrossAttentionSubmodules attribute) (core.transformer.attention.SelfAttentionSubmodules attribute) (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) core_attention() (core.extensions.kitchen.KitchenSpecProvider method) (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.BackendSpecProvider method) (core.models.backends.InferenceSpecProvider method) (core.models.backends.LocalSpecProvider method) correct_amax_history_if_needed() (in module core.fp8_utils) CORRECT_RESULT (core.rerun_state_machine.RerunDiagnostic attribute) count_zeros() (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.MegatronOptimizer method) count_zeros_fp32() (in module core.optimizer.clip_grads) Counter (class in core.inference.utils) cp (core.process_groups_config.ProcessGroupCollection attribute) cp_comm_type (core.transformer.transformer_config.TransformerConfig attribute) cp_stream (core.extensions.transformer_engine.TEDotProductAttention attribute) cpu_offloading (core.model_parallel_config.ModelParallelConfig attribute) cpu_offloading_activations (core.model_parallel_config.ModelParallelConfig attribute) cpu_offloading_double_buffering (core.model_parallel_config.ModelParallelConfig attribute) cpu_offloading_num_layers (core.model_parallel_config.ModelParallelConfig attribute) cpu_offloading_weights (core.model_parallel_config.ModelParallelConfig attribute) create_attention_mask (core.datasets.gpt_dataset.GPTDatasetConfig attribute) create_bwd_graph() (core.transformer.cuda_graphs._CudaGraphRunner method) create_cuda_graphs() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) create_cudagraphs() (core.transformer.cuda_graphs._CudagraphGlobalRecord class method) (core.transformer.cuda_graphs.TECudaGraphHelper method) (in module core.transformer.cuda_graphs) create_decentralized_global_plan() (core.dist_checkpointing.strategies.torch.MCoreSavePlanner method) create_fwd_graph() (core.transformer.cuda_graphs._CudaGraphRunner method) create_group() (in module core.parallel_state) create_hierarchical_groups() (in module core.parallel_state) create_local_plan() (core.dist_checkpointing.strategies.torch.MCoreLoadPlanner method) (core.dist_checkpointing.strategies.torch.MCoreSavePlanner method) create_nccl_mem_pool() (in module core.nccl_allocator) create_pg() (core.hyper_comm_grid.HyperCommGrid method) create_unified_mempool() (in module core.inference.unified_memory) create_updated_function_signature() (in module core.distributed.fsdp.src.megatron_fsdp.utils) cross_attention (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) cross_attn (core.transformer.enums.AttnType attribute) cross_attn_bda (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) cross_entropy_fusion_impl (core.model_parallel_config.ModelParallelConfig attribute) cross_entropy_loss_fusion (core.model_parallel_config.ModelParallelConfig attribute) CrossAttention (class in core.transformer.attention) CrossAttentionSubmodules (class in core.transformer.attention) cu_kv_lengths() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) cu_query_lengths() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) cu_seqlens_kv (core.packed_seq_params.PackedSeqParams attribute) cu_seqlens_kv_padded (core.packed_seq_params.PackedSeqParams attribute) cu_seqlens_q (core.packed_seq_params.PackedSeqParams attribute) cu_seqlens_q_padded (core.packed_seq_params.PackedSeqParams attribute) cuda_dtoh_stream (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher attribute) cuda_graph (core.full_cuda_graph.FullCudaGraphWrapper attribute) cuda_graph_attr_cache (in module core.transformer.utils) cuda_graph_impl (core.transformer.transformer_config.TransformerConfig attribute) cuda_graph_retain_backward_graph (core.transformer.transformer_config.TransformerConfig attribute) CUDA_GRAPH_ROUNDER (core.inference.batch_dimensions_utils.CUDAGraphBatchDimensionBuilder attribute) cuda_graph_scope (core.transformer.transformer_config.TransformerConfig attribute) cuda_graph_set_manual_hooks() (core.transformer.cuda_graphs.TECudaGraphHelper method) cuda_graph_use_single_mempool (core.transformer.transformer_config.TransformerConfig attribute) cuda_graph_warmup_steps (core.transformer.transformer_config.TransformerConfig attribute) cudagraph_created (core.transformer.cuda_graphs._CudagraphGlobalRecord attribute) cudagraph_inference_record (core.transformer.cuda_graphs._CudagraphGlobalRecord attribute) cudagraph_record (core.transformer.cuda_graphs._CudagraphGlobalRecord attribute) CUDAGraphBatchDimensionBuilder (class in core.inference.batch_dimensions_utils) CudaGraphManager (class in core.transformer.cuda_graphs) CudaRNGStatesTracker (class in core.tensor_parallel.random) curr_iter() (core.full_cuda_graph.FullCudaGraphWrapper method) curr_iteration (core.full_cuda_graph.FullCudaGraphWrapper attribute) current_input_and_position_ids() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) custom (core.enums.Fp4Recipe attribute) (core.enums.Fp8Recipe attribute) custom_backward() (in module core.pipeline_parallel.schedules) custom_recipe_factory (core.extensions.transformer_engine.TEQuantizationRecipe attribute) D data (core.dist_checkpointing.mapping.ShardedBase attribute) (core.dist_checkpointing.mapping.ShardedObject attribute) (core.dist_checkpointing.mapping.ShardedTensor attribute) (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.Bucket attribute) data_parallel_sharding_strategy (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.BucketingPolicy attribute) data_read() (core.full_cuda_graph.FullCudaGraphWrapper method) data_size (core.dist_checkpointing.strategies.two_stage._ShardedTensorMetadata attribute) DataIteratorArgType (in module core.rerun_state_machine) DataParallelBuffer (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) DataParallelInferenceCoordinator (class in core.inference.data_parallel_inference_coordinator) dataset_exists() (in module core.datasets.object_storage_utils) DataType (in module core.export.data_type) DBDataset (class in core.datasets.retro.db.dataset) deallocate_all_tensors() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) deallocate_output_tensor() (in module core.pipeline_parallel.schedules) deallocate_pipeline_outputs (core.model_parallel_config.ModelParallelConfig attribute) debug_msg() (in module core.dist_checkpointing.utils) debug_time() (in module core.dist_checkpointing.utils) decode() (core.ssm.mamba_mixer.MambaMixer method) decode_req_count (core.inference.batch_dimensions_utils.InferenceBatchDimensions attribute) decoder (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.transformer.enums.LayerType attribute) decoder_model_with_local_default_spec() (in module core.models.multimodal.llava_spec) decoder_model_with_local_spec() (in module core.models.T5.t5_spec) decoder_model_with_transformer_engine_default_spec() (in module core.models.multimodal.llava_spec) (in module core.models.T5.t5_spec) decoder_seq_length (core.inference.inference_request.VLMInferenceRequest attribute) decoupled_weight_decay (core.optimizer.optimizer_config.OptimizerConfig attribute) deduplicate_chunks() (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) default() (core.config_logger.JSONEncoderWithMcoreTypes method) default_backward_func() (core.pipeline_parallel.utils.ScheduleNode method) default_cache_dir() (in module core.ssm.triton_cache_manager) DEFAULT_CONVERSION_DICT (in module core.export.trtllm.model_to_trllm_mapping.default_conversion_dict) default_embedding_ranks() (in module core.parallel_state) DEFAULT_IMAGE_TOKEN_INDEX (in module core.models.multimodal.llava_model) DEFAULT_MAX_TOKENS (core.inference.contexts.dynamic_context.DynamicInferenceContext attribute) default_position_embedding_ranks() (in module core.parallel_state) default_strategies (in module core.dist_checkpointing.strategies.base) DEFAULT_TIKTOKEN_MAX_VOCAB (in module core.tokenizers.text.libraries.tiktoken_tokenizer) DefaultTokenizerText (class in core.tokenizers.text.models.default_tokenizer) defer_embedding_wgrad_compute (core.model_parallel_config.ModelParallelConfig attribute) delay_wgrad_compute (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.model_parallel_config.ModelParallelConfig attribute) delayed (core.enums.Fp8Recipe attribute) delete_cuda_graphs() (in module core.transformer.cuda_graphs) deprecate_inference_params() (in module core.utils) deprecated() (in module core.utils) dequantize_fp4_tensor() (in module core.fp4_utils) dequantize_fp8_tensor() (in module core.fp8_utils) deserialize() (core.inference.contexts.dynamic_context.ContextErrorFactory class method) (core.inference.inference_request.DynamicInferenceEvent class method) (core.inference.inference_request.DynamicInferenceRequest class method) (core.inference.inference_request.DynamicInferenceRequestRecord class method) (core.inference.inference_request.InferenceRequest class method) (core.inference.sampling_params.SamplingParams class method) deserialize_tensor() (in module core.inference.inference_request) destroy_global_memory_buffer() (in module core.parallel_state) destroy_global_symmetric_memory_buffer() (in module core.parallel_state) destroy_model_parallel() (in module core.parallel_state) destroy_num_microbatches_calculator() (in module core.num_microbatches_calculator) destroy_rerun_state_machine() (in module core.rerun_state_machine) detach() (core.models.gpt.fine_grained_callables.TransformerLayerNode method) determine_global_metadata() (in module core.dist_checkpointing.validation) determine_main_replica_uniform_distribution() (in module core.dist_checkpointing.exchange_utils) deterministic_mode (core.model_parallel_config.ModelParallelConfig attribute) detokenize() (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer method) (core.tokenizers.base_tokenizer.MegatronTokenizerBase method) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText method) detokenize_generations() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) dict_list_map_inplace() (in module core.dist_checkpointing.dict_utils) dict_list_map_outplace() (in module core.dist_checkpointing.dict_utils) dict_map() (in module core.dist_checkpointing.dict_utils) dict_map_with_key() (in module core.dist_checkpointing.dict_utils) diff() (in module core.dist_checkpointing.dict_utils) disable() (core.msc_utils._FeatureFlag method) disable_batch_invariant_mode() (in module core.transformer.custom_layers.batch_invariant_kernels) disable_bf16_reduced_precision_matmul (core.transformer.transformer_config.TransformerConfig attribute) disable_forward_pre_hook() (core.distributed.distributed_data_parallel.DistributedDataParallel method) disable_parameter_transpose_cache (core.transformer.transformer_config.TransformerConfig attribute) disable_symmetric_registration (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) DISABLED (core.rerun_state_machine.RerunMode attribute) discard_output_and_register_recompute() (core.tensor_parallel.random.CheckpointWithoutOutput method) disk (core.datasets.retro.config.bert_embedders.RetroBertEmbedders attribute) dispatch() (core.transformer.moe.moe_layer.MoELayer method) (core.transformer.moe.token_dispatcher._DeepepManager method) (core.transformer.moe.token_dispatcher._DispatchManager method) (core.transformer.moe.token_dispatcher._HybridEPManager method) dispatch_postprocess() (core.transformer.moe.token_dispatcher.MoEAllGatherTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEFlexTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoETokenDispatcher method) dispatch_preprocess() (core.transformer.moe.token_dispatcher.MoEAllGatherTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEFlexTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoETokenDispatcher method) dist_group_rank (core.dist_checkpointing.strategies.two_stage._ShardedTensorMetadata attribute) dist_group_ranks (core.dist_checkpointing.strategies.two_stage._ShardedTensorMetadata attribute) distribute_main_replicas_with_precomputed_distribution() (in module core.dist_checkpointing.strategies.fully_parallel) distribute_saved_activations (core.transformer.transformer_config.TransformerConfig attribute) distribute_shards_to_ranks() (in module core.dist_checkpointing.exchange_utils) DistributedDataParallel (class in core.distributed.distributed_data_parallel) DistributedDataParallelConfig (class in core.distributed.distributed_data_parallel_config) (class in core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config) DistributedDataset (in module core.datasets.blended_megatron_dataset_builder) DistributedOptimizer (class in core.optimizer.distrib_optimizer) DistributedTRTLLMModelWeightsConverter (class in core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter) divide() (in module core.utils) document_indices (core.datasets.indexed_dataset.IndexedDataset property) DotProductAttention (class in core.transformer.dot_product_attention) download_file() (core.datasets.object_storage_utils.S3Client method) dp (core.process_groups_config.ProcessGroupCollection attribute) dp_cp (core.process_groups_config.ProcessGroupCollection attribute) drain_embedding_wgrad_compute() (in module core.utils) drop_last_partial_validation_sequence (core.datasets.gpt_dataset.GPTDatasetConfig attribute) DType (class in core.datasets.indexed_dataset) dtype (core.dist_checkpointing.mapping.ShardedTensor attribute) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) dtype_from_code() (core.datasets.indexed_dataset.DType class method) dummy_forward() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) dummy_step() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) DummyTimer (class in core.timers) dynamic_inference() (core.ssm.mamba_mixer.MambaMixer method) DynamicGradScaler (class in core.optimizer.grad_scaler) DynamicInferenceContext (class in core.inference.contexts.dynamic_context) DynamicInferenceEngine (class in core.inference.engines.dynamic_engine) DynamicInferenceEvent (class in core.inference.inference_request) DynamicInferenceEventType (class in core.inference.inference_request) DynamicInferenceRequest (class in core.inference.inference_request) DynamicInferenceRequestRecord (class in core.inference.inference_request) E eh_proj (core.transformer.multi_token_prediction.MultiTokenPredictionLayerSubmodules attribute) elapsed() (core.timers.DummyTimer method) (core.timers.Timer method) (core.timers.TimerBase method) (core.utils.StragglerDetector method) embd (core.process_groups_config.ProcessGroupCollection attribute) embed_block() (in module core.datasets.retro.query.query) embed_text() (core.datasets.retro.config.bert_embedders.Embedder method) embed_text_dataset() (core.datasets.retro.config.bert_embedders.Embedder method) embed_text_dataset_block() (core.datasets.retro.index.index.Index method) embed_training_chunks() (in module core.datasets.retro.index.build) Embedder (class in core.datasets.retro.config.bert_embedders) embedding (core.transformer.enums.LayerType attribute) embedding() (core.models.huggingface.qwen_model.QwenHuggingFaceModel method) embedding_init_method (core.transformer.transformer_config.TransformerConfig attribute) embedding_init_method_std (core.transformer.transformer_config.TransformerConfig attribute) EMPTY (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.BucketStatus attribute) empty_from_unique_key() (core.dist_checkpointing.mapping.ShardedObject class method) enable() (core.msc_utils._FeatureFlag method) enable_autocast (core.model_parallel_config.ModelParallelConfig attribute) enable_batch_invariant_mode() (in module core.transformer.custom_layers.batch_invariant_kernels) enable_cuda_graph (core.transformer.transformer_config.TransformerConfig attribute) enable_decode_mode() (core.inference.contexts.static_context.StaticInferenceContext method) ENABLE_EXPERIMENTAL (in module core.config) enable_forward_pre_hook() (core.distributed.distributed_data_parallel.DistributedDataParallel method) enable_prefill_mode() (core.inference.contexts.static_context.StaticInferenceContext method) enabled (core.utils.StragglerDetector property) encode() (core.datasets.retro.index.indexes.faiss_par_add.FaissParallelAddIndex method) encode_block() (core.datasets.retro.index.indexes.faiss_par_add.FaissParallelAddIndex method) encoder (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.transformer.enums.LayerType attribute) encoder_and_decoder (core.enums.ModelType property) (core.transformer.enums.ModelType property) encoder_model_with_local_spec() (in module core.models.T5.t5_spec) encoder_model_with_transformer_engine_default_spec() (in module core.models.T5.t5_spec) encoder_or_decoder (core.enums.ModelType attribute) (core.transformer.enums.ModelType attribute) encoder_prompt (core.inference.inference_request.InferenceRequest attribute) EncoderDecoderTextGenerationController (class in core.inference.text_generation_controllers.encoder_decoder_text_generation_controller) end_document() (core.datasets.indexed_dataset.IndexedDatasetBuilder method) EnergyMonitor (class in core.energy_monitor) ENGINE_REPLY (core.inference.headers.Headers attribute) EngineSuspendedError enorm (core.transformer.multi_token_prediction.MultiTokenPredictionLayerSubmodules attribute) ensure_divisibility() (in module core.utils) ensure_metadata_has_dp_cp_group() (in module core.transformer.utils) entrypoint() (core.inference.data_parallel_inference_coordinator.DataParallelInferenceCoordinator class method) eod (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract property) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) eod_mask_loss (core.datasets.gpt_dataset.GPTDatasetConfig attribute) eos (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) eos_id (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract property) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) ep (core.process_groups_config.ProcessGroupCollection attribute) erf_gelu() (in module core.transformer.utils) ERROR_NONTRANSIENT (core.inference.inference_request.DynamicInferenceEventType attribute) ERROR_TRANSIENT (core.inference.inference_request.DynamicInferenceEventType attribute) evaluation_recipe (core.extensions.transformer_engine.TEQuantizationParams attribute) event (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan property) events (core.inference.inference_request.DynamicInferenceRequest attribute) exchange_by_distribution() (in module core.dist_checkpointing.exchange_utils) exchange_loaded_objects_gather_object() (in module core.dist_checkpointing.exchange_utils) exchange_loaded_tensors_broadcast() (in module core.dist_checkpointing.exchange_utils) exchange_loaded_tensors_gather_object() (in module core.dist_checkpointing.exchange_utils) exchange_loaded_tensors_gather_rounds() (in module core.dist_checkpointing.exchange_utils) execute_sync() (core.dist_checkpointing.strategies.async_utils.AsyncRequest method) exists() (core.datasets.indexed_dataset.IndexedDataset static method) EXIT_CODE_FAILED_ON_RESULT_VALIDATION (in module core.rerun_state_machine) EXIT_CODE_RESUME_TO_DISAMBIGUATE (in module core.rerun_state_machine) exp_avg_dtype (core.optimizer.optimizer_config.OptimizerConfig attribute) exp_avg_sq_dtype (core.optimizer.optimizer_config.OptimizerConfig attribute) experimental_api() (in module core.distributed.fsdp.src.megatron_fsdp.fully_shard) (in module core.utils) experimental_cls() (in module core.utils) experimental_fn() (in module core.utils) ExperimentalNotEnabledError expert_dist_ckpt_decorator() (in module core.transformer.moe.experts) expert_model_parallel_size (core.model_parallel_config.ModelParallelConfig attribute) expert_param_local_key() (in module core.transformer.fsdp_dtensor_checkpoint) expert_tensor_parallel_size (core.model_parallel_config.ModelParallelConfig attribute) experts (core.transformer.moe.moe_layer.MoESubmodules attribute) ExpertsType (in module core.transformer.moe.upcycling_utils) ExportConfig (class in core.export.export_config) expt_dp (core.process_groups_config.ProcessGroupCollection attribute) expt_fsdp_group (core.distributed.fsdp.src.megatron_fsdp.utils.FSDPDistributedIndex attribute) expt_tp (core.process_groups_config.ProcessGroupCollection attribute) ExtendedRMSNorm (class in core.ssm.mamba_mixer) external_cuda_graph (core.transformer.transformer_config.TransformerConfig attribute) extract_data_config() (in module core.datasets.retro.utils) extract_matching_values() (in module core.dist_checkpointing.dict_utils) extract_nonpersistent() (in module core.dist_checkpointing.utils) extract_sharded_base() (in module core.dist_checkpointing.utils) extract_sharded_tensors() (in module core.dist_checkpointing.utils) extract_sharded_tensors_and_factories() (in module core.dist_checkpointing.utils) extract_sharded_tensors_or_nonpersistent() (in module core.dist_checkpointing.utils) F FactoryBuildFn (in module core.dist_checkpointing.mapping) FactoryMergeFn (in module core.dist_checkpointing.mapping) FAIL (core.inference.inference_request.DynamicInferenceEventType attribute) FAILED (core.inference.inference_request.Status attribute) failed() (core.inference.inference_request.DynamicInferenceRequest method) FAILURE (core.inference.unified_memory.CompilationState attribute) FaissBaseIndex (class in core.datasets.retro.index.indexes.faiss_base) FaissParallelAddIndex (class in core.datasets.retro.index.indexes.faiss_par_add) fallback_logger (in module core.dist_checkpointing.utils) fast_gelu() (in module core.activations) fetch_bucket() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) ffn_fc_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) ffn_hidden_size (core.transformer.transformer_config.TransformerConfig attribute) ffn_linear_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) ffn_projection_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) FileSystemWriterAsync (class in core.dist_checkpointing.strategies.filesystem_async) fill_in_deferred_sharded_objects() (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper static method) fill_in_deferred_sharded_tensors() (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper static method) filter_out_empty_flatten_tensor() (in module core.dist_checkpointing.state_dict_utils) filter_unflattened_state_dict() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) final_layernorm_bias (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) final_layernorm_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) finalize() (core.datasets.indexed_dataset.IndexedDatasetBuilder method) finalize_fns (core.dist_checkpointing.strategies.async_utils.AsyncRequest attribute) finalize_model_grads() (in module core.distributed.finalize_model_grads) finalize_model_grads_func (core.model_parallel_config.ModelParallelConfig attribute) FINISH (core.inference.inference_request.DynamicInferenceEventType attribute) finish() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync method) (core.inference.async_stream.AsyncStream method) finish_embedding_wgrad_compute() (in module core.pipeline_parallel.schedules) finish_grad_sync() (core.distributed.data_parallel_base._BaseDataParallel method) (core.distributed.distributed_data_parallel.DistributedDataParallel method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) (core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup method) finish_init() (core.extensions.kitchen.KitchenDotProductAttention method) (core.extensions.kitchen.KitchenFlashAttention method) (core.extensions.kitchen.KitchenGroupedLinear method) (core.extensions.kitchen.KitchenLayerNormColumnParallelLinear method) (core.extensions.kitchen.KitchenLinear method) (core.extensions.transformer_engine.TELayerNormColumnParallelLinear method) (core.extensions.transformer_engine.TELinear method) finish_param_sync() (core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup method) finished (core.inference.async_stream.AsyncStream property) finished_chunk_token_count (core.inference.inference_request.DynamicInferenceRequest attribute) first_last_layers_bf16 (core.transformer.transformer_config.TransformerConfig attribute) FIRST_RERUN_NOT_REPRODUCIBLE (core.rerun_state_machine.RerunValidationStatus attribute) FIRST_RERUN_REPRODUCIBLE (core.rerun_state_machine.RerunValidationStatus attribute) FixedPoolAllocator (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) flash (core.transformer.enums.AttnBackend attribute) flash_decode (core.transformer.transformer_config.TransformerConfig attribute) flash_decode() (core.transformer.attention.Attention method) flash_decode_and_prefill() (core.transformer.attention.Attention method) flatten_range() (in module core.dist_checkpointing.strategies.zarr) flatten_state_dict() (in module core.dist_checkpointing.strategies.torch) (in module core.transformer.fsdp_dtensor_checkpoint) flattened_range (core.dist_checkpointing.mapping.ShardedTensor attribute) (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) float16_to_fp32() (in module core.transformer.module) Float16Module (class in core.transformer.module) Float16OptimizerWithFloat16Params (class in core.optimizer.optimizer) float32 (core.datasets.indexed_dataset.DType attribute) float64 (core.datasets.indexed_dataset.DType attribute) force_all_tensors_to_non_fp8() (in module core.dist_checkpointing.utils) fork() (core.tensor_parallel.random.CudaRNGStatesTracker method) format_mem_bytes() (in module core.inference.engines.dynamic_engine) FORWARD (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.TrainingState attribute) forward() (core.distributed.data_parallel_base._BaseDataParallel method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.RegisterFSDPBackwardFunction static method) (core.extensions.kitchen.KitchenDotProductAttention method) (core.extensions.kitchen.KitchenFlashAttention method) (core.extensions.kitchen.KitchenGroupedLinear method) (core.extensions.kitchen.KitchenLayerNormColumnParallelLinear method) (core.extensions.kitchen.KitchenLinear method) (core.extensions.transformer_engine.TEDotProductAttention method) (core.extensions.transformer_engine.TELayerNormColumnParallelLinear method) (core.extensions.transformer_engine.TELinear method) (core.fusions.fused_bias_geglu.BiasGeGLUFunction static method) (core.fusions.fused_bias_geglu.GeGLUFunction static method) (core.fusions.fused_bias_geglu.WeightedBiasQuickGeGLUFunction static method) (core.fusions.fused_bias_geglu.WeightedQuickGeGLUFunction static method) (core.fusions.fused_bias_gelu.GeLUFunction static method) (core.fusions.fused_bias_swiglu.BiasSwiGLUFunction static method) (core.fusions.fused_bias_swiglu.SwiGLUFunction static method) (core.fusions.fused_bias_swiglu.WeightedSwiGLUFunction static method) (core.fusions.fused_cross_entropy._VocabParallelCrossEntropy static method) (core.fusions.fused_indices_converter.IndicesToMultihot static method) (core.fusions.fused_layer_norm.FusedLayerNorm method) (core.fusions.fused_mla_yarn_rope_apply.ApplyMLARotaryEmbKV static method) (core.fusions.fused_mla_yarn_rope_apply.ApplyMLARotaryEmbQ static method) (core.fusions.fused_softmax.FusedScaleMaskSoftmax method) (core.fusions.fused_softmax.ScaledMaskedSoftmax static method) (core.fusions.fused_softmax.ScaledSoftmax static method) (core.fusions.fused_softmax.ScaledUpperTriangMaskedSoftmax static method) (core.fusions.fused_softmax.SoftmaxOne method) (core.fusions.fused_weighted_squared_relu.WeightedSquaredReLUFunction static method) (core.models.bert.bert_lm_head.BertLMHead method) (core.models.bert.bert_model.BertModel method) (core.models.bert.pooler.Pooler method) (core.models.common.embeddings.language_model_embedding.LanguageModelEmbedding method) (core.models.common.embeddings.relative_pos_embedding.RelativePositionEmbedding method) (core.models.common.embeddings.rotary_pos_embedding.MultimodalRotaryEmbedding method) (core.models.common.embeddings.rotary_pos_embedding.RotaryEmbedding method) (core.models.common.embeddings.yarn_rotary_pos_embedding.YarnRotaryEmbedding method) (core.models.gpt.gpt_model.GPTModel method) (core.models.huggingface.clip_model.SiglipHuggingFaceModel method) (core.models.huggingface.module.AutoHuggingFaceModel method) (core.models.huggingface.qwen_model.QwenHuggingFaceModel method) (core.models.mamba.mamba_model.MambaModel method) (core.models.mimo.model.base.MimoModel method) (core.models.multimodal.llava_model.LLaVAModel method) (core.models.retro.decoder_attention.RetroDecoderBiasDropoutAdd method) (core.models.retro.decoder_attention.RetroDecoderCrossAttention method) (core.models.retro.encoder_attention.RetroEncoderBiasDropoutAdd method) (core.models.retro.encoder_attention.RetroEncoderCrossAttention method) (core.models.retro.encoder_attention.RetroEncoderLayerNorm method) (core.models.retro.model.RetroModel method) (core.models.T5.t5_model.T5LMHead method) (core.models.T5.t5_model.T5Model method) (core.models.vision.clip_vit_model.CLIPViTModel method) (core.models.vision.multimodal_projector.MultimodalProjector method) (core.models.vision.radio.RADIOViTModel method) (core.pipeline_parallel.utils.NoopScheduleNode method) (core.pipeline_parallel.utils.ScheduleNode method) (core.post_training.modelopt.layers.Linear method) (core.ssm.mamba_block.MambaStack method) (core.ssm.mamba_layer.MambaLayer method) (core.ssm.mamba_mixer.MambaMixer method) (core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy static method) (core.tensor_parallel.inference_layers.InferenceLayerNormColumnParallelLinear method) (core.tensor_parallel.inference_layers.InferenceRowParallelLinear method) (core.tensor_parallel.layers.ColumnParallelLinear method) (core.tensor_parallel.layers.LinearWithFrozenWeight static method) (core.tensor_parallel.layers.LinearWithGradAccumulationAndAsyncCommunication static method) (core.tensor_parallel.layers.RowParallelLinear method) (core.tensor_parallel.layers.VocabParallelEmbedding method) (core.tensor_parallel.mappings._AllGatherFromTensorParallelRegion static method) (core.tensor_parallel.mappings._AllToAll static method) (core.tensor_parallel.mappings._CopyToModelParallelRegion static method) (core.tensor_parallel.mappings._GatherFromModelParallelRegion static method) (core.tensor_parallel.mappings._GatherFromSequenceParallelRegion static method) (core.tensor_parallel.mappings._ReduceFromModelParallelRegion static method) (core.tensor_parallel.mappings._ReduceScatterToSequenceParallelRegion static method) (core.tensor_parallel.mappings._ReduceScatterToTensorParallelRegion static method) (core.tensor_parallel.mappings._ScatterToModelParallelRegion static method) (core.tensor_parallel.mappings._ScatterToSequenceParallelRegion static method) (core.tensor_parallel.random.CheckpointFunction static method) (core.tensor_parallel.random.CheckpointWithoutOutputFunction static method) (core.transformer.attention.Attention method) (core.transformer.cuda_graphs._CudagraphRecordNode static method) (core.transformer.cuda_graphs._CudagraphReplayNode static method) (core.transformer.custom_layers.batch_invariant_kernels.BatchInvariantRMSNormFn static method) (core.transformer.custom_layers.batch_invariant_kernels.BatchInvariantTEGemmFn static method) (core.transformer.dot_product_attention.DotProductAttention method) (core.transformer.identity_op.IdentityFuncOp method) (core.transformer.identity_op.IdentityOp method) (core.transformer.mlp.MLP method) (core.transformer.module.Float16Module method) (core.transformer.moe.experts.GroupedMLP method) (core.transformer.moe.experts.SequentialMLP method) (core.transformer.moe.experts.TEGroupedMLP method) (core.transformer.moe.fused_a2a.FusedCombine static method) (core.transformer.moe.fused_a2a.FusedDispatch static method) (core.transformer.moe.fused_a2a.HybridEPCombine static method) (core.transformer.moe.fused_a2a.HybridEPDispatch static method) (core.transformer.moe.moe_layer.BaseMoELayer method) (core.transformer.moe.moe_layer.MoELayer method) (core.transformer.moe.moe_utils.MoEAuxLossAutoScaler static method) (core.transformer.moe.moe_utils.RandomSTE static method) (core.transformer.moe.moe_utils.RouterGatingLinearFunction static method) (core.transformer.moe.router.Router method) (core.transformer.moe.router.TopKRouter method) (core.transformer.moe.shared_experts.SharedExpertMLP method) (core.transformer.multi_latent_attention.MultiLatentAttention method) (core.transformer.multi_token_prediction.MTPLossAutoScaler static method) (core.transformer.multi_token_prediction.MultiTokenPredictionBlock method) (core.transformer.multi_token_prediction.MultiTokenPredictionLayer method) (core.transformer.torch_norm.L2Norm method) (core.transformer.transformer_block.TransformerBlock method) (core.transformer.transformer_layer.TransformerLayer method) (core.utils.MakeViewlessTensor static method) forward_backward_no_pipelining() (in module core.pipeline_parallel.schedules) forward_backward_pipelining_with_interleaving() (in module core.pipeline_parallel.schedules) forward_backward_pipelining_without_interleaving() (in module core.pipeline_parallel.schedules) forward_fused_softmax() (core.fusions.fused_softmax.FusedScaleMaskSoftmax method) forward_impl() (core.models.gpt.fine_grained_callables.PostProcessNode method) (core.models.gpt.fine_grained_callables.PreProcessNode method) (core.models.gpt.fine_grained_callables.TransformerLayerNode method) FORWARD_PASS_ORDER (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.PrefetchOrder attribute) forward_pass_with_pipeline_parallel_large_input_batch() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) forward_pass_with_pipeline_parallel_small_input_batch() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) forward_pass_without_pipeline_parallel() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) (core.inference.model_inference_wrappers.t5.t5_inference_wrapper.T5InferenceWrapper method) forward_step() (in module core.pipeline_parallel.schedules) forward_step_calc_loss() (in module core.pipeline_parallel.schedules) forward_torch_softmax() (core.fusions.fused_softmax.FusedScaleMaskSoftmax method) fp16 (core.model_parallel_config.ModelParallelConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) fp32_residual_connection (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) fp32_to_float16() (in module core.transformer.module) FP32Optimizer (class in core.optimizer.optimizer) fp4 (core.transformer.transformer_config.TransformerConfig attribute) fp4_param (core.transformer.transformer_config.TransformerConfig attribute) fp4_quantization_recipe (core.extensions.transformer_engine.TEQuantizationRecipe attribute) fp4_quantizer_factory (core.transformer.transformer_config.TransformerConfig attribute) fp4_recipe (core.transformer.transformer_config.TransformerConfig attribute) Fp4Recipe (class in core.enums) fp8 (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) FP8_2D_BLOCKWISE_REAL_QUANT_CFG (in module core.post_training.modelopt.layers) fp8_amax_compute_algo (core.transformer.transformer_config.TransformerConfig attribute) fp8_amax_history_len (core.transformer.transformer_config.TransformerConfig attribute) fp8_dot_product_attention (core.transformer.transformer_config.TransformerConfig attribute) fp8_format (core.extensions.transformer_engine.TEQuantizationRecipe attribute) fp8_interval (core.transformer.transformer_config.TransformerConfig attribute) fp8_margin (core.transformer.transformer_config.TransformerConfig attribute) fp8_multi_head_attention (core.transformer.transformer_config.TransformerConfig attribute) fp8_pad_hook() (in module core.models.vision.radio) fp8_param (core.transformer.transformer_config.TransformerConfig attribute) fp8_param_gather (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) FP8_PER_TENSOR_REAL_QUANT_CFG (in module core.post_training.modelopt.layers) fp8_quantization_recipe (core.extensions.transformer_engine.TEQuantizationRecipe attribute) fp8_quantizer_factory (core.transformer.transformer_config.TransformerConfig attribute) fp8_recipe (core.optimizer.optimizer_config.OptimizerConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) fp8_wgrad (core.transformer.transformer_config.TransformerConfig attribute) Fp8Recipe (class in core.enums) FP8WeightTransformerLayer (class in core.post_training.modelopt.layers) free() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.FixedPoolAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.RotaryBucketAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.StorageResizeBasedBucketAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.TemporaryBucketAllocator method) free_bucket_storage() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) freeze() (core.dist_checkpointing.strategies.async_utils.AsyncRequest method) (core.models.multimodal.llava_model.LLaVAModel method) FREEZE_GC (in module core.transformer.cuda_graphs) from_config() (core.inference.contexts.dynamic_context.DynamicInferenceContext class method) (core.inference.contexts.static_context.StaticInferenceContext class method) from_config_dict() (core.quantization.quant_config.RecipeConfig static method) from_dir() (core.datasets.retro.utils.BlockPathMap class method) from_pretrained() (core.tokenizers.megatron_tokenizer.MegatronTokenizer method) from_rank_offsets() (core.dist_checkpointing.mapping.ShardedTensor class method) from_request() (core.inference.inference_request.DynamicInferenceRequestRecord class method) from_sh_ten() (core.dist_checkpointing.strategies.checkpointable.CheckpointableShardedTensor class method) from_state_dict() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict class method) from_str() (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout static method) from_yaml_file() (core.quantization.quant_config.RecipeConfig static method) fsdp_double_buffer (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) fsdp_unit_id (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) fsdp_unit_modules (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.BucketingPolicy attribute) FSDPDistributedIndex (class in core.distributed.fsdp.src.megatron_fsdp.utils) full_validation (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) FullCudaGraphWrapper (class in core.full_cuda_graph) fully_shard() (in module core.distributed.fsdp.src.megatron_fsdp.fully_shard) fully_shard_model() (in module core.distributed.fsdp.src.megatron_fsdp.fully_shard) fully_shard_optimizer() (in module core.distributed.fsdp.src.megatron_fsdp.fully_shard) FullyParallelLoadStrategyWrapper (class in core.dist_checkpointing.strategies.fully_parallel) FullyParallelSaveStrategyWrapper (class in core.dist_checkpointing.strategies.fully_parallel) FullyShardedDataParallel (class in core.distributed.fsdp.mcore_fsdp_adapter) fuse_layernorm_and_linear() (core.extensions.kitchen.KitchenSpecProvider method) (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.BackendSpecProvider method) (core.models.backends.InferenceSpecProvider method) (core.models.backends.LocalSpecProvider method) fused (core.transformer.enums.AttnBackend attribute) fused_apply_mla_rope_for_kv() (in module core.fusions.fused_mla_yarn_rope_apply) fused_apply_mla_rope_for_q() (in module core.fusions.fused_mla_yarn_rope_apply) fused_indices_to_multihot() (in module core.fusions.fused_indices_converter) fused_pad_routing_map() (in module core.fusions.fused_pad_routing_map) fused_single_qkv_rope (core.transformer.transformer_config.TransformerConfig attribute) fused_vocab_parallel_cross_entropy() (in module core.fusions.fused_cross_entropy) FusedCombine (class in core.transformer.moe.fused_a2a) FusedDispatch (class in core.transformer.moe.fused_a2a) FusedLayerNorm (class in core.fusions.fused_layer_norm) FusedScaleMaskSoftmax (class in core.fusions.fused_softmax) future (core.inference.engines.dynamic_engine.RequestEntry attribute) fwd_mempools (core.transformer.cuda_graphs.CudaGraphManager attribute) FWD_READY (core.transformer.cuda_graphs._GraphStatus attribute) G GATED_ACTIVATION (in module core.export.trtllm.trtllm_weights_converter.utils) gated_linear_unit (core.transformer.transformer_config.TransformerConfig attribute) gather_and_compute_chunk_metadata() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) gather_from_sequence_parallel_region() (in module core.tensor_parallel.mappings) gather_from_tensor_model_parallel_region() (in module core.tensor_parallel.mappings) gather_split_1d_tensor() (in module core.tensor_parallel.utils) gather_uneven_dtensor_to_full_tensor() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) gating() (core.transformer.moe.router.Router method) geglu() (in module core.fusions.fused_bias_geglu) geglu_back() (in module core.fusions.fused_bias_geglu) GeGLUFunction (class in core.fusions.fused_bias_geglu) gelu_impl() (in module core.transformer.utils) GeLUFunction (class in core.fusions.fused_bias_gelu) generate() (core.inference.engines.abstract_engine.AbstractEngine static method) (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) (core.inference.engines.static_engine.StaticInferenceEngine method) generate_all_output_tokens_static_batch() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) generate_cuda_graph_batch_dimensions_list() (core.inference.batch_dimensions_utils.CUDAGraphBatchDimensionBuilder static method) generate_masked_orthogonal_rank_groups() (in module core.parallel_state) generate_output_tokens_dynamic_batch() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) generate_using_dynamic_engine() (core.inference.engines.static_engine.StaticInferenceEngine method) generate_using_legacy_static_engine() (core.inference.engines.static_engine.StaticInferenceEngine method) generated_length (core.inference.inference_request.InferenceRequest attribute) generated_log_probs (core.inference.inference_request.InferenceRequest attribute) generated_segments (core.inference.inference_request.InferenceRequest attribute) generated_sequence_lengths (core.inference.inference_request.InferenceRequest attribute) generated_text (core.inference.inference_request.InferenceRequest attribute) generated_tokens (core.inference.inference_request.DynamicInferenceRequest attribute) (core.inference.inference_request.InferenceRequest attribute) generated_top_n_logprobs (core.inference.inference_request.InferenceRequest attribute) generator (core.transformer.moe.moe_utils.RandomSTE attribute) generator() (core.inference.async_stream.AsyncStream method) get() (core.datasets.gpt_dataset.MockGPTLowLevelDataset method) (core.datasets.indexed_dataset.IndexedDataset method) (core.num_microbatches_calculator.NumMicroBatchesCalculator method) get_A_log() (core.ssm.mamba_context_parallel.MambaContextParallel method) get_active_avail() (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) get_active_request_count() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) get_active_sequence_lengths() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) get_active_used() (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) get_added_code_paths() (in module core.datasets.retro.index.utils) get_added_codes_dir() (in module core.datasets.retro.index.utils) get_added_index() (core.datasets.retro.index.index.Index method) get_added_index_path() (core.datasets.retro.index.index.Index method) get_align_size_for_quantization() (in module core.transformer.moe.moe_utils) get_all_ranks() (in module core.parallel_state) get_all_rng_states() (in module core.tensor_parallel.random) get_all_timers_string() (core.timers.Timers method) get_all_true_mask() (in module core.models.retro.utils) get_amax_reduction_group() (in module core.parallel_state) get_asyncio_loop() (in module core.utils) get_attention_mask() (in module core.inference.utils) get_attr_wrapped_model() (in module core.utils) get_aux_loss_coeff() (core.transformer.moe.router.TopKRouter method) get_batch_for_context_window() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) (core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper.GPTInferenceWrapper method) (core.inference.model_inference_wrappers.t5.t5_inference_wrapper.T5InferenceWrapper method) get_batch_invariant_attention_block_size() (in module core.transformer.custom_layers.batch_invariant_kernels) get_batch_on_this_cp_rank() (in module core.utils) get_batch_per_block() (core.fusions.fused_softmax.FusedScaleMaskSoftmax static method) get_bert_layer_with_transformer_engine_spec() (in module core.models.bert.bert_layer_specs) get_bias_dropout_add() (in module core.fusions.fused_bias_dropout) get_bin_path() (in module core.datasets.indexed_dataset) get_blend_from_list() (in module core.datasets.utils) get_block_nload() (in module core.datasets.retro.index.build) get_blocks() (in module core.datasets.retro.utils) get_blocks_by_rank() (in module core.datasets.retro.utils) get_boundary_pp_stage_ranks() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) get_buffer() (in module core.transformer.moe.fused_a2a) get_cached_cos_sin() (core.models.common.embeddings.yarn_rotary_pos_embedding.YarnRotaryEmbedding method) get_capacity() (in module core.transformer.moe.moe_utils) get_causal_conv1d_version() (in module core.utils) get_comm_stream() (in module core.pipeline_parallel.utils) get_comp_stream() (in module core.pipeline_parallel.utils) get_compute_units() (in module core.transformer.custom_layers.batch_invariant_kernels) get_config_keys() (core.extensions.transformer_engine.TEQuantizationRecipe class method) get_config_logger_path() (in module core.config_logger) get_config_path() (in module core.models.retro.utils) get_context_parallel_global_ranks() (in module core.parallel_state) get_context_parallel_group() (in module core.parallel_state) get_context_parallel_rank() (in module core.parallel_state) get_context_parallel_world_size() (in module core.parallel_state) get_conv1d_bias() (core.ssm.mamba_context_parallel.MambaContextParallel method) get_conv1d_weight() (core.ssm.mamba_context_parallel.MambaContextParallel method) get_cos_sin() (core.models.common.embeddings.rotary_pos_embedding.RotaryEmbedding method) get_cpu_offload_context (in module core.transformer.transformer_block) get_cuda_rng_tracker() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.tensor_parallel.random) get_cudagraph_runner() (core.transformer.cuda_graphs.CudaGraphManager method) get_current_global_batch_size() (core.num_microbatches_calculator.NumMicroBatchesCalculator method) (in module core.num_microbatches_calculator) get_current_running_global_batch_size() (core.num_microbatches_calculator.NumMicroBatchesCalculator method) (in module core.num_microbatches_calculator) get_D() (core.ssm.mamba_context_parallel.MambaContextParallel method) get_data_modulo_expert_parallel_group() (in module core.parallel_state) get_data_parallel_group() (in module core.parallel_state) get_data_parallel_group_gloo() (in module core.parallel_state) get_data_parallel_group_if_dtensor() (in module core.utils) get_data_parallel_rank() (in module core.parallel_state) get_data_parallel_rng_tracker_name() (in module core.tensor_parallel.random) get_data_parallel_src_rank() (in module core.parallel_state) get_data_parallel_world_size() (in module core.parallel_state) get_db_dir() (in module core.datasets.retro.db.utils) get_default_causal_mask() (in module core.transformer.utils) get_default_load_sharded_strategy() (in module core.dist_checkpointing.serialization) get_default_pg_collection() (in module core.transformer.moe.moe_utils) get_default_save_common_strategy() (in module core.dist_checkpointing.serialization) get_default_save_sharded_strategy() (in module core.dist_checkpointing.serialization) get_default_strategy() (in module core.dist_checkpointing.strategies.base) get_distributed_index() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) get_document_indices() (core.datasets.indexed_dataset.IndexedDataset method) get_dp_group() (core.distributed.fsdp.src.megatron_fsdp.utils.FSDPDistributedIndex method) get_dt_bias() (core.ssm.mamba_context_parallel.MambaContextParallel method) get_embedding_group() (in module core.parallel_state) get_empty_index() (core.datasets.retro.index.index.Index method) get_empty_index_path() (core.datasets.retro.index.index.Index method) (in module core.datasets.retro.index.build) get_ep_layer_offset() (in module core.transformer.fsdp_dtensor_checkpoint) get_expected_keys() (core.extensions.kitchen.CompoundParamsConfigSchema class method) (core.extensions.kitchen.QAttentionParamsConfigSchema class method) (core.extensions.kitchen.QFlashAttentionParamsConfigSchema class method) (core.extensions.kitchen.QLinearParamsConfigSchema class method) get_expert_data_parallel_group() (in module core.parallel_state) get_expert_data_parallel_group_gloo() (in module core.parallel_state) get_expert_data_parallel_rank() (in module core.parallel_state) get_expert_data_parallel_world_size() (in module core.parallel_state) get_expert_index_from_key() (in module core.transformer.fsdp_dtensor_checkpoint) get_expert_model_parallel_group() (in module core.parallel_state) get_expert_model_parallel_rank() (in module core.parallel_state) get_expert_model_parallel_src_rank() (in module core.parallel_state) get_expert_model_parallel_world_size() (in module core.parallel_state) get_expert_parallel_rng_tracker_name() (in module core.tensor_parallel.random) get_expert_tensor_and_model_parallel_group() (in module core.parallel_state) get_expert_tensor_and_model_parallel_rank() (in module core.parallel_state) get_expert_tensor_and_model_parallel_world_size() (in module core.parallel_state) get_expert_tensor_model_pipeline_parallel_group() (in module core.parallel_state) get_expert_tensor_parallel_group() (in module core.parallel_state) get_expert_tensor_parallel_rank() (in module core.parallel_state) get_expert_tensor_parallel_world_size() (in module core.parallel_state) get_extra_state() (core.tensor_parallel.layers.ColumnParallelLinear method) (core.tensor_parallel.layers.RowParallelLinear method) get_fa_version() (in module core.utils) get_forward_backward_func() (in module core.pipeline_parallel.schedules) get_fp4_align_size() (in module core.fp4_utils) get_fp4_context() (core.transformer.cuda_graphs._CudaGraphRunner method) get_fp8_align_size() (in module core.fp8_utils) get_fp8_context() (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan method) (core.transformer.cuda_graphs._CudaGraphRunner method) get_freqs_non_repeated() (core.models.common.embeddings.rotary_pos_embedding.RotaryEmbedding method) get_fsdp_buffer() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.GradReducePipeline method) get_fsdp_group() (core.distributed.fsdp.src.megatron_fsdp.utils.FSDPDistributedIndex method) get_full_tensor_if_necessary() (in module core.utils) get_func_args() (in module core.nccl_allocator) get_global_memory_buffer() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.parallel_state) get_global_symmetric_memory_buffer() (in module core.parallel_state) get_global_unique_param_name() (in module core.transformer.fsdp_dtensor_checkpoint) get_gpt_data_dir() (in module core.models.retro.utils) get_gpt_decoder_block_spec() (in module core.models.gpt.gpt_layer_specs) get_gpt_layer_local_spec() (in module core.models.gpt.gpt_layer_specs) get_gpt_layer_with_inference_spec() (in module core.models.gpt.gpt_layer_specs) get_gpt_layer_with_transformer_engine_spec() (in module core.models.gpt.gpt_layer_specs) get_gpt_modelopt_spec() (in module core.post_training.modelopt.gpt.model_specs) get_gpt_mtp_block_spec() (in module core.models.gpt.gpt_layer_specs) get_gpt_mtp_block_spec_for_backend() (in module core.models.gpt.gpt_layer_specs) get_grad() (core.pipeline_parallel.utils.ScheduleNode method) get_grad_norm() (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.MegatronOptimizer method) get_grad_norm_fp32() (in module core.optimizer.clip_grads) get_grad_stats_parallel_group() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.MegatronOptimizer method) get_hf_model_type() (in module core.models.huggingface.module) get_hidden_bytes() (in module core.transformer.moe.fused_a2a) get_hierarchical_context_parallel_groups() (in module core.parallel_state) get_idx_path() (in module core.datasets.indexed_dataset) get_index() (core.datasets.retro.index.factory.IndexFactory class method) (in module core.datasets.retro.query.query) get_index_cache_path() (in module core.datasets.object_storage_utils) get_index_class() (core.datasets.retro.index.factory.IndexFactory class method) get_index_dir() (in module core.datasets.retro.index.utils) get_index_of_chunked_prefill_request() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) get_indexed_dataset_infos() (in module core.datasets.retro.db.utils) get_indexed_dataset_infos_path() (in module core.datasets.retro.db.utils) get_individual_chunk_db() (in module core.datasets.retro.db.utils) get_individual_db_dir() (in module core.datasets.retro.db.utils) get_individual_db_paths() (in module core.datasets.retro.db.utils) get_individual_doc_offsets() (in module core.datasets.retro.db.utils) get_input_grads_with_dummy_flags() (core.transformer.cuda_graphs._CudaGraphRunner method) get_inter_distributed_optimizer_instance_group() (in module core.parallel_state) get_intra_distributed_optimizer_instance_group() (in module core.parallel_state) get_item() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) get_item_from_bucket() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) get_kvcache_utilization_stats() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) get_layer() (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan method) get_layer_id_list() (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout method) get_layer_maps_from_layer_type_list() (in module core.ssm.mamba_hybrid_layer_allocation) get_layer_name_without_prefix() (in module core.export.trtllm.trtllm_layers) get_layer_norm_weights() (core.transformer.transformer_layer.TransformerLayer method) get_layer_offset() (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout method) get_layer_static_inputs() (core.transformer.module.GraphableMegatronModule method) (core.transformer.transformer_layer.TransformerLayer method) get_leader_rank() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) get_linear_layer() (in module core.transformer.utils) get_local_model_weights_per_gpu() (core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.SingleDeviceTRTLLMModelWeightsConverter method) get_logical_hybrid_fsdp_rank() (core.distributed.fsdp.src.megatron_fsdp.utils.FSDPDistributedIndex method) get_loss_scale() (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) (core.optimizer.optimizer.MixedPrecisionOptimizer method) get_lr() (core.optimizer_param_scheduler.OptimizerParamScheduler method) get_main_grads_for_grad_norm() (core.optimizer.optimizer.MegatronOptimizer method) get_mamba_inference_state_config_from_model() (in module core.utils) get_mamba_stack_modelopt_spec() (in module core.post_training.modelopt.mamba.model_specs) get_mamba_version() (in module core.utils) get_mask() (core.parallel_state.RankGenerator method) get_max_sequence_lengths() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) get_mcore_tensor_parallel_partition_dim() (in module core.distributed.fsdp.src.megatron_fsdp.utils) get_megatron_optimizer() (in module core.optimizer) get_mem_alloc_context() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) get_mem_size_str() (in module core.inference.contexts.dynamic_context) get_merged_dataset() (in module core.datasets.retro.db.utils) get_merged_datasets() (in module core.datasets.retro.db.utils) get_merged_db_path_map() (in module core.datasets.retro.db.utils) get_merged_sampled_dataset() (in module core.datasets.retro.db.utils) get_merged_train_dataset() (in module core.datasets.retro.db.utils) get_merged_valid_dataset() (in module core.datasets.retro.db.utils) get_mesh_names() (in module core.distributed.fsdp.src.megatron_fsdp.utils) get_metadata_types() (core.inference.inference_request.DynamicInferenceRequest static method) get_micro_batch_size() (core.num_microbatches_calculator.NumMicroBatchesCalculator method) (in module core.num_microbatches_calculator) get_mismatch_errors() (core.transformer.cuda_graphs._CudaGraphRunner method) get_mlp_layer_norm_weights() (core.transformer.transformer_layer.TransformerLayer method) get_mlp_module_spec() (in module core.models.gpt.gpt_layer_specs) get_mlp_module_spec_for_backend() (in module core.models.gpt.gpt_layer_specs) get_mode() (core.rerun_state_machine.RerunStateMachine method) get_model_config() (in module core.utils) get_model_parallel_group() (in module core.parallel_state) get_model_parallel_src_rank() (in module core.parallel_state) get_model_type() (in module core.utils) get_model_xattn() (in module core.utils) get_module() (in module core.transformer.spec_utils) get_moe_layer_wise_logging_tracker() (in module core.transformer.moe.moe_utils) get_moe_module_spec() (in module core.models.gpt.moe_module_specs) get_moe_module_spec_for_backend() (in module core.models.gpt.moe_module_specs) get_mtp_layer_offset() (in module core.transformer.multi_token_prediction) get_mtp_layer_spec() (in module core.transformer.multi_token_prediction) get_mtp_layer_spec_for_backend() (in module core.transformer.multi_token_prediction) get_mtp_num_layers_to_build() (in module core.transformer.multi_token_prediction) get_nccl_options() (in module core.parallel_state) get_neighbor_dir() (in module core.datasets.retro.query.utils) get_new_request_id() (core.inference.engines.static_engine.StaticInferenceEngine method) (core.inference.scheduler.Scheduler method) get_next_data_parallel_rank() (core.inference.data_parallel_inference_coordinator.DataParallelInferenceCoordinator method) get_num_chunks_per_sample() (in module core.datasets.retro.utils) get_num_image_embeddings() (in module core.models.vision.clip_vit_model) get_num_layers_to_build() (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout method) (in module core.transformer.transformer_block) get_num_microbatches() (in module core.num_microbatches_calculator) get_num_stages_from_str() (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout static method) get_num_unfinalized_calls() (core.dist_checkpointing.strategies.async_utils.AsyncCallsQueue method) get_number_of_tokens_per_expert() (core.transformer.moe.token_dispatcher._DeepepManager method) (core.transformer.moe.token_dispatcher._HybridEPManager method) get_object() (core.datasets.object_storage_utils.S3Client method) get_object_storage_access() (in module core.datasets.object_storage_utils) get_optim_param_to_id_map() (in module core.dist_checkpointing.optimizer) get_outer_fsdp_group() (core.distributed.fsdp.src.megatron_fsdp.utils.FSDPDistributedIndex method) get_output() (core.pipeline_parallel.utils.ScheduleNode method) (core.transformer.moe.shared_experts.SharedExpertMLP method) get_packed_seq_params() (in module core.models.multimodal.context_parallel) get_padded_vocab_size() (core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.SingleDeviceTRTLLMModelWeightsConverter method) get_padding() (in module core.models.multimodal.context_parallel) get_param_id_to_sharded_param_map() (in module core.dist_checkpointing.optimizer) get_parameter_state_dp_reshardable() (core.optimizer.distrib_optimizer.DistributedOptimizer method) get_parameter_state_dp_zero() (core.optimizer.distrib_optimizer.DistributedOptimizer method) get_parameters() (core.optimizer.optimizer.MegatronOptimizer method) get_path_count() (in module core.config_logger) get_path_with_count() (in module core.config_logger) get_paused_avail() (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) get_paused_used() (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) get_permuted_hidden_states_by_experts() (core.transformer.moe.token_dispatcher._DeepepManager method) (core.transformer.moe.token_dispatcher._DispatchManager method) (core.transformer.moe.token_dispatcher._HybridEPManager method) get_pg() (core.hyper_comm_grid.HyperCommGrid method) get_pg_rank() (in module core.utils) get_pg_size() (in module core.utils) get_pg_src_rank() (in module core.utils) get_pipeline_model_parallel_first_rank() (in module core.parallel_state) get_pipeline_model_parallel_group() (in module core.parallel_state) get_pipeline_model_parallel_last_rank() (in module core.parallel_state) get_pipeline_model_parallel_next_rank() (in module core.parallel_state) get_pipeline_model_parallel_prev_rank() (in module core.parallel_state) get_pipeline_model_parallel_rank() (in module core.parallel_state) get_pipeline_model_parallel_world_size() (in module core.parallel_state) get_pos_emb_on_this_cp_rank() (in module core.models.common.embeddings.rope_utils) get_pos_enc() (core.models.vision.radio.RADIOViTModel method) get_position_embedding_group() (in module core.parallel_state) get_pp_first_rank() (in module core.pipeline_parallel.utils) get_pp_last_rank() (in module core.pipeline_parallel.utils) get_pp_next_rank() (in module core.pipeline_parallel.utils) get_pp_prev_rank() (in module core.pipeline_parallel.utils) get_pp_rank_microbatches() (in module core.pipeline_parallel.schedules) get_qattention_params() (core.extensions.kitchen.CompoundParamsConfigSchema method) get_qfa_params() (core.extensions.kitchen.CompoundParamsConfigSchema method) get_qkv_layer_norm_weights() (core.transformer.transformer_layer.TransformerLayer method) get_qlinear_params() (core.extensions.kitchen.CompoundParamsConfigSchema method) get_quant_config_or_none() (in module core.quantization.utils) get_quantization_context() (core.transformer.cuda_graphs._CudaGraphRunner method) get_query_dir() (in module core.datasets.retro.query.utils) get_query_key_value_tensors() (core.transformer.attention.Attention method) (core.transformer.attention.CrossAttention method) (core.transformer.attention.SelfAttention method) (core.transformer.multi_latent_attention.MLASelfAttention method) get_ranks() (core.parallel_state.RankGenerator method) get_ready_bucket_group_for_reduction() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.GradReducePipeline method) get_relative_seq_len() (core.models.common.embeddings.relative_pos_embedding.RelativePositionEmbedding static method) get_request() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) get_rerun_state_machine() (in module core.rerun_state_machine) get_restored_hidden_states_by_experts() (core.transformer.moe.token_dispatcher._DeepepManager method) (core.transformer.moe.token_dispatcher._DispatchManager method) (core.transformer.moe.token_dispatcher._HybridEPManager method) get_retro_datasets() (in module core.datasets.retro.query.retro_dataset) get_retro_decoder_block_spec() (in module core.models.retro.decoder_spec) get_retro_decoder_layer_local_spec() (in module core.models.retro.decoder_spec) get_retro_decoder_layer_te_spec() (in module core.models.retro.decoder_spec) get_retro_encoder_block_spec() (in module core.models.retro.encoder_spec) get_retro_encoder_layer_local_spec() (in module core.models.retro.encoder_spec) get_retro_encoder_layer_te_spec() (in module core.models.retro.encoder_spec) get_root_mesh() (core.distributed.fsdp.src.megatron_fsdp.utils.FSDPDistributedIndex method) get_rotary_seq_len() (core.models.common.embeddings.rotary_pos_embedding.RotaryEmbedding method) get_save_function_and_args() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync method) get_schedule_table() (in module core.pipeline_parallel.schedules) get_shard_from_bucket() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) get_shard_from_local_buffer() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) get_skipped_iterations_from_tracker_file() (core.rerun_state_machine.RerunStateMachine class method) get_sliding_window_causal_mask() (in module core.transformer.utils) get_states() (core.tensor_parallel.random.CudaRNGStatesTracker method) get_stream_generator() (core.inference.engines.static_engine.StaticInferenceEngine method) get_submesh() (core.distributed.fsdp.src.megatron_fsdp.utils.FSDPDistributedIndex method) get_t5_decoder_with_local_block_spec() (in module core.models.T5.t5_spec) get_t5_decoder_with_transformer_engine_block_spec() (in module core.models.T5.t5_spec) get_t5_encoder_with_local_block_spec() (in module core.models.T5.t5_spec) get_t5_encoder_with_transformer_engine_block_spec() (in module core.models.T5.t5_spec) get_te_version() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.models.bert.bert_model) (in module core.utils) get_tensor() (core.distributed.fsdp.src.megatron_fsdp.utils.GlobalMemoryBuffer method) (core.utils.GlobalMemoryBuffer method) get_tensor_and_context_parallel_group() (in module core.parallel_state) get_tensor_and_context_parallel_rank() (in module core.parallel_state) get_tensor_and_context_parallel_world_size() (in module core.parallel_state) get_tensor_and_data_parallel_group() (in module core.parallel_state) get_tensor_model_parallel_group() (in module core.parallel_state) get_tensor_model_parallel_group_if_none() (in module core.utils) get_tensor_model_parallel_rank() (in module core.parallel_state) get_tensor_model_parallel_src_rank() (in module core.parallel_state) get_tensor_model_parallel_world_size() (in module core.parallel_state) get_tensor_shapes() (in module core.pipeline_parallel.schedules) get_tensors() (core.transformer.cuda_graphs._CudaGraphRunner class method) get_text_dataset_for_adding() (in module core.datasets.retro.index.build) get_text_dataset_for_training() (in module core.datasets.retro.index.build) get_text_embeddings() (core.models.mimo.model.base.MimoModel method) get_torch_version() (in module core.utils) get_total() (core.energy_monitor.EnergyMonitor method) get_training_data_block_dir() (in module core.datasets.retro.index.utils) get_training_data_block_paths() (in module core.datasets.retro.index.utils) get_training_data_merged_path() (in module core.datasets.retro.index.utils) get_training_data_root_dir() (in module core.datasets.retro.index.utils) get_transformer_layer_offset() (in module core.transformer.transformer_layer) get_trtllm_pretrained_config_and_model_weights() (core.export.trtllm.trtllm_helper.TRTLLMHelper method) get_unflattened_state_dict() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) get_updated_expert_bias() (in module core.transformer.moe.moe_utils) get_virtual_pipeline_model_parallel_rank() (in module core.parallel_state) get_virtual_pipeline_model_parallel_world_size() (in module core.parallel_state) get_vit_layer_with_local_spec() (in module core.models.vision.vit_layer_specs) get_vit_layer_with_transformer_engine_spec() (in module core.models.vision.vit_layer_specs) get_wd() (core.optimizer_param_scheduler.OptimizerParamScheduler method) global_mempool (core.transformer.cuda_graphs.CudaGraphManager attribute) global_offset (core.dist_checkpointing.mapping.ShardedObject attribute) (core.dist_checkpointing.mapping.ShardedTensor attribute) global_rank (core.dist_checkpointing.strategies.two_stage._ShardedTensorMetadata attribute) global_shape (core.dist_checkpointing.mapping.ShardedObject attribute) (core.dist_checkpointing.mapping.ShardedTensor attribute) global_slice() (core.dist_checkpointing.mapping.ShardedTensor method) GlobalMemoryBuffer (class in core.distributed.fsdp.src.megatron_fsdp.utils) (class in core.utils) GlobalSymmetricMemoryBuffer (class in core.utils) GlobMatcher (class in core.quantization.quant_config) glu_linear_offset (core.transformer.transformer_config.TransformerConfig attribute) gpt (core.datasets.retro.config.tokenizers.RetroTokenizers attribute) GPTChunkDataset (class in core.datasets.retro.query.gpt_chunk_dataset) GPTDataset (class in core.datasets.gpt_dataset) GPTDatasetConfig (class in core.datasets.gpt_dataset) GPTInferenceWrapper (class in core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper) GPTModel (class in core.models.gpt.gpt_model) GPTTokenizer (class in core.tokenizers.text.models.gpt_tokenizer) GPTToTextDataset (class in core.datasets.retro.utils) GRAD (core.distributed.param_and_grad_buffer.BufferType attribute) grad_reduce_in_fp32 (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) grad_scale_func (core.model_parallel_config.ModelParallelConfig attribute) grad_sync_func (core.model_parallel_config.ModelParallelConfig attribute) gradient_accumulation_fusion (core.model_parallel_config.ModelParallelConfig attribute) gradient_reduce_div_fusion (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) gradient_reduce_preprocessing() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) GradReducePipeline (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) grads_states_parallel_group_is_shared() (core.optimizer.optimizer.ChainedOptimizer method) GraphableMegatronModule (class in core.transformer.module) group_limited_topk() (in module core.transformer.moe.moe_utils) grouped_gemm_is_available() (in module core.transformer.moe.grouped_gemm_util) grouped_mlp_modules() (core.extensions.kitchen.KitchenSpecProvider method) (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.BackendSpecProvider method) (core.models.backends.InferenceSpecProvider method) (core.models.backends.LocalSpecProvider method) GroupedMLP (class in core.transformer.moe.experts) H handle_experts_in_state_dict() (in module core.transformer.fsdp_dtensor_checkpoint) handle_fp8_extra_state_case() (in module core.transformer.fsdp_dtensor_checkpoint) handle_swiglu_in_state_dict() (in module core.transformer.fsdp_dtensor_checkpoint) has_config_logger_enabled() (in module core.config_logger) has_regular_grid (core.dist_checkpointing.mapping.ShardedTensor property) has_unfinished_requests() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) HAVE_APEX_OR_TE (in module core.optimizer.distrib_optimizer) have_requests_pending() (core.inference.scheduler.Scheduler method) HAVE_TE (in module core.fp4_utils) (in module core.fp8_utils) HAVE_TE_FP4_TENSOR_CLASS (in module core.fp4_utils) HAVE_TE_FP8_TENSOR_CLASS (in module core.fp8_utils) hcp (core.process_groups_config.ProcessGroupCollection attribute) head_object() (core.datasets.object_storage_utils.S3Client method) Headers (class in core.inference.headers) hetereogenous_dist_checkpoint (core.transformer.transformer_config.TransformerConfig attribute) heterogeneous_block_specs (core.transformer.transformer_config.TransformerConfig attribute) hidden_dropout (core.transformer.transformer_config.TransformerConfig attribute) hidden_size (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) hierarchical_context_parallel_sizes (core.model_parallel_config.ModelParallelConfig attribute) hnorm (core.transformer.multi_token_prediction.MultiTokenPredictionLayerSubmodules attribute) hsdp_gbuf (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) hsdp_wbuf (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) HuggingFaceModule (class in core.models.huggingface.module) HuggingFaceTokenizer (class in core.tokenizers.text.libraries.huggingface_tokenizer) HybridDeviceOptimizer (class in core.optimizer.cpu_offloading.hybrid_optimizer) HybridEPCombine (class in core.transformer.moe.fused_a2a) HybridEPDispatch (class in core.transformer.moe.fused_a2a) HyperCommGrid (class in core.hyper_comm_grid) hysteresis (core.optimizer.optimizer_config.OptimizerConfig attribute) I id_to_token() (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) IdentityFuncOp (class in core.transformer.identity_op) IdentityOp (class in core.transformer.identity_op) IDLE (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.TrainingState attribute) ids_to_text() (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract method) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer method) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) ids_to_tokens() (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract method) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer method) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) idx (core.dist_checkpointing.strategies.async_utils._ActiveAsyncRequest attribute) IGNORE_ALL (core.dist_checkpointing.validation.StrictHandling attribute) IGNORE_INDEX (in module core.models.multimodal.llava_model) image_h (core.datasets.multimodal_dataset.MultimodalDatasetConfig attribute) IMAGE_TOKEN (in module core.models.multimodal.llava_model) image_w (core.datasets.multimodal_dataset.MultimodalDatasetConfig attribute) imgs (core.inference.inference_request.VLMInferenceRequest attribute) import_module() (in module core.transformer.spec_utils) import_package() (core.msc_utils._FeatureFlag method) in_proj (core.ssm.mamba_mixer.MambaMixerSubmodules attribute) increment_batch_size_offset() (core.inference.contexts.base_context.BaseInferenceContext method) increment_sequence_len_offset() (core.inference.contexts.base_context.BaseInferenceContext method) Index (class in core.datasets.retro.index.index) IndexedDataset (class in core.datasets.indexed_dataset) IndexedDatasetBuilder (class in core.datasets.indexed_dataset) IndexFactory (class in core.datasets.retro.index.factory) IndicesToMultihot (class in core.fusions.fused_indices_converter) inference_batch_times_seqlen_threshold (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) inference_fuse_tp_communication (core.transformer.transformer_config.TransformerConfig attribute) inference_max_requests (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) inference_max_seq_length (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) inference_parameters (core.inference.inference_request.InferenceRequest attribute) inference_params (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper property) inference_pp_size (core.export.export_config.ExportConfig attribute) inference_rng_tracker (core.transformer.transformer_config.TransformerConfig attribute) inference_sampling_seed (core.transformer.transformer_config.TransformerConfig attribute) inference_tp_size (core.export.export_config.ExportConfig attribute) InferenceBatchDimensions (class in core.inference.batch_dimensions_utils) InferenceClient (class in core.inference.inference_client) InferenceLayerNormColumnParallelLinear (class in core.tensor_parallel.inference_layers) InferenceRequest (class in core.inference.inference_request) InferenceRowParallelLinear (class in core.tensor_parallel.inference_layers) InferenceSpecProvider (class in core.models.backends) InferenceWrapperConfig (class in core.inference.model_inference_wrappers.inference_wrapper_config) init() (in module core.nccl_allocator) init_cuda_graph_cache() (in module core.transformer.utils) init_data() (core.dist_checkpointing.mapping.ShardedTensor method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) init_hybrid_ep_buffer() (in module core.transformer.moe.fused_a2a) init_indexed_dataset_infos() (in module core.datasets.retro.db.utils) init_method (core.transformer.transformer_config.TransformerConfig attribute) init_method_normal() (in module core.utils) init_method_std (core.transformer.transformer_config.TransformerConfig attribute) init_model_with_meta_device (core.transformer.transformer_config.TransformerConfig attribute) init_num_microbatches_calculator() (in module core.num_microbatches_calculator) init_tensors() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict method) initial_loss_scale (core.optimizer.optimizer_config.OptimizerConfig attribute) INITIAL_RUN (core.rerun_state_machine.RerunState attribute) (core.rerun_state_machine.RerunValidationStatus attribute) initialize() (core.datasets.indexed_dataset.IndexedDataset method) initialize_attention_state() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) initialize_model_parallel() (in module core.parallel_state) initialize_rerun_state_machine() (in module core.rerun_state_machine) initialize_rng_tracker() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.tensor_parallel.random) input_layernorm (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) input_layernorm_bias (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) input_layernorm_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) insert_tensors() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict method) inspect_types() (in module core.dist_checkpointing.dict_utils) install_optimized_model_weights() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) int16 (core.datasets.indexed_dataset.DType attribute) int32 (core.datasets.indexed_dataset.DType attribute) int64 (core.datasets.indexed_dataset.DType attribute) int8 (core.datasets.indexed_dataset.DType attribute) inter_dist_opt (core.process_groups_config.ProcessGroupCollection attribute) internal_api() (in module core.utils) intra_dist_opt (core.process_groups_config.ProcessGroupCollection attribute) intra_dp_cp (core.process_groups_config.ProcessGroupCollection attribute) intra_expt_dp (core.process_groups_config.ProcessGroupCollection attribute) inv_scale (core.optimizer.grad_scaler.MegatronGradScaler property) inv_vocab (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) is_applicable_for_batch_dim() (core.inference.batch_dimensions_utils.InferenceBatchDimensions method) is_aux_loss_enabled() (core.transformer.moe.router.TopKRouter method) is_batch_invariant_mode_enabled() (in module core.transformer.custom_layers.batch_invariant_kernels) is_causal_conv1d_min_version() (in module core.utils) is_column_parallel_linear() (in module core.fp8_utils) is_current_async_call_done() (core.dist_checkpointing.strategies.async_utils.AsyncCaller method) (core.dist_checkpointing.strategies.async_utils.PersistentAsyncCaller method) (core.dist_checkpointing.strategies.async_utils.TemporalAsyncCaller method) is_current_rank_in_grid() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) is_decode_only() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) (core.inference.contexts.static_context.StaticInferenceContext method) is_dynamic_batching() (core.inference.contexts.base_context.BaseInferenceContext method) is_enabled() (core.msc_utils._FeatureFlag method) is_experimental_enabled() (in module core.config) is_expert_param (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) is_fa_min_version() (in module core.utils) is_first_last_bf16_layer() (in module core.fp8_utils) is_float8tensor() (in module core.dist_checkpointing.exchange_utils) (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.fp8_utils) is_frozen (core.dist_checkpointing.strategies.async_utils.AsyncRequest attribute) is_gated_activation() (in module core.export.trtllm.trtllm_weights_converter.utils) is_graph_capturing() (in module core.transformer.cuda_graphs) is_hollow (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict property) is_hybrid_model (core.transformer.transformer_config.TransformerConfig attribute) is_initialized() (core.extensions.transformer_engine.TECudaRNGStatesTracker method) (core.tensor_parallel.random.CudaRNGStatesTracker method) (in module core.parallel_state) is_kernel_available() (core.fusions.fused_softmax.FusedScaleMaskSoftmax method) is_layer_window_attention() (in module core.transformer.utils) is_main_replica() (in module core.dist_checkpointing.mapping) is_mamba_min_version() (in module core.utils) is_mcore_tensor_model_parallel() (in module core.distributed.fsdp.src.megatron_fsdp.utils) is_mcore_tensor_parallel_duplicated() (in module core.distributed.fsdp.src.megatron_fsdp.utils) is_memory_available() (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) is_mxfp8tensor() (in module core.fp8_utils) is_nvfp4tensor() (in module core.fp4_utils) is_object_storage_path() (in module core.datasets.object_storage_utils) is_pipeline_first_stage() (in module core.inference.communication_utils) (in module core.parallel_state) is_pipeline_last_stage() (in module core.inference.communication_utils) (in module core.parallel_state) is_pp_first_stage() (in module core.pipeline_parallel.utils) is_pp_last_stage() (in module core.pipeline_parallel.utils) is_rank_in_embedding_group() (in module core.parallel_state) is_rank_in_position_embedding_group() (in module core.parallel_state) is_row_parallel_linear() (in module core.fp8_utils) is_single_shape() (in module core.pipeline_parallel.p2p_communication) is_static_batching() (core.inference.contexts.base_context.BaseInferenceContext method) (core.inference.contexts.dynamic_context.DynamicInferenceContext method) (core.inference.contexts.static_context.StaticInferenceContext method) is_submodule() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.utils) is_te_min_version() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.utils) is_torch_min_version() (in module core.utils) is_unexpectedly_large() (core.rerun_state_machine.RerunStateMachine method) is_unitialized() (in module core.parallel_state) is_valid() (core.inference.batch_dimensions_utils.InferenceBatchDimensions method) is_vp_first_stage() (in module core.pipeline_parallel.utils) is_vp_last_stage() (in module core.pipeline_parallel.utils) items() (core.optimizer.optimizer.ProxyDict method) J jit_fuser (in module core.jit) JSONEncoderWithMcoreTypes (class in core.config_logger) K k_layernorm (core.transformer.attention.SelfAttentionSubmodules attribute) keep_fp8_transpose_cache (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) KEEP_VARS_HINT (in module core.dist_checkpointing.optimizer) key (core.dist_checkpointing.mapping.ShardedBase attribute) (core.dist_checkpointing.mapping.ShardedObject attribute) (core.dist_checkpointing.mapping.ShardedTensor attribute) (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) key_value_cache() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) kitchen_attention_backend (core.transformer.transformer_config.TransformerConfig attribute) kitchen_config_type (core.extensions.kitchen.CompoundParamsConfigSchema attribute) (core.extensions.kitchen.QAttentionParamsConfigSchema attribute) (core.extensions.kitchen.QFlashAttentionParamsConfigSchema attribute) (core.extensions.kitchen.QLinearParamsConfigSchema attribute) kitchen_quantization_recipe_config() (in module core.quantization.utils) KitchenColumnParallelGroupedLinear (class in core.extensions.kitchen) KitchenColumnParallelLinear (class in core.extensions.kitchen) KitchenConfigType (class in core.extensions.kitchen) KitchenDotProductAttention (class in core.extensions.kitchen) KitchenFlashAttention (class in core.extensions.kitchen) KitchenGroupedLinear (class in core.extensions.kitchen) KitchenLayerNormColumnParallelLinear (class in core.extensions.kitchen) KitchenLinear (class in core.extensions.kitchen) KitchenQuantizationParams (class in core.extensions.kitchen) KitchenRowParallelGroupedLinear (class in core.extensions.kitchen) KitchenRowParallelLinear (class in core.extensions.kitchen) KitchenSpecProvider (class in core.extensions.kitchen) kv_channels (core.transformer.transformer_config.TransformerConfig attribute) kv_layernorm (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) kv_lora_rank (core.transformer.transformer_config.MLATransformerConfig attribute) L L2Norm (class in core.transformer.torch_norm) language_model_spec (core.models.mimo.config.base_configs.MimoModelConfig attribute) LanguageModelEmbedding (class in core.models.common.embeddings.language_model_embedding) LanguageModule (class in core.models.common.language_module.language_module) lap() (core.energy_monitor.EnergyMonitor method) last_token_logits() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) latency (core.inference.inference_request.DynamicInferenceRequest attribute) (core.inference.inference_request.DynamicInferenceRequestRecord attribute) layer_norm (core.transformer.multi_token_prediction.MultiTokenPredictionLayerSubmodules attribute) (core.transformer.transformer_block.TransformerBlockSubmodules attribute) layer_norm() (core.extensions.kitchen.KitchenSpecProvider method) (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.BackendSpecProvider method) (core.models.backends.InferenceSpecProvider method) (core.models.backends.LocalSpecProvider method) layer_number (core.quantization.quant_config.MatchContext attribute) layer_specs (core.transformer.multi_token_prediction.MultiTokenPredictionBlockSubmodules attribute) (core.transformer.transformer_block.TransformerBlockSubmodules attribute) layernorm_epsilon (core.transformer.transformer_config.TransformerConfig attribute) layernorm_zero_centered_gamma (core.transformer.transformer_config.TransformerConfig attribute) LayerType (class in core.transformer.enums) Linear (class in core.post_training.modelopt.layers) linear() (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.InferenceSpecProvider method) linear_fc1 (core.transformer.mlp.MLPSubmodules attribute) linear_fc1_forward_and_act() (core.transformer.moe.shared_experts.SharedExpertMLP method) linear_fc2 (core.transformer.mlp.MLPSubmodules attribute) linear_fc2_forward() (core.transformer.moe.shared_experts.SharedExpertMLP method) linear_kv (core.transformer.attention.CrossAttentionSubmodules attribute) linear_kv_down_proj (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) linear_kv_up_proj (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) linear_proj (core.transformer.attention.CrossAttentionSubmodules attribute) (core.transformer.attention.SelfAttentionSubmodules attribute) (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) linear_q (core.transformer.attention.CrossAttentionSubmodules attribute) linear_q_down_proj (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) linear_q_proj (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) linear_q_up_proj (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) linear_qkv (core.transformer.attention.SelfAttentionSubmodules attribute) linear_with_frozen_weight() (in module core.tensor_parallel.layers) linear_with_grad_accumulation_and_async_allreduce() (in module core.tensor_parallel.layers) LinearWithFrozenWeight (class in core.tensor_parallel.layers) LinearWithGradAccumulationAndAsyncCommunication (class in core.tensor_parallel.layers) LLaVAModel (class in core.models.multimodal.llava_model) lm_head (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) load() (core.dist_checkpointing.strategies.base.LoadShardedStrategy method) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper method) (core.dist_checkpointing.strategies.tensorstore.TensorStoreLoadShardedStrategy method) (core.dist_checkpointing.strategies.torch.TorchDistLoadShardedStrategy method) (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) (core.dist_checkpointing.strategies.zarr.ZarrLoadShardedStrategy method) (in module core.dist_checkpointing.serialization) load_and_upcycle_model() (in module core.transformer.moe.upcycling_utils) LOAD_COMMON (core.dist_checkpointing.strategies.base.StrategyAction attribute) load_common() (core.dist_checkpointing.strategies.base.LoadCommonStrategy method) (core.dist_checkpointing.strategies.common.TorchCommonLoadStrategy method) load_common_state_dict() (in module core.dist_checkpointing.serialization) load_content_metadata() (in module core.dist_checkpointing.serialization) load_doc_tuples() (core.datasets.retro.db.dataset.DBDataset method) load_indexed_datasets() (in module core.datasets.retro.db.utils) load_parameter_state() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) load_parameter_state_from_dp_reshardable() (core.optimizer.distrib_optimizer.DistributedOptimizer method) load_parameter_state_from_dp_zero() (core.optimizer.distrib_optimizer.DistributedOptimizer method) load_parameter_state_from_dp_zero_legacy() (core.optimizer.distrib_optimizer.DistributedOptimizer method) load_parameter_state_from_fs_model_space() (core.optimizer.distrib_optimizer.DistributedOptimizer method) load_parameter_state_from_fully_reshardable() (core.optimizer.distrib_optimizer.DistributedOptimizer method) load_plain_tensors() (in module core.dist_checkpointing.serialization) load_preprocess() (in module core.dist_checkpointing.state_dict_utils) load_quantization_recipe() (in module core.quantization.utils) LOAD_SHARDED (core.dist_checkpointing.strategies.base.StrategyAction attribute) load_sharded_metadata() (core.dist_checkpointing.strategies.base.LoadCommonStrategy method) (core.dist_checkpointing.strategies.base.LoadShardedStrategy method) (core.dist_checkpointing.strategies.common.TorchCommonLoadStrategy method) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper method) (core.dist_checkpointing.strategies.torch.TorchDistLoadShardedStrategy method) (in module core.dist_checkpointing.serialization) load_sharded_objects() (core.dist_checkpointing.strategies.base.LoadCommonStrategy method) (core.dist_checkpointing.strategies.common.TorchCommonLoadStrategy method) load_state_dict() (core.distributed.data_parallel_base._BaseDataParallel method) (core.distributed.fsdp.mcore_fsdp_adapter.FullyShardedDataParallel method) (core.distributed.torch_fully_sharded_data_parallel.TorchFullyShardedDataParallel method) (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.grad_scaler.ConstantGradScaler method) (core.optimizer.grad_scaler.DynamicGradScaler method) (core.optimizer.grad_scaler.MegatronGradScaler method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) (core.optimizer_param_scheduler.OptimizerParamScheduler method) (core.rerun_state_machine.RerunDataIterator method) (core.rerun_state_machine.RerunErrorInjector method) (core.rerun_state_machine.RerunStateMachine method) (core.transformer.module.Float16Module method) load_tensor_from_storage() (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) load_tensors_metadata() (core.dist_checkpointing.strategies.base.LoadShardedStrategy method) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper method) (core.dist_checkpointing.strategies.tensorstore.TensorStoreLoadShardedStrategy method) (core.dist_checkpointing.strategies.torch.TorchDistLoadShardedStrategy method) (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) (core.dist_checkpointing.strategies.zarr.ZarrLoadShardedStrategy method) (in module core.dist_checkpointing.serialization) load_zarr_based_sharded_metadata() (in module core.dist_checkpointing.strategies.zarr) LoadCommonStrategy (class in core.dist_checkpointing.strategies.base) LoadShardedStrategy (class in core.dist_checkpointing.strategies.base) LoadStrategyBase (class in core.dist_checkpointing.strategies.base) local (core.transformer.enums.AttnBackend attribute) local_chunk_offset_in_global() (core.dist_checkpointing.mapping.ShardedTensor method) local_multi_tensor_applier() (in module core.utils) local_multi_tensor_l2_norm() (in module core.utils) local_multi_tensor_scale() (in module core.utils) local_shape (core.dist_checkpointing.mapping.ShardedTensor attribute) LocalNonpersistentObject (class in core.dist_checkpointing.mapping) LocalShardsContainer (class in core.dist_checkpointing.strategies.checkpointable) LocalSpecProvider (class in core.models.backends) locate_item_in_global_item() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) log() (core.timers.Timers method) LOG_ALL (core.dist_checkpointing.validation.StrictHandling attribute) log_config_to_disk() (in module core.config_logger) log_max_attention_logit (core.transformer.transformer_config.TransformerConfig attribute) log_num_zeros_in_grad (core.optimizer.optimizer_config.OptimizerConfig attribute) log_on_each_pipeline_stage() (in module core.utils) log_retro_rank_0() (in module core.datasets.retro.utils) log_single_rank() (in module core.utils) log_softmax() (in module core.transformer.custom_layers.batch_invariant_kernels) LOG_UNEXPECTED (core.dist_checkpointing.validation.StrictHandling attribute) logger (in module core.datasets.blended_dataset) (in module core.datasets.blended_megatron_dataset_builder) (in module core.datasets.blended_megatron_dataset_config) (in module core.datasets.gpt_dataset) (in module core.datasets.indexed_dataset) (in module core.datasets.masked_dataset) (in module core.datasets.megatron_tokenizer) (in module core.datasets.retro.query.multi_split_gpt_dataset) (in module core.datasets.retro.utils) (in module core.datasets.utils) (in module core.dist_checkpointing.exchange_utils) (in module core.dist_checkpointing.mapping) (in module core.dist_checkpointing.optimizer) (in module core.dist_checkpointing.serialization) (in module core.dist_checkpointing.strategies.async_utils) (in module core.dist_checkpointing.strategies.common) (in module core.dist_checkpointing.strategies.filesystem_async) (in module core.dist_checkpointing.strategies.fully_parallel) (in module core.dist_checkpointing.strategies.state_dict_saver) (in module core.dist_checkpointing.strategies.tensorstore) (in module core.dist_checkpointing.strategies.torch) (in module core.dist_checkpointing.strategies.two_stage) (in module core.dist_checkpointing.strategies.zarr) (in module core.dist_checkpointing.tensor_aware_state_dict) (in module core.dist_checkpointing.validation) (in module core.distributed.distributed_data_parallel) (in module core.distributed.fsdp.mcore_fsdp_adapter) (in module core.distributed.fsdp.src.megatron_fsdp.fully_shard) (in module core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp) (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.distributed.param_and_grad_buffer) (in module core.extensions.kitchen) (in module core.full_cuda_graph) (in module core.models.common.embeddings.relative_pos_embedding) (in module core.models.common.embeddings.rope_utils) (in module core.models.common.embeddings.rotary_pos_embedding) (in module core.models.common.embeddings.yarn_rotary_pos_embedding) (in module core.models.mimo.model.base) (in module core.msc_utils) (in module core.num_microbatches_calculator) (in module core.optimizer) (in module core.optimizer.distrib_optimizer) (in module core.optimizer.optimizer) (in module core.optimizer_param_scheduler) (in module core.parallel_state) (in module core.post_training.modelopt.gpt.state_dict_hooks) (in module core.post_training.modelopt.layers) (in module core.quantization.quant_config) (in module core.rerun_state_machine) (in module core.ssm.mamba_hybrid_layer_allocation) (in module core.ssm.mamba_mixer) (in module core.timers) (in module core.tokenizers.megatron_tokenizer) (in module core.tokenizers.text.libraries.huggingface_tokenizer) (in module core.tokenizers.text.libraries.megatron_hf_tokenizer) (in module core.transformer.cuda_graphs) (in module core.transformer.fsdp_dtensor_checkpoint) (in module core.transformer.mlp) (in module core.transformer.moe.experts) (in module core.transformer.moe.token_dispatcher) (in module core.transformer.pipeline_parallel_layer_layout) (in module core.transformer.transformer_block) (in module core.transformer.transformer_layer) (in module core.utils) logger_stack() (in module core.dist_checkpointing.utils) loss (core.transformer.enums.LayerType attribute) loss_scale (core.optimizer.optimizer_config.OptimizerConfig attribute) loss_scale_window (core.optimizer.optimizer_config.OptimizerConfig attribute) LowLevelDataset (in module core.datasets.megatron_dataset) lr (core.optimizer.optimizer_config.OptimizerConfig attribute) M main_grad_buffer (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) main_grads_dtype (core.optimizer.optimizer_config.OptimizerConfig attribute) main_loss_backward_scale (core.transformer.moe.moe_utils.MoEAuxLossAutoScaler attribute) (core.transformer.multi_token_prediction.MTPLossAutoScaler attribute) main_params_dtype (core.optimizer.optimizer_config.OptimizerConfig attribute) main_rank_for_shard (core.dist_checkpointing.exchange_utils.ShardDistribution attribute) main_weight_buffer (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) MAJOR (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) make_fsdp_dtensor() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) make_object_verbose() (core.datasets.retro.index.index.Index class method) make_sharded_object_for_checkpoint() (in module core.transformer.utils) make_sharded_optimizer_tensor() (in module core.dist_checkpointing.optimizer) make_sharded_tensor_for_checkpoint() (in module core.utils) make_sharded_tensors_for_checkpoint() (in module core.transformer.utils) make_tp_sharded_tensor_for_checkpoint() (in module core.utils) make_viewless() (in module core.pipeline_parallel.utils) make_viewless_tensor() (in module core.utils) MakeViewlessTensor (class in core.utils) MAMBA (core.ssm.mamba_hybrid_layer_allocation.Symbols attribute) mamba_bda (core.ssm.mamba_layer.MambaLayerSubmodules attribute) mamba_head_dim (core.transformer.transformer_config.TransformerConfig attribute) mamba_inference_stack_spec (in module core.models.mamba.mamba_layer_specs) mamba_layer (core.ssm.mamba_block.MambaStackSubmodules attribute) mamba_num_groups (core.transformer.transformer_config.TransformerConfig attribute) mamba_num_heads (core.transformer.transformer_config.TransformerConfig attribute) mamba_stack_spec (in module core.models.mamba.mamba_layer_specs) mamba_state_dim (core.transformer.transformer_config.TransformerConfig attribute) mamba_state_shapes_per_request() (core.ssm.mamba_block.MambaStack method) (core.ssm.mamba_layer.MambaLayer method) (core.ssm.mamba_mixer.MambaMixer method) mamba_states_cache() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) MambaContextParallel (class in core.ssm.mamba_context_parallel) MambaLayer (class in core.ssm.mamba_layer) MambaLayerSubmodules (class in core.ssm.mamba_layer) MambaMixer (class in core.ssm.mamba_mixer) MambaMixerSubmodules (class in core.ssm.mamba_mixer) MambaModel (class in core.models.mamba.mamba_model) MambaStack (class in core.ssm.mamba_block) MambaStackSubmodules (class in core.ssm.mamba_block) MambaTokenizer (class in core.tokenizers.text.models.mamba_tokenizer) map_reduce() (in module core.dist_checkpointing.dict_utils) mask (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) mask_id (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) masked_softmax_fusion (core.transformer.transformer_config.TransformerConfig attribute) MaskedWordPieceDataset (class in core.datasets.masked_dataset) MaskedWordPieceDatasetConfig (class in core.datasets.masked_dataset) masking_do_full_word (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) masking_do_permutation (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) masking_max_ngram (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) masking_probability (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) masking_use_geometric_distribution (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) masking_use_longer_ngrams (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) match() (core.quantization.quant_config.GlobMatcher method) (core.quantization.quant_config.Matcher method) (core.quantization.quant_config.RecipeConfig method) match_graph_config() (core.inference.batch_dimensions_utils.CUDAGraphBatchDimensionBuilder static method) match_input (core.extensions.kitchen.KitchenQuantizationParams attribute) match_to_config_key() (core.quantization.quant_config.RecipeConfig method) MatchContext (class in core.quantization.quant_config) Matcher (class in core.quantization.quant_config) matmul_kernel_persistent() (in module core.transformer.custom_layers.batch_invariant_kernels) matmul_persistent() (in module core.transformer.custom_layers.batch_invariant_kernels) max_allowed_chunks() (core.dist_checkpointing.mapping.ShardedTensor method) max_btime (core.utils._StragglerData attribute) max_clock (core.utils._StragglerData attribute) max_elapsed (core.utils._StragglerData attribute) max_power (core.utils._StragglerData attribute) max_seqlen_kv (core.packed_seq_params.PackedSeqParams attribute) max_seqlen_q (core.packed_seq_params.PackedSeqParams attribute) max_sequence_length (core.datasets.gpt_dataset.MockGPTLowLevelDataset attribute) max_temp (core.utils._StragglerData attribute) max_util (core.utils._StragglerData attribute) MaxSequenceLengthOverflowError maybe_cat() (in module core.utils) maybe_finalize_async_calls() (core.dist_checkpointing.strategies.async_utils.AsyncCallsQueue method) maybe_get_tensor() (core.utils.GlobalSymmetricMemoryBuffer method) maybe_init_gloo_group() (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) maybe_initialize_symmetric_memory() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) maybe_inject() (core.rerun_state_machine.RerunErrorInjector method) maybe_load_config() (in module core.dist_checkpointing.core) maybe_miscompare() (core.rerun_state_machine.RerunErrorInjector method) maybe_move_tensor_to_cpu() (in module core.transformer.moe.moe_utils) maybe_report_missing_and_unexpected_keys() (in module core.dist_checkpointing.validation) mcore_gpt_load_te_state_dict_pre_hook() (in module core.post_training.modelopt.gpt.state_dict_hooks) mcore_to_pyt_state_dict() (in module core.dist_checkpointing.strategies.torch) MCoreLoadPlanner (class in core.dist_checkpointing.strategies.torch) MCoreMetadata (class in core.dist_checkpointing.strategies.torch) MCoreSavePlan (class in core.dist_checkpointing.strategies.torch) MCoreSavePlanner (class in core.dist_checkpointing.strategies.torch) MCoreTensorAwareStateDict (class in core.dist_checkpointing.tensor_aware_state_dict) mean_batch_invariant() (in module core.transformer.custom_layers.batch_invariant_kernels) mean_dim() (in module core.transformer.custom_layers.batch_invariant_kernels) mean_kernel() (in module core.transformer.custom_layers.batch_invariant_kernels) MEGATRON_CACHE (in module core.tokenizers.text.libraries.megatron_hf_tokenizer) MEGATRON_CONFIG_MAP (in module core.tokenizers.text.libraries.megatron_hf_tokenizer) MegatronDataset (class in core.datasets.megatron_dataset) MegatronFSDP (class in core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp) MegatronGenerate (class in core.inference.text_generation_server.text_generation_server) MegatronGradScaler (class in core.optimizer.grad_scaler) MegatronHFTokenizer (class in core.tokenizers.text.libraries.megatron_hf_tokenizer) MegatronLegacyTokenizer (class in core.datasets.megatron_tokenizer) MegatronModule (class in core.transformer.module) MegatronOptimizer (class in core.optimizer.optimizer) MegatronServer (class in core.inference.text_generation_server.text_generation_server) MegatronTokenizer (class in core.tokenizers.megatron_tokenizer) MegatronTokenizerBase (class in core.tokenizers.base_tokenizer) MegatronTokenizerChatTemplate (class in core.tokenizers.text.libraries.chat_template) MegatronTokenizerText (class in core.tokenizers.text.text_tokenizer) MegatronTokenizerTextAbstract (class in core.tokenizers.text.libraries.abstract_tokenizer) mem (core.datasets.retro.config.bert_embedders.RetroBertEmbedders attribute) MEMBER (core.pipeline_parallel.bridge_communicator.CommRole attribute) memory_efficient_layer_norm (core.transformer.transformer_config.TransformerConfig attribute) merge() (core.inference.inference_request.DynamicInferenceRequestRecord method) (in module core.dist_checkpointing.dict_utils) merge_dbs() (in module core.datasets.retro.db.build) merge_embedding_blocks() (in module core.datasets.retro.index.build) merge_fn (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) merge_global_slice_with_shape() (in module core.dist_checkpointing.strategies.tensorstore) merges_file (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) message (core.rerun_state_machine.Caller attribute) microbatch_group_size_per_vp_stage (core.model_parallel_config.ModelParallelConfig attribute) mid_level_dataset_surplus (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) MidLevelDataset (in module core.datasets.blended_megatron_dataset_builder) MimoModel (class in core.models.mimo.model.base) MimoModelConfig (class in core.models.mimo.config.base_configs) min_btime (core.utils._StragglerData attribute) min_clock (core.utils._StragglerData attribute) min_elapsed (core.utils._StragglerData attribute) min_loss_scale (core.optimizer.optimizer_config.OptimizerConfig attribute) min_lr (core.optimizer.optimizer_config.OptimizerConfig attribute) min_power (core.utils._StragglerData attribute) min_temp (core.utils._StragglerData attribute) min_util (core.utils._StragglerData attribute) MINOR (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) MixedPrecisionOptimizer (class in core.optimizer.optimizer) mixer (core.ssm.mamba_layer.MambaLayerSubmodules attribute) MLASelfAttention (class in core.transformer.multi_latent_attention) MLASelfAttentionSubmodules (class in core.transformer.multi_latent_attention) MLATransformerConfig (class in core.transformer.transformer_config) MLP (class in core.transformer.mlp) mlp (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan attribute) MLP (core.ssm.mamba_hybrid_layer_allocation.Symbols attribute) mlp (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) mlp_bda (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) mlp_chunks_for_prefill (core.transformer.transformer_config.TransformerConfig attribute) mlp_fc_bias (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) mlp_fc_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) mlp_fc_weight_mixture_of_experts (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) mlp_layer (core.ssm.mamba_block.MambaStackSubmodules attribute) mlp_projection_bias (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) mlp_projection_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) mlp_projection_weight_mixture_of_experts (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) mlp_router_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) MLPLayer (class in core.ssm.mlp_layer) MLPSubmodules (class in core.transformer.mlp) mm_batch_invariant() (in module core.transformer.custom_layers.batch_invariant_kernels) mmap_bin_files (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) mock (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) MockGPTDataset (class in core.datasets.gpt_dataset) MockGPTLowLevelDataset (class in core.datasets.gpt_dataset) MockMultimodalDataset (class in core.datasets.multimodal_dataset) modality_submodules_spec (core.models.mimo.config.base_configs.MimoModelConfig attribute) model_parallel_cuda_manual_seed() (in module core.tensor_parallel.random) model_parallel_is_initialized() (in module core.parallel_state) model_weight_buffer (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) ModelChunkState (class in core.models.common.model_chunk_schedule_plan) ModelParallelConfig (class in core.model_parallel_config) ModelType (class in core.enums) (class in core.transformer.enums) (in module core.export.model_type) modify_underlying_storage() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.fp8_utils) module core core.activations core.config core.config_logger core.datasets core.datasets.bert_dataset core.datasets.blended_dataset core.datasets.blended_megatron_dataset_builder core.datasets.blended_megatron_dataset_config core.datasets.gpt_dataset core.datasets.helpers core.datasets.indexed_dataset core.datasets.masked_dataset core.datasets.megatron_dataset core.datasets.megatron_tokenizer core.datasets.multimodal_dataset core.datasets.object_storage_utils core.datasets.retro core.datasets.retro.config core.datasets.retro.config.bert_embedders core.datasets.retro.config.config core.datasets.retro.config.gpt_chunk_datasets core.datasets.retro.config.tokenizers core.datasets.retro.db core.datasets.retro.db.build core.datasets.retro.db.dataset core.datasets.retro.db.utils core.datasets.retro.external_libs core.datasets.retro.index core.datasets.retro.index.build core.datasets.retro.index.factory core.datasets.retro.index.index core.datasets.retro.index.indexes core.datasets.retro.index.indexes.faiss_base core.datasets.retro.index.indexes.faiss_par_add core.datasets.retro.index.utils core.datasets.retro.index.validate core.datasets.retro.query core.datasets.retro.query.gpt_chunk_dataset core.datasets.retro.query.multi_split_gpt_dataset core.datasets.retro.query.query core.datasets.retro.query.retro_dataset core.datasets.retro.query.utils core.datasets.retro.utils core.datasets.t5_dataset core.datasets.utils core.datasets.utils_s3 core.dist_checkpointing core.dist_checkpointing.core core.dist_checkpointing.dict_utils core.dist_checkpointing.exchange_utils core.dist_checkpointing.mapping core.dist_checkpointing.optimizer core.dist_checkpointing.serialization core.dist_checkpointing.state_dict_utils core.dist_checkpointing.strategies core.dist_checkpointing.strategies.async_utils core.dist_checkpointing.strategies.base core.dist_checkpointing.strategies.cached_metadata_filesystem_reader core.dist_checkpointing.strategies.checkpointable core.dist_checkpointing.strategies.common core.dist_checkpointing.strategies.filesystem_async core.dist_checkpointing.strategies.fully_parallel core.dist_checkpointing.strategies.state_dict_saver core.dist_checkpointing.strategies.tensorstore core.dist_checkpointing.strategies.torch core.dist_checkpointing.strategies.two_stage core.dist_checkpointing.strategies.zarr core.dist_checkpointing.tensor_aware_state_dict core.dist_checkpointing.utils core.dist_checkpointing.validation core.distributed core.distributed.data_parallel_base core.distributed.distributed_data_parallel core.distributed.distributed_data_parallel_config core.distributed.finalize_model_grads core.distributed.fsdp core.distributed.fsdp.mcore_fsdp_adapter core.distributed.fsdp.src core.distributed.fsdp.src.megatron_fsdp core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config core.distributed.fsdp.src.megatron_fsdp.fully_shard core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp core.distributed.fsdp.src.megatron_fsdp.package_info core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor core.distributed.fsdp.src.megatron_fsdp.utils core.distributed.param_and_grad_buffer core.distributed.reduce_scatter_with_fp32_accumulation core.distributed.torch_fully_sharded_data_parallel core.distributed.torch_fully_sharded_data_parallel_config core.energy_monitor core.enums core.export core.export.data_type core.export.export_config core.export.model_type core.export.trtllm core.export.trtllm.engine_builder core.export.trtllm.engine_builder.trtllm_engine_builder core.export.trtllm.model_to_trllm_mapping core.export.trtllm.model_to_trllm_mapping.default_conversion_dict core.export.trtllm.trt_model_config core.export.trtllm.trt_model_type core.export.trtllm.trtllm_helper core.export.trtllm.trtllm_layers core.export.trtllm.trtllm_weights_converter core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter core.export.trtllm.trtllm_weights_converter.utils core.extensions core.extensions.kitchen core.extensions.transformer_engine core.extensions.transformer_engine_spec_provider core.fp4_utils core.fp8_utils core.full_cuda_graph core.fusions core.fusions.fused_bias_dropout core.fusions.fused_bias_geglu core.fusions.fused_bias_gelu core.fusions.fused_bias_swiglu core.fusions.fused_cross_entropy core.fusions.fused_indices_converter core.fusions.fused_layer_norm core.fusions.fused_mla_yarn_rope_apply core.fusions.fused_pad_routing_map core.fusions.fused_softmax core.fusions.fused_weighted_squared_relu core.hyper_comm_grid core.inference core.inference.async_stream core.inference.batch_dimensions_utils core.inference.common_inference_params core.inference.communication_utils core.inference.contexts core.inference.contexts.base_context core.inference.contexts.dynamic_block_allocator core.inference.contexts.dynamic_context core.inference.contexts.fused_kv_append_kernel core.inference.contexts.static_context core.inference.data_parallel_inference_coordinator core.inference.engines core.inference.engines.abstract_engine core.inference.engines.async_zmq_communicator core.inference.engines.dynamic_engine core.inference.engines.mcore_engine core.inference.engines.static_engine core.inference.headers core.inference.inference_client core.inference.inference_request core.inference.model_inference_wrappers core.inference.model_inference_wrappers.abstract_model_inference_wrapper core.inference.model_inference_wrappers.gpt core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper core.inference.model_inference_wrappers.inference_wrapper_config core.inference.model_inference_wrappers.t5 core.inference.model_inference_wrappers.t5.t5_inference_wrapper core.inference.sampling_params core.inference.scheduler core.inference.text_generation_controllers core.inference.text_generation_controllers.encoder_decoder_text_generation_controller core.inference.text_generation_controllers.simple_text_generation_controller core.inference.text_generation_controllers.text_generation_controller core.inference.text_generation_controllers.vlm_text_generation_controller core.inference.text_generation_server core.inference.text_generation_server.run_mcore_engine core.inference.text_generation_server.text_generation_server core.inference.text_generation_server.tokenization core.inference.unified_memory core.inference.utils core.inference_params core.jit core.model_parallel_config core.models core.models.backends core.models.bert core.models.bert.bert_layer_specs core.models.bert.bert_lm_head core.models.bert.bert_model core.models.bert.pooler core.models.common core.models.common.embeddings core.models.common.embeddings.language_model_embedding core.models.common.embeddings.relative_pos_embedding core.models.common.embeddings.rope_utils core.models.common.embeddings.rotary_pos_embedding core.models.common.embeddings.yarn_rotary_pos_embedding core.models.common.language_module core.models.common.language_module.language_module core.models.common.model_chunk_schedule_plan core.models.common.vision_module core.models.common.vision_module.vision_module core.models.gpt core.models.gpt.fine_grained_callables core.models.gpt.gpt_layer_specs core.models.gpt.gpt_model core.models.gpt.moe_module_specs core.models.huggingface core.models.huggingface.clip_model core.models.huggingface.module core.models.huggingface.qwen_model core.models.mamba core.models.mamba.mamba_layer_specs core.models.mamba.mamba_model core.models.mimo core.models.mimo.config core.models.mimo.config.base_configs core.models.mimo.model core.models.mimo.model.base core.models.multimodal core.models.multimodal.context_parallel core.models.multimodal.llava_model core.models.multimodal.llava_spec core.models.retro core.models.retro.base_attention core.models.retro.config core.models.retro.decoder_attention core.models.retro.decoder_spec core.models.retro.encoder_attention core.models.retro.encoder_spec core.models.retro.model core.models.retro.utils core.models.T5 core.models.T5.t5_model core.models.T5.t5_spec core.models.vision core.models.vision.clip_vit_model core.models.vision.multimodal_projector core.models.vision.radio core.models.vision.vit_layer_specs core.msc_utils core.nccl_allocator core.num_microbatches_calculator core.optimizer core.optimizer.clip_grads core.optimizer.cpu_offloading core.optimizer.cpu_offloading.hybrid_optimizer core.optimizer.distrib_optimizer core.optimizer.grad_scaler core.optimizer.optimizer core.optimizer.optimizer_config core.optimizer.qk_clip core.optimizer_param_scheduler core.package_info core.packed_seq_params core.parallel_state core.pipeline_parallel core.pipeline_parallel.bridge_communicator core.pipeline_parallel.combined_1f1b core.pipeline_parallel.p2p_communication core.pipeline_parallel.schedules core.pipeline_parallel.utils core.post_training core.post_training.modelopt core.post_training.modelopt.gpt core.post_training.modelopt.gpt.model_specs core.post_training.modelopt.gpt.state_dict_hooks core.post_training.modelopt.layers core.post_training.modelopt.mamba core.post_training.modelopt.mamba.model_specs core.process_groups_config core.quantization core.quantization.quant_config core.quantization.utils core.rerun_state_machine core.safe_globals core.ssm core.ssm.mamba_block core.ssm.mamba_context_parallel core.ssm.mamba_hybrid_layer_allocation core.ssm.mamba_layer core.ssm.mamba_mixer core.ssm.mlp_layer core.ssm.triton_cache_manager core.tensor_parallel core.tensor_parallel.cross_entropy core.tensor_parallel.data core.tensor_parallel.inference_layers core.tensor_parallel.layers core.tensor_parallel.mappings core.tensor_parallel.random core.tensor_parallel.utils core.timers core.tokenizers core.tokenizers.base_tokenizer core.tokenizers.megatron_tokenizer core.tokenizers.text core.tokenizers.text.libraries core.tokenizers.text.libraries.abstract_tokenizer core.tokenizers.text.libraries.bytelevel_tokenizer core.tokenizers.text.libraries.chat_template core.tokenizers.text.libraries.huggingface_tokenizer core.tokenizers.text.libraries.megatron_hf_tokenizer core.tokenizers.text.libraries.null_tokenizer core.tokenizers.text.libraries.sentencepiece_tokenizer core.tokenizers.text.libraries.tiktoken_tokenizer core.tokenizers.text.models core.tokenizers.text.models.bert_tokenizer core.tokenizers.text.models.default_tokenizer core.tokenizers.text.models.gpt_tokenizer core.tokenizers.text.models.mamba_tokenizer core.tokenizers.text.models.retro_tokenizer core.tokenizers.text.models.t5_tokenizer core.tokenizers.text.text_tokenizer core.transformer core.transformer.attention core.transformer.cuda_graphs core.transformer.custom_layers core.transformer.custom_layers.batch_invariant_kernels core.transformer.custom_layers.transformer_engine core.transformer.dot_product_attention core.transformer.enums core.transformer.fsdp_dtensor_checkpoint core.transformer.identity_op core.transformer.mlp core.transformer.module core.transformer.moe core.transformer.moe.experts core.transformer.moe.fused_a2a core.transformer.moe.grouped_gemm_util core.transformer.moe.moe_layer core.transformer.moe.moe_utils core.transformer.moe.router core.transformer.moe.shared_experts core.transformer.moe.token_dispatcher core.transformer.moe.upcycling_utils core.transformer.multi_latent_attention core.transformer.multi_token_prediction core.transformer.pipeline_parallel_layer_layout core.transformer.spec_utils core.transformer.torch_layer_norm core.transformer.torch_norm core.transformer.transformer_block core.transformer.transformer_config core.transformer.transformer_layer core.transformer.utils core.utils module (core.transformer.spec_utils.ModuleSpec attribute) module_path (core.quantization.quant_config.MatchContext attribute) ModuleSpec (class in core.transformer.spec_utils) MOE (core.ssm.mamba_hybrid_layer_allocation.Symbols attribute) moe (in module core.models.mamba.mamba_layer_specs) moe_apply_probs_on_input (core.transformer.transformer_config.TransformerConfig attribute) moe_aux_loss_coeff (core.transformer.transformer_config.TransformerConfig attribute) moe_combine (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan attribute) moe_deepep_num_sms (core.transformer.transformer_config.TransformerConfig attribute) moe_dispatch (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan attribute) moe_enable_deepep (core.transformer.transformer_config.TransformerConfig attribute) moe_expert_capacity_factor (core.transformer.transformer_config.TransformerConfig attribute) moe_extended_tp (core.model_parallel_config.ModelParallelConfig attribute) moe_ffn_hidden_size (core.transformer.transformer_config.TransformerConfig attribute) moe_flex_dispatcher_backend (core.transformer.transformer_config.TransformerConfig attribute) moe_grouped_gemm (core.transformer.transformer_config.TransformerConfig attribute) moe_hybridep_num_sms (core.transformer.transformer_config.TransformerConfig attribute) moe_input_jitter_eps (core.transformer.transformer_config.TransformerConfig attribute) moe_latent_size (core.transformer.transformer_config.TransformerConfig attribute) moe_layer (core.ssm.mamba_block.MambaStackSubmodules attribute) moe_layer_cache (in module core.inference.utils) moe_layer_freq (core.transformer.transformer_config.TransformerConfig attribute) moe_layer_recompute (core.transformer.transformer_config.TransformerConfig attribute) moe_pad_expert_input_to_capacity (core.transformer.transformer_config.TransformerConfig attribute) moe_pad_experts_for_cuda_graph_inference (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) moe_per_layer_logging (core.transformer.transformer_config.TransformerConfig attribute) moe_permute_fusion (core.transformer.transformer_config.TransformerConfig attribute) moe_router_bias_update_rate (core.transformer.transformer_config.TransformerConfig attribute) moe_router_dtype (core.transformer.transformer_config.TransformerConfig attribute) moe_router_enable_expert_bias (core.transformer.transformer_config.TransformerConfig attribute) moe_router_force_load_balancing (core.transformer.transformer_config.TransformerConfig attribute) moe_router_fusion (core.transformer.transformer_config.TransformerConfig attribute) moe_router_group_topk (core.transformer.transformer_config.TransformerConfig attribute) moe_router_load_balancing_type (core.transformer.transformer_config.TransformerConfig attribute) moe_router_num_groups (core.transformer.transformer_config.TransformerConfig attribute) moe_router_padding_for_fp8 (core.transformer.transformer_config.TransformerConfig attribute) moe_router_padding_for_quantization (core.transformer.transformer_config.TransformerConfig attribute) moe_router_pre_softmax (core.transformer.transformer_config.TransformerConfig attribute) moe_router_score_function (core.transformer.transformer_config.TransformerConfig attribute) moe_router_topk (core.transformer.transformer_config.TransformerConfig attribute) moe_router_topk_limited_devices (core.transformer.transformer_config.TransformerConfig attribute) moe_router_topk_scaling_factor (core.transformer.transformer_config.TransformerConfig attribute) moe_shared_expert_intermediate_size (core.transformer.transformer_config.TransformerConfig attribute) moe_shared_expert_overlap (core.transformer.transformer_config.TransformerConfig attribute) moe_token_dispatcher_type (core.transformer.transformer_config.TransformerConfig attribute) moe_token_drop_policy (core.transformer.transformer_config.TransformerConfig attribute) moe_token_dropping (core.transformer.transformer_config.TransformerConfig attribute) moe_use_legacy_grouped_gemm (core.transformer.transformer_config.TransformerConfig attribute) moe_z_loss_coeff (core.transformer.transformer_config.TransformerConfig attribute) MoEAllGatherTokenDispatcher (class in core.transformer.moe.token_dispatcher) MoEAlltoAllTokenDispatcher (class in core.transformer.moe.token_dispatcher) MoEAuxLossAutoScaler (class in core.transformer.moe.moe_utils) MoEFlexTokenDispatcher (class in core.transformer.moe.token_dispatcher) MoELayer (class in core.transformer.moe.moe_layer) MoESubmodules (class in core.transformer.moe.moe_layer) MoETokenDispatcher (class in core.transformer.moe.token_dispatcher) mp (core.process_groups_config.ProcessGroupCollection attribute) mpu (in module core) mrope_section (core.transformer.transformer_config.TransformerConfig attribute) MSC_PREFIX (in module core.datasets.object_storage_utils) (in module core.dist_checkpointing.strategies.torch) mscale (core.transformer.transformer_config.MLATransformerConfig attribute) mscale_all_dim (core.transformer.transformer_config.MLATransformerConfig attribute) mtp (core.transformer.enums.LayerType attribute) mtp_loss_scaling_factor (core.transformer.transformer_config.TransformerConfig attribute) mtp_num_layers (core.transformer.transformer_config.TransformerConfig attribute) mtp_post_process (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan attribute) MTPLossAutoScaler (class in core.transformer.multi_token_prediction) MTPLossLoggingHelper (class in core.transformer.multi_token_prediction) multi_latent_attention (core.transformer.transformer_config.MLATransformerConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) MultiGroupMemPoolAllocator (class in core.nccl_allocator) MultiGroupUBRAllocator (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) MultiLatentAttention (class in core.transformer.multi_latent_attention) MultimodalDatasetConfig (class in core.datasets.multimodal_dataset) MultimodalProjector (class in core.models.vision.multimodal_projector) MultimodalRotaryEmbedding (class in core.models.common.embeddings.rotary_pos_embedding) multiple_validation_sets (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) MultiSplitGPTDataset (class in core.datasets.retro.query.multi_split_gpt_dataset) MultiSplitGPTDatasetConfig (class in core.datasets.retro.query.multi_split_gpt_dataset) MultiStorageClientFeature (in module core.msc_utils) MultiTokenPredictionBlock (class in core.transformer.multi_token_prediction) MultiTokenPredictionBlockSubmodules (class in core.transformer.multi_token_prediction) MultiTokenPredictionLayer (class in core.transformer.multi_token_prediction) MultiTokenPredictionLayerSubmodules (class in core.transformer.multi_token_prediction) mxfp8 (core.enums.Fp8Recipe attribute) my_rank (core.utils.StragglerDetector property) N name (core.optimizer.optimizer_config.ParamKey attribute) narrow() (core.dist_checkpointing.mapping.ShardedTensor method) nccl_all_reduce_for_prefill (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) NCCL_ALLOCATOR (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) nccl_mem (class in core.nccl_allocator) NCCL_MEMORY_POOL (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) nccl_ub (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) NEMOTRON_NAS_CONVERSION_DICT (in module core.export.trtllm.model_to_trllm_mapping.default_conversion_dict) nested_items_iter() (in module core.dist_checkpointing.dict_utils) nested_values() (in module core.dist_checkpointing.dict_utils) next_iter() (core.full_cuda_graph.FullCudaGraphWrapper method) no_mask (core.transformer.enums.AttnMaskType attribute) no_rope_freq (core.transformer.transformer_config.TransformerConfig attribute) NO_SHARD (core.distributed.fsdp.src.megatron_fsdp.fully_shard.ShardingStrategy attribute) no_sync() (core.distributed.data_parallel_base._BaseDataParallel method) (core.distributed.distributed_data_parallel.DistributedDataParallel method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) no_sync_func (core.model_parallel_config.ModelParallelConfig attribute) NON_TRANSFORMER_LAYERS_NAMES (in module core.export.trtllm.trtllm_layers) NoopScheduleNode (class in core.pipeline_parallel.utils) Norm (class in core.post_training.modelopt.layers) norm (core.ssm.mamba_layer.MambaLayerSubmodules attribute) normalization (core.transformer.transformer_config.MLATransformerConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) normalize() (core.optimizer.distrib_optimizer.Range method) (in module core.datasets.utils) NOT_RUNNING_YET (core.rerun_state_machine.RerunState attribute) null_decorator() (in module core.utils) null_method() (core.utils.StragglerDetector method) NullTokenizer (class in core.tokenizers.text.libraries.null_tokenizer) num_attention_heads (core.transformer.transformer_config.TransformerConfig attribute) num_buckets (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline property) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.GradReducePipeline property) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer property) num_dataset_builder_threads (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) num_decode_requests (core.inference.contexts.dynamic_context.DynamicInferenceContext property) num_distributed_optimizer_instances (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) num_img_embeddings_per_tile (core.inference.inference_request.VLMInferenceRequest attribute) num_layers (core.transformer.transformer_config.TransformerConfig attribute) num_layers() (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan method) num_layers_at_end_in_bf16 (core.transformer.transformer_config.TransformerConfig attribute) num_layers_at_start_in_bf16 (core.transformer.transformer_config.TransformerConfig attribute) num_layers_in_first_pipeline_stage (core.transformer.transformer_config.TransformerConfig attribute) num_layers_in_last_pipeline_stage (core.transformer.transformer_config.TransformerConfig attribute) num_microbatches_with_partial_activation_checkpoints (core.model_parallel_config.ModelParallelConfig attribute) num_moe_experts (core.transformer.transformer_config.TransformerConfig attribute) num_query_groups (core.transformer.transformer_config.TransformerConfig attribute) num_requests_pending() (core.inference.scheduler.Scheduler method) num_samples_to_block_ranges() (in module core.datasets.retro.index.utils) num_tiles (core.inference.inference_request.VLMInferenceRequest attribute) num_tokens_to_generate (core.inference.sampling_params.SamplingParams attribute) num_tokens_total (core.inference.sampling_params.SamplingParams attribute) numel_low_level_dataset() (core.datasets.gpt_dataset.GPTDataset static method) (core.datasets.gpt_dataset.MockGPTDataset static method) (core.datasets.masked_dataset.MaskedWordPieceDataset static method) (core.datasets.megatron_dataset.MegatronDataset static method) NumMicroBatchesCalculator (class in core.num_microbatches_calculator) numpy_to_torch_dtype_dict (in module core.dist_checkpointing.strategies.zarr) nvfp4 (core.enums.Fp4Recipe attribute) nvtx_decorator() (in module core.utils) nvtx_range_pop() (in module core.utils) nvtx_range_push() (in module core.utils) O OBJECT_STORAGE_BIN_READERS (in module core.datasets.indexed_dataset) object_storage_cache_path (core.datasets.gpt_dataset.GPTDatasetConfig attribute) ObjectStorageConfig (class in core.datasets.object_storage_utils) offload_to_cpu() (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.MegatronOptimizer method) offsets() (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer method) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer method) open_file() (in module core.msc_utils) open_ts_array() (in module core.dist_checkpointing.strategies.tensorstore) openai_gelu() (in module core.transformer.utils) ops (in module core.transformer.moe.grouped_gemm_util) OPTIM (core.distributed.fsdp.src.megatron_fsdp.fully_shard.ShardingStrategy attribute) OPTIM_GRADS (core.distributed.fsdp.src.megatron_fsdp.fully_shard.ShardingStrategy attribute) OPTIM_GRADS_PARAMS (core.distributed.fsdp.src.megatron_fsdp.fully_shard.ShardingStrategy attribute) optim_state_to_sharding_state() (in module core.dist_checkpointing.optimizer) optimal_dtype() (core.datasets.indexed_dataset.DType static method) optimizer (core.optimizer.optimizer.ChainedOptimizer property) (core.optimizer.optimizer_config.AdamOptimizerConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) (core.optimizer.optimizer_config.SGDOptimizerConfig attribute) optimizer_cpu_offload (core.optimizer.optimizer_config.OptimizerConfig attribute) optimizer_offload_fraction (core.optimizer.optimizer_config.OptimizerConfig attribute) OptimizerConfig (class in core.optimizer.optimizer_config) OptimizerParamScheduler (class in core.optimizer_param_scheduler) original_max_position_embeddings (core.transformer.transformer_config.MLATransformerConfig attribute) out_proj (core.ssm.mamba_mixer.MambaMixerSubmodules attribute) outer_dp_sharding_strategy (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) output_layer_init_method (core.transformer.transformer_config.TransformerConfig attribute) overlap_cpu_optimizer_d2h_h2d (core.optimizer.optimizer_config.OptimizerConfig attribute) overlap_grad_reduce (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) overlap_moe_expert_parallel_comm (core.model_parallel_config.ModelParallelConfig attribute) overlap_p2p_comm (core.model_parallel_config.ModelParallelConfig attribute) overlap_p2p_comm_warmup_flush (core.model_parallel_config.ModelParallelConfig attribute) overlap_param_gather (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) overlap_param_gather_with_optimizer_step (core.optimizer.optimizer_config.OptimizerConfig attribute) override_nonquantized_autocast (core.extensions.transformer_engine.TEQuantizationRecipe attribute) override_quantized_autocast (core.extensions.transformer_engine.TEQuantizationRecipe attribute) override_sharded_param_methods_with_safety_checks() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) overwrite_nccl_comm_cfgs() (in module core.parallel_state) P P2PCommunicator (class in core.pipeline_parallel.p2p_communication) PackedSeqParams (class in core.packed_seq_params) pad (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) pad_buckets_for_high_nccl_busbw (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) pad_encoder_prompts_tokens() (core.inference.model_inference_wrappers.t5.t5_inference_wrapper.T5InferenceWrapper method) pad_id (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract property) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) pad_input_prompt_tokens() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) pad_routing_map() (in module core.transformer.moe.moe_utils) pad_to_expected_shape() (in module core.dist_checkpointing.strategies.zarr) pad_vocab_size() (in module core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter) padded_vocab_size (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) padding (core.transformer.enums.AttnMaskType attribute) padding_causal (core.transformer.enums.AttnMaskType attribute) ParallelFileCacheManager (class in core.ssm.triton_cache_manager) PARAM (core.distributed.param_and_grad_buffer.BufferType attribute) param_group_identifier_keys (in module core.optimizer.optimizer) param_groups (core.optimizer.optimizer.ChainedOptimizer property) (core.optimizer.optimizer.MegatronOptimizer attribute) param_is_not_shared() (in module core.transformer.module) param_is_not_tensor_parallel_duplicate() (in module core.tensor_parallel.layers) param_sync_func (core.model_parallel_config.ModelParallelConfig attribute) ParamAndGradBuffer (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) ParameterGroup (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) ParamKey (class in core.optimizer.optimizer_config) params (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) (core.transformer.spec_utils.ModuleSpec attribute) params_config_key (core.extensions.kitchen.KitchenQuantizationParams attribute) params_dtype (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) (core.model_parallel_config.ModelParallelConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) parse_and_normalize_split() (in module core.datasets.blended_megatron_dataset_config) parse_config_dict() (core.extensions.kitchen.CompoundParamsConfigSchema class method) (core.extensions.kitchen.QAttentionParamsConfigSchema class method) (core.extensions.kitchen.QFlashAttentionParamsConfigSchema class method) (core.extensions.kitchen.QLinearParamsConfigSchema class method) parse_from_config() (core.extensions.kitchen.KitchenQuantizationParams static method) (core.extensions.transformer_engine.TEQuantizationParams static method) (core.extensions.transformer_engine.TEQuantizationRecipe class method) parse_s3_path() (in module core.datasets.object_storage_utils) parse_str_to_list() (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout static method) parse_strict_flag() (in module core.dist_checkpointing.validation) partition_buckets() (in module core.distributed.param_and_grad_buffer) PATCH (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) path (core.datasets.retro.utils.Block attribute) path_to_cache (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) path_to_idx_cache (core.datasets.object_storage_utils.ObjectStorageConfig attribute) PATTERN_TIKTOKEN_V1 (in module core.tokenizers.text.libraries.tiktoken_tokenizer) PATTERN_TIKTOKEN_V2 (in module core.tokenizers.text.libraries.tiktoken_tokenizer) PAUSE (core.inference.headers.Headers attribute) (core.inference.inference_request.DynamicInferenceEventType attribute) pause() (core.energy_monitor.EnergyMonitor method) PAUSE_ACK (core.inference.headers.Headers attribute) pause_engines() (core.inference.inference_client.InferenceClient method) payload (core.inference.inference_request.DynamicInferenceEvent attribute) perform_initialization (core.model_parallel_config.ModelParallelConfig attribute) permute() (in module core.transformer.moe.moe_utils) persist_layer_norm (core.transformer.transformer_config.TransformerConfig attribute) PERSISTENT_ERROR (core.rerun_state_machine.RerunDiagnostic attribute) PersistentAsyncCaller (class in core.dist_checkpointing.strategies.async_utils) pin_cpu_grads (core.optimizer.optimizer_config.OptimizerConfig attribute) pin_cpu_params (core.optimizer.optimizer_config.OptimizerConfig attribute) pipeline_dtype (core.model_parallel_config.ModelParallelConfig attribute) pipeline_model_parallel_comm_backend (core.model_parallel_config.ModelParallelConfig attribute) pipeline_model_parallel_layout (core.transformer.transformer_config.TransformerConfig attribute) pipeline_model_parallel_size (core.model_parallel_config.ModelParallelConfig attribute) PipelineParallelLayerLayout (class in core.transformer.pipeline_parallel_layer_layout) pixel_shuffle() (in module core.models.multimodal.llava_model) Pooler (class in core.models.bert.pooler) pop_tensors() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict method) pos_embd (core.process_groups_config.ProcessGroupCollection attribute) position_embedding (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) post_all_gather_processing() (in module core.fp8_utils) post_attn (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan attribute) POST_BACKWARD (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.TrainingState attribute) post_conv_ssm() (core.ssm.mamba_context_parallel.MambaContextParallel method) post_forward_comm() (core.transformer.moe.shared_experts.SharedExpertMLP method) post_layernorm_bias (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) post_layernorm_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) post_process_requests() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) postprocess_numpy_array() (in module core.dist_checkpointing.strategies.zarr) PostProcessNode (class in core.models.gpt.fine_grained_callables) pp (core.process_groups_config.ProcessGroupCollection attribute) PRE_BACKWARD (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.TrainingState attribute) pre_conv_ssm() (core.ssm.mamba_context_parallel.MambaContextParallel method) pre_cross_attn_layernorm (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) pre_forward_comm() (core.transformer.moe.shared_experts.SharedExpertMLP method) pre_mlp_layernorm (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) PRE_RELEASE (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) PrefetchOrder (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) prefill_req_count (core.inference.batch_dimensions_utils.InferenceBatchDimensions attribute) preload_fn (core.dist_checkpointing.strategies.async_utils.AsyncRequest attribute) preload_tensors() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync static method) prep_inference_input() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) (core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper.GPTInferenceWrapper method) (core.inference.model_inference_wrappers.t5.t5_inference_wrapper.T5InferenceWrapper method) (core.inference.text_generation_controllers.encoder_decoder_text_generation_controller.EncoderDecoderTextGenerationController method) (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) (core.inference.text_generation_controllers.vlm_text_generation_controller.VLMTextGenerationController method) prep_model_for_inference() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) prepare_decentralized_global_plan() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync method) prepare_for_absorption() (core.transformer.multi_latent_attention.MLASelfAttention method) prepare_gradient_calculation_operands() (core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy static method) prepare_grads() (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) (core.optimizer.optimizer.MixedPrecisionOptimizer method) prepare_input_tensors_for_wgrad_compute() (in module core.utils) prepare_local_plan() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync method) prepare_write_data() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync method) prepend_axis_num (core.dist_checkpointing.mapping.ShardedTensor attribute) preprocess() (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) preprocess_func (core.datasets.multimodal_dataset.MultimodalDatasetConfig attribute) preprocess_state_dict_for_uneven_dtensor() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) PreProcessNode (class in core.models.gpt.fine_grained_callables) preserve_fp32_weights (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) pretty_repr() (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout method) print_diff_in_state_dicts() (in module core.transformer.fsdp_dtensor_checkpoint) print_stats() (core.rerun_state_machine.QuickStats method) ProcessGroupCollection (class in core.process_groups_config) ProcessGroupHelperMeta (class in core.process_groups_config) prompt (core.inference.inference_request.DynamicInferenceRequest attribute) (core.inference.inference_request.InferenceRequest attribute) prompt_log_probs (core.inference.inference_request.InferenceRequest attribute) prompt_tokens (core.inference.inference_request.DynamicInferenceRequest attribute) (core.inference.inference_request.InferenceRequest attribute) prompt_top_n_logprobs (core.inference.inference_request.InferenceRequest attribute) ProxyDict (class in core.optimizer.optimizer) put() (core.inference.async_stream.AsyncStream method) (core.inference.text_generation_server.text_generation_server.MegatronGenerate method) (core.ssm.triton_cache_manager.ParallelFileCacheManager method) Q q_attention_params (core.extensions.kitchen.CompoundParamsConfigSchema attribute) q_fa_params (core.extensions.kitchen.CompoundParamsConfigSchema attribute) q_layernorm (core.transformer.attention.SelfAttentionSubmodules attribute) (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) q_linear_params (core.extensions.kitchen.CompoundParamsConfigSchema attribute) q_lora_rank (core.transformer.transformer_config.MLATransformerConfig attribute) QATTENTION_PARAMS (core.extensions.kitchen.KitchenConfigType attribute) qattention_params (core.extensions.kitchen.KitchenQuantizationParams attribute) QAttentionParamsConfigSchema (class in core.extensions.kitchen) qfa_params (core.extensions.kitchen.KitchenQuantizationParams attribute) QFLASHATTENTION_PARAMS (core.extensions.kitchen.KitchenConfigType attribute) QFlashAttentionParamsConfigSchema (class in core.extensions.kitchen) qk_clip (core.transformer.transformer_config.TransformerConfig attribute) qk_clip_alpha (core.transformer.transformer_config.TransformerConfig attribute) qk_clip_threshold (core.transformer.transformer_config.TransformerConfig attribute) qk_head_dim (core.transformer.transformer_config.MLATransformerConfig attribute) qk_layernorm (core.transformer.transformer_config.TransformerConfig attribute) qk_pos_emb_head_dim (core.transformer.transformer_config.MLATransformerConfig attribute) qkv_format (core.packed_seq_params.PackedSeqParams attribute) QLINEAR_PARAMS (core.extensions.kitchen.KitchenConfigType attribute) qlinear_params (core.extensions.kitchen.KitchenQuantizationParams attribute) QLinearParamsConfigSchema (class in core.extensions.kitchen) quant_recipe (core.transformer.transformer_config.TransformerConfig attribute) QuantizationConfig (class in core.quantization.quant_config) quantize_param_shard() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.fp8_utils) query_block_neighbors() (in module core.datasets.retro.query.query) query_dataset_neighbors() (in module core.datasets.retro.query.query) query_embedding_block() (in module core.datasets.retro.query.query) query_embeddings() (in module core.datasets.retro.query.query) query_neighbors() (in module core.datasets.retro.query.query) quick_geglu() (in module core.fusions.fused_bias_geglu) quick_geglu_back() (in module core.fusions.fused_bias_geglu) quick_gelu() (in module core.activations) (in module core.fusions.fused_bias_geglu) QuickStats (class in core.rerun_state_machine) QwenHuggingFaceModel (class in core.models.huggingface.qwen_model) R RADIOViTModel (class in core.models.vision.radio) RAISE_ALL (core.dist_checkpointing.validation.StrictHandling attribute) RAISE_UNEXPECTED (core.dist_checkpointing.validation.StrictHandling attribute) RampupBatchsizeNumMicroBatchesCalculator (class in core.num_microbatches_calculator) random_logits (core.transformer.moe.moe_utils.RandomSTE attribute) random_seed (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) RandomSTE (class in core.transformer.moe.moe_utils) Range (class in core.optimizer.distrib_optimizer) range (core.datasets.retro.utils.Block attribute) rank (core.rerun_state_machine.Caller attribute) RankCommInfo (class in core.pipeline_parallel.bridge_communicator) RankGenerator (class in core.parallel_state) read() (core.datasets.indexed_dataset._BinReader method) (core.datasets.indexed_dataset._FileBinReader method) (core.datasets.indexed_dataset._MMapBinReader method) (core.datasets.indexed_dataset._MultiStorageClientBinReader method) (core.datasets.indexed_dataset._S3BinReader method) read_metadata() (core.dist_checkpointing.strategies.cached_metadata_filesystem_reader.CachedMetadataFileSystemReader method) READY_TO_USE (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.BucketStatus attribute) real_quant_cfg (core.post_training.modelopt.layers.BlockwiseFP8WeightTransformerLayer attribute) (core.post_training.modelopt.layers.FP8WeightTransformerLayer attribute) (core.post_training.modelopt.layers.RealQuantTransformerLayer attribute) RealQuantTransformerLayer (class in core.post_training.modelopt.layers) RECEIVER (core.pipeline_parallel.bridge_communicator.CommRole attribute) recipe_idx (core.extensions.kitchen.QAttentionParamsConfigSchema attribute) (core.extensions.kitchen.QLinearParamsConfigSchema attribute) recipe_name (core.extensions.kitchen.QFlashAttentionParamsConfigSchema attribute) RecipeConfig (class in core.quantization.quant_config) recompute_granularity (core.transformer.transformer_config.TransformerConfig attribute) recompute_method (core.transformer.transformer_config.TransformerConfig attribute) recompute_modules (core.transformer.transformer_config.TransformerConfig attribute) recompute_num_layers (core.transformer.transformer_config.TransformerConfig attribute) reconfigure_num_microbatches_calculator() (in module core.num_microbatches_calculator) record (core.inference.engines.dynamic_engine.RequestEntry attribute) record() (core.rerun_state_machine.QuickStats method) record_bwd_graph() (core.transformer.cuda_graphs._CudagraphGlobalRecord class method) record_current_stream() (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan method) record_fwd_graph() (core.transformer.cuda_graphs._CudagraphGlobalRecord class method) record_graph_capture() (core.transformer.cuda_graphs._CudaGraphRunner method) recv_backward() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) (core.pipeline_parallel.p2p_communication.P2PCommunicator method) recv_forward() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) (core.pipeline_parallel.p2p_communication.P2PCommunicator method) recv_from_prev_pipeline_rank_() (in module core.inference.communication_utils) recv_from_ranks (core.pipeline_parallel.bridge_communicator.RankCommInfo attribute) recycle_unused_buckets() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline method) reduce_aux_losses_tracker_across_ranks() (in module core.transformer.moe.moe_utils) reduce_from_tensor_model_parallel_region() (in module core.tensor_parallel.mappings) reduce_gradients() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.GradReducePipeline method) reduce_loss_in_tracker() (core.transformer.multi_token_prediction.MTPLossLoggingHelper method) reduce_scatter_gradients() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) reduce_scatter_last_dim_to_tensor_parallel_region() (in module core.tensor_parallel.mappings) reduce_scatter_to_sequence_parallel_region() (in module core.tensor_parallel.mappings) reduce_scatter_with_fp32_accumulation (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) reduce_scatter_with_fp32_accumulation() (in module core.distributed.reduce_scatter_with_fp32_accumulation) register_default_common_strategies() (in module core.dist_checkpointing.strategies.common) register_default_strategy() (in module core.dist_checkpointing.strategies.base) register_default_tensorstore_strategies() (in module core.dist_checkpointing.strategies.tensorstore) register_default_torch_strategies() (in module core.dist_checkpointing.strategies.torch) register_default_zarr_strategies() (in module core.dist_checkpointing.strategies.zarr) register_grad_ready() (core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup method) register_safe_globals() (in module core.safe_globals) RegisterFSDPBackwardFunction (class in core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp) RelativePositionEmbedding (class in core.models.common.embeddings.relative_pos_embedding) release_bucket() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline method) release_memory_blocks() (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) release_state() (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan method) reload_mergeable_ranks() (in module core.tokenizers.text.libraries.tiktoken_tokenizer) reload_model_params() (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) (core.optimizer.optimizer.MixedPrecisionOptimizer method) remaining_prompt_length (core.inference.inference_request.DynamicInferenceRequest property) remaining_prompt_tokens (core.inference.inference_request.DynamicInferenceRequest attribute) remove_codes() (core.datasets.retro.index.indexes.faiss_par_add.FaissParallelAddIndex method) remove_embeddings() (in module core.datasets.retro.index.build) remove_sharded_tensors() (core.dist_checkpointing.strategies.base.LoadShardedStrategy method) (core.dist_checkpointing.strategies.torch.TorchDistLoadShardedStrategy method) (in module core.dist_checkpointing.serialization) rename_input_layer_names_to_trtllm_layer_names() (core.export.trtllm.trtllm_layers.TRTLLMLayers static method) replace_prefix_for_sharding() (in module core.dist_checkpointing.utils) replay_graph_capture() (core.transformer.cuda_graphs._CudaGraphRunner method) replica_id (core.dist_checkpointing.mapping.ShardedBase attribute) (core.dist_checkpointing.mapping.ShardedObject attribute) (core.dist_checkpointing.mapping.ShardedTensor attribute) (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) ReplicaId (in module core.dist_checkpointing.mapping) report() (core.utils.StragglerDetector method) REPORT_DETERMINISM_STATS (core.rerun_state_machine.RerunMode attribute) REPORTING_INTERVAL_ITERATIONS (core.rerun_state_machine.RerunStateMachine attribute) req_count (core.inference.batch_dimensions_utils.InferenceBatchDimensions property) request_id (core.inference.inference_request.DynamicInferenceRequest attribute) (core.inference.inference_request.DynamicInferenceRequestRecord property) (core.inference.inference_request.InferenceRequest attribute) REQUEST_ROUNDER (core.inference.contexts.dynamic_context.DynamicInferenceContext attribute) RequestEntry (class in core.inference.engines.dynamic_engine) RequestOverflowError requests (core.inference.inference_request.DynamicInferenceRequestRecord attribute) required_libs (in module core.datasets.retro.external_libs) requires_explicit_ckpt_mismatch_check() (core.dist_checkpointing.validation.StrictHandling static method) requires_global_app_metadata() (core.dist_checkpointing.validation.StrictHandling static method) requires_grad (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) requires_returning_mismatch_keys() (core.dist_checkpointing.validation.StrictHandling static method) RERUN_DISABLED (core.rerun_state_machine.RerunValidationStatus attribute) RerunDataIterator (class in core.rerun_state_machine) RerunDiagnostic (class in core.rerun_state_machine) RerunErrorInjector (class in core.rerun_state_machine) RerunMode (class in core.rerun_state_machine) RERUNNING_AGAIN_FROM_CHECKPOINT (core.rerun_state_machine.RerunState attribute) RERUNNING_FROM_CHECKPOINT (core.rerun_state_machine.RerunState attribute) RERUNNING_IN_PLACE (core.rerun_state_machine.RerunState attribute) RerunState (class in core.rerun_state_machine) RerunStateMachine (class in core.rerun_state_machine) RerunValidationStatus (class in core.rerun_state_machine) reset() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.GradReducePipeline method) (core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup method) (core.distributed.param_and_grad_buffer._ParamAndGradBuffer method) (core.extensions.transformer_engine.TECudaRNGStatesTracker method) (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) (core.inference.contexts.dynamic_context.DynamicInferenceContext method) (core.inference.contexts.static_context.StaticInferenceContext method) (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) (core.inference.utils.Counter method) (core.rerun_state_machine.QuickStats method) (core.tensor_parallel.random.CudaRNGStatesTracker method) (core.timers.DummyTimer method) (core.timers.Timer method) (core.timers.TimerBase method) (core.utils.StragglerDetector method) reset_attention_mask (core.datasets.gpt_dataset.GPTDatasetConfig attribute) reset_attention_state() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) reset_batch_size_offset() (core.inference.contexts.base_context.BaseInferenceContext method) reset_global_aux_loss_tracker() (core.transformer.moe.router.TopKRouter method) reset_mamba_state() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) reset_model_temporary_tensors() (in module core.distributed.finalize_model_grads) reset_parameters() (core.fusions.fused_layer_norm.FusedLayerNorm method) (core.transformer.moe.router.Router method) reset_position_ids (core.datasets.gpt_dataset.GPTDatasetConfig attribute) ResetParametersContext (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) reshard_after_forward (core.distributed.torch_fully_sharded_data_parallel_config.TorchFullyShardedDataParallelConfig attribute) resolve_tensor() (core.dist_checkpointing.strategies.torch.MCoreLoadPlanner method) restore_from_cpu() (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.MegatronOptimizer method) restore_tensor_device() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict method) result (core.full_cuda_graph.FullCudaGraphWrapper attribute) RESUME (core.inference.headers.Headers attribute) resume() (core.energy_monitor.EnergyMonitor method) (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) resume_engines() (core.inference.inference_client.InferenceClient method) retrieve_write_results() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync method) retro_bert_batch_size (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_bert_embedders (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_bert_max_chunk_length (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_bert_tokenizer_type (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_bert_vocab_file (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_block_size (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) (core.models.retro.config.RetroConfig attribute) retro_chunk_length (core.models.retro.config.RetroConfig attribute) retro_decoder (core.enums.ModelType attribute) retro_doc_block_size (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_encoder (core.enums.ModelType attribute) retro_encoder_attention_dropout (core.models.retro.config.RetroConfig attribute) retro_encoder_hidden_dropout (core.models.retro.config.RetroConfig attribute) retro_encoder_num_layers (core.models.retro.config.RetroConfig attribute) retro_gpt_chunk_datasets (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_gpt_chunk_length (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_gpt_data_cache_path (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_gpt_data_path (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_gpt_eval_interval (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_gpt_eval_iters (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_gpt_global_batch_size (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_gpt_merge_file (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_gpt_seed (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_gpt_seq_length (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_gpt_split (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_gpt_tokenizer_model (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_gpt_tokenizer_type (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_gpt_train_samples (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_gpt_vocab_file (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_index_add_load_fraction (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_index_delete_added_codes (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_index_delete_training_embeddings (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_index_ntrain (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_index_str (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_index_train_load_fraction (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_index_type (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_makedir() (in module core.datasets.retro.utils) retro_neighbor_dirs (core.models.retro.config.RetroConfig attribute) retro_num_neighbors (core.models.retro.config.RetroConfig attribute) retro_num_retrieved_chunks (core.models.retro.config.RetroConfig attribute) retro_project_dir (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) (core.models.retro.config.RetroConfig attribute) retro_query_ef_search (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_query_nprobe (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_query_num_neighbors_query (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_query_num_neighbors_save (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_retrieved_length (core.models.retro.config.RetroConfig attribute) retro_split_preprocessing (core.models.retro.config.RetroConfig attribute) retro_task_validate (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_tasks (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_tokenizers (core.datasets.retro.config.config.RetroPreprocessingConfig attribute) retro_verify_neighbor_count (core.models.retro.config.RetroConfig attribute) RetroBertEmbedders (class in core.datasets.retro.config.bert_embedders) RetroConfig (class in core.models.retro.config) RetroDataset (class in core.datasets.retro.query.retro_dataset) RetroDecoderBiasDropoutAdd (class in core.models.retro.decoder_attention) RetroDecoderCrossAttention (class in core.models.retro.decoder_attention) RetroEncoderBiasDropoutAdd (class in core.models.retro.encoder_attention) RetroEncoderCrossAttention (class in core.models.retro.encoder_attention) RetroEncoderLayerNorm (class in core.models.retro.encoder_attention) RetroGPTChunkDatasets (class in core.datasets.retro.config.gpt_chunk_datasets) RetroModel (class in core.models.retro.model) RetroPreprocessingConfig (class in core.datasets.retro.config.config) RetroTokenizer (class in core.tokenizers.text.models.retro_tokenizer) RetroTokenizers (class in core.datasets.retro.config.tokenizers) RETURN_ALL (core.dist_checkpointing.validation.StrictHandling attribute) return_document_ids (core.datasets.retro.query.multi_split_gpt_dataset.MultiSplitGPTDatasetConfig attribute) return_layer_name_and_number() (core.export.trtllm.trtllm_layers.TRTLLMLayers static method) return_log_probs (core.inference.sampling_params.SamplingParams attribute) return_prompt_top_n_logprobs (core.inference.sampling_params.SamplingParams attribute) return_segments (core.inference.sampling_params.SamplingParams attribute) RETURN_UNEXPECTED (core.dist_checkpointing.validation.StrictHandling attribute) reuse_grad_buf_for_mxfp8_param_ag (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) rewind() (core.rerun_state_machine.RerunDataIterator method) rmsnorm_batch_invariant() (in module core.transformer.custom_layers.batch_invariant_kernels) role (core.pipeline_parallel.bridge_communicator.RankCommInfo attribute) roll_tensor() (in module core.transformer.multi_token_prediction) rope_type (core.transformer.transformer_config.MLATransformerConfig attribute) rotary_base (core.transformer.transformer_config.MLATransformerConfig attribute) rotary_bwd_kv_kernel() (in module core.fusions.fused_mla_yarn_rope_apply) rotary_bwd_q_kernel() (in module core.fusions.fused_mla_yarn_rope_apply) rotary_fwd_kv_kernel() (in module core.fusions.fused_mla_yarn_rope_apply) rotary_fwd_q_kernel() (in module core.fusions.fused_mla_yarn_rope_apply) rotary_interleaved (core.transformer.transformer_config.TransformerConfig attribute) rotary_percent (core.transformer.transformer_config.MLATransformerConfig attribute) rotary_scaling_factor (core.transformer.transformer_config.MLATransformerConfig attribute) RotaryBucketAllocator (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) RotaryEmbedding (class in core.models.common.embeddings.rotary_pos_embedding) round_up() (core.inference.contexts.dynamic_context.DynamicInferenceContext class method) round_up_requests() (core.inference.contexts.dynamic_context.DynamicInferenceContext class method) round_up_tokens() (core.inference.contexts.dynamic_context.DynamicInferenceContext class method) routed_experts_compute() (core.transformer.moe.moe_layer.MoELayer method) Router (class in core.transformer.moe.router) router_and_preprocess() (core.transformer.moe.moe_layer.MoELayer method) router_gating_linear() (in module core.transformer.moe.moe_utils) RouterGatingLinearFunction (class in core.transformer.moe.moe_utils) routing() (core.transformer.moe.router.Router method) (core.transformer.moe.router.TopKRouter method) row_parallel_linear() (core.extensions.kitchen.KitchenSpecProvider method) (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.BackendSpecProvider method) (core.models.backends.InferenceSpecProvider method) (core.models.backends.LocalSpecProvider method) RowParallelLinear (class in core.tensor_parallel.layers) run() (core.inference.text_generation_server.text_generation_server.MegatronServer method) (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan static method) (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan static method) (core.pipeline_parallel.utils.AbstractSchedulePlan static method) run_engine() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) (core.inference.engines.static_engine.StaticInferenceEngine method) run_engine_async() (core.inference.engines.static_engine.StaticInferenceEngine method) run_engine_with_coordinator() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) run_mcore_engine() (in module core.inference.text_generation_server.run_mcore_engine) run_one_forward_step() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) run_realtime_tests() (core.transformer.attention.SelfAttention method) S S3_PREFIX (in module core.datasets.object_storage_utils) S3Client (class in core.datasets.object_storage_utils) S3Config (in module core.datasets.object_storage_utils) SAFE_GLOBALS (in module core.safe_globals) safely_set_viewless_tensor_data() (in module core.utils) sample_from_logits() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) sampling_params (core.inference.inference_request.InferenceRequest attribute) SamplingParams (class in core.inference.sampling_params) save() (core.dist_checkpointing.strategies.base.AsyncSaveShardedStrategy method) (core.dist_checkpointing.strategies.base.SaveShardedStrategy method) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelSaveStrategyWrapper method) (core.dist_checkpointing.strategies.zarr.ZarrSaveShardedStrategy method) (in module core.dist_checkpointing.serialization) save_block() (core.datasets.retro.index.indexes.faiss_par_add.FaissParallelAddIndex method) save_block_db() (in module core.datasets.retro.db.build) SAVE_COMMON (core.dist_checkpointing.strategies.base.StrategyAction attribute) save_common() (core.dist_checkpointing.strategies.base.SaveCommonStrategy method) (core.dist_checkpointing.strategies.common.TorchCommonSaveStrategy method) save_config() (in module core.dist_checkpointing.core) save_indexed_dataset_infos() (in module core.datasets.retro.db.utils) save_loss_to_tracker() (core.transformer.multi_token_prediction.MTPLossLoggingHelper static method) save_parameter_state() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) save_preprocess() (in module core.dist_checkpointing.state_dict_utils) save_pretrained() (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText method) SAVE_SHARDED (core.dist_checkpointing.strategies.base.StrategyAction attribute) save_sharded_objects() (core.dist_checkpointing.strategies.base.SaveCommonStrategy method) (core.dist_checkpointing.strategies.common.TorchCommonSaveStrategy method) save_state_dict_async_finalize() (in module core.dist_checkpointing.strategies.state_dict_saver) save_state_dict_async_plan() (in module core.dist_checkpointing.strategies.state_dict_saver) save_to_aux_losses_tracker() (in module core.transformer.moe.moe_utils) save_vocabulary() (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) SaveCommonStrategy (class in core.dist_checkpointing.strategies.base) SaveShardedStrategy (class in core.dist_checkpointing.strategies.base) SaveStrategyBase (class in core.dist_checkpointing.strategies.base) scale (core.optimizer.grad_scaler.MegatronGradScaler property) scale_gradients() (core.distributed.data_parallel_base._BaseDataParallel method) (core.distributed.distributed_data_parallel.DistributedDataParallel method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) (core.distributed.param_and_grad_buffer._ParamAndGradBuffer method) scale_loss() (core.optimizer.optimizer.MegatronOptimizer method) scaled_init_method_normal() (in module core.utils) ScaledMaskedSoftmax (class in core.fusions.fused_softmax) ScaledSoftmax (class in core.fusions.fused_softmax) ScaledUpperTriangMaskedSoftmax (class in core.fusions.fused_softmax) scatter_to_sequence_parallel_region() (in module core.tensor_parallel.mappings) scatter_to_tensor_model_parallel_region() (in module core.tensor_parallel.mappings) schedule_async_call() (core.dist_checkpointing.strategies.async_utils.AsyncCaller method) (core.dist_checkpointing.strategies.async_utils.PersistentAsyncCaller method) (core.dist_checkpointing.strategies.async_utils.TemporalAsyncCaller method) schedule_async_request() (core.dist_checkpointing.strategies.async_utils.AsyncCallsQueue method) schedule_chunked_prefill() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) schedule_non_chunked_prefill() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) schedule_requests() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) schedule_waiting_requests() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) ScheduleNode (class in core.pipeline_parallel.utils) Scheduler (class in core.inference.scheduler) SECOND_RERUN_NOT_REPRODUCIBLE (core.rerun_state_machine.RerunValidationStatus attribute) SECOND_RERUN_REPRODUCIBLE (core.rerun_state_machine.RerunValidationStatus attribute) seed (core.datasets.gpt_dataset.MockGPTLowLevelDataset attribute) segments (core.inference.inference_request.InferenceRequest attribute) self_attention (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) self_attn (core.transformer.enums.AttnType attribute) self_attn_bda (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) SelfAttention (class in core.transformer.attention) SelfAttentionSubmodules (class in core.transformer.attention) send_backward() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_backward_recv_backward() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_backward_recv_forward() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_forward() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_forward_backward_recv_forward_backward() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_forward_recv_backward() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_forward_recv_forward() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_to_next_pipeline_rank() (in module core.inference.communication_utils) send_to_ranks (core.pipeline_parallel.bridge_communicator.RankCommInfo attribute) SENDER (core.pipeline_parallel.bridge_communicator.CommRole attribute) SentencePieceTokenizer (class in core.tokenizers.text.libraries.sentencepiece_tokenizer) sep (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) sep_id (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) sequence (core.rerun_state_machine.Call attribute) sequence_length (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) sequence_length_decoder (core.datasets.t5_dataset.T5MaskedWordPieceDatasetConfig attribute) sequence_length_encoder (core.datasets.t5_dataset.T5MaskedWordPieceDatasetConfig attribute) sequence_lengths (core.datasets.indexed_dataset.IndexedDataset property) sequence_modes (core.datasets.indexed_dataset.IndexedDataset property) sequence_parallel (core.model_parallel_config.ModelParallelConfig attribute) SequentialMLP (class in core.transformer.moe.experts) SerializableStateType (in module core.rerun_state_machine) serialize() (core.inference.contexts.dynamic_context.ContextErrorFactory class method) (core.inference.inference_request.DynamicInferenceEvent method) (core.inference.inference_request.DynamicInferenceRequest method) (core.inference.inference_request.DynamicInferenceRequestRecord method) (core.inference.inference_request.InferenceRequest method) (core.inference.sampling_params.SamplingParams method) serialize_tensor() (in module core.inference.inference_request) set_barrier_group() (core.timers.Timer method) set_batch_invariant_mode() (in module core.transformer.custom_layers.batch_invariant_kernels) set_current_microbatch() (in module core.pipeline_parallel.schedules) set_data_parallel_rank() (in module core.parallel_state) set_decode_expert_padding() (in module core.inference.utils) set_defaults_if_not_set_tensor_model_parallel_attributes() (in module core.tensor_parallel.layers) set_document_indices() (core.datasets.indexed_dataset.IndexedDataset method) set_experimental_flag() (in module core.config) set_expert_model_parallel_rank() (in module core.parallel_state) set_expert_model_parallel_world_size() (in module core.parallel_state) set_expert_tensor_parallel_rank() (in module core.parallel_state) set_expert_tensor_parallel_world_size() (in module core.parallel_state) set_extra_state() (core.tensor_parallel.layers.ColumnParallelLinear method) (core.tensor_parallel.layers.RowParallelLinear method) set_for_recompute_input_layernorm() (core.transformer.attention.Attention method) (core.transformer.attention.SelfAttention method) (core.transformer.multi_latent_attention.MLASelfAttention method) set_for_recompute_pre_mlp_layernorm() (core.transformer.moe.moe_layer.MoELayer method) set_input_tensor() (core.models.bert.bert_model.BertModel method) (core.models.gpt.gpt_model.GPTModel method) (core.models.huggingface.module.HuggingFaceModule method) (core.models.mamba.mamba_model.MambaModel method) (core.models.mimo.model.base.MimoModel method) (core.models.multimodal.llava_model.LLaVAModel method) (core.models.T5.t5_model.T5Model method) (core.models.vision.clip_vit_model.CLIPViTModel method) (core.models.vision.radio.RADIOViTModel method) (core.ssm.mamba_block.MambaStack method) (core.transformer.module.Float16Module method) (core.transformer.transformer_block.TransformerBlock method) set_is_first_microbatch() (core.transformer.cuda_graphs.CudaGraphManager method) (core.transformer.module.MegatronModule method) set_item() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) set_layer_number() (core.transformer.moe.moe_layer.BaseMoELayer method) (core.transformer.moe.router.Router method) set_loss_scale() (core.transformer.moe.moe_utils.MoEAuxLossAutoScaler static method) (core.transformer.multi_token_prediction.MTPLossAutoScaler static method) set_mode() (core.rerun_state_machine.RerunStateMachine method) set_model_auto_sync() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) set_model_to_sequence_parallel() (in module core.transformer.utils) set_pipeline_model_parallel_rank() (in module core.parallel_state) set_pipeline_model_parallel_world_size() (in module core.parallel_state) set_save_original_input() (in module core.extensions.transformer_engine) set_shared_experts() (core.transformer.moe.token_dispatcher.MoEFlexTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoETokenDispatcher method) set_states() (core.extensions.transformer_engine.TECudaRNGStatesTracker method) (core.tensor_parallel.random.CudaRNGStatesTracker method) set_streams() (in module core.pipeline_parallel.utils) set_symmetric_ar() (core.transformer.module.MegatronModule method) set_tensor_grad_fn_sequence_sr() (in module core.transformer.moe.shared_experts) set_tensor_model_parallel_attributes() (in module core.tensor_parallel.layers) set_tensor_model_parallel_rank() (in module core.parallel_state) set_tensor_model_parallel_world_size() (in module core.parallel_state) set_virtual_pipeline_model_parallel_rank() (in module core.parallel_state) set_virtual_pipeline_model_parallel_world_size() (in module core.parallel_state) setup() (core.energy_monitor.EnergyMonitor method) setup_embeddings_and_output_layer() (core.models.common.language_module.language_module.LanguageModule method) setup_manual_hooks() (core.transformer.module.GraphableMegatronModule method) setup_metadata() (core.transformer.moe.token_dispatcher._DeepepManager method) (core.transformer.moe.token_dispatcher._DispatchManager method) (core.transformer.moe.token_dispatcher._HybridEPManager method) setup_process_groups_for_ddp() (core.process_groups_config.ProcessGroupCollection static method) setup_process_groups_for_optimizer() (core.process_groups_config.ProcessGroupCollection static method) sgd_momentum (core.optimizer.optimizer_config.OptimizerConfig attribute) (core.optimizer.optimizer_config.SGDOptimizerConfig attribute) SGDOptimizerConfig (class in core.optimizer.optimizer_config) Shape (in module core.pipeline_parallel.combined_1f1b) (in module core.pipeline_parallel.p2p_communication) (in module core.pipeline_parallel.schedules) shard_buffer() (in module core.distributed.param_and_grad_buffer) shard_to_metadata (core.dist_checkpointing.exchange_utils.ShardDistribution attribute) ShardBucketIndex (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) ShardDistribution (class in core.dist_checkpointing.exchange_utils) sharded_backend (core.dist_checkpointing.core.CheckpointingConfig attribute) sharded_backend_version (core.dist_checkpointing.core.CheckpointingConfig attribute) sharded_param_state_dp_reshardable() (core.optimizer.distrib_optimizer.DistributedOptimizer method) sharded_param_state_dp_zero() (core.optimizer.distrib_optimizer.DistributedOptimizer method) sharded_param_state_fs_model_space() (core.optimizer.distrib_optimizer.DistributedOptimizer method) sharded_param_state_fsdp_dtensor() (core.optimizer.distrib_optimizer.DistributedOptimizer method) sharded_param_state_fully_reshardable() (core.optimizer.distrib_optimizer.DistributedOptimizer method) sharded_state_dict (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict attribute) sharded_state_dict() (core.extensions.kitchen.KitchenColumnParallelGroupedLinear method) (core.extensions.kitchen.KitchenColumnParallelLinear method) (core.extensions.kitchen.KitchenLayerNormColumnParallelLinear method) (core.extensions.kitchen.KitchenLinear method) (core.extensions.kitchen.KitchenRowParallelGroupedLinear method) (core.extensions.kitchen.KitchenRowParallelLinear method) (core.extensions.transformer_engine.TEColumnParallelLinear method) (core.extensions.transformer_engine.TEDotProductAttention method) (core.extensions.transformer_engine.TELayerNormColumnParallelLinear method) (core.extensions.transformer_engine.TELinear method) (core.extensions.transformer_engine.TERowParallelLinear method) (core.models.common.language_module.language_module.LanguageModule method) (core.models.gpt.gpt_model.GPTModel method) (core.models.retro.model.RetroModel method) (core.models.T5.t5_model.T5Model method) (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) (core.post_training.modelopt.layers.Linear method) (core.ssm.mamba_block.MambaStack method) (core.ssm.mamba_layer.MambaLayer method) (core.ssm.mamba_mixer.ExtendedRMSNorm method) (core.ssm.mamba_mixer.MambaMixer method) (core.tensor_parallel.layers.ColumnParallelLinear method) (core.tensor_parallel.layers.RowParallelLinear method) (core.tensor_parallel.layers.VocabParallelEmbedding method) (core.transformer.dot_product_attention.DotProductAttention method) (core.transformer.mlp.MLP method) (core.transformer.module.Float16Module method) (core.transformer.module.MegatronModule method) (core.transformer.moe.experts.GroupedMLP method) (core.transformer.moe.experts.SequentialMLP method) (core.transformer.moe.experts.TEGroupedMLP method) (core.transformer.moe.shared_experts.SharedExpertMLP method) (core.transformer.multi_token_prediction.MultiTokenPredictionBlock method) (core.transformer.multi_token_prediction.MultiTokenPredictionLayer method) (core.transformer.transformer_block.TransformerBlock method) (core.transformer.transformer_layer.TransformerLayer method) sharded_state_dict_default() (in module core.transformer.utils) sharded_state_dict_keys_map (core.ssm.mamba_layer.MambaLayerSubmodules attribute) (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) sharded_tensor_chunk_id() (in module core.dist_checkpointing.strategies.two_stage) sharded_tensor_no_data (core.dist_checkpointing.strategies.two_stage._ShardedTensorMetadata attribute) sharded_tensor_to_torch_sharded_tensor() (in module core.dist_checkpointing.strategies.torch) ShardedBase (class in core.dist_checkpointing.mapping) ShardedObject (class in core.dist_checkpointing.mapping) ShardedStateDict (in module core.dist_checkpointing.mapping) ShardedTensor (class in core.dist_checkpointing.mapping) ShardedTensorFactory (class in core.dist_checkpointing.mapping) ShardingStrategy (class in core.distributed.fsdp.src.megatron_fsdp.fully_shard) shards_in_this_group (core.dist_checkpointing.exchange_utils.ShardDistribution attribute) shared_embedding_or_output_weight() (core.models.common.language_module.language_module.LanguageModule method) (core.models.gpt.gpt_model.GPTModel method) (core.models.multimodal.llava_model.LLaVAModel method) (core.models.T5.t5_model.T5Model method) shared_experts (core.transformer.moe.moe_layer.MoESubmodules attribute) shared_experts_compute() (core.transformer.moe.moe_layer.MoELayer method) SharedExpertMLP (class in core.transformer.moe.shared_experts) short_sequence_probability (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) should_checkpoint_and_exit() (core.rerun_state_machine.RerunStateMachine method) should_free_input() (in module core.models.gpt.fine_grained_callables) should_run_forward_backward() (core.rerun_state_machine.RerunStateMachine method) shutdown() (core.energy_monitor.EnergyMonitor method) SiglipHuggingFaceModel (class in core.models.huggingface.clip_model) SingleDeviceTRTLLMModelWeightsConverter (class in core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter) sinkhorn() (in module core.transformer.moe.moe_utils) sinkhorn_load_balancing() (core.transformer.moe.router.TopKRouter method) size (core.datasets.gpt_dataset.MockGPTLowLevelDataset attribute) size() (core.datasets.indexed_dataset.DType static method) skip_prompt_log_probs (core.inference.sampling_params.SamplingParams attribute) softmax_scale (core.transformer.transformer_config.TransformerConfig attribute) softmax_type (core.transformer.transformer_config.TransformerConfig attribute) SoftmaxOne (class in core.fusions.fused_softmax) sort_chunks_by_idxs() (in module core.transformer.moe.moe_utils) special_token_ids (core.models.mimo.config.base_configs.MimoModelConfig attribute) SPECIAL_TOKEN_TEMPLATE (in module core.tokenizers.text.libraries.tiktoken_tokenizer) SPECIAL_TOKENS (in module core.tokenizers.text.libraries.tiktoken_tokenizer) Split (class in core.datasets.utils) split (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) split_dtensor() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) split_matrix (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) split_preprocessing (core.datasets.retro.query.multi_split_gpt_dataset.MultiSplitGPTDatasetConfig attribute) split_state_dict_if_needed() (core.optimizer.distrib_optimizer.DistributedOptimizer method) split_te_layernorm_column_parallel_linear() (in module core.extensions.transformer_engine) split_tensor_along_last_dim() (in module core.tensor_parallel.utils) split_tensor_into_1d_equal_chunks() (in module core.tensor_parallel.utils) squared_relu() (in module core.activations) ssm_decode() (core.ssm.mamba_mixer.MambaMixer method) ssm_prefill() (core.ssm.mamba_mixer.MambaMixer method) ssm_training() (core.ssm.mamba_mixer.MambaMixer method) start() (core.inference.data_parallel_inference_coordinator.DataParallelInferenceCoordinator method) (core.inference.inference_client.InferenceClient method) (core.timers.DummyTimer method) (core.timers.Timer method) (core.timers.TimerBase method) start_grad_sync() (core.distributed.data_parallel_base._BaseDataParallel method) (core.distributed.distributed_data_parallel.DistributedDataParallel method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) (core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup method) start_listening_to_data_parallel_coordinator() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) start_method() (core.utils.StragglerDetector method) start_param_sync() (core.distributed.distributed_data_parallel.DistributedDataParallel method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) (core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup method) state (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan property) (core.optimizer.optimizer.ChainedOptimizer property) (core.optimizer.optimizer.MegatronOptimizer attribute) state_dict() (core.distributed.data_parallel_base._BaseDataParallel method) (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.grad_scaler.ConstantGradScaler method) (core.optimizer.grad_scaler.DynamicGradScaler method) (core.optimizer.grad_scaler.MegatronGradScaler method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) (core.optimizer_param_scheduler.OptimizerParamScheduler method) (core.rerun_state_machine.RerunDataIterator method) (core.rerun_state_machine.RerunErrorInjector method) (core.rerun_state_machine.RerunStateMachine method) (core.transformer.module.Float16Module method) state_dict_for_save_checkpoint() (core.distributed.data_parallel_base._BaseDataParallel method) (core.transformer.module.Float16Module method) (core.transformer.module.MegatronModule method) StateDict (in module core.dist_checkpointing.mapping) static_buffers (core.full_cuda_graph.StaticBufferLoader attribute) StaticBufferLoader (class in core.full_cuda_graph) StaticInferenceContext (class in core.inference.contexts.static_context) StaticInferenceEngine (class in core.inference.engines.static_engine) Status (class in core.inference.inference_request) status (core.inference.inference_request.InferenceRequest attribute) step (core.inference.engines.dynamic_engine.DynamicInferenceEngine attribute) step() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) (core.optimizer.optimizer.MixedPrecisionOptimizer method) (core.optimizer_param_scheduler.OptimizerParamScheduler method) step_legacy() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) step_modern() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) step_with_ready_grads() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) (core.optimizer.optimizer.MixedPrecisionOptimizer method) STOP (core.inference.headers.Headers attribute) stop() (core.inference.data_parallel_inference_coordinator.DataParallelInferenceCoordinator method) (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) (core.inference.inference_client.InferenceClient method) (core.timers.DummyTimer method) (core.timers.Timer method) (core.timers.TimerBase method) STOP_ACK (core.inference.headers.Headers attribute) stop_communication() (core.distributed.fsdp.mcore_fsdp_adapter.FullyShardedDataParallel method) stop_engines() (core.inference.inference_client.InferenceClient method) STOP_ITERATION (in module core.inference.async_stream) stop_method() (core.utils.StragglerDetector method) StorageResizeBasedBucketAllocator (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) store_param_remainders (core.optimizer.optimizer_config.OptimizerConfig attribute) str_dtype_to_torch() (in module core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter) (in module core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter) StragglerDetector (class in core.utils) StrategyAction (class in core.dist_checkpointing.strategies.base) stream (core.transformer.moe.shared_experts.SharedExpertMLP attribute) stream_acquire_context() (in module core.pipeline_parallel.utils) stream_tokens() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) StrictHandling (class in core.dist_checkpointing.validation) sub_optimizers (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer property) SUBMIT_REQUEST (core.inference.headers.Headers attribute) submodules (core.transformer.spec_utils.ModuleSpec attribute) succeeded() (core.inference.inference_request.DynamicInferenceRequest method) SUCCESS (core.inference.unified_memory.CompilationState attribute) suggested_bucket_size (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.BucketingPolicy attribute) suggested_communication_unit_size (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) summarize_load_times() (core.dist_checkpointing.strategies.two_stage.TwoStageDataParallelLoadShardedStrategy method) SUPPORTED_ATTN_MASK (in module core.transformer.multi_token_prediction) SUSPEND (core.inference.headers.Headers attribute) suspend() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) (core.inference.inference_request.DynamicInferenceRequestRecord method) suspend_engines() (core.inference.inference_client.InferenceClient method) suspend_resume_ctx() (core.inference.engines.dynamic_engine.DynamicInferenceEngine static method) swap_key_value_dict() (core.inference.contexts.static_context.StaticInferenceContext method) swiglu() (in module core.fusions.fused_bias_swiglu) swiglu_back() (in module core.fusions.fused_bias_swiglu) SwiGLUFunction (class in core.fusions.fused_bias_swiglu) switch_load_balancing_loss_func() (in module core.transformer.moe.moe_utils) symbolic() (core.tensor_parallel.mappings._AllGatherFromTensorParallelRegion static method) (core.tensor_parallel.mappings._CopyToModelParallelRegion static method) (core.tensor_parallel.mappings._GatherFromModelParallelRegion static method) (core.tensor_parallel.mappings._GatherFromSequenceParallelRegion static method) (core.tensor_parallel.mappings._ReduceFromModelParallelRegion static method) (core.tensor_parallel.mappings._ReduceScatterToSequenceParallelRegion static method) (core.tensor_parallel.mappings._ReduceScatterToTensorParallelRegion static method) (core.tensor_parallel.mappings._ScatterToModelParallelRegion static method) (core.tensor_parallel.mappings._ScatterToSequenceParallelRegion static method) Symbols (class in core.ssm.mamba_hybrid_layer_allocation) symmetric_ar_type (core.transformer.transformer_config.TransformerConfig attribute) sync() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) sync_all_async_calls() (core.dist_checkpointing.strategies.async_utils.AsyncCaller method) sync_rng_states_across_tp_group() (core.distributed.fsdp.mcore_fsdp_adapter.FullyShardedDataParallel method) synchronize_gradient_reduce() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) synchronize_param_gather() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) T T (in module core.dist_checkpointing.exchange_utils) (in module core.dist_checkpointing.strategies.fully_parallel) t5_extended_attention_mask() (in module core.models.T5.t5_model) t5_position_ids() (in module core.models.T5.t5_model) T5InferenceWrapper (class in core.inference.model_inference_wrappers.t5.t5_inference_wrapper) T5LMHead (class in core.models.T5.t5_model) T5MaskedWordPieceDataset (class in core.datasets.t5_dataset) T5MaskedWordPieceDatasetConfig (class in core.datasets.t5_dataset) T5Model (class in core.models.T5.t5_model) T5Tokenizer (class in core.tokenizers.text.models.t5_tokenizer) te_checkpoint (in module core.transformer.transformer_block) te_checkpoint() (in module core.extensions.transformer_engine) TEColumnParallelLinear (class in core.extensions.transformer_engine) TECudaGraphHelper (class in core.transformer.cuda_graphs) TECudaRNGStatesTracker (class in core.extensions.transformer_engine) TEDelayedScaling (class in core.extensions.transformer_engine) TEDotProductAttention (class in core.extensions.transformer_engine) TEGroupedMLP (class in core.transformer.moe.experts) TELayerNormColumnParallelLinear (class in core.extensions.transformer_engine) TELinear (class in core.extensions.transformer_engine) temperature (core.inference.sampling_params.SamplingParams attribute) TemporalAsyncCaller (class in core.dist_checkpointing.strategies.async_utils) TemporaryBucketAllocator (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) TENorm (class in core.extensions.transformer_engine) tensor_model_parallel_size (core.model_parallel_config.ModelParallelConfig attribute) tensor_swap() (in module core.inference.utils) TensorItemIndex (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) tensors (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict property) TensorStateDeallocatedError TensorStoreLoadShardedStrategy (class in core.dist_checkpointing.strategies.tensorstore) tensorwise (core.enums.Fp8Recipe attribute) TEQuantizationParams (class in core.extensions.transformer_engine) (core.extensions.transformer_engine.TransformerEngineConfigType attribute) TEQuantizationRecipe (class in core.extensions.transformer_engine) termination_id (core.inference.sampling_params.SamplingParams attribute) TERowParallelLinear (class in core.extensions.transformer_engine) TESpecProvider (class in core.extensions.transformer_engine_spec_provider) test (core.datasets.retro.config.gpt_chunk_datasets.RetroGPTChunkDatasets attribute) (core.datasets.utils.Split attribute) test_mode (core.transformer.transformer_config.TransformerConfig attribute) text_to_ids() (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract method) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer method) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) text_to_tokens() (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract method) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) TextGenerationController (class in core.inference.text_generation_controllers.text_generation_controller) tie_embeddings_and_output_weights_state_dict() (core.models.common.language_module.language_module.LanguageModule method) tie_output_layer_state_dict() (in module core.transformer.multi_token_prediction) tie_word_embeddings_state_dict() (in module core.transformer.multi_token_prediction) TikTokenTokenizer (class in core.tokenizers.text.libraries.tiktoken_tokenizer) timed() (in module core.dist_checkpointing.strategies.two_stage) Timer (class in core.timers) TimerBase (class in core.timers) Timers (class in core.timers) timers (core.model_parallel_config.ModelParallelConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) (in module core.dist_checkpointing.strategies.two_stage) timestamp (core.inference.inference_request.DynamicInferenceEvent attribute) to_kitchen_qattention() (core.extensions.kitchen.QAttentionParamsConfigSchema method) to_kitchen_qfa() (core.extensions.kitchen.QFlashAttentionParamsConfigSchema method) to_kitchen_qlinear() (core.extensions.kitchen.QLinearParamsConfigSchema method) to_local_if_dtensor() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) (in module core.utils) to_state_dict() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict method) toggle_cuda_graphs() (in module core.transformer.utils) token_combine() (core.transformer.moe.token_dispatcher.MoEAllGatherTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEFlexTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoETokenDispatcher method) token_count (core.inference.batch_dimensions_utils.InferenceBatchDimensions attribute) token_dispatch() (core.transformer.moe.token_dispatcher.MoEAllGatherTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEFlexTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoETokenDispatcher method) TOKEN_ROUNDER (core.inference.contexts.dynamic_context.DynamicInferenceContext attribute) token_to_id() (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) tokenize() (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer method) (core.tokenizers.base_tokenizer.MegatronTokenizerBase method) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText method) tokenize_encoder_prompt() (core.inference.model_inference_wrappers.t5.t5_inference_wrapper.T5InferenceWrapper method) tokenize_prompt() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) tokenize_prompts() (in module core.inference.text_generation_server.tokenization) tokenizer (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) TOKENIZER_LIBRARIES (in module core.tokenizers.megatron_tokenizer) TOKENIZER_MAPPING_LIBRARIES (in module core.tokenizers.text.text_tokenizer) TOKENIZER_MAPPING_NAMES (in module core.tokenizers.megatron_tokenizer) TokenOverflowError tokens_to_ids() (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract method) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer method) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) tokens_to_text() (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract method) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) top_k (core.inference.sampling_params.SamplingParams attribute) top_n_logprobs (core.inference.sampling_params.SamplingParams attribute) top_p (core.inference.sampling_params.SamplingParams attribute) topk_routing_with_score_function() (in module core.transformer.moe.moe_utils) TopKRouter (class in core.transformer.moe.router) TopLevelDataset (in module core.datasets.blended_megatron_dataset_builder) torch_home (in module core.tokenizers.text.libraries.megatron_hf_tokenizer) torch_to_numpy_dtype_dict (in module core.dist_checkpointing.strategies.zarr) TorchCommonLoadStrategy (class in core.dist_checkpointing.strategies.common) TorchCommonSaveStrategy (class in core.dist_checkpointing.strategies.common) TorchDistLoadShardedStrategy (class in core.dist_checkpointing.strategies.torch) TorchDistSaveShardedStrategy (class in core.dist_checkpointing.strategies.torch) TorchFullyShardedDataParallel (class in core.distributed.torch_fully_sharded_data_parallel) TorchFullyShardedDataParallelConfig (class in core.distributed.torch_fully_sharded_data_parallel_config) tp (core.process_groups_config.ProcessGroupCollection attribute) tp_comm_atomic_ag (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_atomic_rs (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_bootstrap_backend (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_bulk_dgrad (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_bulk_wgrad (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_overlap (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_overlap_ag (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_overlap_disable_fc1 (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_overlap_disable_qkv (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_overlap_rs (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_overlap_rs_dgrad (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_split_ag (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_split_rs (core.model_parallel_config.ModelParallelConfig attribute) tp_cp (core.process_groups_config.ProcessGroupCollection attribute) tp_dp_cp (core.process_groups_config.ProcessGroupCollection attribute) tp_ep (core.process_groups_config.ProcessGroupCollection attribute) tp_ep_pp (core.process_groups_config.ProcessGroupCollection attribute) tp_only_amax_red (core.extensions.transformer_engine.TEQuantizationRecipe attribute) (core.transformer.transformer_config.TransformerConfig attribute) tpot (core.inference.inference_request.InferenceRequest attribute) trace_async_exceptions() (in module core.utils) track_moe_metrics() (in module core.transformer.moe.moe_utils) track_mtp_metrics() (core.transformer.multi_token_prediction.MTPLossLoggingHelper method) tracked_metadata (core.inference.inference_request.DynamicInferenceRequest property) tracker (core.transformer.multi_token_prediction.MTPLossLoggingHelper attribute) train (core.datasets.retro.config.gpt_chunk_datasets.RetroGPTChunkDatasets attribute) (core.datasets.utils.Split attribute) train() (core.datasets.retro.index.index.Index method) (core.datasets.retro.index.indexes.faiss_base.FaissBaseIndex method) train_index() (in module core.datasets.retro.index.build) train_on_embeddings() (in module core.datasets.retro.index.build) training_recipe (core.extensions.transformer_engine.TEQuantizationParams attribute) TrainingState (class in core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp) transform_object() (core.dist_checkpointing.strategies.torch.MCoreSavePlanner method) transformer_impl (core.transformer.transformer_config.TransformerConfig attribute) transformer_layer (core.transformer.multi_token_prediction.MultiTokenPredictionLayerSubmodules attribute) TransformerBlock (class in core.transformer.transformer_block) TransformerBlockSubmodules (class in core.transformer.transformer_block) TransformerConfig (class in core.transformer.transformer_config) TransformerEngineConfigType (class in core.extensions.transformer_engine) TransformerLayer (class in core.transformer.transformer_layer) TransformerLayerNode (class in core.models.gpt.fine_grained_callables) TransformerLayerSchedulePlan (class in core.models.common.model_chunk_schedule_plan) TransformerLayerState (class in core.models.gpt.fine_grained_callables) TransformerLayerSubmodules (class in core.transformer.transformer_layer) TransformerModelChunkSchedulePlan (class in core.models.common.model_chunk_schedule_plan) TRANSIENT_ERROR (core.rerun_state_machine.RerunDiagnostic attribute) triton_append_key_value_cache() (in module core.inference.contexts.fused_kv_append_kernel) TRT_MODEL_CONFIG (in module core.export.trtllm.trt_model_config) TRT_MODEL_TYPE_STRING (in module core.export.trtllm.trt_model_type) TRTLLMEngineBuilder (class in core.export.trtllm.engine_builder.trtllm_engine_builder) TRTLLMHelper (class in core.export.trtllm.trtllm_helper) TRTLLMLayers (class in core.export.trtllm.trtllm_layers) TwoStageDataParallelLoadShardedStrategy (class in core.dist_checkpointing.strategies.two_stage) type (core.inference.inference_request.DynamicInferenceEvent attribute) U uint16 (core.datasets.indexed_dataset.DType attribute) uint8 (core.datasets.indexed_dataset.DType attribute) UNATTEMPTED (core.inference.unified_memory.CompilationState attribute) uncompress_kv_from_cache() (core.transformer.multi_latent_attention.MLASelfAttention method) unfused (core.transformer.enums.AttnBackend attribute) UnifiedMemoryCompileTimeoutError UnifiedMemoryUnsupportedError unique_identifiers (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) unique_key (core.dist_checkpointing.mapping.ShardedObject property) unk (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) unk_id (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) UnknownHeaderError unpad_input_prompt_tokens() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) UNPAUSE (core.inference.headers.Headers attribute) unpause_engines() (core.inference.inference_client.InferenceClient method) unpermute() (in module core.transformer.moe.moe_utils) unset_num_microbatches_calculator() (in module core.num_microbatches_calculator) unwrap() (core.dist_checkpointing.mapping.LocalNonpersistentObject method) (core.utils.WrappedTensor method) unwrap_model() (in module core.utils) upcycle_state_dict() (in module core.transformer.moe.upcycling_utils) update() (core.num_microbatches_calculator.ConstantNumMicroBatchesCalculator method) (core.num_microbatches_calculator.NumMicroBatchesCalculator method) (core.num_microbatches_calculator.RampupBatchsizeNumMicroBatchesCalculator method) (core.optimizer.grad_scaler.ConstantGradScaler method) (core.optimizer.grad_scaler.DynamicGradScaler method) (core.optimizer.grad_scaler.MegatronGradScaler method) update_chunk_counts() (in module core.datasets.retro.db.build) update_fp32_param_by_new_param() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) update_generation_status() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) update_main_grads() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) update_num_microbatches() (in module core.num_microbatches_calculator) update_pg_timeout() (in module core.parallel_state) update_requests() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) update_requests_pools() (core.inference.scheduler.Scheduler method) update_uneven_dtensor_chunk_metadata() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) upload_file() (core.datasets.object_storage_utils.S3Client method) use_cpu_initialization (core.model_parallel_config.ModelParallelConfig attribute) use_custom_fsdp (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) use_distributed_optimizer (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) use_embedding_sharing (core.export.export_config.ExportConfig attribute) use_fused_weighted_squared_relu (core.transformer.transformer_config.TransformerConfig attribute) use_inference_optimized_layers (core.transformer.transformer_config.TransformerConfig attribute) use_kitchen (core.transformer.transformer_config.TransformerConfig attribute) use_kitchen_attention (core.transformer.transformer_config.TransformerConfig attribute) use_mamba_mem_eff_path (core.transformer.transformer_config.TransformerConfig attribute) use_megatron_fsdp (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) use_mpu_process_groups() (core.process_groups_config.ProcessGroupCollection class method) use_parallel_embedding (core.export.export_config.ExportConfig attribute) use_precision_aware_optimizer (core.optimizer.optimizer_config.OptimizerConfig attribute) use_ring_exchange_p2p (core.model_parallel_config.ModelParallelConfig attribute) use_te_activation_func (core.transformer.transformer_config.TransformerConfig attribute) use_te_rng_tracker (core.model_parallel_config.ModelParallelConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) use_torch_optimizer_for_cpu_offload (core.optimizer.optimizer_config.OptimizerConfig attribute) USING_APEX_OPTIMIZER (in module core.optimizer.distrib_optimizer) using_cuda_graph_this_step() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) USING_TE_OPTIMIZER (in module core.optimizer.distrib_optimizer) V v_head_dim (core.transformer.transformer_config.MLATransformerConfig attribute) valid (core.datasets.retro.config.gpt_chunk_datasets.RetroGPTChunkDatasets attribute) (core.datasets.utils.Split attribute) VALID (core.ssm.mamba_hybrid_layer_allocation.Symbols attribute) validate_added_encodings() (in module core.datasets.retro.index.validate) validate_checkpoint_id() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync class method) validate_index() (in module core.datasets.retro.index.validate) validate_integrity_and_strict_load() (in module core.dist_checkpointing.validation) validate_layer_layout() (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout method) validate_loaded_state_dict() (in module core.transformer.fsdp_dtensor_checkpoint) validate_metadata_integrity() (core.dist_checkpointing.mapping.ShardedBase method) (core.dist_checkpointing.mapping.ShardedObject method) (core.dist_checkpointing.mapping.ShardedTensor method) (core.dist_checkpointing.mapping.ShardedTensorFactory method) validate_result() (core.rerun_state_machine.RerunStateMachine method) VALIDATE_RESULTS (core.rerun_state_machine.RerunMode attribute) validate_sharded_objects_handling() (in module core.dist_checkpointing.validation) validate_sharding_integrity() (in module core.dist_checkpointing.validation) validate_state_dict() (core.rerun_state_machine.RerunStateMachine method) validate_training_embeddings() (in module core.datasets.retro.index.validate) validate_uneven_dtensor() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) variable_seq_lengths (core.model_parallel_config.ModelParallelConfig attribute) verbose (core.post_training.modelopt.layers.RealQuantTransformerLayer attribute) verify_checkpoint_and_load_strategy() (in module core.dist_checkpointing.validation) verify_global_md_reuse() (in module core.dist_checkpointing.strategies.state_dict_saver) VERSION (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) VIDEO_TOKEN (in module core.models.multimodal.llava_model) virtual_pipeline_model_parallel_size (core.model_parallel_config.ModelParallelConfig attribute) VisionModule (class in core.models.common.vision_module.vision_module) VLMInferenceRequest (class in core.inference.inference_request) VLMTextGenerationController (class in core.inference.text_generation_controllers.vlm_text_generation_controller) vocab (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) vocab() (core.tokenizers.base_tokenizer.MegatronTokenizerBase method) vocab_embedding (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) vocab_file (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) vocab_parallel_cross_entropy() (in module core.tensor_parallel.cross_entropy) vocab_range_from_global_vocab_size() (core.tensor_parallel.utils.VocabUtility static method) vocab_range_from_per_partition_vocab_size() (core.tensor_parallel.utils.VocabUtility static method) vocab_size (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) vocab_size() (core.tokenizers.base_tokenizer.MegatronTokenizerBase method) VocabParallelCrossEntropy (class in core.tensor_parallel.cross_entropy) VocabParallelEmbedding (class in core.tensor_parallel.layers) VocabUtility (class in core.tensor_parallel.utils) W wait() (core.distributed.reduce_scatter_with_fp32_accumulation._ReduceScatterWithFP32AccumulationWorkHandle method) wait_bucket_ready() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline method) wait_current_stream() (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan method) wait_for_previous_grad_reduce() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.GradReducePipeline method) WAITING_IN_QUEUE (core.inference.inference_request.Status attribute) weak_method() (in module core.models.gpt.fine_grained_callables) weight_decay (core.optimizer.optimizer_config.OptimizerConfig attribute) weighted_bias_quick_geglu() (in module core.fusions.fused_bias_geglu) weighted_bias_quick_geglu_back() (in module core.fusions.fused_bias_geglu) weighted_bias_quick_geglu_impl() (in module core.fusions.fused_bias_geglu) weighted_bias_swiglu_impl() (in module core.fusions.fused_bias_swiglu) weighted_quick_geglu() (in module core.fusions.fused_bias_geglu) weighted_quick_geglu_back() (in module core.fusions.fused_bias_geglu) weighted_squared_relu() (in module core.fusions.fused_weighted_squared_relu) weighted_squared_relu_back() (in module core.fusions.fused_weighted_squared_relu) weighted_squared_relu_impl() (in module core.fusions.fused_weighted_squared_relu) weighted_swiglu() (in module core.fusions.fused_bias_swiglu) weighted_swiglu_back() (in module core.fusions.fused_bias_swiglu) WeightedBiasQuickGeGLUFunction (class in core.fusions.fused_bias_geglu) WeightedQuickGeGLUFunction (class in core.fusions.fused_bias_geglu) WeightedSquaredReLUFunction (class in core.fusions.fused_weighted_squared_relu) WeightedSwiGLUFunction (class in core.fusions.fused_bias_swiglu) wgrad_deferral_limit (core.model_parallel_config.ModelParallelConfig attribute) will_execute_quantized() (core.extensions.transformer_engine.TELayerNormColumnParallelLinear method) (core.extensions.transformer_engine.TELinear method) WILL_RERUN_FROM_CHECKPOINT (core.rerun_state_machine.RerunState attribute) window_attn_skip_freq (core.transformer.transformer_config.TransformerConfig attribute) window_size (core.transformer.transformer_config.TransformerConfig attribute) without_data() (core.dist_checkpointing.mapping.ShardedBase method) (core.dist_checkpointing.mapping.ShardedObject method) (core.dist_checkpointing.mapping.ShardedTensor method) (core.dist_checkpointing.mapping.ShardedTensorFactory method) world_size (core.utils.StragglerDetector property) WrappedTensor (class in core.utils) WrappedTorchLayerNorm (in module core.transformer.torch_layer_norm) WrappedTorchNorm (class in core.transformer.torch_norm) write() (core.datasets.indexed_dataset._IndexWriter method) (core.timers.Timers method) write_data() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync method) write_metadata() (core.tokenizers.megatron_tokenizer.MegatronTokenizer method) write_preloaded_data() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync static method) write_preloaded_data_multiproc() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync static method) WriteBucket (in module core.dist_checkpointing.strategies.filesystem_async) Y YarnRotaryEmbedding (class in core.models.common.embeddings.yarn_rotary_pos_embedding) Z z_loss_func() (in module core.transformer.moe.moe_utils) ZarrLoadShardedStrategy (class in core.dist_checkpointing.strategies.zarr) ZarrSaveShardedStrategy (class in core.dist_checkpointing.strategies.zarr) zero_grad() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) zero_grad_buffer() (core.distributed.data_parallel_base._BaseDataParallel method) (core.distributed.distributed_data_parallel.DistributedDataParallel method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) zero_out_tensors() (core.transformer.cuda_graphs._CudaGraphRunner method) zero_parameters() (core.models.common.embeddings.language_model_embedding.LanguageModelEmbedding method) zip_strict() (in module core.dist_checkpointing.utils)