Index _ | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | Y | Z _ __all__ (in module core) (in module core.config_logger) (in module core.distributed.fsdp.src.megatron_fsdp) (in module core.models.common.embeddings.relative_pos_embedding) (in module core.models.common.embeddings.rope_utils) (in module core.models.common.embeddings.rotary_pos_embedding) (in module core.models.mimo) (in module core.models.mimo.config) (in module core.models.mimo.model) (in module core.msc_utils) (in module core.resharding) (in module core.resharding.copy_services) (in module core.resharding.nvshmem_copy_service) (in module core.resharding.nvshmem_copy_service.core) (in module core.resharding.nvshmem_copy_service.memory) (in module core.resharding.nvshmem_copy_service.planning) (in module core.tensor_parallel) (in module core.tokenizers.text.libraries.tiktoken_tokenizer) (in module core.transformer.custom_layers.batch_invariant_kernels) __call__() (core.full_cuda_graph.FullCudaGraphWrapper method) (core.full_cuda_graph.StaticBufferLoader method) (core.optimizer.optimizer_config.ParamPredicate method) (core.optimizer.optimizer_config.ParamWithNamePredicate method) (core.timers.Timers method) (core.transformer.attention.CoreAttentionBuilder method) (core.transformer.attention.LinearLayerBuilder method) (core.transformer.attention.LinearQkvBuilder method) (core.transformer.cuda_graphs.CudaGraphManager method) (core.transformer.module.GraphableMegatronModule method) (core.transformer.moe.moe_layer.RouterBuilder method) (core.transformer.spec_utils.ModuleSpec method) (core.transformer.transformer_block.TransformerBlock method) (core.transformer.transformer_layer.TransformerLayer method) (core.utils._ValueWithRank method) (core.utils.StragglerDetector method) __config_logger_path_counts (in module core.config_logger) __contact_emails__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __contact_names__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __create_chunk_list__() (core.dist_checkpointing.strategies.checkpointable.CheckpointableShardedTensor method) (core.dist_checkpointing.strategies.checkpointable.LocalShardsContainer method) __create_write_items__() (core.dist_checkpointing.strategies.checkpointable.CheckpointableShardedTensor method) (core.dist_checkpointing.strategies.checkpointable.LocalShardsContainer method) __del__() (core.datasets.indexed_dataset._IndexReader method) (core.datasets.indexed_dataset._MMapBinReader method) (core.datasets.indexed_dataset._S3BinReader method) (core.datasets.indexed_dataset.IndexedDataset method) (core.dist_checkpointing.strategies.async_utils.AsyncCaller method) (core.dist_checkpointing.strategies.async_utils.PersistentAsyncCaller method) (core.dist_checkpointing.strategies.async_utils.TemporalAsyncCaller method) (core.models.gpt.fine_grained_callables.TransformerLayerNode method) (core.pipeline_parallel.fine_grained_activation_offload.GPUTensorPool method) __description__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __download_url__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __enter__() (core.datasets.indexed_dataset._IndexWriter method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.MultiGroupUBRAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ResetParametersContext method) (core.nccl_allocator.MemPoolAllocatorWithoutRegistration method) (core.nccl_allocator.MultiGroupMemPoolAllocator method) (core.nccl_allocator.nccl_mem method) (core.pipeline_parallel.fine_grained_activation_offload.FineGrainedActivationOffloadingInterface method) (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) (core.utils.StragglerDetector method) __eq__() (core.inference.batch_dimensions_utils.InferenceBatchDimensions method) (core.inference.contexts.static_context.StaticInferenceContext method) __exit__() (core.datasets.indexed_dataset._IndexWriter method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.MultiGroupUBRAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ResetParametersContext method) (core.nccl_allocator.MemPoolAllocatorWithoutRegistration method) (core.nccl_allocator.MultiGroupMemPoolAllocator method) (core.nccl_allocator.nccl_mem method) (core.pipeline_parallel.fine_grained_activation_offload.FineGrainedActivationOffloadingInterface method) (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) (core.utils.StragglerDetector method) __get_tensor_shard__() (core.dist_checkpointing.strategies.checkpointable.CheckpointableShardedTensor method) (core.dist_checkpointing.strategies.checkpointable.LocalShardsContainer method) __getattr__() (in module core.models.bert.bert_layer_specs) __getitem__() (core.datasets.bert_dataset.BERTMaskedWordPieceDataset method) (core.datasets.blended_dataset.BlendedDataset method) (core.datasets.gpt_dataset.GPTDataset method) (core.datasets.gpt_dataset.MockGPTLowLevelDataset method) (core.datasets.indexed_dataset._IndexReader method) (core.datasets.indexed_dataset.IndexedDataset method) (core.datasets.megatron_dataset.MegatronDataset method) (core.datasets.multimodal_dataset.MockMultimodalDataset method) (core.datasets.t5_dataset.T5MaskedWordPieceDataset method) (core.inference.inference_request.DynamicInferenceRequestRecord method) (core.optimizer.optimizer.ProxyDict method) __getstate_() (core.rerun_state_machine.QuickStats method) __getstate__() (core.datasets.indexed_dataset.IndexedDataset method) (core.msc_utils._FeatureFlag method) __gt__() (core.utils._ValueWithRank method) __hash__() (core.inference.batch_dimensions_utils.InferenceBatchDimensions method) __homepage__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __init_() (core.inference.headers.UnknownHeaderError method) __iter__() (core.datasets.data_schedule.HybridCPDataLoaderWrapper method) (core.optimizer.optimizer.ProxyDict method) __keywords__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __len__() (core.datasets.blended_dataset.BlendedDataset method) (core.datasets.gpt_dataset.GPTDataset method) (core.datasets.gpt_dataset.MockGPTLowLevelDataset method) (core.datasets.indexed_dataset._IndexReader method) (core.datasets.indexed_dataset.IndexedDataset method) (core.datasets.masked_dataset.MaskedWordPieceDataset method) (core.datasets.megatron_dataset.MegatronDataset method) (core.optimizer.distrib_optimizer.Range method) (core.optimizer.optimizer.ProxyDict method) __license__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __LOGGER_NAME_STACK (in module core.dist_checkpointing.utils) __LOGGER_STACK (in module core.dist_checkpointing.utils) __lt__() (core.utils._ValueWithRank method) __new__() (core.dist_checkpointing.strategies.checkpointable.CheckpointableShardedTensor method) (core.dist_checkpointing.strategies.checkpointable.LocalShardsContainer method) (core.extensions.transformer_engine.TENorm method) (core.post_training.modelopt.layers.Norm method) (core.transformer.torch_norm.WrappedTorchNorm method) (core.utils.StragglerDetector method) __next__() (core.datasets.data_schedule.HybridCPDataLoaderWrapper method) (core.inference.utils.Counter method) (core.rerun_state_machine.RerunDataIterator method) __package_name__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __post_init__() (core.datasets.bert_dataset.BERTMaskedWordPieceDatasetConfig method) (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig method) (core.datasets.gpt_dataset.GPTDatasetConfig method) (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig method) (core.datasets.multimodal_dataset.MultimodalDatasetConfig method) (core.datasets.t5_dataset.T5MaskedWordPieceDatasetConfig method) (core.dist_checkpointing.mapping.ShardedObject method) (core.dist_checkpointing.mapping.ShardedTensor method) (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig method) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig method) (core.export.export_config.ExportConfig method) (core.inference.inference_request.DynamicInferenceEvent method) (core.inference.inference_request.DynamicInferenceRequest method) (core.inference.inference_request.InferenceRequest method) (core.inference.sampling_params.SamplingParams method) (core.model_parallel_config.ModelParallelConfig method) (core.optimizer.optimizer_config.OptimizerConfig method) (core.transformer.transformer_config.MLATransformerConfig method) (core.transformer.transformer_config.TransformerConfig method) __repository_url__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __repr__() (core.dist_checkpointing.strategies.checkpointable.CheckpointableShardedTensor method) (core.dist_checkpointing.strategies.checkpointable.LocalShardsContainer method) (core.extensions.transformer_engine.TEColumnParallelLinear method) (core.extensions.transformer_engine.TELayerNormColumnParallelLinear method) (core.extensions.transformer_engine.TERowParallelLinear method) (core.optimizer.distrib_optimizer.Range method) (core.process_groups_config.ProcessGroupCollection method) (core.quantization.quant_config.GlobMatcher method) (core.quantization.quant_config.QuantizationConfig method) (core.quantization.quant_config.RecipeConfig method) (core.tensor_parallel.layers.ColumnParallelLinear method) (core.tensor_parallel.layers.RowParallelLinear method) (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout method) __setattr__() (core.models.huggingface.module.HuggingFaceModule method) (core.process_groups_config.ProcessGroupHelperMeta method) __setitem__() (core.optimizer.optimizer.ProxyDict method) __setstate() (core.rerun_state_machine.QuickStats method) __setstate__() (core.datasets.indexed_dataset.IndexedDataset method) (core.msc_utils._FeatureFlag method) __shortversion__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) __str__() (core.dist_checkpointing.mapping.ShardedObject method) (core.dist_checkpointing.strategies.base.SaveStrategyBase method) (core.inference.batch_dimensions_utils.InferenceBatchDimensions method) (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) (core.inference.contexts.static_context.StaticInferenceContext method) (core.inference.inference_request.DynamicInferenceEvent method) (core.inference.inference_request.DynamicInferenceRequest method) (core.optimizer.distrib_optimizer.Range method) (core.resharding.utils.ReshardPlan method) (core.transformer.cuda_graphs._CudaGraphRunner method) (core.utils._ValueWithRank method) __straggler__ (in module core.utils) __torch_dispatch__() (core.dist_checkpointing.strategies.checkpointable.CheckpointableShardedTensor class method) (core.dist_checkpointing.strategies.checkpointable.LocalShardsContainer class method) __version__ (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) _ActiveAsyncRequest (class in core.dist_checkpointing.strategies.async_utils) _add_request() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) _add_scales_to_converter() (core.export.trtllm.trtllm_helper.TRTLLMHelper method) _add_to_trtllm_model_weights() (core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter.DistributedTRTLLMModelWeightsConverter method) _adjust_key_value_for_inference() (core.transformer.attention.Attention method) _all_gather() (core.tensor_parallel.inference_layers.InferenceLayerNormColumnParallelLinear method) _all_to_all_cp2hp() (in module core.ssm.mamba_context_parallel) _all_to_all_hp2cp() (in module core.ssm.mamba_context_parallel) _AllGatherFromTensorParallelRegion (class in core.tensor_parallel.mappings) _alloc (in module core.inference.unified_memory) _alloc_storage() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) _allocate() (core.utils.GlobalSymmetricMemoryBuffer method) _allocate_auto() (in module core.ssm.mamba_hybrid_layer_allocation) _allocate_memory() (core.transformer.attention.Attention method) _allocate_override() (in module core.ssm.mamba_hybrid_layer_allocation) _allocate_recv_buffer() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) _allocator (in module core.nccl_allocator) _allreduce_conditional_embedding_grads() (in module core.distributed.finalize_model_grads) _allreduce_embedding_grad() (in module core.distributed.finalize_model_grads) _allreduce_layernorm_grads (in module core.distributed.finalize_model_grads) _allreduce_non_tensor_model_parallel_grads() (in module core.distributed.finalize_model_grads) _allreduce_position_embedding_grads() (in module core.distributed.finalize_model_grads) _allreduce_word_embedding_grads() (in module core.distributed.finalize_model_grads) _AllToAll (class in core.tensor_parallel.mappings) _append_kv_cache_kernel() (in module core.inference.contexts.fused_kv_append_kernel) _apply_aux_loss() (core.transformer.moe.router.TopKRouter method) _apply_bias() (core.transformer.moe.experts.TEGroupedMLP static method) _apply_expert_bias() (core.transformer.moe.router.TopKRouter method) _apply_gated_norm() (core.ssm.gated_delta_net.GatedDeltaNet method) _apply_global_aux_loss() (core.transformer.moe.router.TopKRouter method) _apply_output_gate() (core.transformer.attention.Attention method) _apply_rotary_pos_emb_bshd() (in module core.models.common.embeddings.rope_utils) _apply_rotary_pos_emb_thd() (in module core.models.common.embeddings.rope_utils) _apply_scaling() (core.models.common.embeddings.rotary_pos_embedding.RotaryEmbedding method) _apply_seq_aux_loss() (core.transformer.moe.router.TopKRouter method) _apply_tile_tagging() (core.models.multimodal.llava_model.LLaVAModel method) _assemble_full_tensor_from_uneven_chunks() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) _assign_iterations() (core.resharding.nvshmem_copy_service.planning.communication_scheduler.CommunicationScheduler method) _ASYNC_IO_LOOP (in module core.utils) _ASYNC_TASK_STATS (in module core.utils) _backward() (core.pipeline_parallel.utils.ScheduleNode method) _backward_kv_proj() (core.transformer.multi_latent_attention.MLASelfAttention method) _backward_output_proj() (core.transformer.attention.SelfAttention method) (core.transformer.multi_latent_attention.MLASelfAttention method) _backward_q_proj() (core.transformer.multi_latent_attention.MLASelfAttention method) _backward_qkv_proj() (core.transformer.attention.SelfAttention method) _BackwardDWWrapper (class in core.models.gpt.fine_grained_callables) _BaseDataParallel (class in core.distributed.data_parallel_base) _batch_invariant_LIB (in module core.transformer.custom_layers.batch_invariant_kernels) _batch_invariant_MODE (in module core.transformer.custom_layers.batch_invariant_kernels) _batched_p2p_ops() (in module core.pipeline_parallel.p2p_communication) _BF16_TYPES (in module core.transformer.module) _bias_dropout_add_func() (in module core.fusions.fused_bias_dropout) _BinReader (class in core.datasets.indexed_dataset) _bucket_group_gradient_reduce() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.GradReducePipeline method) _buffer (in module core.transformer.moe.fused_a2a) _build_attention_mask_and_position_ids() (core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper.GPTInferenceWrapper method) _build_b1ss_attention_mask() (core.datasets.t5_dataset.T5MaskedWordPieceDataset static method) _build_blended_dataset_splits() (core.datasets.blended_megatron_dataset_builder.BlendedMegatronDatasetBuilder method) _build_bridge_comms() (core.pipeline_parallel.multimodule_communicator.MultiModulePipelineCommunicator method) _build_callable_nodes() (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan method) _build_descriptors_for_param() (in module core.resharding.planner) _build_document_index() (in module core.datasets.gpt_dataset) _build_document_sample_shuffle_indices() (core.datasets.gpt_dataset.GPTDataset method) _build_gbuf_range_map() (core.optimizer.distrib_optimizer.DistributedOptimizer class method) _build_indices() (core.datasets.blended_dataset.BlendedDataset method) _build_key_size_numel_dictionaries() (in module core.tensor_parallel.data) _build_layer_module_prefix_map() (in module core.resharding.utils) _build_layer_schedule_plan() (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan method) _build_layers() (core.transformer.multi_token_prediction.MultiTokenPredictionBlock method) (core.transformer.transformer_block.TransformerBlock method) _build_matchers() (core.quantization.quant_config.RecipeConfig static method) _build_megatron_dataset_splits() (core.datasets.blended_megatron_dataset_builder.BlendedMegatronDatasetBuilder method) _build_megatron_datasets_parallel() (core.datasets.blended_megatron_dataset_builder.BlendedMegatronDatasetBuilder method) _build_model_and_main_param_groups() (core.optimizer.distrib_optimizer.DistributedOptimizer class method) _build_model_gbuf_param_range_map() (core.optimizer.distrib_optimizer.DistributedOptimizer class method) _build_model_gbuf_range() (core.optimizer.distrib_optimizer.DistributedOptimizer class method) _build_model_param_gbuf_map() (core.optimizer.distrib_optimizer.DistributedOptimizer class method) _build_model_param_to_state_dict_param_map() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _build_nccl_allocator() (in module core.nccl_allocator) _build_num_microbatches_calculator() (in module core.num_microbatches_calculator) _build_optimizer_group_ranges() (core.optimizer.distrib_optimizer.DistributedOptimizer class method) _build_rank_module_info_map() (core.pipeline_parallel.multimodule_communicator.MultiModulePipelineCommunicator method) _build_sample_index() (core.datasets.masked_dataset.MaskedWordPieceDataset method) _build_shuffle_index() (in module core.datasets.gpt_dataset) _calculate_cuda_graph_token_counts() (core.inference.batch_dimensions_utils.CUDAGraphBatchDimensionBuilder static method) _calculate_memory_size() (core.pipeline_parallel.fine_grained_activation_offload.GPUTensorPool static method) _calculate_num_segments() (core.resharding.nvshmem_copy_service.planning.task_segmenter.TaskSegmenter method) _can_allocate() (core.utils.GlobalSymmetricMemoryBuffer method) _cast_value() (core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.SingleDeviceTRTLLMModelWeightsConverter method) _causal_conv1d_version (in module core.utils) _check_and_set() (core.optimizer_param_scheduler.OptimizerParamScheduler method) _check_data_types() (in module core.tensor_parallel.data) _check_mamba_sequence_packing_support() (in module core.ssm.mamba_mixer) _check_mesh_ranks_and_group_ranks_are_consistent() (in module core.distributed.fsdp.mcore_fsdp_adapter) _check_module_parameter_types() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) _check_stop_words_for_request_post_append() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) _check_supported_type() (in module core.transformer.cuda_graphs) _check_toggle() (core.utils.StragglerDetector method) _checkpointed_attention_forward() (core.transformer.attention.Attention method) _checkpointed_forward() (core.transformer.multi_token_prediction.MultiTokenPredictionLayer method) (core.transformer.transformer_block.TransformerBlock method) _clean_metadata_for_serialization() (in module core.dist_checkpointing.utils) _clip_kv_proj_weight() (core.transformer.multi_latent_attention.MLASelfAttention method) _clip_linear_qkv() (core.transformer.attention.SelfAttention method) _clip_q_proj_weight() (core.transformer.multi_latent_attention.MLASelfAttention method) _clone_nested_tensors() (in module core.transformer.cuda_graphs) _collect_all_batches() (core.resharding.nvshmem_copy_service.planning.communication_scheduler.CommunicationScheduler method) _collect_main_grad_data_for_unscaling() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) _collect_original_tensor_info() (core.post_training.modelopt.layers.RealQuantTransformerLayer method) _COMM_STREAM (in module core.pipeline_parallel.utils) _communicate() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) _communicate_shapes() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) (core.pipeline_parallel.p2p_communication.P2PCommunicator method) _COMP_STREAM (in module core.pipeline_parallel.utils) _compare_dataclasses() (in module core.dist_checkpointing.strategies.state_dict_saver) _compare_floats() (in module core.rerun_state_machine) _compilation_error (in module core.inference.unified_memory) _compilation_state (in module core.inference.unified_memory) _compile_timeout() (in module core.inference.unified_memory) _compute_bias() (core.models.common.embeddings.relative_pos_embedding.RelativePositionEmbedding method) _compute_pid() (in module core.transformer.custom_layers.batch_invariant_kernels) _compute_shards_access() (in module core.dist_checkpointing.validation) _concat_embeddings() (core.transformer.multi_token_prediction.MultiTokenPredictionLayer method) _configure_global_num_microbatches_calculator() (in module core.num_microbatches_calculator) _configured (core.utils.StragglerDetector attribute) _connect_with_inference_coordinator() (core.inference.inference_client.InferenceClient method) _CONTENT_METADATA_KEY (in module core.dist_checkpointing.serialization) _CONTEXT_PARALLEL_GLOBAL_RANKS (in module core.parallel_state) _CONTEXT_PARALLEL_GROUP (in module core.parallel_state) _controller() (core.utils.StragglerDetector method) _convert_non_transformer_layer() (core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter.DistributedTRTLLMModelWeightsConverter method) (core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.SingleDeviceTRTLLMModelWeightsConverter method) _convert_to_moe_state_dict() (in module core.transformer.moe.upcycling_utils) _convert_transformer_layer() (core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter.DistributedTRTLLMModelWeightsConverter method) (core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.SingleDeviceTRTLLMModelWeightsConverter method) _copy_main_params_to_model_params() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) _copy_main_params_to_param_buffer() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _copy_model_grads_to_main_grads() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) _copy_model_params_to_main_params() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) _CopyToModelParallelRegion (class in core.tensor_parallel.mappings) _cpu_offloading_context (core.model_parallel_config.ModelParallelConfig attribute) _create_masked_lm_predictions() (core.datasets.masked_dataset.MaskedWordPieceDataset method) _create_packed_seq_idx() (core.ssm.mamba_mixer.MambaMixer method) _ctypes_lib (in module core.inference.unified_memory) _ctypes_lock (in module core.inference.unified_memory) _CUDA_RNG_STATE_TRACKER (in module core.tensor_parallel.random) _CUDA_RNG_STATE_TRACKER_INITIALIZED (in module core.tensor_parallel.random) _CudagraphGlobalRecord (class in core.transformer.cuda_graphs) _CudagraphRecordNode (class in core.transformer.cuda_graphs) _CudagraphReplayNode (class in core.transformer.cuda_graphs) _CudaGraphRunner (class in core.transformer.cuda_graphs) _DATA_PARALLEL_GLOBAL_RANKS (in module core.parallel_state) _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP (in module core.parallel_state) _DATA_PARALLEL_GROUP (in module core.parallel_state) _DATA_PARALLEL_GROUP_GLOO (in module core.parallel_state) _DATA_PARALLEL_GROUP_WITH_CP (in module core.parallel_state) _DATA_PARALLEL_GROUP_WITH_CP_AG (in module core.parallel_state) _DATA_PARALLEL_GROUP_WITH_CP_GLOO (in module core.parallel_state) _DATA_PARALLEL_RNG_TRACKER_NAME (in module core.tensor_parallel.random) _decode() (core.ssm.mamba_mixer.MambaMixer method) _DeepepManager (class in core.transformer.moe.token_dispatcher) _defer_loading_sharded_items() (in module core.dist_checkpointing.strategies.fully_parallel) _defer_loading_sharded_objects() (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper static method) _defer_loading_sharded_tensors() (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper static method) _detect_expert_index_from_param_name() (in module core.resharding.utils) _determine_if_first_last_layer_of_this_vp_chunk() (in module core.transformer.cuda_graphs) _determine_missing_and_unexpected_keys() (in module core.dist_checkpointing.validation) _determine_source_ranks_for_dst_param() (in module core.resharding.planner) _detokenize() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _disable_gc() (in module core.dist_checkpointing.strategies.async_utils) _DispatchManager (class in core.transformer.moe.token_dispatcher) _download() (core.tokenizers.text.libraries.megatron_hf_tokenizer.MegatronHFTokenizer method) _dtype_size() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) _dynamic_inference() (core.ssm.mamba_mixer.MambaMixer method) _dynamic_step_calculate_log_probs() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _dynamic_step_calculate_top_n_logprobs() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _dynamic_step_context_bookkeeping() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _dynamic_step_context_init() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _dynamic_step_forward_logits() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _dynamic_step_log_probs_bookkeeping() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _dynamic_step_sample_bookkeeping() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _dynamic_step_sample_logits() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _EMBEDDING_GLOBAL_RANKS (in module core.parallel_state) _EMBEDDING_GROUP (in module core.parallel_state) _encode_segment_id() (core.resharding.nvshmem_copy_service.planning.task_segmenter.TaskSegmenter method) _enforce_double_buffer_limit() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.GradReducePipeline method) _ensure_generator_state_is_cudagraph_safe() (in module core.transformer.cuda_graphs) _ensure_initialized() (core.resharding.copy_services.nvshmem_copy_service.NVSHMEMCopyService method) _ep_group_has_work() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) _ERROR_NAMES (core.rerun_state_machine.RerunErrorInjector attribute) _exchange_workload_summaries() (core.resharding.nvshmem_copy_service.planning.communication_scheduler.CommunicationScheduler method) _EXPERT_DATA_PARALLEL_GROUP (in module core.parallel_state) _EXPERT_DATA_PARALLEL_GROUP_GLOO (in module core.parallel_state) _EXPERT_MODEL_PARALLEL_GROUP (in module core.parallel_state) _EXPERT_MODEL_PARALLEL_RANKS (in module core.parallel_state) _EXPERT_PARALLEL_RNG_TRACKER_NAME (in module core.tensor_parallel.random) _EXPERT_TENSOR_AND_MODEL_PARALLEL_GROUP (in module core.parallel_state) _EXPERT_TENSOR_MODEL_PIPELINE_PARALLEL_GROUP (in module core.parallel_state) _EXPERT_TENSOR_PARALLEL_GROUP (in module core.parallel_state) _extract_common_per_param_step() (core.optimizer.optimizer.MegatronOptimizer static method) _extract_from_cache() (core.datasets.indexed_dataset._S3BinReader method) _extract_te_gemm_args() (in module core.transformer.custom_layers.batch_invariant_kernels) _fa_version (in module core.utils) _FeatureFlag (class in core.msc_utils) _FileBinReader (class in core.datasets.indexed_dataset) _fill_in_deferred_sharded_items() (in module core.dist_checkpointing.strategies.fully_parallel) _filter_and_reorder_param_groups() (core.optimizer.optimizer.MegatronOptimizer static method) _finalize_dp_transfers() (in module core.resharding.planner) _find_submodule() (in module core.transformer.moe.upcycling_utils) _finish_capturing() (core.transformer.cuda_graphs.TECudaGraphHelper method) _fix_tensor_parallel_attributes() (core.distributed.fsdp.mcore_fsdp_adapter.FullyShardedDataParallel method) _flash_attention_3_forward_wrapper() (core.transformer.attention.Attention method) _FLOAT_TYPES (in module core.transformer.module) _fork_rng() (in module core.tensor_parallel.random) _forward() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) (core.pipeline_parallel.utils.ScheduleNode method) _forward_attention() (core.transformer.transformer_layer.TransformerLayer method) _forward_impl() (core.tensor_parallel.layers.ColumnParallelLinear method) (core.tensor_parallel.layers.RowParallelLinear method) _forward_mlp() (core.transformer.transformer_layer.MoETransformerLayer method) (core.transformer.transformer_layer.TransformerLayer method) _forward_mlp_expert_compute() (core.transformer.transformer_layer.MoETransformerLayer method) _forward_mlp_postprocess() (core.transformer.transformer_layer.MoETransformerLayer method) _forward_mlp_router() (core.transformer.transformer_layer.MoETransformerLayer method) _forward_post_mlp() (core.transformer.transformer_layer.TransformerLayer method) _forward_pre_mlp_layernorm() (core.transformer.transformer_layer.TransformerLayer method) _fp8_create_transpose_cache_fallback() (in module core.distributed.fsdp.src.megatron_fsdp.mixed_precision) _fp8_quantize_fallback() (in module core.distributed.fsdp.src.megatron_fsdp.mixed_precision) _free_storage() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) _fsdp_modules (core.models.huggingface.clip_model.SiglipHuggingFaceModel attribute) (core.models.huggingface.qwen_model.QwenHuggingFaceModel attribute) _gather_along_first_dim() (in module core.tensor_parallel.mappings) _gather_along_last_dim() (in module core.tensor_parallel.mappings) _GatherFromModelParallelRegion (class in core.tensor_parallel.mappings) _GatherFromSequenceParallelRegion (class in core.tensor_parallel.mappings) _gen_rank_enum() (core.hyper_comm_grid.HyperCommGrid method) _get() (core.distributed.param_and_grad_buffer._ParamAndGradBuffer method) _get_all_ranks_time_string() (core.timers.Timers method) _get_all_rng_states() (in module core.tensor_parallel.random) _get_and_clear_stop_word_finished_ids() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) _get_async_caller() (core.dist_checkpointing.strategies.async_utils.AsyncCallsQueue method) _get_available_models_list() (core.tokenizers.text.libraries.megatron_hf_tokenizer.MegatronHFTokenizer method) _get_backend_spec_provider() (in module core.models.gpt.experimental_attention_variant_module_specs) _get_batch_size_and_seq_len() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) _get_block_submodules() (in module core.transformer.transformer_block) _get_config() (in module core.transformer.moe.upcycling_utils) _get_ctypes_lib() (in module core.inference.unified_memory) _get_cuda_graph_input_data() (core.transformer.cuda_graphs.TECudaGraphHelper method) _get_cuda_rng_state() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.tensor_parallel.random) _get_custom_recipe() (in module core.fp8_utils) _get_dense_mlp_module_spec() (in module core.models.gpt.experimental_attention_variant_module_specs) _get_distribution() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict static method) _get_dp_buffer_shard_bucket_index() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) _get_dp_tp_mesh() (in module core.distributed.fsdp.mcore_fsdp_adapter) _get_elapsed_time_all_ranks() (core.timers.Timers method) _get_embeddings() (core.transformer.multi_token_prediction.MultiTokenPredictionLayer method) _get_empty_tensor_for_exchange() (in module core.dist_checkpointing.exchange_utils) _get_energy() (core.energy_monitor.EnergyMonitor method) _get_extra_state_offsets() (in module core.transformer.utils) _get_extra_te_kwargs() (in module core.extensions.transformer_engine) _get_filesystem_reader() (in module core.dist_checkpointing.strategies.torch) _get_fp8_autocast_for_quant_params() (in module core.extensions.transformer_engine) _get_fp8_autocast_for_quant_recipe() (in module core.extensions.transformer_engine) _get_fp8_params_and_shard_fp32_from_fp8() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _get_fsdp_tensor_spec() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) _get_gbuf_name() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.FixedPoolAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.RotaryBucketAllocator method) _get_global_min_max_time() (core.timers.Timers method) _get_global_min_max_time_string() (core.timers.Timers method) _get_hsdp_tp_mesh() (in module core.distributed.fsdp.mcore_fsdp_adapter) _get_item_local_index() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) _get_item_local_shard_index() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) _get_item_slice_in_shard() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) _get_keys_endswith() (in module core.transformer.moe.upcycling_utils) _get_layer() (core.transformer.transformer_block.TransformerBlock method) _get_layer_offset() (core.transformer.transformer_layer.TransformerLayer static method) _get_ltor_masks_and_position_ids() (in module core.datasets.gpt_dataset) _get_main_grad_attr() (in module core.distributed.finalize_model_grads) _get_main_param_and_optimizer_states() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _get_megatron_optimizer_based_on_param_groups() (in module core.optimizer) _get_merges_file() (core.tokenizers.text.libraries.megatron_hf_tokenizer.MegatronHFTokenizer method) _get_metadata_path() (in module core.tokenizers.megatron_tokenizer) _get_mlp_module_spec() (in module core.models.gpt.gpt_layer_specs) (in module core.models.vision.vit_layer_specs) _get_model_and_main_params_data_float16() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) _get_model_param_range_map() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _get_moe_module_spec() (in module core.models.gpt.experimental_attention_variant_module_specs) _get_mtp_block_submodules() (in module core.transformer.multi_token_prediction) _get_num_epochs() (core.datasets.gpt_dataset.GPTDataset method) _get_num_tokens_per_epoch() (core.datasets.gpt_dataset.GPTDataset method) _get_param_groups() (core.optimizer.optimizer.MegatronOptimizer method) (in module core.optimizer) _get_param_groups_and_buffers() (in module core.optimizer) _get_parameter_groups() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) _get_pool_key() (core.pipeline_parallel.fine_grained_activation_offload.GPUTensorPool method) _get_pos_embeddings() (core.models.vision.radio.RADIOViTModel method) _get_position_embedding_weight() (in module core.distributed.finalize_model_grads) _get_pp_layer_offset_for_inference() (core.transformer.attention.Attention method) _get_rank_in_group() (in module core.resharding.utils) _get_remove_vocab_padding() (core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter.DistributedTRTLLMModelWeightsConverter method) _get_rng_state_dict() (in module core.distributed.fsdp.mcore_fsdp_adapter) _get_sample_arguments() (core.transformer.cuda_graphs.TECudaGraphHelper method) _get_save_and_finalize_callbacks() (core.dist_checkpointing.strategies.torch.TorchDistSaveShardedStrategy method) _get_self_attention_module_spec() (in module core.models.gpt.experimental_attention_variant_module_specs) _get_shared_word_embedding_weight() (in module core.distributed.finalize_model_grads) _get_should_context_be_quantized_params() (in module core.extensions.transformer_engine) _get_should_context_be_quantized_recipe() (in module core.extensions.transformer_engine) _get_size_per_split_per_dataset() (in module core.datasets.blended_megatron_dataset_builder) _get_state() (core.optimizer.optimizer.MegatronOptimizer method) _get_states_from_cache() (core.ssm.mamba_mixer.MambaMixer method) _get_sub_optimizer_param_groups() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _get_submodules_under_cudagraphs() (core.transformer.module.GraphableMegatronModule method) (core.transformer.transformer_layer.TransformerLayer method) _get_te_cuda_graph_replay_args() (core.transformer.module.GraphableMegatronModule method) (core.transformer.transformer_layer.TransformerLayer method) _get_thd_freqs_on_this_cp_rank() (in module core.models.common.embeddings.rope_utils) _get_thd_token_idx() (in module core.fusions.fused_mla_yarn_rope_apply) _get_token_mask() (core.datasets.bert_dataset.BERTMaskedWordPieceDataset method) (core.datasets.masked_dataset.MaskedWordPieceDataset method) (core.datasets.t5_dataset.T5MaskedWordPieceDataset method) _get_trtllm_config() (core.export.trtllm.trtllm_helper.TRTLLMHelper method) _get_trtllm_pretrained_config_and_model_weights_in_distributed_setting() (core.export.trtllm.trtllm_helper.TRTLLMHelper method) _get_trtllm_pretrained_config_and_model_weights_list_on_single_device() (core.export.trtllm.trtllm_helper.TRTLLMHelper method) _get_validation_call_info() (core.rerun_state_machine.RerunStateMachine method) _get_vocab_file() (core.tokenizers.text.libraries.megatron_hf_tokenizer.MegatronHFTokenizer method) _get_write_results_queue() (in module core.dist_checkpointing.strategies.filesystem_async) _gid_to_src_rank() (core.datasets.data_schedule.HybridCPDataLoaderWrapper method) _GLOBAL_MEMORY_BUFFER (in module core.parallel_state) _GLOBAL_NUM_MICROBATCHES_CALCULATOR (in module core.num_microbatches_calculator) _global_process_group_list (in module core.parallel_state) _GLOBAL_RERUN_STATE_MACHINE (in module core.rerun_state_machine) _GLOBAL_SYMMETRIC_MEMORY_BUFFER (in module core.parallel_state) _GlobalMetadata (in module core.dist_checkpointing.validation) _grad_accum_fusion_available (in module core.tensor_parallel.layers) _GraphStatus (class in core.transformer.cuda_graphs) _HALF_TYPES (in module core.transformer.module) _handler() (core.utils.StragglerDetector method) _has_conflict() (core.resharding.nvshmem_copy_service.planning.communication_scheduler.CommunicationScheduler method) _HIERARCHICAL_CONTEXT_PARALLEL_GROUPS (in module core.parallel_state) _HYBRID_DP_CP_GROUPS (in module core.parallel_state) _hybrid_ep_buffer (in module core.transformer.moe.fused_a2a) _HybridEPManager (class in core.transformer.moe.token_dispatcher) _import_class_from_path() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) _import_module_if_available() (in module core.transformer.custom_layers.batch_invariant_kernels) _INDEX_HEADER (in module core.datasets.indexed_dataset) _IndexReader (class in core.datasets.indexed_dataset) _IndexWriter (class in core.datasets.indexed_dataset) _indices_to_multihot() (core.transformer.moe.token_dispatcher._DeepepManager method) _indices_to_multihot_kernel() (in module core.fusions.fused_indices_converter) _init_dist_index() (core.distributed.fsdp.mcore_fsdp_adapter.FullyShardedDataParallel method) _init_distributed_params() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) _init_dynamic_sampling_tensors() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _init_each_parameter_group_buffers() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) _init_fsdp_param_and_grad_buffer() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) _init_moe_expert_cache() (in module core.inference.utils) _init_optimizer_named_parameters() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) _init_optimizer_states_with_dummy_values() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _init_sequence_parallel_cache() (in module core.transformer.utils) _init_sub_optimizers() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _initialize_affine_weight_cpu() (in module core.tensor_parallel.layers) _initialize_affine_weight_gpu() (in module core.tensor_parallel.layers) _initialize_language_model() (core.models.mimo.model.base.MimoModel method) _initialize_metadata() (core.transformer.moe.token_dispatcher.MoEFlexTokenDispatcher method) _initialize_submodules() (core.models.mimo.model.base.MimoModel method) _insert_sharded_data() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict method) _INTER_PARTIAL_EXPERT_DATA_PARALLEL_GROUP (in module core.parallel_state) _intersection() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) _INTRA_DISTRIBUTED_OPTIMIZER_INSTANCE_GROUP (in module core.parallel_state) _INTRA_PARTIAL_DATA_PARALLEL_GROUP_WITH_CP (in module core.parallel_state) _INTRA_PARTIAL_DATA_PARALLEL_GROUP_WITH_CP_GLOO (in module core.parallel_state) _INTRA_PARTIAL_EXPERT_DATA_PARALLEL_GROUP (in module core.parallel_state) _INTRA_PARTIAL_EXPERT_DATA_PARALLEL_GROUP_GLOO (in module core.parallel_state) _is_cuda() (in module core.inference.communication_utils) _is_cuda_contiguous() (in module core.inference.communication_utils) _IS_GRAPH_CAPTURING (in module core.transformer.cuda_graphs) _IS_GRAPH_WARMUP (in module core.transformer.cuda_graphs) _is_hollow (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict attribute) _is_in_embd_group() (core.models.common.language_module.language_module.LanguageModule method) _is_msc_path() (in module core.datasets.object_storage_utils) _is_raisable() (core.inference.async_stream.AsyncStream static method) _is_s3_path() (in module core.datasets.object_storage_utils) _is_sink_module() (core.pipeline_parallel.multimodule_communicator.MultiModulePipelineCommunicator method) _is_source_module() (core.pipeline_parallel.multimodule_communicator.MultiModulePipelineCommunicator method) _is_supported_dtype_for_bik() (in module core.transformer.custom_layers.batch_invariant_kernels) _is_two_bucket_group_equal() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.FixedPoolAllocator method) _item_size() (in module core.dist_checkpointing.strategies.filesystem_async) _kernel_make_viewless_tensor() (in module core.utils) _key_config_attributes() (core.datasets.bert_dataset.BERTMaskedWordPieceDataset static method) (core.datasets.masked_dataset.MaskedWordPieceDataset static method) (core.datasets.megatron_dataset.MegatronDataset static method) (core.datasets.t5_dataset.T5MaskedWordPieceDataset static method) _launch_pack() (core.resharding.nvshmem_copy_service.core.pipeline_executor.PipelineExecutor method) _launch_unpack() (core.resharding.nvshmem_copy_service.core.pipeline_executor.PipelineExecutor method) _layer_counts_match() (in module core.ssm.mamba_hybrid_layer_allocation) _layer_is_graphable() (in module core.transformer.cuda_graphs) _level (core.resharding.nvshmem_copy_service.logger.PELogger attribute) _load_from_state_dict() (core.models.common.embeddings.rotary_pos_embedding.RotaryEmbedding method) (core.transformer.moe.router.TopKRouter method) _load_rng_state_dict() (in module core.distributed.fsdp.mcore_fsdp_adapter) _load_scaling_factors() (core.export.trtllm.trtllm_helper.TRTLLMHelper method) _load_state_dict_hook_ignore_extra_state() (in module core.models.multimodal.llava_model) _load_state_dict_hook_ignore_param_names() (in module core.models.multimodal.llava_model) _LocalMetadata (in module core.dist_checkpointing.validation) _log_parameter_groups() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) _log_softmax_batch_invariant() (in module core.transformer.custom_layers.batch_invariant_kernels) _log_softmax_kernel() (in module core.transformer.custom_layers.batch_invariant_kernels) _log_validation_error_to_file() (core.rerun_state_machine.RerunStateMachine method) _logged_deprecations (in module core.dist_checkpointing.mapping) _logger (core.resharding.nvshmem_copy_service.logger.PELogger attribute) _LOGGER (in module core.transformer.custom_layers.batch_invariant_kernels) _maintain_float32_expert_bias() (core.transformer.moe.router.TopKRouter method) _make_backward_post_hook() (core.distributed.distributed_data_parallel.DistributedDataParallel method) _make_forward_pre_hook() (core.distributed.distributed_data_parallel.DistributedDataParallel method) _mamba_ssm_version (in module core.utils) _matmul_launch_metadata() (in module core.transformer.custom_layers.batch_invariant_kernels) _matmul_reduce_scatter() (core.tensor_parallel.inference_layers.InferenceRowParallelLinear method) _MAX_DATA_DIM (in module core.tensor_parallel.data) _maybe_allocate_symmetric_buffer() (core.tensor_parallel.inference_layers.InferenceLayerNormColumnParallelLinear method) _maybe_dtoh_and_synchronize() (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) _maybe_report_stats() (core.rerun_state_machine.RerunStateMachine method) _maybe_update_cuda_sync_point() (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) _MEG_TE_GENERAL_GEMM_ORIG (in module core.transformer.custom_layers.batch_invariant_kernels) _metadata_fn (in module core.dist_checkpointing.strategies.torch) _min_max() (core.utils.StragglerDetector method) _MMapBinReader (class in core.datasets.indexed_dataset) _mod (in module core.inference.unified_memory) _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS (in module core.tensor_parallel.layers) _MODEL_PARALLEL_GLOBAL_RANKS (in module core.parallel_state) _MODEL_PARALLEL_GROUP (in module core.parallel_state) _MODEL_PARALLEL_RNG_TRACKER_NAME (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.tensor_parallel.random) _Module (class in core.typed_torch) _MOE_LAYER_WISE_LOGGING_TRACKER (in module core.transformer.moe.moe_utils) _move_book_keeping_tensors() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) _move_new_state_to_right_device() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _MPU_DATA_PARALLEL_RANK (in module core.parallel_state) _MPU_DATA_PARALLEL_WORLD_SIZE (in module core.parallel_state) _MPU_EXPERT_MODEL_PARALLEL_RANK (in module core.parallel_state) _MPU_EXPERT_MODEL_PARALLEL_WORLD_SIZE (in module core.parallel_state) _MPU_EXPERT_TENSOR_PARALLEL_RANK (in module core.parallel_state) _MPU_EXPERT_TENSOR_PARALLEL_WORLD_SIZE (in module core.parallel_state) _MPU_PIPELINE_MODEL_PARALLEL_RANK (in module core.parallel_state) _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE (in module core.parallel_state) _MPU_TENSOR_MODEL_PARALLEL_RANK (in module core.parallel_state) _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE (in module core.parallel_state) _multi_tensor_copy_this_to_that() (in module core.optimizer.optimizer) _multihot_to_indices_kernel() (in module core.fusions.fused_indices_converter) _MultiStorageClientBinReader (class in core.datasets.indexed_dataset) _new_bucket() (core.distributed.param_and_grad_buffer._ParamAndGradBuffer method) _norm() (core.transformer.torch_norm.L2Norm method) _notify_cond_for_new_request() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) _nvtx_decorator_get_func_path() (in module core.utils) _nvtx_enabled (in module core.utils) _nvtx_range_get_func_path() (in module core.utils) _nvtx_range_messages (in module core.utils) _offset_slice() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) _order_dims() (core.hyper_comm_grid.HyperCommGrid method) _p2p_ops() (in module core.pipeline_parallel.p2p_communication) _p_assert() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) _pack_single_destination() (core.resharding.nvshmem_copy_service.planning.workload_packer.WorkloadPacker method) _pad() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) _pad_routing_map() (core.transformer.moe.token_dispatcher._DeepepManager method) _pad_routing_map_kernel() (in module core.fusions.fused_pad_routing_map) _pad_tensor_for_quantization() (core.transformer.moe.experts.SequentialMLP method) _PAD_TOKEN_ID (in module core.datasets.megatron_dataset) _param2group_meta_to_param_groups() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _param_generator() (in module core.optimizer.cpu_offloading.hybrid_optimizer) _param_groups_to_param2group_meta() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _param_name() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _ParamAndGradBucket (class in core.distributed.param_and_grad_buffer) _ParamAndGradBucketGroup (class in core.distributed.param_and_grad_buffer) _ParamAndGradBuffer (class in core.distributed.param_and_grad_buffer) _pe_id (core.resharding.nvshmem_copy_service.logger.PELogger attribute) _PIPELINE_GLOBAL_RANKS (in module core.parallel_state) _PIPELINE_MODEL_PARALLEL_GROUP (in module core.parallel_state) _plan_kernel_args() (core.resharding.nvshmem_copy_service.planning.gpu_execution_planner.GPUExecutionPlanner method) _plan_multi_dim_lcm() (in module core.resharding.planner) _POSITION_EMBEDDING_GLOBAL_RANKS (in module core.parallel_state) _POSITION_EMBEDDING_GROUP (in module core.parallel_state) _post_deserialize() (core.inference.inference_request.DynamicInferenceRequest method) (core.inference.inference_request.InferenceRequest method) _postprocess() (core.models.gpt.gpt_model.GPTModel method) (core.transformer.multi_token_prediction.MultiTokenPredictionLayer method) _prepare_iter_schedules() (core.resharding.nvshmem_copy_service.service.RemoteCopyService method) _preprocess() (core.models.gpt.gpt_model.GPTModel method) _preprocess_data() (core.models.multimodal.llava_model.LLaVAModel method) _process_embedding_token_parallel() (core.models.multimodal.llava_model.LLaVAModel method) _process_memory() (in module core.dist_checkpointing.strategies.filesystem_async) _proj_and_transformer_layer() (core.transformer.multi_token_prediction.MultiTokenPredictionLayer method) _query_document_sample_shuffle_indices() (core.datasets.gpt_dataset.GPTDataset method) _recompute() (core.tensor_parallel.random.CheckpointWithoutOutput method) _recv_task() (core.inference.inference_client.InferenceClient method) _redo_attention_load_balancing() (in module core.ssm.mamba_context_parallel) _reduce() (in module core.tensor_parallel.mappings) _reduce_any() (core.rerun_state_machine.RerunStateMachine method) _reduce_scatter_along_first_dim() (in module core.tensor_parallel.mappings) _reduce_scatter_along_last_dim() (in module core.tensor_parallel.mappings) _ReduceFromModelParallelRegion (class in core.tensor_parallel.mappings) _ReduceScatterToSequenceParallelRegion (class in core.tensor_parallel.mappings) _ReduceScatterToTensorParallelRegion (class in core.tensor_parallel.mappings) _ReduceScatterWithFP32AccumulationWorkHandle (class in core.distributed.reduce_scatter_with_fp32_accumulation) _reestablish_shared_weights() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) _register_fsdp_hooks() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) _register_load_state_dict_hooks() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _register_param_copy_back_gpu_hook() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _relative_position_bucket() (core.models.common.embeddings.relative_pos_embedding.RelativePositionEmbedding method) _release_state() (core.pipeline_parallel.utils.ScheduleNode method) _remove_msc_prefix() (in module core.datasets.object_storage_utils) _remove_redundant_data() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict static method) _remove_s3_prefix() (in module core.datasets.object_storage_utils) _replace_module_parameter() (in module core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp) _replace_param_with_distributed_if_needed() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) _replace_param_with_raw_if_needed() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) _replace_sharded_keys_with_state_dict_keys() (in module core.dist_checkpointing.strategies.torch) _replace_state_dict_keys_with_sharded_keys() (in module core.dist_checkpointing.strategies.torch) _report_quantize_tensor_info() (core.post_training.modelopt.layers.RealQuantTransformerLayer method) _reset_parameters() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) _reshard_if_dtensor() (in module core.distributed.finalize_model_grads) _resolve_callable_from_python_import_path() (in module core.fp8_utils) _resolve_global_layer_number_in_name() (in module core.resharding.utils) _restore_common_per_param_step() (core.optimizer.optimizer.MegatronOptimizer static method) _restore_dict_types() (in module core.dist_checkpointing.strategies.torch) _restore_model() (core.tokenizers.text.text_tokenizer.MegatronTokenizerText method) _restore_state() (core.rerun_state_machine.RerunStateMachine method) _results_queue (in module core.dist_checkpointing.strategies.filesystem_async) _roll_tensor_packed_seq() (in module core.transformer.multi_token_prediction) _rotate_half() (in module core.models.common.embeddings.rope_utils) _round() (in module core.num_microbatches_calculator) _run_coroutine_sync() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) _s3_download_file() (in module core.datasets.object_storage_utils) _s3_object_exists() (in module core.datasets.object_storage_utils) _S3BinReader (class in core.datasets.indexed_dataset) _sanitize_data_iterators() (core.rerun_state_machine.RerunStateMachine method) _sanity_check_attention_and_get_attn_mask_dimension() (core.models.bert.bert_model.BertModel method) _save_state() (core.rerun_state_machine.RerunStateMachine method) _save_to_state_dict() (core.transformer.moe.router.TopKRouter method) _ScatterToModelParallelRegion (class in core.tensor_parallel.mappings) _ScatterToSequenceParallelRegion (class in core.tensor_parallel.mappings) _segment_tasks() (core.resharding.nvshmem_copy_service.service.RemoteCopyService method) _select_layers_for_pipeline_parallel() (core.ssm.mamba_block.MambaStack method) _send_signal_to_engines() (core.inference.inference_client.InferenceClient method) _sequence_parallel_attr_cache (in module core.transformer.utils) _sequence_pointers() (core.datasets.indexed_dataset._IndexWriter method) _service_cache (in module core.resharding.refit) _set_all_rng_states() (in module core.tensor_parallel.random) _set_attention_backend() (core.models.common.language_module.language_module.LanguageModule method) _set_capture_end() (in module core.transformer.cuda_graphs) _set_capture_start() (in module core.transformer.cuda_graphs) _set_checkpointing() (in module core.tensor_parallel.random) _set_cos_sin_cache() (core.models.common.embeddings.yarn_rotary_pos_embedding.YarnRotaryEmbedding method) _set_cuda_rng_state() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.tensor_parallel.random) _set_fc2_next_layer_norm_weights() (core.transformer.transformer_layer.TransformerLayer method) _set_fc2_residual() (core.transformer.transformer_layer.TransformerLayer method) _set_global_memory_buffer() (in module core.parallel_state) _set_global_symmetric_memory_buffer() (in module core.parallel_state) _set_main_param_and_optimizer_states() (core.optimizer.distrib_optimizer.DistributedOptimizer method) _set_next_layer_norm_weights() (core.tensor_parallel.inference_layers.InferenceRowParallelLinear method) _set_param_groups() (core.optimizer.optimizer.MegatronOptimizer method) _set_proj_next_layer_norm_weights() (core.transformer.transformer_layer.TransformerLayer method) _set_proj_residual() (core.transformer.transformer_layer.TransformerLayer method) _set_rerun_state_machine() (in module core.rerun_state_machine) _set_residual() (core.tensor_parallel.inference_layers.InferenceRowParallelLinear method) _set_state() (core.optimizer.optimizer.MegatronOptimizer method) _set_sub_optimizer_grads() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _set_warmup_end() (in module core.transformer.cuda_graphs) _set_warmup_start() (in module core.transformer.cuda_graphs) _setup_fused_tp_communication() (core.transformer.transformer_block.TransformerBlock method) _shard_size() (in module core.dist_checkpointing.exchange_utils) _sharded_object_id() (in module core.dist_checkpointing.utils) _sharded_tensor_shard_id() (in module core.dist_checkpointing.utils) _sharded_tensors (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict property) _ShardId (in module core.dist_checkpointing.utils) _should_call_local_cudagraph() (core.ssm.mamba_layer.MambaLayer method) (core.transformer.module.GraphableMegatronModule method) (core.transformer.transformer_block.TransformerBlock method) (core.transformer.transformer_layer.TransformerLayer method) _should_call_te_cudagraph() (core.transformer.module.GraphableMegatronModule method) _slice_conv_param() (core.ssm.mamba_context_parallel.MambaContextParallel method) _slice_vector_param() (core.ssm.mamba_context_parallel.MambaContextParallel method) _so_path (in module core.inference.unified_memory) _split_along_first_dim() (in module core.tensor_parallel.mappings) _split_along_last_dim() (in module core.tensor_parallel.mappings) _split_by_separation_hint() (in module core.dist_checkpointing.strategies.filesystem_async) _split_by_size_and_type() (in module core.dist_checkpointing.strategies.filesystem_async) _split_state_dict() (core.optimizer.optimizer.ChainedOptimizer method) _split_tensor_at_batch_dim() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) _split_tensor_factory() (in module core.ssm.gated_delta_net) (in module core.ssm.mamba_mixer) _squared_relu_back() (in module core.fusions.fused_weighted_squared_relu) _ssm_decode() (core.ssm.mamba_mixer.MambaMixer method) _ssm_prefill() (core.ssm.mamba_mixer.MambaMixer method) _ssm_training() (core.ssm.mamba_mixer.MambaMixer method) _start_capturing() (core.transformer.cuda_graphs.TECudaGraphHelper method) _StragglerData (class in core.utils) _swap_book_keeping_tensors() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) _sync_hdo_param_groups_to_sub_optimizers() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _sync_hdo_state_to_sub_optimizers() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _sync_prompt_logprobs_fields() (core.inference.sampling_params.SamplingParams method) _sync_sub_optimizers_state_to_hdo() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _synchronize_steps() (core.optimizer.optimizer.ChainedOptimizer method) _TE_CONFIG_TYPE_KEY (in module core.extensions.transformer_engine) _te_cuda_graph_backward_dw_graph() (core.transformer.module.GraphableMegatronModule method) _te_cuda_graph_capture() (core.transformer.module.GraphableMegatronModule method) (core.transformer.transformer_layer.TransformerLayer method) _te_cuda_graph_replay() (core.ssm.mamba_layer.MambaLayer method) (core.transformer.module.GraphableMegatronModule method) (core.transformer.transformer_layer.TransformerLayer method) _TE_GEMM_FUNC_ORIGS (in module core.transformer.custom_layers.batch_invariant_kernels) _TE_GENERAL_GEMM_ORIG (in module core.transformer.custom_layers.batch_invariant_kernels) _te_general_gemm_patched() (in module core.transformer.custom_layers.batch_invariant_kernels) _te_patch_for_batch_invariant() (in module core.transformer.custom_layers.batch_invariant_kernels) _te_rms_norm_kernel() (in module core.tensor_parallel.inference_layers) _te_rmsnorm_forward_patched() (in module core.transformer.custom_layers.batch_invariant_kernels) _TE_RMSNORM_FUNC_ORIGS (in module core.transformer.custom_layers.batch_invariant_kernels) _TE_RMSNORM_ORIG_FWD (in module core.transformer.custom_layers.batch_invariant_kernels) _te_unpatch_for_batch_invariant() (in module core.transformer.custom_layers.batch_invariant_kernels) _te_version (in module core.utils) _temporarily_bypass_shape_validation() (core.dist_checkpointing.strategies.torch.MCoreLoadPlanner method) _TENSOR_AND_CONTEXT_PARALLEL_GROUP (in module core.parallel_state) _TENSOR_AND_DATA_PARALLEL_GROUP (in module core.parallel_state) _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP (in module core.parallel_state) _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS (in module core.parallel_state) _TENSOR_MODEL_PARALLEL_GROUP (in module core.parallel_state) _text_to_ids() (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) _text_to_ids_extra_space() (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) _tokenize_prompts_and_batch() (in module core.inference.text_generation_server.dynamic_text_gen_server.tokenization) (in module core.inference.text_generation_server.tokenization) _torch_sampling_func() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _undo_attention_load_balancing() (in module core.ssm.mamba_context_parallel) _unscale_main_grads_and_check_for_nan() (core.optimizer.optimizer.MixedPrecisionOptimizer method) _unset_checkpointing() (in module core.tensor_parallel.random) _unshard_if_dtensor() (in module core.distributed.finalize_model_grads) _unwrap_pyt_sharded_tensor() (in module core.dist_checkpointing.strategies.torch) _update_fp32_params_by_new_state() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) _update_legacy_world_tensors() (core.optimizer.distrib_optimizer.DistributedOptimizer class method) _update_router_expert_bias() (in module core.distributed.finalize_model_grads) _update_top_n_logprobs_dict() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) _validate_common_state_dict() (in module core.dist_checkpointing.validation) _validate_global_shapes() (core.dist_checkpointing.strategies.torch.MCoreLoadPlanner method) _validate_objects_for_key() (in module core.dist_checkpointing.validation) _validate_params() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict static method) _validate_segmentation() (core.resharding.nvshmem_copy_service.planning.task_segmenter.TaskSegmenter method) _validate_sharding_for_key() (in module core.dist_checkpointing.validation) _ValueWithRank (class in core.utils) _VERBOSE (in module core.datasets.blended_dataset) _version_no_greater_than() (in module core.ssm.triton_cache_manager) _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK (in module core.parallel_state) _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE (in module core.parallel_state) _VocabParallelCrossEntropy (class in core.fusions.fused_cross_entropy) (class in core.tensor_parallel.cross_entropy) _wrapped_run_engine() (core.inference.engines.static_engine.StaticInferenceEngine method) _yarn_find_correction_dim() (in module core.models.common.embeddings.yarn_rotary_pos_embedding) _yarn_find_correction_range() (in module core.models.common.embeddings.yarn_rotary_pos_embedding) _yarn_get_concentration_factor() (in module core.models.common.embeddings.yarn_rotary_pos_embedding) _yarn_get_concentration_factor_from_config() (in module core.models.common.embeddings.yarn_rotary_pos_embedding) _yarn_get_mscale() (in module core.models.common.embeddings.yarn_rotary_pos_embedding) _yarn_linear_ramp_mask() (in module core.models.common.embeddings.yarn_rotary_pos_embedding) _zero_grad_group_helper() (in module core.optimizer.optimizer) A abort_request() (core.inference.scheduler.Scheduler method) AbstractEngine (class in core.inference.engines.abstract_engine) AbstractModelInferenceWrapper (class in core.inference.model_inference_wrappers.abstract_model_inference_wrapper) AbstractSchedulePlan (class in core.pipeline_parallel.utils) account_for_embedding_in_pipeline_split (core.transformer.transformer_config.TransformerConfig attribute) account_for_loss_in_pipeline_split (core.transformer.transformer_config.TransformerConfig attribute) activation_func (core.transformer.mlp.MLPSubmodules attribute) (core.transformer.transformer_config.TransformerConfig attribute) activation_func() (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.BackendSpecProvider method) (core.models.backends.InferenceSpecProvider method) (core.models.backends.LocalSpecProvider method) activation_func_clamp_value (core.transformer.transformer_config.TransformerConfig attribute) activation_func_fp8_input_store (core.transformer.transformer_config.TransformerConfig attribute) ActivationFuncName (in module core.transformer.moe.upcycling_utils) ACTIVE_AND_GENERATING_TOKENS (core.inference.inference_request.Status attribute) ACTIVE_BUT_NOT_GENERATING_TOKENS (core.inference.inference_request.Status attribute) active_time() (core.timers.DummyTimer method) (core.timers.Timer method) ActiveRequestCountOverflowError adam_beta1 (core.optimizer.optimizer_config.AdamOptimizerConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) adam_beta2 (core.optimizer.optimizer_config.AdamOptimizerConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) adam_eps (core.optimizer.optimizer_config.AdamOptimizerConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) AdamOptimizerConfig (class in core.optimizer.optimizer_config) ADD (core.inference.inference_request.DynamicInferenceEventType attribute) add() (core.extensions.transformer_engine.TECudaRNGStatesTracker method) (core.tensor_parallel.random.CudaRNGStatesTracker method) add_attributes() (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig method) (core.inference.sampling_params.SamplingParams method) add_bias_linear (core.transformer.transformer_config.TransformerConfig attribute) add_BOS (core.inference.sampling_params.SamplingParams attribute) add_document() (core.datasets.indexed_dataset.IndexedDatasetBuilder method) add_dummy_requests_for_cudagraph_capture() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) add_dummy_requests_parallel() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) add_earliest_waiting_request_to_active_pool() (core.inference.scheduler.Scheduler method) add_event() (core.inference.inference_request.DynamicInferenceRequest method) add_event_add() (core.inference.inference_request.DynamicInferenceRequest method) add_event_error_nontransient() (core.inference.inference_request.DynamicInferenceRequest method) add_event_error_transient() (core.inference.inference_request.DynamicInferenceRequest method) add_event_evict() (core.inference.inference_request.DynamicInferenceRequest method) add_event_fail() (core.inference.inference_request.DynamicInferenceRequest method) add_event_finish() (core.inference.inference_request.DynamicInferenceRequest method) add_event_pause() (core.inference.inference_request.DynamicInferenceRequest method) add_extra_token_to_sequence (core.datasets.gpt_dataset.GPTDatasetConfig attribute) add_finalize_fn() (core.dist_checkpointing.strategies.async_utils.AsyncRequest method) add_index() (core.datasets.indexed_dataset.IndexedDatasetBuilder method) add_item() (core.datasets.indexed_dataset.IndexedDatasetBuilder method) add_prefix_for_sharding() (in module core.dist_checkpointing.utils) add_qkv_bias (core.transformer.transformer_config.TransformerConfig attribute) add_request() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) (core.inference.engines.static_engine.StaticInferenceEngine method) (core.inference.inference_client.InferenceClient method) (core.inference.scheduler.Scheduler method) add_special_tokens() (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract method) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText method) additional_special_tokens_ids (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) addmm_batch_invariant() (in module core.transformer.custom_layers.batch_invariant_kernels) adjust_batch_dims_for_expert_parallelism() (core.inference.batch_dimensions_utils.InferenceBatchDimensions static method) adjust_non_strict_load() (in module core.dist_checkpointing.validation) advance() (core.rerun_state_machine.RerunDataIterator method) advise_managed_module_parameters_preferred_location() (in module core.inference.unified_memory) advise_managed_tensor_accessed_by() (in module core.inference.unified_memory) advise_managed_tensor_preferred_location() (in module core.inference.unified_memory) aflops (core.utils._StragglerData attribute) align_embeddings_by_token_positions() (core.models.mimo.model.base.MimoModel method) align_param_gather (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) all_gather_and_wait_parameters_ready() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) all_gather_last_dim_from_tensor_parallel_region() (in module core.tensor_parallel.mappings) all_gather_parameters() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) all_gather_params() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline method) all_passed (core.resharding.nvshmem_copy_service.validation.ValidationSummary property) all_ranks_for_shard (core.dist_checkpointing.exchange_utils.ShardDistribution attribute) all_reduce_gradients() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) all_reduce_max() (core.inference.engines.async_zmq_communicator.AsyncZMQCommunicator method) all_to_all() (in module core.tensor_parallel.mappings) all_to_all_hp2sp() (in module core.tensor_parallel.mappings) all_to_all_sp2hp() (in module core.tensor_parallel.mappings) allgather_params() (core.optimizer.layer_wise_optimizer.LayerWiseDistributedOptimizer method) AllGatherPipeline (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) allocate() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.FixedPoolAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.RotaryBucketAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.StorageResizeBasedBucketAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.TemporaryBucketAllocator method) (core.pipeline_parallel.fine_grained_activation_offload.GPUTensorPool method) (core.resharding.nvshmem_copy_service.memory.double_buffer_manager.DoubleBufferManager method) allocate_all_tensors() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) allocate_layers() (in module core.ssm.mamba_hybrid_layer_allocation) allocate_memory_blocks() (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) allow_ambiguous_pad_tokens (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) allow_shape_mismatch (core.dist_checkpointing.mapping.ShardedTensor attribute) append_key_value_cache() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) apply() (core.fusions.fused_bias_gelu.GeLUFunction class method) apply_chat_template() (core.tokenizers.base_tokenizer.MegatronTokenizerBase method) (core.tokenizers.text.libraries.chat_template.MegatronTokenizerChatTemplate method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText method) apply_cudagraph_record_metadata() (core.transformer.cuda_graphs._CudaGraphRunner method) apply_factories() (in module core.dist_checkpointing.mapping) apply_factory_merges() (in module core.dist_checkpointing.mapping) apply_fused_qk_rotary_emb() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) apply_input_jitter() (core.transformer.moe.router.TopKRouter method) apply_loading_parallelization() (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper method) apply_module() (in module core.typed_torch) apply_pos_enc() (core.models.vision.radio.RADIOViTModel method) apply_prefix_mapping() (in module core.dist_checkpointing.utils) apply_query_key_layer_scaling (core.transformer.transformer_config.TransformerConfig attribute) apply_random_logits() (in module core.transformer.moe.moe_utils) apply_residual_connection_post_layernorm (core.transformer.transformer_config.TransformerConfig attribute) apply_rope_fusion (core.transformer.transformer_config.TransformerConfig attribute) apply_rotary_emb_key() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) apply_rotary_emb_query() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) apply_rotary_pos_emb() (in module core.models.common.embeddings.rope_utils) apply_rotary_pos_emb_with_cos_sin() (in module core.models.common.embeddings.rope_utils) apply_router_token_dropping() (in module core.transformer.moe.moe_utils) apply_saving_parallelization() (core.dist_checkpointing.strategies.fully_parallel.FullyParallelSaveStrategyWrapper method) apply_swiglu_sharded_factory() (in module core.transformer.mlp) apply_wd_to_qk_layernorm (core.optimizer.optimizer_config.OptimizerConfig attribute) apply_z_loss() (core.transformer.moe.router.TopKRouter method) ApplyMLARotaryEmbKV (class in core.fusions.fused_mla_yarn_rope_apply) ApplyMLARotaryEmbQ (class in core.fusions.fused_mla_yarn_rope_apply) arbitrary (core.transformer.enums.AttnMaskType attribute) ArgMetadata (class in core.transformer.cuda_graphs) arrival_time (core.inference.inference_request.InferenceRequest attribute) assert_grouped_gemm_is_available() (in module core.transformer.moe.grouped_gemm_util) assert_viewless_tensor() (in module core.utils) assign_ep_resolved_name_inplace() (in module core.resharding.utils) assign_resolved_name_inplace() (in module core.resharding.utils) ASSUME_OK_UNEXPECTED (core.dist_checkpointing.validation.StrictHandling attribute) async_bookkeep() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) async_bucket_gather() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline method) async_caller (core.dist_checkpointing.strategies.async_utils._ActiveAsyncRequest attribute) async_calls (in module core.dist_checkpointing.strategies.base) async_fn (core.dist_checkpointing.strategies.async_utils.AsyncRequest attribute) async_fn_args (core.dist_checkpointing.strategies.async_utils.AsyncRequest attribute) async_fn_kwargs (core.dist_checkpointing.strategies.async_utils.AsyncRequest attribute) async_forward() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) async_generate_output_tokens_dynamic_batch() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) async_loop() (core.dist_checkpointing.strategies.async_utils.PersistentAsyncCaller static method) async_request (core.dist_checkpointing.strategies.async_utils._ActiveAsyncRequest attribute) async_save() (core.dist_checkpointing.strategies.base.AsyncSaveShardedStrategy method) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelSaveStrategyWrapper method) (core.dist_checkpointing.strategies.torch.TorchDistSaveShardedStrategy method) async_step() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) async_tensor_model_parallel_allreduce (core.model_parallel_config.ModelParallelConfig attribute) AsyncCaller (class in core.dist_checkpointing.strategies.async_utils) AsyncCallsQueue (class in core.dist_checkpointing.strategies.async_utils) AsyncRequest (class in core.dist_checkpointing.strategies.async_utils) AsyncSaveShardedStrategy (class in core.dist_checkpointing.strategies.base) AsyncStream (class in core.inference.async_stream) AsyncZMQCommunicator (class in core.inference.engines.async_zmq_communicator) attach_and_log_load_balancing_loss() (core.transformer.moe.router.TopKRouter method) attach_grad_to_optimizer_state() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) Attention (class in core.transformer.attention) ATTENTION (core.ssm.mamba_hybrid_layer_allocation.Symbols attribute) attention_backend (core.transformer.transformer_config.TransformerConfig attribute) attention_dense_bias (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) attention_dense_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) attention_dropout (core.transformer.transformer_config.TransformerConfig attribute) attention_layer (core.ssm.mamba_block.MambaStackSubmodules attribute) attention_linear_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) attention_mask_func() (in module core.transformer.utils) attention_output_gate (core.transformer.transformer_config.TransformerConfig attribute) attention_qkv_bias (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) attention_qkv_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) attention_softmax_in_fp32 (core.transformer.transformer_config.TransformerConfig attribute) AttentionBlockSize (in module core.transformer.custom_layers.batch_invariant_kernels) attn (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan attribute) (core.transformer.enums.CudaGraphScope attribute) AttnBackend (class in core.transformer.enums) AttnMaskType (class in core.transformer.enums) AttnType (class in core.transformer.enums) attr (core.optimizer.optimizer_config.ParamKey attribute) auto (core.transformer.enums.AttnBackend attribute) autocast_dtype (core.model_parallel_config.ModelParallelConfig attribute) AutogradFunctionImplementation (in module core.extensions.kitchen) AutoHuggingFaceModel (class in core.models.huggingface.module) average_in_collective (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) await_process_call() (in module core.inference.utils) axis_fragmentations (core.dist_checkpointing.mapping.ShardedTensor attribute) B BackendSpecProvider (class in core.models.backends) backward() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.RegisterFSDPBackwardFunction static method) (core.fusions.fused_bias_geglu.BiasGeGLUFunction static method) (core.fusions.fused_bias_geglu.GeGLUFunction static method) (core.fusions.fused_bias_geglu.WeightedBiasQuickGeGLUFunction static method) (core.fusions.fused_bias_geglu.WeightedQuickGeGLUFunction static method) (core.fusions.fused_bias_gelu.GeLUFunction static method) (core.fusions.fused_bias_swiglu.BiasSwiGLUFunction static method) (core.fusions.fused_bias_swiglu.SwiGLUFunction static method) (core.fusions.fused_bias_swiglu.WeightedSwiGLUFunction static method) (core.fusions.fused_cross_entropy._VocabParallelCrossEntropy static method) (core.fusions.fused_indices_converter.IndicesToMultihot static method) (core.fusions.fused_mla_yarn_rope_apply.ApplyMLARotaryEmbKV static method) (core.fusions.fused_mla_yarn_rope_apply.ApplyMLARotaryEmbQ static method) (core.fusions.fused_softmax.ScaledMaskedSoftmax static method) (core.fusions.fused_softmax.ScaledSoftmax static method) (core.fusions.fused_softmax.ScaledUpperTriangMaskedSoftmax static method) (core.fusions.fused_weighted_squared_relu.WeightedSquaredReLUFunction static method) (core.pipeline_parallel.fine_grained_activation_offload.FineGrainedOffloadingBackwardRecordFunction static method) (core.pipeline_parallel.fine_grained_activation_offload.FineGrainedOffloadingGroupCommitFunction static method) (core.pipeline_parallel.fine_grained_activation_offload.FineGrainedOffloadingGroupStartFunction static method) (core.pipeline_parallel.utils.NoopScheduleNode method) (core.pipeline_parallel.utils.ScheduleNode method) (core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy static method) (core.tensor_parallel.layers.LinearWithFrozenWeight static method) (core.tensor_parallel.layers.LinearWithGradAccumulationAndAsyncCommunication static method) (core.tensor_parallel.mappings._AllGatherFromTensorParallelRegion static method) (core.tensor_parallel.mappings._AllToAll static method) (core.tensor_parallel.mappings._CopyToModelParallelRegion static method) (core.tensor_parallel.mappings._GatherFromModelParallelRegion static method) (core.tensor_parallel.mappings._GatherFromSequenceParallelRegion static method) (core.tensor_parallel.mappings._ReduceFromModelParallelRegion static method) (core.tensor_parallel.mappings._ReduceScatterToSequenceParallelRegion static method) (core.tensor_parallel.mappings._ReduceScatterToTensorParallelRegion static method) (core.tensor_parallel.mappings._ScatterToModelParallelRegion static method) (core.tensor_parallel.mappings._ScatterToSequenceParallelRegion static method) (core.tensor_parallel.random.CheckpointFunction static method) (core.tensor_parallel.random.CheckpointWithoutOutputFunction static method) (core.transformer.cuda_graphs._CudagraphRecordNode static method) (core.transformer.cuda_graphs._CudagraphReplayNode static method) (core.transformer.custom_layers.batch_invariant_kernels.BatchInvariantRMSNormFn static method) (core.transformer.custom_layers.batch_invariant_kernels.BatchInvariantTEGemmFn static method) (core.transformer.moe.fused_a2a.FusedCombine static method) (core.transformer.moe.fused_a2a.FusedDispatch static method) (core.transformer.moe.fused_a2a.HybridEPCombine static method) (core.transformer.moe.fused_a2a.HybridEPDispatch static method) (core.transformer.moe.moe_utils.MoEAuxLossAutoScaler static method) (core.transformer.moe.moe_utils.RandomSTE static method) (core.transformer.moe.moe_utils.RouterGatingLinearFunction static method) (core.transformer.multi_token_prediction.MTPLossAutoScaler static method) (core.utils.MakeViewlessTensor static method) backward_dw() (core.extensions.transformer_engine.TEColumnParallelLinear method) (core.extensions.transformer_engine.TELayerNormColumnParallelLinear method) (core.extensions.transformer_engine.TELinear method) (core.extensions.transformer_engine.TERowParallelLinear method) (core.models.gpt.fine_grained_callables._BackwardDWWrapper method) (core.models.gpt.fine_grained_callables.TransformerLayerNode method) (core.tensor_parallel.layers.ColumnParallelLinear method) (core.transformer.attention.LinearQkv method) (core.transformer.attention.SelfAttention method) (core.transformer.mlp.MLP method) (core.transformer.moe.experts.GroupedMLP method) (core.transformer.moe.experts.SequentialMLP method) (core.transformer.moe.experts.TEGroupedMLP method) (core.transformer.moe.moe_layer.MoELayer method) (core.transformer.multi_latent_attention.MLASelfAttention method) backward_impl() (core.models.gpt.fine_grained_callables.TransformerLayerNode method) BACKWARD_PASS_ORDER (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.PrefetchOrder attribute) backward_step() (in module core.pipeline_parallel.schedules) BalancedCPScheduler (class in core.pipeline_parallel.hybrid_cp_schedule) barrier_with_L1_time (core.model_parallel_config.ModelParallelConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) BaseInferenceContext (class in core.inference.contexts.base_context) BaseMoELayer (class in core.transformer.moe.moe_layer) BaseTransformerLayer (class in core.transformer.transformer_layer) batch_index (core.resharding.nvshmem_copy_service.nvshmem_types.ScheduledBatch attribute) (core.resharding.nvshmem_copy_service.validation.ValidationResult attribute) batch_invariant_mode (core.transformer.transformer_config.TransformerConfig attribute) batch_p2p_comm (core.model_parallel_config.ModelParallelConfig attribute) batch_p2p_sync (core.model_parallel_config.ModelParallelConfig attribute) BatchInvariantRMSNormFn (class in core.transformer.custom_layers.batch_invariant_kernels) BatchInvariantTEGemmFn (class in core.transformer.custom_layers.batch_invariant_kernels) bert_extended_attention_mask() (core.models.bert.bert_model.BertModel method) bert_layer_local_spec (in module core.models.bert.bert_layer_specs) bert_position_ids() (core.models.bert.bert_model.BertModel method) BertLMHead (class in core.models.bert.bert_lm_head) BERTMaskedWordPieceDataset (class in core.datasets.bert_dataset) BERTMaskedWordPieceDatasetConfig (class in core.datasets.bert_dataset) BertModel (class in core.models.bert.bert_model) BertTokenizer (class in core.tokenizers.text.models.bert_tokenizer) beta_fast (core.transformer.transformer_config.MLATransformerConfig attribute) beta_slow (core.transformer.transformer_config.MLATransformerConfig attribute) bf16 (core.model_parallel_config.ModelParallelConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) bias_activation_fusion (core.transformer.transformer_config.TransformerConfig attribute) bias_dropout_add_fused_inference() (in module core.fusions.fused_bias_dropout) bias_dropout_add_fused_train() (in module core.fusions.fused_bias_dropout) bias_dropout_add_unfused() (in module core.fusions.fused_bias_dropout) bias_dropout_fusion (core.transformer.transformer_config.TransformerConfig attribute) bias_geglu() (in module core.fusions.fused_bias_geglu) bias_geglu_back() (in module core.fusions.fused_bias_geglu) bias_geglu_impl() (in module core.fusions.fused_bias_geglu) bias_gelu() (in module core.fusions.fused_bias_gelu) bias_gelu_back() (in module core.fusions.fused_bias_gelu) bias_gelu_impl (in module core.fusions.fused_bias_gelu) bias_swiglu() (in module core.fusions.fused_bias_swiglu) bias_swiglu_back() (in module core.fusions.fused_bias_swiglu) bias_swiglu_impl() (in module core.fusions.fused_bias_swiglu) BiasGeGLUFunction (class in core.fusions.fused_bias_geglu) BiasSwiGLUFunction (class in core.fusions.fused_bias_swiglu) bin_chunk_nbytes (core.datasets.object_storage_utils.ObjectStorageConfig attribute) blend (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) blend_per_split (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) BlendedDataset (class in core.datasets.blended_dataset) BlendedMegatronDatasetBuilder (class in core.datasets.blended_megatron_dataset_builder) BlendedMegatronDatasetConfig (class in core.datasets.blended_megatron_dataset_config) BlockAllocator (class in core.inference.contexts.dynamic_block_allocator) BlockOverflowError blockwise (core.enums.Fp8Recipe attribute) BlockwiseFP8WeightTransformerLayer (class in core.post_training.modelopt.layers) bos (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) bos_id (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract property) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) bridge_comms_as_dest_module (core.pipeline_parallel.multimodule_communicator.RankModuleInfo attribute) bridge_comms_as_src_module (core.pipeline_parallel.multimodule_communicator.RankModuleInfo attribute) BridgeCommunicator (class in core.pipeline_parallel.bridge_communicator) broadcast_data() (in module core.tensor_parallel.data) broadcast_float_list() (in module core.inference.communication_utils) broadcast_from_last_pipeline_stage() (in module core.inference.communication_utils) broadcast_int_list() (in module core.inference.communication_utils) broadcast_list() (in module core.inference.communication_utils) broadcast_params() (core.distributed.data_parallel_base._BaseDataParallel method) (core.distributed.distributed_data_parallel.DistributedDataParallel method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) (core.optimizer.layer_wise_optimizer.LayerWiseDistributedOptimizer method) broadcast_tensor() (in module core.inference.communication_utils) Bucket (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) bucket_size (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) BucketIndex (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) BucketingPolicy (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) BucketStatus (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) BufferType (class in core.distributed.param_and_grad_buffer) build() (core.datasets.blended_megatron_dataset_builder.BlendedMegatronDatasetBuilder method) (core.dist_checkpointing.mapping.ShardedTensorFactory method) build_and_save_engine() (core.export.trtllm.engine_builder.trtllm_engine_builder.TRTLLMEngineBuilder static method) (core.export.trtllm.trtllm_helper.TRTLLMHelper method) build_centralized_reshard_plan() (in module core.resharding.planner) build_comm_map() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) build_cpu_optimizer_list() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer static method) build_data_parallel_buffer_index() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) build_fn (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) build_generic_dataset() (core.datasets.blended_megatron_dataset_builder.BlendedMegatronDatasetBuilder static method) build_hf_model() (in module core.models.huggingface.module) build_layer_callables() (in module core.models.gpt.fine_grained_callables) build_low_level_dataset() (core.datasets.gpt_dataset.GPTDataset static method) (core.datasets.gpt_dataset.MockGPTDataset static method) (core.datasets.masked_dataset.MaskedWordPieceDataset static method) (core.datasets.megatron_dataset.MegatronDataset static method) build_module() (in module core.transformer.spec_utils) build_mtp_layer_callables() (in module core.models.gpt.fine_grained_callables) build_sample_idx() (in module core.datasets.helpers) build_schedule() (core.resharding.nvshmem_copy_service.planning.communication_scheduler.CommunicationScheduler method) build_schedule_plan() (core.models.gpt.gpt_model.GPTModel method) build_transformer_layer_callables() (in module core.models.gpt.fine_grained_callables) bulk_offload() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) bulk_offload_group() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) bulk_reload() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) bulk_reload_group() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) bwd_buffer_reuse_ref_count (in module core.transformer.cuda_graphs) bwd_cudagraph_buffer (core.transformer.cuda_graphs.CudagraphBufferMetadata attribute) BWD_READY (core.transformer.cuda_graphs._GraphStatus attribute) ByteLevelTokenizer (class in core.tokenizers.text.libraries.bytelevel_tokenizer) C cache_index_file() (in module core.datasets.object_storage_utils) cache_mla_latents (core.transformer.transformer_config.MLATransformerConfig attribute) CachedMetadataFileSystemReader (class in core.dist_checkpointing.strategies.cached_metadata_filesystem_reader) calculate_cross_entropy_loss() (core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy static method) (in module core.fusions.fused_cross_entropy) calculate_gradients() (core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy static method) (in module core.fusions.fused_cross_entropy) calculate_log_probs() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) calculate_logits_max() (core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy static method) (in module core.fusions.fused_cross_entropy) calculate_per_token_loss (core.transformer.transformer_config.TransformerConfig attribute) calculate_predicted_logits() (core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy static method) (in module core.fusions.fused_cross_entropy) Call (class in core.rerun_state_machine) call_ddp_preforward_hook() (core.transformer.cuda_graphs.CudaGraphManager method) call_idx (core.dist_checkpointing.strategies.async_utils.AsyncRequest attribute) Caller (class in core.rerun_state_machine) caller (core.rerun_state_machine.Call attribute) can_handle_sharded_objects (core.dist_checkpointing.strategies.base.LoadStrategyBase property) (core.dist_checkpointing.strategies.base.SaveStrategyBase property) (core.dist_checkpointing.strategies.common.TorchCommonLoadStrategy property) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper property) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelSaveStrategyWrapper property) can_handle_sharded_objects() (core.dist_checkpointing.strategies.common.TorchCommonSaveStrategy method) (core.dist_checkpointing.strategies.torch.TorchDistLoadShardedStrategy method) (core.dist_checkpointing.strategies.torch.TorchDistSaveShardedStrategy method) capture_reuse_count (core.transformer.cuda_graphs.CudagraphBufferMetadata attribute) causal (core.transformer.enums.AttnMaskType attribute) causal_bottom_right (core.transformer.enums.AttnMaskType attribute) ChainedOptimizer (class in core.optimizer.optimizer) check_availability() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) check_backend_compatibility() (core.dist_checkpointing.strategies.base.LoadStrategyBase method) (core.dist_checkpointing.strategies.common.TorchCommonLoadStrategy method) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper method) (core.dist_checkpointing.strategies.torch.TorchDistLoadShardedStrategy method) check_config_overrides_consistency() (in module core.optimizer) check_first_val_step() (in module core.pipeline_parallel.schedules) check_for_large_grads (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) check_for_nan_in_grad (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) check_gpu_memory() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) check_grads() (core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup method) check_is_distributed_checkpoint() (in module core.dist_checkpointing.core) check_param_hashes_across_dp_replicas() (in module core.utils) check_version_compatibility() (core.dist_checkpointing.strategies.base.LoadStrategyBase method) (core.dist_checkpointing.strategies.common.TorchCommonLoadStrategy method) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper method) (core.dist_checkpointing.strategies.torch.TorchDistLoadShardedStrategy method) checkpoint() (core.inference.inference_request.DynamicInferenceRequestRecord method) (core.tensor_parallel.random.CheckpointWithoutOutput method) (in module core.tensor_parallel.random) checkpoint_fully_reshardable_formats (core.optimizer.distrib_optimizer.DistributedOptimizer attribute) checkpoint_id (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync property) CheckpointableShardedTensor (class in core.dist_checkpointing.strategies.checkpointable) CheckpointFunction (class in core.tensor_parallel.random) CheckpointingConfig (class in core.dist_checkpointing.core) CheckpointingException CheckpointWithoutOutput (class in core.tensor_parallel.random) CheckpointWithoutOutputFunction (class in core.tensor_parallel.random) chunk_size_factor (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) ChunkOffloadHandler (class in core.pipeline_parallel.fine_grained_activation_offload) CkptShardedMetadata (in module core.dist_checkpointing.serialization) classification_head (core.datasets.bert_dataset.BERTMaskedWordPieceDatasetConfig attribute) clean_loss_in_tracker() (core.transformer.multi_token_prediction.MTPLossLoggingHelper method) clear() (core.pipeline_parallel.fine_grained_activation_offload.GPUTensorPool method) (core.transformer.moe.moe_utils.MoECudaGraphTensorStore method) clear_aux_losses_tracker() (in module core.transformer.moe.moe_utils) clear_embedding_activation_buffer() (in module core.pipeline_parallel.schedules) clear_global_indices() (core.transformer.moe.router_replay.RouterReplay static method) clear_global_router_replay_action() (core.transformer.moe.router_replay.RouterReplay static method) clear_global_router_replay_instances() (core.transformer.moe.router_replay.RouterReplay static method) clear_indices() (core.transformer.moe.router_replay.RouterReplay method) clear_requests() (core.resharding.nvshmem_copy_service.service.RemoteCopyService method) clear_router_replay_action() (core.transformer.moe.router_replay.RouterReplay method) clear_service_cache() (in module core.resharding.refit) clip_grad (core.optimizer.optimizer_config.OptimizerConfig attribute) clip_grad_by_total_norm_fp32() (in module core.optimizer.clip_grads) clip_grad_norm() (core.optimizer.optimizer.MegatronOptimizer method) clip_qk() (core.transformer.attention.Attention method) (core.transformer.attention.SelfAttention method) (core.transformer.multi_latent_attention.MLASelfAttention method) (in module core.optimizer.qk_clip) CLIPViTModel (class in core.models.vision.clip_vit_model) clone_scatter_output_in_embedding (core.transformer.transformer_config.TransformerConfig attribute) clone_tensors_in_struct() (in module core.full_cuda_graph) close() (core.datasets.object_storage_utils.S3Client method) (core.dist_checkpointing.strategies.async_utils.AsyncCaller method) (core.dist_checkpointing.strategies.async_utils.AsyncCallsQueue method) (core.dist_checkpointing.strategies.async_utils.PersistentAsyncCaller method) (core.dist_checkpointing.strategies.async_utils.TemporalAsyncCaller method) (core.inference.engines.async_zmq_communicator.AsyncZMQCommunicator method) cls (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) cls_id (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) code_from_dtype() (core.datasets.indexed_dataset.DType class method) ColoredFormatter (class in core.resharding.nvshmem_copy_service.logger) column_parallel_layer_norm_linear() (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.BackendSpecProvider method) (core.models.backends.InferenceSpecProvider method) (core.models.backends.LocalSpecProvider method) column_parallel_linear() (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.BackendSpecProvider method) (core.models.backends.InferenceSpecProvider method) (core.models.backends.LocalSpecProvider method) ColumnParallelLinear (class in core.tensor_parallel.layers) combine() (core.rerun_state_machine.QuickStats method) (core.transformer.moe.moe_layer.MoELayer method) (core.transformer.moe.token_dispatcher._DeepepManager method) (core.transformer.moe.token_dispatcher._DispatchManager method) (core.transformer.moe.token_dispatcher._HybridEPManager method) combine_param_group_overrides() (in module core.optimizer_param_scheduler) combine_postprocess() (core.transformer.moe.token_dispatcher.MoEAllGatherTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEFlexTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoETokenDispatcher method) combine_preprocess() (core.transformer.moe.token_dispatcher.MoEAllGatherTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEFlexTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoETokenDispatcher method) combined_1f1b_schedule_for_interleaved_pipelining() (in module core.pipeline_parallel.combined_1f1b) combined_1f1b_schedule_for_no_pipelining() (in module core.pipeline_parallel.combined_1f1b) combined_forward_backward_step() (in module core.pipeline_parallel.combined_1f1b) commit_tensor() (core.dist_checkpointing.strategies.torch.MCoreLoadPlanner method) common (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict attribute) common_backend (core.dist_checkpointing.core.CheckpointingConfig attribute) common_backend_version (core.dist_checkpointing.core.CheckpointingConfig attribute) common_state_dict (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict property) COMMON_STATE_FNAME (in module core.dist_checkpointing.strategies.common) CommonStateDict (in module core.dist_checkpointing.mapping) CommRole (class in core.pipeline_parallel.bridge_communicator) COMMUNICATING (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.BucketStatus attribute) CommunicationScheduler (class in core.resharding.nvshmem_copy_service.planning.communication_scheduler) COMPARISON_MATCH (in module core.rerun_state_machine) COMPARISON_MISMATCH (in module core.rerun_state_machine) CompilationState (class in core.inference.unified_memory) compile_allocator() (in module core.inference.unified_memory) compile_helpers() (in module core.datasets.utils) COMPLETED (core.inference.inference_request.Status attribute) compute_language_model_loss() (core.models.common.language_module.language_module.LanguageModule method) compute_routing_scores_for_aux_loss() (in module core.transformer.moe.moe_utils) compute_total_pipeline_stages() (core.pipeline_parallel.multimodule_communicator.MultiModulePipelineCommunicator static method) condition_init_method() (in module core.extensions.transformer_engine) config_attention_mask() (core.datasets.t5_dataset.T5MaskedWordPieceDataset static method) CONFIG_FNAME (in module core.dist_checkpointing.core) config_logger_dir (core.optimizer.optimizer_config.OptimizerConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) configure() (core.utils.StragglerDetector method) configure_fused_tp_inference() (core.transformer.transformer_layer.TransformerLayer method) configure_nvtx_profiling() (in module core.utils) configured (core.utils.StragglerDetector property) CONNECT (core.inference.headers.Headers attribute) CONNECT_ACK (core.inference.headers.Headers attribute) ConstantGradScaler (class in core.optimizer.grad_scaler) ConstantNumMicroBatchesCalculator (class in core.num_microbatches_calculator) contains_submesh() (in module core.distributed.fsdp.src.megatron_fsdp.utils) context_parallel_size (core.datasets.gpt_dataset.GPTDatasetConfig attribute) (core.model_parallel_config.ModelParallelConfig attribute) ContextErrorFactory (class in core.inference.contexts.dynamic_context) ContextOverflowError conv1d() (core.ssm.mamba_context_parallel.MambaContextParallel method) conv1d_channels() (core.ssm.mamba_context_parallel.MambaContextParallel method) conversion_helper() (in module core.transformer.module) convert() (core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter.DistributedTRTLLMModelWeightsConverter method) (core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.SingleDeviceTRTLLMModelWeightsConverter method) convert_cuda_rng_state() (in module core.tensor_parallel.random) convert_schedule_table_to_order() (in module core.transformer.cuda_graphs) convert_split_vector_to_split_matrix() (in module core.datasets.blended_megatron_dataset_config) copy_main_weights_to_model_weights() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) copy_model_weights_to_main_weights() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) copy_tensor_model_parallel_attributes() (in module core.tensor_parallel.layers) copy_tensors_in_struct() (in module core.full_cuda_graph) copy_tensors_to_cpu() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict method) copy_to_tensor_model_parallel_region() (in module core.tensor_parallel.mappings) CopyService (class in core.resharding.copy_services.base) core module core._rank_utils module core.activations module core.config module core.config_logger module core.datasets module core.datasets.bert_dataset module core.datasets.blended_dataset module core.datasets.blended_megatron_dataset_builder module core.datasets.blended_megatron_dataset_config module core.datasets.data_schedule module core.datasets.gpt_dataset module core.datasets.helpers module core.datasets.indexed_dataset module core.datasets.masked_dataset module core.datasets.megatron_dataset module core.datasets.megatron_tokenizer module core.datasets.multimodal_dataset module core.datasets.object_storage_utils module core.datasets.t5_dataset module core.datasets.utils module core.datasets.utils_s3 module core.dist_checkpointing module core.dist_checkpointing.core module core.dist_checkpointing.dict_utils module core.dist_checkpointing.exchange_utils module core.dist_checkpointing.mapping module core.dist_checkpointing.optimizer module core.dist_checkpointing.serialization module core.dist_checkpointing.state_dict_utils module core.dist_checkpointing.strategies module core.dist_checkpointing.strategies.async_utils module core.dist_checkpointing.strategies.base module core.dist_checkpointing.strategies.cached_metadata_filesystem_reader module core.dist_checkpointing.strategies.checkpointable module core.dist_checkpointing.strategies.common module core.dist_checkpointing.strategies.filesystem_async module core.dist_checkpointing.strategies.fully_parallel module core.dist_checkpointing.strategies.state_dict_saver module core.dist_checkpointing.strategies.torch module core.dist_checkpointing.tensor_aware_state_dict module core.dist_checkpointing.utils module core.dist_checkpointing.validation module core.distributed module core.distributed.data_parallel_base module core.distributed.distributed_data_parallel module core.distributed.distributed_data_parallel_config module core.distributed.finalize_model_grads module core.distributed.fsdp module core.distributed.fsdp.mcore_fsdp_adapter module core.distributed.fsdp.src module core.distributed.fsdp.src.megatron_fsdp module core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config module core.distributed.fsdp.src.megatron_fsdp.fully_shard module core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp module core.distributed.fsdp.src.megatron_fsdp.mixed_precision module core.distributed.fsdp.src.megatron_fsdp.package_info module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor module core.distributed.fsdp.src.megatron_fsdp.utils module core.distributed.param_and_grad_buffer module core.distributed.reduce_scatter_with_fp32_accumulation module core.distributed.torch_fully_sharded_data_parallel module core.distributed.torch_fully_sharded_data_parallel_config module core.energy_monitor module core.enums module core.export module core.export.data_type module core.export.export_config module core.export.model_type module core.export.trtllm module core.export.trtllm.engine_builder module core.export.trtllm.engine_builder.trtllm_engine_builder module core.export.trtllm.model_to_trllm_mapping module core.export.trtllm.model_to_trllm_mapping.default_conversion_dict module core.export.trtllm.trt_model_config module core.export.trtllm.trt_model_type module core.export.trtllm.trtllm_helper module core.export.trtllm.trtllm_layers module core.export.trtllm.trtllm_weights_converter module core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter module core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter module core.export.trtllm.trtllm_weights_converter.utils module core.extensions module core.extensions.kitchen module core.extensions.transformer_engine module core.extensions.transformer_engine_spec_provider module core.fp4_utils module core.fp8_utils module core.full_cuda_graph module core.fusions module core.fusions.fused_bias_dropout module core.fusions.fused_bias_geglu module core.fusions.fused_bias_gelu module core.fusions.fused_bias_swiglu module core.fusions.fused_cross_entropy module core.fusions.fused_indices_converter module core.fusions.fused_layer_norm module core.fusions.fused_mla_yarn_rope_apply module core.fusions.fused_pad_routing_map module core.fusions.fused_softmax module core.fusions.fused_weighted_squared_relu module core.hyper_comm_grid module core.inference module core.inference.async_stream module core.inference.batch_dimensions_utils module core.inference.common_inference_params module core.inference.communication_utils module core.inference.contexts module core.inference.contexts.base_context module core.inference.contexts.dynamic_block_allocator module core.inference.contexts.dynamic_context module core.inference.contexts.fused_kv_append_kernel module core.inference.contexts.static_context module core.inference.data_parallel_inference_coordinator module core.inference.engines module core.inference.engines.abstract_engine module core.inference.engines.async_zmq_communicator module core.inference.engines.dynamic_engine module core.inference.engines.mcore_engine module core.inference.engines.static_engine module core.inference.headers module core.inference.inference_client module core.inference.inference_request module core.inference.model_inference_wrappers module core.inference.model_inference_wrappers.abstract_model_inference_wrapper module core.inference.model_inference_wrappers.gpt module core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper module core.inference.model_inference_wrappers.inference_wrapper_config module core.inference.model_inference_wrappers.t5 module core.inference.model_inference_wrappers.t5.t5_inference_wrapper module core.inference.sampling_params module core.inference.scheduler module core.inference.text_generation_controllers module core.inference.text_generation_controllers.encoder_decoder_text_generation_controller module core.inference.text_generation_controllers.simple_text_generation_controller module core.inference.text_generation_controllers.text_generation_controller module core.inference.text_generation_controllers.vlm_text_generation_controller module core.inference.text_generation_server module core.inference.text_generation_server.dynamic_text_gen_server module core.inference.text_generation_server.dynamic_text_gen_server.endpoints module core.inference.text_generation_server.dynamic_text_gen_server.endpoints.chat_completions module core.inference.text_generation_server.dynamic_text_gen_server.endpoints.common module core.inference.text_generation_server.dynamic_text_gen_server.endpoints.completions module core.inference.text_generation_server.dynamic_text_gen_server.endpoints.health module core.inference.text_generation_server.dynamic_text_gen_server.flask_server module core.inference.text_generation_server.dynamic_text_gen_server.tokenization module core.inference.text_generation_server.run_mcore_engine module core.inference.text_generation_server.text_generation_server module core.inference.text_generation_server.tokenization module core.inference.unified_memory module core.inference.utils module core.inference_params module core.jit module core.model_parallel_config module core.models module core.models.backends module core.models.bert module core.models.bert.bert_layer_specs module core.models.bert.bert_lm_head module core.models.bert.bert_model module core.models.bert.pooler module core.models.common module core.models.common.embeddings module core.models.common.embeddings.language_model_embedding module core.models.common.embeddings.relative_pos_embedding module core.models.common.embeddings.rope_utils module core.models.common.embeddings.rotary_pos_embedding module core.models.common.embeddings.yarn_rotary_pos_embedding module core.models.common.language_module module core.models.common.language_module.language_module module core.models.common.model_chunk_schedule_plan module core.models.common.vision_module module core.models.common.vision_module.vision_module module core.models.gpt module core.models.gpt.experimental_attention_variant_module_specs module core.models.gpt.fine_grained_callables module core.models.gpt.gpt_layer_specs module core.models.gpt.gpt_model module core.models.gpt.moe_module_specs module core.models.huggingface module core.models.huggingface.clip_model module core.models.huggingface.module module core.models.huggingface.qwen_model module core.models.mamba module core.models.mamba.mamba_layer_specs module core.models.mamba.mamba_model module core.models.mimo module core.models.mimo.config module core.models.mimo.config.base_configs module core.models.mimo.model module core.models.mimo.model.base module core.models.multimodal module core.models.multimodal.context_parallel module core.models.multimodal.llava_model module core.models.multimodal.llava_spec module core.models.T5 module core.models.T5.t5_model module core.models.T5.t5_spec module core.models.vision module core.models.vision.clip_vit_model module core.models.vision.multimodal_projector module core.models.vision.radio module core.models.vision.vit_layer_specs module core.msc_utils module core.nccl_allocator module core.num_microbatches_calculator module core.optimizer module core.optimizer.clip_grads module core.optimizer.cpu_offloading module core.optimizer.cpu_offloading.hybrid_optimizer module core.optimizer.distrib_optimizer module core.optimizer.grad_scaler module core.optimizer.layer_wise_optimizer module core.optimizer.muon module core.optimizer.optimizer module core.optimizer.optimizer_config module core.optimizer.qk_clip module core.optimizer_param_scheduler module core.package_info module core.packed_seq_params module core.parallel_state module core.pipeline_parallel module core.pipeline_parallel.bridge_communicator module core.pipeline_parallel.combined_1f1b module core.pipeline_parallel.fine_grained_activation_offload module core.pipeline_parallel.hybrid_cp_schedule module core.pipeline_parallel.multimodule_communicator module core.pipeline_parallel.p2p_communication module core.pipeline_parallel.schedules module core.pipeline_parallel.utils module core.post_training module core.post_training.modelopt module core.post_training.modelopt.gpt module core.post_training.modelopt.gpt.model_specs module core.post_training.modelopt.gpt.state_dict_hooks module core.post_training.modelopt.layers module core.post_training.modelopt.mamba module core.post_training.modelopt.mamba.model_specs module core.process_groups_config module core.quantization module core.quantization.quant_config module core.quantization.utils module core.rerun_state_machine module core.resharding module core.resharding.copy_services module core.resharding.copy_services.base module core.resharding.copy_services.gloo_copy_service module core.resharding.copy_services.nccl_copy_service module core.resharding.copy_services.nvshmem_copy_service module core.resharding.execution module core.resharding.nvshmem_copy_service module core.resharding.nvshmem_copy_service.core module core.resharding.nvshmem_copy_service.core.gpu_resource_manager module core.resharding.nvshmem_copy_service.core.kernel_launcher module core.resharding.nvshmem_copy_service.core.pipeline_executor module core.resharding.nvshmem_copy_service.logger module core.resharding.nvshmem_copy_service.memory module core.resharding.nvshmem_copy_service.memory.double_buffer_manager module core.resharding.nvshmem_copy_service.memory.tensor_pointer_utils module core.resharding.nvshmem_copy_service.nvshmem_types module core.resharding.nvshmem_copy_service.planning module core.resharding.nvshmem_copy_service.planning.communication_scheduler module core.resharding.nvshmem_copy_service.planning.gpu_execution_planner module core.resharding.nvshmem_copy_service.planning.task_segmenter module core.resharding.nvshmem_copy_service.planning.workload_packer module core.resharding.nvshmem_copy_service.service module core.resharding.nvshmem_copy_service.validation module core.resharding.planner module core.resharding.refit module core.resharding.utils module core.safe_globals module core.ssm module core.ssm.gated_delta_net module core.ssm.mamba_block module core.ssm.mamba_context_parallel module core.ssm.mamba_hybrid_layer_allocation module core.ssm.mamba_layer module core.ssm.mamba_mixer module core.ssm.mlp_layer module core.ssm.triton_cache_manager module core.tensor_parallel module core.tensor_parallel.cross_entropy module core.tensor_parallel.data module core.tensor_parallel.inference_layers module core.tensor_parallel.layers module core.tensor_parallel.mappings module core.tensor_parallel.random module core.tensor_parallel.utils module core.timers module core.tokenizers module core.tokenizers.base_tokenizer module core.tokenizers.megatron_tokenizer module core.tokenizers.text module core.tokenizers.text.libraries module core.tokenizers.text.libraries.abstract_tokenizer module core.tokenizers.text.libraries.bytelevel_tokenizer module core.tokenizers.text.libraries.chat_template module core.tokenizers.text.libraries.huggingface_tokenizer module core.tokenizers.text.libraries.megatron_hf_tokenizer module core.tokenizers.text.libraries.null_tokenizer module core.tokenizers.text.libraries.sentencepiece_tokenizer module core.tokenizers.text.libraries.tiktoken_tokenizer module core.tokenizers.text.models module core.tokenizers.text.models.bert_tokenizer module core.tokenizers.text.models.default_tokenizer module core.tokenizers.text.models.gpt_tokenizer module core.tokenizers.text.models.mamba_tokenizer module core.tokenizers.text.models.t5_tokenizer module core.tokenizers.text.text_tokenizer module core.transformer module core.transformer.attention module core.transformer.cuda_graphs module core.transformer.custom_layers module core.transformer.custom_layers.batch_invariant_kernels module core.transformer.custom_layers.transformer_engine module core.transformer.dot_product_attention module core.transformer.enums module core.transformer.fsdp_dtensor_checkpoint module core.transformer.identity_op module core.transformer.mlp module core.transformer.module module core.transformer.moe module core.transformer.moe.experts module core.transformer.moe.fused_a2a module core.transformer.moe.grouped_gemm_util module core.transformer.moe.moe_layer module core.transformer.moe.moe_utils module core.transformer.moe.router module core.transformer.moe.router_replay module core.transformer.moe.shared_experts module core.transformer.moe.token_dispatcher module core.transformer.moe.upcycling_utils module core.transformer.multi_latent_attention module core.transformer.multi_token_prediction module core.transformer.pipeline_parallel_layer_layout module core.transformer.spec_utils module core.transformer.torch_layer_norm module core.transformer.torch_norm module core.transformer.transformer_block module core.transformer.transformer_config module core.transformer.transformer_layer module core.transformer.utils module core.typed_torch module core.utils module core_attention (core.transformer.attention.CrossAttentionSubmodules attribute) (core.transformer.attention.SelfAttentionSubmodules attribute) (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) core_attention() (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.BackendSpecProvider method) (core.models.backends.InferenceSpecProvider method) (core.models.backends.LocalSpecProvider method) CoreAttention (class in core.transformer.attention) CoreAttentionBuilder (class in core.transformer.attention) correct_amax_history_if_needed() (in module core.fp8_utils) CORRECT_RESULT (core.rerun_state_machine.RerunDiagnostic attribute) count_zeros() (core.optimizer.layer_wise_optimizer.LayerWiseDistributedOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.MegatronOptimizer method) count_zeros_fp32() (in module core.optimizer.clip_grads) Counter (class in core.inference.utils) cp (core.process_groups_config.ProcessGroupCollection attribute) cp_comm_type (core.transformer.transformer_config.TransformerConfig attribute) cp_group (core.packed_seq_params.PackedSeqParams attribute) cp_stream (core.extensions.transformer_engine.TEDotProductAttention attribute) cpu_offloading (core.model_parallel_config.ModelParallelConfig attribute) cpu_offloading_activations (core.model_parallel_config.ModelParallelConfig attribute) cpu_offloading_double_buffering (core.model_parallel_config.ModelParallelConfig attribute) cpu_offloading_num_layers (core.model_parallel_config.ModelParallelConfig attribute) cpu_offloading_weights (core.model_parallel_config.ModelParallelConfig attribute) cpu_tensor_pool (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager property) create_attention_mask (core.datasets.gpt_dataset.GPTDatasetConfig attribute) create_bwd_graph() (core.transformer.cuda_graphs._CudaGraphRunner method) create_cuda_graphs() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) create_cudagraphs() (core.transformer.cuda_graphs._CudagraphGlobalRecord class method) (core.transformer.cuda_graphs.TECudaGraphHelper method) (in module core.transformer.cuda_graphs) create_decentralized_global_plan() (core.dist_checkpointing.strategies.torch.MCoreSavePlanner method) create_events() (core.resharding.nvshmem_copy_service.core.gpu_resource_manager.GPUResourceManager method) create_fwd_graph() (core.transformer.cuda_graphs._CudaGraphRunner method) create_gpu_plans() (core.resharding.nvshmem_copy_service.planning.gpu_execution_planner.GPUExecutionPlanner method) create_group() (in module core.parallel_state) create_hierarchical_groups() (in module core.parallel_state) create_hybrid_dp_cp_groups() (in module core.parallel_state) create_local_plan() (core.dist_checkpointing.strategies.torch.MCoreLoadPlanner method) (core.dist_checkpointing.strategies.torch.MCoreSavePlanner method) create_mcore_cudagraph_manager() (core.ssm.mamba_layer.MambaLayer method) (core.transformer.transformer_layer.MoETransformerLayer method) (core.transformer.transformer_layer.TransformerLayer method) create_nccl_mem_pool() (in module core.nccl_allocator) create_pg() (core.hyper_comm_grid.HyperCommGrid method) create_unified_mempool() (in module core.inference.unified_memory) create_updated_function_signature() (in module core.distributed.fsdp.src.megatron_fsdp.utils) critical() (core.resharding.nvshmem_copy_service.logger.PELogger class method) cross_attention (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) cross_attn (core.transformer.enums.AttnType attribute) cross_attn_bda (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) cross_entropy_fusion_impl (core.model_parallel_config.ModelParallelConfig attribute) cross_entropy_loss_fusion (core.model_parallel_config.ModelParallelConfig attribute) CrossAttention (class in core.transformer.attention) CrossAttentionSubmodules (class in core.transformer.attention) cu_kv_lengths() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) cu_query_lengths() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) cu_seqlens_kv (core.packed_seq_params.PackedSeqParams attribute) cu_seqlens_kv_padded (core.packed_seq_params.PackedSeqParams attribute) cu_seqlens_q (core.packed_seq_params.PackedSeqParams attribute) cu_seqlens_q_padded (core.packed_seq_params.PackedSeqParams attribute) cuda_dtoh_stream (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher attribute) cuda_graph (core.full_cuda_graph.FullCudaGraphWrapper attribute) cuda_graph_attr_cache (in module core.transformer.utils) cuda_graph_impl (core.transformer.transformer_config.TransformerConfig attribute) cuda_graph_retain_backward_graph (core.transformer.transformer_config.TransformerConfig attribute) CUDA_GRAPH_ROUNDER (core.inference.batch_dimensions_utils.CUDAGraphBatchDimensionBuilder attribute) cuda_graph_scope (core.transformer.transformer_config.TransformerConfig attribute) cuda_graph_set_manual_hooks() (core.transformer.cuda_graphs.TECudaGraphHelper method) cuda_graph_use_single_mempool (core.transformer.transformer_config.TransformerConfig attribute) cuda_graph_warmup_steps (core.transformer.transformer_config.TransformerConfig attribute) cudagraph_created (core.transformer.cuda_graphs._CudagraphGlobalRecord attribute) cudagraph_inference_record (core.transformer.cuda_graphs._CudagraphGlobalRecord attribute) cudagraph_record (core.transformer.cuda_graphs._CudagraphGlobalRecord attribute) cudagraph_reuse_ref_count (core.transformer.cuda_graphs.CudagraphBufferMetadata attribute) CUDAGraphBatchDimensionBuilder (class in core.inference.batch_dimensions_utils) CudagraphBufferMetadata (class in core.transformer.cuda_graphs) CudaGraphManager (class in core.transformer.cuda_graphs) CudaGraphScope (class in core.transformer.enums) CudaRNGStatesTracker (class in core.tensor_parallel.random) cur_backward_chunk() (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) cur_forward_chunk() (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) curr_iter() (core.full_cuda_graph.FullCudaGraphWrapper method) curr_iteration (core.full_cuda_graph.FullCudaGraphWrapper attribute) current_input_and_position_ids() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) custom (core.enums.Fp4Recipe attribute) (core.enums.Fp8Recipe attribute) custom_backward() (in module core.pipeline_parallel.schedules) custom_recipe_factory (core.extensions.transformer_engine.TEQuantizationRecipe attribute) D d2h_stream (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager property) data (core.dist_checkpointing.mapping.ShardedBase attribute) (core.dist_checkpointing.mapping.ShardedObject attribute) (core.dist_checkpointing.mapping.ShardedTensor attribute) (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.Bucket attribute) data_parallel_group_ranks (core.resharding.utils.ParameterMetadata attribute) data_parallel_sharding_strategy (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.BucketingPolicy attribute) data_parallel_size (core.datasets.gpt_dataset.GPTDatasetConfig attribute) data_read() (core.full_cuda_graph.FullCudaGraphWrapper method) DataIteratorArgType (in module core.rerun_state_machine) DataParallelBuffer (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) DataParallelInferenceCoordinator (class in core.inference.data_parallel_inference_coordinator) dataset_exists() (in module core.datasets.object_storage_utils) DataType (in module core.export.data_type) deallocate_all_tensors() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) deallocate_output_tensor() (in module core.pipeline_parallel.schedules) deallocate_pipeline_outputs (core.model_parallel_config.ModelParallelConfig attribute) DEBUG (in module core.pipeline_parallel.fine_grained_activation_offload) debug() (core.resharding.nvshmem_copy_service.logger.PELogger class method) debug_msg() (in module core.dist_checkpointing.utils) DEBUG_RANK (in module core.pipeline_parallel.fine_grained_activation_offload) debug_rank() (in module core.pipeline_parallel.fine_grained_activation_offload) debug_time() (in module core.dist_checkpointing.utils) decode_req_count (core.inference.batch_dimensions_utils.InferenceBatchDimensions attribute) decoder (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.transformer.enums.LayerType attribute) decoder_model_with_local_default_spec() (in module core.models.multimodal.llava_spec) decoder_model_with_local_spec() (in module core.models.T5.t5_spec) decoder_model_with_transformer_engine_default_spec() (in module core.models.multimodal.llava_spec) (in module core.models.T5.t5_spec) decoder_seq_length (core.inference.inference_request.VLMInferenceRequest attribute) decoupled_lr (core.optimizer.optimizer_config.OptimizerConfig attribute) decoupled_min_lr (core.optimizer.optimizer_config.OptimizerConfig attribute) decoupled_weight_decay (core.optimizer.optimizer_config.OptimizerConfig attribute) default() (core.config_logger.JSONEncoderWithMcoreTypes method) default_backward_func() (core.pipeline_parallel.utils.ScheduleNode method) default_cache_dir() (in module core.ssm.triton_cache_manager) DEFAULT_CONVERSION_DICT (in module core.export.trtllm.model_to_trllm_mapping.default_conversion_dict) default_embedding_ranks() (in module core.parallel_state) DEFAULT_IMAGE_TOKEN_INDEX (in module core.models.multimodal.llava_model) DEFAULT_MAX_TOKENS (core.inference.contexts.dynamic_context.DynamicInferenceContext attribute) default_position_embedding_ranks() (in module core.parallel_state) default_strategies (in module core.dist_checkpointing.strategies.base) DEFAULT_TIKTOKEN_MAX_VOCAB (in module core.tokenizers.text.libraries.tiktoken_tokenizer) DefaultTokenizerText (class in core.tokenizers.text.models.default_tokenizer) defer_embedding_wgrad_compute (core.model_parallel_config.ModelParallelConfig attribute) defer_npy_index_mmap (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) delay_wgrad_compute (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.model_parallel_config.ModelParallelConfig attribute) delayed (core.enums.Fp8Recipe attribute) delete_cuda_graphs() (core.transformer.cuda_graphs.TECudaGraphHelper method) (in module core.transformer.cuda_graphs) deprecate_inference_params() (in module core.utils) deprecated() (in module core.utils) dequantize_fp4_tensor() (in module core.fp4_utils) dequantize_fp8_tensor() (in module core.fp8_utils) deregister_mem_pool() (in module core.nccl_allocator) deserialize() (core.inference.contexts.dynamic_context.ContextErrorFactory class method) (core.inference.inference_request.DynamicInferenceEvent class method) (core.inference.inference_request.DynamicInferenceRequestRecord class method) (core.inference.inference_request.InferenceRequest class method) (core.inference.sampling_params.SamplingParams class method) deserialize_tensor() (in module core.inference.inference_request) dest_pe (core.resharding.nvshmem_copy_service.nvshmem_types.ScheduledBatch attribute) (core.resharding.nvshmem_copy_service.nvshmem_types.SendRequest attribute) (core.resharding.nvshmem_copy_service.nvshmem_types.WorkloadGroup attribute) dest_pos (core.resharding.nvshmem_copy_service.nvshmem_types.ReceiveRequest attribute) dest_rank (core.resharding.copy_services.gloo_copy_service.SendOp attribute) (core.resharding.copy_services.nccl_copy_service.SendOp attribute) dest_tensor (core.resharding.nvshmem_copy_service.nvshmem_types.ReceiveRequest attribute) destroy_global_memory_buffer() (in module core.parallel_state) destroy_global_symmetric_memory_buffer() (in module core.parallel_state) destroy_model_parallel() (in module core.parallel_state) destroy_num_microbatches_calculator() (in module core.num_microbatches_calculator) destroy_rerun_state_machine() (in module core.rerun_state_machine) detach() (core.models.gpt.fine_grained_callables.TransformerLayerNode method) determine_global_metadata() (in module core.dist_checkpointing.validation) determine_main_replica_uniform_distribution() (in module core.dist_checkpointing.exchange_utils) deterministic_mode (core.model_parallel_config.ModelParallelConfig attribute) detokenize() (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer method) (core.inference.data_parallel_inference_coordinator.DataParallelInferenceCoordinator method) (core.tokenizers.base_tokenizer.MegatronTokenizerBase method) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText method) detokenize_generations() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) device (core.resharding.nvshmem_copy_service.service.RemoteCopyService property) dict_list_map_inplace() (in module core.dist_checkpointing.dict_utils) dict_list_map_outplace() (in module core.dist_checkpointing.dict_utils) dict_map() (in module core.dist_checkpointing.dict_utils) dict_map_with_key() (in module core.dist_checkpointing.dict_utils) diff() (in module core.dist_checkpointing.dict_utils) dim (core.resharding.utils.ShardingDescriptor attribute) disable() (core.msc_utils._FeatureFlag method) disable_batch_invariant_mode() (in module core.transformer.custom_layers.batch_invariant_kernels) disable_bf16_reduced_precision_matmul (core.transformer.transformer_config.TransformerConfig attribute) disable_forward_pre_hook() (core.distributed.distributed_data_parallel.DistributedDataParallel method) disable_jit_fuser() (in module core.jit) disable_offload() (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) disable_parameter_transpose_cache (core.transformer.transformer_config.TransformerConfig attribute) disable_symmetric_registration (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) DISABLED (core.rerun_state_machine.RerunMode attribute) discard_output_and_register_recompute() (core.tensor_parallel.random.CheckpointWithoutOutput method) dispatch() (core.transformer.moe.moe_layer.MoELayer method) (core.transformer.moe.token_dispatcher._DeepepManager method) (core.transformer.moe.token_dispatcher._DispatchManager method) (core.transformer.moe.token_dispatcher._HybridEPManager method) dispatch_postprocess() (core.transformer.moe.token_dispatcher.MoEAllGatherTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEFlexTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoETokenDispatcher method) dispatch_preprocess() (core.transformer.moe.token_dispatcher.MoEAllGatherTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEFlexTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoETokenDispatcher method) distribute_main_replicas_with_precomputed_distribution() (in module core.dist_checkpointing.strategies.fully_parallel) distribute_saved_activations (core.transformer.transformer_config.TransformerConfig attribute) distribute_shards_to_ranks() (in module core.dist_checkpointing.exchange_utils) DistributedDataParallel (class in core.distributed.distributed_data_parallel) DistributedDataParallelConfig (class in core.distributed.distributed_data_parallel_config) (class in core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config) DistributedDataset (in module core.datasets.blended_megatron_dataset_builder) DistributedOptimizer (class in core.optimizer.distrib_optimizer) DistributedTRTLLMModelWeightsConverter (class in core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter) divide() (in module core.utils) document_indices (core.datasets.indexed_dataset.IndexedDataset property) DotProductAttention (class in core.transformer.dot_product_attention) DoubleBufferManager (class in core.resharding.nvshmem_copy_service.memory.double_buffer_manager) download_file() (core.datasets.object_storage_utils.S3Client method) dp (core.process_groups_config.ProcessGroupCollection attribute) dp_cp (core.process_groups_config.ProcessGroupCollection attribute) drain_embedding_wgrad_compute() (in module core.utils) drop_last_partial_validation_sequence (core.datasets.gpt_dataset.GPTDatasetConfig attribute) dsa_indexer_head_dim (core.transformer.transformer_config.TransformerConfig attribute) dsa_indexer_loss_coeff (core.transformer.transformer_config.TransformerConfig attribute) dsa_indexer_n_heads (core.transformer.transformer_config.TransformerConfig attribute) dsa_indexer_topk (core.transformer.transformer_config.TransformerConfig attribute) dsa_indexer_use_sparse_loss (core.transformer.transformer_config.TransformerConfig attribute) dst_dim_ranks (core.resharding.utils.ShardingDescriptor attribute) dst_stride (core.resharding.utils.ShardingDescriptor attribute) DType (class in core.datasets.indexed_dataset) dtype (core.dist_checkpointing.mapping.ShardedTensor attribute) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) (core.resharding.utils.ParameterMetadata attribute) dtype_from_code() (core.datasets.indexed_dataset.DType class method) dummy_forward() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) dummy_step() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) DummyTimer (class in core.timers) DynamicGradScaler (class in core.optimizer.grad_scaler) DynamicInferenceContext (class in core.inference.contexts.dynamic_context) DynamicInferenceEngine (class in core.inference.engines.dynamic_engine) DynamicInferenceEvent (class in core.inference.inference_request) DynamicInferenceEventType (class in core.inference.inference_request) DynamicInferenceRequest (class in core.inference.inference_request) DynamicInferenceRequestRecord (class in core.inference.inference_request) E eh_proj (core.transformer.multi_token_prediction.MultiTokenPredictionLayerSubmodules attribute) elapsed() (core.timers.DummyTimer method) (core.timers.Timer method) (core.timers.TimerBase method) (core.utils.StragglerDetector method) element_size (core.resharding.utils.ParameterMetadata attribute) embd (core.process_groups_config.ProcessGroupCollection attribute) embedding (core.transformer.enums.LayerType attribute) embedding() (core.models.huggingface.qwen_model.QwenHuggingFaceModel method) embedding_init_method (core.transformer.transformer_config.TransformerConfig attribute) embedding_init_method_std (core.transformer.transformer_config.TransformerConfig attribute) EMPTY (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.BucketStatus attribute) empty_from_unique_key() (core.dist_checkpointing.mapping.ShardedObject class method) enable() (core.msc_utils._FeatureFlag method) enable_autocast (core.model_parallel_config.ModelParallelConfig attribute) enable_batch_invariant_mode() (in module core.transformer.custom_layers.batch_invariant_kernels) enable_cuda_graph (core.transformer.transformer_config.TransformerConfig attribute) enable_decode_mode() (core.inference.contexts.static_context.StaticInferenceContext method) ENABLE_EXPERIMENTAL (in module core.config) enable_forward_pre_hook() (core.distributed.distributed_data_parallel.DistributedDataParallel method) enable_jit_fuser() (in module core.jit) enable_offload() (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) enable_prefill_mode() (core.inference.contexts.static_context.StaticInferenceContext method) enabled (core.utils.StragglerDetector property) encoder (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.transformer.enums.LayerType attribute) encoder_and_decoder (core.enums.ModelType property) (core.transformer.enums.ModelType property) encoder_model_with_local_spec() (in module core.models.T5.t5_spec) encoder_model_with_transformer_engine_default_spec() (in module core.models.T5.t5_spec) encoder_or_decoder (core.enums.ModelType attribute) (core.transformer.enums.ModelType attribute) encoder_prompt (core.inference.inference_request.InferenceRequest attribute) EncoderDecoderTextGenerationController (class in core.inference.text_generation_controllers.encoder_decoder_text_generation_controller) end_document() (core.datasets.indexed_dataset.IndexedDatasetBuilder method) end_wd (core.optimizer_param_scheduler.ParamGroupOverride attribute) EnergyMonitor (class in core.energy_monitor) ENGINE_REPLY (core.inference.headers.Headers attribute) EngineSuspendedError enorm (core.transformer.multi_token_prediction.MultiTokenPredictionLayerSubmodules attribute) ensure_divisibility() (in module core.utils) ensure_metadata_has_dp_cp_group() (in module core.transformer.utils) entrypoint() (core.inference.data_parallel_inference_coordinator.DataParallelInferenceCoordinator class method) eod (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract property) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) eod_mask_loss (core.datasets.gpt_dataset.GPTDatasetConfig attribute) eos (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) eos_id (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract property) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) ep (core.process_groups_config.ProcessGroupCollection attribute) ep_overlap_early_attn_memory_release (core.model_parallel_config.ModelParallelConfig attribute) erf_gelu() (in module core.transformer.utils) error() (core.resharding.nvshmem_copy_service.logger.PELogger class method) ERROR_NONTRANSIENT (core.inference.inference_request.DynamicInferenceEventType attribute) ERROR_TRANSIENT (core.inference.inference_request.DynamicInferenceEventType attribute) evaluation_recipe (core.extensions.transformer_engine.TEQuantizationParams attribute) event (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan property) events (core.inference.inference_request.DynamicInferenceRequest attribute) EVICT (core.inference.inference_request.DynamicInferenceEventType attribute) evict_overflow_paused_requests() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) exchange_by_distribution() (in module core.dist_checkpointing.exchange_utils) exchange_loaded_objects_gather_object() (in module core.dist_checkpointing.exchange_utils) exchange_loaded_tensors_broadcast() (in module core.dist_checkpointing.exchange_utils) exchange_loaded_tensors_gather_object() (in module core.dist_checkpointing.exchange_utils) exchange_loaded_tensors_gather_rounds() (in module core.dist_checkpointing.exchange_utils) execute_pipeline() (core.resharding.nvshmem_copy_service.core.pipeline_executor.PipelineExecutor method) execute_reshard_plan() (in module core.resharding.execution) execute_sync() (core.dist_checkpointing.strategies.async_utils.AsyncRequest method) exists() (core.datasets.indexed_dataset.IndexedDataset static method) EXIT_CODE_FAILED_ON_RESULT_VALIDATION (in module core.rerun_state_machine) EXIT_CODE_RESUME_TO_DISAMBIGUATE (in module core.rerun_state_machine) exp_avg_dtype (core.optimizer.optimizer_config.OptimizerConfig attribute) exp_avg_sq_dtype (core.optimizer.optimizer_config.OptimizerConfig attribute) experimental_api() (in module core.distributed.fsdp.src.megatron_fsdp.fully_shard) (in module core.utils) experimental_attention_variant (core.transformer.transformer_config.TransformerConfig attribute) experimental_cls() (in module core.utils) experimental_fn() (in module core.utils) ExperimentalNotEnabledError expert_model_parallel_size (core.model_parallel_config.ModelParallelConfig attribute) expert_parallel_group_ranks (core.resharding.utils.ParameterMetadata attribute) expert_param_local_key() (in module core.transformer.fsdp_dtensor_checkpoint) expert_tensor_parallel_size (core.model_parallel_config.ModelParallelConfig attribute) experts (core.transformer.moe.moe_layer.MoESubmodules attribute) ExpertsType (in module core.transformer.moe.upcycling_utils) ExportConfig (class in core.export.export_config) expt_dp (core.process_groups_config.ProcessGroupCollection attribute) expt_fsdp_group (core.distributed.fsdp.src.megatron_fsdp.utils.FSDPDistributedIndex attribute) expt_tp (core.process_groups_config.ProcessGroupCollection attribute) ExtendedRMSNorm (class in core.ssm.mamba_mixer) external_cuda_graph (core.transformer.transformer_config.TransformerConfig attribute) extract_matching_values() (in module core.dist_checkpointing.dict_utils) extract_nonpersistent() (in module core.dist_checkpointing.utils) extract_param_metadata() (in module core.resharding.utils) extract_sharded_base() (in module core.dist_checkpointing.utils) extract_sharded_tensors() (in module core.dist_checkpointing.utils) extract_sharded_tensors_and_factories() (in module core.dist_checkpointing.utils) extract_sharded_tensors_or_nonpersistent() (in module core.dist_checkpointing.utils) F FactoryBuildFn (in module core.dist_checkpointing.mapping) FactoryMergeFn (in module core.dist_checkpointing.mapping) FAIL (core.inference.inference_request.DynamicInferenceEventType attribute) FAILED (core.inference.inference_request.Status attribute) failed() (core.inference.inference_request.DynamicInferenceRequest method) failed_tasks (core.resharding.nvshmem_copy_service.validation.ValidationSummary attribute) FAILURE (core.inference.unified_memory.CompilationState attribute) fallback_logger (in module core.dist_checkpointing.utils) fast_cache_load (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) fast_gelu() (in module core.activations) fetch_bucket() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) ffn_fc_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) ffn_hidden_size (core.transformer.transformer_config.TransformerConfig attribute) ffn_linear_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) ffn_projection_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) FileSystemWriterAsync (class in core.dist_checkpointing.strategies.filesystem_async) fill_in_deferred_sharded_objects() (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper static method) fill_in_deferred_sharded_tensors() (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper static method) filter_out_empty_flatten_tensor() (in module core.dist_checkpointing.state_dict_utils) filter_unflattened_state_dict() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) final_layernorm_bias (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) final_layernorm_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) finalize() (core.datasets.indexed_dataset.IndexedDatasetBuilder method) (core.resharding.nvshmem_copy_service.core.gpu_resource_manager.GPUResourceManager method) (core.resharding.nvshmem_copy_service.service.RemoteCopyService method) finalize_fns (core.dist_checkpointing.strategies.async_utils.AsyncRequest attribute) finalize_model_grads() (in module core.distributed.finalize_model_grads) finalize_model_grads_func (core.model_parallel_config.ModelParallelConfig attribute) find_group_with_name() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) find_next_group() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) fine_grained_activation_offloading (core.transformer.transformer_config.TransformerConfig attribute) fine_grained_offloading_backward_record() (in module core.pipeline_parallel.fine_grained_activation_offload) fine_grained_offloading_disable_offload() (in module core.pipeline_parallel.fine_grained_activation_offload) fine_grained_offloading_enable_offload() (in module core.pipeline_parallel.fine_grained_activation_offload) fine_grained_offloading_forward_record() (in module core.pipeline_parallel.fine_grained_activation_offload) fine_grained_offloading_group_commit() (in module core.pipeline_parallel.fine_grained_activation_offload) fine_grained_offloading_group_flush_delayed_groups() (in module core.pipeline_parallel.fine_grained_activation_offload) fine_grained_offloading_group_start() (in module core.pipeline_parallel.fine_grained_activation_offload) FineGrainedActivationOffloadingInterface (class in core.pipeline_parallel.fine_grained_activation_offload) FineGrainedOffloadingBackwardRecordFunction (class in core.pipeline_parallel.fine_grained_activation_offload) FineGrainedOffloadingGroupCommitFunction (class in core.pipeline_parallel.fine_grained_activation_offload) FineGrainedOffloadingGroupStartFunction (class in core.pipeline_parallel.fine_grained_activation_offload) FINISH (core.inference.inference_request.DynamicInferenceEventType attribute) finish() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync method) (core.inference.async_stream.AsyncStream method) finish_all_groups() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) finish_embedding_wgrad_compute() (in module core.pipeline_parallel.schedules) finish_grad_sync() (core.distributed.data_parallel_base._BaseDataParallel method) (core.distributed.distributed_data_parallel.DistributedDataParallel method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) (core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup method) finish_init() (core.extensions.transformer_engine.TELayerNormColumnParallelLinear method) (core.extensions.transformer_engine.TELinear method) finish_param_sync() (core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup method) finished (core.inference.async_stream.AsyncStream property) finished_chunk_token_count (core.inference.inference_request.DynamicInferenceRequest attribute) first_last_layers_bf16 (core.transformer.transformer_config.TransformerConfig attribute) first_mismatch_actual (core.resharding.nvshmem_copy_service.validation.ValidationResult attribute) first_mismatch_expected (core.resharding.nvshmem_copy_service.validation.ValidationResult attribute) first_mismatch_idx (core.resharding.nvshmem_copy_service.validation.ValidationResult attribute) FIRST_RERUN_NOT_REPRODUCIBLE (core.rerun_state_machine.RerunValidationStatus attribute) FIRST_RERUN_REPRODUCIBLE (core.rerun_state_machine.RerunValidationStatus attribute) FixedPoolAllocator (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) flash (core.transformer.enums.AttnBackend attribute) flash_decode (core.transformer.transformer_config.TransformerConfig attribute) flash_decode() (core.transformer.attention.Attention method) flash_decode_and_prefill() (core.transformer.attention.Attention method) flatten_state_dict() (in module core.dist_checkpointing.strategies.torch) (in module core.transformer.fsdp_dtensor_checkpoint) flattened_range (core.dist_checkpointing.mapping.ShardedTensor attribute) (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) float16_to_fp32() (in module core.transformer.module) Float16Module (class in core.transformer.module) Float16OptimizerWithFloat16Params (class in core.optimizer.optimizer) float32 (core.datasets.indexed_dataset.DType attribute) float64 (core.datasets.indexed_dataset.DType attribute) flush() (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) flush_delayed_groups() (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) fn (core.optimizer.optimizer_config.ParamPredicate attribute) (core.optimizer.optimizer_config.ParamWithNamePredicate attribute) force_all_tensors_to_non_fp8() (in module core.dist_checkpointing.utils) fork() (core.tensor_parallel.random.CudaRNGStatesTracker method) format() (core.resharding.nvshmem_copy_service.logger.ColoredFormatter method) format_mem_bytes() (in module core.inference.engines.dynamic_engine) formatTime() (core.resharding.nvshmem_copy_service.logger.ColoredFormatter method) FORWARD (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.TrainingState attribute) forward() (core.distributed.data_parallel_base._BaseDataParallel method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.RegisterFSDPBackwardFunction static method) (core.extensions.transformer_engine.TEDotProductAttention method) (core.extensions.transformer_engine.TELayerNormColumnParallelLinear method) (core.extensions.transformer_engine.TELinear method) (core.fusions.fused_bias_geglu.BiasGeGLUFunction static method) (core.fusions.fused_bias_geglu.GeGLUFunction static method) (core.fusions.fused_bias_geglu.WeightedBiasQuickGeGLUFunction static method) (core.fusions.fused_bias_geglu.WeightedQuickGeGLUFunction static method) (core.fusions.fused_bias_gelu.GeLUFunction static method) (core.fusions.fused_bias_swiglu.BiasSwiGLUFunction static method) (core.fusions.fused_bias_swiglu.SwiGLUFunction static method) (core.fusions.fused_bias_swiglu.WeightedSwiGLUFunction static method) (core.fusions.fused_cross_entropy._VocabParallelCrossEntropy static method) (core.fusions.fused_indices_converter.IndicesToMultihot static method) (core.fusions.fused_layer_norm.FusedLayerNorm method) (core.fusions.fused_mla_yarn_rope_apply.ApplyMLARotaryEmbKV static method) (core.fusions.fused_mla_yarn_rope_apply.ApplyMLARotaryEmbQ static method) (core.fusions.fused_softmax.FusedScaleMaskSoftmax method) (core.fusions.fused_softmax.ScaledMaskedSoftmax static method) (core.fusions.fused_softmax.ScaledSoftmax static method) (core.fusions.fused_softmax.ScaledUpperTriangMaskedSoftmax static method) (core.fusions.fused_softmax.SoftmaxOne method) (core.fusions.fused_weighted_squared_relu.WeightedSquaredReLUFunction static method) (core.models.bert.bert_lm_head.BertLMHead method) (core.models.bert.bert_model.BertModel method) (core.models.bert.pooler.Pooler method) (core.models.common.embeddings.language_model_embedding.LanguageModelEmbedding method) (core.models.common.embeddings.relative_pos_embedding.RelativePositionEmbedding method) (core.models.common.embeddings.rotary_pos_embedding.MultimodalRotaryEmbedding method) (core.models.common.embeddings.rotary_pos_embedding.RotaryEmbedding method) (core.models.common.embeddings.yarn_rotary_pos_embedding.YarnRotaryEmbedding method) (core.models.gpt.gpt_model.GPTModel method) (core.models.huggingface.clip_model.SiglipHuggingFaceModel method) (core.models.huggingface.module.AutoHuggingFaceModel method) (core.models.huggingface.qwen_model.QwenHuggingFaceModel method) (core.models.mamba.mamba_model.MambaModel method) (core.models.mimo.model.base.MimoModel method) (core.models.multimodal.llava_model.LLaVAModel method) (core.models.T5.t5_model.T5LMHead method) (core.models.T5.t5_model.T5Model method) (core.models.vision.clip_vit_model.CLIPViTModel method) (core.models.vision.multimodal_projector.MultimodalProjector method) (core.models.vision.radio.RADIOViTModel method) (core.pipeline_parallel.fine_grained_activation_offload.FineGrainedOffloadingBackwardRecordFunction static method) (core.pipeline_parallel.fine_grained_activation_offload.FineGrainedOffloadingGroupCommitFunction static method) (core.pipeline_parallel.fine_grained_activation_offload.FineGrainedOffloadingGroupStartFunction static method) (core.pipeline_parallel.utils.NoopScheduleNode method) (core.pipeline_parallel.utils.ScheduleNode method) (core.post_training.modelopt.layers.Linear method) (core.ssm.gated_delta_net.GatedDeltaNet method) (core.ssm.mamba_block.MambaStack method) (core.ssm.mamba_layer.MambaLayer method) (core.ssm.mamba_mixer.MambaMixer method) (core.tensor_parallel.cross_entropy._VocabParallelCrossEntropy static method) (core.tensor_parallel.inference_layers.InferenceLayerNormColumnParallelLinear method) (core.tensor_parallel.inference_layers.InferenceRowParallelLinear method) (core.tensor_parallel.layers.ColumnParallelLinear method) (core.tensor_parallel.layers.LinearWithFrozenWeight static method) (core.tensor_parallel.layers.LinearWithGradAccumulationAndAsyncCommunication static method) (core.tensor_parallel.layers.RowParallelLinear method) (core.tensor_parallel.layers.VocabParallelEmbedding method) (core.tensor_parallel.mappings._AllGatherFromTensorParallelRegion static method) (core.tensor_parallel.mappings._AllToAll static method) (core.tensor_parallel.mappings._CopyToModelParallelRegion static method) (core.tensor_parallel.mappings._GatherFromModelParallelRegion static method) (core.tensor_parallel.mappings._GatherFromSequenceParallelRegion static method) (core.tensor_parallel.mappings._ReduceFromModelParallelRegion static method) (core.tensor_parallel.mappings._ReduceScatterToSequenceParallelRegion static method) (core.tensor_parallel.mappings._ReduceScatterToTensorParallelRegion static method) (core.tensor_parallel.mappings._ScatterToModelParallelRegion static method) (core.tensor_parallel.mappings._ScatterToSequenceParallelRegion static method) (core.tensor_parallel.random.CheckpointFunction static method) (core.tensor_parallel.random.CheckpointWithoutOutputFunction static method) (core.transformer.attention.Attention method) (core.transformer.attention.CoreAttention method) (core.transformer.attention.LinearLayer method) (core.transformer.attention.LinearQkv method) (core.transformer.cuda_graphs._CudagraphRecordNode static method) (core.transformer.cuda_graphs._CudagraphReplayNode static method) (core.transformer.custom_layers.batch_invariant_kernels.BatchInvariantRMSNormFn static method) (core.transformer.custom_layers.batch_invariant_kernels.BatchInvariantTEGemmFn static method) (core.transformer.dot_product_attention.DotProductAttention method) (core.transformer.identity_op.IdentityFuncOp method) (core.transformer.identity_op.IdentityOp method) (core.transformer.mlp.MLP method) (core.transformer.module.Float16Module method) (core.transformer.moe.experts.GroupedMLP method) (core.transformer.moe.experts.SequentialMLP method) (core.transformer.moe.experts.TEGroupedMLP method) (core.transformer.moe.fused_a2a.FusedCombine static method) (core.transformer.moe.fused_a2a.FusedDispatch static method) (core.transformer.moe.fused_a2a.HybridEPCombine static method) (core.transformer.moe.fused_a2a.HybridEPDispatch static method) (core.transformer.moe.moe_layer.BaseMoELayer method) (core.transformer.moe.moe_layer.MoELayer method) (core.transformer.moe.moe_layer.RouterInterface method) (core.transformer.moe.moe_utils.MoEAuxLossAutoScaler static method) (core.transformer.moe.moe_utils.RandomSTE static method) (core.transformer.moe.moe_utils.RouterGatingLinearFunction static method) (core.transformer.moe.router.Router method) (core.transformer.moe.router.TopKRouter method) (core.transformer.moe.shared_experts.SharedExpertMLP method) (core.transformer.multi_latent_attention.MultiLatentAttention method) (core.transformer.multi_token_prediction.MTPLossAutoScaler static method) (core.transformer.multi_token_prediction.MultiTokenPredictionBlock method) (core.transformer.multi_token_prediction.MultiTokenPredictionLayer method) (core.transformer.torch_norm.L2Norm method) (core.transformer.transformer_block.TransformerBlock method) (core.transformer.transformer_layer.TransformerLayer method) (core.typed_torch._Module method) (core.utils.MakeViewlessTensor static method) forward_backward_no_pipelining() (in module core.pipeline_parallel.schedules) forward_backward_pipelining_with_interleaving() (in module core.pipeline_parallel.schedules) forward_backward_pipelining_without_interleaving() (in module core.pipeline_parallel.schedules) forward_fused_softmax() (core.fusions.fused_softmax.FusedScaleMaskSoftmax method) forward_impl() (core.models.gpt.fine_grained_callables.PostProcessNode method) (core.models.gpt.fine_grained_callables.PreProcessNode method) (core.models.gpt.fine_grained_callables.TransformerLayerNode method) FORWARD_PASS_ORDER (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.PrefetchOrder attribute) forward_pass_with_pipeline_parallel_large_input_batch() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) forward_pass_with_pipeline_parallel_small_input_batch() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) forward_pass_without_pipeline_parallel() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) (core.inference.model_inference_wrappers.t5.t5_inference_wrapper.T5InferenceWrapper method) forward_record() (core.pipeline_parallel.fine_grained_activation_offload.FineGrainedActivationOffloadingInterface static method) forward_step() (in module core.pipeline_parallel.schedules) forward_step_calc_loss() (in module core.pipeline_parallel.schedules) forward_torch_softmax() (core.fusions.fused_softmax.FusedScaleMaskSoftmax method) fp16 (core.model_parallel_config.ModelParallelConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) fp32_residual_connection (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) fp32_to_float16() (in module core.transformer.module) FP32Optimizer (class in core.optimizer.optimizer) fp4 (core.transformer.transformer_config.TransformerConfig attribute) fp4_param (core.transformer.transformer_config.TransformerConfig attribute) fp4_quantization_recipe (core.extensions.transformer_engine.TEQuantizationRecipe attribute) fp4_quantizer_factory (core.transformer.transformer_config.TransformerConfig attribute) fp4_recipe (core.transformer.transformer_config.TransformerConfig attribute) Fp4Recipe (class in core.enums) fp8 (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) FP8_2D_BLOCKWISE_REAL_QUANT_CFG (in module core.post_training.modelopt.layers) fp8_amax_compute_algo (core.transformer.transformer_config.TransformerConfig attribute) fp8_amax_history_len (core.transformer.transformer_config.TransformerConfig attribute) fp8_create_transpose_cache() (in module core.distributed.fsdp.src.megatron_fsdp.mixed_precision) fp8_dequantize() (in module core.distributed.fsdp.src.megatron_fsdp.mixed_precision) fp8_discard_transpose_cache() (in module core.distributed.fsdp.src.megatron_fsdp.mixed_precision) fp8_dot_product_attention (core.transformer.transformer_config.TransformerConfig attribute) fp8_format (core.extensions.transformer_engine.TEQuantizationRecipe attribute) fp8_get_raw_data() (in module core.distributed.fsdp.src.megatron_fsdp.mixed_precision) fp8_interval (core.transformer.transformer_config.TransformerConfig attribute) fp8_margin (core.transformer.transformer_config.TransformerConfig attribute) fp8_multi_head_attention (core.transformer.transformer_config.TransformerConfig attribute) fp8_need_transpose_data() (in module core.distributed.fsdp.src.megatron_fsdp.mixed_precision) fp8_need_transpose_data_for_meta_device_init() (in module core.distributed.fsdp.src.megatron_fsdp.mixed_precision) fp8_pad_hook() (in module core.models.vision.radio) fp8_param (core.transformer.transformer_config.TransformerConfig attribute) fp8_param_gather (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) FP8_PER_TENSOR_REAL_QUANT_CFG (in module core.post_training.modelopt.layers) fp8_quantization_recipe (core.extensions.transformer_engine.TEQuantizationRecipe attribute) fp8_quantize() (in module core.distributed.fsdp.src.megatron_fsdp.mixed_precision) fp8_quantizer_factory (core.transformer.transformer_config.TransformerConfig attribute) fp8_recipe (core.optimizer.optimizer_config.OptimizerConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) fp8_set_raw_data() (in module core.distributed.fsdp.src.megatron_fsdp.mixed_precision) fp8_wgrad (core.transformer.transformer_config.TransformerConfig attribute) Fp8Recipe (class in core.enums) FP8WeightTransformerLayer (class in core.post_training.modelopt.layers) free() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.FixedPoolAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.RotaryBucketAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.StorageResizeBasedBucketAllocator method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.TemporaryBucketAllocator method) (core.pipeline_parallel.fine_grained_activation_offload.GPUTensorPool method) (core.resharding.nvshmem_copy_service.memory.double_buffer_manager.DoubleBufferManager method) free_bucket_storage() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) freeze() (core.dist_checkpointing.strategies.async_utils.AsyncRequest method) (core.models.multimodal.llava_model.LLaVAModel method) FREEZE_GC (in module core.transformer.cuda_graphs) from_config() (core.inference.contexts.dynamic_context.DynamicInferenceContext class method) (core.inference.contexts.static_context.StaticInferenceContext class method) from_config_dict() (core.quantization.quant_config.RecipeConfig static method) from_pretrained() (core.tokenizers.megatron_tokenizer.MegatronTokenizer method) from_rank_offsets() (core.dist_checkpointing.mapping.ShardedTensor class method) from_request() (core.inference.inference_request.DynamicInferenceRequestRecord class method) from_sh_ten() (core.dist_checkpointing.strategies.checkpointable.CheckpointableShardedTensor class method) from_state_dict() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict class method) from_str() (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout static method) from_yaml_file() (core.quantization.quant_config.RecipeConfig static method) front_backward_chunk() (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) fsdp_double_buffer (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) fsdp_manual_registration (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) fsdp_unit_id (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) fsdp_unit_modules (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.BucketingPolicy attribute) FSDPDistributedIndex (class in core.distributed.fsdp.src.megatron_fsdp.utils) full_iteration (core.transformer.enums.CudaGraphScope attribute) full_validation (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) FullCudaGraphWrapper (class in core.full_cuda_graph) fully_shard() (in module core.distributed.fsdp.src.megatron_fsdp.fully_shard) fully_shard_model() (in module core.distributed.fsdp.src.megatron_fsdp.fully_shard) fully_shard_optimizer() (in module core.distributed.fsdp.src.megatron_fsdp.fully_shard) FullyParallelLoadStrategyWrapper (class in core.dist_checkpointing.strategies.fully_parallel) FullyParallelSaveStrategyWrapper (class in core.dist_checkpointing.strategies.fully_parallel) FullyShardedDataParallel (class in core.distributed.fsdp.mcore_fsdp_adapter) fuse_layernorm_and_linear() (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.BackendSpecProvider method) (core.models.backends.InferenceSpecProvider method) (core.models.backends.LocalSpecProvider method) fused (core.transformer.enums.AttnBackend attribute) fused_apply_mla_rope_for_kv() (in module core.fusions.fused_mla_yarn_rope_apply) fused_apply_mla_rope_for_q() (in module core.fusions.fused_mla_yarn_rope_apply) fused_indices_to_multihot() (in module core.fusions.fused_indices_converter) fused_pad_routing_map() (in module core.fusions.fused_pad_routing_map) fused_single_qkv_rope (core.transformer.transformer_config.TransformerConfig attribute) fused_vocab_parallel_cross_entropy() (in module core.fusions.fused_cross_entropy) FusedCombine (class in core.transformer.moe.fused_a2a) FusedDispatch (class in core.transformer.moe.fused_a2a) FusedLayerNorm (class in core.fusions.fused_layer_norm) FusedScaleMaskSoftmax (class in core.fusions.fused_softmax) future (core.inference.engines.dynamic_engine.RequestEntry attribute) fwd_buffer_reuse_ref_count (in module core.transformer.cuda_graphs) fwd_cudagraph_buffer (core.transformer.cuda_graphs.CudagraphBufferMetadata attribute) FWD_READY (core.transformer.cuda_graphs._GraphStatus attribute) G GATED_ACTIVATION (in module core.export.trtllm.trtllm_weights_converter.utils) gated_linear_unit (core.transformer.transformer_config.TransformerConfig attribute) GatedDeltaNet (class in core.ssm.gated_delta_net) GatedDeltaNetSubmodules (class in core.ssm.gated_delta_net) gather_and_compute_chunk_metadata() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) gather_from_sequence_parallel_region() (in module core.tensor_parallel.mappings) gather_from_tensor_model_parallel_region() (in module core.tensor_parallel.mappings) gather_split_1d_tensor() (in module core.tensor_parallel.utils) gather_uneven_dtensor_to_full_tensor() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) gating() (core.transformer.moe.router.Router method) geglu() (in module core.fusions.fused_bias_geglu) geglu_back() (in module core.fusions.fused_bias_geglu) GeGLUFunction (class in core.fusions.fused_bias_geglu) gelu_impl() (in module core.transformer.utils) GeLUFunction (class in core.fusions.fused_bias_gelu) generate() (core.inference.engines.abstract_engine.AbstractEngine static method) (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) (core.inference.engines.static_engine.StaticInferenceEngine method) generate_all_output_tokens_static_batch() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) generate_cuda_graph_batch_dimensions_list() (core.inference.batch_dimensions_utils.CUDAGraphBatchDimensionBuilder static method) generate_deterministic_data() (in module core.resharding.nvshmem_copy_service.validation) generate_masked_orthogonal_rank_groups() (in module core.parallel_state) GENERATE_NUM (in module core.inference.text_generation_server.dynamic_text_gen_server.endpoints.common) generate_output_tokens_dynamic_batch() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) generate_using_dynamic_engine() (core.inference.engines.static_engine.StaticInferenceEngine method) generate_using_legacy_static_engine() (core.inference.engines.static_engine.StaticInferenceEngine method) generated_length (core.inference.inference_request.InferenceRequest attribute) generated_log_probs (core.inference.inference_request.InferenceRequest attribute) generated_segments (core.inference.inference_request.InferenceRequest attribute) generated_sequence_lengths (core.inference.inference_request.InferenceRequest attribute) generated_text (core.inference.inference_request.InferenceRequest attribute) generated_tokens (core.inference.inference_request.DynamicInferenceRequest attribute) (core.inference.inference_request.InferenceRequest attribute) generated_top_n_logprobs (core.inference.inference_request.InferenceRequest attribute) generator() (core.inference.async_stream.AsyncStream method) get() (core.datasets.gpt_dataset.MockGPTLowLevelDataset method) (core.datasets.indexed_dataset.IndexedDataset method) (core.num_microbatches_calculator.NumMicroBatchesCalculator method) (core.transformer.cuda_graphs.TensorReusePool method) get_A_log() (core.ssm.mamba_context_parallel.MambaContextParallel method) get_active_avail() (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) get_active_request_count() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) get_active_sequence_lengths() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) get_active_used() (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) get_align_size_for_quantization() (in module core.transformer.moe.moe_utils) get_all_ranks() (in module core.parallel_state) get_all_rng_states() (in module core.tensor_parallel.random) get_all_timers_string() (core.timers.Timers method) get_amax_reduction_group() (in module core.parallel_state) get_arg_metas() (core.transformer.cuda_graphs._CudaGraphRunner method) get_asyncio_loop() (in module core.utils) get_attention_mask() (in module core.inference.utils) get_attr_wrapped_model() (in module core.utils) get_aux_loss_coeff() (core.transformer.moe.router.TopKRouter method) get_batch_for_context_window() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) (core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper.GPTInferenceWrapper method) (core.inference.model_inference_wrappers.t5.t5_inference_wrapper.T5InferenceWrapper method) get_batch_invariant_attention_block_size() (in module core.transformer.custom_layers.batch_invariant_kernels) get_batch_on_this_cp_rank() (in module core.utils) get_batch_on_this_hybrid_cp_rank() (in module core.utils) get_batch_per_block() (core.fusions.fused_softmax.FusedScaleMaskSoftmax static method) get_bert_layer_with_transformer_engine_spec() (in module core.models.bert.bert_layer_specs) get_bias_dropout_add() (in module core.fusions.fused_bias_dropout) get_bin_path() (in module core.datasets.indexed_dataset) get_blend_from_list() (in module core.datasets.utils) get_boundary_pp_stage_ranks() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) get_bucket_key() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline method) get_buffer() (in module core.transformer.moe.fused_a2a) get_cached_cos_sin() (core.models.common.embeddings.yarn_rotary_pos_embedding.YarnRotaryEmbedding method) get_canonical_lr_for_logging() (in module core.optimizer_param_scheduler) get_capacity() (in module core.transformer.moe.moe_utils) get_causal_conv1d_version() (in module core.utils) get_comm_stream() (in module core.pipeline_parallel.utils) get_comp_stream() (in module core.pipeline_parallel.utils) get_compute_units() (in module core.transformer.custom_layers.batch_invariant_kernels) get_config_keys() (core.extensions.transformer_engine.TEQuantizationRecipe class method) get_config_logger_path() (in module core.config_logger) get_connected_params() (core.transformer.cuda_graphs._CudaGraphRunner method) get_context() (core.pipeline_parallel.fine_grained_activation_offload.FineGrainedActivationOffloadingInterface static method) get_context_parallel_global_ranks() (in module core.parallel_state) get_context_parallel_group() (in module core.parallel_state) get_context_parallel_rank() (in module core.parallel_state) get_context_parallel_world_size() (in module core.parallel_state) get_conv1d_bias() (core.ssm.mamba_context_parallel.MambaContextParallel method) get_conv1d_weight() (core.ssm.mamba_context_parallel.MambaContextParallel method) get_cos_sin() (core.models.common.embeddings.rotary_pos_embedding.RotaryEmbedding method) get_cpu_offload_context (in module core.transformer.transformer_block) get_cuda_rng_tracker() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.tensor_parallel.random) get_cudagraph_runner() (core.transformer.cuda_graphs.CudaGraphManager method) get_current_global_batch_size() (core.num_microbatches_calculator.NumMicroBatchesCalculator method) (in module core.num_microbatches_calculator) get_current_running_global_batch_size() (core.num_microbatches_calculator.NumMicroBatchesCalculator method) (in module core.num_microbatches_calculator) get_D() (core.ssm.mamba_context_parallel.MambaContextParallel method) get_data_modulo_expert_parallel_group() (in module core.parallel_state) get_data_parallel_group() (in module core.parallel_state) get_data_parallel_group_gloo() (in module core.parallel_state) get_data_parallel_group_if_dtensor() (in module core.utils) get_data_parallel_rank() (in module core.parallel_state) get_data_parallel_rng_tracker_name() (in module core.tensor_parallel.random) get_data_parallel_src_rank() (in module core.parallel_state) get_data_parallel_world_size() (in module core.parallel_state) get_default_causal_mask() (in module core.transformer.utils) get_default_load_sharded_strategy() (in module core.dist_checkpointing.serialization) get_default_pg_collection() (in module core.transformer.moe.moe_utils) get_default_save_common_strategy() (in module core.dist_checkpointing.serialization) get_default_save_sharded_strategy() (in module core.dist_checkpointing.serialization) get_default_strategy() (in module core.dist_checkpointing.strategies.base) get_distributed_index() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) get_document_indices() (core.datasets.indexed_dataset.IndexedDataset method) get_dp_group() (core.distributed.fsdp.src.megatron_fsdp.utils.FSDPDistributedIndex method) get_dsa_module_spec_for_backend() (in module core.models.gpt.experimental_attention_variant_module_specs) get_dt_bias() (core.ssm.mamba_context_parallel.MambaContextParallel method) get_early_return_outputs() (core.transformer.moe.moe_utils.MoECudaGraphPartialCaptureSignal method) get_emb() (core.models.common.embeddings.rotary_pos_embedding.RotaryEmbedding method) (core.models.common.embeddings.yarn_rotary_pos_embedding.YarnRotaryEmbedding method) get_embedding_group() (in module core.parallel_state) get_ep_layer_offset() (in module core.transformer.fsdp_dtensor_checkpoint) get_experimental_attention_variant_module_spec() (in module core.models.gpt.experimental_attention_variant_module_specs) get_expert_data_parallel_group() (in module core.parallel_state) get_expert_data_parallel_group_gloo() (in module core.parallel_state) get_expert_data_parallel_rank() (in module core.parallel_state) get_expert_data_parallel_world_size() (in module core.parallel_state) get_expert_index_from_key() (in module core.transformer.fsdp_dtensor_checkpoint) get_expert_model_parallel_group() (in module core.parallel_state) get_expert_model_parallel_rank() (in module core.parallel_state) get_expert_model_parallel_src_rank() (in module core.parallel_state) get_expert_model_parallel_world_size() (in module core.parallel_state) get_expert_parallel_rng_tracker_name() (in module core.tensor_parallel.random) get_expert_tensor_and_model_parallel_group() (in module core.parallel_state) get_expert_tensor_and_model_parallel_rank() (in module core.parallel_state) get_expert_tensor_and_model_parallel_world_size() (in module core.parallel_state) get_expert_tensor_model_pipeline_parallel_group() (in module core.parallel_state) get_expert_tensor_parallel_group() (in module core.parallel_state) get_expert_tensor_parallel_rank() (in module core.parallel_state) get_expert_tensor_parallel_world_size() (in module core.parallel_state) get_extra_state() (core.tensor_parallel.layers.ColumnParallelLinear method) (core.tensor_parallel.layers.RowParallelLinear method) get_fa_version() (in module core.utils) get_forward_backward_func() (in module core.pipeline_parallel.schedules) get_fp4_align_size() (in module core.fp4_utils) get_fp8_align_size() (in module core.fp8_utils) get_fp8_context() (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan method) get_freqs_non_repeated() (core.models.common.embeddings.rotary_pos_embedding.RotaryEmbedding method) get_fsdp_buffer() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.GradReducePipeline method) get_fsdp_group() (core.distributed.fsdp.src.megatron_fsdp.utils.FSDPDistributedIndex method) get_full_tensor_if_necessary() (in module core.utils) get_func_args() (in module core.nccl_allocator) get_gated_delta_net_module_spec() (in module core.models.gpt.experimental_attention_variant_module_specs) get_global_id_seqlens() (core.datasets.data_schedule.HybridCPDataLoaderWrapper method) get_global_memory_buffer() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.parallel_state) get_global_seqlens() (core.datasets.data_schedule.HybridCPDataLoaderWrapper method) get_global_symmetric_memory_buffer() (in module core.parallel_state) get_global_unique_param_name() (in module core.transformer.fsdp_dtensor_checkpoint) get_gpt_decoder_block_spec() (in module core.models.gpt.gpt_layer_specs) get_gpt_decoder_layer_specs() (in module core.models.gpt.gpt_layer_specs) get_gpt_layer_local_spec() (in module core.models.gpt.gpt_layer_specs) get_gpt_layer_with_inference_spec() (in module core.models.gpt.gpt_layer_specs) get_gpt_layer_with_transformer_engine_spec() (in module core.models.gpt.gpt_layer_specs) get_gpt_modelopt_spec() (in module core.post_training.modelopt.gpt.model_specs) get_gpt_mtp_block_spec() (in module core.models.gpt.gpt_layer_specs) get_gpt_mtp_block_spec_for_backend() (in module core.models.gpt.gpt_layer_specs) get_grad() (core.pipeline_parallel.utils.ScheduleNode method) get_grad_norm() (core.optimizer.layer_wise_optimizer.LayerWiseDistributedOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.MegatronOptimizer method) get_grad_norm_fp32() (in module core.optimizer.clip_grads) get_grad_stats_parallel_group() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.MegatronOptimizer method) get_groups_and_subsamples() (core.pipeline_parallel.hybrid_cp_schedule.BalancedCPScheduler method) get_hf_model_type() (in module core.models.huggingface.module) get_hidden_bytes() (in module core.transformer.moe.fused_a2a) get_hierarchical_context_parallel_groups() (in module core.parallel_state) get_hybrid_data_context_parallel_groups() (in module core.parallel_state) get_idx_path() (in module core.datasets.indexed_dataset) get_index_cache_path() (in module core.datasets.object_storage_utils) get_index_of_chunked_prefill_request() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) get_instance() (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager class method) get_inter_distributed_optimizer_instance_group() (in module core.parallel_state) get_intra_distributed_optimizer_instance_group() (in module core.parallel_state) get_item() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) get_item_from_bucket() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) get_kvcache_utilization_stats() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) get_layer() (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan method) get_layer_id_list() (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout method) get_layer_maps_from_layer_type_list() (in module core.ssm.mamba_hybrid_layer_allocation) get_layer_name_without_prefix() (in module core.export.trtllm.trtllm_layers) get_layer_norm_weights() (core.transformer.transformer_layer.TransformerLayer method) get_layer_offset() (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout method) get_layer_static_inputs() (core.transformer.module.GraphableMegatronModule method) (core.transformer.transformer_layer.TransformerLayer method) get_leader_rank() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) get_linear_attention_pattern() (in module core.models.gpt.experimental_attention_variant_module_specs) get_linear_layer() (in module core.transformer.utils) get_local_model_weights_per_gpu() (core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.SingleDeviceTRTLLMModelWeightsConverter method) get_logical_hybrid_fsdp_rank() (core.distributed.fsdp.src.megatron_fsdp.utils.FSDPDistributedIndex method) get_loss_scale() (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) (core.optimizer.optimizer.MixedPrecisionOptimizer method) get_lr() (core.optimizer_param_scheduler.OptimizerParamScheduler method) get_main_grads_for_grad_norm() (core.optimizer.optimizer.MegatronOptimizer method) get_mamba_inference_state_config_from_model() (in module core.utils) get_mamba_stack_modelopt_spec() (in module core.post_training.modelopt.mamba.model_specs) get_mamba_version() (in module core.utils) get_mask() (core.parallel_state.RankGenerator method) get_max_deduplicated_groups() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) get_max_sequence_lengths() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) get_mcore_tensor_parallel_partition_dim() (in module core.distributed.fsdp.src.megatron_fsdp.utils) get_megatron_muon_optimizer() (in module core.optimizer.muon) get_megatron_optimizer() (in module core.optimizer) get_mem_alloc_context() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) get_mem_size_str() (in module core.inference.contexts.dynamic_context) get_mesh_names() (in module core.distributed.fsdp.src.megatron_fsdp.utils) get_metadata_types() (core.inference.inference_request.DynamicInferenceRequest static method) get_micro_batch_size() (core.num_microbatches_calculator.NumMicroBatchesCalculator method) (in module core.num_microbatches_calculator) get_mismatch_errors() (core.transformer.cuda_graphs._CudaGraphRunner method) get_mlp_layer_norm_weights() (core.transformer.transformer_layer.TransformerLayer method) get_mlp_module_spec() (in module core.models.gpt.gpt_layer_specs) get_mlp_module_spec_for_backend() (in module core.models.gpt.gpt_layer_specs) get_mode() (core.rerun_state_machine.RerunStateMachine method) get_model_config() (in module core.utils) get_model_parallel_group() (in module core.parallel_state) get_model_parallel_src_rank() (in module core.parallel_state) get_model_type() (in module core.utils) get_model_xattn() (in module core.utils) get_module() (in module core.transformer.spec_utils) get_moe_layer_pattern() (in module core.models.gpt.experimental_attention_variant_module_specs) get_moe_layer_wise_logging_tracker() (in module core.transformer.moe.moe_utils) get_moe_module_spec() (in module core.models.gpt.moe_module_specs) get_moe_module_spec_for_backend() (in module core.models.gpt.moe_module_specs) get_mtp_layer_offset() (in module core.transformer.multi_token_prediction) get_mtp_layer_spec() (in module core.transformer.multi_token_prediction) get_mtp_layer_spec_for_backend() (in module core.transformer.multi_token_prediction) get_mtp_num_layers_to_build() (in module core.transformer.multi_token_prediction) get_mtp_ranks() (in module core.transformer.multi_token_prediction) get_nccl_options() (in module core.parallel_state) get_new_request_id() (core.inference.engines.static_engine.StaticInferenceEngine method) (core.inference.scheduler.Scheduler method) get_next_data_parallel_rank() (core.inference.data_parallel_inference_coordinator.DataParallelInferenceCoordinator method) get_num_image_embeddings() (in module core.models.vision.clip_vit_model) get_num_layers_to_build() (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout method) (in module core.transformer.transformer_block) get_num_microbatches() (in module core.num_microbatches_calculator) get_num_stages_from_str() (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout static method) get_num_unfinalized_calls() (core.dist_checkpointing.strategies.async_utils.AsyncCallsQueue method) get_number_of_tokens_per_expert() (core.transformer.moe.token_dispatcher._DeepepManager method) (core.transformer.moe.token_dispatcher._HybridEPManager method) get_object() (core.datasets.object_storage_utils.S3Client method) get_object_storage_access() (in module core.datasets.object_storage_utils) get_optim_param_to_id_map() (in module core.dist_checkpointing.optimizer) get_or_create_service() (in module core.resharding.refit) get_outer_fsdp_group() (core.distributed.fsdp.src.megatron_fsdp.utils.FSDPDistributedIndex method) get_output() (core.pipeline_parallel.utils.ScheduleNode method) (core.transformer.moe.shared_experts.SharedExpertMLP method) get_overlap_moe_expert_parallel_comm_order() (in module core.transformer.cuda_graphs) get_packed_seq_params() (in module core.models.multimodal.context_parallel) get_padded_vocab_size() (core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter.SingleDeviceTRTLLMModelWeightsConverter method) get_padding() (in module core.models.multimodal.context_parallel) get_param_id_to_sharded_param_map() (in module core.dist_checkpointing.optimizer) get_parameter_state_dp_reshardable() (core.optimizer.distrib_optimizer.DistributedOptimizer method) get_parameter_state_dp_zero() (core.optimizer.distrib_optimizer.DistributedOptimizer method) get_parameters() (core.optimizer.optimizer.MegatronOptimizer method) get_path_count() (in module core.config_logger) get_path_with_count() (in module core.config_logger) get_paused_avail() (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) get_paused_used() (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) get_permuted_hidden_states_by_experts() (core.transformer.moe.token_dispatcher._DeepepManager method) (core.transformer.moe.token_dispatcher._DispatchManager method) (core.transformer.moe.token_dispatcher._HybridEPManager method) get_pg() (core.hyper_comm_grid.HyperCommGrid method) get_pg_rank() (in module core.utils) get_pg_size() (in module core.utils) get_pg_src_rank() (in module core.utils) get_pipeline_model_parallel_first_rank() (in module core.parallel_state) get_pipeline_model_parallel_group() (in module core.parallel_state) get_pipeline_model_parallel_last_rank() (in module core.parallel_state) get_pipeline_model_parallel_next_rank() (in module core.parallel_state) get_pipeline_model_parallel_prev_rank() (in module core.parallel_state) get_pipeline_model_parallel_rank() (in module core.parallel_state) get_pipeline_model_parallel_world_size() (in module core.parallel_state) get_pointer() (core.resharding.nvshmem_copy_service.memory.tensor_pointer_utils.TensorPointerExtractor static method) get_pool_status() (core.pipeline_parallel.fine_grained_activation_offload.GPUTensorPool method) get_pos_emb_on_this_cp_rank() (in module core.models.common.embeddings.rope_utils) get_pos_enc() (core.models.vision.radio.RADIOViTModel method) get_position_embedding_group() (in module core.parallel_state) get_pp_first_rank() (in module core.pipeline_parallel.utils) get_pp_last_rank() (in module core.pipeline_parallel.utils) get_pp_next_rank() (in module core.pipeline_parallel.utils) get_pp_prev_rank() (in module core.pipeline_parallel.utils) get_pp_rank_microbatches() (in module core.pipeline_parallel.schedules) get_qattention_params_from_predefined (in module core.extensions.kitchen) get_qfa_params_from_recipe_name (in module core.extensions.kitchen) get_qkv_layer_norm_weights() (core.transformer.transformer_layer.TransformerLayer method) get_qlinear_params_from_predefined (in module core.extensions.kitchen) get_qlinear_params_from_qat_params (in module core.extensions.kitchen) get_quant_config_or_none() (in module core.quantization.utils) get_quantization_context() (core.transformer.cuda_graphs._CudaGraphRunner method) get_quantized_model_init_context_cls() (in module core.distributed.fsdp.src.megatron_fsdp.mixed_precision) get_query_key_value_tensors() (core.transformer.attention.Attention method) (core.transformer.attention.CrossAttention method) (core.transformer.attention.SelfAttention method) (core.transformer.multi_latent_attention.MLASelfAttention method) get_rank_enum() (core.hyper_comm_grid.HyperCommGrid method) get_ranks() (core.parallel_state.RankGenerator method) get_ready_bucket_group_for_reduction() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.GradReducePipeline method) get_recorded_data() (core.transformer.moe.router_replay.RouterReplay static method) get_recorded_indices() (core.transformer.moe.router_replay.RouterReplay method) get_recv_slot() (core.resharding.nvshmem_copy_service.memory.double_buffer_manager.DoubleBufferManager method) get_relative_seq_len() (core.models.common.embeddings.relative_pos_embedding.RelativePositionEmbedding static method) get_replay_topk() (core.transformer.moe.router_replay.RouterReplay method) get_request() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) get_rerun_state_machine() (in module core.rerun_state_machine) get_restored_hidden_states_by_experts() (core.transformer.moe.token_dispatcher._DeepepManager method) (core.transformer.moe.token_dispatcher._DispatchManager method) (core.transformer.moe.token_dispatcher._HybridEPManager method) get_root_mesh() (core.distributed.fsdp.src.megatron_fsdp.utils.FSDPDistributedIndex method) get_rotary_seq_len() (core.models.common.embeddings.rotary_pos_embedding.RotaryEmbedding method) get_save_function_and_args() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync method) get_schedule_table() (in module core.pipeline_parallel.schedules) get_send_slot() (core.resharding.nvshmem_copy_service.memory.double_buffer_manager.DoubleBufferManager method) get_shard_from_bucket() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) get_shard_from_local_buffer() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) get_skipped_iterations_from_tracker_file() (core.rerun_state_machine.RerunStateMachine class method) get_sliding_window_causal_mask() (in module core.transformer.utils) get_standard_config_overrides() (in module core.optimizer) get_states() (core.tensor_parallel.random.CudaRNGStatesTracker method) get_stream() (core.resharding.nvshmem_copy_service.core.gpu_resource_manager.GPUResourceManager method) get_stream_generator() (core.inference.engines.static_engine.StaticInferenceEngine method) get_submesh() (core.distributed.fsdp.src.megatron_fsdp.utils.FSDPDistributedIndex method) get_t5_decoder_with_local_block_spec() (in module core.models.T5.t5_spec) get_t5_decoder_with_transformer_engine_block_spec() (in module core.models.T5.t5_spec) get_t5_encoder_with_local_block_spec() (in module core.models.T5.t5_spec) get_t5_encoder_with_transformer_engine_block_spec() (in module core.models.T5.t5_spec) get_te_version() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.models.bert.bert_model) (in module core.utils) get_tensor() (core.distributed.fsdp.src.megatron_fsdp.utils.GlobalMemoryBuffer method) (core.utils.GlobalMemoryBuffer method) get_tensor_and_context_parallel_group() (in module core.parallel_state) get_tensor_and_context_parallel_rank() (in module core.parallel_state) get_tensor_and_context_parallel_world_size() (in module core.parallel_state) get_tensor_and_data_parallel_group() (in module core.parallel_state) get_tensor_model_parallel_group() (in module core.parallel_state) get_tensor_model_parallel_group_if_none() (in module core.utils) get_tensor_model_parallel_rank() (in module core.parallel_state) get_tensor_model_parallel_src_rank() (in module core.parallel_state) get_tensor_model_parallel_world_size() (in module core.parallel_state) get_tensor_shapes() (in module core.pipeline_parallel.schedules) get_tensors() (core.transformer.cuda_graphs._CudaGraphRunner method) get_text_embeddings() (core.models.mimo.model.base.MimoModel method) get_thd_batch_on_this_cp_rank() (in module core.utils) get_tokens_per_expert_and_token_count() (in module core.transformer.moe.moe_utils) get_torch_stream() (core.resharding.nvshmem_copy_service.core.gpu_resource_manager.GPUResourceManager method) get_torch_version() (in module core.utils) get_total() (core.energy_monitor.EnergyMonitor method) get_total_used() (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) get_total_workload() (core.pipeline_parallel.hybrid_cp_schedule.BalancedCPScheduler method) get_transformer_block_with_experimental_attention_variant_spec() (in module core.models.gpt.experimental_attention_variant_module_specs) get_transformer_layer_offset() (in module core.transformer.transformer_layer) get_trtllm_pretrained_config_and_model_weights() (core.export.trtllm.trtllm_helper.TRTLLMHelper method) get_unflattened_state_dict() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) get_updated_expert_bias() (in module core.transformer.moe.moe_utils) get_virtual_pipeline_model_parallel_rank() (in module core.parallel_state) get_virtual_pipeline_model_parallel_world_size() (in module core.parallel_state) get_vit_layer_with_local_spec() (in module core.models.vision.vit_layer_specs) get_vit_layer_with_transformer_engine_spec() (in module core.models.vision.vit_layer_specs) get_wd() (core.optimizer_param_scheduler.OptimizerParamScheduler method) global_expert_index (core.resharding.utils.ParameterMetadata attribute) global_mempool (core.transformer.cuda_graphs.CudaGraphManager attribute) global_offset (core.dist_checkpointing.mapping.ShardedObject attribute) (core.dist_checkpointing.mapping.ShardedTensor attribute) global_router_replay_instances (core.transformer.moe.router_replay.RouterReplay attribute) global_shape (core.dist_checkpointing.mapping.ShardedObject attribute) (core.dist_checkpointing.mapping.ShardedTensor attribute) global_slice() (core.dist_checkpointing.mapping.ShardedTensor method) GlobalMemoryBuffer (class in core.distributed.fsdp.src.megatron_fsdp.utils) (class in core.utils) GlobalSymmetricMemoryBuffer (class in core.utils) GlobMatcher (class in core.quantization.quant_config) GlooCopyService (class in core.resharding.copy_services.gloo_copy_service) glu_linear_offset (core.transformer.transformer_config.TransformerConfig attribute) GPTDataset (class in core.datasets.gpt_dataset) GPTDatasetConfig (class in core.datasets.gpt_dataset) GPTInferenceWrapper (class in core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper) GPTModel (class in core.models.gpt.gpt_model) GPTTokenizer (class in core.tokenizers.text.models.gpt_tokenizer) gpu_plan (core.resharding.nvshmem_copy_service.nvshmem_types.ScheduledBatch attribute) GPUExecutionPlanner (class in core.resharding.nvshmem_copy_service.planning.gpu_execution_planner) GPUResourceManager (class in core.resharding.nvshmem_copy_service.core.gpu_resource_manager) gpus_needed() (core.pipeline_parallel.hybrid_cp_schedule.BalancedCPScheduler method) GPUTensorPool (class in core.pipeline_parallel.fine_grained_activation_offload) GRAD (core.distributed.param_and_grad_buffer.BufferType attribute) grad_reduce_in_fp32 (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) grad_scale_func (core.model_parallel_config.ModelParallelConfig attribute) grad_sync_func (core.model_parallel_config.ModelParallelConfig attribute) gradient_accumulation_fusion (core.model_parallel_config.ModelParallelConfig attribute) gradient_reduce_div_fusion (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) gradient_reduce_preprocessing() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) GradReducePipeline (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) grads_states_parallel_group_is_shared() (core.optimizer.optimizer.ChainedOptimizer method) GraphableMegatronModule (class in core.transformer.module) graphs_created() (core.transformer.cuda_graphs.TECudaGraphHelper method) group_commit() (core.pipeline_parallel.fine_grained_activation_offload.FineGrainedActivationOffloadingInterface static method) group_limited_topk() (in module core.transformer.moe.moe_utils) grouped_gemm_is_available() (in module core.transformer.moe.grouped_gemm_util) grouped_mlp_modules() (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.BackendSpecProvider method) (core.models.backends.InferenceSpecProvider method) (core.models.backends.LocalSpecProvider method) GroupedMLP (class in core.transformer.moe.experts) H h2d_stream (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager property) handle_experts_in_state_dict() (in module core.transformer.fsdp_dtensor_checkpoint) handle_fp8_extra_state_case() (in module core.transformer.fsdp_dtensor_checkpoint) handle_swiglu_in_state_dict() (in module core.transformer.fsdp_dtensor_checkpoint) has_config_logger_enabled() (in module core.config_logger) has_explicit_chunked_prefill_req (core.inference.batch_dimensions_utils.InferenceBatchDimensions attribute) has_final_layernorm_in_this_stage() (core.transformer.transformer_block.TransformerBlock method) has_regular_grid (core.dist_checkpointing.mapping.ShardedTensor property) has_separate_all_gather_group() (in module core.parallel_state) has_unfinished_requests() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) HAVE_APEX_OR_TE (in module core.optimizer.distrib_optimizer) HAVE_KITCHEN (in module core.extensions.kitchen) have_requests_pending() (core.inference.scheduler.Scheduler method) HAVE_TE (in module core.fp4_utils) (in module core.fp8_utils) HAVE_TE_FP4_TENSOR_CLASS (in module core.fp4_utils) HAVE_TE_FP8_TENSOR_CLASS (in module core.fp8_utils) hcp (core.process_groups_config.ProcessGroupCollection attribute) head_object() (core.datasets.object_storage_utils.S3Client method) Headers (class in core.inference.headers) hetereogenous_dist_checkpoint (core.transformer.transformer_config.TransformerConfig attribute) heterogeneous_block_specs (core.transformer.transformer_config.TransformerConfig attribute) hidden_dropout (core.transformer.transformer_config.TransformerConfig attribute) hidden_size (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) hidden_states (core.transformer.moe.moe_utils.MoECudaGraphTensorStore attribute) hierarchical_context_parallel_sizes (core.model_parallel_config.ModelParallelConfig attribute) hnorm (core.transformer.multi_token_prediction.MultiTokenPredictionLayerSubmodules attribute) hsdp_gbuf (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) hsdp_wbuf (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) HuggingFaceModule (class in core.models.huggingface.module) HuggingFaceTokenizer (class in core.tokenizers.text.libraries.huggingface_tokenizer) hybrid_context_parallel (core.datasets.gpt_dataset.GPTDatasetConfig attribute) (core.model_parallel_config.ModelParallelConfig attribute) hybrid_context_parallel_forward_backward() (in module core.pipeline_parallel.hybrid_cp_schedule) HybridCPDataLoaderWrapper (class in core.datasets.data_schedule) HybridDeviceOptimizer (class in core.optimizer.cpu_offloading.hybrid_optimizer) HybridEPCombine (class in core.transformer.moe.fused_a2a) HybridEPDispatch (class in core.transformer.moe.fused_a2a) HyperCommGrid (class in core.hyper_comm_grid) hysteresis (core.optimizer.optimizer_config.OptimizerConfig attribute) I id_to_token() (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) IdentityFuncOp (class in core.transformer.identity_op) IdentityOp (class in core.transformer.identity_op) IDLE (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.TrainingState attribute) ids_to_text() (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract method) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer method) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) ids_to_tokens() (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract method) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer method) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) idx (core.dist_checkpointing.strategies.async_utils._ActiveAsyncRequest attribute) IGNORE_ALL (core.dist_checkpointing.validation.StrictHandling attribute) IGNORE_INDEX (in module core.models.multimodal.llava_model) image_h (core.datasets.multimodal_dataset.MultimodalDatasetConfig attribute) IMAGE_TOKEN (in module core.models.multimodal.llava_model) image_w (core.datasets.multimodal_dataset.MultimodalDatasetConfig attribute) imgs (core.inference.inference_request.VLMInferenceRequest attribute) import_module() (in module core.transformer.spec_utils) import_package() (core.msc_utils._FeatureFlag method) in_proj (core.ssm.gated_delta_net.GatedDeltaNetSubmodules attribute) (core.ssm.mamba_mixer.MambaMixerSubmodules attribute) increment_batch_size_offset() (core.inference.contexts.base_context.BaseInferenceContext method) increment_sequence_len_offset() (core.inference.contexts.base_context.BaseInferenceContext method) IndexedDataset (class in core.datasets.indexed_dataset) IndexedDatasetBuilder (class in core.datasets.indexed_dataset) IndicesToMultihot (class in core.fusions.fused_indices_converter) inference_batch_times_seqlen_threshold (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) inference_fuse_tp_communication (core.transformer.transformer_config.TransformerConfig attribute) inference_max_requests (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) inference_max_seq_length (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) inference_parameters (core.inference.inference_request.InferenceRequest attribute) inference_params (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper property) inference_pp_size (core.export.export_config.ExportConfig attribute) inference_rng_tracker (core.transformer.transformer_config.TransformerConfig attribute) inference_sampling_seed (core.transformer.transformer_config.TransformerConfig attribute) inference_tp_size (core.export.export_config.ExportConfig attribute) InferenceBatchDimensions (class in core.inference.batch_dimensions_utils) InferenceClient (class in core.inference.inference_client) InferenceLayerNormColumnParallelLinear (class in core.tensor_parallel.inference_layers) InferenceRequest (class in core.inference.inference_request) InferenceRowParallelLinear (class in core.tensor_parallel.inference_layers) InferenceSpecProvider (class in core.models.backends) InferenceWrapperConfig (class in core.inference.model_inference_wrappers.inference_wrapper_config) info() (core.resharding.nvshmem_copy_service.logger.PELogger class method) init() (core.resharding.nvshmem_copy_service.core.gpu_resource_manager.GPUResourceManager method) (core.resharding.nvshmem_copy_service.logger.PELogger class method) (core.resharding.nvshmem_copy_service.service.RemoteCopyService method) (in module core.nccl_allocator) init_backward_dw_wrapper() (core.transformer.module.GraphableMegatronModule method) init_chunk_handler() (core.pipeline_parallel.fine_grained_activation_offload.FineGrainedActivationOffloadingInterface static method) init_cuda_graph_cache() (in module core.transformer.utils) init_data() (core.dist_checkpointing.mapping.ShardedTensor method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) init_hybrid_ep_buffer() (in module core.transformer.moe.fused_a2a) init_method (core.transformer.transformer_config.TransformerConfig attribute) init_method_normal() (in module core.utils) init_method_std (core.transformer.transformer_config.TransformerConfig attribute) init_model_chunk_offload_handler() (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) init_model_with_meta_device (core.transformer.transformer_config.TransformerConfig attribute) init_num_microbatches_calculator() (in module core.num_microbatches_calculator) init_tensors() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict method) initial_loss_scale (core.optimizer.optimizer_config.OptimizerConfig attribute) INITIAL_RUN (core.rerun_state_machine.RerunState attribute) (core.rerun_state_machine.RerunValidationStatus attribute) initialize() (core.datasets.indexed_dataset.IndexedDataset method) initialize_attention_state() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) initialize_model_parallel() (in module core.parallel_state) initialize_rerun_state_machine() (in module core.rerun_state_machine) initialize_rng_tracker() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.tensor_parallel.random) initialized (core.resharding.nvshmem_copy_service.service.RemoteCopyService property) input_layernorm (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) input_layernorm_bias (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) input_layernorm_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) input_use_count (core.transformer.cuda_graphs.CudagraphBufferMetadata attribute) insert() (core.transformer.cuda_graphs.TensorReusePool method) insert_tensors() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict method) inspect_types() (in module core.dist_checkpointing.dict_utils) install_optimized_model_weights() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) int16 (core.datasets.indexed_dataset.DType attribute) int32 (core.datasets.indexed_dataset.DType attribute) int64 (core.datasets.indexed_dataset.DType attribute) int8 (core.datasets.indexed_dataset.DType attribute) inter_dist_opt (core.process_groups_config.ProcessGroupCollection attribute) internal_api() (in module core.utils) intra_dist_opt (core.process_groups_config.ProcessGroupCollection attribute) intra_dp_cp (core.process_groups_config.ProcessGroupCollection attribute) intra_expt_dp (core.process_groups_config.ProcessGroupCollection attribute) inv_scale (core.optimizer.grad_scaler.MegatronGradScaler property) inv_vocab (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) is_applicable_for_batch_dim() (core.inference.batch_dimensions_utils.InferenceBatchDimensions method) is_aux_loss_enabled() (core.transformer.moe.router.TopKRouter method) is_batch_invariant_mode_enabled() (in module core.transformer.custom_layers.batch_invariant_kernels) is_blockwise_float8tensor() (in module core.distributed.fsdp.src.megatron_fsdp.mixed_precision) is_causal_conv1d_min_version() (in module core.utils) IS_CHECKPOINTING (in module core.tensor_parallel.random) is_checkpointing() (in module core.tensor_parallel.random) is_column_parallel_linear() (in module core.fp8_utils) is_cudagraph_input (core.transformer.cuda_graphs.CudagraphBufferMetadata attribute) is_cudagraph_output (core.transformer.cuda_graphs.CudagraphBufferMetadata attribute) is_current_async_call_done() (core.dist_checkpointing.strategies.async_utils.AsyncCaller method) (core.dist_checkpointing.strategies.async_utils.PersistentAsyncCaller method) (core.dist_checkpointing.strategies.async_utils.TemporalAsyncCaller method) is_current_rank_in_grid() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) (core.pipeline_parallel.multimodule_communicator.MultiModulePipelineCommunicator method) is_decode_only() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) (core.inference.contexts.static_context.StaticInferenceContext method) is_dynamic_batching() (core.inference.contexts.base_context.BaseInferenceContext method) is_empty() (core.transformer.moe.moe_utils.MoECudaGraphTensorStore method) is_empty_chunk() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) is_enabled() (core.msc_utils._FeatureFlag method) is_ep (core.resharding.utils.ParameterMetadata attribute) is_experimental_enabled() (in module core.config) is_expert_param (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) is_fa_min_version() (in module core.utils) is_first_last_bf16_layer() (in module core.fp8_utils) is_float8tensor() (in module core.dist_checkpointing.exchange_utils) (in module core.distributed.fsdp.src.megatron_fsdp.mixed_precision) (in module core.fp8_utils) is_frozen (core.dist_checkpointing.strategies.async_utils.AsyncRequest attribute) is_gated_activation() (in module core.export.trtllm.trtllm_weights_converter.utils) is_graph_capturing() (in module core.transformer.cuda_graphs) is_graph_safe_cuda_rng_tracker() (in module core.tensor_parallel.random) is_graph_warmup() (in module core.transformer.cuda_graphs) is_hollow (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict property) is_hybrid_model (core.transformer.transformer_config.TransformerConfig attribute) is_initialized() (core.extensions.transformer_engine.TECudaRNGStatesTracker method) (core.tensor_parallel.random.CudaRNGStatesTracker method) (in module core.parallel_state) is_kernel_available() (core.fusions.fused_softmax.FusedScaleMaskSoftmax method) is_layer_window_attention() (in module core.transformer.utils) is_linear_attention_variant() (in module core.models.gpt.experimental_attention_variant_module_specs) is_main_replica() (in module core.dist_checkpointing.mapping) is_mamba_min_version() (in module core.utils) is_mcore_tensor_model_parallel() (in module core.distributed.fsdp.src.megatron_fsdp.utils) is_mcore_tensor_parallel_duplicated() (in module core.distributed.fsdp.src.megatron_fsdp.utils) is_memory_available() (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) is_mxfp8tensor() (in module core.fp8_utils) is_nvfp4tensor() (in module core.fp4_utils) is_object_storage_path() (in module core.datasets.object_storage_utils) is_pipeline_first_stage() (in module core.inference.communication_utils) (in module core.parallel_state) is_pipeline_last_stage() (in module core.inference.communication_utils) (in module core.parallel_state) is_pp_first_stage (core.pipeline_parallel.multimodule_communicator.MultiModulePipelineCommunicator property) is_pp_first_stage() (in module core.pipeline_parallel.utils) is_pp_last_stage (core.pipeline_parallel.multimodule_communicator.MultiModulePipelineCommunicator property) is_pp_last_stage() (in module core.pipeline_parallel.utils) is_rank_in_embedding_group() (in module core.parallel_state) is_rank_in_position_embedding_group() (in module core.parallel_state) is_row_parallel_linear() (in module core.fp8_utils) is_send (core.resharding.utils.TransferOp attribute) is_single_shape() (in module core.pipeline_parallel.p2p_communication) is_source_stage (core.pipeline_parallel.multimodule_communicator.RankModuleInfo attribute) is_static_batching() (core.inference.contexts.base_context.BaseInferenceContext method) (core.inference.contexts.dynamic_context.DynamicInferenceContext method) (core.inference.contexts.static_context.StaticInferenceContext method) is_submodule() (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.utils) is_te_min_version() (in module core.distributed.fsdp.src.megatron_fsdp.mixed_precision) (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.utils) is_terminal_stage (core.pipeline_parallel.multimodule_communicator.RankModuleInfo attribute) is_torch_min_version() (in module core.utils) is_tp (core.resharding.utils.ParameterMetadata attribute) is_unexpectedly_large() (core.rerun_state_machine.RerunStateMachine method) is_unitialized() (in module core.parallel_state) is_using_quantization_scales() (in module core.utils) is_valid() (core.inference.batch_dimensions_utils.InferenceBatchDimensions method) is_vp_first_stage() (in module core.pipeline_parallel.utils) is_vp_last_stage() (in module core.pipeline_parallel.utils) items() (core.optimizer.optimizer.ProxyDict method) iteration (core.resharding.nvshmem_copy_service.nvshmem_types.ScheduledBatch attribute) (core.resharding.nvshmem_copy_service.validation.ValidationResult attribute) J jit_fuser (in module core.jit) JSONEncoderWithMcoreTypes (class in core.config_logger) K k_layernorm (core.transformer.attention.SelfAttentionSubmodules attribute) keep_fp8_transpose_cache (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) KEEP_VARS_HINT (in module core.dist_checkpointing.optimizer) KernelLauncher (class in core.resharding.nvshmem_copy_service.core.kernel_launcher) key (core.dist_checkpointing.mapping.ShardedBase attribute) (core.dist_checkpointing.mapping.ShardedObject attribute) (core.dist_checkpointing.mapping.ShardedTensor attribute) (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) key_value_cache() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) kitchen_attention_backend (core.transformer.transformer_config.TransformerConfig attribute) kitchen_quantization_recipe_config() (in module core.quantization.utils) KitchenColumnParallelGroupedLinear (in module core.extensions.kitchen) KitchenColumnParallelLinear (in module core.extensions.kitchen) KitchenDotProductAttention (in module core.extensions.kitchen) KitchenFlashAttention (in module core.extensions.kitchen) KitchenLayerNormColumnParallelLinear (in module core.extensions.kitchen) KitchenRowParallelGroupedLinear (in module core.extensions.kitchen) KitchenRowParallelLinear (in module core.extensions.kitchen) KitchenSpecProvider (in module core.extensions.kitchen) kv_channels (core.transformer.transformer_config.TransformerConfig attribute) kv_layernorm (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) kv_lora_rank (core.transformer.transformer_config.MLATransformerConfig attribute) L L2Norm (class in core.transformer.torch_norm) language_model_spec (core.models.mimo.config.base_configs.MimoModelConfig attribute) LanguageModelEmbedding (class in core.models.common.embeddings.language_model_embedding) LanguageModule (class in core.models.common.language_module.language_module) lap() (core.energy_monitor.EnergyMonitor method) last_token_logits() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) latency (core.inference.inference_request.DynamicInferenceRequest attribute) (core.inference.inference_request.DynamicInferenceRequestRecord attribute) launch_pack() (core.resharding.nvshmem_copy_service.core.kernel_launcher.KernelLauncher method) launch_unpack() (core.resharding.nvshmem_copy_service.core.kernel_launcher.KernelLauncher method) layer_norm (core.transformer.multi_token_prediction.MultiTokenPredictionLayerSubmodules attribute) (core.transformer.transformer_block.TransformerBlockSubmodules attribute) layer_norm() (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.BackendSpecProvider method) (core.models.backends.InferenceSpecProvider method) (core.models.backends.LocalSpecProvider method) layer_number (core.quantization.quant_config.MatchContext attribute) layer_specs (core.transformer.multi_token_prediction.MultiTokenPredictionBlockSubmodules attribute) (core.transformer.transformer_block.TransformerBlockSubmodules attribute) layernorm_epsilon (core.transformer.transformer_config.TransformerConfig attribute) layernorm_zero_centered_gamma (core.transformer.transformer_config.TransformerConfig attribute) LayerType (class in core.transformer.enums) LayerWiseDistributedOptimizer (class in core.optimizer.layer_wise_optimizer) Linear (class in core.post_training.modelopt.layers) linear() (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.InferenceSpecProvider method) linear_attention_freq (core.transformer.transformer_config.TransformerConfig attribute) linear_conv_kernel_dim (core.transformer.transformer_config.TransformerConfig attribute) linear_fc1 (core.transformer.mlp.MLPSubmodules attribute) linear_fc1_forward_and_act() (core.transformer.moe.shared_experts.SharedExpertMLP method) linear_fc2 (core.transformer.mlp.MLPSubmodules attribute) linear_fc2_forward() (core.transformer.moe.shared_experts.SharedExpertMLP method) linear_key_head_dim (core.transformer.transformer_config.TransformerConfig attribute) linear_kv (core.transformer.attention.CrossAttentionSubmodules attribute) linear_kv_down_proj (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) linear_kv_up_proj (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) linear_num_key_heads (core.transformer.transformer_config.TransformerConfig attribute) linear_num_value_heads (core.transformer.transformer_config.TransformerConfig attribute) linear_proj (core.transformer.attention.CrossAttentionSubmodules attribute) (core.transformer.attention.SelfAttentionSubmodules attribute) (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) linear_q (core.transformer.attention.CrossAttentionSubmodules attribute) linear_q_down_proj (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) linear_q_proj (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) linear_q_up_proj (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) linear_qkv (core.transformer.attention.SelfAttentionSubmodules attribute) linear_value_head_dim (core.transformer.transformer_config.TransformerConfig attribute) linear_with_frozen_weight() (in module core.tensor_parallel.layers) linear_with_grad_accumulation_and_async_allreduce() (in module core.tensor_parallel.layers) LinearLayer (class in core.transformer.attention) LinearLayerBuilder (class in core.transformer.attention) LinearQkv (class in core.transformer.attention) LinearQkvBuilder (class in core.transformer.attention) LinearWithFrozenWeight (class in core.tensor_parallel.layers) LinearWithGradAccumulationAndAsyncCommunication (class in core.tensor_parallel.layers) LLaVAModel (class in core.models.multimodal.llava_model) lm_head (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) load() (core.dist_checkpointing.strategies.base.LoadShardedStrategy method) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper method) (core.dist_checkpointing.strategies.torch.TorchDistLoadShardedStrategy method) (in module core.dist_checkpointing.serialization) load_and_upcycle_model() (in module core.transformer.moe.upcycling_utils) LOAD_COMMON (core.dist_checkpointing.strategies.base.StrategyAction attribute) load_common() (core.dist_checkpointing.strategies.base.LoadCommonStrategy method) (core.dist_checkpointing.strategies.common.TorchCommonLoadStrategy method) load_common_state_dict() (in module core.dist_checkpointing.serialization) load_content_metadata() (in module core.dist_checkpointing.serialization) load_kernels() (core.resharding.nvshmem_copy_service.core.kernel_launcher.KernelLauncher method) load_parameter_state() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) load_parameter_state_from_dp_reshardable() (core.optimizer.distrib_optimizer.DistributedOptimizer method) load_parameter_state_from_dp_zero() (core.optimizer.distrib_optimizer.DistributedOptimizer method) load_parameter_state_from_dp_zero_legacy() (core.optimizer.distrib_optimizer.DistributedOptimizer method) load_parameter_state_from_fs_model_space() (core.optimizer.distrib_optimizer.DistributedOptimizer method) load_parameter_state_from_fully_reshardable() (core.optimizer.distrib_optimizer.DistributedOptimizer method) load_plain_tensors() (in module core.dist_checkpointing.serialization) load_preprocess() (in module core.dist_checkpointing.state_dict_utils) load_quantization_recipe() (in module core.quantization.utils) LOAD_SHARDED (core.dist_checkpointing.strategies.base.StrategyAction attribute) load_sharded_metadata() (core.dist_checkpointing.strategies.base.LoadCommonStrategy method) (core.dist_checkpointing.strategies.base.LoadShardedStrategy method) (core.dist_checkpointing.strategies.common.TorchCommonLoadStrategy method) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper method) (core.dist_checkpointing.strategies.torch.TorchDistLoadShardedStrategy method) (in module core.dist_checkpointing.serialization) load_sharded_objects() (core.dist_checkpointing.strategies.base.LoadCommonStrategy method) (core.dist_checkpointing.strategies.common.TorchCommonLoadStrategy method) load_state_dict() (core.distributed.data_parallel_base._BaseDataParallel method) (core.distributed.fsdp.mcore_fsdp_adapter.FullyShardedDataParallel method) (core.distributed.torch_fully_sharded_data_parallel.TorchFullyShardedDataParallel method) (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.grad_scaler.ConstantGradScaler method) (core.optimizer.grad_scaler.DynamicGradScaler method) (core.optimizer.grad_scaler.MegatronGradScaler method) (core.optimizer.layer_wise_optimizer.LayerWiseDistributedOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) (core.optimizer_param_scheduler.OptimizerParamScheduler method) (core.rerun_state_machine.RerunDataIterator method) (core.rerun_state_machine.RerunErrorInjector method) (core.rerun_state_machine.RerunStateMachine method) (core.transformer.module.Float16Module method) load_state_dict_from_file() (core.optimizer.layer_wise_optimizer.LayerWiseDistributedOptimizer method) load_tensors_metadata() (core.dist_checkpointing.strategies.base.LoadShardedStrategy method) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelLoadStrategyWrapper method) (core.dist_checkpointing.strategies.torch.TorchDistLoadShardedStrategy method) (in module core.dist_checkpointing.serialization) LoadCommonStrategy (class in core.dist_checkpointing.strategies.base) LoadShardedStrategy (class in core.dist_checkpointing.strategies.base) LoadStrategyBase (class in core.dist_checkpointing.strategies.base) local (core.transformer.enums.AttnBackend attribute) local_chunk_offset_in_global() (core.dist_checkpointing.mapping.ShardedTensor method) local_cp_size (core.packed_seq_params.PackedSeqParams attribute) local_multi_tensor_applier() (in module core.utils) local_multi_tensor_l2_norm() (in module core.utils) local_multi_tensor_scale() (in module core.utils) local_shape (core.dist_checkpointing.mapping.ShardedTensor attribute) LocalNonpersistentObject (class in core.dist_checkpointing.mapping) LocalShardsContainer (class in core.dist_checkpointing.strategies.checkpointable) LocalSpecProvider (class in core.models.backends) locate_item_in_global_item() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) LOCK (in module core.inference.text_generation_server.dynamic_text_gen_server.endpoints.common) log() (core.timers.Timers method) LOG_ALL (core.dist_checkpointing.validation.StrictHandling attribute) log_config_to_disk() (in module core.config_logger) log_max_attention_logit (core.transformer.transformer_config.TransformerConfig attribute) log_num_zeros_in_grad (core.optimizer.optimizer_config.OptimizerConfig attribute) log_on_each_pipeline_stage() (in module core.utils) log_single_rank() (in module core._rank_utils) log_softmax() (in module core.transformer.custom_layers.batch_invariant_kernels) LOG_UNEXPECTED (core.dist_checkpointing.validation.StrictHandling attribute) log_validation_summary() (in module core.resharding.nvshmem_copy_service.validation) logger (in module core.datasets.blended_dataset) (in module core.datasets.blended_megatron_dataset_builder) (in module core.datasets.blended_megatron_dataset_config) (in module core.datasets.gpt_dataset) (in module core.datasets.indexed_dataset) (in module core.datasets.masked_dataset) (in module core.datasets.megatron_tokenizer) (in module core.datasets.utils) (in module core.dist_checkpointing.exchange_utils) (in module core.dist_checkpointing.mapping) (in module core.dist_checkpointing.optimizer) (in module core.dist_checkpointing.serialization) (in module core.dist_checkpointing.strategies.async_utils) (in module core.dist_checkpointing.strategies.common) (in module core.dist_checkpointing.strategies.filesystem_async) (in module core.dist_checkpointing.strategies.fully_parallel) (in module core.dist_checkpointing.strategies.state_dict_saver) (in module core.dist_checkpointing.strategies.torch) (in module core.dist_checkpointing.tensor_aware_state_dict) (in module core.dist_checkpointing.validation) (in module core.distributed.distributed_data_parallel) (in module core.distributed.fsdp.mcore_fsdp_adapter) (in module core.distributed.fsdp.src.megatron_fsdp.fully_shard) (in module core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp) (in module core.distributed.fsdp.src.megatron_fsdp.mixed_precision) (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) (in module core.distributed.fsdp.src.megatron_fsdp.utils) (in module core.distributed.param_and_grad_buffer) (in module core.full_cuda_graph) (in module core.inference.text_generation_server.dynamic_text_gen_server.endpoints.chat_completions) (in module core.inference.text_generation_server.dynamic_text_gen_server.endpoints.completions) (in module core.inference.text_generation_server.dynamic_text_gen_server.endpoints.health) (in module core.inference.text_generation_server.dynamic_text_gen_server.flask_server) (in module core.models.common.embeddings.relative_pos_embedding) (in module core.models.common.embeddings.rope_utils) (in module core.models.common.embeddings.rotary_pos_embedding) (in module core.models.common.embeddings.yarn_rotary_pos_embedding) (in module core.models.mimo.model.base) (in module core.msc_utils) (in module core.nccl_allocator) (in module core.num_microbatches_calculator) (in module core.optimizer) (in module core.optimizer.distrib_optimizer) (in module core.optimizer.layer_wise_optimizer) (in module core.optimizer.muon) (in module core.optimizer.optimizer) (in module core.optimizer_param_scheduler) (in module core.parallel_state) (in module core.pipeline_parallel.utils) (in module core.post_training.modelopt.gpt.state_dict_hooks) (in module core.post_training.modelopt.layers) (in module core.quantization.quant_config) (in module core.rerun_state_machine) (in module core.resharding.copy_services.gloo_copy_service) (in module core.resharding.copy_services.nccl_copy_service) (in module core.resharding.copy_services.nvshmem_copy_service) (in module core.resharding.execution) (in module core.resharding.nvshmem_copy_service.core.gpu_resource_manager) (in module core.resharding.nvshmem_copy_service.planning.task_segmenter) (in module core.resharding.planner) (in module core.resharding.utils) (in module core.ssm.gated_delta_net) (in module core.ssm.mamba_hybrid_layer_allocation) (in module core.ssm.mamba_mixer) (in module core.timers) (in module core.tokenizers.megatron_tokenizer) (in module core.tokenizers.text.libraries.huggingface_tokenizer) (in module core.tokenizers.text.libraries.megatron_hf_tokenizer) (in module core.tokenizers.text.libraries.tiktoken_tokenizer) (in module core.transformer.cuda_graphs) (in module core.transformer.fsdp_dtensor_checkpoint) (in module core.transformer.mlp) (in module core.transformer.moe.experts) (in module core.transformer.moe.token_dispatcher) (in module core.transformer.pipeline_parallel_layer_layout) (in module core.transformer.spec_utils) (in module core.transformer.transformer_block) (in module core.transformer.transformer_layer) (in module core.utils) logger_stack() (in module core.dist_checkpointing.utils) loss (core.transformer.enums.LayerType attribute) loss_scale (core.optimizer.optimizer_config.OptimizerConfig attribute) loss_scale_window (core.optimizer.optimizer_config.OptimizerConfig attribute) LowLevelDataset (in module core.datasets.megatron_dataset) lr (core.optimizer.optimizer_config.OptimizerConfig attribute) M main_grad_buffer (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) main_grads_dtype (core.optimizer.optimizer_config.OptimizerConfig attribute) main_loss_backward_scale (core.transformer.moe.moe_utils.MoEAuxLossAutoScaler attribute) (core.transformer.multi_token_prediction.MTPLossAutoScaler attribute) main_params_dtype (core.optimizer.optimizer_config.OptimizerConfig attribute) main_rank_for_shard (core.dist_checkpointing.exchange_utils.ShardDistribution attribute) main_weight_buffer (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) MAJOR (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) make_buckets_equal() (core.pipeline_parallel.hybrid_cp_schedule.BalancedCPScheduler method) make_fsdp_dtensor() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) make_sharded_object_for_checkpoint() (in module core.transformer.utils) make_sharded_optimizer_tensor() (in module core.dist_checkpointing.optimizer) make_sharded_tensor_for_checkpoint() (in module core.utils) make_sharded_tensors_for_checkpoint() (in module core.transformer.utils) make_tp_sharded_tensor_for_checkpoint() (in module core.utils) make_viewless() (in module core.pipeline_parallel.utils) make_viewless_tensor() (in module core.utils) MakeViewlessTensor (class in core.utils) MAMBA (core.ssm.mamba_hybrid_layer_allocation.Symbols attribute) mamba (core.transformer.enums.CudaGraphScope attribute) mamba_bda (core.ssm.mamba_layer.MambaLayerSubmodules attribute) mamba_head_dim (core.transformer.transformer_config.TransformerConfig attribute) mamba_inference_stack_spec (in module core.models.mamba.mamba_layer_specs) mamba_layer (core.ssm.mamba_block.MambaStackSubmodules attribute) mamba_num_groups (core.transformer.transformer_config.TransformerConfig attribute) mamba_num_heads (core.transformer.transformer_config.TransformerConfig attribute) mamba_stack_spec (in module core.models.mamba.mamba_layer_specs) mamba_state_dim (core.transformer.transformer_config.TransformerConfig attribute) mamba_state_shapes_per_request() (core.ssm.mamba_block.MambaStack method) (core.ssm.mamba_layer.MambaLayer method) (core.ssm.mamba_mixer.MambaMixer method) mamba_states_cache() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) MambaContextParallel (class in core.ssm.mamba_context_parallel) MambaLayer (class in core.ssm.mamba_layer) MambaLayerSubmodules (class in core.ssm.mamba_layer) MambaMixer (class in core.ssm.mamba_mixer) MambaMixerSubmodules (class in core.ssm.mamba_mixer) MambaModel (class in core.models.mamba.mamba_model) MambaStack (class in core.ssm.mamba_block) MambaStackSubmodules (class in core.ssm.mamba_block) MambaTokenizer (class in core.tokenizers.text.models.mamba_tokenizer) manual_buffer_registration() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) map_reduce() (in module core.dist_checkpointing.dict_utils) mark_not_offloadable() (core.pipeline_parallel.fine_grained_activation_offload.FineGrainedActivationOffloadingInterface static method) (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) mask (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) mask_id (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) masked_softmax_fusion (core.transformer.transformer_config.TransformerConfig attribute) MaskedWordPieceDataset (class in core.datasets.masked_dataset) MaskedWordPieceDatasetConfig (class in core.datasets.masked_dataset) masking_do_full_word (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) masking_do_permutation (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) masking_max_ngram (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) masking_probability (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) masking_use_geometric_distribution (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) masking_use_longer_ngrams (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) match() (core.quantization.quant_config.GlobMatcher method) (core.quantization.quant_config.Matcher method) (core.quantization.quant_config.RecipeConfig method) match_graph_config() (core.inference.batch_dimensions_utils.CUDAGraphBatchDimensionBuilder static method) match_to_config_key() (core.quantization.quant_config.RecipeConfig method) MatchContext (class in core.quantization.quant_config) Matcher (class in core.quantization.quant_config) matches() (core.optimizer.optimizer_config.ParamKey method) matmul_kernel_persistent() (in module core.transformer.custom_layers.batch_invariant_kernels) matmul_persistent() (in module core.transformer.custom_layers.batch_invariant_kernels) max_allowed_chunks() (core.dist_checkpointing.mapping.ShardedTensor method) max_btime (core.utils._StragglerData attribute) max_clock (core.utils._StragglerData attribute) max_elapsed (core.utils._StragglerData attribute) max_lr (core.optimizer_param_scheduler.ParamGroupOverride attribute) max_power (core.utils._StragglerData attribute) MAX_REQUESTS (in module core.resharding.nvshmem_copy_service.planning.task_segmenter) MAX_SEGMENT_SIZE (in module core.resharding.nvshmem_copy_service.nvshmem_types) MAX_SEGMENTS_PER_REQUEST (in module core.resharding.nvshmem_copy_service.planning.task_segmenter) max_seqlen_kv (core.packed_seq_params.PackedSeqParams attribute) max_seqlen_per_dp_cp_rank (core.model_parallel_config.ModelParallelConfig attribute) max_seqlen_q (core.packed_seq_params.PackedSeqParams attribute) max_sequence_length (core.datasets.gpt_dataset.MockGPTLowLevelDataset attribute) MAX_TASKS_PER_BATCH (in module core.resharding.nvshmem_copy_service.nvshmem_types) max_temp (core.utils._StragglerData attribute) max_util (core.utils._StragglerData attribute) MaxSequenceLengthOverflowError maybe_finalize_async_calls() (core.dist_checkpointing.strategies.async_utils.AsyncCallsQueue method) maybe_get_tensor() (core.utils.GlobalSymmetricMemoryBuffer method) maybe_initialize_symmetric_memory() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) maybe_inject() (core.rerun_state_machine.RerunErrorInjector method) maybe_load_config() (in module core.dist_checkpointing.core) maybe_miscompare() (core.rerun_state_machine.RerunErrorInjector method) maybe_move_tensor_to_cpu() (in module core.transformer.moe.moe_utils) maybe_report_missing_and_unexpected_keys() (in module core.dist_checkpointing.validation) maybe_skip_or_early_return_by_cudagraph() (in module core.transformer.moe.moe_utils) mcore_gpt_load_te_state_dict_pre_hook() (in module core.post_training.modelopt.gpt.state_dict_hooks) mcore_to_pyt_state_dict() (in module core.dist_checkpointing.strategies.torch) MCoreLoadPlanner (class in core.dist_checkpointing.strategies.torch) MCoreMetadata (class in core.dist_checkpointing.strategies.torch) MCoreSavePlan (class in core.dist_checkpointing.strategies.torch) MCoreSavePlanner (class in core.dist_checkpointing.strategies.torch) MCoreTensorAwareStateDict (class in core.dist_checkpointing.tensor_aware_state_dict) mean_batch_invariant() (in module core.transformer.custom_layers.batch_invariant_kernels) mean_dim() (in module core.transformer.custom_layers.batch_invariant_kernels) mean_kernel() (in module core.transformer.custom_layers.batch_invariant_kernels) MEGATRON_CACHE (in module core.tokenizers.text.libraries.megatron_hf_tokenizer) MEGATRON_CONFIG_MAP (in module core.tokenizers.text.libraries.megatron_hf_tokenizer) MegatronDataset (class in core.datasets.megatron_dataset) MegatronFSDP (class in core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp) MegatronGenerate (class in core.inference.text_generation_server.text_generation_server) MegatronGradScaler (class in core.optimizer.grad_scaler) MegatronHFTokenizer (class in core.tokenizers.text.libraries.megatron_hf_tokenizer) MegatronLegacyTokenizer (class in core.datasets.megatron_tokenizer) MegatronModule (class in core.transformer.module) MegatronOptimizer (class in core.optimizer.optimizer) MegatronServer (class in core.inference.text_generation_server.text_generation_server) MegatronTokenizer (class in core.tokenizers.megatron_tokenizer) MegatronTokenizerBase (class in core.tokenizers.base_tokenizer) MegatronTokenizerChatTemplate (class in core.tokenizers.text.libraries.chat_template) MegatronTokenizerText (class in core.tokenizers.text.text_tokenizer) MegatronTokenizerTextAbstract (class in core.tokenizers.text.libraries.abstract_tokenizer) MEMBER (core.pipeline_parallel.bridge_communicator.CommRole attribute) memory_efficient_layer_norm (core.transformer.transformer_config.TransformerConfig attribute) MemPoolAllocatorWithoutRegistration (class in core.nccl_allocator) merge() (core.inference.inference_request.DynamicInferenceRequestRecord method) (in module core.dist_checkpointing.dict_utils) merge_fn (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) merges_file (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) message (core.rerun_state_machine.Caller attribute) metainfo (core.transformer.spec_utils.ModuleSpec attribute) microbatch_group_size_per_vp_stage (core.model_parallel_config.ModelParallelConfig attribute) mid_level_dataset_surplus (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) MidLevelDataset (in module core.datasets.blended_megatron_dataset_builder) MimoModel (class in core.models.mimo.model.base) MimoModelConfig (class in core.models.mimo.config.base_configs) min_btime (core.utils._StragglerData attribute) min_clock (core.utils._StragglerData attribute) min_elapsed (core.utils._StragglerData attribute) min_loss_scale (core.optimizer.optimizer_config.OptimizerConfig attribute) min_lr (core.optimizer.optimizer_config.OptimizerConfig attribute) (core.optimizer_param_scheduler.ParamGroupOverride attribute) min_offloaded_tensor_size (core.transformer.transformer_config.TransformerConfig attribute) min_power (core.utils._StragglerData attribute) min_temp (core.utils._StragglerData attribute) min_util (core.utils._StragglerData attribute) MINOR (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) mismatches (core.resharding.nvshmem_copy_service.validation.ValidationResult attribute) MixedPrecisionOptimizer (class in core.optimizer.optimizer) mixer (core.ssm.mamba_layer.MambaLayerSubmodules attribute) MLASelfAttention (class in core.transformer.multi_latent_attention) MLASelfAttentionSubmodules (class in core.transformer.multi_latent_attention) MLATransformerConfig (class in core.transformer.transformer_config) MLP (class in core.transformer.mlp) mlp (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan attribute) MLP (core.ssm.mamba_hybrid_layer_allocation.Symbols attribute) mlp (core.transformer.enums.CudaGraphScope attribute) (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) mlp_bda (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) mlp_chunks_for_prefill (core.transformer.transformer_config.TransformerConfig attribute) mlp_fc_bias (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) mlp_fc_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) mlp_fc_weight_mixture_of_experts (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) mlp_layer (core.ssm.mamba_block.MambaStackSubmodules attribute) mlp_projection_bias (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) mlp_projection_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) mlp_projection_weight_mixture_of_experts (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) mlp_router_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) MLPLayer (class in core.ssm.mlp_layer) MLPSubmodules (class in core.transformer.mlp) mm_batch_invariant() (in module core.transformer.custom_layers.batch_invariant_kernels) mmap_bin_files (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) mock (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) MockGPTDataset (class in core.datasets.gpt_dataset) MockGPTLowLevelDataset (class in core.datasets.gpt_dataset) MockMultimodalDataset (class in core.datasets.multimodal_dataset) modality_submodules_spec (core.models.mimo.config.base_configs.MimoModelConfig attribute) model_parallel_cuda_manual_seed() (in module core.tensor_parallel.random) model_parallel_is_initialized() (in module core.parallel_state) model_weight_buffer (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) ModelChunkState (class in core.models.common.model_chunk_schedule_plan) ModelParallelConfig (class in core.model_parallel_config) ModelType (class in core.enums) (class in core.transformer.enums) (in module core.export.model_type) modify_underlying_storage() (in module core.fp8_utils) module core core._rank_utils core.activations core.config core.config_logger core.datasets core.datasets.bert_dataset core.datasets.blended_dataset core.datasets.blended_megatron_dataset_builder core.datasets.blended_megatron_dataset_config core.datasets.data_schedule core.datasets.gpt_dataset core.datasets.helpers core.datasets.indexed_dataset core.datasets.masked_dataset core.datasets.megatron_dataset core.datasets.megatron_tokenizer core.datasets.multimodal_dataset core.datasets.object_storage_utils core.datasets.t5_dataset core.datasets.utils core.datasets.utils_s3 core.dist_checkpointing core.dist_checkpointing.core core.dist_checkpointing.dict_utils core.dist_checkpointing.exchange_utils core.dist_checkpointing.mapping core.dist_checkpointing.optimizer core.dist_checkpointing.serialization core.dist_checkpointing.state_dict_utils core.dist_checkpointing.strategies core.dist_checkpointing.strategies.async_utils core.dist_checkpointing.strategies.base core.dist_checkpointing.strategies.cached_metadata_filesystem_reader core.dist_checkpointing.strategies.checkpointable core.dist_checkpointing.strategies.common core.dist_checkpointing.strategies.filesystem_async core.dist_checkpointing.strategies.fully_parallel core.dist_checkpointing.strategies.state_dict_saver core.dist_checkpointing.strategies.torch core.dist_checkpointing.tensor_aware_state_dict core.dist_checkpointing.utils core.dist_checkpointing.validation core.distributed core.distributed.data_parallel_base core.distributed.distributed_data_parallel core.distributed.distributed_data_parallel_config core.distributed.finalize_model_grads core.distributed.fsdp core.distributed.fsdp.mcore_fsdp_adapter core.distributed.fsdp.src core.distributed.fsdp.src.megatron_fsdp core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config core.distributed.fsdp.src.megatron_fsdp.fully_shard core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp core.distributed.fsdp.src.megatron_fsdp.mixed_precision core.distributed.fsdp.src.megatron_fsdp.package_info core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor core.distributed.fsdp.src.megatron_fsdp.utils core.distributed.param_and_grad_buffer core.distributed.reduce_scatter_with_fp32_accumulation core.distributed.torch_fully_sharded_data_parallel core.distributed.torch_fully_sharded_data_parallel_config core.energy_monitor core.enums core.export core.export.data_type core.export.export_config core.export.model_type core.export.trtllm core.export.trtllm.engine_builder core.export.trtllm.engine_builder.trtllm_engine_builder core.export.trtllm.model_to_trllm_mapping core.export.trtllm.model_to_trllm_mapping.default_conversion_dict core.export.trtllm.trt_model_config core.export.trtllm.trt_model_type core.export.trtllm.trtllm_helper core.export.trtllm.trtllm_layers core.export.trtllm.trtllm_weights_converter core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter core.export.trtllm.trtllm_weights_converter.utils core.extensions core.extensions.kitchen core.extensions.transformer_engine core.extensions.transformer_engine_spec_provider core.fp4_utils core.fp8_utils core.full_cuda_graph core.fusions core.fusions.fused_bias_dropout core.fusions.fused_bias_geglu core.fusions.fused_bias_gelu core.fusions.fused_bias_swiglu core.fusions.fused_cross_entropy core.fusions.fused_indices_converter core.fusions.fused_layer_norm core.fusions.fused_mla_yarn_rope_apply core.fusions.fused_pad_routing_map core.fusions.fused_softmax core.fusions.fused_weighted_squared_relu core.hyper_comm_grid core.inference core.inference.async_stream core.inference.batch_dimensions_utils core.inference.common_inference_params core.inference.communication_utils core.inference.contexts core.inference.contexts.base_context core.inference.contexts.dynamic_block_allocator core.inference.contexts.dynamic_context core.inference.contexts.fused_kv_append_kernel core.inference.contexts.static_context core.inference.data_parallel_inference_coordinator core.inference.engines core.inference.engines.abstract_engine core.inference.engines.async_zmq_communicator core.inference.engines.dynamic_engine core.inference.engines.mcore_engine core.inference.engines.static_engine core.inference.headers core.inference.inference_client core.inference.inference_request core.inference.model_inference_wrappers core.inference.model_inference_wrappers.abstract_model_inference_wrapper core.inference.model_inference_wrappers.gpt core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper core.inference.model_inference_wrappers.inference_wrapper_config core.inference.model_inference_wrappers.t5 core.inference.model_inference_wrappers.t5.t5_inference_wrapper core.inference.sampling_params core.inference.scheduler core.inference.text_generation_controllers core.inference.text_generation_controllers.encoder_decoder_text_generation_controller core.inference.text_generation_controllers.simple_text_generation_controller core.inference.text_generation_controllers.text_generation_controller core.inference.text_generation_controllers.vlm_text_generation_controller core.inference.text_generation_server core.inference.text_generation_server.dynamic_text_gen_server core.inference.text_generation_server.dynamic_text_gen_server.endpoints core.inference.text_generation_server.dynamic_text_gen_server.endpoints.chat_completions core.inference.text_generation_server.dynamic_text_gen_server.endpoints.common core.inference.text_generation_server.dynamic_text_gen_server.endpoints.completions core.inference.text_generation_server.dynamic_text_gen_server.endpoints.health core.inference.text_generation_server.dynamic_text_gen_server.flask_server core.inference.text_generation_server.dynamic_text_gen_server.tokenization core.inference.text_generation_server.run_mcore_engine core.inference.text_generation_server.text_generation_server core.inference.text_generation_server.tokenization core.inference.unified_memory core.inference.utils core.inference_params core.jit core.model_parallel_config core.models core.models.backends core.models.bert core.models.bert.bert_layer_specs core.models.bert.bert_lm_head core.models.bert.bert_model core.models.bert.pooler core.models.common core.models.common.embeddings core.models.common.embeddings.language_model_embedding core.models.common.embeddings.relative_pos_embedding core.models.common.embeddings.rope_utils core.models.common.embeddings.rotary_pos_embedding core.models.common.embeddings.yarn_rotary_pos_embedding core.models.common.language_module core.models.common.language_module.language_module core.models.common.model_chunk_schedule_plan core.models.common.vision_module core.models.common.vision_module.vision_module core.models.gpt core.models.gpt.experimental_attention_variant_module_specs core.models.gpt.fine_grained_callables core.models.gpt.gpt_layer_specs core.models.gpt.gpt_model core.models.gpt.moe_module_specs core.models.huggingface core.models.huggingface.clip_model core.models.huggingface.module core.models.huggingface.qwen_model core.models.mamba core.models.mamba.mamba_layer_specs core.models.mamba.mamba_model core.models.mimo core.models.mimo.config core.models.mimo.config.base_configs core.models.mimo.model core.models.mimo.model.base core.models.multimodal core.models.multimodal.context_parallel core.models.multimodal.llava_model core.models.multimodal.llava_spec core.models.T5 core.models.T5.t5_model core.models.T5.t5_spec core.models.vision core.models.vision.clip_vit_model core.models.vision.multimodal_projector core.models.vision.radio core.models.vision.vit_layer_specs core.msc_utils core.nccl_allocator core.num_microbatches_calculator core.optimizer core.optimizer.clip_grads core.optimizer.cpu_offloading core.optimizer.cpu_offloading.hybrid_optimizer core.optimizer.distrib_optimizer core.optimizer.grad_scaler core.optimizer.layer_wise_optimizer core.optimizer.muon core.optimizer.optimizer core.optimizer.optimizer_config core.optimizer.qk_clip core.optimizer_param_scheduler core.package_info core.packed_seq_params core.parallel_state core.pipeline_parallel core.pipeline_parallel.bridge_communicator core.pipeline_parallel.combined_1f1b core.pipeline_parallel.fine_grained_activation_offload core.pipeline_parallel.hybrid_cp_schedule core.pipeline_parallel.multimodule_communicator core.pipeline_parallel.p2p_communication core.pipeline_parallel.schedules core.pipeline_parallel.utils core.post_training core.post_training.modelopt core.post_training.modelopt.gpt core.post_training.modelopt.gpt.model_specs core.post_training.modelopt.gpt.state_dict_hooks core.post_training.modelopt.layers core.post_training.modelopt.mamba core.post_training.modelopt.mamba.model_specs core.process_groups_config core.quantization core.quantization.quant_config core.quantization.utils core.rerun_state_machine core.resharding core.resharding.copy_services core.resharding.copy_services.base core.resharding.copy_services.gloo_copy_service core.resharding.copy_services.nccl_copy_service core.resharding.copy_services.nvshmem_copy_service core.resharding.execution core.resharding.nvshmem_copy_service core.resharding.nvshmem_copy_service.core core.resharding.nvshmem_copy_service.core.gpu_resource_manager core.resharding.nvshmem_copy_service.core.kernel_launcher core.resharding.nvshmem_copy_service.core.pipeline_executor core.resharding.nvshmem_copy_service.logger core.resharding.nvshmem_copy_service.memory core.resharding.nvshmem_copy_service.memory.double_buffer_manager core.resharding.nvshmem_copy_service.memory.tensor_pointer_utils core.resharding.nvshmem_copy_service.nvshmem_types core.resharding.nvshmem_copy_service.planning core.resharding.nvshmem_copy_service.planning.communication_scheduler core.resharding.nvshmem_copy_service.planning.gpu_execution_planner core.resharding.nvshmem_copy_service.planning.task_segmenter core.resharding.nvshmem_copy_service.planning.workload_packer core.resharding.nvshmem_copy_service.service core.resharding.nvshmem_copy_service.validation core.resharding.planner core.resharding.refit core.resharding.utils core.safe_globals core.ssm core.ssm.gated_delta_net core.ssm.mamba_block core.ssm.mamba_context_parallel core.ssm.mamba_hybrid_layer_allocation core.ssm.mamba_layer core.ssm.mamba_mixer core.ssm.mlp_layer core.ssm.triton_cache_manager core.tensor_parallel core.tensor_parallel.cross_entropy core.tensor_parallel.data core.tensor_parallel.inference_layers core.tensor_parallel.layers core.tensor_parallel.mappings core.tensor_parallel.random core.tensor_parallel.utils core.timers core.tokenizers core.tokenizers.base_tokenizer core.tokenizers.megatron_tokenizer core.tokenizers.text core.tokenizers.text.libraries core.tokenizers.text.libraries.abstract_tokenizer core.tokenizers.text.libraries.bytelevel_tokenizer core.tokenizers.text.libraries.chat_template core.tokenizers.text.libraries.huggingface_tokenizer core.tokenizers.text.libraries.megatron_hf_tokenizer core.tokenizers.text.libraries.null_tokenizer core.tokenizers.text.libraries.sentencepiece_tokenizer core.tokenizers.text.libraries.tiktoken_tokenizer core.tokenizers.text.models core.tokenizers.text.models.bert_tokenizer core.tokenizers.text.models.default_tokenizer core.tokenizers.text.models.gpt_tokenizer core.tokenizers.text.models.mamba_tokenizer core.tokenizers.text.models.t5_tokenizer core.tokenizers.text.text_tokenizer core.transformer core.transformer.attention core.transformer.cuda_graphs core.transformer.custom_layers core.transformer.custom_layers.batch_invariant_kernels core.transformer.custom_layers.transformer_engine core.transformer.dot_product_attention core.transformer.enums core.transformer.fsdp_dtensor_checkpoint core.transformer.identity_op core.transformer.mlp core.transformer.module core.transformer.moe core.transformer.moe.experts core.transformer.moe.fused_a2a core.transformer.moe.grouped_gemm_util core.transformer.moe.moe_layer core.transformer.moe.moe_utils core.transformer.moe.router core.transformer.moe.router_replay core.transformer.moe.shared_experts core.transformer.moe.token_dispatcher core.transformer.moe.upcycling_utils core.transformer.multi_latent_attention core.transformer.multi_token_prediction core.transformer.pipeline_parallel_layer_layout core.transformer.spec_utils core.transformer.torch_layer_norm core.transformer.torch_norm core.transformer.transformer_block core.transformer.transformer_config core.transformer.transformer_layer core.transformer.utils core.typed_torch core.utils module (core.transformer.spec_utils.ModuleSpec attribute) module_path (core.quantization.quant_config.MatchContext attribute) ModuleSpec (class in core.transformer.spec_utils) MOE (core.ssm.mamba_hybrid_layer_allocation.Symbols attribute) moe (core.transformer.enums.CudaGraphScope attribute) (in module core.models.mamba.mamba_layer_specs) moe_apply_probs_on_input (core.transformer.transformer_config.TransformerConfig attribute) moe_aux_loss_coeff (core.transformer.transformer_config.TransformerConfig attribute) moe_combine (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan attribute) moe_deepep_num_sms (core.transformer.transformer_config.TransformerConfig attribute) moe_dispatch (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan attribute) moe_enable_deepep (core.transformer.transformer_config.TransformerConfig attribute) moe_enable_routing_replay (core.transformer.transformer_config.TransformerConfig attribute) moe_expert_capacity_factor (core.transformer.transformer_config.TransformerConfig attribute) moe_extended_tp (core.model_parallel_config.ModelParallelConfig attribute) moe_ffn_hidden_size (core.transformer.transformer_config.TransformerConfig attribute) moe_flex_dispatcher_backend (core.transformer.transformer_config.TransformerConfig attribute) moe_grouped_gemm (core.transformer.transformer_config.TransformerConfig attribute) moe_hybridep_num_sms (core.transformer.transformer_config.TransformerConfig attribute) moe_input_jitter_eps (core.transformer.transformer_config.TransformerConfig attribute) moe_latent_size (core.transformer.transformer_config.TransformerConfig attribute) moe_layer (core.ssm.mamba_block.MambaStackSubmodules attribute) moe_layer_cache (in module core.inference.utils) moe_layer_freq (core.transformer.transformer_config.TransformerConfig attribute) moe_layer_recompute (core.transformer.transformer_config.TransformerConfig attribute) moe_pad_expert_input_to_capacity (core.transformer.transformer_config.TransformerConfig attribute) moe_pad_experts_for_cuda_graph_inference (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) moe_per_layer_logging (core.transformer.transformer_config.TransformerConfig attribute) moe_permute_fusion (core.transformer.transformer_config.TransformerConfig attribute) moe_preprocess (core.transformer.enums.CudaGraphScope attribute) moe_router (core.transformer.enums.CudaGraphScope attribute) moe_router_bias_update_rate (core.transformer.transformer_config.TransformerConfig attribute) moe_router_dtype (core.transformer.transformer_config.TransformerConfig attribute) moe_router_enable_expert_bias (core.transformer.transformer_config.TransformerConfig attribute) moe_router_force_load_balancing (core.transformer.transformer_config.TransformerConfig attribute) moe_router_fusion (core.transformer.transformer_config.TransformerConfig attribute) moe_router_group_topk (core.transformer.transformer_config.TransformerConfig attribute) moe_router_load_balancing_type (core.transformer.transformer_config.TransformerConfig attribute) moe_router_num_groups (core.transformer.transformer_config.TransformerConfig attribute) moe_router_padding_for_fp8 (core.transformer.transformer_config.TransformerConfig attribute) moe_router_padding_for_quantization (core.transformer.transformer_config.TransformerConfig attribute) moe_router_pre_softmax (core.transformer.transformer_config.TransformerConfig attribute) moe_router_score_function (core.transformer.transformer_config.TransformerConfig attribute) moe_router_topk (core.transformer.transformer_config.TransformerConfig attribute) moe_router_topk_limited_devices (core.transformer.transformer_config.TransformerConfig attribute) moe_router_topk_scaling_factor (core.transformer.transformer_config.TransformerConfig attribute) moe_shared_expert_gate (core.transformer.transformer_config.TransformerConfig attribute) moe_shared_expert_intermediate_size (core.transformer.transformer_config.TransformerConfig attribute) moe_shared_expert_overlap (core.transformer.transformer_config.TransformerConfig attribute) moe_token_dispatcher_type (core.transformer.transformer_config.TransformerConfig attribute) moe_token_drop_policy (core.transformer.transformer_config.TransformerConfig attribute) moe_token_dropping (core.transformer.transformer_config.TransformerConfig attribute) moe_use_legacy_grouped_gemm (core.transformer.transformer_config.TransformerConfig attribute) moe_z_loss_coeff (core.transformer.transformer_config.TransformerConfig attribute) MoEAllGatherTokenDispatcher (class in core.transformer.moe.token_dispatcher) MoEAlltoAllTokenDispatcher (class in core.transformer.moe.token_dispatcher) MoEAuxLossAutoScaler (class in core.transformer.moe.moe_utils) MoECudaGraphPartialCaptureSignal MoECudaGraphTensorStore (class in core.transformer.moe.moe_utils) MoEFlexTokenDispatcher (class in core.transformer.moe.token_dispatcher) MoELayer (class in core.transformer.moe.moe_layer) MoESubmodules (class in core.transformer.moe.moe_layer) MoETokenDispatcher (class in core.transformer.moe.token_dispatcher) MoETransformerLayer (class in core.transformer.transformer_layer) mp (core.process_groups_config.ProcessGroupCollection attribute) mpu (in module core) mrope_section (core.transformer.transformer_config.TransformerConfig attribute) MSC_PREFIX (in module core.datasets.object_storage_utils) (in module core.dist_checkpointing.strategies.torch) mscale (core.transformer.transformer_config.MLATransformerConfig attribute) mscale_all_dim (core.transformer.transformer_config.MLATransformerConfig attribute) mtp (core.transformer.enums.LayerType attribute) mtp_loss_scaling_factor (core.transformer.transformer_config.TransformerConfig attribute) mtp_num_layers (core.transformer.transformer_config.TransformerConfig attribute) mtp_on_this_rank() (in module core.transformer.multi_token_prediction) mtp_post_process (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan attribute) mtp_standalone (core.model_parallel_config.ModelParallelConfig attribute) MTPLossAutoScaler (class in core.transformer.multi_token_prediction) MTPLossLoggingHelper (class in core.transformer.multi_token_prediction) multi_latent_attention (core.transformer.transformer_config.MLATransformerConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) MultiGroupMemPoolAllocator (class in core.nccl_allocator) MultiGroupUBRAllocator (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) MultiLatentAttention (class in core.transformer.multi_latent_attention) MultimodalDatasetConfig (class in core.datasets.multimodal_dataset) MultimodalProjector (class in core.models.vision.multimodal_projector) MultimodalRotaryEmbedding (class in core.models.common.embeddings.rotary_pos_embedding) MultiModulePipelineCommunicator (class in core.pipeline_parallel.multimodule_communicator) multiple_validation_sets (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) MultiStorageClientFeature (in module core.msc_utils) MultiTokenPredictionBlock (class in core.transformer.multi_token_prediction) MultiTokenPredictionBlockSubmodules (class in core.transformer.multi_token_prediction) MultiTokenPredictionLayer (class in core.transformer.multi_token_prediction) MultiTokenPredictionLayerSubmodules (class in core.transformer.multi_token_prediction) muon_extra_scale_factor (core.optimizer.optimizer_config.OptimizerConfig attribute) muon_fp32_matmul_prec (core.optimizer.optimizer_config.OptimizerConfig attribute) muon_momentum (core.optimizer.optimizer_config.OptimizerConfig attribute) muon_num_ns_steps (core.optimizer.optimizer_config.OptimizerConfig attribute) muon_scale_mode (core.optimizer.optimizer_config.OptimizerConfig attribute) muon_split_qkv (core.optimizer.optimizer_config.OptimizerConfig attribute) muon_tp_mode (core.optimizer.optimizer_config.OptimizerConfig attribute) muon_use_nesterov (core.optimizer.optimizer_config.OptimizerConfig attribute) mxfp8 (core.enums.Fp8Recipe attribute) my_pe (core.resharding.nvshmem_copy_service.service.RemoteCopyService property) my_rank (core.utils.StragglerDetector property) my_slice (core.resharding.utils.TransferOp attribute) N n_pes (core.resharding.nvshmem_copy_service.service.RemoteCopyService property) name (core.optimizer.optimizer_config.ParamKey attribute) (core.optimizer.optimizer_config.ParamPredicate attribute) (core.optimizer.optimizer_config.ParamWithNamePredicate attribute) (core.resharding.utils.ParameterMetadata attribute) (core.resharding.utils.ShardingDescriptor attribute) narrow() (core.dist_checkpointing.mapping.ShardedTensor method) nccl_all_reduce_for_prefill (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) NCCL_ALLOCATOR (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) nccl_mem (class in core.nccl_allocator) NCCL_MEMORY_POOL (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) nccl_ub (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) NCCLCopyService (class in core.resharding.copy_services.nccl_copy_service) NEMOTRON_NAS_CONVERSION_DICT (in module core.export.trtllm.model_to_trllm_mapping.default_conversion_dict) nested_items_iter() (in module core.dist_checkpointing.dict_utils) nested_values() (in module core.dist_checkpointing.dict_utils) next_hdp_group() (core.pipeline_parallel.hybrid_cp_schedule.BalancedCPScheduler method) next_iter() (core.full_cuda_graph.FullCudaGraphWrapper method) no_mask (core.transformer.enums.AttnMaskType attribute) no_rope_freq (core.transformer.transformer_config.TransformerConfig attribute) NO_SHARD (core.distributed.fsdp.src.megatron_fsdp.fully_shard.ShardingStrategy attribute) no_sync() (core.distributed.data_parallel_base._BaseDataParallel method) (core.distributed.distributed_data_parallel.DistributedDataParallel method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) no_sync_func (core.model_parallel_config.ModelParallelConfig attribute) NON_TRANSFORMER_LAYERS_NAMES (in module core.export.trtllm.trtllm_layers) noop_decorator() (in module core.jit) NoopScheduleNode (class in core.pipeline_parallel.utils) Norm (class in core.post_training.modelopt.layers) norm (core.ssm.mamba_layer.MambaLayerSubmodules attribute) normalization (core.transformer.transformer_config.MLATransformerConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) normalize() (core.optimizer.distrib_optimizer.Range method) (in module core.datasets.utils) not_none() (in module core.typed_torch) NOT_RUNNING_YET (core.rerun_state_machine.RerunState attribute) null_decorator() (in module core.utils) null_method() (core.utils.StragglerDetector method) NullTokenizer (class in core.tokenizers.text.libraries.null_tokenizer) num_attention_heads (core.transformer.transformer_config.TransformerConfig attribute) num_buckets (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline property) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.GradReducePipeline property) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer property) num_dataset_builder_threads (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) num_decode_requests (core.inference.contexts.dynamic_context.DynamicInferenceContext property) num_distributed_optimizer_instances (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) num_experts (core.resharding.utils.ParameterMetadata attribute) num_img_embeddings_per_tile (core.inference.inference_request.VLMInferenceRequest attribute) num_layers (core.transformer.transformer_config.TransformerConfig attribute) num_layers() (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan method) num_layers_at_end_in_bf16 (core.transformer.transformer_config.TransformerConfig attribute) num_layers_at_start_in_bf16 (core.transformer.transformer_config.TransformerConfig attribute) num_layers_in_first_pipeline_stage (core.transformer.transformer_config.TransformerConfig attribute) num_layers_in_last_pipeline_stage (core.transformer.transformer_config.TransformerConfig attribute) num_microbatches_with_partial_activation_checkpoints (core.model_parallel_config.ModelParallelConfig attribute) num_moe_experts (core.transformer.transformer_config.TransformerConfig attribute) num_query_groups (core.transformer.transformer_config.TransformerConfig attribute) num_requests_pending() (core.inference.scheduler.Scheduler method) num_tasks (core.resharding.nvshmem_copy_service.nvshmem_types.TransferMetadata attribute) num_tiles (core.inference.inference_request.VLMInferenceRequest attribute) num_tokens_to_generate (core.inference.sampling_params.SamplingParams attribute) num_tokens_total (core.inference.sampling_params.SamplingParams attribute) num_warmup_microbatches (core.pipeline_parallel.multimodule_communicator.MultiModulePipelineCommunicator property) numel_low_level_dataset() (core.datasets.gpt_dataset.GPTDataset static method) (core.datasets.gpt_dataset.MockGPTDataset static method) (core.datasets.masked_dataset.MaskedWordPieceDataset static method) (core.datasets.megatron_dataset.MegatronDataset static method) NumMicroBatchesCalculator (class in core.num_microbatches_calculator) nvfp4 (core.enums.Fp4Recipe attribute) NVSHMEMCopyService (class in core.resharding.copy_services.nvshmem_copy_service) nvtx_decorator() (in module core.utils) nvtx_range_pop() (in module core.utils) nvtx_range_push() (in module core.utils) O OBJECT_STORAGE_BIN_READERS (in module core.datasets.indexed_dataset) object_storage_cache_path (core.datasets.gpt_dataset.GPTDatasetConfig attribute) ObjectStorageConfig (class in core.datasets.object_storage_utils) offload() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) offload_grad_buffers() (core.distributed.distributed_data_parallel.DistributedDataParallel method) OFFLOAD_MGR (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager attribute) offload_modules (core.transformer.transformer_config.TransformerConfig attribute) offload_summary_bytes (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager property) offload_summary_total_bytes (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager property) offload_to_cpu() (core.distributed.param_and_grad_buffer._ParamAndGradBuffer method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.MegatronOptimizer method) OffloadTensorGroup (class in core.pipeline_parallel.fine_grained_activation_offload) offsets() (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer method) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer method) on_get_saved_tensor() (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) on_group_commit_backward() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) on_group_commit_forward() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) on_group_start_backward() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) on_group_start_forward() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) on_save_for_backward() (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) open_file() (in module core.msc_utils) openai_gelu() (in module core.transformer.utils) ops (in module core.transformer.moe.grouped_gemm_util) OPTIM (core.distributed.fsdp.src.megatron_fsdp.fully_shard.ShardingStrategy attribute) OPTIM_GRADS (core.distributed.fsdp.src.megatron_fsdp.fully_shard.ShardingStrategy attribute) OPTIM_GRADS_PARAMS (core.distributed.fsdp.src.megatron_fsdp.fully_shard.ShardingStrategy attribute) optim_state_to_sharding_state() (in module core.dist_checkpointing.optimizer) optimal_dtype() (core.datasets.indexed_dataset.DType static method) optimizer (core.optimizer.optimizer.ChainedOptimizer property) (core.optimizer.optimizer_config.AdamOptimizerConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) (core.optimizer.optimizer_config.SGDOptimizerConfig attribute) optimizer_cpu_offload (core.optimizer.optimizer_config.OptimizerConfig attribute) optimizer_offload_fraction (core.optimizer.optimizer_config.OptimizerConfig attribute) OptimizerConfig (class in core.optimizer.optimizer_config) OptimizerParamScheduler (class in core.optimizer_param_scheduler) original_max_position_embeddings (core.transformer.transformer_config.MLATransformerConfig attribute) orthogonalize() (core.optimizer.muon.TensorParallelMuon method) out_norm (core.ssm.gated_delta_net.GatedDeltaNetSubmodules attribute) out_proj (core.ssm.gated_delta_net.GatedDeltaNetSubmodules attribute) (core.ssm.mamba_mixer.MambaMixerSubmodules attribute) outer_dp_sharding_strategy (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) output_layer_init_method (core.transformer.transformer_config.TransformerConfig attribute) overlap_cpu_optimizer_d2h_h2d (core.optimizer.optimizer_config.OptimizerConfig attribute) overlap_grad_reduce (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) overlap_moe_expert_parallel_comm (core.model_parallel_config.ModelParallelConfig attribute) overlap_p2p_comm (core.model_parallel_config.ModelParallelConfig attribute) overlap_p2p_comm_warmup_flush (core.model_parallel_config.ModelParallelConfig attribute) overlap_param_gather (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) overlap_param_gather_with_optimizer_step (core.optimizer.optimizer_config.OptimizerConfig attribute) override_nonquantized_autocast (core.extensions.transformer_engine.TEQuantizationRecipe attribute) override_quantized_autocast (core.extensions.transformer_engine.TEQuantizationRecipe attribute) override_sharded_param_methods_with_safety_checks() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) overwrite_nccl_comm_cfgs() (in module core.parallel_state) owner_rank (core.resharding.utils.ParameterMetadata attribute) owns() (core.transformer.cuda_graphs.TensorReusePool method) P P (in module core.typed_torch) p2p_communicator (core.pipeline_parallel.multimodule_communicator.RankModuleInfo attribute) P2PCommunicator (class in core.pipeline_parallel.p2p_communication) pack_workloads() (core.resharding.nvshmem_copy_service.planning.workload_packer.WorkloadPacker method) PackedSeqParams (class in core.packed_seq_params) pad (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) pad_buckets_for_high_nccl_busbw (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) pad_encoder_prompts_tokens() (core.inference.model_inference_wrappers.t5.t5_inference_wrapper.T5InferenceWrapper method) pad_id (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract property) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) pad_input_prompt_tokens() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) pad_routing_map() (in module core.transformer.moe.moe_utils) pad_vocab_size() (in module core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter) padded_vocab_size (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) padding (core.transformer.enums.AttnMaskType attribute) padding_causal (core.transformer.enums.AttnMaskType attribute) ParallelFileCacheManager (class in core.ssm.triton_cache_manager) PARAM (core.distributed.param_and_grad_buffer.BufferType attribute) param_group_identifier_keys (in module core.optimizer.optimizer) param_group_override_to_tuple() (in module core.optimizer_param_scheduler) param_groups (core.optimizer.optimizer.ChainedOptimizer property) (core.optimizer.optimizer.MegatronOptimizer attribute) param_is_not_shared() (in module core.transformer.module) param_is_not_tensor_parallel_duplicate() (in module core.tensor_parallel.layers) param_name (core.resharding.utils.TransferOp attribute) param_sync_func (core.model_parallel_config.ModelParallelConfig attribute) ParamAndGradBuffer (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) ParameterGroup (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) ParameterMetadata (class in core.resharding.utils) ParamGroupOverride (class in core.optimizer_param_scheduler) ParamKey (class in core.optimizer.optimizer_config) ParamPredicate (class in core.optimizer.optimizer_config) params (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) (core.transformer.spec_utils.ModuleSpec attribute) params_dtype (core.inference.model_inference_wrappers.inference_wrapper_config.InferenceWrapperConfig attribute) (core.model_parallel_config.ModelParallelConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) ParamWithNamePredicate (class in core.optimizer.optimizer_config) parse_and_normalize_split() (in module core.datasets.blended_megatron_dataset_config) parse_from_config() (core.extensions.transformer_engine.TEQuantizationParams static method) (core.extensions.transformer_engine.TEQuantizationRecipe class method) parse_s3_path() (in module core.datasets.object_storage_utils) parse_str_to_list() (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout static method) parse_strict_flag() (in module core.dist_checkpointing.validation) partition_buckets() (in module core.distributed.param_and_grad_buffer) partition_dim (core.resharding.utils.ParameterMetadata attribute) partition_stride (core.resharding.utils.ParameterMetadata attribute) passed (core.resharding.nvshmem_copy_service.validation.ValidationResult attribute) passed_tasks (core.resharding.nvshmem_copy_service.validation.ValidationSummary attribute) PATCH (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) path_to_cache (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) path_to_idx_cache (core.datasets.object_storage_utils.ObjectStorageConfig attribute) PAUSE (core.inference.headers.Headers attribute) (core.inference.inference_request.DynamicInferenceEventType attribute) pause() (core.energy_monitor.EnergyMonitor method) PAUSE_ACK (core.inference.headers.Headers attribute) pause_engines() (core.inference.inference_client.InferenceClient method) payload (core.inference.inference_request.DynamicInferenceEvent attribute) peer_rank (core.resharding.utils.TransferOp attribute) peer_slice (core.resharding.utils.TransferOp attribute) PELogger (class in core.resharding.nvshmem_copy_service.logger) perform_initialization (core.model_parallel_config.ModelParallelConfig attribute) permute() (in module core.transformer.moe.moe_utils) persist_layer_norm (core.transformer.transformer_config.TransformerConfig attribute) PERSISTENT_ERROR (core.rerun_state_machine.RerunDiagnostic attribute) PersistentAsyncCaller (class in core.dist_checkpointing.strategies.async_utils) pin_cpu_grads (core.optimizer.optimizer_config.OptimizerConfig attribute) pin_cpu_params (core.optimizer.optimizer_config.OptimizerConfig attribute) pipeline_dtype (core.model_parallel_config.ModelParallelConfig attribute) pipeline_model_parallel_comm_backend (core.model_parallel_config.ModelParallelConfig attribute) pipeline_model_parallel_layout (core.transformer.transformer_config.TransformerConfig attribute) pipeline_model_parallel_size (core.model_parallel_config.ModelParallelConfig attribute) pipeline_parallel_group_ranks (core.resharding.utils.ParameterMetadata attribute) PipelineExecutor (class in core.resharding.nvshmem_copy_service.core.pipeline_executor) PipelineOffloadManager (class in core.pipeline_parallel.fine_grained_activation_offload) PipelineParallelLayerLayout (class in core.transformer.pipeline_parallel_layer_layout) pixel_shuffle() (in module core.models.multimodal.llava_model) pool (core.transformer.cuda_graphs.TensorReusePool attribute) Pooler (class in core.models.bert.pooler) pop_backward_chunk() (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) pop_forward_chunk() (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) pop_layer() (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan method) pop_tensor() (core.pipeline_parallel.fine_grained_activation_offload.OffloadTensorGroup method) pop_tensors() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict method) pos_embd (core.process_groups_config.ProcessGroupCollection attribute) position_embedding (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) post_all_gather_processing() (in module core.fp8_utils) POST_BACKWARD (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.TrainingState attribute) post_conv_ssm() (core.ssm.mamba_context_parallel.MambaContextParallel method) post_forward_comm() (core.transformer.moe.shared_experts.SharedExpertMLP method) post_layernorm_bias (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) post_layernorm_weight (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) post_process_requests() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) post_warmup_callback() (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) postprocess() (core.transformer.moe.moe_layer.MoELayer method) PostProcessNode (class in core.models.gpt.fine_grained_callables) pp (core.process_groups_config.ProcessGroupCollection attribute) pp_rank (core.pipeline_parallel.multimodule_communicator.RankModuleInfo attribute) pp_size (core.pipeline_parallel.multimodule_communicator.RankModuleInfo attribute) PRE_BACKWARD (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.TrainingState attribute) pre_conv_ssm() (core.ssm.mamba_context_parallel.MambaContextParallel method) pre_cross_attn_layernorm (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) pre_forward_comm() (core.transformer.moe.shared_experts.SharedExpertMLP method) pre_mlp_layernorm (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) PRE_RELEASE (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) pre_reload_last_layer() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) predicate (core.optimizer.optimizer_config.ParamKey attribute) prefetch_managed_module_parameters() (in module core.inference.unified_memory) prefetch_managed_tensor() (in module core.inference.unified_memory) PrefetchOrder (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) prefill_req_count (core.inference.batch_dimensions_utils.InferenceBatchDimensions attribute) preload_fn (core.dist_checkpointing.strategies.async_utils.AsyncRequest attribute) preload_tensors() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync static method) prep_inference_input() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) (core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper.GPTInferenceWrapper method) (core.inference.model_inference_wrappers.t5.t5_inference_wrapper.T5InferenceWrapper method) (core.inference.text_generation_controllers.encoder_decoder_text_generation_controller.EncoderDecoderTextGenerationController method) (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) (core.inference.text_generation_controllers.vlm_text_generation_controller.VLMTextGenerationController method) prep_model_for_inference() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) prepare_decentralized_global_plan() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync method) prepare_for_absorption() (core.transformer.multi_latent_attention.MLASelfAttention method) prepare_gradient_calculation_operands() (core.tensor_parallel.cross_entropy.VocabParallelCrossEntropy static method) prepare_grads() (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) (core.optimizer.optimizer.MixedPrecisionOptimizer method) prepare_input_tensors_for_wgrad_compute() (in module core.utils) prepare_local_plan() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync method) prepare_write_data() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync method) prepend_axis_num (core.dist_checkpointing.mapping.ShardedTensor attribute) preprocess() (core.transformer.moe.moe_layer.MoELayer method) (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) preprocess_for_fine_grained_offloading() (core.models.gpt.gpt_model.GPTModel method) preprocess_func (core.datasets.multimodal_dataset.MultimodalDatasetConfig attribute) preprocess_state_dict_for_uneven_dtensor() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) PreProcessNode (class in core.models.gpt.fine_grained_callables) preserve_fp32_weights (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) pretty_repr() (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout method) print_diff_in_state_dicts() (in module core.transformer.fsdp_dtensor_checkpoint) print_offload_summary_table() (in module core.pipeline_parallel.fine_grained_activation_offload) print_stats() (core.rerun_state_machine.QuickStats method) probs (core.transformer.moe.moe_utils.MoECudaGraphTensorStore attribute) process_self_moves() (core.resharding.nvshmem_copy_service.core.pipeline_executor.PipelineExecutor method) ProcessGroupCollection (class in core.process_groups_config) ProcessGroupHelperMeta (class in core.process_groups_config) prompt (core.inference.inference_request.DynamicInferenceRequest attribute) (core.inference.inference_request.InferenceRequest attribute) prompt_log_probs (core.inference.inference_request.InferenceRequest attribute) prompt_tokens (core.inference.inference_request.DynamicInferenceRequest attribute) (core.inference.inference_request.InferenceRequest attribute) prompt_top_n_logprobs (core.inference.inference_request.InferenceRequest attribute) ProxyDict (class in core.optimizer.optimizer) ptrs (core.resharding.nvshmem_copy_service.nvshmem_types.TransferMetadata attribute) push() (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) push_offload_groups() (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) push_tensor() (core.pipeline_parallel.fine_grained_activation_offload.OffloadTensorGroup method) put() (core.inference.async_stream.AsyncStream method) (core.inference.text_generation_server.text_generation_server.MegatronGenerate method) (core.ssm.triton_cache_manager.ParallelFileCacheManager method) Q q_layernorm (core.transformer.attention.SelfAttentionSubmodules attribute) (core.transformer.multi_latent_attention.MLASelfAttentionSubmodules attribute) q_lora_rank (core.transformer.transformer_config.MLATransformerConfig attribute) QAttentionParamsConfigSchema (in module core.extensions.kitchen) QFlashAttentionParamsConfigSchema (in module core.extensions.kitchen) qk_clip (core.transformer.transformer_config.TransformerConfig attribute) qk_clip_alpha (core.transformer.transformer_config.TransformerConfig attribute) qk_clip_threshold (core.transformer.transformer_config.TransformerConfig attribute) qk_head_dim (core.transformer.transformer_config.MLATransformerConfig attribute) qk_l2_norm (core.transformer.transformer_config.TransformerConfig attribute) qk_layernorm (core.transformer.transformer_config.TransformerConfig attribute) qk_pos_emb_head_dim (core.transformer.transformer_config.MLATransformerConfig attribute) qkv_format (core.packed_seq_params.PackedSeqParams attribute) QLinearParams (in module core.extensions.kitchen) QLinearParamsConfigSchema (in module core.extensions.kitchen) quant_recipe (core.transformer.transformer_config.TransformerConfig attribute) QuantizationConfig (class in core.quantization.quant_config) quantize_param_shard() (in module core.fp8_utils) QuantizeRecipe (in module core.extensions.kitchen) QuantizeRecipeAttnBMM (in module core.extensions.kitchen) quick_geglu() (in module core.fusions.fused_bias_geglu) quick_geglu_back() (in module core.fusions.fused_bias_geglu) quick_gelu() (in module core.activations) (in module core.fusions.fused_bias_geglu) QuickStats (class in core.rerun_state_machine) QwenHuggingFaceModel (class in core.models.huggingface.qwen_model) R R_co (in module core.typed_torch) RADIOViTModel (class in core.models.vision.radio) RAISE_ALL (core.dist_checkpointing.validation.StrictHandling attribute) RAISE_UNEXPECTED (core.dist_checkpointing.validation.StrictHandling attribute) RampupBatchsizeNumMicroBatchesCalculator (class in core.num_microbatches_calculator) random_seed (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) RandomSTE (class in core.transformer.moe.moe_utils) Range (class in core.optimizer.distrib_optimizer) rank (core.rerun_state_machine.Caller attribute) RankCommInfo (class in core.pipeline_parallel.bridge_communicator) RankGenerator (class in core.parallel_state) RankModuleInfo (class in core.pipeline_parallel.multimodule_communicator) read() (core.datasets.indexed_dataset._BinReader method) (core.datasets.indexed_dataset._FileBinReader method) (core.datasets.indexed_dataset._MMapBinReader method) (core.datasets.indexed_dataset._MultiStorageClientBinReader method) (core.datasets.indexed_dataset._S3BinReader method) read_metadata() (core.dist_checkpointing.strategies.cached_metadata_filesystem_reader.CachedMetadataFileSystemReader method) READY_TO_USE (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.BucketStatus attribute) real_quant_cfg (core.post_training.modelopt.layers.BlockwiseFP8WeightTransformerLayer attribute) (core.post_training.modelopt.layers.FP8WeightTransformerLayer attribute) (core.post_training.modelopt.layers.RealQuantTransformerLayer attribute) RealQuantTransformerLayer (class in core.post_training.modelopt.layers) RECEIVER (core.pipeline_parallel.bridge_communicator.CommRole attribute) ReceiveRequest (class in core.resharding.nvshmem_copy_service.nvshmem_types) RecipeConfig (class in core.quantization.quant_config) recompute_granularity (core.transformer.transformer_config.TransformerConfig attribute) recompute_method (core.transformer.transformer_config.TransformerConfig attribute) recompute_modules (core.transformer.transformer_config.TransformerConfig attribute) recompute_num_layers (core.transformer.transformer_config.TransformerConfig attribute) reconfigure_num_microbatches_calculator() (in module core.num_microbatches_calculator) record (core.inference.engines.dynamic_engine.RequestEntry attribute) RECORD (core.transformer.moe.router_replay.RouterReplayAction attribute) record() (core.rerun_state_machine.QuickStats method) record_bwd_graph() (core.transformer.cuda_graphs._CudagraphGlobalRecord class method) record_current_stream() (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan method) record_fwd_graph() (core.transformer.cuda_graphs._CudagraphGlobalRecord class method) record_graph_capture() (core.transformer.cuda_graphs._CudaGraphRunner method) record_indices() (core.transformer.moe.router_replay.RouterReplay method) record_offload_event() (core.pipeline_parallel.fine_grained_activation_offload.OffloadTensorGroup method) record_reload_event() (core.pipeline_parallel.fine_grained_activation_offload.OffloadTensorGroup method) recv_backward() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) (core.pipeline_parallel.multimodule_communicator.MultiModulePipelineCommunicator method) (core.pipeline_parallel.p2p_communication.P2PCommunicator method) recv_forward() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) (core.pipeline_parallel.multimodule_communicator.MultiModulePipelineCommunicator method) (core.pipeline_parallel.p2p_communication.P2PCommunicator method) recv_from_prev_pipeline_rank_() (in module core.inference.communication_utils) recv_from_ranks (core.pipeline_parallel.bridge_communicator.RankCommInfo attribute) recv_ops (core.resharding.utils.ReshardPlan attribute) RecvOp (class in core.resharding.copy_services.gloo_copy_service) (class in core.resharding.copy_services.nccl_copy_service) recycle_unused_buckets() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline method) reduce_aux_losses_tracker_across_ranks() (in module core.transformer.moe.moe_utils) reduce_from_tensor_model_parallel_region() (in module core.tensor_parallel.mappings) reduce_gradients() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.GradReducePipeline method) reduce_loss_in_tracker() (core.transformer.multi_token_prediction.MTPLossLoggingHelper method) reduce_scatter_gradients() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) reduce_scatter_last_dim_to_tensor_parallel_region() (in module core.tensor_parallel.mappings) reduce_scatter_to_sequence_parallel_region() (in module core.tensor_parallel.mappings) reduce_scatter_with_fp32_accumulation (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) reduce_scatter_with_fp32_accumulation() (in module core.distributed.reduce_scatter_with_fp32_accumulation) RefitBackendName (in module core.resharding.refit) register_default_common_strategies() (in module core.dist_checkpointing.strategies.common) register_default_strategy() (in module core.dist_checkpointing.strategies.base) register_default_torch_strategies() (in module core.dist_checkpointing.strategies.torch) register_grad_ready() (core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup method) register_mem_pool() (in module core.nccl_allocator) register_receive() (core.resharding.nvshmem_copy_service.service.RemoteCopyService method) register_safe_globals() (in module core.safe_globals) register_send() (core.resharding.nvshmem_copy_service.service.RemoteCopyService method) RegisterFSDPBackwardFunction (class in core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp) RelativePositionEmbedding (class in core.models.common.embeddings.relative_pos_embedding) release_bucket() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline method) release_memory_blocks() (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) release_memory_blocks_from_request_indexes() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) release_state() (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan method) (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan method) reload() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) reload_from_cpu() (core.distributed.param_and_grad_buffer._ParamAndGradBuffer method) reload_mergeable_ranks() (in module core.tokenizers.text.libraries.tiktoken_tokenizer) reload_model_params() (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) (core.optimizer.optimizer.MixedPrecisionOptimizer method) remaining_prompt_length (core.inference.inference_request.DynamicInferenceRequest property) remaining_prompt_tokens (core.inference.inference_request.DynamicInferenceRequest attribute) RemoteCopyService (class in core.resharding.nvshmem_copy_service.service) remove_sharded_tensors() (core.dist_checkpointing.strategies.base.LoadShardedStrategy method) (core.dist_checkpointing.strategies.torch.TorchDistLoadShardedStrategy method) (in module core.dist_checkpointing.serialization) rename_input_layer_names_to_trtllm_layer_names() (core.export.trtllm.trtllm_layers.TRTLLMLayers static method) replace_prefix_for_sharding() (in module core.dist_checkpointing.utils) REPLAY_BACKWARD (core.transformer.moe.router_replay.RouterReplayAction attribute) REPLAY_FORWARD (core.transformer.moe.router_replay.RouterReplayAction attribute) replay_graph_capture() (core.transformer.cuda_graphs._CudaGraphRunner method) replica_id (core.dist_checkpointing.mapping.ShardedBase attribute) (core.dist_checkpointing.mapping.ShardedObject attribute) (core.dist_checkpointing.mapping.ShardedTensor attribute) (core.dist_checkpointing.mapping.ShardedTensorFactory attribute) ReplicaId (in module core.dist_checkpointing.mapping) report() (core.utils.StragglerDetector method) REPORT_DETERMINISM_STATS (core.rerun_state_machine.RerunMode attribute) REPORTING_INTERVAL_ITERATIONS (core.rerun_state_machine.RerunStateMachine attribute) req_count (core.inference.batch_dimensions_utils.InferenceBatchDimensions property) request_id (core.inference.inference_request.DynamicInferenceRequest attribute) (core.inference.inference_request.DynamicInferenceRequestRecord property) (core.inference.inference_request.InferenceRequest attribute) REQUEST_ID_BASE (in module core.resharding.nvshmem_copy_service.planning.task_segmenter) REQUEST_ROUNDER (core.inference.contexts.dynamic_context.DynamicInferenceContext attribute) RequestEntry (class in core.inference.engines.dynamic_engine) RequestOverflowError requests (core.inference.inference_request.DynamicInferenceRequestRecord attribute) requires_explicit_ckpt_mismatch_check() (core.dist_checkpointing.validation.StrictHandling static method) requires_global_app_metadata() (core.dist_checkpointing.validation.StrictHandling static method) requires_grad (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) requires_returning_mismatch_keys() (core.dist_checkpointing.validation.StrictHandling static method) reroute_samples_to_hdp_ranks() (core.datasets.data_schedule.HybridCPDataLoaderWrapper method) RERUN_DISABLED (core.rerun_state_machine.RerunValidationStatus attribute) RerunDataIterator (class in core.rerun_state_machine) RerunDiagnostic (class in core.rerun_state_machine) RerunErrorInjector (class in core.rerun_state_machine) RerunMode (class in core.rerun_state_machine) RERUNNING_AGAIN_FROM_CHECKPOINT (core.rerun_state_machine.RerunState attribute) RERUNNING_FROM_CHECKPOINT (core.rerun_state_machine.RerunState attribute) RERUNNING_IN_PLACE (core.rerun_state_machine.RerunState attribute) RerunState (class in core.rerun_state_machine) RerunStateMachine (class in core.rerun_state_machine) RerunValidationStatus (class in core.rerun_state_machine) reset() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.GradReducePipeline method) (core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup method) (core.distributed.param_and_grad_buffer._ParamAndGradBuffer method) (core.extensions.transformer_engine.TECudaRNGStatesTracker method) (core.inference.contexts.dynamic_block_allocator.BlockAllocator method) (core.inference.contexts.dynamic_context.DynamicInferenceContext method) (core.inference.contexts.static_context.StaticInferenceContext method) (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) (core.inference.utils.Counter method) (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) (core.pipeline_parallel.fine_grained_activation_offload.FineGrainedActivationOffloadingInterface static method) (core.pipeline_parallel.fine_grained_activation_offload.GPUTensorPool method) (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager method) (core.rerun_state_machine.QuickStats method) (core.tensor_parallel.random.CudaRNGStatesTracker method) (core.timers.DummyTimer method) (core.timers.Timer method) (core.timers.TimerBase method) (core.utils.StragglerDetector method) reset_attention_mask (core.datasets.gpt_dataset.GPTDatasetConfig attribute) reset_attention_state() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) reset_batch_size_offset() (core.inference.contexts.base_context.BaseInferenceContext method) reset_global_aux_loss_tracker() (core.transformer.moe.router.TopKRouter method) reset_hybrid_ep_buffer() (in module core.transformer.moe.fused_a2a) reset_instance() (core.pipeline_parallel.fine_grained_activation_offload.FineGrainedActivationOffloadingInterface static method) (core.pipeline_parallel.fine_grained_activation_offload.PipelineOffloadManager class method) reset_mamba_state() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) reset_model_temporary_tensors() (in module core.distributed.finalize_model_grads) reset_parameters() (core.fusions.fused_layer_norm.FusedLayerNorm method) (core.ssm.gated_delta_net.GatedDeltaNet method) (core.transformer.moe.router.Router method) reset_position_ids (core.datasets.gpt_dataset.GPTDatasetConfig attribute) ResetParametersContext (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) reshard_after_forward (core.distributed.torch_fully_sharded_data_parallel_config.TorchFullyShardedDataParallelConfig attribute) reshard_model_weights() (in module core.resharding.refit) ReshardPlan (class in core.resharding.utils) resolve_tensor() (core.dist_checkpointing.strategies.torch.MCoreLoadPlanner method) resolved_name (core.resharding.utils.ParameterMetadata attribute) restore_from_cpu() (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.MegatronOptimizer method) restore_grad_buffers() (core.distributed.distributed_data_parallel.DistributedDataParallel method) restore_tensor_device() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict method) result (core.full_cuda_graph.FullCudaGraphWrapper attribute) results (core.resharding.nvshmem_copy_service.validation.ValidationSummary attribute) RESUME (core.inference.headers.Headers attribute) resume() (core.energy_monitor.EnergyMonitor method) (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) resume_engines() (core.inference.inference_client.InferenceClient method) resume_paused_requests() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) retrieve_write_results() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync method) RETURN_ALL (core.dist_checkpointing.validation.StrictHandling attribute) return_layer_name_and_number() (core.export.trtllm.trtllm_layers.TRTLLMLayers static method) return_log_probs (core.inference.sampling_params.SamplingParams attribute) return_prompt_top_n_logprobs (core.inference.sampling_params.SamplingParams attribute) return_segments (core.inference.sampling_params.SamplingParams attribute) RETURN_UNEXPECTED (core.dist_checkpointing.validation.StrictHandling attribute) reuse_grad_buf_for_mxfp8_param_ag (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) rewind() (core.rerun_state_machine.RerunDataIterator method) rmsnorm_batch_invariant() (in module core.transformer.custom_layers.batch_invariant_kernels) role (core.pipeline_parallel.bridge_communicator.RankCommInfo attribute) roll_tensor() (in module core.transformer.multi_token_prediction) rope_type (core.transformer.transformer_config.MLATransformerConfig attribute) rotary_base (core.transformer.transformer_config.MLATransformerConfig attribute) rotary_bwd_kv_kernel() (in module core.fusions.fused_mla_yarn_rope_apply) rotary_bwd_q_kernel() (in module core.fusions.fused_mla_yarn_rope_apply) rotary_fwd_kv_kernel() (in module core.fusions.fused_mla_yarn_rope_apply) rotary_fwd_q_kernel() (in module core.fusions.fused_mla_yarn_rope_apply) rotary_interleaved (core.transformer.transformer_config.TransformerConfig attribute) rotary_percent (core.transformer.transformer_config.MLATransformerConfig attribute) rotary_scaling_factor (core.transformer.transformer_config.MLATransformerConfig attribute) RotaryBucketAllocator (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) RotaryEmbedding (class in core.models.common.embeddings.rotary_pos_embedding) round_up() (core.inference.contexts.dynamic_context.DynamicInferenceContext class method) round_up_requests() (core.inference.contexts.dynamic_context.DynamicInferenceContext class method) round_up_tokens() (core.inference.contexts.dynamic_context.DynamicInferenceContext class method) route() (core.transformer.moe.moe_layer.MoELayer method) routed_experts_compute() (core.transformer.moe.moe_layer.MoELayer method) Router (class in core.transformer.moe.router) router (core.transformer.moe.moe_layer.MoESubmodules attribute) router_and_preprocess() (core.transformer.moe.moe_layer.MoELayer method) router_gating_linear() (in module core.transformer.moe.moe_utils) RouterBuilder (class in core.transformer.moe.moe_layer) RouterGatingLinearFunction (class in core.transformer.moe.moe_utils) RouterInterface (class in core.transformer.moe.moe_layer) RouterReplay (class in core.transformer.moe.router_replay) RouterReplayAction (class in core.transformer.moe.router_replay) routing() (core.transformer.moe.router.Router method) (core.transformer.moe.router.TopKRouter method) routing_map (core.transformer.moe.moe_utils.MoECudaGraphTensorStore attribute) row_parallel_linear() (core.extensions.transformer_engine_spec_provider.TESpecProvider method) (core.models.backends.BackendSpecProvider method) (core.models.backends.InferenceSpecProvider method) (core.models.backends.LocalSpecProvider method) RowParallelLinear (class in core.tensor_parallel.layers) run() (core.inference.text_generation_server.text_generation_server.MegatronServer method) (core.models.common.model_chunk_schedule_plan.TransformerLayerSchedulePlan static method) (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan static method) (core.pipeline_parallel.utils.AbstractSchedulePlan static method) (core.resharding.copy_services.base.CopyService method) (core.resharding.copy_services.gloo_copy_service.GlooCopyService method) (core.resharding.copy_services.nccl_copy_service.NCCLCopyService method) (core.resharding.copy_services.nvshmem_copy_service.NVSHMEMCopyService method) (core.resharding.nvshmem_copy_service.service.RemoteCopyService method) run_engine() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) (core.inference.engines.static_engine.StaticInferenceEngine method) run_engine_async() (core.inference.engines.static_engine.StaticInferenceEngine method) run_engine_with_coordinator() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) run_flask_server() (in module core.inference.text_generation_server.dynamic_text_gen_server.flask_server) run_mcore_engine() (in module core.inference.text_generation_server.run_mcore_engine) run_one_forward_step() (core.inference.model_inference_wrappers.abstract_model_inference_wrapper.AbstractModelInferenceWrapper method) run_realtime_tests() (core.transformer.attention.SelfAttention method) S S3_PREFIX (in module core.datasets.object_storage_utils) S3Client (class in core.datasets.object_storage_utils) S3Config (in module core.datasets.object_storage_utils) safe_get_rank() (in module core._rank_utils) SAFE_GLOBALS (in module core.safe_globals) safely_set_viewless_tensor_data() (in module core.utils) sample_from_logits() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) sampling_params (core.inference.inference_request.InferenceRequest attribute) SamplingParams (class in core.inference.sampling_params) save() (core.dist_checkpointing.strategies.base.AsyncSaveShardedStrategy method) (core.dist_checkpointing.strategies.base.SaveShardedStrategy method) (core.dist_checkpointing.strategies.fully_parallel.FullyParallelSaveStrategyWrapper method) (in module core.dist_checkpointing.serialization) SAVE_COMMON (core.dist_checkpointing.strategies.base.StrategyAction attribute) save_common() (core.dist_checkpointing.strategies.base.SaveCommonStrategy method) (core.dist_checkpointing.strategies.common.TorchCommonSaveStrategy method) save_config() (in module core.dist_checkpointing.core) save_loss_to_tracker() (core.transformer.multi_token_prediction.MTPLossLoggingHelper static method) save_parameter_state() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) save_preprocess() (in module core.dist_checkpointing.state_dict_utils) save_pretrained() (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText method) SAVE_SHARDED (core.dist_checkpointing.strategies.base.StrategyAction attribute) save_sharded_objects() (core.dist_checkpointing.strategies.base.SaveCommonStrategy method) (core.dist_checkpointing.strategies.common.TorchCommonSaveStrategy method) save_state_dict_async_finalize() (in module core.dist_checkpointing.strategies.state_dict_saver) save_state_dict_async_plan() (in module core.dist_checkpointing.strategies.state_dict_saver) save_state_dict_to_file() (core.optimizer.layer_wise_optimizer.LayerWiseDistributedOptimizer method) save_to_aux_losses_tracker() (in module core.transformer.moe.moe_utils) save_vocabulary() (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) SaveCommonStrategy (class in core.dist_checkpointing.strategies.base) SaveShardedStrategy (class in core.dist_checkpointing.strategies.base) SaveStrategyBase (class in core.dist_checkpointing.strategies.base) scale (core.optimizer.grad_scaler.MegatronGradScaler property) scale_gradients() (core.distributed.data_parallel_base._BaseDataParallel method) (core.distributed.distributed_data_parallel.DistributedDataParallel method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) (core.distributed.param_and_grad_buffer._ParamAndGradBuffer method) scale_loss() (core.optimizer.optimizer.MegatronOptimizer method) scaled_init_method_normal() (in module core.utils) ScaledMaskedSoftmax (class in core.fusions.fused_softmax) ScaledSoftmax (class in core.fusions.fused_softmax) ScaledUpperTriangMaskedSoftmax (class in core.fusions.fused_softmax) scatter_to_sequence_parallel_region() (in module core.tensor_parallel.mappings) scatter_to_tensor_model_parallel_region() (in module core.tensor_parallel.mappings) schedule() (core.resharding.nvshmem_copy_service.service.RemoteCopyService method) schedule_async_call() (core.dist_checkpointing.strategies.async_utils.AsyncCaller method) (core.dist_checkpointing.strategies.async_utils.PersistentAsyncCaller method) (core.dist_checkpointing.strategies.async_utils.TemporalAsyncCaller method) schedule_async_request() (core.dist_checkpointing.strategies.async_utils.AsyncCallsQueue method) schedule_chunked_prefill() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) schedule_non_chunked_prefill() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) schedule_requests() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) schedule_waiting_requests() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) ScheduledBatch (class in core.resharding.nvshmem_copy_service.nvshmem_types) ScheduleNode (class in core.pipeline_parallel.utils) Scheduler (class in core.inference.scheduler) SECOND_RERUN_NOT_REPRODUCIBLE (core.rerun_state_machine.RerunValidationStatus attribute) SECOND_RERUN_REPRODUCIBLE (core.rerun_state_machine.RerunValidationStatus attribute) seed (core.datasets.gpt_dataset.MockGPTLowLevelDataset attribute) SEGMENT_ID_MULTIPLIER (in module core.resharding.nvshmem_copy_service.planning.task_segmenter) segment_receive_request() (core.resharding.nvshmem_copy_service.planning.task_segmenter.TaskSegmenter method) segment_send_request() (core.resharding.nvshmem_copy_service.planning.task_segmenter.TaskSegmenter method) segments (core.inference.inference_request.InferenceRequest attribute) select_src_metadata_balanced() (in module core.resharding.utils) self_attention (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) self_attn (core.transformer.enums.AttnType attribute) self_attn_bda (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) SelfAttention (class in core.transformer.attention) SelfAttentionSubmodules (class in core.transformer.attention) send_backward() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) (core.pipeline_parallel.multimodule_communicator.MultiModulePipelineCommunicator method) (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_backward_recv_backward() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_backward_recv_forward() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) (core.pipeline_parallel.multimodule_communicator.MultiModulePipelineCommunicator method) (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_do_generate() (in module core.inference.text_generation_server.dynamic_text_gen_server.endpoints.common) send_forward() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) (core.pipeline_parallel.multimodule_communicator.MultiModulePipelineCommunicator method) (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_forward_backward_recv_forward_backward() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_forward_recv_backward() (core.pipeline_parallel.bridge_communicator.BridgeCommunicator method) (core.pipeline_parallel.multimodule_communicator.MultiModulePipelineCommunicator method) (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_forward_recv_forward() (core.pipeline_parallel.p2p_communication.P2PCommunicator method) send_ops (core.resharding.utils.ReshardPlan attribute) send_to_next_pipeline_rank() (in module core.inference.communication_utils) send_to_ranks (core.pipeline_parallel.bridge_communicator.RankCommInfo attribute) SENDER (core.pipeline_parallel.bridge_communicator.CommRole attribute) SendOp (class in core.resharding.copy_services.gloo_copy_service) (class in core.resharding.copy_services.nccl_copy_service) SendRequest (class in core.resharding.nvshmem_copy_service.nvshmem_types) SentencePieceTokenizer (class in core.tokenizers.text.libraries.sentencepiece_tokenizer) sep (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) sep_id (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) sequence (core.rerun_state_machine.Call attribute) sequence_length (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) sequence_length_decoder (core.datasets.t5_dataset.T5MaskedWordPieceDatasetConfig attribute) sequence_length_encoder (core.datasets.t5_dataset.T5MaskedWordPieceDatasetConfig attribute) sequence_lengths (core.datasets.indexed_dataset.IndexedDataset property) sequence_modes (core.datasets.indexed_dataset.IndexedDataset property) sequence_parallel (core.model_parallel_config.ModelParallelConfig attribute) sequence_parallel_size (core.datasets.gpt_dataset.GPTDatasetConfig attribute) sequences_per_dataset (core.datasets.gpt_dataset.GPTDatasetConfig attribute) SequentialMLP (class in core.transformer.moe.experts) SerializableStateType (in module core.rerun_state_machine) serialize() (core.inference.contexts.dynamic_context.ContextErrorFactory class method) (core.inference.inference_request.DynamicInferenceEvent method) (core.inference.inference_request.DynamicInferenceRequest method) (core.inference.inference_request.DynamicInferenceRequestRecord method) (core.inference.inference_request.InferenceRequest method) (core.inference.sampling_params.SamplingParams method) serialize_tensor() (in module core.inference.inference_request) set() (core.transformer.moe.moe_utils.MoECudaGraphTensorStore method) set_barrier_group() (core.timers.Timer method) set_batch_invariant_mode() (in module core.transformer.custom_layers.batch_invariant_kernels) set_current_microbatch() (in module core.pipeline_parallel.schedules) set_data_parallel_rank() (in module core.parallel_state) set_decode_expert_padding() (in module core.inference.utils) set_defaults_if_not_set_tensor_model_parallel_attributes() (in module core.tensor_parallel.layers) set_document_indices() (core.datasets.indexed_dataset.IndexedDataset method) set_elapsed() (core.timers.Timer method) set_events() (core.resharding.nvshmem_copy_service.core.pipeline_executor.PipelineExecutor method) set_experimental_flag() (in module core.config) set_expert_model_parallel_rank() (in module core.parallel_state) set_expert_model_parallel_world_size() (in module core.parallel_state) set_expert_tensor_parallel_rank() (in module core.parallel_state) set_expert_tensor_parallel_world_size() (in module core.parallel_state) set_extra_state() (core.tensor_parallel.layers.ColumnParallelLinear method) (core.tensor_parallel.layers.RowParallelLinear method) set_for_recompute_input_layernorm() (core.transformer.attention.Attention method) (core.transformer.attention.SelfAttention method) (core.transformer.multi_latent_attention.MLASelfAttention method) set_for_recompute_pre_mlp_layernorm() (core.transformer.moe.moe_layer.MoELayer method) set_global_router_replay_action() (core.transformer.moe.router_replay.RouterReplay static method) set_graphed_backward_dw_callable() (core.models.gpt.fine_grained_callables._BackwardDWWrapper method) set_ideal_affinity_for_current_gpu() (in module core.pipeline_parallel.utils) set_input_tensor() (core.models.bert.bert_model.BertModel method) (core.models.gpt.gpt_model.GPTModel method) (core.models.huggingface.module.HuggingFaceModule method) (core.models.mamba.mamba_model.MambaModel method) (core.models.mimo.model.base.MimoModel method) (core.models.multimodal.llava_model.LLaVAModel method) (core.models.T5.t5_model.T5Model method) (core.models.vision.clip_vit_model.CLIPViTModel method) (core.models.vision.radio.RADIOViTModel method) (core.ssm.mamba_block.MambaStack method) (core.transformer.module.Float16Module method) (core.transformer.transformer_block.TransformerBlock method) set_is_first_microbatch() (core.transformer.module.MegatronModule method) set_item() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.DataParallelBuffer method) set_layer_number() (core.transformer.moe.moe_layer.BaseMoELayer method) (core.transformer.moe.moe_layer.RouterInterface method) (core.transformer.moe.router.Router method) set_level() (core.resharding.nvshmem_copy_service.logger.PELogger class method) set_loss_scale() (core.transformer.moe.moe_utils.MoEAuxLossAutoScaler static method) (core.transformer.multi_token_prediction.MTPLossAutoScaler static method) set_mode() (core.rerun_state_machine.RerunStateMachine method) set_model_auto_sync() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) set_model_to_sequence_parallel() (in module core.transformer.utils) set_pipeline_model_parallel_rank() (in module core.parallel_state) set_pipeline_model_parallel_world_size() (in module core.parallel_state) set_replay_data() (core.transformer.moe.router_replay.RouterReplay static method) set_router_replay_action() (core.transformer.moe.router_replay.RouterReplay method) set_save_original_input() (in module core.extensions.transformer_engine) set_shared_experts() (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEFlexTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoETokenDispatcher method) set_states() (core.extensions.transformer_engine.TECudaRNGStatesTracker method) (core.tensor_parallel.random.CudaRNGStatesTracker method) set_stop_word_finished_ids_callback() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) set_streams() (core.resharding.nvshmem_copy_service.core.kernel_launcher.KernelLauncher method) (core.resharding.nvshmem_copy_service.core.pipeline_executor.PipelineExecutor method) (in module core.pipeline_parallel.utils) set_symmetric_ar() (core.transformer.module.MegatronModule method) set_target_indices() (core.transformer.moe.router_replay.RouterReplay method) set_te_cuda_graph_backward_dw_wrapper() (core.transformer.module.GraphableMegatronModule method) set_tensor_grad_fn_sequence_sr() (in module core.transformer.moe.shared_experts) set_tensor_model_parallel_attributes() (in module core.tensor_parallel.layers) set_tensor_model_parallel_rank() (in module core.parallel_state) set_tensor_model_parallel_world_size() (in module core.parallel_state) set_virtual_pipeline_model_parallel_rank() (in module core.parallel_state) set_virtual_pipeline_model_parallel_world_size() (in module core.parallel_state) setup() (core.energy_monitor.EnergyMonitor method) setup_embeddings_and_output_layer() (core.models.common.language_module.language_module.LanguageModule method) setup_manual_hooks() (core.transformer.module.GraphableMegatronModule method) setup_metadata() (core.transformer.moe.token_dispatcher._DeepepManager method) (core.transformer.moe.token_dispatcher._DispatchManager method) (core.transformer.moe.token_dispatcher._HybridEPManager method) setup_process_groups_for_ddp() (core.process_groups_config.ProcessGroupCollection static method) setup_process_groups_for_optimizer() (core.process_groups_config.ProcessGroupCollection static method) sgd_momentum (core.optimizer.optimizer_config.OptimizerConfig attribute) (core.optimizer.optimizer_config.SGDOptimizerConfig attribute) SGDOptimizerConfig (class in core.optimizer.optimizer_config) shape (core.resharding.utils.ParameterMetadata attribute) Shape (in module core.pipeline_parallel.combined_1f1b) (in module core.pipeline_parallel.multimodule_communicator) (in module core.pipeline_parallel.p2p_communication) (in module core.pipeline_parallel.schedules) shard_buffer() (in module core.distributed.param_and_grad_buffer) shard_params() (core.optimizer.layer_wise_optimizer.LayerWiseDistributedOptimizer method) shard_to_metadata (core.dist_checkpointing.exchange_utils.ShardDistribution attribute) ShardBucketIndex (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) ShardDistribution (class in core.dist_checkpointing.exchange_utils) sharded_backend (core.dist_checkpointing.core.CheckpointingConfig attribute) sharded_backend_version (core.dist_checkpointing.core.CheckpointingConfig attribute) sharded_param_state_dp_reshardable() (core.optimizer.distrib_optimizer.DistributedOptimizer method) sharded_param_state_dp_zero() (core.optimizer.distrib_optimizer.DistributedOptimizer method) sharded_param_state_fs_model_space() (core.optimizer.distrib_optimizer.DistributedOptimizer method) sharded_param_state_fsdp_dtensor() (core.optimizer.distrib_optimizer.DistributedOptimizer method) sharded_param_state_fully_reshardable() (core.optimizer.distrib_optimizer.DistributedOptimizer method) sharded_state_dict (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict attribute) sharded_state_dict() (core.extensions.transformer_engine.TEColumnParallelLinear method) (core.extensions.transformer_engine.TEDotProductAttention method) (core.extensions.transformer_engine.TELayerNormColumnParallelLinear method) (core.extensions.transformer_engine.TELinear method) (core.extensions.transformer_engine.TERowParallelLinear method) (core.models.common.language_module.language_module.LanguageModule method) (core.models.gpt.gpt_model.GPTModel method) (core.models.T5.t5_model.T5Model method) (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.layer_wise_optimizer.LayerWiseDistributedOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) (core.post_training.modelopt.layers.Linear method) (core.ssm.gated_delta_net.GatedDeltaNet method) (core.ssm.mamba_block.MambaStack method) (core.ssm.mamba_layer.MambaLayer method) (core.ssm.mamba_mixer.ExtendedRMSNorm method) (core.ssm.mamba_mixer.MambaMixer method) (core.tensor_parallel.layers.ColumnParallelLinear method) (core.tensor_parallel.layers.RowParallelLinear method) (core.tensor_parallel.layers.VocabParallelEmbedding method) (core.transformer.dot_product_attention.DotProductAttention method) (core.transformer.mlp.MLP method) (core.transformer.module.Float16Module method) (core.transformer.module.MegatronModule method) (core.transformer.moe.experts.GroupedMLP method) (core.transformer.moe.experts.SequentialMLP method) (core.transformer.moe.experts.TEGroupedMLP method) (core.transformer.moe.shared_experts.SharedExpertMLP method) (core.transformer.multi_token_prediction.MultiTokenPredictionBlock method) (core.transformer.multi_token_prediction.MultiTokenPredictionLayer method) (core.transformer.transformer_block.TransformerBlock method) (core.transformer.transformer_layer.TransformerLayer method) sharded_state_dict_default() (in module core.transformer.utils) sharded_state_dict_keys_map (core.ssm.mamba_layer.MambaLayerSubmodules attribute) (core.transformer.transformer_layer.TransformerLayerSubmodules attribute) sharded_tensor_to_torch_sharded_tensor() (in module core.dist_checkpointing.strategies.torch) ShardedBase (class in core.dist_checkpointing.mapping) ShardedObject (class in core.dist_checkpointing.mapping) ShardedStateDict (in module core.dist_checkpointing.mapping) ShardedTensor (class in core.dist_checkpointing.mapping) ShardedTensorFactory (class in core.dist_checkpointing.mapping) ShardingDescriptor (class in core.resharding.utils) ShardingStrategy (class in core.distributed.fsdp.src.megatron_fsdp.fully_shard) shards_in_this_group (core.dist_checkpointing.exchange_utils.ShardDistribution attribute) shared_embedding_or_output_weight() (core.models.common.language_module.language_module.LanguageModule method) (core.models.gpt.gpt_model.GPTModel method) (core.models.multimodal.llava_model.LLaVAModel method) (core.models.T5.t5_model.T5Model method) shared_expert_output (core.transformer.moe.moe_utils.MoECudaGraphTensorStore attribute) shared_experts (core.transformer.moe.moe_layer.MoESubmodules attribute) shared_experts_compute() (core.transformer.moe.moe_layer.MoELayer method) SharedExpertMLP (class in core.transformer.moe.shared_experts) short_sequence_probability (core.datasets.masked_dataset.MaskedWordPieceDatasetConfig attribute) should_bulk_offload() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) should_checkpoint_and_exit() (core.rerun_state_machine.RerunStateMachine method) should_free_input() (in module core.models.gpt.fine_grained_callables) should_run_forward_backward() (core.rerun_state_machine.RerunStateMachine method) shutdown() (core.energy_monitor.EnergyMonitor method) (core.resharding.nvshmem_copy_service.logger.PELogger class method) SiglipHuggingFaceModel (class in core.models.huggingface.clip_model) SingleDeviceTRTLLMModelWeightsConverter (class in core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter) sinkhorn() (in module core.transformer.moe.moe_utils) sinkhorn_load_balancing() (core.transformer.moe.router.TopKRouter method) size (core.datasets.gpt_dataset.MockGPTLowLevelDataset attribute) (core.resharding.nvshmem_copy_service.nvshmem_types.ReceiveRequest attribute) (core.resharding.nvshmem_copy_service.nvshmem_types.SendRequest attribute) (core.resharding.nvshmem_copy_service.validation.ValidationResult attribute) size() (core.datasets.indexed_dataset.DType static method) sizes (core.resharding.nvshmem_copy_service.nvshmem_types.TransferMetadata attribute) skip_prompt_log_probs (core.inference.sampling_params.SamplingParams attribute) softmax_scale (core.transformer.transformer_config.TransformerConfig attribute) softmax_type (core.transformer.transformer_config.TransformerConfig attribute) SoftmaxOne (class in core.fusions.fused_softmax) sort_chunks_by_idxs() (in module core.transformer.moe.moe_utils) special_token_ids (core.models.mimo.config.base_configs.MimoModelConfig attribute) SPECIAL_TOKEN_TEMPLATE (in module core.tokenizers.text.libraries.tiktoken_tokenizer) SPECIAL_TOKENS (in module core.tokenizers.text.libraries.tiktoken_tokenizer) Split (class in core.datasets.utils) split (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) split_dtensor() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) split_matrix (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) split_state_dict_if_needed() (core.optimizer.distrib_optimizer.DistributedOptimizer method) split_te_layernorm_column_parallel_linear() (in module core.extensions.transformer_engine) split_tensor_along_last_dim() (in module core.tensor_parallel.utils) split_tensor_into_1d_equal_chunks() (in module core.tensor_parallel.utils) squared_relu() (in module core.activations) src_dim_ranks (core.resharding.utils.ShardingDescriptor attribute) src_pe (core.resharding.nvshmem_copy_service.nvshmem_types.ReceiveRequest attribute) (core.resharding.nvshmem_copy_service.nvshmem_types.ScheduledBatch attribute) (core.resharding.nvshmem_copy_service.validation.ValidationResult attribute) src_pos (core.resharding.nvshmem_copy_service.nvshmem_types.SendRequest attribute) src_rank (core.resharding.copy_services.gloo_copy_service.RecvOp attribute) (core.resharding.copy_services.nccl_copy_service.RecvOp attribute) src_stride (core.resharding.utils.ShardingDescriptor attribute) src_tensor (core.resharding.nvshmem_copy_service.nvshmem_types.SendRequest attribute) start() (core.inference.data_parallel_inference_coordinator.DataParallelInferenceCoordinator method) (core.inference.inference_client.InferenceClient method) (core.timers.DummyTimer method) (core.timers.Timer method) (core.timers.TimerBase method) start_grad_sync() (core.distributed.data_parallel_base._BaseDataParallel method) (core.distributed.distributed_data_parallel.DistributedDataParallel method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) (core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup method) start_listening_to_data_parallel_coordinator() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) start_method() (core.utils.StragglerDetector method) start_param_sync() (core.distributed.distributed_data_parallel.DistributedDataParallel method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) (core.distributed.param_and_grad_buffer._ParamAndGradBucketGroup method) start_wd (core.optimizer_param_scheduler.ParamGroupOverride attribute) state (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan property) (core.optimizer.optimizer.ChainedOptimizer property) (core.optimizer.optimizer.MegatronOptimizer attribute) state_dict() (core.distributed.data_parallel_base._BaseDataParallel method) (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.grad_scaler.ConstantGradScaler method) (core.optimizer.grad_scaler.DynamicGradScaler method) (core.optimizer.grad_scaler.MegatronGradScaler method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) (core.optimizer_param_scheduler.OptimizerParamScheduler method) (core.rerun_state_machine.RerunDataIterator method) (core.rerun_state_machine.RerunErrorInjector method) (core.rerun_state_machine.RerunStateMachine method) (core.transformer.module.Float16Module method) state_dict_for_save_checkpoint() (core.distributed.data_parallel_base._BaseDataParallel method) (core.transformer.module.Float16Module method) (core.transformer.module.MegatronModule method) StateDict (in module core.dist_checkpointing.mapping) static_buffers (core.full_cuda_graph.StaticBufferLoader attribute) StaticBufferLoader (class in core.full_cuda_graph) StaticInferenceContext (class in core.inference.contexts.static_context) StaticInferenceEngine (class in core.inference.engines.static_engine) Status (class in core.inference.inference_request) status (core.inference.inference_request.InferenceRequest attribute) step (core.inference.engines.dynamic_engine.DynamicInferenceEngine attribute) step() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) (core.optimizer.layer_wise_optimizer.LayerWiseDistributedOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) (core.optimizer.optimizer.MixedPrecisionOptimizer method) (core.optimizer_param_scheduler.OptimizerParamScheduler method) step_legacy() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) step_modern() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) step_with_ready_grads() (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) (core.optimizer.optimizer.MixedPrecisionOptimizer method) STOP (core.inference.headers.Headers attribute) stop() (core.inference.data_parallel_inference_coordinator.DataParallelInferenceCoordinator method) (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) (core.inference.inference_client.InferenceClient method) (core.timers.DummyTimer method) (core.timers.Timer method) (core.timers.TimerBase method) STOP_ACK (core.inference.headers.Headers attribute) stop_communication() (core.distributed.fsdp.mcore_fsdp_adapter.FullyShardedDataParallel method) stop_engines() (core.inference.inference_client.InferenceClient method) STOP_ITERATION (in module core.inference.async_stream) stop_method() (core.utils.StragglerDetector method) stop_word_ids (core.inference.inference_request.DynamicInferenceRequest attribute) stop_words (core.inference.sampling_params.SamplingParams attribute) StorageResizeBasedBucketAllocator (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) store_param_remainders (core.optimizer.optimizer_config.OptimizerConfig attribute) str_dtype_to_torch() (in module core.export.trtllm.trtllm_weights_converter.distributed_trtllm_model_weights_converter) (in module core.export.trtllm.trtllm_weights_converter.single_device_trtllm_model_weights_converter) StragglerDetector (class in core.utils) StrategyAction (class in core.dist_checkpointing.strategies.base) stream (core.transformer.moe.shared_experts.SharedExpertMLP attribute) stream_acquire_context() (in module core.pipeline_parallel.utils) stream_tokens() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) StrictHandling (class in core.dist_checkpointing.validation) sub_optimizers (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer property) submit_recv() (core.resharding.copy_services.base.CopyService method) (core.resharding.copy_services.gloo_copy_service.GlooCopyService method) (core.resharding.copy_services.nccl_copy_service.NCCLCopyService method) (core.resharding.copy_services.nvshmem_copy_service.NVSHMEMCopyService method) submit_recv_with_id() (core.resharding.copy_services.gloo_copy_service.GlooCopyService method) (core.resharding.copy_services.nccl_copy_service.NCCLCopyService method) (core.resharding.copy_services.nvshmem_copy_service.NVSHMEMCopyService method) SUBMIT_REQUEST (core.inference.headers.Headers attribute) submit_send() (core.resharding.copy_services.base.CopyService method) (core.resharding.copy_services.gloo_copy_service.GlooCopyService method) (core.resharding.copy_services.nccl_copy_service.NCCLCopyService method) (core.resharding.copy_services.nvshmem_copy_service.NVSHMEMCopyService method) submit_send_with_id() (core.resharding.copy_services.gloo_copy_service.GlooCopyService method) (core.resharding.copy_services.nccl_copy_service.NCCLCopyService method) (core.resharding.copy_services.nvshmem_copy_service.NVSHMEMCopyService method) submodules (core.transformer.spec_utils.ModuleSpec attribute) succeeded() (core.inference.inference_request.DynamicInferenceRequest method) SUCCESS (core.inference.unified_memory.CompilationState attribute) suggested_bucket_size (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.BucketingPolicy attribute) suggested_communication_unit_size (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) summary() (core.resharding.nvshmem_copy_service.logger.PELogger class method) SUPPORTED_ATTN_MASK (in module core.transformer.multi_token_prediction) SUSPEND (core.inference.headers.Headers attribute) suspend() (core.inference.engines.dynamic_engine.DynamicInferenceEngine method) suspend_engines() (core.inference.inference_client.InferenceClient method) suspend_resume_ctx() (core.inference.engines.dynamic_engine.DynamicInferenceEngine static method) swap_key_value_dict() (core.inference.contexts.static_context.StaticInferenceContext method) swap_model_weights() (in module core.resharding.refit) swiglu() (in module core.fusions.fused_bias_swiglu) swiglu_back() (in module core.fusions.fused_bias_swiglu) SwiGLUFunction (class in core.fusions.fused_bias_swiglu) switch_load_balancing_loss_func() (in module core.transformer.moe.moe_utils) symbolic() (core.tensor_parallel.mappings._AllGatherFromTensorParallelRegion static method) (core.tensor_parallel.mappings._CopyToModelParallelRegion static method) (core.tensor_parallel.mappings._GatherFromModelParallelRegion static method) (core.tensor_parallel.mappings._GatherFromSequenceParallelRegion static method) (core.tensor_parallel.mappings._ReduceFromModelParallelRegion static method) (core.tensor_parallel.mappings._ReduceScatterToSequenceParallelRegion static method) (core.tensor_parallel.mappings._ReduceScatterToTensorParallelRegion static method) (core.tensor_parallel.mappings._ScatterToModelParallelRegion static method) (core.tensor_parallel.mappings._ScatterToSequenceParallelRegion static method) Symbols (class in core.ssm.mamba_hybrid_layer_allocation) symmetric_ar_type (core.transformer.transformer_config.TransformerConfig attribute) sync() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) sync_all_async_calls() (core.dist_checkpointing.strategies.async_utils.AsyncCaller method) sync_rng_states_across_tp_group() (core.distributed.fsdp.mcore_fsdp_adapter.FullyShardedDataParallel method) synchronize_gradient_reduce() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) synchronize_param_gather() (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) T T (in module core.dist_checkpointing.exchange_utils) (in module core.dist_checkpointing.strategies.fully_parallel) (in module core.transformer.identity_op) (in module core.typed_torch) t5_extended_attention_mask() (in module core.models.T5.t5_model) t5_position_ids() (in module core.models.T5.t5_model) T5InferenceWrapper (class in core.inference.model_inference_wrappers.t5.t5_inference_wrapper) T5LMHead (class in core.models.T5.t5_model) T5MaskedWordPieceDataset (class in core.datasets.t5_dataset) T5MaskedWordPieceDatasetConfig (class in core.datasets.t5_dataset) T5Model (class in core.models.T5.t5_model) T5Tokenizer (class in core.tokenizers.text.models.t5_tokenizer) task_id (core.resharding.copy_services.gloo_copy_service.RecvOp attribute) (core.resharding.copy_services.gloo_copy_service.SendOp attribute) (core.resharding.copy_services.nccl_copy_service.RecvOp attribute) (core.resharding.copy_services.nccl_copy_service.SendOp attribute) (core.resharding.nvshmem_copy_service.nvshmem_types.ReceiveRequest attribute) (core.resharding.nvshmem_copy_service.nvshmem_types.SendRequest attribute) (core.resharding.nvshmem_copy_service.validation.ValidationResult attribute) (core.resharding.utils.TransferOp attribute) task_ids (core.resharding.nvshmem_copy_service.nvshmem_types.WorkloadSummary attribute) task_sizes (core.resharding.nvshmem_copy_service.nvshmem_types.WorkloadSummary attribute) tasks (core.resharding.nvshmem_copy_service.nvshmem_types.ScheduledBatch attribute) (core.resharding.nvshmem_copy_service.nvshmem_types.WorkloadGroup attribute) tasks_summary (core.resharding.nvshmem_copy_service.nvshmem_types.ScheduledBatch attribute) TaskSegmenter (class in core.resharding.nvshmem_copy_service.planning.task_segmenter) te_checkpoint (in module core.transformer.transformer_block) te_checkpoint() (in module core.extensions.transformer_engine) TEColumnParallelLinear (class in core.extensions.transformer_engine) TECudaGraphHelper (class in core.transformer.cuda_graphs) TECudaRNGStatesTracker (class in core.extensions.transformer_engine) TEDelayedScaling (class in core.extensions.transformer_engine) TEDotProductAttention (class in core.extensions.transformer_engine) TEGroupedMLP (class in core.transformer.moe.experts) TELayerNormColumnParallelLinear (class in core.extensions.transformer_engine) TELinear (class in core.extensions.transformer_engine) temp_log_level() (in module core.inference.text_generation_server.dynamic_text_gen_server.flask_server) temperature (core.inference.sampling_params.SamplingParams attribute) TemporalAsyncCaller (class in core.dist_checkpointing.strategies.async_utils) TemporaryBucketAllocator (class in core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) TENorm (class in core.extensions.transformer_engine) tensor (core.resharding.copy_services.gloo_copy_service.RecvOp attribute) (core.resharding.copy_services.gloo_copy_service.SendOp attribute) (core.resharding.copy_services.nccl_copy_service.RecvOp attribute) (core.resharding.copy_services.nccl_copy_service.SendOp attribute) tensor_model_parallel_size (core.model_parallel_config.ModelParallelConfig attribute) tensor_need_offloading_checker() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) tensor_parallel_group_ranks (core.resharding.utils.ParameterMetadata attribute) tensor_pop() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) tensor_push() (core.pipeline_parallel.fine_grained_activation_offload.ChunkOffloadHandler method) tensor_reuse_pool (core.transformer.cuda_graphs._CudagraphGlobalRecord attribute) tensor_strong_refs (core.transformer.cuda_graphs.TensorReusePool attribute) tensor_strong_refs_dataptrs (core.transformer.cuda_graphs.TensorReusePool attribute) tensor_swap() (in module core.inference.utils) TensorItemIndex (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) TensorParallelMuon (class in core.optimizer.muon) TensorPointerExtractor (class in core.resharding.nvshmem_copy_service.memory.tensor_pointer_utils) TensorReusePool (class in core.transformer.cuda_graphs) tensors (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict property) TensorStateDeallocatedError tensorwise (core.enums.Fp8Recipe attribute) TEQuantizationParams (class in core.extensions.transformer_engine) (core.extensions.transformer_engine.TransformerEngineConfigType attribute) TEQuantizationRecipe (class in core.extensions.transformer_engine) termination_id (core.inference.sampling_params.SamplingParams attribute) TERowParallelLinear (class in core.extensions.transformer_engine) TESpecProvider (class in core.extensions.transformer_engine_spec_provider) test (core.datasets.utils.Split attribute) test_mode (core.transformer.transformer_config.TransformerConfig attribute) text_to_ids() (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract method) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer method) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) text_to_tokens() (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract method) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) TextGenerationController (class in core.inference.text_generation_controllers.text_generation_controller) tie_embeddings_and_output_weights_state_dict() (core.models.common.language_module.language_module.LanguageModule method) tie_output_layer_state_dict() (in module core.transformer.multi_token_prediction) tie_word_embeddings_state_dict() (in module core.transformer.multi_token_prediction) TikTokenTokenizer (class in core.tokenizers.text.libraries.tiktoken_tokenizer) Timer (class in core.timers) TimerBase (class in core.timers) Timers (class in core.timers) timers (core.model_parallel_config.ModelParallelConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) timestamp (core.inference.inference_request.DynamicInferenceEvent attribute) to_list() (core.transformer.cuda_graphs._CudaGraphRunner method) to_local_if_dtensor() (in module core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer) (in module core.utils) to_state_dict() (core.dist_checkpointing.tensor_aware_state_dict.MCoreTensorAwareStateDict method) toggle_cuda_graphs() (in module core.transformer.utils) token_combine() (core.transformer.moe.token_dispatcher.MoEAllGatherTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEFlexTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoETokenDispatcher method) token_count (core.inference.batch_dimensions_utils.InferenceBatchDimensions attribute) token_dispatch() (core.transformer.moe.token_dispatcher.MoEAllGatherTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEAlltoAllTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoEFlexTokenDispatcher method) (core.transformer.moe.token_dispatcher.MoETokenDispatcher method) token_dtype_code (core.datasets.gpt_dataset.GPTDatasetConfig attribute) TOKEN_ROUNDER (core.inference.contexts.dynamic_context.DynamicInferenceContext attribute) token_to_id() (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) tokenize() (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer method) (core.tokenizers.base_tokenizer.MegatronTokenizerBase method) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText method) tokenize_encoder_prompt() (core.inference.model_inference_wrappers.t5.t5_inference_wrapper.T5InferenceWrapper method) tokenize_prompt() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) tokenize_prompts() (in module core.inference.text_generation_server.dynamic_text_gen_server.tokenization) (in module core.inference.text_generation_server.tokenization) tokenizer (core.datasets.blended_megatron_dataset_config.BlendedMegatronDatasetConfig attribute) TOKENIZER_LIBRARIES (in module core.tokenizers.megatron_tokenizer) TOKENIZER_MAPPING_LIBRARIES (in module core.tokenizers.text.text_tokenizer) TOKENIZER_MAPPING_NAMES (in module core.tokenizers.megatron_tokenizer) TokenOverflowError tokens_to_ids() (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract method) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer method) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) tokens_to_text() (core.tokenizers.text.libraries.abstract_tokenizer.MegatronTokenizerTextAbstract method) (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer method) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer method) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer method) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer method) top_k (core.inference.sampling_params.SamplingParams attribute) top_n_logprobs (core.inference.sampling_params.SamplingParams attribute) top_p (core.inference.sampling_params.SamplingParams attribute) topk_routing_with_score_function() (in module core.transformer.moe.moe_utils) TopKRouter (class in core.transformer.moe.router) TopLevelDataset (in module core.datasets.blended_megatron_dataset_builder) torch_chunk_gated_delta_rule() (in module core.ssm.gated_delta_net) torch_home (in module core.tokenizers.text.libraries.megatron_hf_tokenizer) TorchCommonLoadStrategy (class in core.dist_checkpointing.strategies.common) TorchCommonSaveStrategy (class in core.dist_checkpointing.strategies.common) TorchDistLoadShardedStrategy (class in core.dist_checkpointing.strategies.torch) TorchDistSaveShardedStrategy (class in core.dist_checkpointing.strategies.torch) TorchFullyShardedDataParallel (class in core.distributed.torch_fully_sharded_data_parallel) TorchFullyShardedDataParallelConfig (class in core.distributed.torch_fully_sharded_data_parallel_config) total_bytes (core.resharding.nvshmem_copy_service.validation.ValidationSummary attribute) total_size (core.resharding.nvshmem_copy_service.nvshmem_types.ScheduledBatch attribute) (core.resharding.nvshmem_copy_service.nvshmem_types.TransferMetadata attribute) (core.resharding.nvshmem_copy_service.nvshmem_types.WorkloadGroup attribute) (core.resharding.nvshmem_copy_service.nvshmem_types.WorkloadSummary attribute) total_tasks (core.resharding.nvshmem_copy_service.validation.ValidationSummary attribute) tp (core.process_groups_config.ProcessGroupCollection attribute) tp_comm_atomic_ag (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_atomic_rs (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_bootstrap_backend (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_bulk_dgrad (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_bulk_wgrad (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_overlap (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_overlap_ag (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_overlap_disable_fc1 (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_overlap_disable_qkv (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_overlap_rs (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_overlap_rs_dgrad (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_split_ag (core.model_parallel_config.ModelParallelConfig attribute) tp_comm_split_rs (core.model_parallel_config.ModelParallelConfig attribute) tp_cp (core.process_groups_config.ProcessGroupCollection attribute) tp_dp_cp (core.process_groups_config.ProcessGroupCollection attribute) tp_ep (core.process_groups_config.ProcessGroupCollection attribute) tp_ep_pp (core.process_groups_config.ProcessGroupCollection attribute) tp_only_amax_red (core.extensions.transformer_engine.TEQuantizationRecipe attribute) (core.transformer.transformer_config.TransformerConfig attribute) tpot (core.inference.inference_request.InferenceRequest attribute) trace() (core.resharding.nvshmem_copy_service.logger.PELogger class method) trace_async_exceptions() (in module core.utils) track_moe_metrics() (in module core.transformer.moe.moe_utils) track_mtp_metrics() (core.transformer.multi_token_prediction.MTPLossLoggingHelper method) tracked_metadata (core.inference.inference_request.DynamicInferenceRequest property) tracker (core.transformer.multi_token_prediction.MTPLossLoggingHelper attribute) train (core.datasets.utils.Split attribute) training_recipe (core.extensions.transformer_engine.TEQuantizationParams attribute) TrainingState (class in core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp) TransferMetadata (class in core.resharding.nvshmem_copy_service.nvshmem_types) TransferOp (class in core.resharding.utils) transform_object() (core.dist_checkpointing.strategies.torch.MCoreSavePlanner method) transformer_impl (core.transformer.transformer_config.TransformerConfig attribute) transformer_layer (core.transformer.multi_token_prediction.MultiTokenPredictionLayerSubmodules attribute) TransformerBlock (class in core.transformer.transformer_block) TransformerBlockSubmodules (class in core.transformer.transformer_block) TransformerConfig (class in core.transformer.transformer_config) TransformerEngineConfigType (class in core.extensions.transformer_engine) TransformerLayer (class in core.transformer.transformer_layer) TransformerLayerNode (class in core.models.gpt.fine_grained_callables) TransformerLayerSchedulePlan (class in core.models.common.model_chunk_schedule_plan) TransformerLayerState (class in core.models.gpt.fine_grained_callables) TransformerLayerSubmodules (class in core.transformer.transformer_layer) TransformerModelChunkSchedulePlan (class in core.models.common.model_chunk_schedule_plan) TRANSIENT_ERROR (core.rerun_state_machine.RerunDiagnostic attribute) transpose_weight_buffer (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParameterGroup attribute) tree_map() (in module core.transformer.cuda_graphs) triton_append_key_value_cache() (in module core.inference.contexts.fused_kv_append_kernel) TRT_MODEL_CONFIG (in module core.export.trtllm.trt_model_config) TRT_MODEL_TYPE_STRING (in module core.export.trtllm.trt_model_type) TRTLLMEngineBuilder (class in core.export.trtllm.engine_builder.trtllm_engine_builder) TRTLLMHelper (class in core.export.trtllm.trtllm_helper) TRTLLMLayers (class in core.export.trtllm.trtllm_layers) type (core.inference.inference_request.DynamicInferenceEvent attribute) U uint16 (core.datasets.indexed_dataset.DType attribute) uint8 (core.datasets.indexed_dataset.DType attribute) UNATTEMPTED (core.inference.unified_memory.CompilationState attribute) uncompress_kv_from_cache() (core.transformer.multi_latent_attention.MLASelfAttention method) unfused (core.transformer.enums.AttnBackend attribute) UnifiedMemoryCompileTimeoutError UnifiedMemoryUnsupportedError unique_identifiers (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) unique_key (core.dist_checkpointing.mapping.ShardedObject property) unk (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) unk_id (core.tokenizers.text.libraries.bytelevel_tokenizer.ByteLevelTokenizer property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) UnknownHeaderError unpack_batch() (core.datasets.data_schedule.HybridCPDataLoaderWrapper method) unpad_input_prompt_tokens() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) UNPAUSE (core.inference.headers.Headers attribute) unpause_engines() (core.inference.inference_client.InferenceClient method) unpermute() (in module core.transformer.moe.moe_utils) unset_num_microbatches_calculator() (in module core.num_microbatches_calculator) unwrap() (core.dist_checkpointing.mapping.LocalNonpersistentObject method) (core.utils.WrappedTensor method) unwrap_model() (in module core.utils) upcycle_state_dict() (in module core.transformer.moe.upcycling_utils) update() (core.num_microbatches_calculator.ConstantNumMicroBatchesCalculator method) (core.num_microbatches_calculator.NumMicroBatchesCalculator method) (core.num_microbatches_calculator.RampupBatchsizeNumMicroBatchesCalculator method) (core.optimizer.grad_scaler.ConstantGradScaler method) (core.optimizer.grad_scaler.DynamicGradScaler method) (core.optimizer.grad_scaler.MegatronGradScaler method) update_fp32_param_by_new_param() (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) update_generation_status() (core.inference.text_generation_controllers.text_generation_controller.TextGenerationController method) update_main_grads() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) update_num_microbatches() (in module core.num_microbatches_calculator) update_offload_info() (core.pipeline_parallel.fine_grained_activation_offload.OffloadTensorGroup method) update_pg_timeout() (in module core.parallel_state) update_requests() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) update_requests_pools() (core.inference.scheduler.Scheduler method) update_uneven_dtensor_chunk_metadata() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) upload_file() (core.datasets.object_storage_utils.S3Client method) use_cpu_initialization (core.model_parallel_config.ModelParallelConfig attribute) use_custom_fsdp (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) use_distributed_optimizer (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.optimizer.optimizer_config.OptimizerConfig attribute) use_embedding_sharing (core.export.export_config.ExportConfig attribute) use_fused_weighted_squared_relu (core.transformer.transformer_config.TransformerConfig attribute) use_inference_optimized_layers (core.transformer.transformer_config.TransformerConfig attribute) use_kitchen (core.transformer.transformer_config.TransformerConfig attribute) use_kitchen_attention (core.transformer.transformer_config.TransformerConfig attribute) use_mamba_mem_eff_path (core.transformer.transformer_config.TransformerConfig attribute) use_megatron_fsdp (core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig attribute) (core.distributed.fsdp.src.megatron_fsdp.distributed_data_parallel_config.DistributedDataParallelConfig attribute) use_mpu_process_groups() (core.process_groups_config.ProcessGroupCollection class method) use_parallel_embedding (core.export.export_config.ExportConfig attribute) use_precision_aware_optimizer (core.optimizer.optimizer_config.OptimizerConfig attribute) use_ring_exchange_p2p (core.model_parallel_config.ModelParallelConfig attribute) use_te_activation_func (core.transformer.transformer_config.TransformerConfig attribute) use_te_rng_tracker (core.model_parallel_config.ModelParallelConfig attribute) (core.transformer.transformer_config.TransformerConfig attribute) use_torch_optimizer_for_cpu_offload (core.optimizer.optimizer_config.OptimizerConfig attribute) USING_APEX_OPTIMIZER (in module core.optimizer.distrib_optimizer) using_cuda_graph_this_step() (core.inference.contexts.dynamic_context.DynamicInferenceContext method) USING_TE_OPTIMIZER (in module core.optimizer.distrib_optimizer) V v_head_dim (core.transformer.transformer_config.MLATransformerConfig attribute) valid (core.datasets.utils.Split attribute) VALID (core.ssm.mamba_hybrid_layer_allocation.Symbols attribute) validate_checkpoint_id() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync class method) validate_integrity_and_strict_load() (in module core.dist_checkpointing.validation) validate_layer_layout() (core.transformer.pipeline_parallel_layer_layout.PipelineParallelLayerLayout method) validate_loaded_state_dict() (in module core.transformer.fsdp_dtensor_checkpoint) validate_metadata_integrity() (core.dist_checkpointing.mapping.ShardedBase method) (core.dist_checkpointing.mapping.ShardedObject method) (core.dist_checkpointing.mapping.ShardedTensor method) (core.dist_checkpointing.mapping.ShardedTensorFactory method) validate_received_data() (in module core.resharding.nvshmem_copy_service.validation) validate_result() (core.rerun_state_machine.RerunStateMachine method) VALIDATE_RESULTS (core.rerun_state_machine.RerunMode attribute) validate_sharded_objects_handling() (in module core.dist_checkpointing.validation) validate_sharding_integrity() (in module core.dist_checkpointing.validation) validate_state_dict() (core.rerun_state_machine.RerunStateMachine method) validate_uneven_dtensor() (in module core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor) ValidationResult (class in core.resharding.nvshmem_copy_service.validation) ValidationSummary (class in core.resharding.nvshmem_copy_service.validation) variable_seq_lengths (core.model_parallel_config.ModelParallelConfig attribute) verbose (core.post_training.modelopt.layers.RealQuantTransformerLayer attribute) verify_checkpoint_and_load_strategy() (in module core.dist_checkpointing.validation) verify_global_md_reuse() (in module core.dist_checkpointing.strategies.state_dict_saver) VERSION (in module core.distributed.fsdp.src.megatron_fsdp.package_info) (in module core.package_info) VIDEO_TOKEN (in module core.models.multimodal.llava_model) virtual_pipeline_model_parallel_size (core.model_parallel_config.ModelParallelConfig attribute) VisionModule (class in core.models.common.vision_module.vision_module) VLMInferenceRequest (class in core.inference.inference_request) VLMTextGenerationController (class in core.inference.text_generation_controllers.vlm_text_generation_controller) vocab (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.libraries.sentencepiece_tokenizer.SentencePieceTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) vocab() (core.tokenizers.base_tokenizer.MegatronTokenizerBase method) vocab_embedding (core.export.trtllm.trtllm_layers.TRTLLMLayers attribute) vocab_file (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) vocab_parallel_cross_entropy() (in module core.tensor_parallel.cross_entropy) vocab_range_from_global_vocab_size() (core.tensor_parallel.utils.VocabUtility static method) vocab_range_from_per_partition_vocab_size() (core.tensor_parallel.utils.VocabUtility static method) vocab_size (core.datasets.megatron_tokenizer.MegatronLegacyTokenizer property) (core.tokenizers.text.libraries.huggingface_tokenizer.HuggingFaceTokenizer property) (core.tokenizers.text.libraries.null_tokenizer.NullTokenizer property) (core.tokenizers.text.libraries.tiktoken_tokenizer.TikTokenTokenizer property) (core.tokenizers.text.text_tokenizer.MegatronTokenizerText property) vocab_size() (core.tokenizers.base_tokenizer.MegatronTokenizerBase method) VocabParallelCrossEntropy (class in core.tensor_parallel.cross_entropy) VocabParallelEmbedding (class in core.tensor_parallel.layers) VocabUtility (class in core.tensor_parallel.utils) W wait() (core.distributed.reduce_scatter_with_fp32_accumulation._ReduceScatterWithFP32AccumulationWorkHandle method) wait_bucket_ready() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.AllGatherPipeline method) wait_current_stream() (core.models.common.model_chunk_schedule_plan.TransformerModelChunkSchedulePlan method) wait_for_previous_grad_reduce() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.GradReducePipeline method) wait_offload_event() (core.pipeline_parallel.fine_grained_activation_offload.OffloadTensorGroup method) wait_reload_event() (core.pipeline_parallel.fine_grained_activation_offload.OffloadTensorGroup method) WAITING_IN_QUEUE (core.inference.inference_request.Status attribute) warn() (core.resharding.nvshmem_copy_service.logger.PELogger class method) warning() (core.resharding.nvshmem_copy_service.logger.PELogger class method) wd_mult (core.optimizer_param_scheduler.ParamGroupOverride attribute) weak_method() (in module core.models.gpt.fine_grained_callables) weight_decay (core.optimizer.optimizer_config.OptimizerConfig attribute) weighted_bias_quick_geglu() (in module core.fusions.fused_bias_geglu) weighted_bias_quick_geglu_back() (in module core.fusions.fused_bias_geglu) weighted_bias_quick_geglu_impl() (in module core.fusions.fused_bias_geglu) weighted_bias_swiglu_impl() (in module core.fusions.fused_bias_swiglu) weighted_quick_geglu() (in module core.fusions.fused_bias_geglu) weighted_quick_geglu_back() (in module core.fusions.fused_bias_geglu) weighted_squared_relu() (in module core.fusions.fused_weighted_squared_relu) weighted_squared_relu_back() (in module core.fusions.fused_weighted_squared_relu) weighted_squared_relu_impl() (in module core.fusions.fused_weighted_squared_relu) weighted_swiglu() (in module core.fusions.fused_bias_swiglu) weighted_swiglu_back() (in module core.fusions.fused_bias_swiglu) WeightedBiasQuickGeGLUFunction (class in core.fusions.fused_bias_geglu) WeightedQuickGeGLUFunction (class in core.fusions.fused_bias_geglu) WeightedSquaredReLUFunction (class in core.fusions.fused_weighted_squared_relu) WeightedSwiGLUFunction (class in core.fusions.fused_bias_swiglu) wgrad_deferral_limit (core.model_parallel_config.ModelParallelConfig attribute) will_execute_quantized() (core.extensions.transformer_engine.TELayerNormColumnParallelLinear method) (core.extensions.transformer_engine.TELinear method) WILL_RERUN_FROM_CHECKPOINT (core.rerun_state_machine.RerunState attribute) window_attn_skip_freq (core.transformer.transformer_config.TransformerConfig attribute) window_size (core.transformer.transformer_config.TransformerConfig attribute) with_name_predicate (core.optimizer.optimizer_config.ParamKey attribute) without_data() (core.dist_checkpointing.mapping.ShardedBase method) (core.dist_checkpointing.mapping.ShardedObject method) (core.dist_checkpointing.mapping.ShardedTensor method) (core.dist_checkpointing.mapping.ShardedTensorFactory method) WorkloadGroup (class in core.resharding.nvshmem_copy_service.nvshmem_types) WorkloadPacker (class in core.resharding.nvshmem_copy_service.planning.workload_packer) WorkloadSummary (class in core.resharding.nvshmem_copy_service.nvshmem_types) world_size (core.utils.StragglerDetector property) WrappedTensor (class in core.utils) WrappedTorchLayerNorm (in module core.transformer.torch_layer_norm) WrappedTorchNorm (class in core.transformer.torch_norm) write() (core.datasets.indexed_dataset._IndexWriter method) (core.timers.Timers method) write_data() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync method) write_metadata() (core.tokenizers.megatron_tokenizer.MegatronTokenizer method) write_preloaded_data() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync static method) write_preloaded_data_multiproc() (core.dist_checkpointing.strategies.filesystem_async.FileSystemWriterAsync static method) WriteBucket (in module core.dist_checkpointing.strategies.filesystem_async) Y YarnRotaryEmbedding (class in core.models.common.embeddings.yarn_rotary_pos_embedding) Z z_loss_func() (in module core.transformer.moe.moe_utils) zero_grad() (core.distributed.fsdp.src.megatron_fsdp.param_and_grad_buffer.ParamAndGradBuffer method) (core.optimizer.cpu_offloading.hybrid_optimizer.HybridDeviceOptimizer method) (core.optimizer.distrib_optimizer.DistributedOptimizer method) (core.optimizer.optimizer.ChainedOptimizer method) (core.optimizer.optimizer.Float16OptimizerWithFloat16Params method) (core.optimizer.optimizer.FP32Optimizer method) (core.optimizer.optimizer.MegatronOptimizer method) zero_grad_buffer() (core.distributed.data_parallel_base._BaseDataParallel method) (core.distributed.distributed_data_parallel.DistributedDataParallel method) (core.distributed.fsdp.src.megatron_fsdp.megatron_fsdp.MegatronFSDP method) zero_parameters() (core.models.common.embeddings.language_model_embedding.LanguageModelEmbedding method) zeros_like() (core.transformer.cuda_graphs.ArgMetadata method) zip_strict() (in module core.dist_checkpointing.utils)