Megatron-LM
Table of Contents
User Guide
API Guide
core
core.dist_checkpointing
core.dist_checkpointing.strategies
core.dist_checkpointing.optimizer
core.dist_checkpointing.mapping
core.dist_checkpointing.dict_utils
core.dist_checkpointing.serialization
core.dist_checkpointing.validation
core.dist_checkpointing.core
core.dist_checkpointing.state_dict_utils
core.dist_checkpointing.tensor_aware_state_dict
core.dist_checkpointing.exchange_utils
core.dist_checkpointing.utils
core.export
core.export.trtllm
core.export.data_type
core.export.export_config
core.export.model_type
core.extensions
core.extensions.kitchen
core.extensions.transformer_engine
core.extensions.transformer_engine_spec_provider
core.tokenizers
core.tokenizers.text
core.tokenizers.base_tokenizer
core.tokenizers.megatron_tokenizer
core.models
core.models.retro
core.models.vision
core.models.gpt
core.models.T5
core.models.multimodal
core.models.mamba
core.models.common
core.models.huggingface
core.models.mimo
core.models.bert
core.models.backends
core.tensor_parallel
core.tensor_parallel.data
core.tensor_parallel.layers
core.tensor_parallel.inference_layers
core.tensor_parallel.mappings
core.tensor_parallel.random
core.tensor_parallel.cross_entropy
core.tensor_parallel.utils
core.distributed
core.distributed.fsdp
core.distributed.param_and_grad_buffer
core.distributed.torch_fully_sharded_data_parallel
core.distributed.torch_fully_sharded_data_parallel_config
core.distributed.finalize_model_grads
core.distributed.distributed_data_parallel_config
core.distributed.reduce_scatter_with_fp32_accumulation
core.distributed.data_parallel_base
core.distributed.distributed_data_parallel
core.datasets
core.datasets.retro
core.datasets.blended_megatron_dataset_builder
core.datasets.multimodal_dataset
core.datasets.t5_dataset
core.datasets.gpt_dataset
core.datasets.helpers
core.datasets.object_storage_utils
core.datasets.bert_dataset
core.datasets.utils_s3
core.datasets.megatron_tokenizer
core.datasets.megatron_dataset
core.datasets.blended_dataset
core.datasets.masked_dataset
core.datasets.indexed_dataset
core.datasets.utils
core.datasets.blended_megatron_dataset_config
core.quantization
core.quantization.quant_config
core.quantization.utils
core.ssm
core.ssm.mamba_block
core.ssm.mamba_context_parallel
core.ssm.triton_cache_manager
core.ssm.mamba_mixer
core.ssm.mlp_layer
core.ssm.mamba_layer
core.ssm.mamba_hybrid_layer_allocation
core.transformer
core.transformer.moe
core.transformer.custom_layers
core.transformer.torch_layer_norm
core.transformer.transformer_layer
core.transformer.multi_token_prediction
core.transformer.module
core.transformer.mlp
core.transformer.transformer_block
core.transformer.identity_op
core.transformer.multi_latent_attention
core.transformer.pipeline_parallel_layer_layout
core.transformer.attention
core.transformer.fsdp_dtensor_checkpoint
core.transformer.enums
core.transformer.spec_utils
core.transformer.transformer_config
core.transformer.cuda_graphs
core.transformer.torch_norm
core.transformer.dot_product_attention
core.transformer.utils
core.inference
core.inference.text_generation_controllers
core.inference.model_inference_wrappers
core.inference.contexts
core.inference.text_generation_server
core.inference.engines
core.inference.batch_dimensions_utils
core.inference.inference_client
core.inference.communication_utils
core.inference.data_parallel_inference_coordinator
core.inference.unified_memory
core.inference.common_inference_params
core.inference.inference_request
core.inference.headers
core.inference.scheduler
core.inference.async_stream
core.inference.sampling_params
core.inference.utils
core.optimizer
core.optimizer.cpu_offloading
core.optimizer.clip_grads
core.optimizer.optimizer
core.optimizer.grad_scaler
core.optimizer.qk_clip
core.optimizer.optimizer_config
core.optimizer.distrib_optimizer
core.pipeline_parallel
core.pipeline_parallel.bridge_communicator
core.pipeline_parallel.schedules
core.pipeline_parallel.p2p_communication
core.pipeline_parallel.combined_1f1b
core.pipeline_parallel.utils
core.post_training
core.post_training.modelopt
core.fusions
core.fusions.fused_weighted_squared_relu
core.fusions.fused_softmax
core.fusions.fused_pad_routing_map
core.fusions.fused_cross_entropy
core.fusions.fused_bias_dropout
core.fusions.fused_bias_geglu
core.fusions.fused_indices_converter
core.fusions.fused_layer_norm
core.fusions.fused_bias_swiglu
core.fusions.fused_bias_gelu
core.fusions.fused_mla_yarn_rope_apply
core.packed_seq_params
core.rerun_state_machine
core.config_logger
core.energy_monitor
core.activations
core.safe_globals
core.fp4_utils
core.num_microbatches_calculator
core.full_cuda_graph
core.parallel_state
core.inference_params
core.process_groups_config
core.jit
core.msc_utils
core.nccl_allocator
core.timers
core.package_info
core.config
core.model_parallel_config
core.enums
core.optimizer_param_scheduler
core.hyper_comm_grid
core.fp8_utils
core.utils
core.optimizer.cpu_offloading.hybrid_optimizer