bridge.models.t5_provider#

Module Contents#

Classes#

T5ModelProvider

Model config for the T5 model. Adapted from megatron.core.models.t5.t5_model.T5Model.

Functions#

transformer_engine_layer_spec

Spec for T5 when using the transformer_engine mcore implementation.

local_layer_spec

Spec for T5 when using the local mcore implementation.

Data#

API#

bridge.models.t5_provider.logger#

'getLogger(…)'

bridge.models.t5_provider.transformer_engine_layer_spec(
encoder_config: T5ModelProvider,
decoder_config: T5ModelProvider,
) → megatron.core.transformer.spec_utils.ModuleSpec#

Spec for T5 when using the transformer_engine mcore implementation.

bridge.models.t5_provider.local_layer_spec(
encoder_config: T5ModelProvider,
decoder_config: T5ModelProvider,
) → megatron.core.transformer.spec_utils.ModuleSpec#

Spec for T5 when using the local mcore implementation.
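
Both spec helpers take the encoder and decoder provider configs and return a ModuleSpec. A minimal sketch, assuming the import path shown above and that T5ModelProvider accepts the usual TransformerConfig constructor arguments (the hyperparameter values below are illustrative, not defaults):

```python
from bridge.models.t5_provider import (
    T5ModelProvider,
    local_layer_spec,
    transformer_engine_layer_spec,
)

# Illustrative config; T5ModelProvider extends TransformerConfig, so core
# hyperparameters are passed at construction time.
cfg = T5ModelProvider(
    num_layers=12,
    hidden_size=768,
    num_attention_heads=12,
    encoder_num_layers=12,
)

# Transformer Engine-backed layer spec (when Transformer Engine is installed).
te_spec = transformer_engine_layer_spec(encoder_config=cfg, decoder_config=cfg)

# Pure-mcore ("local") layer spec as a fallback without Transformer Engine.
loc_spec = local_layer_spec(encoder_config=cfg, decoder_config=cfg)
```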

class bridge.models.t5_provider.T5ModelProvider#

Bases: megatron.core.transformer.transformer_config.TransformerConfig, megatron.bridge.models.model_provider.ModelProviderMixin[megatron.core.models.T5.t5_model.T5Model]

Model config for the T5 model. Adapted from megatron.core.models.t5.t5_model.T5Model.

encoder_num_layers: int | None#

None

fp16_lm_cross_entropy: bool#

False

parallel_output: bool#

True

share_embeddings_and_output_weights: bool#

True

make_vocab_size_divisible_by: int#

128

position_embedding_type: Literal['learned_absolute', 'rope']#

'learned_absolute'

apply_rope_fusion: bool#

True

max_position_embeddings: int#

512

relative_attention_num_buckets: int#

32

relative_attention_max_distance: int#

128

rotary_percent: float#

1.0

seq_len_interpolation_factor: Optional[float]#

None

seq_length: int#

512

seq_length_dec: int#

128

encoder_pipeline_model_parallel_size: int#

0

attention_softmax_in_fp32: bool#

False

bias_activation_fusion: bool#

True

masked_softmax_fusion: bool#

True

persist_layer_norm: bool#

True

bias_dropout_fusion: bool#

True

deallocate_pipeline_outputs: bool#

True

num_moe_experts: Optional[int]#

None

recompute_num_layers: int#

1

distribute_saved_activations: bool#

False

enable_autocast: bool#

False

transformer_layer_spec: Union[megatron.core.transformer.spec_utils.ModuleSpec, Callable[[bridge.models.t5_provider.T5ModelProvider], megatron.core.transformer.spec_utils.ModuleSpec]]#

None

vocab_size: Optional[int]#

None

tp_comm_overlap_cfg: Optional[Union[str, dict[str, Any]]]#

None

provide(
pre_process=None,
post_process=None,
vp_stage=None,
tokenizer=None,
) → megatron.core.models.T5.t5_model.T5Model#

Set up the T5 model based on the config definition.
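
A minimal usage sketch, assuming Megatron's model-parallel state has already been initialized and that vocab_size is supplied directly rather than derived from a tokenizer (values below are illustrative):

```python
from bridge.models.t5_provider import T5ModelProvider

provider = T5ModelProvider(
    num_layers=12,
    hidden_size=768,
    num_attention_heads=12,
    encoder_num_layers=12,
    seq_length=512,
    seq_length_dec=128,
    vocab_size=32128,  # illustrative; alternatively pass a tokenizer to provide()
)

# Build the model for a single pipeline stage; pre_process/post_process control
# whether the embedding and output layers are materialized on this stage.
model = provider.provide(pre_process=True, post_process=True)
```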