bridge.models.glm_vl.glm_45v_provider#

Module Contents#

Classes#

GLM45VModelProvider

Base model provider for GLM 4.5 Vision-Language (VL) models. Combines the GLM 4.5 Air 106B language-model configuration with VL-specific settings.

API#

class bridge.models.glm_vl.glm_45v_provider.GLM45VModelProvider#

Bases: megatron.bridge.models.gpt_provider.GPTModelProvider

Base model provider for GLM 4.5 Vision-Language (VL) models. Combines the GLM 4.5 Air 106B language-model configuration with VL-specific settings.

transformer_layer_spec: Union[megatron.core.transformer.ModuleSpec, Callable[[megatron.bridge.models.gpt_provider.GPTModelProvider], megatron.core.transformer.ModuleSpec]]#

‘partial(…)’

normalization: str#

‘RMSNorm’

activation_func: Callable#

None

gated_linear_unit: bool#

True

add_bias_linear: bool#

False

add_qkv_bias: bool#

True

seq_length: int#

131072

init_method_std: float#

0.02

hidden_dropout: float#

0.0

vocab_size: int#

151552

share_embeddings_and_output_weights: Optional[bool]#

False

layernorm_epsilon: float#

1e-05

autocast_dtype: torch.dtype#

None

params_dtype: torch.dtype#

None

bf16: bool#

True

num_query_groups: int#

8

num_attention_heads: int#

96

attention_dropout: float#

0.0

kv_channels: int#

128

position_embedding_type: str#

‘mrope’

rotary_base: float#

1000000.0

rotary_percent: float#

0.5

mrope_section: List[int]#

‘field(…)’

moe_router_topk: int#

8

moe_shared_expert_overlap: bool#

True

moe_token_dispatcher_type: str#

‘alltoall’

moe_router_load_balancing_type: str#

‘seq_aux_loss’

moe_aux_loss_coeff: float#

0.001

moe_router_pre_softmax: bool#

False

moe_grouped_gemm: bool#

True

moe_router_score_function: str#

‘sigmoid’

moe_permute_fusion: bool#

True

moe_router_dtype: str#

‘fp32’

moe_router_enable_expert_bias: bool#

True

moe_router_bias_update_rate: float#

0

persist_layer_norm: bool#

True

bias_activation_fusion: bool#

True

bias_dropout_fusion: bool#

True

mtp_num_layers: Optional[int]#

1

mtp_loss_scaling_factor: Optional[float]#

0.3

num_layers: int#

46

num_moe_experts: int#

128

hidden_size: int#

4096

ffn_hidden_size: int#

10944

moe_layer_freq: Union[int, List[int]]#

‘field(…)’

moe_ffn_hidden_size: int#

1408

moe_shared_expert_intermediate_size: int#

1408

qk_layernorm: bool#

False

moe_router_topk_scaling_factor: float#

1.0

scatter_embedding_sequence_parallel: bool#

False

vision_config: transformers.models.glm4v.configuration_glm4v.Glm4vVisionConfig#

‘field(…)’

return_dict: bool#

True

eos_token_id: int#

151329

image_start_token_id: int#

151339

image_end_token_id: int#

151340

video_start_token_id: int#

151341

video_end_token_id: int#

151342

image_token_id: int#

151363

video_token_id: int#

151364

freeze_language_model: bool#

False

freeze_vision_model: bool#

False

freeze_vision_projection: bool#

False

provide(
pre_process=None,
post_process=None,
vp_stage=None,
) bridge.models.glm_vl.modeling_glm_45v.GLM45VModel#
provide_language_model(
pre_process=None,
post_process=None,
vp_stage=None,
) megatron.core.models.gpt.GPTModel#