nemo_automodel.components.models.baichuan.model

View as Markdown

Native Baichuan2 model implementation for NeMo Automodel.

Adapted from the Baichuan2 remote-code model on HuggingFace with the following changes:

  • Removed xformers / quantization / chat / streaming dependencies.
  • Added `**kwargs` to the forward signatures so that extra batch keys (`padding_mask`, `loss_mask`, …) pass through without error.
  • Uses HFCheckpointingMixin for unified checkpointing.
  • Uses torch.nn.functional.scaled_dot_product_attention only.

Example (YAML)::

    model:
      _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
      pretrained_model_name_or_path: baichuan-inc/Baichuan2-7B-Chat

Module Contents

Classes

Functions

Data

ModelClass

logger

API

class nemo_automodel.components.models.baichuan.model.Attention(
config: nemo_automodel.components.models.baichuan.configuration.BaichuanConfig
)

Bases: Module

W_pack
head_dim
= self.hidden_size // self.num_heads
hidden_size
= config.hidden_size
max_position_embeddings
= config.max_position_embeddings
num_heads
= config.num_attention_heads
o_proj
rotary_emb
nemo_automodel.components.models.baichuan.model.Attention.forward(
hidden_states: torch.Tensor,
attention_mask: typing.Optional[torch.Tensor] = None,
position_ids: typing.Optional[torch.LongTensor] = None,
past_key_value: typing.Optional[typing.Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False
) -> typing.Tuple[torch.Tensor, typing.Optional[torch.Tensor], typing.Optional[typing.Tuple[torch.Tensor]]]
class nemo_automodel.components.models.baichuan.model.BaichuanForCausalLM(
config: nemo_automodel.components.models.baichuan.configuration.BaichuanConfig,
model_kwargs = {}
)

Bases: HFCheckpointingMixin, BaichuanPreTrainedModel, GenerationMixin

_tied_weights_keys
= {'lm_head.weight': 'model.embed_tokens.weight'}
lm_head
model
= BaichuanModel(config)
nemo_automodel.components.models.baichuan.model.BaichuanForCausalLM._reorder_cache(
past_key_values,
beam_idx
)
staticmethod
nemo_automodel.components.models.baichuan.model.BaichuanForCausalLM.forward(
input_ids: torch.LongTensor = None,
attention_mask: typing.Optional[torch.Tensor] = None,
position_ids: typing.Optional[torch.LongTensor] = None,
past_key_values: typing.Optional[typing.List[torch.FloatTensor]] = None,
inputs_embeds: typing.Optional[torch.FloatTensor] = None,
labels: typing.Optional[torch.LongTensor] = None,
use_cache: typing.Optional[bool] = None,
output_attentions: typing.Optional[bool] = None,
output_hidden_states: typing.Optional[bool] = None,
return_dict: typing.Optional[bool] = None,
logits_to_keep: typing.Union[int, torch.Tensor] = 0,
**kwargs
) -> typing.Union[typing.Tuple, transformers.modeling_outputs.CausalLMOutputWithPast]
nemo_automodel.components.models.baichuan.model.BaichuanForCausalLM.get_decoder()
nemo_automodel.components.models.baichuan.model.BaichuanForCausalLM.get_input_embeddings()
nemo_automodel.components.models.baichuan.model.BaichuanForCausalLM.get_output_embeddings()
nemo_automodel.components.models.baichuan.model.BaichuanForCausalLM.prepare_inputs_for_generation(
input_ids,
past_key_values = None,
attention_mask = None,
inputs_embeds = None,
kwargs = {}
)
nemo_automodel.components.models.baichuan.model.BaichuanForCausalLM.set_decoder(
decoder
)
nemo_automodel.components.models.baichuan.model.BaichuanForCausalLM.set_input_embeddings(
value
)
nemo_automodel.components.models.baichuan.model.BaichuanForCausalLM.set_output_embeddings(
new_embeddings
)
class nemo_automodel.components.models.baichuan.model.BaichuanModel(
config: nemo_automodel.components.models.baichuan.configuration.BaichuanConfig
)

Bases: BaichuanPreTrainedModel

embed_tokens
layers
norm
padding_idx
= config.pad_token_id
vocab_size
= config.vocab_size
nemo_automodel.components.models.baichuan.model.BaichuanModel._prepare_decoder_attention_mask(
attention_mask,
input_shape,
inputs_embeds,
past_key_values_length
)
nemo_automodel.components.models.baichuan.model.BaichuanModel.forward(
input_ids: torch.LongTensor = None,
attention_mask: typing.Optional[torch.Tensor] = None,
position_ids: typing.Optional[torch.LongTensor] = None,
past_key_values: typing.Optional[typing.List[torch.FloatTensor]] = None,
inputs_embeds: typing.Optional[torch.FloatTensor] = None,
use_cache: typing.Optional[bool] = None,
output_attentions: typing.Optional[bool] = None,
output_hidden_states: typing.Optional[bool] = None,
return_dict: typing.Optional[bool] = None,
**kwargs
) -> typing.Union[typing.Tuple, transformers.modeling_outputs.BaseModelOutputWithPast]
nemo_automodel.components.models.baichuan.model.BaichuanModel.get_input_embeddings()
nemo_automodel.components.models.baichuan.model.BaichuanModel.set_input_embeddings(
value
)
class nemo_automodel.components.models.baichuan.model.BaichuanPreTrainedModel()

Bases: PreTrainedModel

_no_split_modules
= ['DecoderLayer']
base_model_prefix
= 'model'
nemo_automodel.components.models.baichuan.model.BaichuanPreTrainedModel._init_weights(
module
)
nemo_automodel.components.models.baichuan.model.BaichuanPreTrainedModel._set_gradient_checkpointing(
module,
value = False
)
class nemo_automodel.components.models.baichuan.model.DecoderLayer(
config: nemo_automodel.components.models.baichuan.configuration.BaichuanConfig
)

Bases: Module

hidden_size
= config.hidden_size
input_layernorm
mlp
post_attention_layernorm
self_attn
= Attention(config=config)
nemo_automodel.components.models.baichuan.model.DecoderLayer.forward(
hidden_states: torch.Tensor,
attention_mask: typing.Optional[torch.Tensor] = None,
position_ids: typing.Optional[torch.LongTensor] = None,
past_key_value: typing.Optional[typing.Tuple[torch.Tensor]] = None,
output_attentions: typing.Optional[bool] = False,
use_cache: typing.Optional[bool] = False
) -> typing.Tuple[torch.FloatTensor, ...]
class nemo_automodel.components.models.baichuan.model.MLP(
hidden_size,
intermediate_size,
hidden_act
)

Bases: Module

act_fn
= ACT2FN[hidden_act]
down_proj
gate_proj
up_proj
nemo_automodel.components.models.baichuan.model.MLP.forward(
x
)
class nemo_automodel.components.models.baichuan.model.NormHead(
hidden_size,
vocab_size,
bias = False
)

Bases: Module

weight
nemo_automodel.components.models.baichuan.model.NormHead.forward(
hidden_states
)
class nemo_automodel.components.models.baichuan.model.RMSNorm(
hidden_size,
eps = 1e-06
)

Bases: Module

weight
= nn.Parameter(torch.ones(hidden_size))
nemo_automodel.components.models.baichuan.model.RMSNorm.forward(
hidden_states
)
class nemo_automodel.components.models.baichuan.model.RotaryEmbedding(
dim,
max_position_embeddings = 2048,
base = 10000,
device = None
)

Bases: Module

cos_cached
= emb.cos()[None, None, :, :].to(torch.float32)
inv_freq
sin_cached
= emb.sin()[None, None, :, :].to(torch.float32)
nemo_automodel.components.models.baichuan.model.RotaryEmbedding.forward(
x,
seq_len = None
)
nemo_automodel.components.models.baichuan.model._apply_rotary_pos_emb(
q,
k,
cos_,
sin_,
position_ids
)
nemo_automodel.components.models.baichuan.model._expand_mask(
mask,
dtype,
tgt_len = None
)
nemo_automodel.components.models.baichuan.model._make_causal_mask(
input_ids_shape,
dtype,
device,
past_key_values_length = 0
)
nemo_automodel.components.models.baichuan.model._rotate_half(
x
)
nemo_automodel.components.models.baichuan.model.ModelClass = BaichuanForCausalLM
nemo_automodel.components.models.baichuan.model.logger = logging.get_logger(__name__)