nemo_automodel.components.models.baichuan.model#

Native Baichuan2 model implementation for NeMo Automodel.

Adapted from the Baichuan2 remote-code model on HuggingFace with the following changes:

  • Removed xformers / quantization / chat / streaming dependencies.

  • Added **kwargs to forward signatures so that extra batch keys (padding_mask, loss_mask, …) pass through without error.

  • Uses HFCheckpointingMixin for unified checkpointing.

  • Uses torch.nn.functional.scaled_dot_product_attention only.

Example (YAML)::

model:
  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
  pretrained_model_name_or_path: baichuan-inc/Baichuan2-7B-Chat
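
Equivalent Python usage, calling the same entry point the YAML targets::

    from nemo_automodel import NeMoAutoModelForCausalLM

    model = NeMoAutoModelForCausalLM.from_pretrained(
        "baichuan-inc/Baichuan2-7B-Chat",
    )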

Module Contents#

Classes#

RMSNorm, RotaryEmbedding, MLP, Attention, DecoderLayer, NormHead, BaichuanPreTrainedModel, BaichuanModel, BaichuanForCausalLM

Functions#

_rotate_half, _apply_rotary_pos_emb, _make_causal_mask, _expand_mask

Data#

logger, ModelClass

API#

nemo_automodel.components.models.baichuan.model.logger#

'get_logger(...)'

class nemo_automodel.components.models.baichuan.model.RMSNorm(hidden_size, eps=1e-06)#

Bases: torch.nn.Module

Initialization

forward(hidden_states)#
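
A minimal sketch of the computation, assuming the standard RMS layer normalization this class name implies (the free-function form and names are illustrative, not the module's actual attributes)::

    import torch

    def rms_norm(hidden_states: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
        # Scale by the reciprocal root-mean-square over the hidden dimension.
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + eps)
        return weight * hidden_states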
class nemo_automodel.components.models.baichuan.model.RotaryEmbedding(
dim,
max_position_embeddings=2048,
base=10000,
device=None,
)#

Bases: torch.nn.Module

Initialization

forward(x, seq_len=None)#
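
A sketch of the cos/sin tables a rotary embedding with this signature typically precomputes; the actual class's caching of `max_position_embeddings` rows may differ::

    import torch

    def rotary_tables(dim: int, seq_len: int, base: int = 10000, device=None):
        # One inverse frequency per pair of channels, as in standard RoPE.
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device).float() / dim))
        t = torch.arange(seq_len, device=device).float()
        freqs = torch.outer(t, inv_freq)          # (seq_len, dim / 2)
        emb = torch.cat((freqs, freqs), dim=-1)   # (seq_len, dim)
        return emb.cos(), emb.sin()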
nemo_automodel.components.models.baichuan.model._rotate_half(x)#
nemo_automodel.components.models.baichuan.model._apply_rotary_pos_emb(q, k, cos_, sin_, position_ids)#
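
These two helpers conventionally look like the following sketch; the `(bsz, num_heads, seq_len, head_dim)` tensor layout is an assumption::

    import torch

    def rotate_half(x: torch.Tensor) -> torch.Tensor:
        # Swap the two halves of the last dim, negating the second half.
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    def apply_rotary_pos_emb(q, k, cos_, sin_, position_ids):
        # Select the cos/sin rows for each token position, then rotate q and k.
        cos = cos_[position_ids].unsqueeze(1)  # (bsz, 1, seq_len, head_dim)
        sin = sin_[position_ids].unsqueeze(1)
        q_embed = (q * cos) + (rotate_half(q) * sin)
        k_embed = (k * cos) + (rotate_half(k) * sin)
        return q_embed, k_embed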
nemo_automodel.components.models.baichuan.model._make_causal_mask(
input_ids_shape,
dtype,
device,
past_key_values_length=0,
)#
nemo_automodel.components.models.baichuan.model._expand_mask(mask, dtype, tgt_len=None)#
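
A sketch of the shapes these helpers conventionally produce in HF-style decoder models (details in the actual implementation may differ)::

    import torch

    def make_causal_mask(input_ids_shape, dtype, device, past_key_values_length=0):
        # Additive mask: 0 on/below the diagonal, a large negative value above it.
        bsz, tgt_len = input_ids_shape
        mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, dtype=dtype, device=device)
        mask.masked_fill_(torch.tril(torch.ones(tgt_len, tgt_len, dtype=torch.bool, device=device)), 0.0)
        if past_key_values_length > 0:
            past = torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device)
            mask = torch.cat([past, mask], dim=-1)
        return mask[None, None].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)

    def expand_mask(mask, dtype, tgt_len=None):
        # Expand a (bsz, src_len) 1/0 padding mask to an additive (bsz, 1, tgt_len, src_len) mask.
        bsz, src_len = mask.size()
        tgt_len = tgt_len if tgt_len is not None else src_len
        inverted = 1.0 - mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
        return inverted.masked_fill(inverted.to(torch.bool), torch.finfo(dtype).min)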
class nemo_automodel.components.models.baichuan.model.MLP(hidden_size, intermediate_size, hidden_act)#

Bases: torch.nn.Module

Initialization

forward(x)#
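
The `(hidden_size, intermediate_size, hidden_act)` signature matches the LLaMA-style gated feed-forward block; a sketch under that assumption::

    from torch import nn
    from transformers.activations import ACT2FN

    class GatedMLP(nn.Module):
        def __init__(self, hidden_size, intermediate_size, hidden_act):
            super().__init__()
            self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
            self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
            self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
            self.act_fn = ACT2FN[hidden_act]  # e.g. "silu"

        def forward(self, x):
            # down(act(gate(x)) * up(x))
            return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))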
class nemo_automodel.components.models.baichuan.model.Attention(
config: nemo_automodel.components.models.baichuan.configuration.BaichuanConfig,
)#

Bases: torch.nn.Module

Initialization

forward(
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
) → Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]#
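
Per the module notes above, attention is computed solely via torch.nn.functional.scaled_dot_product_attention; a minimal sketch of that core (the tensor layout is an assumption)::

    import torch.nn.functional as F

    def attention_core(q, k, v, attention_mask=None):
        # q, k, v: (bsz, num_heads, seq_len, head_dim), after rotary embedding.
        # The additive mask from _prepare_decoder_attention_mask (or None) is
        # passed straight to PyTorch's fused SDPA kernel.
        out = F.scaled_dot_product_attention(q, k, v, attn_mask=attention_mask)
        return out.transpose(1, 2).contiguous()  # (bsz, seq_len, num_heads, head_dim)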
class nemo_automodel.components.models.baichuan.model.DecoderLayer(
config: nemo_automodel.components.models.baichuan.configuration.BaichuanConfig,
)#

Bases: torch.nn.Module

Initialization

forward(
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
) → Tuple[torch.FloatTensor, ...]#
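
A sketch of the usual pre-norm residual layout such a layer implements; the sub-module names are illustrative, and the attention call's 3-tuple return matches the Attention.forward signature above::

    def decoder_layer_forward(hidden_states, input_norm, self_attn, post_norm, mlp):
        # Attention block with residual connection.
        residual = hidden_states
        attn_out, _, _ = self_attn(input_norm(hidden_states))
        hidden_states = residual + attn_out
        # Feed-forward block with residual connection.
        return hidden_states + mlp(post_norm(hidden_states))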
class nemo_automodel.components.models.baichuan.model.NormHead(hidden_size, vocab_size, bias=False)#

Bases: torch.nn.Module

Initialization

forward(hidden_states)#
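
In the upstream Baichuan2 code this head L2-normalizes the projection's weight rows before computing logits; a sketch of that forward, assuming the same behavior here (upstream also caches the normalized weight at inference, omitted for brevity)::

    import torch
    from torch import nn
    import torch.nn.functional as F

    class NormHeadSketch(nn.Module):
        def __init__(self, hidden_size, vocab_size, bias=False):
            super().__init__()
            self.weight = nn.Parameter(torch.empty(vocab_size, hidden_size))

        def forward(self, hidden_states):
            norm_weight = F.normalize(self.weight)       # row-wise L2 normalization
            return F.linear(hidden_states, norm_weight)  # (..., vocab_size)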
class nemo_automodel.components.models.baichuan.model.BaichuanPreTrainedModel#

Bases: transformers.PreTrainedModel

config_class#

None

base_model_prefix#

'model'

supports_gradient_checkpointing#

True

_no_split_modules#

['DecoderLayer']

_supports_sdpa#

True

_supports_flash_attn#

False

_init_weights(module)#
_set_gradient_checkpointing(module, value=False)#
class nemo_automodel.components.models.baichuan.model.BaichuanModel(
config: nemo_automodel.components.models.baichuan.configuration.BaichuanConfig,
)#

Bases: nemo_automodel.components.models.baichuan.model.BaichuanPreTrainedModel

Initialization

get_input_embeddings()#
set_input_embeddings(value)#
_prepare_decoder_attention_mask(
attention_mask,
input_shape,
inputs_embeds,
past_key_values_length,
)#
forward(
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) → Union[Tuple, transformers.modeling_outputs.BaseModelOutputWithPast]#
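
The **kwargs in forward is what lets extra batch keys pass through without error (see the module notes above); an illustrative call, assuming `model` is a constructed BaichuanModel::

    import torch

    batch = {
        "input_ids": torch.randint(0, 1000, (2, 16)),
        "attention_mask": torch.ones(2, 16, dtype=torch.long),
        "loss_mask": torch.ones(2, 16),  # extra key, absorbed by **kwargs
    }
    outputs = model(**batch)
    hidden = outputs.last_hidden_state   # (2, 16, hidden_size)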
class nemo_automodel.components.models.baichuan.model.BaichuanForCausalLM(
config: nemo_automodel.components.models.baichuan.configuration.BaichuanConfig,
**model_kwargs,
)#

Bases: nemo_automodel.components.models.common.hf_checkpointing_mixin.HFCheckpointingMixin, nemo_automodel.components.models.baichuan.model.BaichuanPreTrainedModel, transformers.GenerationMixin

_tied_weights_keys#

None

get_input_embeddings()#
set_input_embeddings(value)#
get_output_embeddings()#
set_output_embeddings(new_embeddings)#
set_decoder(decoder)#
get_decoder()#
forward(
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) → Union[Tuple, transformers.modeling_outputs.CausalLMOutputWithPast]#
prepare_inputs_for_generation(
input_ids,
past_key_values=None,
attention_mask=None,
inputs_embeds=None,
**kwargs,
)#
static _reorder_cache(past_key_values, beam_idx)#
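
An illustrative training/generation round-trip, assuming `model` is a constructed BaichuanForCausalLM::

    import torch

    input_ids = torch.randint(0, 1000, (1, 8))

    # Passing labels makes forward return a causal-LM loss.
    out = model(input_ids=input_ids, labels=input_ids)
    out.loss.backward()

    # GenerationMixin provides generate().
    generated = model.generate(input_ids, max_new_tokens=8)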
nemo_automodel.components.models.baichuan.model.ModelClass#

None