nemo_automodel.components.models.nemotron_parse.model
nemo_automodel.components.models.nemotron_parse.model
Module Contents
Classes
| Name | Description |
|---|---|
| NemotronParseConfig | Configuration class for NemotronParse model. |
| NemotronParseDecoder | Transformer decoder consisting of config.decoder_layers layers. |
| NemotronParseEncoderConfig | Configuration class for NemotronParse vision encoder (RADIO-based). |
| NemotronParseForConditionalGeneration | NemotronParse model for conditional generation tasks. |
| NemotronParsePreTrainedModel | Abstract class to handle weights initialization. |
| NemotronParseTextConfig | Configuration class for NemotronParse text decoder (mBART-based). |
| RadioWithNeck | Vision encoder using RADIO model with custom neck. |
Data
API
class nemo_automodel.components.models.nemotron_parse.model.NemotronParseConfig( encoder: typing.Optional[dict] = None, decoder: typing.Optional[dict] = None, tie_word_embeddings: bool = False, decoder_start_token_id: int = 2, pad_token_id: int = 1, eos_token_id: int = 2, bos_token_id: int = 0, image_size: typing.List[int] = None, is_encoder_decoder: bool = True, max_sequence_length: int = 9000, kwargs = {} )
Bases: PretrainedConfig
Configuration class for NemotronParse model.
decoder
= NemotronParseTextConfig(**decoder)
encoder
image_size
= image_size or [2048, 1648]
model_type
= 'nemotron_parse'
vocab_size
= self.decoder.vocab_size
nemo_automodel.components.models.nemotron_parse.model.NemotronParseConfig.to_dict()
class nemo_automodel.components.models.nemotron_parse.model.NemotronParseDecoder( config: transformers.models.mbart.modeling_mbart.MBartConfig, embed_tokens: typing.Optional[torch.nn.Embedding] = None )
Bases: MBartPreTrainedModel
Transformer decoder consisting of config.decoder_layers layers.
dropout
= config.dropout
embed_tokens
layer_norm
= nn.LayerNorm(config.d_model)
layerdrop
= config.decoder_layerdrop
layernorm_embedding
= nn.LayerNorm(config.d_model)
layers
padding_idx
= config.pad_token_id
nemo_automodel.components.models.nemotron_parse.model.NemotronParseDecoder.forward( input_ids: typing.Optional[torch.LongTensor] = None, attention_mask: typing.Optional[torch.Tensor] = None, encoder_hidden_states: typing.Optional[torch.FloatTensor] = None, encoder_attention_mask: typing.Optional[torch.LongTensor] = None, past_key_values: typing.Optional[typing.Tuple[typing.Tuple[torch.FloatTensor]]] = None, inputs_embeds: typing.Optional[torch.FloatTensor] = None, use_cache: typing.Optional[bool] = None, output_attentions: typing.Optional[bool] = None, output_hidden_states: typing.Optional[bool] = None, return_dict: typing.Optional[bool] = None ) -> typing.Union[typing.Tuple, transformers.models.mbart.modeling_mbart.BaseModelOutputWithPastAndCrossAttentions]
nemo_automodel.components.models.nemotron_parse.model.NemotronParseDecoder.get_input_embeddings()
nemo_automodel.components.models.nemotron_parse.model.NemotronParseDecoder.set_input_embeddings( value )
class nemo_automodel.components.models.nemotron_parse.model.NemotronParseEncoderConfig( patch_size: int = 16, max_resolution: int = 2048, preferred_resolution: typing.List[int] = None, torch_dtype: str = 'bfloat16', kwargs = {} )
Bases: PretrainedConfig
Configuration class for NemotronParse vision encoder (RADIO-based).
model_type
= 'nemotron_parse_encoder'
preferred_resolution
= preferred_resolution or [768, 768]
class nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration( config: nemo_automodel.components.models.nemotron_parse.model.NemotronParseConfig, loss_fn = None, kwargs = {} )
Bases: HFCheckpointingMixin, NemotronParsePreTrainedModel, GenerationMixin
NemotronParse model for conditional generation tasks.
class_token_indx_start
= getattr(config, 'class_token_start_idx', 50000)
decoder
= self.decoder.to(torch.bfloat16)
encoder
= self.encoder.to(torch.bfloat16)
lm_head
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration._reorder_cache( past_key_values, beam_idx )
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration.forward( pixel_values: typing.Optional[torch.FloatTensor] = None, decoder_input_ids: typing.Optional[torch.LongTensor] = None, decoder_attention_mask: typing.Optional[torch.BoolTensor] = None, encoder_outputs: typing.Optional[typing.Tuple[torch.FloatTensor]] = None, past_key_values: typing.Optional[typing.Tuple[typing.Tuple[torch.FloatTensor]]] = None, decoder_inputs_embeds: typing.Optional[torch.FloatTensor] = None, labels: typing.Optional[torch.LongTensor] = None, use_cache: typing.Optional[bool] = None, output_attentions: typing.Optional[bool] = None, output_hidden_states: typing.Optional[bool] = None, return_dict: typing.Optional[bool] = None, logits_to_keep: typing.Union[int, torch.Tensor] = 0, kwargs = {} ) -> typing.Union[typing.Tuple[torch.FloatTensor], transformers.modeling_outputs.Seq2SeqLMOutput]
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration.get_decoder()
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration.get_encoder()
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration.get_input_embeddings()
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration.get_output_embeddings()
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration.prepare_decoder_input_ids_from_labels( labels: torch.Tensor )
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration.set_input_embeddings( value )
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration.set_output_embeddings( new_embeddings )
class nemo_automodel.components.models.nemotron_parse.model.NemotronParsePreTrainedModel()
Bases: PreTrainedModel
Abstract class to handle weights initialization.
_no_split_modules
= ['RadioWithNeck', 'MBartDecoder']
_skip_keys_device_placement
= 'past_key_values'
base_model_prefix
= 'vision_encoder_decoder'
main_input_name
= 'pixel_values'
nemo_automodel.components.models.nemotron_parse.model.NemotronParsePreTrainedModel._init_weights( module )
class nemo_automodel.components.models.nemotron_parse.model.NemotronParseTextConfig( vocab_size: int = 250027, d_model: int = 1024, encoder_layers: int = 12, decoder_layers: int = 12, encoder_attention_heads: int = 16, decoder_attention_heads: int = 16, decoder_ffn_dim: int = 4096, encoder_ffn_dim: int = 4096, activation_function: str = 'gelu', dropout: float = 0.1, attention_dropout: float = 0.0, activation_dropout: float = 0.0, classifier_dropout: float = 0.0, init_std: float = 0.02, encoder_layerdrop: float = 0.0, decoder_layerdrop: float = 0.0, scale_embedding: bool = False, use_cache: bool = True, num_labels: int = 3, forced_eos_token_id: int = 2, pad_token_id: int = 1, bos_token_id: int = 0, eos_token_id: int = 2, decoder_start_token_id: int = 2, add_cross_attention: bool = True, is_decoder: bool = True, max_sequence_length: int = 9000, kwargs = {} )
Bases: PretrainedConfig
Configuration class for NemotronParse text decoder (mBART-based).
hidden_size
= self.d_model
model_type
= 'nemotron_parse_text'
num_attention_heads
= self.encoder_attention_heads
class nemo_automodel.components.models.nemotron_parse.model.RadioWithNeck( config )
Bases: Module
Vision encoder using RADIO model with custom neck.
conv1
= nn.Conv1d(1280, last_hidden_state, 1)
conv2
layer_norm1
layer_norm2
layer_norm3
model_encoder
sum_proj
= nn.Linear(3840, last_hidden_state)
nemo_automodel.components.models.nemotron_parse.model.RadioWithNeck.forward( pixel_values, output_attentions = False, output_hidden_states = False, return_dict = False, kwargs = {} )
nemo_automodel.components.models.nemotron_parse.model.ModelClass = NemotronParseForConditionalGeneration