nemo_automodel.components.models.nemotron_parse.model

View as Markdown

Module Contents

Classes

| Name | Description |
| --- | --- |
| NemotronParseConfig | Configuration class for NemotronParse model. |
| NemotronParseDecoder | Transformer decoder consisting of config.decoder_layers layers. |
| NemotronParseEncoderConfig | Configuration class for NemotronParse vision encoder (RADIO-based). |
| NemotronParseForConditionalGeneration | NemotronParse model for conditional generation tasks. |
| NemotronParsePreTrainedModel | Abstract class to handle weights initialization. |
| NemotronParseTextConfig | Configuration class for NemotronParse text decoder (mBART-based). |
| RadioWithNeck | Vision encoder using RADIO model with custom neck. |

Data

ModelClass

API

class nemo_automodel.components.models.nemotron_parse.model.NemotronParseConfig(
encoder: typing.Optional[dict] = None,
decoder: typing.Optional[dict] = None,
tie_word_embeddings: bool = False,
decoder_start_token_id: int = 2,
pad_token_id: int = 1,
eos_token_id: int = 2,
bos_token_id: int = 0,
image_size: typing.List[int] = None,
is_encoder_decoder: bool = True,
max_sequence_length: int = 9000,
kwargs = {}
)

Bases: PretrainedConfig

Configuration class for NemotronParse model.

decoder = NemotronParseTextConfig(**decoder)
encoder
image_size = image_size or [2048, 1648]
model_type = 'nemotron_parse'
vocab_size = self.decoder.vocab_size
nemo_automodel.components.models.nemotron_parse.model.NemotronParseConfig.to_dict()
class nemo_automodel.components.models.nemotron_parse.model.NemotronParseDecoder(
config: transformers.models.mbart.modeling_mbart.MBartConfig,
embed_tokens: typing.Optional[torch.nn.Embedding] = None
)

Bases: MBartPreTrainedModel

Transformer decoder consisting of config.decoder_layers layers.

dropout = config.dropout
embed_tokens
layer_norm = nn.LayerNorm(config.d_model)
layerdrop = config.decoder_layerdrop
layernorm_embedding = nn.LayerNorm(config.d_model)
layers
padding_idx = config.pad_token_id
nemo_automodel.components.models.nemotron_parse.model.NemotronParseDecoder.forward(
input_ids: typing.Optional[torch.LongTensor] = None,
attention_mask: typing.Optional[torch.Tensor] = None,
encoder_hidden_states: typing.Optional[torch.FloatTensor] = None,
encoder_attention_mask: typing.Optional[torch.LongTensor] = None,
past_key_values: typing.Optional[typing.Tuple[typing.Tuple[torch.FloatTensor]]] = None,
inputs_embeds: typing.Optional[torch.FloatTensor] = None,
use_cache: typing.Optional[bool] = None,
output_attentions: typing.Optional[bool] = None,
output_hidden_states: typing.Optional[bool] = None,
return_dict: typing.Optional[bool] = None
) -> typing.Union[typing.Tuple, transformers.models.mbart.modeling_mbart.BaseModelOutputWithPastAndCrossAttentions]
nemo_automodel.components.models.nemotron_parse.model.NemotronParseDecoder.get_input_embeddings()
nemo_automodel.components.models.nemotron_parse.model.NemotronParseDecoder.set_input_embeddings(
value
)
class nemo_automodel.components.models.nemotron_parse.model.NemotronParseEncoderConfig(
patch_size: int = 16,
max_resolution: int = 2048,
preferred_resolution: typing.List[int] = None,
torch_dtype: str = 'bfloat16',
kwargs = {}
)

Bases: PretrainedConfig

Configuration class for NemotronParse vision encoder (RADIO-based).

model_type = 'nemotron_parse_encoder'
preferred_resolution = preferred_resolution or [768, 768]
class nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration(
config: nemo_automodel.components.models.nemotron_parse.model.NemotronParseConfig,
loss_fn = None,
kwargs = {}
)

Bases: HFCheckpointingMixin, NemotronParsePreTrainedModel, GenerationMixin

NemotronParse model for conditional generation tasks.

class_token_indx_start = getattr(config, 'class_token_start_idx', 50000)
decoder = self.decoder.to(torch.bfloat16)
encoder = self.encoder.to(torch.bfloat16)
lm_head
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration._reorder_cache(
past_key_values,
beam_idx
)
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration.forward(
pixel_values: typing.Optional[torch.FloatTensor] = None,
decoder_input_ids: typing.Optional[torch.LongTensor] = None,
decoder_attention_mask: typing.Optional[torch.BoolTensor] = None,
encoder_outputs: typing.Optional[typing.Tuple[torch.FloatTensor]] = None,
past_key_values: typing.Optional[typing.Tuple[typing.Tuple[torch.FloatTensor]]] = None,
decoder_inputs_embeds: typing.Optional[torch.FloatTensor] = None,
labels: typing.Optional[torch.LongTensor] = None,
use_cache: typing.Optional[bool] = None,
output_attentions: typing.Optional[bool] = None,
output_hidden_states: typing.Optional[bool] = None,
return_dict: typing.Optional[bool] = None,
logits_to_keep: typing.Union[int, torch.Tensor] = 0,
kwargs = {}
) -> typing.Union[typing.Tuple[torch.FloatTensor], transformers.modeling_outputs.Seq2SeqLMOutput]
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration.get_decoder()
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration.get_encoder()
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration.get_input_embeddings()
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration.get_output_embeddings()
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration.prepare_decoder_input_ids_from_labels(
labels: torch.Tensor
)
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration.set_input_embeddings(
value
)
nemo_automodel.components.models.nemotron_parse.model.NemotronParseForConditionalGeneration.set_output_embeddings(
new_embeddings
)
class nemo_automodel.components.models.nemotron_parse.model.NemotronParsePreTrainedModel()

Bases: PreTrainedModel

Abstract class to handle weights initialization.

_no_split_modules = ['RadioWithNeck', 'MBartDecoder']
_skip_keys_device_placement = 'past_key_values'
base_model_prefix = 'vision_encoder_decoder'
main_input_name = 'pixel_values'
nemo_automodel.components.models.nemotron_parse.model.NemotronParsePreTrainedModel._init_weights(
module
)
class nemo_automodel.components.models.nemotron_parse.model.NemotronParseTextConfig(
vocab_size: int = 250027,
d_model: int = 1024,
encoder_layers: int = 12,
decoder_layers: int = 12,
encoder_attention_heads: int = 16,
decoder_attention_heads: int = 16,
decoder_ffn_dim: int = 4096,
encoder_ffn_dim: int = 4096,
activation_function: str = 'gelu',
dropout: float = 0.1,
attention_dropout: float = 0.0,
activation_dropout: float = 0.0,
classifier_dropout: float = 0.0,
init_std: float = 0.02,
encoder_layerdrop: float = 0.0,
decoder_layerdrop: float = 0.0,
scale_embedding: bool = False,
use_cache: bool = True,
num_labels: int = 3,
forced_eos_token_id: int = 2,
pad_token_id: int = 1,
bos_token_id: int = 0,
eos_token_id: int = 2,
decoder_start_token_id: int = 2,
add_cross_attention: bool = True,
is_decoder: bool = True,
max_sequence_length: int = 9000,
kwargs = {}
)

Bases: PretrainedConfig

Configuration class for NemotronParse text decoder (mBART-based).

hidden_size = self.d_model
model_type = 'nemotron_parse_text'
num_attention_heads = self.encoder_attention_heads
class nemo_automodel.components.models.nemotron_parse.model.RadioWithNeck(
config
)

Bases: Module

Vision encoder using RADIO model with custom neck.

conv1 = nn.Conv1d(1280, last_hidden_state, 1)
conv2
layer_norm1
layer_norm2
layer_norm3
model_encoder
sum_proj = nn.Linear(3840, last_hidden_state)
nemo_automodel.components.models.nemotron_parse.model.RadioWithNeck.forward(
pixel_values,
output_attentions = False,
output_hidden_states = False,
return_dict = False,
kwargs = {}
)
nemo_automodel.components.models.nemotron_parse.model.ModelClass = NemotronParseForConditionalGeneration