nemo_export.multimodal.run#

Module Contents#

Classes#

MultimodalModelRunner

Functions#

trt_dtype_to_torch

API#

nemo_export.multimodal.run.trt_dtype_to_torch(dtype)#

class nemo_export.multimodal.run.MultimodalModelRunner(
visual_engine_dir,
llm_engine_dir,
modality='vision',
)#

Initialization

init_tokenizer(llm_engine_dir)#
init_image_encoder(visual_engine_dir)#
init_vision_preprocessor(visual_encoder_dir)#
init_llm(llm_engine_dir)#
video_preprocess(video_path)#
insert_tokens_by_index(input_ids, num_frames)#
preprocess(
warmup,
pre_prompt,
post_prompt,
image,
attention_mask,
batch_size,
)#
static tokenizer_image_token(
batch_size,
prompt,
tokenizer,
image_token_index=-200,
)#
split_prompt_by_images(tensor)#
generate(
pre_prompt,
post_prompt,
image,
decoder_input_ids,
max_new_tokens,
attention_mask,
warmup,
batch_size,
top_k,
top_p,
temperature,
repetition_penalty,
num_beams,
lora_uids=None,
)#
get_visual_features(image, attention_mask)#
setup_fake_prompts(
visual_features,
pre_input_ids,
post_input_ids,
input_lengths,
)#
setup_fake_prompts_vila(
batch_size,
visual_features,
split_input_ids,
input_lengths,
)#
preprocess_lita_visual(visual_features, config)#
ptuning_setup(prompt_table, input_ids, input_lengths)#
expand2square_pt(images, background_color)#
load_video(config, video_path, processor, num_frames=None)#
preprocess_frames(frames, config, processor)#
get_num_sample_frames(config, vid_len)#
process_lita_video(nemo_config, video_path, image_processor)#
process_image(image_file, image_processor, nemo_config, image_folder)#
process_vila_img(images)#
setup_inputs(input_text, raw_image, batch_size)#
run(
input_text,
input_image,
max_new_tokens,
batch_size,
top_k,
top_p,
temperature,
repetition_penalty,
num_beams,
lora_uids=None,
run_profiling=False,
check_accuracy=False,
)#
print_result(
input_text,
output_text,
batch_size,
num_beams,
run_profiling,
check_accuracy,
)#
load_test_media(input_media)#