Module Contents
API
-
nemo_export.multimodal.run.trt_dtype_to_torch(dtype)[source]
-
class nemo_export.multimodal.run.MultimodalModelRunner(
    visual_engine_dir,
    llm_engine_dir,
    modality='vision',
)[source]
Initialization
-
init_tokenizer(llm_engine_dir)[source]
-
init_image_encoder(visual_engine_dir)[source]
-
init_vision_preprocessor(visual_encoder_dir)[source]
-
init_llm(llm_engine_dir)[source]
-
video_preprocess(video_path)[source]
-
insert_tokens_by_index(input_ids, num_frames)[source]
-
preprocess(
    warmup,
    pre_prompt,
    post_prompt,
    image,
    attention_mask,
    batch_size,
)[source]
-
static tokenizer_image_token(
    batch_size,
    prompt,
    tokenizer,
    image_token_index=-200,
)[source]
-
split_prompt_by_images(tensor)[source]
-
generate(
    pre_prompt,
    post_prompt,
    image,
    decoder_input_ids,
    max_new_tokens,
    attention_mask,
    warmup,
    batch_size,
    top_k,
    top_p,
    temperature,
    repetition_penalty,
    num_beams,
    lora_uids=None,
)[source]
-
get_visual_features(image, attention_mask)[source]
-
setup_fake_prompts(
    visual_features,
    pre_input_ids,
    post_input_ids,
    input_lengths,
)[source]
-
setup_fake_prompts_vila(
    batch_size,
    visual_features,
    split_input_ids,
    input_lengths,
)[source]
-
preprocess_lita_visual(visual_features, config)[source]
-
ptuning_setup(prompt_table, input_ids, input_lengths)[source]
-
expand2square_pt(images, background_color)[source]
-
load_video(config, video_path, processor, num_frames=None)[source]
-
preprocess_frames(frames, config, processor)[source]
-
get_num_sample_frames(config, vid_len)[source]
-
process_lita_video(nemo_config, video_path, image_processor)[source]
-
process_image(image_file, image_processor, nemo_config, image_folder)[source]
-
process_vila_img(images)[source]
-
setup_inputs(input_text, raw_image, batch_size)[source]
-
run(
    input_text,
    input_image,
    max_new_tokens,
    batch_size,
    top_k,
    top_p,
    temperature,
    repetition_penalty,
    num_beams,
    lora_uids=None,
    run_profiling=False,
    check_accuracy=False,
)[source]
-
print_result(
    input_text,
    output_text,
    batch_size,
    num_beams,
    run_profiling,
    check_accuracy,
)[source]
-
load_test_media(input_media)[source]