nemo_export.multimodal.run
Module Contents
Classes
Functions
API
- nemo_export.multimodal.run.trt_dtype_to_torch(dtype)
- class nemo_export.multimodal.run.MultimodalModelRunner(visual_engine_dir, llm_engine_dir, modality='vision')
Initialization
- init_tokenizer(llm_engine_dir)
- init_image_encoder(visual_engine_dir)
- init_vision_preprocessor(visual_encoder_dir)
- init_llm(llm_engine_dir)
- video_preprocess(video_path)
- insert_tokens_by_index(input_ids, num_frames)
- preprocess(warmup, pre_prompt, post_prompt, image, attention_mask, batch_size)
- static tokenizer_image_token(batch_size, prompt, tokenizer, image_token_index=-200)
- split_prompt_by_images(tensor)
- generate(pre_prompt, post_prompt, image, decoder_input_ids, max_new_tokens, attention_mask, warmup, batch_size, top_k, top_p, temperature, repetition_penalty, num_beams, lora_uids=None)
- get_visual_features(image, attention_mask)
- setup_fake_prompts(visual_features, pre_input_ids, post_input_ids, input_lengths)
- setup_fake_prompts_vila(batch_size, visual_features, split_input_ids, input_lengths)
- preprocess_lita_visual(visual_features, config)
- ptuning_setup(prompt_table, input_ids, input_lengths)
- expand2square_pt(images, background_color)
- load_video(config, video_path, processor, num_frames=None)
- preprocess_frames(frames, config, processor)
- get_num_sample_frames(config, vid_len)
- process_lita_video(nemo_config, video_path, image_processor)
- process_image(image_file, image_processor, nemo_config, image_folder)
- process_vila_img(images)
- setup_inputs(input_text, raw_image, batch_size)
- run(input_text, input_image, max_new_tokens, batch_size, top_k, top_p, temperature, repetition_penalty, num_beams, lora_uids=None, run_profiling=False, check_accuracy=False)
- print_result(input_text, output_text, batch_size, num_beams, run_profiling, check_accuracy)
- load_test_media(input_media)