nemo_export.tensorrt_mm_exporter#

Module Contents#

Classes#

TensorRTMMExporter

Exports nemo checkpoints to TensorRT and runs fast inference.

Functions#

noop_decorator

No op decorator.

Data#

API#

nemo_export.tensorrt_mm_exporter.use_deploy = True#
nemo_export.tensorrt_mm_exporter.noop_decorator(func)#

No op decorator.

nemo_export.tensorrt_mm_exporter.use_pytriton = True#
nemo_export.tensorrt_mm_exporter.batch = None#
nemo_export.tensorrt_mm_exporter.LOGGER = 'getLogger(...)'#
class nemo_export.tensorrt_mm_exporter.TensorRTMMExporter(
model_dir: str,
load_model: bool = True,
modality: str = 'vision',
)#

Bases: nemo_deploy.ITritonDeployable

Exports nemo checkpoints to TensorRT and runs fast inference.

.. rubric:: Example

from nemo_export import TensorRTMMExporter

exporter = TensorRTMMExporter(model_dir="/path/for/model/files") exporter.export( visual_checkpoint_path="/path/for/nemo/checkpoint", model_type="neva", tensor_parallel_size=1, )

output = exporter.forward("Hi! What is in this image?", "/path/for/input_media") print("output: ", output)

Initialization

export(
visual_checkpoint_path: str,
llm_checkpoint_path: str = None,
model_type: str = 'neva',
llm_model_type: str = 'llama',
processor_name: str = None,
tensor_parallel_size: int = 1,
max_input_len: int = 4096,
max_output_len: int = 256,
max_batch_size: int = 1,
vision_max_batch_size: int = 1,
max_multimodal_len: int = 3072,
dtype: str = 'bfloat16',
delete_existing_files: bool = True,
load_model: bool = True,
use_lora_plugin: str = None,
lora_target_modules: List[str] = None,
lora_checkpoint_path: str = None,
max_lora_rank: int = 64,
)#

Export multimodal models to TRTLLM.

forward(
input_text: str,
input_media: str,
batch_size: int = 1,
max_output_len: int = 30,
top_k: int = 1,
top_p: float = 0.0,
temperature: float = 1.0,
repetition_penalty: float = 1.0,
num_beams: int = 1,
lora_uids: List[str] = None,
)#

Run forward with loaded TRTLLM engine.

get_input_media_tensors()#

Get input media tensors.

property get_triton_input#
property get_triton_output#
triton_infer_fn(**inputs: numpy.ndarray)#
_load()#