# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from typing import List, Optional, Union
import numpy as np
from transformers import PreTrainedTokenizerBase
from nemo_deploy import ITritonDeployable
from nemo_deploy.utils import cast_output, str_ndarray2list
from nemo_export_deploy_common.import_utils import MISSING_TENSORRT_LLM_MSG, MISSING_TRITON_MSG, null_decorator
try:
from pytriton.decorators import batch, first_value
from pytriton.model_config import Tensor
HAVE_TRITON = True
except ImportError:
from unittest.mock import MagicMock
Tensor = MagicMock()
batch = null_decorator
first_value = null_decorator
HAVE_TRITON = False
try:
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi.llm import LLM, TokenizerBase
HAVE_TENSORRT_LLM = True
except ImportError:
HAVE_TENSORRT_LLM = False
LOGGER = logging.getLogger("NeMo")
class TensorRTLLMAPIDeployable(ITritonDeployable):
"""A Triton inference server compatible wrapper for TensorRT-LLM LLM-API.
This class provides a standardized interface for deploying TensorRT-LLM LLM-API
in Triton inference server. It handles model loading, inference, and deployment configurations.
Args:
hf_model_id_path (str): Path to the HuggingFace model or model identifier.
Can be a local path or a model ID from HuggingFace Hub.
tokenizer (Optional[Union[str, Path, TokenizerBase, PreTrainedTokenizerBase]]):
Path to the tokenizer or tokenizer instance.
tensor_parallel_size (int): Tensor parallelism size. Defaults to 1.
pipeline_parallel_size (int): Pipeline parallelism size. Defaults to 1.
moe_expert_parallel_size (int): MOE expert parallelism size. Defaults to -1.
moe_tensor_parallel_size (int): MOE tensor parallelism size. Defaults to -1.
max_batch_size (int): Maximum batch size. Defaults to 8.
max_num_tokens (int): Maximum total tokens across all sequences in a batch. Defaults to 8192.
backend (str): Backend to use for TRTLLM. Defaults to "pytorch".
dtype (str): Model data type. Defaults to "auto".
        **kwargs: Additional keyword arguments passed to model loading. Keys that
            match PyTorchConfig fields are extracted and used to build the PyTorch
            backend config.
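
    Example:
        A minimal usage sketch; the model identifier and options below are
        illustrative, not tested defaults:

            deployable = TensorRTLLMAPIDeployable(
                hf_model_id_path="meta-llama/Llama-3.1-8B-Instruct",
                tensor_parallel_size=1,
            )
            outputs = deployable.generate(["What is CUDA?"], max_length=64)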
"""
def __init__(
self,
hf_model_id_path: str,
tokenizer: Optional[Union[str, Path, "TokenizerBase", PreTrainedTokenizerBase]] = None,
tensor_parallel_size: int = 1,
pipeline_parallel_size: int = 1,
moe_expert_parallel_size: int = -1,
moe_tensor_parallel_size: int = -1,
max_batch_size: int = 8,
max_num_tokens: int = 8192,
backend: str = "pytorch",
dtype: str = "auto",
**kwargs,
):
if not HAVE_TENSORRT_LLM:
raise ImportError(MISSING_TENSORRT_LLM_MSG)
if not HAVE_TRITON:
raise ImportError(MISSING_TRITON_MSG)
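        # Route kwargs that name PyTorchConfig fields into the backend config
        # rather than passing them directly to the LLM constructor.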
config_args = {k: kwargs.pop(k) for k in PyTorchConfig.__annotations__.keys() & kwargs.keys()}
pytorch_config = PyTorchConfig(**config_args)
self.model = LLM(
model=hf_model_id_path,
tokenizer=hf_model_id_path if tokenizer is None else tokenizer,
tensor_parallel_size=tensor_parallel_size,
pipeline_parallel_size=pipeline_parallel_size,
moe_expert_parallel_size=moe_expert_parallel_size,
moe_tensor_parallel_size=moe_tensor_parallel_size,
max_batch_size=max_batch_size,
max_num_tokens=max_num_tokens,
backend=backend,
dtype=dtype,
pytorch_backend_config=pytorch_config,
**kwargs,
)
def generate(
self,
prompts: List[str],
max_length: int = 256,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
**kwargs,
) -> List[str]:
"""Generate text based on the provided input prompts.
This method processes input prompts through the loaded model and
generates text according to the specified parameters.
Args:
prompts: List of input prompts
max_length: Maximum number of tokens to generate. Defaults to 256.
temperature: Sampling temperature. Defaults to None.
top_k: Number of highest probability tokens to consider. Defaults to None.
top_p: Cumulative probability threshold for token sampling. Defaults to None.
            **kwargs: Additional keyword arguments passed to SamplingParams.

        Returns:
            List[str]: A list of generated texts, one for each input prompt.

        Raises:
RuntimeError: If the model is not initialized.
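
        Example:
            A sketch of a typical call; the sampling values are illustrative:

                texts = deployable.generate(
                    prompts=["Summarize TensorRT-LLM in one sentence."],
                    max_length=128,
                    temperature=0.7,
                    top_p=0.95,
                )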
"""
if not self.model:
raise RuntimeError("Model is not initialized")
sampling_params = SamplingParams(
max_tokens=max_length,
temperature=temperature,
top_k=top_k,
top_p=top_p,
**kwargs,
)
outputs = self.model.generate(
inputs=prompts,
sampling_params=sampling_params,
)
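        # Each request returns a list of completions; take the text of the first one.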
return [output.outputs[0].text for output in outputs]
@property
def get_triton_input(self):
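        """Triton input tensor specs: prompt strings plus optional sampling controls."""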
inputs = (
Tensor(name="prompts", shape=(-1,), dtype=bytes),
Tensor(name="max_length", shape=(-1,), dtype=np.int_, optional=True),
Tensor(name="max_batch_size", shape=(-1,), dtype=np.int_, optional=True),
Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True),
Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True),
Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True),
)
return inputs
@property
def get_triton_output(self):
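        """Triton output tensor spec: generated sentences as bytes."""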
return (Tensor(name="sentences", shape=(-1,), dtype=bytes),)
@batch
@first_value("temperature", "top_k", "top_p", "max_length")
def triton_infer_fn(self, **inputs: np.ndarray):
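        """Triton inference callback: decode the prompt batch, run generate(),
        and return the generated sentences as a bytes tensor."""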
prompts = str_ndarray2list(inputs.pop("prompts"))
temperature = inputs.pop("temperature", None)
top_k = inputs.pop("top_k", None)
top_p = inputs.pop("top_p", None)
max_length = inputs.pop("max_length", 256)
output = self.generate(
prompts=prompts,
temperature=temperature,
top_k=top_k,
top_p=top_p,
max_length=max_length,
)
output_infer = {"sentences": cast_output(output, np.bytes_)}
return output_infer