# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import time
from abc import ABC
from typing import List, Optional
import numpy as np
from nemo_deploy.utils import str_list2numpy
from nemo_export_deploy_common.import_utils import MISSING_TRITON_MSG, UnavailableError
try:
from pytriton.client import ModelClient
HAVE_TRITON = True
except (ImportError, ModuleNotFoundError):
HAVE_TRITON = False
class NemoQueryLLMBase(ABC):
"""Abstract base class for querying a Large Language Model (LLM).
Args:
url (str): The URL of the inference server.
model_name (str): The name of the model to be queried.
"""
def __init__(self, url, model_name):
self.url = url
self.model_name = model_name
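
# A minimal sketch (hypothetical, for illustration only) of what a concrete
# subclass of NemoQueryLLMBase looks like; the real subclasses below wrap a
# PyTriton ModelClient call:
#
#     class MyQuery(NemoQueryLLMBase):
#         def query_llm(self, prompts: List[str]):
#             with ModelClient(self.url, self.model_name) as client:
#                 return client.infer_batch(prompts=str_list2numpy(prompts))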
class NemoQueryLLMPyTorch(NemoQueryLLMBase):
"""Sends a query to Triton for LLM inference.
Example:
        from nemo_deploy import NemoQueryLLMPyTorch
        nq = NemoQueryLLMPyTorch(url="localhost", model_name="GPT-2B")
prompts = ["hello, testing GPT inference", "another GPT inference test?"]
output = nq.query_llm(
prompts=prompts,
max_length=100,
top_k=1,
top_p=0.0,
temperature=0.0,
)
print("prompts: ", prompts)
"""
    # These arguments are explicitly defined to make clear to the user what can be passed;
    # names and optionality should exactly match the get_triton_input() results for MegatronGPTDeployable.
def query_llm(
self,
prompts: List[str],
use_greedy: Optional[bool] = None,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
repetition_penalty: Optional[float] = None,
add_BOS: Optional[bool] = None,
all_probs: Optional[bool] = None,
compute_logprob: Optional[bool] = None,
end_strings: Optional[List[str]] = None,
min_length: Optional[int] = None,
max_length: Optional[int] = None,
apply_chat_template: bool = False,
n_top_logprobs: Optional[int] = None,
init_timeout: float = 60.0,
echo: Optional[bool] = None,
):
"""Query the Triton server synchronously and return a list of responses.
Args:
prompts (List(str)): list of sentences.
use_greedy (bool): use greedy sampling, effectively the same as top_k=1
temperature (float): A parameter of the softmax function, which is the last layer in the network.
top_k (int): limits us to a certain number (K) of the top tokens to consider.
top_p (float): limits us to the top tokens within a certain probability mass (p).
repetition_penalty (float): penalty applied to repeated sequences, 1.0 means no penalty.
add_BOS (bool): whether or not to add a BOS (beginning of sentence) token.
all_probs (bool): when using compute_logprob, returns probabilities for all tokens in vocabulary.
compute_logprob (bool): get back probabilities of all tokens in the sequence.
end_strings (List(str)): list of strings which will terminate generation when they appear in the output.
min_length (int): min generated tokens.
max_length (int): max generated tokens.
apply_chat_template (bool): applies chat template if its a chat model. Default: False
init_timeout (flat): timeout for the connection.
"""
        if not HAVE_TRITON:
            raise UnavailableError(MISSING_TRITON_MSG)
        prompts = str_list2numpy(prompts)
inputs = {
"prompts": prompts,
}
if use_greedy is not None:
inputs["use_greedy"] = np.full(prompts.shape, use_greedy, dtype=np.bool_)
if temperature is not None:
inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single)
if top_k is not None:
inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_)
if top_p is not None:
inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single)
if repetition_penalty is not None:
inputs["repetition_penalty"] = np.full(prompts.shape, repetition_penalty, dtype=np.single)
if add_BOS is not None:
inputs["add_BOS"] = np.full(prompts.shape, add_BOS, dtype=np.bool_)
if all_probs is not None:
inputs["all_probs"] = np.full(prompts.shape, all_probs, dtype=np.bool_)
if compute_logprob is not None:
inputs["compute_logprob"] = np.full(prompts.shape, compute_logprob, dtype=np.bool_)
if end_strings is not None:
inputs["end_strings"] = str_list2numpy(end_strings)
if min_length is not None:
inputs["min_length"] = np.full(prompts.shape, min_length, dtype=np.int_)
if max_length is not None:
inputs["max_length"] = np.full(prompts.shape, max_length, dtype=np.int_)
if apply_chat_template is not None:
inputs["apply_chat_template"] = np.full(prompts.shape, apply_chat_template, dtype=np.bool_)
if n_top_logprobs is not None:
inputs["n_top_logprobs"] = np.full(prompts.shape, n_top_logprobs, dtype=np.int_)
if echo is not None:
inputs["echo"] = np.full(prompts.shape, echo, dtype=np.bool_)
with ModelClient(
self.url,
self.model_name,
init_timeout_s=init_timeout,
inference_timeout_s=600,
) as client:
result_dict = client.infer_batch(**inputs)
output_type = client.model_config.outputs[0].dtype
            log_probs_output = result_dict.get("log_probs")
            top_log_probs_output = result_dict.get("top_logprobs")
if output_type == np.bytes_:
if "sentences" in result_dict.keys():
output = result_dict["sentences"]
else:
return "Unknown output keyword."
sentences = np.char.decode(output.astype("bytes"), "utf-8")
openai_response = {
"id": f"cmpl-{int(time.time())}",
"object": "text_completion",
"created": int(time.time()),
"model": self.model_name,
"choices": [{"text": sentences}],
}
if log_probs_output is not None:
# logprobs are stored under choices in openai format.
openai_response["choices"][0]["logprobs"] = {}
openai_response["choices"][0]["logprobs"]["token_logprobs"] = log_probs_output
# TODO athitten: get top_n_logprobs from mcore once available
# we take 1st element because cast_output adds an extra dimension
if top_log_probs_output is not None:
n_log_probs_output = [json.loads(top_log_prob[0]) for top_log_prob in top_log_probs_output]
openai_response["choices"][0]["logprobs"]["top_logprobs"] = n_log_probs_output
return openai_response
else:
return result_dict["sentences"]
class NemoQueryLLMHF(NemoQueryLLMBase):
"""Sends a query to Triton for LLM inference.
Example:
from nemo_deploy import NemoQueryLLMHF
nq = NemoQueryLLMHF(url="localhost", model_name="GPT-2B")
prompts = ["hello, testing GPT inference", "another GPT inference test?"]
output = nq.query_llm(
prompts=prompts,
max_length=100,
top_k=1,
top_p=0.0,
temperature=0.0,
)
print("prompts: ", prompts)
"""
    # These arguments are explicitly defined to make clear to the user what can be passed;
    # names and optionality should exactly match the get_triton_input() results for HuggingFaceLLMDeploy.
def query_llm(
self,
prompts: List[str],
use_greedy: Optional[bool] = None,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
repetition_penalty: Optional[float] = None,
add_BOS: Optional[bool] = None,
all_probs: Optional[bool] = None,
output_logits: Optional[bool] = None,
output_scores: Optional[bool] = None,
end_strings: Optional[List[str]] = None,
min_length: Optional[int] = None,
max_length: Optional[int] = None,
init_timeout: float = 60.0,
):
"""Query the Triton server synchronously and return a list of responses.
Args:
prompts (List[str]): list of sentences.
use_greedy (Optional[bool]): use greedy sampling, effectively the same as top_k=1
temperature (Optional[float]): A parameter of the softmax function, which is the last layer in the network.
top_k (Optional[int]): limits us to a certain number (K) of the top tokens to consider.
top_p (Optional[float]): limits us to the top tokens within a certain probability mass (p).
repetition_penalty (Optional[float]): penalty applied to repeated sequences, 1.0 means no penalty.
add_BOS (Optional[bool]): whether or not to add a BOS (beginning of sentence) token.
all_probs (Optional[bool]): when using compute_logprob, returns probabilities for all tokens in vocabulary.
output_logits (Optional[bool]): whether to return logits for each token
output_scores (Optional[bool]): whether to return scores for each token
end_strings (Optional[List[str]]): list of strs which will stop generation when they appear in the output.
min_length (Optional[int]): min generated tokens.
max_length (Optional[int]): max generated tokens.
init_timeout (float): timeout for the connection.
"""
        if not HAVE_TRITON:
            raise UnavailableError(MISSING_TRITON_MSG)
        prompts = str_list2numpy(prompts)
inputs = {
"prompts": prompts,
}
if use_greedy is not None:
inputs["use_greedy"] = np.full(prompts.shape, use_greedy, dtype=np.bool_)
if temperature is not None:
inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single)
if top_k is not None:
inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_)
if top_p is not None:
inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single)
if repetition_penalty is not None:
inputs["repetition_penalty"] = np.full(prompts.shape, repetition_penalty, dtype=np.single)
if add_BOS is not None:
inputs["add_BOS"] = np.full(prompts.shape, add_BOS, dtype=np.bool_)
if all_probs is not None:
inputs["all_probs"] = np.full(prompts.shape, all_probs, dtype=np.bool_)
if output_logits is not None:
inputs["output_logits"] = np.full(prompts.shape, output_logits, dtype=np.bool_)
if output_scores is not None:
inputs["output_scores"] = np.full(prompts.shape, output_scores, dtype=np.bool_)
if end_strings is not None:
inputs["end_strings"] = str_list2numpy(end_strings)
if min_length is not None:
inputs["min_length"] = np.full(prompts.shape, min_length, dtype=np.int_)
if max_length is not None:
inputs["max_length"] = np.full(prompts.shape, max_length, dtype=np.int_)
with ModelClient(self.url, self.model_name, init_timeout_s=init_timeout) as client:
result_dict = client.infer_batch(**inputs)
output_type = client.model_config.outputs[0].dtype
if output_type == np.bytes_:
if "sentences" in result_dict.keys():
output = result_dict["sentences"]
else:
return "Unknown output keyword."
sentences = np.char.decode(output.astype("bytes"), "utf-8")
openai_response = {
"id": f"cmpl-{int(time.time())}",
"object": "text_completion",
"created": int(time.time()),
"model": self.model_name,
"choices": [{"text": sentences}],
}
if output_logits and "logits" in result_dict:
openai_response["logits"] = result_dict["logits"]
if output_scores and "scores" in result_dict:
openai_response["scores"] = result_dict["scores"]
return openai_response
else:
return result_dict["sentences"]
class NemoQueryLLM(NemoQueryLLMBase):
"""Sends a query to Triton for LLM inference.
Example:
from nemo_deploy import NemoQueryLLM
nq = NemoQueryLLM(url="localhost", model_name="GPT-2B")
prompts = ["hello, testing GPT inference", "another GPT inference test?"]
output = nq.query_llm(
prompts=prompts,
max_output_len=100,
top_k=1,
top_p=0.0,
temperature=0.0,
)
print("prompts: ", prompts)
"""
def query_llm(
self,
prompts,
stop_words_list=None,
bad_words_list=None,
no_repeat_ngram_size=None,
min_output_len=None,
max_output_len=None,
top_k=None,
top_p=None,
temperature=None,
random_seed=None,
lora_uids=None,
        use_greedy: Optional[bool] = None,
        repetition_penalty: Optional[float] = None,
        add_BOS: Optional[bool] = None,
        all_probs: Optional[bool] = None,
        compute_logprob: Optional[bool] = None,
end_strings=None,
init_timeout=60.0,
openai_format_response: bool = False,
output_context_logits: bool = False,
output_generation_logits: bool = False,
):
"""Query the Triton server synchronously and return a list of responses.
Args:
prompts (List(str)): list of sentences.
max_output_len (int): max generated tokens.
top_k (int): limits us to a certain number (K) of the top tokens to consider.
top_p (float): limits us to the top tokens within a certain probability mass (p).
temperature (float): A parameter of the softmax function, which is the last layer in the network.
random_seed (int): Seed to condition sampling.
stop_words_list (List(str)): list of stop words.
bad_words_list (List(str)): list of bad words.
no_repeat_ngram_size (int): no repeat ngram size.
init_timeout (flat): timeout for the connection.
openai_format_response: return response similar to OpenAI API format
output_generation_logits: return generation logits from model on PyTriton
"""
if not HAVE_TRITON:
raise UnavailableError(MISSING_TRITON_MSG)
prompts = str_list2numpy(prompts)
inputs = {"prompts": prompts}
        if min_output_len is not None:
            inputs["min_output_len"] = np.full(prompts.shape, min_output_len, dtype=np.int_)
if max_output_len is not None:
inputs["max_output_len"] = np.full(prompts.shape, max_output_len, dtype=np.int_)
if top_k is not None:
inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_)
if top_p is not None:
inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single)
if temperature is not None:
inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single)
if random_seed is not None:
inputs["random_seed"] = np.full(prompts.shape, random_seed, dtype=np.int_)
if stop_words_list is not None:
inputs["stop_words_list"] = str_list2numpy(stop_words_list)
if bad_words_list is not None:
inputs["bad_words_list"] = str_list2numpy(bad_words_list)
if no_repeat_ngram_size is not None:
inputs["no_repeat_ngram_size"] = np.full(prompts.shape, no_repeat_ngram_size, dtype=np.single)
if lora_uids is not None:
lora_uids = np.char.encode(lora_uids, "utf-8")
inputs["lora_uids"] = np.full((prompts.shape[0], len(lora_uids)), lora_uids)
if use_greedy is not None:
inputs["use_greedy"] = np.full(prompts.shape, use_greedy, dtype=np.bool_)
if repetition_penalty is not None:
inputs["repetition_penalty"] = np.full(prompts.shape, repetition_penalty, dtype=np.single)
if add_BOS is not None:
inputs["add_BOS"] = np.full(prompts.shape, add_BOS, dtype=np.bool_)
if all_probs is not None:
inputs["all_probs"] = np.full(prompts.shape, all_probs, dtype=np.bool_)
if compute_logprob is not None:
inputs["compute_logprob"] = np.full(prompts.shape, compute_logprob, dtype=np.bool_)
if end_strings is not None:
inputs["end_strings"] = str_list2numpy(end_strings)
if output_context_logits is not None:
inputs["output_context_logits"] = np.full(prompts.shape, output_context_logits, dtype=np.bool_)
if output_generation_logits is not None:
inputs["output_generation_logits"] = np.full(prompts.shape, output_generation_logits, dtype=np.bool_)
with ModelClient(self.url, self.model_name, init_timeout_s=init_timeout) as client:
result_dict = client.infer_batch(**inputs)
output_type = client.model_config.outputs[0].dtype
if output_type == np.bytes_:
if "outputs" in result_dict.keys():
output = result_dict["outputs"]
elif "sentences" in result_dict.keys():
output = result_dict["sentences"]
else:
return "Unknown output keyword."
sentences = np.char.decode(output.astype("bytes"), "utf-8")
if openai_format_response:
openai_response = {
"id": f"cmpl-{int(time.time())}",
"object": "text_completion",
"created": int(time.time()),
"model": self.model_name,
"choices": [{"text": sentences}],
}
if output_generation_logits:
openai_response["choices"][0]["generation_logits"] = result_dict["generation_logits"]
if output_context_logits:
openai_response["choices"][0]["context_logits"] = result_dict["context_logits"]
return openai_response
else:
return sentences
else:
return result_dict["outputs"]
class NemoQueryTRTLLMAPI(NemoQueryLLMBase):
"""Sends a query to Triton for TensorRT-LLM API deployment inference.
Example:
from nemo_deploy import NemoQueryTRTLLMAPI
nq = NemoQueryTRTLLMAPI(url="localhost", model_name="GPT-2B")
prompts = ["hello, testing GPT inference", "another GPT inference test?"]
output = nq.query_llm(
prompts=prompts,
max_length=100,
top_k=1,
top_p=None,
temperature=None,
)
print("prompts: ", prompts)
"""
def query_llm(
self,
prompts: List[str],
max_length: int = 256,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
temperature: Optional[float] = None,
init_timeout: float = 60.0,
):
"""
Query the Triton server synchronously and return a list of responses.
Args:
            prompts (List[str]): list of sentences.
            max_length (int): maximum number of generated tokens.
            top_k (int): restricts sampling to the K most likely tokens.
            top_p (float): restricts sampling to the top tokens within cumulative probability mass p.
            temperature (float): sampling temperature; scales the logits before the softmax.
            init_timeout (float): timeout in seconds for the connection.
Returns:
List[str]: A list of generated texts, one for each input prompt.
"""
        if not HAVE_TRITON:
            raise UnavailableError(MISSING_TRITON_MSG)
        prompts = str_list2numpy(prompts)
inputs = {
"prompts": prompts,
}
if max_length is not None:
inputs["max_length"] = np.full(prompts.shape, max_length, dtype=np.int_)
if temperature is not None:
inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single)
if top_k is not None:
inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_)
if top_p is not None:
inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single)
with ModelClient(self.url, self.model_name, init_timeout_s=init_timeout) as client:
result_dict = client.infer_batch(**inputs)
output_type = client.model_config.outputs[0].dtype
if output_type == np.bytes_:
if "sentences" in result_dict.keys():
output = result_dict["sentences"]
else:
return "Unknown output keyword."
sentences = np.char.decode(output.astype("bytes"), "utf-8")
return sentences
else:
return result_dict["sentences"]