Running Inference with Guardrails#

Inference Endpoints#

The microservice supports two endpoints for running inference with guardrails:

  • /v1/guardrail/chat/completions

  • /v1/guardrail/completions

These endpoints are OpenAI-compatible, with the addition of a guardrails field in the request body. This field is an object with the following subfields:

  • config_id: Specifies the ID of the guardrail configuration to apply.

  • config: Specifies the guardrail configuration content. Use this field to pass the guardrail configuration in the request body rather than specifying an existing config by ID.

  • options: Specifies additional guardrail options. For example, you can log the activated rails in the microservice logs:

    "guardrails": {
      "options": {
        "log": {
          "activated_rails": true
        }
      }
    }
    

    For more information about options, refer to the Configuration Guide.

  • return_choice: When set to true, the guardrail data is returned as an additional choice in the choices array with the role field set to guardrails_data. The default value is false. A combined example follows this list.

    This field is helpful when you use third-party clients to make requests to the NeMo Guardrails microservice. These clients typically don't return additional response fields that are not part of the OpenAI response format.
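
For example, the following minimal sketch combines the options and return_choice fields in a single request. It assumes the demo-self-check-input-output configuration and the environment variables used in the sections below, and that the non-streaming response mirrors the OpenAI chat completion schema shown in the example output.

import os
from nemo_microservices import NeMoMicroservices

client = NeMoMicroservices(
    base_url=os.environ["GUARDRAILS_BASE_URL"],
    inference_base_url=os.environ["NIM_BASE_URL"]
)
response = client.guardrail.chat.completions.create(
    model="meta/llama-3.1-8b-instruct",
    messages=[
        {"role": "user", "content": "what can you do?"}
    ],
    guardrails={
        "config_id": "demo-self-check-input-output",
        "return_choice": True,
        "options": {
            "log": {
                "activated_rails": True
            }
        }
    },
    stream=False
)
# With return_choice set to true, one entry in choices carries the
# guardrail data; its role is guardrails_data rather than assistant.
for choice in response.choices:
    print(choice)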

Chat Completions#

Choose one of the following options for running inference with chat completions.

Set up a NeMoMicroservices client instance using the base URL of the NeMo Guardrails microservice and perform the task as follows.

import os
from nemo_microservices import NeMoMicroservices

client = NeMoMicroservices(
    base_url=os.environ["GUARDRAILS_BASE_URL"],
    inference_base_url=os.environ["NIM_BASE_URL"]
)
response = client.guardrail.chat.completions.create(
    model="meta/llama-3.1-8b-instruct",
    messages=[
        {"role": "user", "content": "what can you do?"}
    ],
    guardrails={
        "config_id": "demo-self-check-input-output",
    },
    stream=True
)
for chunk in response:
    print(chunk)
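
With stream=True, the client returns an iterator of response chunks. Set stream=False to receive the complete response, including the guardrails_data field, as a single object.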

Make a POST request to the /v1/guardrail/chat/completions endpoint.

curl -X POST "${GUARDRAILS_BASE_URL}/v1/guardrail/chat/completions" \
  -H "Accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta/llama-3.3-70b-instruct",
    "messages": [
      {"role": "user", "content": "You are stupid" }
    ],
    "guardrails": {
      "config_id": "demo-self-check-input-output"
    },
    "stream": false,
    "top_p": 1
}' | jq
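
You can also use the OpenAI Python client by setting the base URL to the /v1/guardrail path of the microservice and passing the model API key in the X-Model-Authorization header.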
import os

from openai import OpenAI

x_model_authorization = {"X-Model-Authorization": os.environ["NVIDIA_API_KEY"]}

url = f"{os.environ['GUARDRAILS_BASE_URL']}/v1/guardrail"

# The api_key argument is required by the OpenAI client, but authentication
# is provided through the X-Model-Authorization header in default_headers.
client = OpenAI(
    base_url=url,
    api_key="dummy-value",
    default_headers=x_model_authorization,
)

stream = client.chat.completions.create(
    model = "meta/llama-3.3-70b-instruct",
    messages = [
        {
            "role": "user",
            "content": "Tell me about Cape Hatteras National Seashore in 50 words or less."
        }
    ],
    extra_body = {
        "guardrails": {
            "config_id": "demo-self-check-input-output"
        },
    },
    max_tokens=200,
    stream=True
)

for chunk in stream:
    if chunk.choices[0].delta.content is not None:
        # Add a check if content includes {"error": {"message": "Blocked by <rail-name>"...
        print(chunk.choices[0].delta.content, end="", flush=True)
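
Alternatively, use the ChatOpenAI class from LangChain with the same base URL, authorization header, and extra_body arguments.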
import os

from langchain_openai import ChatOpenAI

x_model_authorization = {"X-Model-Authorization": os.environ["NVIDIA_API_KEY"]}

model = ChatOpenAI(
    model_name = "meta/llama-3.3-70b-instruct",
    openai_api_base = f"{os.environ['GUARDRAILS_BASE_URL']}/v1/guardrail",
    api_key = "dummy-value",
    default_headers = x_model_authorization,
    extra_body = {
        "guardrails": {
            "config_id": "demo-self-check-input-output"
        }
    },
    max_tokens=200
)

for chunk in model.stream("Tell me about Cape Hatteras National Seashore in 50 words or less."):
    print(chunk.content, end="", flush=True)
Example Output
{
  "id": "chatcmpl-51246072-ea4b-4ff2-9a73-dfb0e531ab42",
  "object": "chat.completion",
  "created": 1748352892,
  "choices": [
    {
      "index": 0,
      "finish_reason": null,
      "logprobs": null,
      "message": {
        "role": "assistant",
        "content": "I'm sorry, I can't respond to that."
      }
    }
  ],
  "system_fingerprint": null,
  "guardrails_data": {
    "llm_output": null,
    "config_ids": [
      "demo-self-check-input-output"
    ],
    "output_data": null,
    "log": null
  }
}
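
In this output, the guardrails_data field lists the configuration IDs applied to the request. The log field remains null unless you request logging details, such as activated rails, through the options field of the guardrails object.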

If your response is not similar to the example, refer to Troubleshooting NeMo Guardrails for assistance.

Completions#

Choose one of the following options for running inference with completions.

Note

The https://integrate.api.nvidia.com/v1 endpoint provided by the NVIDIA API Catalog does not support the completions endpoint.

Set up a NeMoMicroservices client instance using the base URL of the NeMo Guardrails microservice and perform the task as follows.

import os
from nemo_microservices import NeMoMicroservices

client = NeMoMicroservices(
    base_url=os.environ["GUARDRAILS_BASE_URL"],
    inference_base_url=os.environ["NIM_BASE_URL"]
)
response = client.guardrail.completions.create(
    model="meta/llama-3.1-8b-instruct",
    prompt="Tell me about Cape Hatteras National Seashore in 50 words or less.",
    guardrails={
        "config_id": "demo-self-check-input-output"
    },
    temperature=1,
    max_tokens=100,
    stream=False
)
print(response)
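
Because the call above sets stream=False, the returned object holds the complete response. A minimal follow-up, assuming the object mirrors the OpenAI completions schema shown in the example output below:

print(response.choices[0].text)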

Make a POST request to the /v1/guardrail/completions endpoint.

curl -X POST "${GUARDRAILS_BASE_URL}/v1/guardrail/completions" \
  -H "Accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta/llama-3.1-8b-instruct",
    "prompt": "Tell me about Cape Hatteras National Seashore in 50 words or less.",
    "guardrails": {
      "config_id": "demo-self-check-input-output"
    },
    "temperature": 1,
    "max_tokens": 100,
    "stream": false
}' | jq
Example Output
{
    "id": "cmpl-9f0442ed634942f899234646f3e65fa9",
    "object": "text_completion",
    "created": 1743772306,
    "model": "meta/llama-3.1-8b-instruct",
    "choices": [
      {
        "index": 0,
        "text": " \nCape Hatteras National Seashore in North Carolina protects a 72-mile stretch of barrier island, coasts, and federal recreation lands. It offers beaches, camping, fishing, kayaking, and surfing as well as historic lighthouses, wild horses, and opportunities for Natural Bridge National Wildlife Refuge's Atlantic yellow-nosed sea turtles nesting site.\n\nPeople have been visiting this beautiful area for thousands of years, but human activity has increased significantly since 1999 due to rentals properties,",
        "logprobs": null,
        "finish_reason": "length",
        "stop_reason": null,
        "prompt_logprobs": null
      }
    ],
    "usage": {
      "prompt_tokens": 19,
      "total_tokens": 119,
      "completion_tokens": 100
    },
    "guardrails_data": {
      "llm_output": null,
      "config_ids": [
        "demo-self-check-output"
      ],
      "output_data": null,
      "log": null
    }
  }