# Running Inference with Guardrails

## Inference Endpoints
The microservice supports two endpoints for running inference with guardrails:

- `/v1/guardrail/chat/completions`
- `/v1/guardrail/completions`

These endpoints are OpenAI compatible, with the addition of the `guardrails` field in the request body.
The `guardrails` field is a dictionary with the following fields, combined in the sketch after this list:

- `config_id`: Specifies the ID of the guardrail configuration to apply.
- `config`: Specifies the guardrail configuration content. Use this field to pass the guardrail configuration in the request body rather than specifying an existing config by ID.
- `options`: Specifies additional guardrail options. For example, you can log the activated rails in the microservice logs: `"guardrails": { "options": { "log": { "activated_rails": true } } }`. For more information about options, refer to the Configuration Guide.
- `return_choice`: When set to `true`, the guardrail data is returned as a choice in the `choices` array with the `role` field set to `guardrails_data`. The default value is `false`. This field is helpful when you use third-party clients to make requests to the NeMo Guardrails microservice. These clients typically don’t forward back additional response fields that are not part of the OpenAI response format.
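The following request body is a minimal sketch that combines these fields for the chat completions endpoint. The model name, message, and `config_id` are placeholder values taken from the examples below; only `config_id` is shown because `config` is the alternative for passing inline configuration content.

```json
{
  "model": "meta/llama-3.3-70b-instruct",
  "messages": [
    {"role": "user", "content": "Tell me about Cape Hatteras National Seashore in 50 words or less."}
  ],
  "guardrails": {
    "config_id": "demo-self-check-input-output",
    "options": {
      "log": {
        "activated_rails": true
      }
    },
    "return_choice": true
  }
}
```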
## Chat Completions

Perform a POST request to the `/v1/guardrail/chat/completions` endpoint.

```bash
curl -X POST "${GUARDRAILS_BASE_URL}/v1/guardrail/chat/completions" \
  -H "Accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta/llama-3.3-70b-instruct",
    "messages": [
      {"role": "user", "content": "You are stupid"}
    ],
    "guardrails": {
      "config_id": "demo-self-check-input-output"
    },
    "stream": false,
    "top_p": 1
  }' | jq
```
The same request with the Python `requests` library:

```python
import os
import json

import requests

url = f"{os.environ['GUARDRAILS_BASE_URL']}/v1/guardrail/chat/completions"
headers = {"Accept": "application/json", "Content-Type": "application/json"}
data = {
    "model": "meta/llama-3.3-70b-instruct",
    "messages": [
        {"role": "user", "content": "You are stupid"}
    ],
    "guardrails": {
        "config_id": "demo-self-check-input-output",
    },
    "top_p": 1
}

response = requests.post(url, headers=headers, json=data)
print(json.dumps(response.json(), indent=2))
```
With the OpenAI Python SDK, point the client at the `/v1/guardrail` base URL:

```python
import os

from openai import OpenAI

x_model_authorization = {"X-Model-Authorization": os.environ["NVIDIA_API_KEY"]}

url = f"{os.environ['GUARDRAILS_BASE_URL']}/v1/guardrail"

# The api_key argument is required by the SDK, but the credential is
# specified in the default_headers argument.
client = OpenAI(
    base_url=url,
    api_key="dummy-value",
    default_headers=x_model_authorization,
)

stream = client.chat.completions.create(
    model="meta/llama-3.3-70b-instruct",
    messages=[
        {
            "role": "user",
            "content": "Tell me about Cape Hatteras National Seashore in 50 words or less."
        }
    ],
    extra_body={
        "guardrails": {
            "config_id": "demo-self-check-input-output"
        },
    },
    max_tokens=200,
    stream=True
)

for chunk in stream:
    if chunk.choices[0].delta.content is not None:
        # Consider checking whether the content includes
        # {"error": {"message": "Blocked by <rail-name>" ...}} before printing.
        print(chunk.choices[0].delta.content, end="", flush=True)
```
With LangChain, configure `ChatOpenAI` with the same base URL and headers:

```python
import os

from langchain_openai import ChatOpenAI

x_model_authorization = {"X-Model-Authorization": os.environ["NVIDIA_API_KEY"]}

model = ChatOpenAI(
    model_name="meta/llama-3.3-70b-instruct",
    openai_api_base=f"{os.environ['GUARDRAILS_BASE_URL']}/v1/guardrail",
    api_key="dummy-value",
    default_headers=x_model_authorization,
    extra_body={
        "guardrails": {
            "config_id": "demo-self-check-input-output"
        }
    },
    max_tokens=200
)

for chunk in model.stream("Tell me about Cape Hatteras National Seashore in 50 words or less."):
    print(chunk.content, end="", flush=True)
```
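The comment in the OpenAI SDK example suggests checking the streamed content for a blocked response. The following helper is a minimal sketch of such a check, under the assumption that a blocked request surfaces in the streamed content as a JSON object of the form `{"error": {"message": "Blocked by <rail-name>", ...}}`; adapt the check to the payload your deployment actually returns.

```python
import json

def print_stream_with_guardrail_check(stream):
    """Print streamed content and flag a guardrail block afterward.

    Assumption: a blocked request arrives in the streamed content as a JSON
    object of the form {"error": {"message": "Blocked by <rail-name>", ...}}.
    """
    buffered = ""
    for chunk in stream:
        content = chunk.choices[0].delta.content
        if content is None:
            continue
        buffered += content
        print(content, end="", flush=True)

    # If the accumulated text parses as a JSON error object, report the block.
    try:
        payload = json.loads(buffered)
    except json.JSONDecodeError:
        return  # Regular completion text; nothing else to do.
    error = payload.get("error") if isinstance(payload, dict) else None
    message = error.get("message", "") if isinstance(error, dict) else ""
    if message.startswith("Blocked by"):
        print(f"\nRequest was blocked by a guardrail: {message}")
```

Call it with the stream object from the OpenAI SDK example: `print_stream_with_guardrail_check(stream)`.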
Example Output

```json
{
  "id": "chatcmpl-51246072-ea4b-4ff2-9a73-dfb0e531ab42",
  "object": "chat.completion",
  "created": 1748352892,
  "choices": [
    {
      "index": 0,
      "finish_reason": null,
      "logprobs": null,
      "message": {
        "role": "assistant",
        "content": "I'm sorry, I can't respond to that."
      }
    }
  ],
  "system_fingerprint": null,
  "guardrails_data": {
    "llm_output": null,
    "config_ids": [
      "demo-self-check-input-output"
    ],
    "output_data": null,
    "log": null
  }
}
```
If your response is not similar to the example, refer to Troubleshooting NeMo Guardrails for assistance.
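Continuing from the Python `requests` example, the following sketch inspects the response for the guardrail metadata. It assumes the response shape shown in the example output; whether the `log` field is populated when logging options are enabled is an assumption here, noted in the comments.

```python
# Minimal sketch: inspect the chat completion response for guardrail metadata.
# Assumes `response` is the requests.Response object from the example above
# and that the body matches the example output shape.
body = response.json()

message = body["choices"][0]["message"]["content"]
guardrails_data = body.get("guardrails_data", {})

print("Model message:", message)
print("Applied configurations:", guardrails_data.get("config_ids"))

# Assumption: when "options": {"log": {"activated_rails": true}} is sent,
# the log field may carry details about the rails that ran; it is null in
# the example output above.
if guardrails_data.get("log") is not None:
    print("Guardrail log:", guardrails_data["log"])
```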
## Completions
Perform a POST request to the `/v1/guardrail/completions` endpoint.

> **Note:** The https://integrate.api.nvidia.com/v1 endpoint provided by the NVIDIA API Catalog does not support the completions endpoint.

```bash
curl -X POST "${GUARDRAILS_BASE_URL}/v1/guardrail/completions" \
  -H "Accept: application/json" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta/llama-3.1-8b-instruct",
    "prompt": "Tell me about Cape Hatteras National Seashore in 50 words or less.",
    "guardrails": {
      "config_id": "demo-self-check-input-output"
    },
    "temperature": 1,
    "max_tokens": 100,
    "stream": false
  }' | jq
```
The same request with the Python `requests` library:

```python
import os
import json

import requests

url = f"{os.environ['GUARDRAILS_BASE_URL']}/v1/guardrail/completions"
headers = {"Accept": "application/json", "Content-Type": "application/json"}
data = {
    "model": "meta/llama-3.1-8b-instruct",
    "prompt": "Tell me about Cape Hatteras National Seashore in 50 words or less.",
    "guardrails": {
        "config_id": "demo-self-check-input-output"
    },
    "temperature": 1,
    "max_tokens": 100,
    "stream": False
}

response = requests.post(url, headers=headers, json=data)
print(json.dumps(response.json(), indent=2))
```
Example Output

```json
{
  "id": "cmpl-9f0442ed634942f899234646f3e65fa9",
  "object": "text_completion",
  "created": 1743772306,
  "model": "meta/llama-3.1-8b-instruct",
  "choices": [
    {
      "index": 0,
      "text": " \nCape Hatteras National Seashore in North Carolina protects a 72-mile stretch of barrier island, coasts, and federal recreation lands. It offers beaches, camping, fishing, kayaking, and surfing as well as historic lighthouses, wild horses, and opportunities for Natural Bridge National Wildlife Refuge's Atlantic yellow-nosed sea turtles nesting site.\n\nPeople have been visiting this beautiful area for thousands of years, but human activity has increased significantly since 1999 due to rentals properties,",
      "logprobs": null,
      "finish_reason": "length",
      "stop_reason": null,
      "prompt_logprobs": null
    }
  ],
  "usage": {
    "prompt_tokens": 19,
    "total_tokens": 119,
    "completion_tokens": 100
  },
  "guardrails_data": {
    "llm_output": null,
    "config_ids": [
      "demo-self-check-input-output"
    ],
    "output_data": null,
    "log": null
  }
}
```