Running Inference with Guardrails#

Inference Endpoints#

The microservice supports two endpoints for running inference with guardrails:

  • /v1/guardrail/chat/completions

  • /v1/guardrail/completions

These endpoints are OpenAI-compatible, with the addition of a guardrails field in the request body. The guardrails field is a dictionary with the following fields:

  • config_id: Specifies the ID of the guardrail configuration to apply.

  • config: Specifies the guardrail configuration content. Use this field to pass the guardrail configuration inline in the request body rather than specifying an existing config by ID, as shown in the sketch after this list.

  • options: Specifies additional guardrail options. For example, you can log the activated rails in the microservice logs:

    "guardrails": {
      "options": {
        "log": {
          "activated_rails": true
        }
      }
    }
    

    For more information about options, refer to the Configuration Guide.

  • return_choice: When set to true, the guardrail data is returned as a choice in the choices array with the role field set to guardrails_data. The default value is false.

    This field is helpful when you use third-party clients to make requests to the NeMo Guardrails microservice. These clients typically don't forward additional response fields that are not part of the OpenAI response format; see the response sketch after this list.
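If you pass the configuration inline through the config field, the value follows the guardrail configuration format. A minimal sketch of the request-body fragment (assumption: the rails and prompt shown here are illustrative, not a complete or validated configuration):

    "guardrails": {
      "config": {
        "rails": {
          "input": {
            "flows": ["self check input"]
          }
        },
        "prompts": [
          {
            "task": "self_check_input",
            "content": "Is the following user input safe to respond to? ..."
          }
        ]
      }
    }

With "return_choice": true, the guardrail data arrives as an extra entry in the choices array instead of a top-level guardrails_data field. A hedged sketch of the shape (field names assumed from the guardrails_data object in the example output below):

    "choices": [
      { "index": 0, "message": { "role": "assistant", "content": "..." } },
      { "index": 1, "message": { "role": "guardrails_data", "content": "{ \"config_ids\": [\"demo-self-check-input-output\"], ... }" } }
    ]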

Chat Completions#

  • Perform a POST request to the /v1/guardrail/chat/completions endpoint.

    curl -X POST "${GUARDRAILS_BASE_URL}/v1/guardrail/chat/completions" \
      -H "Accept: application/json" \
      -H "Content-Type: application/json" \
      -d '{
        "model": "meta/llama-3.3-70b-instruct",
        "messages": [
          {"role": "user", "content": "You are stupid" }
        ],
        "guardrails": {
          "config_id": "demo-self-check-input-output"
        },
        "stream": false,
        "top_p": 1
    }' | jq
    
    The same request using the Python requests library:

    import os
    import json
    import requests
    
    url = f"{os.environ['GUARDRAILS_BASE_URL']}/v1/guardrail/chat/completions"
    
    headers = {"Accept": "application/json", "Content-Type": "application/json"}
    
    data = {
        "model": "meta/llama-3.3-70b-instruct",
        "messages": [
            {"role": "user", "content": "You are stupid"}
        ],
        "guardrails": {
            "config_id": "demo-self-check-input-output",
        },
        "top_p": 1
    }
    
    response = requests.post(url, headers=headers, json=data)
    print(json.dumps(response.json(), indent=2))
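    In addition to the standard OpenAI fields, the response body carries the guardrails_data extension shown in the example output below. A short follow-on sketch for inspecting it:

    body = response.json()
    # guardrails_data is a microservice extension to the OpenAI response format.
    guardrails_data = body.get("guardrails_data", {})
    print("Applied configs:", guardrails_data.get("config_ids"))
    # If you set options.log.activated_rails in the request, the log field may
    # carry the activated rails; it is null otherwise (assumption based on the
    # options described above).
    print("Log:", guardrails_data.get("log"))
    # The assistant reply stays in the standard choices array.
    print("Reply:", body["choices"][0]["message"]["content"])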
    
    You can also use the OpenAI Python SDK by pointing the client at the microservice, which exposes an OpenAI-compatible API. This example streams the response:

    import os

    from openai import OpenAI

    x_model_authorization = {"X-Model-Authorization": os.environ["NVIDIA_API_KEY"]}

    url = f"{os.environ['GUARDRAILS_BASE_URL']}/v1/guardrail"

    # The api_key argument is required by the client, but authentication is
    # actually passed through the X-Model-Authorization header.
    client = OpenAI(
        base_url=url,
        api_key="dummy-value",
        default_headers=x_model_authorization,
    )
    
    stream = client.chat.completions.create(
        model="meta/llama-3.3-70b-instruct",
        messages=[
            {
                "role": "user",
                "content": "Tell me about Cape Hatteras National Seashore in 50 words or less."
            }
        ],
        extra_body={
            "guardrails": {
                "config_id": "demo-self-check-input-output"
            },
        },
        max_tokens=200,
        stream=True
    )

    for chunk in stream:
        content = chunk.choices[0].delta.content
        if content is not None:
            # When a rail blocks the response mid-stream, the content includes an
            # error payload such as {"error": {"message": "Blocked by <rail-name>"...
            if '"error"' in content and "Blocked by" in content:
                print(f"\nGuardrail triggered: {content}")
                break
            print(content, end="", flush=True)
    
    Or use LangChain's ChatOpenAI client:

    import os

    from langchain_openai import ChatOpenAI

    x_model_authorization = {"X-Model-Authorization": os.environ["NVIDIA_API_KEY"]}

    model = ChatOpenAI(
        model_name="meta/llama-3.3-70b-instruct",
        openai_api_base=f"{os.environ['GUARDRAILS_BASE_URL']}/v1/guardrail",
        api_key="dummy-value",
        default_headers=x_model_authorization,
        extra_body={
            "guardrails": {
                "config_id": "demo-self-check-input-output"
            }
        },
        max_tokens=200
    )

    for chunk in model.stream("Tell me about Cape Hatteras National Seashore in 50 words or less."):
        print(chunk.content, end="", flush=True)
    
    Example Output
    {
      "id": "chatcmpl-51246072-ea4b-4ff2-9a73-dfb0e531ab42",
      "object": "chat.completion",
      "created": 1748352892,
      "choices": [
        {
          "index": 0,
          "finish_reason": null,
          "logprobs": null,
          "message": {
            "role": "assistant",
            "content": "I'm sorry, I can't respond to that."
          }
        }
      ],
      "system_fingerprint": null,
      "guardrails_data": {
        "llm_output": null,
        "config_ids": [
          "demo-self-check-input-output"
        ],
        "output_data": null,
        "log": null
      }
    }
    

    If your response is not similar to the example, refer to Troubleshooting NeMo Guardrails for assistance.

Completions#

  • Perform a POST request to the /v1/guardrail/completions endpoint.

    Note

    The https://integrate.api.nvidia.com/v1 endpoint provided by the NVIDIA API Catalog does not support the completions endpoint.

    curl -X POST "${GUARDRAILS_BASE_URL}/v1/guardrail/completions" \
      -H "Accept: application/json" \
      -H "Content-Type: application/json" \
      -d '{
        "model": "meta/llama-3.1-8b-instruct",
        "prompt": "Tell me about Cape Hatteras National Seashore in 50 words or less.",
        "guardrails": {
          "config_id": "demo-self-check-input-output"
        },
        "temperature": 1,
        "max_tokens": 100,
        "stream": false
    }' | jq
    
    The same request using the Python requests library:

    import os
    import json
    import requests
    
    url = f"{os.environ['GUARDRAILS_BASE_URL']}/v1/guardrail/completions"
    
    headers = {"Accept": "application/json", "Content-Type": "application/json"}
    
    data = {
        "model": "meta/llama-3.1-8b-instruct",
        "prompt": "Tell me about Cape Hatteras National Seashore in 50 words or less.",
        "guardrails": {
            "config_id": "demo-self-check-input-output"
        },
        "temperature": 1,
        "max_tokens": 100,
        "stream": False
    }
    
    response = requests.post(url, headers=headers, json=data)
    print(json.dumps(response.json(), indent=2))
    
    Example Output
    {
      "id": "cmpl-9f0442ed634942f899234646f3e65fa9",
      "object": "text_completion",
      "created": 1743772306,
      "model": "meta/llama-3.1-8b-instruct",
      "choices": [
        {
          "index": 0,
          "text": " \nCape Hatteras National Seashore in North Carolina protects a 72-mile stretch of barrier island, coasts, and federal recreation lands. It offers beaches, camping, fishing, kayaking, and surfing as well as historic lighthouses, wild horses, and opportunities for Natural Bridge National Wildlife Refuge's Atlantic yellow-nosed sea turtles nesting site.\n\nPeople have been visiting this beautiful area for thousands of years, but human activity has increased significantly since 1999 due to rentals properties,",
          "logprobs": null,
          "finish_reason": "length",
          "stop_reason": null,
          "prompt_logprobs": null
        }
      ],
      "usage": {
        "prompt_tokens": 19,
        "total_tokens": 119,
        "completion_tokens": 100
      },
      "guardrails_data": {
        "llm_output": null,
        "config_ids": [
          "demo-self-check-input-output"
        ],
        "output_data": null,
        "log": null
      }
    }
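
    The completions endpoint also works with the OpenAI Python SDK. A minimal sketch, assuming the same client setup as the chat completions example above:

    import os

    from openai import OpenAI

    # Same OpenAI-compatible client setup as in the chat completions example.
    client = OpenAI(
        base_url=f"{os.environ['GUARDRAILS_BASE_URL']}/v1/guardrail",
        api_key="dummy-value",
        default_headers={"X-Model-Authorization": os.environ["NVIDIA_API_KEY"]},
    )

    completion = client.completions.create(
        model="meta/llama-3.1-8b-instruct",
        prompt="Tell me about Cape Hatteras National Seashore in 50 words or less.",
        # guardrails is not part of the OpenAI schema, so pass it through extra_body.
        extra_body={
            "guardrails": {
                "config_id": "demo-self-check-input-output"
            }
        },
        max_tokens=100,
    )
    print(completion.choices[0].text)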