Use Llama Stack API#
This tutorial shows how you can use the Llama Stack APIs to achieve the same end-to-end workflow introduced in the previous tutorials.
Prerequisites#
Before you begin, complete the following:
A meta-llama/llama-3.2-1B-Instruct NIM microservice deployed to your environment, either through Helm deployment or by following the instructions in the NIM Deployment tutorial.
Llama Stack running in your environment. For more information, see Running Llama Stack with NVIDIA in the Llama Stack documentation.
Set Up Environment Variables#
Set up environment variables for using the NeMo microservices through the Llama Stack APIs.
For a complete list of environment variables, see Environment Variables for NVIDIA Distribution in the Llama Stack documentation.
For BASE_MODEL, use a model from the available models for NVIDIA Distribution in the Llama Stack documentation.
Python Example: Environment Setup
# (Required) NeMo microservices URLs
NDS_URL = "http://data-store.test" # Data Store
NEMO_URL = "http://nemo.test" # Customizer, Evaluator, Guardrails
NIM_URL = "http://nim.test" # NIM
# (Required) Base model alias
BASE_MODEL = "meta-llama/llama-3.2-1B-Instruct"
# (Required) Hugging Face Token
HF_TOKEN = ""
# (Optional) NeMo Entity Store namespace and project
NAMESPACE = "llamastack-e2e-notebook"
PROJECT_ID = ""
CUSTOMIZED_MODEL_DIR = "llamastack-e2e-notebook/customized-model@v1"
import os

# Export the settings as environment variables read by the Llama Stack NVIDIA distribution.
os.environ["NVIDIA_DATASET_NAMESPACE"] = NAMESPACE
os.environ["NVIDIA_PROJECT_ID"] = PROJECT_ID
os.environ["NVIDIA_BASE_URL"] = NIM_URL
os.environ["NVIDIA_DATASETS_URL"] = NEMO_URL
os.environ["NVIDIA_CUSTOMIZER_URL"] = NEMO_URL
os.environ["NVIDIA_OUTPUT_MODEL_DIR"] = CUSTOMIZED_MODEL_DIR
os.environ["NVIDIA_EVALUATOR_URL"] = NEMO_URL
os.environ["GUARDRAILS_SERVICE_URL"] = NEMO_URL
Initialize the Llama Stack Client#
After setting environment variables, initialize the Llama Stack client for use in subsequent steps.
Python Example: Initialize the Llama Stack Client
from llama_stack.distribution.library_client import LlamaStackAsLibraryClient
client = LlamaStackAsLibraryClient("nvidia")
client.initialize()
Register Datasets to NeMo Entity Store#
Upload the dataset to NeMo Data Store using the Hugging Face CLI or Hub client, then register it with NeMo Entity Store using the Llama Stack client, as shown in the two examples below.
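The upload itself goes through the Hugging Face-compatible API that NeMo Data Store exposes. The following is a minimal sketch, assuming the Data Store serves that API at NDS_URL/v1/hf; the dataset name, repository name, and local file paths are illustrative and should match the data you prepared in the earlier tutorials.
Python Example: Upload Dataset to NeMo Data Store
from huggingface_hub import HfApi

# Illustrative names; align these with your own dataset and namespace.
sample_squad_dataset_name = "sample-squad-data"
repo_id = f"{NAMESPACE}/{sample_squad_dataset_name}"

# Point the Hugging Face Hub client at the NeMo Data Store endpoint (assumed to be NDS_URL/v1/hf).
hf_api = HfApi(endpoint=f"{NDS_URL}/v1/hf", token=HF_TOKEN)

# Create the dataset repository and upload the prepared training and testing splits.
hf_api.create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
hf_api.upload_file(
    path_or_fileobj="./tmp/sample_squad_data/training/training.jsonl",
    path_in_repo="training/training.jsonl",
    repo_id=repo_id,
    repo_type="dataset",
)
hf_api.upload_file(
    path_or_fileobj="./tmp/sample_squad_data/testing/testing.jsonl",
    path_in_repo="testing/testing.jsonl",
    repo_id=repo_id,
    repo_type="dataset",
)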
Python Example: Register Dataset
response = client.datasets.register(
purpose="post-training/messages",
dataset_id=sample_squad_dataset_name,
source={"type": "uri", "uri": f"hf://datasets/{repo_id}"},
metadata={
"format": "json",
"description": "Test dataset for Llama Stack",
"provider_id": "nvidia",
},
)
print(response)
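Optionally, confirm the registration by listing the datasets the client knows about. This is a minimal sketch, assuming the datasets list endpoint is available in your Llama Stack client version.
Python Example: List Registered Datasets
# Sanity check: the newly registered dataset should appear in this list.
datasets = client.datasets.list()
print(datasets)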
Customize (Fine-Tune) the Model#
Run customization jobs using the Llama Stack client as follows.
Python Example: Start Customization Job
response = client.post_training.supervised_fine_tune(
job_uuid="",
model="meta/llama-3.2-1b-instruct@v1.0.0+A100",
training_config={
"n_epochs": 2,
"data_config": {
"batch_size": 16,
"dataset_id": sample_squad_dataset_name,
},
"optimizer_config": {
"lr": 0.0001,
},
},
algorithm_config={
"type": "LoRA",
"adapter_dim": 16,
"adapter_dropout": 0.1,
"alpha": 16,
"rank": 8,
"lora_attn_modules": [],
"apply_lora_to_mlp": True,
"apply_lora_to_output": False,
},
hyperparam_search_config={},
logger_config={},
checkpoint_dir="",
)
job_id = response.job_uuid
print(f"Created job with ID: {job_id}")
Evaluate Models#
Run evaluation jobs using the Llama Stack client as follows.
Python Example: Run Evaluation
benchmark_id = "test-eval-config"
simple_eval_config = {
"benchmark_id": benchmark_id,
"dataset_id": repo_id,
"scoring_functions": [],
"metadata": {
"type": "custom",
"params": {"parallelism": 8},
"tasks": {
"qa": {
"type": "completion",
"params": {
"template": {
"prompt": "{{prompt}}",
"max_tokens": 20,
"temperature": 0.7,
"top_p": 0.9,
},
},
"dataset": {"files_url": f"hf://datasets/{repo_id}/testing/testing.jsonl"},
"metrics": {
"bleu": {
"type": "bleu",
"params": {"references": ["{{ideal_response}}"]},
},
"string-check": {
"type": "string-check",
"params": {"check": ["{{ideal_response | trim}}", "equals", "{{output_text | trim}}"]},
},
},
}
},
},
}
response = client.benchmarks.register(
benchmark_id=benchmark_id,
dataset_id=repo_id,
scoring_functions=simple_eval_config["scoring_functions"],
metadata=simple_eval_config["metadata"],
)
response = client.eval.run_eval(
benchmark_id=benchmark_id,
benchmark_config={"eval_candidate": {"type": "model", "model": BASE_MODEL, "sampling_params": {}}},
)
job_id = response.model_dump()["job_id"]
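The evaluation also runs as a job. Once it finishes, you can check its status and retrieve the scored results. This is a minimal sketch, assuming the eval jobs sub-resource and these parameter names are available in your Llama Stack client version.
Python Example: Check Evaluation Job and Results
# Check the evaluation job status, then retrieve the results after it completes.
job_status = client.eval.jobs.status(benchmark_id=benchmark_id, job_id=job_id)
print(f"Evaluation job status: {job_status}")

job_results = client.eval.jobs.retrieve(benchmark_id=benchmark_id, job_id=job_id)
print(f"Evaluation results: {job_results}")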
Add Safety Checks with Guardrails#
Register a shield for the deployed NIM microservice and run a safety check using Guardrails.
Python Example: Register Shield and Run Safety Check
shield_id = "meta/llama-3.2-1b-instruct"
client.shields.register(shield_id=shield_id, provider_id="nvidia")
message = {"role": "user", "content": "You are stupid."}
response = client.safety.run_shield(messages=[message], shield_id=shield_id, params={})
print(f"Safety response: {response}")
assert response.violation.user_message == "Sorry I cannot do this."
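For contrast, a message that does not violate the policy should pass the shield without a violation. The following follow-up uses the same run_shield call; the sample prompt is illustrative.
Python Example: Safety Check on a Benign Message
# A benign message is expected to pass the shield without triggering a violation.
safe_message = {"role": "user", "content": "What is the capital of France?"}
safe_response = client.safety.run_shield(messages=[safe_message], shield_id=shield_id, params={})
print(f"Safety response: {safe_response}")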
Run Inference on Deployed NIMs#
Run inference on deployed NIMs using the Llama Stack client as follows.
Python Example: Inference
import json

# Load the prepared test split and pick a sample prompt for inference.
with open("./tmp/sample_squad_data/testing/testing.jsonl", "r") as f:
    examples = [json.loads(line) for line in f]
sample_prompt = examples[-1]["prompt"]
response = client.inference.chat_completion(
messages=[{"role": "user", "content": sample_prompt}],
model_id=BASE_MODEL,
sampling_params={"max_tokens": 20, "strategy": {"type": "top_p", "temperature": 0.7, "top_p": 0.9}},
)
print(f"Inference response: {response.completion_message.content}")