NeMo Safe Synthesizer Python SDK#
Learn how to use the NeMo Safe Synthesizer Python SDK to run private synthetic data jobs through REST API access.
Installation#
pip install nemo-microservices[safe-synthesizer]
Client Initialization#
from nemo_microservices import NeMoMicroservices
# Local Docker deployment
client = NeMoMicroservices(
base_url="http://localhost:8080"
)
# Production deployment
client = NeMoMicroservices(
base_url="https://your-nemo-platform.com"
)
NeMo Safe Synthesizer API#
The NeMo Safe Synthesizer SDK provides REST API access for private synthetic data jobs:
Basic PII Redaction#
Note
Before you start, make sure that you have:
Stored your CSV files locally
Uploaded them to the data store using the following commands:
export HF_ENDPOINT="http://localhost:3000/v1/hf"
huggingface-cli upload --repo-type dataset default/safe-synthesizer your-uploaded-dataset.csv
import pandas as pd
# Load your sensitive dataset locally for inspection; the job itself
# reads the copy uploaded to the data store
df = pd.read_csv("sensitive-data.csv")
# Create PII redaction job using REST API
job_request = {
"name": "pii-redaction",
"project": "default",
"spec": {
"data_source": "hf://datasets/default/safe-synthesizer/your-uploaded-dataset.csv",
"config": {
"enable_synthesis": False,
"enable_replace_pii": True,
"replace_pii": {
"globals": {"locales": ["en_US"]},
"steps": [{"rows": {"update": [{"entity": ["email", "phone_number"], "value": "column.entity | fake"}]}}]
}
}
}
}
job = client.beta.safe_synthesizer.jobs.create(**job_request)
# Wait for the job to complete; see the Status Monitoring section below
# Access results using REST API
results = client.beta.safe_synthesizer.jobs.results.list(job.id)
# data = client.beta.safe_synthesizer.jobs.results.download(result_id, job_id=job.id)
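The download call is shown commented out because result IDs come from the list call. If you want to work with a downloaded tabular result in pandas, a minimal sketch follows; it assumes the download returns raw CSV bytes, which may differ from the actual SDK return type:
import io

# Hypothetical usage: pick the first listed result and load it as CSV.
# Assumes `download` returns raw bytes; adjust if the SDK returns a stream.
data = client.beta.safe_synthesizer.jobs.results.download(
    result_id=results[0].id,
    job_id=job.id,
)
redacted_df = pd.read_csv(io.BytesIO(data))
print(redacted_df.head())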
Complete NeMo Safe Synthesizer Job#
import pandas as pd
# Load your sensitive dataset locally for inspection; the job itself
# reads the copy uploaded to the data store
df = pd.read_csv("sensitive-data.csv")
# Complete job using REST API
job_request = {
"name": "safe-synthesizer-full",
"project": "default",
"spec": {
"data_source": "hf://datasets/default/safe-synthesizer/your-uploaded-dataset.csv",
"config": {
"enable_synthesis": True,
"enable_replace_pii": True,
"replace_pii": {
"globals": {"locales": ["en_US"]},
"steps": [{"rows": {"update": [{"entity": ["email", "phone_number"], "value": "column.entity | fake"}]}}]
},
"training": {"pretrained_model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"},
"generation": {"num_records": 5000, "temperature": 0.8},
"privacy": {"privacy_hyperparams": {"dp": True, "epsilon": 6.0}},
"evaluation": {"mia_enabled": True, "aia_enabled": True}
}
}
}
job = client.beta.safe_synthesizer.jobs.create(**job_request)
# Wait for the job to complete; see the Status Monitoring section below
# Access results via REST API
results = client.beta.safe_synthesizer.jobs.results.list(job.id)
# data = client.beta.safe_synthesizer.jobs.results.download(result_id, job_id=job.id)
Custom Configuration#
# Advanced configuration with custom parameters
gdpr_pii_config = {
"globals": {
"locales": ["en_US", "en_GB", "de_DE"],
"ner": {"ner_threshold": 0.9}
},
"steps": [{"rows": {"update": [{"entity": ["name"], "value": "fake.name()"}]}}]
}
high_privacy_train_config = {
    "pretrained_model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # Base model for fine-tuning
    "num_input_records_to_sample": 10000
}
# Custom configuration using REST API
dataset_id = "hf://datasets/default/safe-synthesizer/your-uploaded-dataset.csv"
job_request = {
    "name": "gdpr-safe-synthesizer",
    "project": "default",
    "spec": {
        "data_source": dataset_id,
"config": {
"enable_synthesis": True,
"enable_replace_pii": True,
"replace_pii": gdpr_pii_config,
"training": {
"pretrained_model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"num_input_records_to_sample": 10000
}
"evaluation": {"mia_enabled": True}
}
}
}
job = client.beta.safe_synthesizer.jobs.create(**job_request)
# Wait for the job to complete; see the Status Monitoring section below
# Access results after job completion
results = client.beta.safe_synthesizer.jobs.results.list(job.id)
Low-Level REST API#
Direct access to NeMo Safe Synthesizer REST endpoints provides complete control over job configuration:
Job Management#
# Create job with full configuration control
dataset_id = "hf://datasets/default/safe-synthesizer/your-uploaded-dataset.csv"
job_request = {
"name": "comprehensive-safe-synthesizer",
"description": "Full job with custom privacy settings",
"project": "privacy-research",
"spec": {
"data_source": dataset_id,
"config": {
"enable_synthesis": True,
"enable_replace_pii": True,
"replace_pii": {
"globals": {"locales": ["en_US"], "seed": 42},
"steps": [{"rows": {"update": [{"entity": ["email"], "value": "fake.email()"}]}}]
},
"training": {
"pretrained_model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"num_input_records_to_sample": 10000,
},
"data": {
"max_sequences_per_example": "auto"
},
"generation": {
"num_records": 5000,
"temperature": 0.8,
"top_p": 0.95,
"use_structured_generation": True
},
"privacy": {"privacy_hyperparams": {"dp": True, "epsilon": 8.0, "delta": 1e-5}},
"evaluation": {"mia_enabled": True, "aia_enabled": True}
}
}
}
# Submit job
job = client.beta.safe_synthesizer.jobs.create(**job_request)
print(f"Created job: {job.id}")
Status Monitoring#
import time
# Monitor job progress
while True:
current_job = client.beta.safe_synthesizer.jobs.retrieve(job.id)
status = current_job.status
print(f"Job {job.id}: {status}")
if status in ("completed", "error", "cancelled"):
break
time.sleep(30) # Check every 30 seconds
# Get detailed job information
job_details = client.beta.safe_synthesizer.jobs.retrieve(job.id)
print(f"Job completed at: {job_details.updated_at}")
print(f"Status details: {job_details.status_details}")
Result Access#
# List all job results
results = client.beta.safe_synthesizer.jobs.results.list(job.id)
for result in results:
print(f"Result ID: {result.id}")
print(f"Format: {result.format}")
print(f"Canonical: {result.canonical}")
# Download specific results
synthetic_data = client.beta.safe_synthesizer.jobs.results.download(
result_id="synthetic_data_result_id",
job_id=job.id
)
evaluation_report = client.beta.safe_synthesizer.jobs.results.download(
result_id="evaluation_report_result_id",
job_id=job.id
)
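The result IDs above are placeholders. In practice you can select a result from the list call, for example the canonical output. The sketch below assumes the download call returns raw bytes:
# Select the canonical result from the listing and persist it locally
canonical = next(r for r in results if r.canonical)
payload = client.beta.safe_synthesizer.jobs.results.download(
    result_id=canonical.id,
    job_id=job.id,
)
# Assumes raw bytes; adjust if the SDK returns a stream or text
with open(f"result-{canonical.id}.{canonical.format}", "wb") as f:
    f.write(payload)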
Data Upload#
Data must be uploaded to the data store before creating jobs. For detailed information on dataset management, refer to Datasets and Create Dataset Files.
Upload datasets with the Hugging Face CLI pointed at the data store endpoint, then reference them using hf://datasets/default/safe-synthesizer/<filename> URIs in job specifications.
Note
The namespace is default and the dataset is safe-synthesizer. For example:
export HF_ENDPOINT="http://localhost:3000/v1/hf"
huggingface-cli upload --repo-type dataset default/safe-synthesizer your-dataset.csv
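If you prefer to stay in Python, the same upload can be done with the huggingface_hub client pointed at the data store endpoint. This is a sketch assuming the local deployment endpoint from the CLI example:
from huggingface_hub import HfApi

# Point the client at the data store's HF-compatible endpoint
# (the same URL as HF_ENDPOINT in the CLI example)
api = HfApi(endpoint="http://localhost:3000/v1/hf")
api.upload_file(
    path_or_fileobj="your-dataset.csv",
    path_in_repo="your-dataset.csv",
    repo_id="default/safe-synthesizer",
    repo_type="dataset",
)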
Error Handling#
Robust error handling for NeMo Safe Synthesizer jobs:
job = None  # Lets the except block check whether creation succeeded
try:
job_request = {
"name": "pii-redaction",
"project": "default",
"spec": {
"data_source": "hf://datasets/default/safe-synthesizer/your-dataset.csv",
"config": {
"enable_synthesis": False,
"enable_replace_pii": True,
"replace_pii": {
"globals": {"locales": ["en_US"]},
"steps": [{"rows": {"update": [{"entity": ["email", "phone_number"], "value": "column.entity | fake"}]}}]
}
}
}
}
job = client.beta.safe_synthesizer.jobs.create(**job_request)
# Monitor job status
current_job = client.beta.safe_synthesizer.jobs.retrieve(job.id)
status = current_job.status
print(f"Job status: {status}")
# Access results
results = client.beta.safe_synthesizer.jobs.results.list(job.id)
except ValueError as e:
    print(f"Configuration error: {e}")
except Exception as e:
    print(f"Job creation error: {e}")
    # Use get_logs for debugging, but only if the job was created
    if job is not None:
        logs = client.beta.safe_synthesizer.jobs.get_logs(job.id)
        print(logs)
Async Support#
Use async/await for non-blocking job management:
import asyncio
from nemo_microservices import AsyncNeMoMicroservices
async def monitor_multiple_jobs():
async_client = AsyncNeMoMicroservices(base_url="http://localhost:8080")
# Create jobs using low-level API
job_ids = []
for i in range(3):
job = await async_client.beta.safe_synthesizer.jobs.create(
name=f"pii-job-{i}",
project="privacy-project",
spec={
"data_source": f"hf://datasets/default/safe-synthesizer/sample-data-{i}.csv",
"config": {
"enable_synthesis": False,
"enable_replace_pii": True,
"replace_pii": {
"globals": {"locales": ["en_US"]},
"steps": [{"rows": {"update": [{"entity": ["email"], "value": "fake.email()"}]}}]
}
}
}
)
job_ids.append(job.id)
# Monitor job statuses concurrently
jobs = await asyncio.gather(*[
async_client.beta.safe_synthesizer.jobs.retrieve(job_id)
for job_id in job_ids
])
statuses = [job.status for job in jobs]
return statuses
# Run the async workflow and collect statuses
statuses = asyncio.run(monitor_multiple_jobs())
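The example above samples each job's status once. To actually wait for all jobs to finish, poll concurrently; the helper below is a sketch that reuses only the retrieve call from the example:
async def wait_for_all(async_client, job_ids, poll_interval=30):
    """Poll each job until it reaches a terminal state; returns the job objects."""
    async def wait_one(job_id):
        while True:
            job = await async_client.beta.safe_synthesizer.jobs.retrieve(job_id)
            if job.status in ("completed", "error", "cancelled"):
                return job
            await asyncio.sleep(poll_interval)
    return await asyncio.gather(*(wait_one(job_id) for job_id in job_ids))

# Usage inside an async context:
# finished = await wait_for_all(async_client, job_ids)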
Configuration Best Practices#
Follow these patterns for reliable configuration:
# Use validated configuration patterns
pii_config = {
"globals": {"locales": ["en_US"]},
"steps": [{"rows": {"update": [{"entity": ["email"], "value": "fake.email()"}]}}]
}
# Current training configuration options
train_config = {
"pretrained_model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", # Default: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
"num_input_records_to_sample": "auto" # Default: "auto"
}
# Current generation configuration options (in development)
generate_config = {
"num_records": 1000, # Default: 1000
"temperature": 1.0, # Default: 1.0
"top_p": 1.0, # Default: 1.0
"use_structured_generation": False # Default: False
}