⚠️ Warning: ${nss_short_name} is in Early Access and not recommended for production use.
Python Builder SDK Examples#
Practical examples demonstrating common ${nss_short_name} jobs using the Python Builder API.
Initialize the NeMo Safe Synthesizer Client#
The Python SDK provides a wrapper around the NeMo Microservices Platform APIs.
http://localhost:8080 is the default URL for base_url in the quickstart. If you are using a managed or remote deployment, ensure the base URL and tokens are correct.
import pandas as pd
from nemo_microservices import NeMoMicroservices
from nemo_microservices.beta.safe_synthesizer.sdk.job_builder import SafeSynthesizerJobBuilder
client = NeMoMicroservices(base_url="http://localhost:8080")
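For a managed or remote deployment, you can avoid hardcoding the endpoint. A minimal sketch, assuming a hypothetical NEMO_BASE_URL environment variable (token handling depends on your deployment):
import os
# NEMO_BASE_URL is a hypothetical variable name; point it at your deployment
base_url = os.environ.get("NEMO_BASE_URL", "http://localhost:8080")
client = NeMoMicroservices(base_url=base_url)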
Set up the DataStore#
NeMo DataStore is launched as one of the services. We’ll use it to manage storage for uploaded datasets.
datastore_config = {
    "endpoint": "http://localhost:3000/v1/hf",
    # "token": "<optional token>",
}
Basic Examples#
Example 1: Basic Synthetic Data Generation#
Generate private synthetic data from a local CSV. The Builder uploads your CSV to the datastore automatically.
df = pd.read_csv("customer-data.csv")
job = (
    SafeSynthesizerJobBuilder(client)
    .with_data_source(df)
    .with_datastore(datastore_config)
    .with_replace_pii(
        globals={"locales": ["en_US"]},
        steps=[{"rows": {"update": [{"entity": ["email", "phone_number"], "value": "column.entity | fake"}]}}],
    )
    .with_generate(num_records=5000, temperature=0.8)
    .with_differential_privacy(dp_enabled=True, epsilon=6.0)
    .with_evaluate(mia_enabled=True, aia_enabled=True)
    .synthesize()
)
print(f"job_id = {job.job_id}")
job.wait_for_completion()
print(f"Job finished with status {job.fetch_status()}")
Example 2: PII Detection, Redaction, or Replacement Only#
Run PII replacement on a local CSV without synthetic generation.
df = pd.read_csv("customer-data.csv")
job = (
    SafeSynthesizerJobBuilder(client)
    .with_data_source(df)
    .with_datastore(datastore_config)
    .with_replace_pii(
        globals={"locales": ["en_US"]},
        steps=[{"rows": {"update": [{"entity": ["email", "phone_number"], "value": "column.entity | fake"}]}}],
    )
    # Submit the job; no .with_generate() step is configured
    .synthesize()
)
print(f"job_id = {job.job_id}")
job.wait_for_completion()
print(f"Job finished with status {job.fetch_status()}")
Industry-Specific Examples#
Example 3: GDPR Compliance#
European locales, higher NER confidence, and stronger DP settings.
gdpr_pii_config = {
    "globals": {
        "locales": ["en_GB", "de_DE", "fr_FR"],
        "ner": {"ner_threshold": 0.9, "entities": ["name", "email", "phone_number", "address", "iban"]},
    },
    "steps": [
        {"rows": {"update": [
            {"entity": ["name"], "value": "fake.name()"},
            {"entity": ["email"], "value": "fake.email()"},
            {"entity": ["phone_number"], "value": "fake.phone_number()"},
            {"entity": ["address"], "value": "fake.address()"},
        ]}}
    ],
}
df = pd.read_csv("eu-customer-data.csv")
job = (
    SafeSynthesizerJobBuilder(client)
    .with_data_source(df)
    .with_datastore(datastore_config)
    .with_replace_pii(gdpr_pii_config)
    .with_differential_privacy(dp_enabled=True, epsilon=2.0, delta=1e-6)
    .with_evaluate(mia_enabled=True)
    .synthesize()
)
print(f"job_id = {job.job_id}")
job.wait_for_completion()
print(f"Job finished with status {job.fetch_status()}")
Example 4: Healthcare Data (HIPAA)#
PHI-sensitive NER configuration and very strong privacy hyperparameters (i.e., a small epsilon).
hipaa_config = {
    "globals": {
        "locales": ["en_US"],
        "ner": {
            "entities": [
                "name", "email", "phone_number", "address",
                "medical_record_number", "ssn", "date_of_birth"
            ],
            "ner_threshold": 0.95,
        },
    },
    "steps": [
        {"rows": {
            "update": [
                {"name": "patient_name", "value": "fake.name()"},
                {"name": "mrn", "value": "fake.random_number(digits=8)"},
                {"name": "dob", "value": "fake.date_of_birth(minimum_age=18, maximum_age=90)"},
            ],
            "drop": [{"condition": "notes CONTAINS 'confidential'"}],
        }}
    ],
}
df = pd.read_csv("patient-data.csv")
job = (
    SafeSynthesizerJobBuilder(client)
    .with_data_source(df)
    .with_datastore(datastore_config)
    .with_replace_pii(hipaa_config)
    .with_differential_privacy(
        dp_enabled=True,
        epsilon=1.0,
        delta=1e-7,
        per_sample_max_grad_norm=0.5,
    )
    .with_generate(temperature=0.6)
    .with_evaluate(mia_enabled=True, aia_enabled=True)
    .synthesize()
)
print(f"job_id = {job.job_id}")
job.wait_for_completion()
print(f"Job finished with status {job.fetch_status()}")
Example 5: Financial Data#
Domain-specific PII replacement with ordering/grouping hints for training.
financial_training = {
    "group_training_examples_by": "account_type",
    "order_training_examples_by": "transaction_date",
}
financial_pii = {
    "steps": [
        {"rows": {"update": [
            {"name": "account_number", "value": "fake.random_number(digits=12)"},
            {"name": "routing_number", "value": "fake.random_number(digits=9)"},
            {"entity": ["credit_debit_card"], "value": "fake.credit_card_number()"},
            {"entity": ["ssn"], "value": "fake.ssn()"},
        ]}}
    ]
}
df = pd.read_csv("financial-transactions.csv")
job = (
    SafeSynthesizerJobBuilder(client)
    .with_data_source(df)
    .with_datastore(datastore_config)
    .with_replace_pii(financial_pii)
    .with_differential_privacy(dp_enabled=True, epsilon=4.0)
    .with_train(financial_training)
    .with_generate(num_records=50000, use_structured_generation=True)
    .with_evaluate(mia_enabled=True)
    .synthesize()
)
print(f"job_id = {job.job_id}")
job.wait_for_completion()
print(f"Job finished with status {job.fetch_status()}")
View synthetic data#
After the job completes, fetch the generated synthetic dataset.
synthetic_df = job.fetch_data()
synthetic_df
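To keep a local copy, write the DataFrame out with standard pandas I/O (the output filename here is just an example):
# Persist the synthetic dataset locally
synthetic_df.to_csv("synthetic-customer-data.csv", index=False)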
View evaluation report#
An evaluation comparing the synthetic data to the input data is performed automatically. You can:
Inspect key scores: overall synthetic data quality and privacy.
Download the full HTML report: includes charts and detailed metrics.
Display the report inline: useful when viewing in notebook environments.
# Print selected information from the job summary
summary = job.fetch_summary()
print(
    f"Synthetic data quality score (0-10, higher is better): {summary.synthetic_data_quality_score}"
)
print(f"Data privacy score (0-10, higher is better): {summary.data_privacy_score}")
# Download the full evaluation report to your local machine
job.save_report("evaluation_report.html")
# Fetch and display the full evaluation report inline
job.display_report_in_notebook()
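Outside a notebook, one option is to open the saved HTML report with the Python standard library:
import webbrowser
from pathlib import Path
# Open the report saved by job.save_report() above in the default browser
webbrowser.open(Path("evaluation_report.html").resolve().as_uri())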
Advanced Jobs#
Example 6: Batch Processing#
Process multiple quarterly datasets with shared configuration.
import pandas as pd
pii_config = {
    "steps": [
        {"rows": {"update": [
            {"entity": ["email", "phone_number"], "value": "column.entity | fake"}
        ]}}
    ]
}
jobs = []
for path in [f"customer-q{q}.csv" for q in range(1, 5)]:
builder = (
SafeSynthesizerJobBuilder(client)
.with_data_source(pd.read_csv(path))
.with_datastore(datastore_config)
.with_replace_pii(pii_config)
.with_differential_privacy(dp_enabled=True, epsilon=5.0)
.with_generate(num_records=5000, temperature=0.8)
.with_evaluate(mia_enabled=True, aia_enabled=True)
.synthesize()
)
job = builder.create_job(name=f"batch-{path.split('.')[0]}", project="default")
jobs.append(job)
# Wait for all jobs to complete before fetching results
for job in jobs:
    job.wait_for_completion()
# Collect results after completion
all_results = []
for job in jobs:
    try:
        all_results.append(client.beta.safe_synthesizer.jobs.results.list(job.job_id))
    except Exception:
        all_results.append(None)
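A quick sanity check on the batch, counting how many jobs returned results (using the None placeholders recorded above):
completed = sum(r is not None for r in all_results)
print(f"{completed}/{len(jobs)} batch jobs returned results")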
Error Handling Patterns#
Example 7: Robust Production Job#
import logging
import time
from nemo_microservices import APIError, APIStatusError
import pandas as pd
def safe_synthesizer_with_retry(local_csv_path: str, max_retries: int = 3):
    for attempt in range(max_retries):
        try:
            builder = (
                SafeSynthesizerJobBuilder(client)
                .with_data_source(pd.read_csv(local_csv_path))
                .with_datastore(datastore_config)
                .with_replace_pii(
                    steps=[{"rows": {"update": [{"entity": ["email"], "value": "column.entity | fake"}]}}],
                )
                .with_differential_privacy(dp_enabled=True, epsilon=5.0)
                .with_generate(num_records=5000)
                .with_evaluate(mia_enabled=True)
                .synthesize()
            )
            job = builder.create_job(name=f"production-synthetics-attempt-{attempt + 1}", project="default")
            # Poll until the job reaches a terminal state
            while True:
                current_job = client.beta.safe_synthesizer.jobs.retrieve(job.job_id)
                status = current_job.status
                if status == "completed":
                    break
                elif status == "error":
                    raise APIError("Job failed - check logs for details", request=None, body=None)
                time.sleep(30)
            return job
        except APIStatusError as e:
            # Catch the more specific APIStatusError before the broader APIError
            logging.error(f"API error on attempt {attempt + 1}: {getattr(e, 'message', str(e))}")
            if attempt == max_retries - 1:
                raise
            continue
        except APIError as e:
            logging.error(f"Attempt {attempt + 1} failed: {getattr(e, 'message', str(e))}")
            if attempt == max_retries - 1:
                raise
            continue
    raise RuntimeError(f"All {max_retries} attempts failed")
# Usage
try:
    job = safe_synthesizer_with_retry("sensitive-dataset.csv")
    print(f"Successfully created job: {job.job_id}")
except Exception as e:
    print(f"Failed to create ${nss_short_name} job: {e}")
Performance Optimization#
Example 8: Large Dataset Processing#
Tuning training/generation for large datasets approaching the 500MB limit.
import pandas as pd
large_training = {
    "max_sequences_per_example": 256,
    "num_input_records_to_sample": 100000,
    # Small per-step batches plus gradient accumulation keep memory usage
    # down; the effective batch size here is 2 * 8 = 16.
    "batch_size": 2,
    "gradient_accumulation_steps": 8,
}
builder = (
    SafeSynthesizerJobBuilder(client)
    .with_data_source(pd.read_csv("large-dataset.csv"))
    .with_datastore(datastore_config)
    .with_replace_pii(
        steps=[{"rows": {"update": [{"entity": ["email"], "value": "column.entity | fake"}]}}],
    )
    .with_train(large_training)
    .with_generate(num_records=25000)
    .with_evaluate(mia_enabled=True)
    .synthesize()
)
job = builder.create_job(name="large-dataset-synthetics", project="default")
The Builder uploads your dataset to the datastore under default/safe-synthesizer with a randomized filename. If you need custom repo paths or filenames, prefer the lower-level REST API for now.
The Builder accepts either a local pandas.DataFrame or a local file path (string) for .with_data_source(...).
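For example, both forms point the Builder at the same local data:
# Pass an in-memory DataFrame...
builder = SafeSynthesizerJobBuilder(client).with_data_source(pd.read_csv("customer-data.csv"))
# ...or pass the file path and let the Builder read it.
builder = SafeSynthesizerJobBuilder(client).with_data_source("customer-data.csv")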