NeMo Safe Synthesizer Python SDK#

Learn how to use the NeMo Safe Synthesizer Python SDK to run private synthetic data jobs through REST API access.

Installation#

pip install nemo-microservices[safe-synthesizer]

The NeMo Safe Synthesizer SDK provides comprehensive REST API access for private synthetic data jobs.

Client Initialization#

from nemo_microservices import NeMoMicroservices

# Local Docker deployment
client = NeMoMicroservices(
    base_url="http://localhost:8080"
)

# Production deployment
client = NeMoMicroservices(
    base_url="https://your-nemo-platform.com"
)

NeMo Safe Synthesizer API#

The SDK exposes REST API methods for creating, monitoring, and retrieving results from private synthetic data jobs:

Basic PII Redaction#

Note

Before you start, make sure that you have:

  • Stored CSVs locally

  • Uploaded them using the following steps:

export HF_ENDPOINT="http://localhost:3000/v1/hf"
huggingface-cli upload --repo-type dataset default/safe-synthesizer your-uploaded-dataset.csv

import pandas as pd

# Load your sensitive dataset locally for inspection (the job reads the uploaded copy)
df = pd.read_csv("sensitive-data.csv")

# Create PII redaction job using REST API
job_request = {
    "name": "pii-redaction",
    "project": "default",
    "spec": {
        "data_source": "hf://datasets/default/safe-synthesizer/your-uploaded-dataset.csv",
        "config": {
            "enable_synthesis": False,
            "enable_replace_pii": True,
            "replace_pii": {
                "globals": {"locales": ["en_US"]},
                "steps": [{"rows": {"update": [{"entity": ["email", "phone_number"], "value": "column.entity | fake"}]}}]
            }
        }
    }
}

job = client.beta.safe_synthesizer.jobs.create(**job_request)

# Wait for job to complete, see Status Monitoring section below

# Access results using REST API
results = client.beta.safe_synthesizer.jobs.results.list(job.id)
# data = client.beta.safe_synthesizer.jobs.results.download(result_id, job_id=job.id)
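For example, to load the redacted output into pandas, here is a minimal sketch. It assumes the canonical result is a CSV and that download() returns its raw text; the canonical and format fields are described under Result Access below.

import io

# Pick the primary (canonical) output and parse it as CSV
canonical = next(r for r in results if r.canonical)  # assumes one canonical result exists
data = client.beta.safe_synthesizer.jobs.results.download(
    result_id=canonical.id, job_id=job.id
)
redacted_df = pd.read_csv(io.StringIO(data))  # assumption: download() returns CSV text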

Complete NeMo Safe Synthesizer Job#

import pandas as pd

# Load your sensitive dataset locally for inspection (the job reads the uploaded copy)
df = pd.read_csv("sensitive-data.csv")

# Complete job using REST API
job_request = {
    "name": "safe-synthesizer-full",
    "project": "default",
    "spec": {
        "data_source": "hf://datasets/default/safe-synthesizer/your-uploaded-dataset.csv",
        "config": {
            "enable_synthesis": True,
            "enable_replace_pii": True,
            "replace_pii": {
                "globals": {"locales": ["en_US"]},
                "steps": [{"rows": {"update": [{"entity": ["email", "phone_number"], "value": "column.entity | fake"}]}}]
            },
            "training": {"pretrained_model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"},
            "generation": {"num_records": 5000, "temperature": 0.8},
            "privacy": {"privacy_hyperparams": {"dp": True, "epsilon": 6.0}},
            "evaluation": {"mia_enabled": True, "aia_enabled": True}
        }
    }
}

job = client.beta.safe_synthesizer.jobs.create(**job_request)

# Wait for job to complete, see Status Monitoring section below

# Access results via REST API
results = client.beta.safe_synthesizer.jobs.results.list(job.id)
# data = client.beta.safe_synthesizer.jobs.results.download(result_id, job_id=job.id)

Custom Configuration#

# Advanced configuration with custom parameters
gdpr_pii_config = {
    "globals": {
        "locales": ["en_US", "en_GB", "de_DE"],
        "ner": {"ner_threshold": 0.9}
    },
    "steps": [{"rows": {"update": [{"entity": ["name"], "value": "fake.name()"}]}}]
}

high_privacy_train_config = {
    "pretrained_model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "num_input_records_to_sample": 10000  # Sample up to 10,000 input records for training
}

# Custom configuration using REST API
dataset_id = "hf://datasets/default/safe-synthesizer/your-uploaded-dataset.csv"

job_request = {
    "name": "gdpr-safe-synthesizer",
    "project": "default",
    "spec": {
        "data_source": dataset_id,
        "config": {
            "enable_synthesis": True,
            "enable_replace_pii": True,
            "replace_pii": gdpr_pii_config,
            "training": high_privacy_train_config,
            "evaluation": {"mia_enabled": True}
        }
    }
}

job = client.beta.safe_synthesizer.jobs.create(**job_request)

# Wait for job to complete, see Status Monitoring section below

# Access results after job completion  
results = client.beta.safe_synthesizer.jobs.results.list(job.id)

Low-Level REST API#

Direct access to NeMo Safe Synthesizer REST endpoints provides complete control over job configuration:

Job Management#

# Create job with full configuration control
dataset_id = "hf://datasets/default/safe-synthesizer/your-uploaded-dataset.csv"

job_request = {
    "name": "comprehensive-safe-synthesizer",
    "description": "Full job with custom privacy settings",
    "project": "privacy-research",
    "spec": {
        "data_source": dataset_id,
        "config": {
            "enable_synthesis": True,
            "enable_replace_pii": True,
            "replace_pii": {
                "globals": {"locales": ["en_US"], "seed": 42},
                "steps": [{"rows": {"update": [{"entity": ["email"], "value": "fake.email()"}]}}]
            },
            "training": {
                "pretrained_model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "num_input_records_to_sample": 10000,
            },
            "data": {
                "max_sequences_per_example": "auto"
            },
            "generation": {
                "num_records": 5000,
                "temperature": 0.8,
                "top_p": 0.95,
                "use_structured_generation": True
            },
            "privacy": {"privacy_hyperparams": {"dp": True, "epsilon": 8.0, "delta": 1e-5}},
            "evaluation": {"mia_enabled": True, "aia_enabled": True}
        }
    }
}

# Submit job
job = client.beta.safe_synthesizer.jobs.create(**job_request)
print(f"Created job: {job.id}")

Status Monitoring#

import time

# Monitor job progress
while True:
    current_job = client.beta.safe_synthesizer.jobs.retrieve(job.id)
    status = current_job.status
    print(f"Job {job.id}: {status}")
    
    if status in ("completed", "error", "cancelled"):
        break
        
    time.sleep(30)  # Check every 30 seconds

# Get detailed job information
job_details = client.beta.safe_synthesizer.jobs.retrieve(job.id)
print(f"Job completed at: {job_details.updated_at}")
print(f"Status details: {job_details.status_details}")

Result Access#

# List all job results
results = client.beta.safe_synthesizer.jobs.results.list(job.id)

for result in results:
    print(f"Result ID: {result.id}")
    print(f"Format: {result.format}")
    print(f"Canonical: {result.canonical}")

# Download specific results
synthetic_data = client.beta.safe_synthesizer.jobs.results.download(
    result_id="synthetic_data_result_id",
    job_id=job.id
)

evaluation_report = client.beta.safe_synthesizer.jobs.results.download(
    result_id="evaluation_report_result_id", 
    job_id=job.id
)

Data Upload#

Data must be uploaded to the data store before creating jobs. For detailed information on dataset management, refer to Datasets and Create Dataset Files.

Upload datasets with the huggingface-cli against the data store's HF-compatible endpoint, as shown in the note below, then reference them using hf://datasets/default/safe-synthesizer/ URIs in job specifications.

Note

Before you start, make sure that you have:

  • Stored CSVs locally

  • Uploaded them using the following steps. The namespace is default and the dataset name is safe-synthesizer. For example:

export HF_ENDPOINT="http://localhost:3000/v1/hf"
huggingface-cli upload --repo-type dataset default/safe-synthesizer your-dataset.csv
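The same upload can also be done from Python with the huggingface_hub library, pointed at the data store endpoint. This is a sketch mirroring the CLI example above; HfApi.upload_file is the standard Hub API.

from huggingface_hub import HfApi

# Point the Hub client at the NeMo data store's HF-compatible endpoint
api = HfApi(endpoint="http://localhost:3000/v1/hf")
api.upload_file(
    path_or_fileobj="your-dataset.csv",
    path_in_repo="your-dataset.csv",
    repo_id="default/safe-synthesizer",
    repo_type="dataset",
)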

Error Handling#

Robust error handling for NeMo Safe Synthesizer jobs:

try:
    job_request = {
        "name": "pii-redaction",
        "project": "default",
        "spec": {
            "data_source": "hf://datasets/default/safe-synthesizer/your-dataset.csv",
            "config": {
                "enable_synthesis": False,
                "enable_replace_pii": True,
                "replace_pii": {
                    "globals": {"locales": ["en_US"]},
                    "steps": [{"rows": {"update": [{"entity": ["email", "phone_number"], "value": "column.entity | fake"}]}}]
                }
            }
        }
    }
    
    job = client.beta.safe_synthesizer.jobs.create(**job_request)
    
    # Monitor job status
    current_job = client.beta.safe_synthesizer.jobs.retrieve(job.id)
    status = current_job.status
    print(f"Job status: {status}")
    
    # Access results
    results = client.beta.safe_synthesizer.jobs.results.list(job.id)
    
except ValueError as e:
    print(f"Configuration error: {e}")
    
except Exception as e:
    print(f"Job creation error: {e}")
    # Use get_logs for debugging, if the job was created before the failure
    if "job" in locals():
        logs = client.beta.safe_synthesizer.jobs.get_logs(job.id)
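For transient failures such as network hiccups, job creation can be wrapped in a simple retry loop. This is a sketch; create_job_with_retry and its backoff settings are illustrative, tune them to your environment.

import time

def create_job_with_retry(client, job_request, attempts=3, backoff=5.0):
    # Retry job creation on transient errors, waiting between attempts
    for attempt in range(1, attempts + 1):
        try:
            return client.beta.safe_synthesizer.jobs.create(**job_request)
        except Exception as e:
            if attempt == attempts:
                raise
            print(f"Attempt {attempt} failed ({e}); retrying in {backoff}s")
            time.sleep(backoff)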

Async Support#

Use async/await for non-blocking job management:

import asyncio
from nemo_microservices import AsyncNeMoMicroservices

async def monitor_multiple_jobs():
    async_client = AsyncNeMoMicroservices(base_url="http://localhost:8080")
    
    # Create jobs using low-level API
    job_ids = []
    for i in range(3):
        job = await async_client.beta.safe_synthesizer.jobs.create(
            name=f"pii-job-{i}",
            project="privacy-project",
            spec={
                "data_source": f"hf://datasets/default/safe-synthesizer/sample-data-{i}.csv",
                "config": {
                    "enable_synthesis": False,
                    "enable_replace_pii": True,
                    "replace_pii": {
                        "globals": {"locales": ["en_US"]},
                        "steps": [{"rows": {"update": [{"entity": ["email"], "value": "fake.email()"}]}}]
                    }
                }
            }
        )
        job_ids.append(job.id)
    
    # Monitor job statuses concurrently
    jobs = await asyncio.gather(*[
        async_client.beta.safe_synthesizer.jobs.retrieve(job_id)
        for job_id in job_ids
    ])
    statuses = [job.status for job in jobs]
    
    return statuses

# Run the async workflow and collect final statuses
statuses = asyncio.run(monitor_multiple_jobs())
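The gather pattern extends naturally to polling until every job reaches a terminal state. A sketch building on monitor_multiple_jobs above; wait_all is illustrative, not part of the SDK.

async def wait_all(async_client, job_ids, poll_interval=30):
    # Re-fetch all jobs concurrently until each reaches a terminal state
    while True:
        jobs = await asyncio.gather(*[
            async_client.beta.safe_synthesizer.jobs.retrieve(job_id)
            for job_id in job_ids
        ])
        if all(j.status in ("completed", "error", "cancelled") for j in jobs):
            return jobs
        await asyncio.sleep(poll_interval)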

Configuration Best Practices#

Follow these patterns for reliable configuration:

# Use validated configuration patterns
pii_config = {
    "globals": {"locales": ["en_US"]},
    "steps": [{"rows": {"update": [{"entity": ["email"], "value": "fake.email()"}]}}]
}

# Current training configuration options
train_config = {
    "pretrained_model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # Default: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    "num_input_records_to_sample": "auto"  # Default: "auto"
}

# Current generation configuration options (in development)
generate_config = {
    "num_records": 1000,  # Default: 1000
    "temperature": 1.0,   # Default: 1.0
    "top_p": 1.0,         # Default: 1.0
    "use_structured_generation": False  # Default: False
}