⚠️ Warning: ${nss_short_name} is in Early Access and not recommended for production use.

Python Builder SDK Examples#

Practical examples demonstrating common ${nss_short_name} jobs using the Python Builder API.

Initialize the NeMo Safe Synthesizer Client#

  • The Python SDK provides a wrapper around the NeMo Microservices Platform APIs.

  • In the quickstart deployment, http://localhost:8080 is the default value for base_url.

  • If you are using a managed or remote deployment, make sure the base URL and any access tokens are configured correctly (see the sketch after the client setup below).

from nemo_microservices import NeMoMicroservices
from nemo_microservices.beta.safe_synthesizer.sdk.job_builder import SafeSynthesizerJobBuilder

client = NeMoMicroservices(base_url="http://localhost:8080")
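
If you point the client at a managed or remote deployment instead of the local quickstart, only base_url changes. The snippet below is a minimal sketch: the environment variable name and fallback URL are illustrative, and any required access token should be supplied according to your deployment's configuration.

import os

# Illustrative: read the platform URL from an environment variable so the same
# code runs against local and remote deployments. No token is needed for the
# local quickstart; for managed deployments, supply credentials as documented
# for your installation.
client = NeMoMicroservices(
    base_url=os.environ.get("NEMO_MICROSERVICES_BASE_URL", "http://localhost:8080"),
)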

Set up the DataStore#

NeMo DataStore is launched as one of the services. We’ll use it to manage storage for uploaded datasets.

datastore_config = {
    "endpoint": "http://localhost:3000/v1/hf",
    # "token": "<optional token>",
}

Basic Examples#

Example 1: Basic Synthetic Data Generation#

Generate private synthetic data from a local CSV. The Builder uploads your CSV to Datastore automatically.

import pandas as pd

df = pd.read_csv("customer-data.csv")
job = (
    SafeSynthesizerJobBuilder(client)
    .with_data_source(df)
    .with_datastore(datastore_config)
    .with_replace_pii(
        globals={"locales": ["en_US"]},
        steps=[{"rows": {"update": [{"entity": ["email", "phone_number"], "value": "column.entity | fake"}]}}],
    )
    .with_generate(num_records=5000, temperature=0.8)
    .with_differential_privacy(dp_enabled=True, epsilon=6.0)
    .with_evaluate(mia_enabled=True, aia_enabled=True)
    .synthesize()
)

print(f"job_id = {job.job_id}")
job.wait_for_completion()

print(f"Job finished with status {job.fetch_status()}")

Example 2: PII Detection, Redaction, or Replacement Only#

Run PII replacement on a local CSV without synthetic generation.

df = pd.read_csv("customer-data.csv")
builder = (
    SafeSynthesizerJobBuilder(client)
    .with_data_source(df)
    .with_datastore(datastore_config)
    .with_replace_pii(
        globals={"locales": ["en_US"]},
        steps=[{"rows": {"update": [{"entity": ["email", "phone_number"], "value": "column.entity | fake"}]}}],
    )
)
job = builder.create_job(name="pii-replacement-only", project="default")

print(f"job_id = {job.job_id}")
job.wait_for_completion()

print(f"Job finished with status {job.fetch_status()}")

Industry-Specific Examples#

Example 3: GDPR Compliance#

European locales, higher NER confidence, and stronger DP settings.

gdpr_pii_config = {
    "globals": {
        "locales": ["en_GB", "de_DE", "fr_FR"],
        "ner": {"ner_threshold": 0.9, "entities": ["name", "email", "phone_number", "address", "iban"]},
    },
    "steps": [
        {"rows": {"update": [
            {"entity": ["name"], "value": "fake.name()"},
            {"entity": ["email"], "value": "fake.email()"},
            {"entity": ["phone_number"], "value": "fake.phone_number()"},
            {"entity": ["address"], "value": "fake.address()"},
        ]}}
    ],
}
df = pd.read_csv("eu-customer-data.csv")
job = (
    SafeSynthesizerJobBuilder(client)
    .with_data_source(df)
    .with_datastore(datastore_config)
    .with_replace_pii(gdpr_pii_config)
    .with_differential_privacy(dp_enabled=True, epsilon=2.0, delta=1e-6)
    .with_evaluate(mia_enabled=True)
    .synthesize()
)

print(f"job_id = {job.job_id}")
job.wait_for_completion()

print(f"Job finished with status {job.fetch_status()}")

Example 4: Healthcare Data (HIPAA)#

PHI-sensitive NER configuration and very strong privacy hyperparameters (i.e., a small epsilon).

hipaa_config = {
    "globals": {
        "locales": ["en_US"],
        "ner": {
            "entities": [
                "name", "email", "phone_number", "address",
                "medical_record_number", "ssn", "date_of_birth"
            ],
            "ner_threshold": 0.95,
        },
    },
    "steps": [
        {"rows": {
            "update": [
                {"name": "patient_name", "value": "fake.name()"},
                {"name": "mrn", "value": "fake.random_number(digits=8)"},
                {"name": "dob", "value": "fake.date_of_birth(minimum_age=18, maximum_age=90)"},
            ],
            "drop": [{"condition": "notes CONTAINS 'confidential'"}],
        }}
    ],
}
df = pd.read_csv("patient-data.csv")
job = (
    SafeSynthesizerJobBuilder(client)
    .with_data_source(df)
    .with_datastore(datastore_config)
    .with_replace_pii(hipaa_config)
    .with_differential_privacy(
        dp_enabled=True,
        epsilon=1.0,
        delta=1e-7,
        per_sample_max_grad_norm=0.5,
    )
    .with_generate(temperature=0.6)
    .with_evaluate(mia_enabled=True, aia_enabled=True)
    .synthesize()
)

print(f"job_id = {job.job_id}")
job.wait_for_completion()

print(f"Job finished with status {job.fetch_status()}")

Example 5: Financial Data#

Domain-specific PII replacement with ordering/grouping hints for training.

financial_training = {
    "group_training_examples_by": "account_type",
    "order_training_examples_by": "transaction_date",
}

financial_pii = {
    "steps": [
        {"rows": {"update": [
            {"name": "account_number", "value": "fake.random_number(digits=12)"},
            {"name": "routing_number", "value": "fake.random_number(digits=9)"},
            {"entity": ["credit_debit_card"], "value": "fake.credit_card_number()"},
            {"entity": ["ssn"], "value": "fake.ssn()"},
        ]}}
    ]
}
df = pd.read_csv("financial-transactions.csv")
job = (
    SafeSynthesizerJobBuilder(client)
    .with_data_source(df)
    .with_datastore(datastore_config)
    .with_replace_pii(financial_pii)
    .with_differential_privacy(dp_enabled=True, epsilon=4.0)
    .with_train(financial_training)
    .with_generate(num_records=50000, use_structured_generation=True)
    .with_evaluate(mia_enabled=True)
    .synthesize()
)

print(f"job_id = {job.job_id}")
job.wait_for_completion()

print(f"Job finished with status {job.fetch_status()}")

View synthetic data#

After the job completes, fetch the generated synthetic dataset.

synthetic_df = job.fetch_data()
synthetic_df
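
As a quick sanity check, you can compare the synthetic output against the original input with plain pandas. This is a minimal sketch, assuming the customer-data.csv file from Example 1 is still available locally.

import pandas as pd

# Compare row counts, column sets, and summary statistics between the
# original and synthetic datasets.
original_df = pd.read_csv("customer-data.csv")
print(f"Original rows: {len(original_df)}, synthetic rows: {len(synthetic_df)}")
print(f"Columns match: {set(original_df.columns) == set(synthetic_df.columns)}")
print(synthetic_df.describe())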

View evaluation report#

An evaluation comparing the synthetic data to the input data is performed automatically. You can:

  • Inspect key scores: overall synthetic data quality and privacy.

  • Download the full HTML report: includes charts and detailed metrics.

  • Display the report inline: useful when viewing in notebook environments.

# Print selected information from the job summary
summary = job.fetch_summary()
print(
    f"Synthetic data quality score (0-10, higher is better): {summary.synthetic_data_quality_score}"
)
print(f"Data privacy score (0-10, higher is better): {summary.data_privacy_score}")

# Download the full evaluation report to your local machine
job.save_report("evaluation_report.html")

# Fetch and display the full evaluation report inline
job.display_report_in_notebook()

Advanced Jobs#

Example 6: Batch Processing#

Process multiple quarterly datasets with shared configuration.

import pandas as pd

pii_config = {
    "steps": [
        {"rows": {"update": [
            {"entity": ["email", "phone_number"], "value": "column.entity | fake"}
        ]}}
    ]
}

jobs = []
for path in [f"customer-q{q}.csv" for q in range(1, 5)]:
    builder = (
        SafeSynthesizerJobBuilder(client)
        .with_data_source(pd.read_csv(path))
        .with_datastore(datastore_config)
        .with_replace_pii(pii_config)
        .with_differential_privacy(dp_enabled=True, epsilon=5.0)
        .with_generate(num_records=5000, temperature=0.8)
        .with_evaluate(mia_enabled=True, aia_enabled=True)
        .synthesize()
    )
    job = builder.create_job(name=f"batch-{path.split('.')[0]}", project="default")
    jobs.append(job)

# Wait for all jobs to complete before fetching results
for job in jobs:
    job.wait_for_completion()

# Collect results after completion
all_results = []
for job in jobs:
    try:
        all_results.append(client.beta.safe_synthesizer.jobs.results.list(job.job_id))
    except Exception:
        all_results.append(None)
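
If you want a single combined DataFrame rather than the raw result listings, you can also pull each job's synthetic output with fetch_data() (used earlier to view synthetic data) and concatenate the quarters. A minimal sketch, assuming every batch job completed successfully; the source_quarter label column is illustrative.

import pandas as pd

quarterly_frames = []
for quarter, job in enumerate(jobs, start=1):
    synthetic_quarter = job.fetch_data()
    synthetic_quarter["source_quarter"] = f"Q{quarter}"  # illustrative provenance column
    quarterly_frames.append(synthetic_quarter)

combined_synthetic = pd.concat(quarterly_frames, ignore_index=True)
print(f"Combined synthetic dataset: {len(combined_synthetic)} rows")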

Error Handling Patterns#

Example 7: Robust Production Job#

import logging
import time
from nemo_microservices import APIError, APIStatusError
import pandas as pd


def safe_synthesizer_with_retry(local_csv_path: str, max_retries: int = 3):
    for attempt in range(max_retries):
        try:
            builder = (
                SafeSynthesizerJobBuilder(client)
                .with_data_source(pd.read_csv(local_csv_path))
                .with_datastore(datastore_config)
                .with_replace_pii(
                    steps=[{"rows": {"update": [{"entity": ["email"], "value": "column.entity | fake"}]}}],
                )
                .with_differential_privacy(dp_enabled=True, epsilon=5.0)
                .with_generate(num_records=5000)
                .with_evaluate(mia_enabled=True)
                .synthesize()
            )

            job = builder.create_job(name=f"production-synthetics-attempt-{attempt + 1}", project="default")

            while True:
                current_job = client.beta.safe_synthesizer.jobs.retrieve(job.job_id)
                status = current_job.status
                if status == "completed":
                    break
                elif status == "error":
                    raise APIError("Job failed - check logs for details", request=None, body=None)
                time.sleep(30)

            return job

        except APIStatusError as e:
            # APIStatusError is the more specific exception, so handle it
            # before the broader APIError.
            logging.error(f"API error on attempt {attempt + 1}: {getattr(e, 'message', str(e))}")
            if attempt == max_retries - 1:
                raise
            continue
        except APIError as e:
            logging.error(f"Attempt {attempt + 1} failed: {getattr(e, 'message', str(e))}")
            if attempt == max_retries - 1:
                raise
            continue

    raise RuntimeError(f"All {max_retries} attempts failed")

# Usage
try:
    job = safe_synthesizer_with_retry("sensitive-dataset.csv")
    print(f"Successfully created job: {job.job_id}")
except Exception as e:
    print(f"Failed to create ${nss_short_name} job: {e}")

Performance Optimization#

Example 8: Large Dataset Processing#

Tuning training/generation for large datasets approaching the 500MB limit.

import pandas as pd

large_training = {
    "max_sequences_per_example": 256,
    "num_input_records_to_sample": 100000,
    "batch_size": 2,
    "gradient_accumulation_steps": 8,
}

builder = (
    SafeSynthesizerJobBuilder(client)
    .with_data_source(pd.read_csv("large-dataset.csv"))
    .with_datastore(datastore_config)
    .with_replace_pii(
        steps=[{"rows": {"update": [{"entity": ["email"], "value": "column.entity | fake"}]}}],
    )
    .with_train(large_training)
    .with_generate(num_records=25000)
    .with_evaluate(mia_enabled=True)
    .synthesize()
)

job = builder.create_job(name="large-dataset-synthetics", project="default")

  • The Builder uploads your dataset to the datastore under default/safe-synthesizer with a randomized filename. If you need custom repo paths or filenames, prefer the lower-level REST API for now.

  • The Builder accepts either a local pandas.DataFrame or a local file path (string) for .with_data_source(...).
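
As a quick pre-flight check against the 500MB limit mentioned above, you can verify the file size locally before building the job. A minimal sketch in plain Python; the path is illustrative.

import os

csv_path = "large-dataset.csv"
size_mb = os.path.getsize(csv_path) / (1024 * 1024)
if size_mb > 500:
    raise ValueError(f"{csv_path} is {size_mb:.0f} MB, exceeding the 500MB dataset limit")
print(f"{csv_path}: {size_mb:.1f} MB - within the limit")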