Create Data Generation Job#
Prerequisites#
Before you can create a data generation job, make sure that you have:
Obtained the base URL of your NeMo Data Designer service
Prepared your data generation configuration including:
Model configurations - Configure model aliases and inference parameters
Column schemas - Define your data column types and parameters
Model constraints - Optional validation rules for your data
Set the NEMO_MICROSERVICES_BASE_URL environment variable to your NeMo Data Designer service endpoint:
export NEMO_MICROSERVICES_BASE_URL="https://your-data-designer-service-url"
To Create a Data Generation Job#
Choose one of the following options to create a data generation job.
Full Data Generation Job#
import os

from nemo_microservices import NeMoMicroservices

# Connect to the service whose URL is stored in NEMO_MICROSERVICES_BASE_URL.
service_url = os.environ["NEMO_MICROSERVICES_BASE_URL"]
client = NeMoMicroservices(base_url=service_url)

# Models available to the job; columns refer to a model through its alias.
model_configs = [
    {
        "alias": "main_model",
        "model": "meta/llama-3.3-70b-instruct",
        "inference_parameters": {
            "temperature": 0.5,
            "top_p": 1.0,
            "max_tokens": 1024,
        },
    },
]

# "language" is sampled from a fixed category list; "story" is produced by
# the model behind "main_model", with the sampled language in its prompt.
language_column = {
    "name": "language",
    "sampler_type": "category",
    "params": {"values": ["english", "french"]},
}
story_column = {
    "name": "story",
    "model_alias": "main_model",
    "prompt": "Write one sentence about synthetic data in {{ language }} language",
}

# Job specification: how many records to generate and with which config.
job_spec = {
    "num_records": 10,
    "config": {
        "model_configs": model_configs,
        "columns": [language_column, story_column],
    },
}

# Submit the job and report what the service returned.
job = client.data_designer.jobs.create(name="data-designer-job", spec=job_spec)
print(f"Created job with ID: {job.id}")
print(f"Job status: {job.status}")
# Create a data generation job over REST: POST the job definition (name,
# project, and spec) as JSON to the /v1/data-designer/jobs endpoint of the
# service at NEMO_MICROSERVICES_BASE_URL, then pipe the response through jq.
curl -X POST \
"${NEMO_MICROSERVICES_BASE_URL}/v1/data-designer/jobs" \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"name": "data-designer-job",
"project": "my-project",
"spec": {
"num_records": 10,
"config": {
"model_configs": [
{
"alias": "main_model",
"model": "meta/llama-3.3-70b-instruct",
"inference_parameters": {
"temperature": 0.5,
"top_p": 1.0,
"max_tokens": 1024
}
}
],
"columns": [
{
"name": "language",
"sampler_type": "category",
"params": {
"values": [
"english",
"french"
]
}
},
{
"name": "story",
"model_alias": "main_model",
"prompt": "Write one sentence about synthetic data in {{ language }} language"
}
]
}
}
}' | jq
Example Job Response
{
"id": "job-sdt19mhkbk7orszsycwx7z",
"name": "data-designer-job",
"description": null,
"project": "my-project",
"namespace": "default",
"created_at": "2025-09-18T15:58:01.179087",
"updated_at": "2025-09-18T15:58:01.179091",
"spec": {
"num_records": 10,
"config": {
"columns": [
{
"name": "language",
"drop": false,
"failure_threshold": 0.2,
"sampler_type": "category",
"params": {
"values": [
"english",
"french"
],
"weights": null
},
"conditional_params": {},
"convert_to": null
},
{
"name": "story",
"drop": false,
"failure_threshold": 0.2,
"prompt": "Write one sentence about synthetic data in {{ language }} language",
"model_alias": "main_model",
"system_prompt": null,
"multi_modal_context": null
}
],
"model_configs": [
{
"alias": "main_model",
"model": "meta/llama-3.3-70b-instruct",
"inference_parameters": {
"temperature": 0.5,
"top_p": 1.0,
"max_tokens": 1024,
"max_parallel_requests": 4,
"timeout": null
},
"provider": null
}
],
"seed_config": null,
"constraints": null,
"profilers": null
}
},
"status": "created",
"status_details": {},
"error_details": null,
"ownership": null,
"custom_fields": null
}
Tip
Simplified Job Management
Instead of manually managing job creation and monitoring, use the NeMoDataDesignerClient
wrapper for a more convenient approach:
import os

from nemo_microservices.data_designer.essentials import (
    CategorySamplerParams,
    DataDesignerConfigBuilder,
    InferenceParameters,
    LLMTextColumnConfig,
    ModelConfig,
    NeMoDataDesignerClient,
    SamplerColumnConfig,
    SamplerType,
)

# Single source of truth for the model alias: the alias registered in the
# model config must match the one referenced by LLM columns exactly.
MODEL_ALIAS = "main_model"

# Create a configuration builder with your model
config_builder = DataDesignerConfigBuilder(
    model_configs=[
        ModelConfig(
            alias=MODEL_ALIAS,
            model="meta/llama-3.3-70b-instruct",
            inference_parameters=InferenceParameters(
                temperature=0.90,
                top_p=0.99,
                max_tokens=2048,
            ),
        ),
    ]
)

# Add columns to define your data structure.
# "language" is sampled from a fixed list of categories.
config_builder.add_column(
    SamplerColumnConfig(
        name="language",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(
            values=["English", "French"],
        ),
    )
)

# "story" is generated by the model registered under MODEL_ALIAS; the prompt
# references the sampled "language" column.
config_builder.add_column(
    LLMTextColumnConfig(
        name="story",
        prompt="Write one sentence about synthetic data in {{ language }} language",
        model_alias=MODEL_ALIAS,
    )
)

# Initialize wrapper
data_designer_client = NeMoDataDesignerClient(
    base_url=os.environ["NEMO_MICROSERVICES_BASE_URL"]
)

# Create job with automatic waiting and result loading
job_result = data_designer_client.create(
    config_builder,
    num_records=100,
    wait_until_done=True,  # Waits for completion automatically
)

# Access results as pandas DataFrame
df = job_result.load_dataset()
print(f"Job completed! Generated {len(df)} records.")
This eliminates the need for separate job status checking and result retrieval calls.