Create Data Generation Job#
Prerequisites#
Before you can create a data generation job, make sure that you have:

- Obtained the base URL of your NeMo Data Designer service
- Prepared your data generation configuration, including:
  - Model configurations - Set up model endpoints and inference parameters
  - Column schemas - Define your data column types and parameters
  - Model constraints - Optional validation rules for your data

Set the DATA_DESIGNER_BASE_URL environment variable to your NeMo Data Designer service endpoint:
export DATA_DESIGNER_BASE_URL="https://your-data-designer-service-url"
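The client examples below read this variable with os.environ. If you want Python to fail fast when the variable is missing, a short guard such as the following (illustrative only, not part of the SDK) works:

import os

# Illustrative check: stop early with a clear message if the endpoint is not configured.
base_url = os.environ.get("DATA_DESIGNER_BASE_URL")
if not base_url:
    raise RuntimeError("Set DATA_DESIGNER_BASE_URL to your NeMo Data Designer service endpoint")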
To Create a Data Generation Job#
Choose one of the following options to create a data generation job.
Full Data Generation Job#
import os

from nemo_microservices import NeMoMicroservices

# Initialize the client
client = NeMoMicroservices(
    base_url=os.environ['DATA_DESIGNER_BASE_URL']
)

# Create a data generation job
job = client.beta.data_designer.jobs.create(
    config={
        "model_configs": [
            {
                "alias": "main_model",
                "model": {
                    "api_endpoint": {
                        "url": "https://integrate.api.nvidia.com/v1",
                        "model_id": "meta/llama-3.3-70b-instruct",
                        "api_key": "your-api-key"
                    }
                }
            }
        ],
        "columns": [
            {
                "name": "language",
                "type": "category",
                "params": {
                    "values": ["english", "french"]
                }
            },
            {
                "name": "story",
                "type": "llm-text",
                "model_alias": "main_model",
                "prompt": "Write one sentence about synthetic data in {{ language }} language"
            }
        ]
    }
)

print(f"Created job with ID: {job.id}")
print(f"Job status: {job.status}")
curl -X POST \
  "${DATA_DESIGNER_BASE_URL}/v1beta1/data-designer/jobs" \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
    "config": {
      "model_configs": [
        {
          "alias": "main_model",
          "model": {
            "api_endpoint": {
              "url": "https://integrate.api.nvidia.com/v1",
              "model_id": "meta/llama-3.3-70b-instruct",
              "api_key": "your-api-key"
            }
          }
        }
      ],
      "columns": [
        {
          "name": "language",
          "type": "category",
          "params": {
            "values": ["english", "french"]
          }
        },
        {
          "name": "story",
          "type": "llm-text",
          "model_alias": "main_model",
          "prompt": "Write one sentence about synthetic data in {{ language }} language"
        }
      ]
    }
  }' | jq
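Once the job is created, you typically poll its status until it reaches a terminal state before fetching results. The sketch below reuses the Python client created above; the jobs.retrieve method and the terminal status names are assumptions here, so confirm the exact names in the SDK and API reference for your deployment:

import time

# Minimal polling sketch. `jobs.retrieve` and the terminal status names below are
# assumptions; confirm them against the SDK and API reference for your deployment.
while True:
    job = client.beta.data_designer.jobs.retrieve(job.id)
    print(f"Job status: {job.status}")
    if job.status in ("completed", "failed", "cancelled"):
        break
    time.sleep(10)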
Tip
Simplified Job Management
Instead of manually managing job creation and monitoring, use the DataDesignerClient wrapper for a more convenient approach:
import os

from nemo_microservices import NeMoMicroservices
from nemo_microservices.beta.data_designer import DataDesignerClient, DataDesignerConfigBuilder
from nemo_microservices.beta.data_designer.config import columns as C
from nemo_microservices.beta.data_designer.config import params as P

# Create a configuration builder with your model
config_builder = DataDesignerConfigBuilder(
    model_configs=[
        P.ModelConfig(
            alias="main_model",
            model=P.Model(
                api_endpoint=P.ApiEndpoint(
                    model_id="meta/llama-3.3-70b-instruct",
                    url="https://integrate.api.nvidia.com/v1",
                    api_key="your-api-key"
                )
            ),
            inference_parameters=P.InferenceParameters(
                temperature=0.90,
                top_p=0.99,
                max_tokens=2048,
            ),
        ),
    ]
)

# Add columns to define your data structure
config_builder.add_column(
    C.SamplerColumn(
        name="language",
        type=P.SamplerType.CATEGORY,
        params={"values": ["english", "french"]}
    )
)

config_builder.add_column(
    C.LLMTextColumn(
        name="story",
        prompt="Write one sentence about synthetic data in {{ language }} language",
        model_alias="main_model"
    )
)

# Initialize wrapper
data_designer_client = DataDesignerClient(
    client=NeMoMicroservices(base_url=os.environ['DATA_DESIGNER_BASE_URL'])
)

# Create job with automatic waiting and result loading
job_result = data_designer_client.create(
    config_builder,
    num_records=100,
    wait_until_done=True  # Waits for completion automatically
)

# Access results as pandas DataFrame
df = job_result.load_dataset()
print(f"Job completed! Generated {len(df)} records.")
This eliminates the need for separate job status checking and result retrieval calls.
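Because load_dataset returns an ordinary pandas DataFrame, you can inspect or persist the generated records with standard pandas calls; the file name below is arbitrary:

# Inspect the first few generated records and save them to disk (file name is arbitrary).
print(df.head())
df.to_csv("generated_stories.csv", index=False)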
Example Job Response
{
  "id": "job-abc123def456",
  "status": "created",
  "status_details": null,
  "created_at": "2024-01-15T10:30:00.000Z",
  "updated_at": "2024-01-15T10:30:00.000Z",
  "namespace": "default",
  "schema_version": "1.0"
}
Preview Generation#
For testing configurations before running full jobs, use the preview endpoint:
# Generate a preview
preview = client.beta.data_designer.preview(
    config={
        "model_configs": [
            {
                "alias": "main_model",
                "model": {
                    "api_endpoint": {
                        "url": "https://integrate.api.nvidia.com/v1",
                        "model_id": "meta/llama-3.3-70b-instruct",
                        "api_key": "your-api-key"
                    }
                }
            }
        ],
        "columns": [
            {
                "name": "language",
                "type": "category",
                "params": {
                    "values": ["english", "french"]
                }
            },
            {
                "name": "story",
                "type": "llm-text",
                "model_alias": "main_model",
                "prompt": "Write one sentence about synthetic data in {{ language }} language"
            }
        ]
    }
)

print("Preview results:")
for response in preview:
    print(f"  {response}")
curl -X POST \
  "${DATA_DESIGNER_BASE_URL}/v1beta1/data-designer/preview" \
  -H 'accept: application/jsonl' \
  -H 'Content-Type: application/json' \
  -d '{
    "config": {
      "model_configs": [
        {
          "alias": "main_model",
          "model": {
            "api_endpoint": {
              "url": "https://integrate.api.nvidia.com/v1",
              "model_id": "meta/llama-3.3-70b-instruct",
              "api_key": "your-api-key"
            }
          }
        }
      ],
      "columns": [
        {
          "name": "language",
          "type": "category",
          "params": {
            "values": ["english", "french"]
          }
        },
        {
          "name": "story",
          "type": "llm-text",
          "model_alias": "main_model",
          "prompt": "Write one sentence about synthetic data in {{ language }} language"
        }
      ]
    }
  }'
Example Preview Response
{"type": "data", "step": "generate", "stream": "step_outputs", "payload": {"language": "english", "story": "Synthetic data provides a powerful way to generate realistic datasets for machine learning training while preserving privacy."}, "ts": "2024-01-01T00:00:00Z"}
{"type": "data", "step": "generate", "stream": "step_outputs", "payload": {"language": "french", "story": "Les données synthétiques offrent un moyen puissant de générer des ensembles de données réalistes pour l'entraînement d'apprentissage automatique."}, "ts": "2024-01-01T00:00:01Z"}
{"type": "completed", "step": "generate", "stream": "step_outputs", "payload": {"total_rows": 2}, "ts": "2024-01-01T00:00:02Z"}