Create Data Generation Job#

Prerequisites#

Before you can create a data generation job, make sure that you have:

  • Obtained the base URL of your NeMo Data Designer service

  • Prepared your data generation configuration, including model configurations and column definitions

  • Set the NEMO_MICROSERVICES_BASE_URL environment variable to your NeMo Data Designer service endpoint

export NEMO_MICROSERVICES_BASE_URL="https://your-data-designer-service-url"

To Create a Data Generation Job#

Choose one of the following options to create a data generation job.

Full Data Generation Job#

import os
from nemo_microservices import NeMoMicroservices

# Build the client from the service endpoint configured in the environment.
client = NeMoMicroservices(
    base_url=os.environ['NEMO_MICROSERVICES_BASE_URL']
)

# Model registry for the job: one LLM registered under the alias "main_model".
model_configs = [
    {
        "alias": "main_model",
        "model": "meta/llama-3.3-70b-instruct",
        "inference_parameters": {"temperature": 0.5, "top_p": 1.0, "max_tokens": 1024},
    }
]

# Columns: a categorical sampler plus an LLM-generated column whose prompt
# is templated on the sampled `language` value.
columns = [
    {"name": "language", "sampler_type": "category", "params": {"values": ["english", "french"]}},
    {
        "name": "story",
        "model_alias": "main_model",
        "prompt": "Write one sentence about synthetic data in {{ language }} language",
    },
]

# Submit the data generation job to the service.
job = client.data_designer.jobs.create(
    name="data-designer-job",
    spec={
        "num_records": 10,
        "config": {"model_configs": model_configs, "columns": columns},
    },
)

print(f"Created job with ID: {job.id}")
print(f"Job status: {job.status}")
# POST the job definition to the Data Designer jobs endpoint and pretty-print
# the JSON response with `jq`. Requires NEMO_MICROSERVICES_BASE_URL to be
# exported in the current shell (see the prerequisites above).
curl -X POST \
  "${NEMO_MICROSERVICES_BASE_URL}/v1/data-designer/jobs" \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
    "name": "data-designer-job",
    "project": "my-project",
    "spec": {
        "num_records": 10,
        "config": {
            "model_configs": [
                {
                    "alias": "main_model",
                    "model": "meta/llama-3.3-70b-instruct",
                    "inference_parameters": {
                        "temperature": 0.5,
                        "top_p": 1.0,
                        "max_tokens": 1024
                    }
                }
            ],
            "columns": [
                {
                    "name": "language",
                    "sampler_type": "category",
                    "params": {
                        "values": [
                            "english",
                            "french"
                        ]
                    }
                },
                {
                    "name": "story",
                    "model_alias": "main_model",
                    "prompt": "Write one sentence about synthetic data in {{ language }} language"
                }
            ]
        }
    }
  }' | jq
Example Job Response
{
  "id": "job-sdt19mhkbk7orszsycwx7z",
  "name": "data-designer-job",
  "description": null,
  "project": "my-project",
  "namespace": "default",
  "created_at": "2025-09-18T15:58:01.179087",
  "updated_at": "2025-09-18T15:58:01.179091",
  "spec": {
    "num_records": 10,
    "config": {
      "columns": [
        {
          "name": "language",
          "drop": false,
          "failure_threshold": 0.2,
          "sampler_type": "category",
          "params": {
            "values": [
              "english",
              "french"
            ],
            "weights": null
          },
          "conditional_params": {},
          "convert_to": null
        },
        {
          "name": "story",
          "drop": false,
          "failure_threshold": 0.2,
          "prompt": "Write one sentence about synthetic data in {{ language }} language",
          "model_alias": "main_model",
          "system_prompt": null,
          "multi_modal_context": null
        }
      ],
      "model_configs": [
        {
          "alias": "main_model",
          "model": "meta/llama-3.3-70b-instruct",
          "inference_parameters": {
            "temperature": 0.5,
            "top_p": 1.0,
            "max_tokens": 1024,
            "max_parallel_requests": 4,
            "timeout": null
          },
          "provider": null
        }
      ],
      "seed_config": null,
      "constraints": null,
      "profilers": null
    }
  },
  "status": "created",
  "status_details": {},
  "error_details": null,
  "ownership": null,
  "custom_fields": null
}

Tip

Simplified Job Management: Instead of manually managing job creation and monitoring, use the NeMoDataDesignerClient wrapper for a more convenient approach:

import os
from nemo_microservices.data_designer.essentials import (
    CategorySamplerParams,
    DataDesignerConfigBuilder,
    InferenceParameters,
    LLMTextColumnConfig,
    ModelConfig,
    NeMoDataDesignerClient,
    SamplerColumnConfig,
    SamplerType,
)

# Create a configuration builder with your model.
# NOTE: the alias declared here must match the `model_alias` referenced by
# every LLM-backed column added below.
config_builder = DataDesignerConfigBuilder(
    model_configs=[
        ModelConfig(
            alias="main-model",
            model="meta/llama-3.3-70b-instruct",
            inference_parameters=InferenceParameters(
                temperature=0.90,
                top_p=0.99,
                max_tokens=2048,
            ),
        ),
    ]
)

# Sampler column: each record gets one of the listed category values.
config_builder.add_column(
    SamplerColumnConfig(
        name="language",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(
            values=["English", "French"]
        )
    )
)

# LLM text column: generated by the model registered as "main-model",
# with the prompt templated on the sampled `language` value.
config_builder.add_column(
    LLMTextColumnConfig(
        name="story",
        prompt="Write one sentence about synthetic data in {{ language }} language",
        model_alias="main-model"
    )
)

# Initialize the high-level wrapper client.
data_designer_client = NeMoDataDesignerClient(
    base_url=os.environ["NEMO_MICROSERVICES_BASE_URL"]
)

# Create the job; wait_until_done=True blocks until generation completes,
# so no separate status-polling loop is needed.
job_result = data_designer_client.create(
    config_builder,
    num_records=100,
    wait_until_done=True  # Waits for completion automatically
)

# Access results as a pandas DataFrame.
df = job_result.load_dataset()
print(f"Job completed! Generated {len(df)} records.")

This eliminates the need for separate job status checking and result retrieval calls.