Create Data Generation Job#

Prerequisites#

Before you can create a data generation job, make sure that you have:

  • Obtained the base URL of your NeMo Data Designer service

  • Prepared your data generation configuration, including model configurations and column definitions

  • Set the DATA_DESIGNER_BASE_URL environment variable to your NeMo Data Designer service endpoint

export DATA_DESIGNER_BASE_URL="https://your-data-designer-service-url"

To Create a Data Generation Job#

Choose one of the following options to create a data generation job.

Full Data Generation Job#

import os
from nemo_microservices import NeMoMicroservices

# Initialize the client
client = NeMoMicroservices(
    base_url=os.environ['DATA_DESIGNER_BASE_URL']
)

# Create a data generation job
job = client.beta.data_designer.jobs.create(
    config={
        "model_configs": [
            {
                "alias": "main_model",
                "model": {
                    "api_endpoint": {
                        "url": "https://integrate.api.nvidia.com/v1",
                        "model_id": "meta/llama-3.3-70b-instruct",
                        "api_key": "your-api-key"
                    }
                }
            }
        ],
        "columns": [
            {
                "name": "language",
                "type": "category",
                "params": {
                    "values": ["english", "french"]
                }
            },
            {
                "name": "story",
                "type": "llm-text",
                "model_alias": "main_model",
                "prompt": "Write one sentence about synthetic data in {{ language }} language"
            }
        ]
    }
)

print(f"Created job with ID: {job.id}")
print(f"Job status: {job.status}")

curl -X POST \
  "${DATA_DESIGNER_BASE_URL}/v1beta1/data-designer/jobs" \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
    "config": {
      "model_configs": [
        {
          "alias": "main_model",
          "model": {
            "api_endpoint": {
              "url": "https://integrate.api.nvidia.com/v1",
              "model_id": "meta/llama-3.3-70b-instruct",
              "api_key": "your-api-key"
            }
          }
        }
      ],
      "columns": [
        {
          "name": "language",
          "type": "category",
          "params": {
            "values": ["english", "french"]
          }
        },
        {
          "name": "story",
          "type": "llm-text",
          "model_alias": "main_model",
          "prompt": "Write one sentence about synthetic data in {{ language }} language"
        }
      ]
    }
  }' | jq

Tip

Simplified Job Management

Instead of manually managing job creation and monitoring, use the DataDesignerClient wrapper for a more convenient approach:

import os

from nemo_microservices import NeMoMicroservices
from nemo_microservices.beta.data_designer import DataDesignerClient, DataDesignerConfigBuilder
from nemo_microservices.beta.data_designer.config import columns as C
from nemo_microservices.beta.data_designer.config import params as P

# Create a configuration builder with your model
config_builder = DataDesignerConfigBuilder(
    model_configs=[
        P.ModelConfig(
            alias="main_model",
            model=P.Model(
                api_endpoint=P.ApiEndpoint(
                    model_id="meta/llama-3.3-70b-instruct",
                    url="https://integrate.api.nvidia.com/v1",
                    api_key="your-api-key"
                )
            ),
            inference_parameters=P.InferenceParameters(
                temperature=0.90,
                top_p=0.99,
                max_tokens=2048,
            ),
        ),
    ]
)

# Add columns to define your data structure
config_builder.add_column(
    C.SamplerColumn(
        name="language",
        type=P.SamplerType.CATEGORY,
        params={"values": ["english", "french"]}
    )
)

config_builder.add_column(
    C.LLMTextColumn(
        name="story",
        prompt="Write one sentence about synthetic data in {{ language }} language",
        model_alias="main_model"
    )
)

# Initialize wrapper
data_designer_client = DataDesignerClient(
    client=NeMoMicroservices(base_url=os.environ['DATA_DESIGNER_BASE_URL'])
)

# Create job with automatic waiting and result loading
job_result = data_designer_client.create(
    config_builder,
    num_records=100,
    wait_until_done=True  # Waits for completion automatically
)

# Access results as pandas DataFrame
df = job_result.load_dataset()
print(f"Job completed! Generated {len(df)} records.")

This eliminates the need for separate job status checking and result retrieval calls.

Example Job Response
{
  "id": "job-abc123def456",
  "status": "created",
  "status_details": null,
  "created_at": "2024-01-15T10:30:00.000Z",
  "updated_at": "2024-01-15T10:30:00.000Z",
  "namespace": "default",
  "schema_version": "1.0"
}
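
The id in this response is what you would otherwise use to track the job yourself. For comparison, the manual flow that the DataDesignerClient wrapper replaces looks roughly like the sketch below, which polls the job status until it reaches a terminal state. This is a sketch only: it assumes the SDK's jobs resource exposes a retrieve(job_id) method and that the status values shown are the terminal ones, so verify the exact method and status names against your SDK version and deployment.

import os
import time

from nemo_microservices import NeMoMicroservices

client = NeMoMicroservices(
    base_url=os.environ['DATA_DESIGNER_BASE_URL']
)

# Assumption: the jobs resource exposes a retrieve(job_id) method for
# status lookups; verify the exact call against your SDK version.
job_id = "job-abc123def456"  # id returned by jobs.create

while True:
    job = client.beta.data_designer.jobs.retrieve(job_id)
    print(f"Job status: {job.status}")
    # Assumption: terminal status values; adjust to your deployment.
    if job.status in ("completed", "error", "cancelled"):
        break
    time.sleep(10)  # poll every 10 seconds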

Preview Generation#

For testing configurations before running full jobs, use the preview endpoint:

# Generate a preview
preview = client.beta.data_designer.preview(
    config={
        "model_configs": [
            {
                "alias": "main_model",
                "model": {
                    "api_endpoint": {
                        "url": "https://integrate.api.nvidia.com/v1",
                        "model_id": "meta/llama-3.3-70b-instruct",
                        "api_key": "your-api-key"
                    }
                }
            }
        ],
        "columns": [
            {
                "name": "language",
                "type": "category",
                "params": {
                    "values": ["english", "french"]
                }
            },
            {
                "name": "story",
                "type": "llm-text",
                "model_alias": "main_model",
                "prompt": "Write one sentence about synthetic data in {{ language }} language"
            }
        ]
    }
)

print("Preview results:")
for response in preview:
    print(f"  {response}")

curl -X POST \
  "${DATA_DESIGNER_BASE_URL}/v1beta1/data-designer/preview" \
  -H 'accept: application/jsonl' \
  -H 'Content-Type: application/json' \
  -d '{
    "config": {
      "model_configs": [
        {
          "alias": "main_model",
          "model": {
            "api_endpoint": {
              "url": "https://integrate.api.nvidia.com/v1",
              "model_id": "meta/llama-3.3-70b-instruct",
              "api_key": "your-api-key"
            }
          }
        }
      ],
      "columns": [
        {
          "name": "language",
          "type": "category",
          "params": {
            "values": ["english", "french"]
          }
        },
        {
          "name": "story",
          "type": "llm-text",
          "model_alias": "main_model",
          "prompt": "Write one sentence about synthetic data in {{ language }} language"
        }
      ]
    }
  }'

Example Preview Response
{"type": "data", "step": "generate", "stream": "step_outputs", "payload": {"language": "english", "story": "Synthetic data provides a powerful way to generate realistic datasets for machine learning training while preserving privacy."}, "ts": "2024-01-01T00:00:00Z"}
{"type": "data", "step": "generate", "stream": "step_outputs", "payload": {"language": "french", "story": "Les données synthétiques offrent un moyen puissant de générer des ensembles de données réalistes pour l'entraînement d'apprentissage automatique."}, "ts": "2024-01-01T00:00:01Z"}
{"type": "completed", "step": "generate", "stream": "step_outputs", "payload": {"total_rows": 2}, "ts": "2024-01-01T00:00:02Z"}
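
Each line of the preview stream is a standalone JSON object: data events carry one generated row in payload, and the final completed event reports the total row count. If you redirect the raw curl output to a file, a short snippet like the one below (a sketch, assuming the output was saved to a hypothetical preview.jsonl file) collects just the generated rows:

import json

# Assumption: the raw JSONL output of the preview request above was
# saved to preview.jsonl (for example, via `curl ... > preview.jsonl`).
rows = []
with open("preview.jsonl") as f:
    for line in f:
        event = json.loads(line)
        if event["type"] == "data":
            # Each data event's payload is one generated record.
            rows.append(event["payload"])

print(f"Collected {len(rows)} preview rows")
for row in rows:
    print(row["language"], "->", row["story"])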