Sampling-Based Columns#

Sampling-based columns generate data through statistical sampling methods, distributions, and predefined datasets.

Before You Start#

Before getting started, ensure you have the Data Designer client and configuration builder set up:

import os
from nemo_microservices import NeMoMicroservices
from nemo_microservices.beta.data_designer import DataDesignerClient, DataDesignerConfigBuilder
from nemo_microservices.beta.data_designer.config import columns as C
from nemo_microservices.beta.data_designer.config import params as P

# Client for Data Designer, wrapping a NeMoMicroservices client whose base URL
# is read from the NEMO_MICROSERVICES_BASE_URL environment variable
# (os.environ[...] raises KeyError if the variable is not set).
data_designer_client = DataDesignerClient(
    client=NeMoMicroservices(base_url=os.environ["NEMO_MICROSERVICES_BASE_URL"])
)

# Builder that every column example on this page adds to; replace the
# placeholder path with your own model configs file.
config_builder = DataDesignerConfigBuilder(model_configs="path/to/your/model_configs.yaml")

Sampling-Based Column Types#

Category#

Creates categorical values from a defined set of options.

# Simplified API: sampler type given as a string, parameters as a plain dict.
config_builder.add_column(
    name="product_category",
    type="category",
    params={
        "values": ["Electronics", "Clothing", "Home Goods", "Books"],
        "weights": [0.4, 0.3, 0.2, 0.1],  # Optional: probability weights
        "description": "Product category classification"  # Optional
    }
)
# Typed API: the same column built from C.SamplerColumn and typed params.
# SamplerColumn must be qualified with the `C` alias imported above.
config_builder.add_column(
    C.SamplerColumn(
        name="product_category",
        type=P.SamplerType.CATEGORY,
        params=P.CategorySamplerParams(
            values=["Electronics", "Clothing", "Home Goods", "Books"],
            weights=[0.4, 0.3, 0.2, 0.1],  # Optional: probability weights
            description="Product category classification"  # Optional
        )
    )
)

Subcategory#

Creates values associated with a parent category.

# Simplified API: sampler type given as a string, parameters as a plain dict.
# Every value of the parent "product_category" column has an entry in "values".
config_builder.add_column(
    name="product_subcategory",
    type="subcategory",
    params={
        "category": "product_category",  # Parent category column
        "values": {
            "Electronics": ["Smartphones", "Laptops", "Headphones"],
            "Clothing": ["Shirts", "Pants", "Dresses", "Shoes"],
            "Home Goods": ["Kitchen", "Bathroom", "Bedroom"],
            "Books": ["Fiction", "Non-Fiction", "Textbooks"]
        }
    }
)
# Typed API: the same column built from C.SamplerColumn and typed params.
# SamplerColumn must be qualified with the `C` alias imported above.
config_builder.add_column(
    C.SamplerColumn(
        name="product_subcategory",
        type=P.SamplerType.SUBCATEGORY,
        params=P.SubcategorySamplerParams(
            category="product_category",  # Parent category column
            values={
                "Electronics": ["Smartphones", "Laptops", "Headphones"],
                "Clothing": ["Shirts", "Pants", "Dresses", "Shoes"],
                "Home Goods": ["Kitchen", "Bathroom", "Bedroom"],
                "Books": ["Fiction", "Non-Fiction", "Textbooks"]
            }
        )
    )
)

UUID#

Generates unique identifiers.

# Simplified API: sampler type given as a string, parameters as a plain dict.
config_builder.add_column(
    name="order_id",
    type="uuid",
    params={
        "prefix": "ORD-",  # Optional: adds a prefix
        "short_form": True,  # Optional: uses a shorter format
        "uppercase": True  # Optional: uses uppercase letters
    }
)
# Typed API: the same column built from C.SamplerColumn and P.UUIDSamplerParams.
config_builder.add_column(
    C.SamplerColumn(
        name="order_id",
        type=P.SamplerType.UUID,
        params=P.UUIDSamplerParams(
            prefix="ORD-",  # Optional: adds a prefix
            short_form=True,  # Optional: uses a shorter format
            uppercase=True  # Optional: uses uppercase letters
        )
    )
)

Uniform Distribution#

# Simplified API: sampler type given as a string, parameters as a plain dict.
config_builder.add_column(
    name="product_rating",
    type="uniform",
    params={"low": 1, "high": 5},
    convert_to="int"  # Optional: converts to integer
)
# Typed API: the same column built from C.SamplerColumn and
# P.UniformSamplerParams, using the same 1-5 bounds as the simplified form.
config_builder.add_column(
    C.SamplerColumn(
        name="product_rating",
        type=P.SamplerType.UNIFORM,
        params=P.UniformSamplerParams(low=1, high=5),
        convert_to="int"  # Optional: converts to integer
    )
)

Gaussian Distribution#

# Simplified API: sampler type given as a string, parameters as a plain dict.
config_builder.add_column(
    name="item_weight",
    type="gaussian",
    params={"mean": 50, "stddev": 10}
)
# Typed API: the same column built from C.SamplerColumn and P.GaussianSamplerParams.
config_builder.add_column(
    C.SamplerColumn(
        name="item_weight",
        type=P.SamplerType.GAUSSIAN,
        params=P.GaussianSamplerParams(mean=50, stddev=10)
    )
)

Poisson Distribution#

# Simplified API: sampler type given as a string, parameters as a plain dict.
config_builder.add_column(
    name="number_of_pets",
    type="poisson",
    params={"mean": 2}
)
# Typed API: the same column built from C.SamplerColumn and P.PoissonSamplerParams.
config_builder.add_column(
    C.SamplerColumn(
        name="number_of_pets",
        type=P.SamplerType.POISSON,
        params=P.PoissonSamplerParams(mean=2)
    )
)

Bernoulli Distribution#

# Simplified API: sampler type given as a string, parameters as a plain dict.
config_builder.add_column(
    name="is_in_stock",
    type="bernoulli",
    params={"p": 0.8}
)
# Typed API: the same column built from C.SamplerColumn and P.BernoulliSamplerParams.
config_builder.add_column(
    C.SamplerColumn(
        name="is_in_stock",
        type=P.SamplerType.BERNOULLI,
        params=P.BernoulliSamplerParams(p=0.8)
    )
)

Bernoulli Mixture Distribution#

# Simplified API: sampler type given as a string, parameters as a plain dict.
config_builder.add_column(
    name="bern_exp",
    type="bernoulli_mixture",
    params={"p": 0.4, "dist_name": "expon", "dist_params": {"scale": 10}}
)
# Typed API: the same column built from C.SamplerColumn and
# P.BernoulliMixtureSamplerParams, with the same p as the simplified form.
config_builder.add_column(
    C.SamplerColumn(
        name="bern_exp",
        type=P.SamplerType.BERNOULLI_MIXTURE,
        params=P.BernoulliMixtureSamplerParams(p=0.4, dist_name="expon", dist_params={"scale": 10})
    )
)

Binomial Distribution#

# Simplified API: sampler type given as a string, parameters as a plain dict.
config_builder.add_column(
    name="items_returned",
    type="binomial",
    params={"n": 10, "p": 0.1}
)
# Typed API: the same column built from C.SamplerColumn and P.BinomialSamplerParams.
config_builder.add_column(
    C.SamplerColumn(
        name="items_returned",
        type=P.SamplerType.BINOMIAL,
        params=P.BinomialSamplerParams(n=10, p=0.1)
    )
)

SciPy Sampler#

Use this sampler to access any statistical distribution available in scipy.stats.

# Simplified API: "dist_name" names the scipy.stats distribution and
# "dist_params" holds the keyword parameters given for it.
config_builder.add_column(
    name="log_gaussian", 
    type="scipy", 
    params={
        "dist_name": "lognorm", 
        "dist_params": {
            "s": 0.9,   # sigma 
            "scale": 8, # exp(mean) 
        }
    }
)
# Typed API: the same column built from C.SamplerColumn and P.ScipySamplerParams.
config_builder.add_column(
    C.SamplerColumn(
        name="log_gaussian",
        type=P.SamplerType.SCIPY,
        params=P.ScipySamplerParams(dist_name="lognorm", dist_params={"s": 0.9, "scale": 8})
    )
)

DateTime#

# Simplified API: sampler type given as a string, parameters as a plain dict.
config_builder.add_column(
    name="order_date",
    type="datetime",
    params={"start": "2023-01-01", "end": "2023-12-31"}
)
# Typed API: the same column built from C.SamplerColumn and P.DatetimeSamplerParams.
config_builder.add_column(
    C.SamplerColumn(
        name="order_date",
        type=P.SamplerType.DATETIME,
        params=P.DatetimeSamplerParams(start="2023-01-01", end="2023-12-31")
    )
)

TimeDelta#

# Simplified API: sampler type given as a string, parameters as a plain dict.
# The offset is applied relative to the "order_date" column defined above.
config_builder.add_column(
    name="delivery_date",
    type="timedelta",
    params={
        "dt_min": 1,  # Minimum days
        "dt_max": 7,  # Maximum days (exclusive)
        "reference_column_name": "order_date"  # Reference date column
    }
)
# Typed API: the same column built from C.SamplerColumn and P.TimeDeltaSamplerParams.
config_builder.add_column(
    C.SamplerColumn(
        name="delivery_date",
        type=P.SamplerType.TIMEDELTA,
        params=P.TimeDeltaSamplerParams(
            dt_min=1,  # Minimum days
            dt_max=7,  # Maximum days (exclusive)
            reference_column_name="order_date"  # Reference date column
        )
    )
)

Person#

Defines person samplers that create realistic person entities.

# Simplified API: define person samplers as plain dicts keyed by sampler name
# (note: use with_person_samplers method)
config_builder.with_person_samplers(
    {
        "customer": {
            "sex": "Female",  # Optional
            "locale": "en_US",  # Optional (default: "en_US")
            "age_range": [25, 65],  # Optional (default: [18, 114])
            "state": "CA",  # Optional (US only)
            "city": ["San Francisco", "Los Angeles"],  # Optional
            "with_synthetic_personas": True  # Optional (default: False)
        }
    }
)
# Typed API: define the same person sampler with P.PersonSamplerParams
# (note: use with_person_samplers method)
config_builder.with_person_samplers(
    {
        "customer": P.PersonSamplerParams(
            sex="Female",  # Optional
            locale="en_US",  # Optional (default: "en_US")
            age_range=[25, 65],  # Optional (default: [18, 114])
            state="CA",  # Optional (US only)
            city=["San Francisco", "Los Angeles"],  # Optional
            with_synthetic_personas=True  # Optional (default: False)
        )
    }
)

Using Conditional Parameters#

All sampling-based columns support conditional parameters that change based on other column values:

# Simplified API: conditional_params maps a condition on another column
# (here "number_of_pets") to the params used when that condition holds.
config_builder.add_column(
    name="pet_type",
    type="category",
    params={"values": ["dog", "cat", "fish"], "weights": [0.5, 0.3, 0.2]},
    conditional_params={
        "number_of_pets == 0": {"values": ["none"]}
    }
)
# Typed API: the same column built from C.SamplerColumn and typed params.
# All names are qualified with the `C` / `P` aliases imported above.
config_builder.add_column(
    C.SamplerColumn(
        name="pet_type",
        type=P.SamplerType.CATEGORY,
        params=P.CategorySamplerParams(values=["dog", "cat", "fish"], weights=[0.5, 0.3, 0.2]),
        conditional_params={
            "number_of_pets == 0": P.CategorySamplerParams(values=["none"])
        }
    )
)

Reference Table#

| Simplified API Type | Typed API Equivalent | Description |
|---|---|---|
| "category" | SamplerType.CATEGORY | Categorical values |
| "subcategory" | SamplerType.SUBCATEGORY | Dependent categories |
| "uuid" | SamplerType.UUID | Unique identifiers |
| "uniform" | SamplerType.UNIFORM | Uniform distribution |
| "gaussian" | SamplerType.GAUSSIAN | Normal distribution |
| "poisson" | SamplerType.POISSON | Poisson distribution |
| "bernoulli" | SamplerType.BERNOULLI | Binary outcomes |
| "bernoulli_mixture" | SamplerType.BERNOULLI_MIXTURE | Mixed distribution |
| "binomial" | SamplerType.BINOMIAL | Number of successes |
| "scipy" | SamplerType.SCIPY | SciPy distributions |
| "datetime" | SamplerType.DATETIME | Date/time values |
| "timedelta" | SamplerType.TIMEDELTA | Time intervals |
| "person" | SamplerType.PERSON | Person entities |

Available Person Attributes#

When referencing person samplers in prompt templates or Jinja templates, these attributes are available:

  • first_name: First name

  • last_name: Last name

  • email: Email address

  • phone: Phone number

  • address: Street address

  • city: City name

  • state: State/province

  • zip_code: Postal code

  • country: Country

  • date_of_birth: Date of birth

  • age: Age (calculated from date of birth)

  • sex: Sex of the person