Sampling-Based Columns#

Sampling-based columns generate data through statistical sampling methods, distributions, and predefined datasets.

Before You Start#

Before getting started, ensure you have the Data Designer client and configuration builder set up:

from nemo_microservices.data_designer.essentials import (
    BernoulliMixtureSamplerParams,
    BernoulliSamplerParams,
    BinomialSamplerParams,
    CategorySamplerParams,
    DataDesignerConfigBuilder,
    DatetimeSamplerParams,
    GaussianSamplerParams,
    NeMoDataDesignerClient,
    PersonSamplerParams,
    PoissonSamplerParams,
    SamplerColumnConfig,
    SamplerType,
    ScipySamplerParams,
    SubcategorySamplerParams,
    TimeDeltaSamplerParams,
    UniformSamplerParams,
    UUIDSamplerParams,
)

import os

# Client for the Data Designer service. The base URL is read from the
# environment; os.environ[...] raises KeyError if the variable is not set.
data_designer_client = NeMoDataDesignerClient(
    base_url=os.environ["NEMO_MICROSERVICES_BASE_URL"],
)

# Configuration builder that the column examples on this page add to.
config_builder = DataDesignerConfigBuilder(model_configs="path/to/your/model_configs.yaml")

Sampling-Based Column Types#

Category#

Creates categorical values from a defined set of options.

# Typed API
config_builder.add_column(
    SamplerColumnConfig(
        name="product_category",
        sampler_type=SamplerType.CATEGORY,
        params=CategorySamplerParams(
            values=["Electronics", "Clothing", "Home Goods", "Books"],
            weights=[0.4, 0.3, 0.2, 0.1],  # Optional: probability weights
            description="Product category classification",  # Optional
        ),
    )
)

# Simplified API: the same column described with plain strings and a dict
config_builder.add_column(
    name="product_category",
    column_type="sampler",
    sampler_type="category",
    params={
        "values": ["Electronics", "Clothing", "Home Goods", "Books"],
        "weights": [0.4, 0.3, 0.2, 0.1],  # Optional: probability weights
        "description": "Product category classification",  # Optional
    },
)

Subcategory#

Creates values associated with a parent category.

# Typed API
config_builder.add_column(
    SamplerColumnConfig(
        name="product_subcategory",
        sampler_type=SamplerType.SUBCATEGORY,
        params=SubcategorySamplerParams(
            category="product_category",  # Parent category column
            # One entry per value of the parent column, including "Books",
            # so every parent value has subcategories to choose from.
            values={
                "Electronics": ["Smartphones", "Laptops", "Headphones"],
                "Clothing": ["Shirts", "Pants", "Dresses", "Shoes"],
                "Home Goods": ["Kitchen", "Bathroom", "Bedroom"],
                "Books": ["Fiction", "Non-Fiction", "Children's"],
            },
        ),
    )
)

# Simplified API: the same column described with plain strings and a dict
config_builder.add_column(
    name="product_subcategory",
    column_type="sampler",
    sampler_type="subcategory",
    params={
        "category": "product_category",  # Parent category column
        # One entry per value of the parent column, including "Books".
        "values": {
            "Electronics": ["Smartphones", "Laptops", "Headphones"],
            "Clothing": ["Shirts", "Pants", "Dresses", "Shoes"],
            "Home Goods": ["Kitchen", "Bathroom", "Bedroom"],
            "Books": ["Fiction", "Non-Fiction", "Children's"],
        },
    },
)

UUID#

Generates unique identifiers.

# Typed API
config_builder.add_column(
    SamplerColumnConfig(
        name="order_id",
        sampler_type=SamplerType.UUID,
        params=UUIDSamplerParams(
            prefix="ORD-",    # optional prefix added to each identifier
            short_form=True,  # optional: use the shorter format
            uppercase=True,   # optional: use uppercase letters
        ),
    )
)

# Simplified API: the same column described with plain strings and a dict
config_builder.add_column(
    name="order_id",
    column_type="sampler",
    sampler_type="uuid",
    params={
        "prefix": "ORD-",    # optional prefix added to each identifier
        "short_form": True,  # optional: use the shorter format
        "uppercase": True,   # optional: use uppercase letters
    },
)

Uniform Distribution#

# Typed API
config_builder.add_column(
    SamplerColumnConfig(
        name="product_rating",
        sampler_type=SamplerType.UNIFORM,
        # Same bounds as the simplified call below, so both variants
        # describe the same column.
        params=UniformSamplerParams(low=1, high=5),
        convert_to="int",  # Optional: converts to integer
    )
)

# Simplified API: the same column described with plain strings and a dict
config_builder.add_column(
    name="product_rating",
    column_type="sampler",
    sampler_type="uniform",
    params={"low": 1, "high": 5},
    convert_to="int",  # Optional: converts to integer
)

Gaussian Distribution#

# Typed API
config_builder.add_column(
    SamplerColumnConfig(
        name="item_weight",
        sampler_type=SamplerType.GAUSSIAN,
        params=GaussianSamplerParams(
            mean=50,
            stddev=10,
        ),
    )
)

# Simplified API: the same column described with plain strings and a dict
config_builder.add_column(
    name="item_weight",
    column_type="sampler",
    sampler_type="gaussian",
    params={
        "mean": 50,
        "stddev": 10,
    },
)

Poisson Distribution#

# Typed API
config_builder.add_column(
    SamplerColumnConfig(
        name="number_of_pets",
        sampler_type=SamplerType.POISSON,
        params=PoissonSamplerParams(
            mean=2,
        ),
    )
)

# Simplified API: the same column described with plain strings and a dict
config_builder.add_column(
    name="number_of_pets",
    column_type="sampler",
    sampler_type="poisson",
    params={
        "mean": 2,
    },
)

Bernoulli Distribution#

# Typed API
config_builder.add_column(
    SamplerColumnConfig(
        name="is_in_stock",
        sampler_type=SamplerType.BERNOULLI,
        params=BernoulliSamplerParams(
            p=0.8,
        ),
    )
)

# Simplified API: the same column described with plain strings and a dict
config_builder.add_column(
    name="is_in_stock",
    column_type="sampler",
    sampler_type="bernoulli",
    params={
        "p": 0.8,
    },
)

Bernoulli Mixture Distribution#

# Typed API
config_builder.add_column(
    SamplerColumnConfig(
        name="bern_exp",
        sampler_type=SamplerType.BERNOULLI_MIXTURE,
        params=BernoulliMixtureSamplerParams(
            p=0.8,
            dist_name="expon",
            dist_params={"scale": 10},
        ),
    )
)

# Simplified API: the same column described with plain strings and a dict.
# The parameter values match the typed call above so both variants describe
# the same column.
config_builder.add_column(
    name="bern_exp",
    column_type="sampler",
    sampler_type="bernoulli_mixture",
    params={
        "p": 0.8,
        "dist_name": "expon",
        "dist_params": {"scale": 10},
    },
)

Binomial Distribution#

# Typed API
config_builder.add_column(
    SamplerColumnConfig(
        name="items_returned",
        sampler_type=SamplerType.BINOMIAL,
        params=BinomialSamplerParams(
            n=10,
            p=0.1,
        ),
    )
)

# Simplified API: the same column described with plain strings and a dict
config_builder.add_column(
    name="items_returned",
    column_type="sampler",
    sampler_type="binomial",
    params={
        "n": 10,
        "p": 0.1,
    },
)

SciPy Sampler#

Use this sampler to access any distribution available in scipy.stats:

# Typed API
config_builder.add_column(
    SamplerColumnConfig(
        name="log_gaussian",
        sampler_type=SamplerType.SCIPY,
        params=ScipySamplerParams(
            dist_name="lognorm",
            dist_params={
                "s": 0.9,    # sigma
                "scale": 8,  # exp(mean)
            },
        ),
    )
)

# Simplified API: the same column described with plain strings and a dict
config_builder.add_column(
    name="log_gaussian",
    column_type="sampler",
    sampler_type="scipy",
    params={
        "dist_name": "lognorm",
        "dist_params": {
            "s": 0.9,    # sigma
            "scale": 8,  # exp(mean)
        },
    },
)

DateTime#

# Typed API
config_builder.add_column(
    SamplerColumnConfig(
        name="order_date",
        sampler_type=SamplerType.DATETIME,
        params=DatetimeSamplerParams(
            start="2023-01-01",
            end="2023-12-31",
        ),
    )
)

# Simplified API: the same column described with plain strings and a dict
config_builder.add_column(
    name="order_date",
    column_type="sampler",
    sampler_type="datetime",
    params={
        "start": "2023-01-01",
        "end": "2023-12-31",
    },
)

TimeDelta#

# Typed API
config_builder.add_column(
    SamplerColumnConfig(
        name="delivery_date",
        sampler_type=SamplerType.TIMEDELTA,
        params=TimeDeltaSamplerParams(
            dt_min=1,  # minimum offset, in days
            dt_max=7,  # maximum offset, in days (exclusive)
            reference_column_name="order_date",  # date column the offset is applied to
        ),
    )
)

# Simplified API: the same column described with plain strings and a dict
config_builder.add_column(
    name="delivery_date",
    column_type="sampler",
    sampler_type="timedelta",
    params={
        "dt_min": 1,  # minimum offset, in days
        "dt_max": 7,  # maximum offset, in days (exclusive)
        "reference_column_name": "order_date",  # date column the offset is applied to
    },
)

Person#

Defines person samplers that create realistic person entities.

# Typed API. Every parameter shown here is optional.
config_builder.add_column(
    SamplerColumnConfig(
        name="customer",
        sampler_type=SamplerType.PERSON,
        params=PersonSamplerParams(
            sex="Female",
            locale="en_US",  # default: "en_US"
            age_range=[25, 65],  # default: [18, 114]
            state="CA",  # US only
            city=["San Francisco", "Los Angeles"],
            with_synthetic_personas=True,  # default: False
        ),
    )
)

# Simplified API: the same column described with plain strings and a dict.
# Every parameter shown here is optional.
config_builder.add_column(
    name="customer",
    column_type="sampler",
    sampler_type="person",
    params={
        "sex": "Female",
        "locale": "en_US",  # default: "en_US"
        "age_range": [25, 65],  # default: [18, 114]
        "state": "CA",  # US only
        "city": ["San Francisco", "Los Angeles"],
        "with_synthetic_personas": True,  # default: False
    },
)

Using Conditional Parameters#

All sampling-based columns support conditional parameters that change based on other column values:

# Typed API
config_builder.add_column(
    SamplerColumnConfig(
        name="pet_type",
        sampler_type=SamplerType.CATEGORY,
        # Default parameters for the column.
        params=CategorySamplerParams(
            values=["dog", "cat", "fish"],
            weights=[0.5, 0.3, 0.2],
        ),
        # Each key is a condition on another column; its value holds the
        # parameters for rows matching that condition.
        conditional_params={
            "number_of_pets == 0": CategorySamplerParams(values=["none"]),
        },
    )
)

# Simplified API: the same column described with plain strings and dicts
config_builder.add_column(
    name="pet_type",
    column_type="sampler",
    sampler_type="category",
    params={
        "values": ["dog", "cat", "fish"],
        "weights": [0.5, 0.3, 0.2],
    },
    conditional_params={
        "number_of_pets == 0": {"values": ["none"]},
    },
)

Reference Table#

| Simplified API Type | Typed API Equivalent | Description |
|---|---|---|
| "category" | SamplerType.CATEGORY | Categorical values |
| "subcategory" | SamplerType.SUBCATEGORY | Dependent categories |
| "uuid" | SamplerType.UUID | Unique identifiers |
| "uniform" | SamplerType.UNIFORM | Uniform distribution |
| "gaussian" | SamplerType.GAUSSIAN | Normal distribution |
| "poisson" | SamplerType.POISSON | Poisson distribution |
| "bernoulli" | SamplerType.BERNOULLI | Binary outcomes |
| "bernoulli_mixture" | SamplerType.BERNOULLI_MIXTURE | Mixed distribution |
| "binomial" | SamplerType.BINOMIAL | Number of successes |
| "scipy" | SamplerType.SCIPY | SciPy distributions |
| "datetime" | SamplerType.DATETIME | Date/time values |
| "timedelta" | SamplerType.TIMEDELTA | Time intervals |
| "person" | SamplerType.PERSON | Person entities |

Available Person Attributes#

When referencing person samplers in prompt templates or Jinja templates, these attributes are available:

  • first_name: First name

  • last_name: Last name

  • email: Email address

  • phone: Phone number

  • address: Street address

  • city: City name

  • state: State/province

  • zip_code: Postal code

  • country: Country

  • date_of_birth: Date of birth

  • age: Age (calculated from date of birth)

  • sex: Gender