Sampling-Based Columns#
Sampling-based columns generate data through statistical sampling methods, distributions, and predefined datasets.
Before You Start#
Before getting started, ensure you have the Data Designer client and configuration builder set up:
import os
from nemo_microservices import NeMoMicroservices
from nemo_microservices.beta.data_designer import DataDesignerClient, DataDesignerConfigBuilder
from nemo_microservices.beta.data_designer.config import columns as C
from nemo_microservices.beta.data_designer.config import params as P
data_designer_client = DataDesignerClient(
client=NeMoMicroservices(base_url=os.environ["NEMO_MICROSERVICES_BASE_URL"])
)
config_builder = DataDesignerConfigBuilder(model_configs="path/to/your/model_configs.yaml")
Sampling-Based Column Types#
Category#
Creates categorical values from a defined set of options.
config_builder.add_column(
name="product_category",
type="category",
params={
"values": ["Electronics", "Clothing", "Home Goods", "Books"],
"weights": [0.4, 0.3, 0.2, 0.1], # Optional: probability weights
"description": "Product category classification" # Optional
}
)
config_builder.add_column(
C.SamplerColumn(
name="product_category",
type=P.SamplerType.CATEGORY,
params=P.CategorySamplerParams(
values=["Electronics", "Clothing", "Home Goods", "Books"],
weights=[0.4, 0.3, 0.2, 0.1], # Optional: probability weights
description="Product category classification" # Optional
)
)
)
Subcategory#
Creates values associated with a parent category.
config_builder.add_column(
name="product_subcategory",
type="subcategory",
params={
"category": "product_category", # Parent category column
"values": {
"Electronics": ["Smartphones", "Laptops", "Headphones"],
"Clothing": ["Shirts", "Pants", "Dresses", "Shoes"],
"Home Goods": ["Kitchen", "Bathroom", "Bedroom"]
}
}
)
config_builder.add_column(
C.SamplerColumn(
name="product_subcategory",
type=P.SamplerType.SUBCATEGORY,
params=P.SubcategorySamplerParams(
category="product_category", # Parent category column
values={
"Electronics": ["Smartphones", "Laptops", "Headphones"],
"Clothing": ["Shirts", "Pants", "Dresses", "Shoes"],
"Home Goods": ["Kitchen", "Bathroom", "Bedroom"]
}
)
)
)
UUID#
Generates unique identifiers.
config_builder.add_column(
name="order_id",
type="uuid",
params={
"prefix": "ORD-", # Optional: adds a prefix
"short_form": True, # Optional: uses a shorter format
"uppercase": True # Optional: uses uppercase letters
}
)
config_builder.add_column(
C.SamplerColumn(
name="order_id",
type=P.SamplerType.UUID,
params=P.UUIDSamplerParams(
prefix="ORD-", # Optional: adds a prefix
short_form=True, # Optional: uses a shorter format
uppercase=True # Optional: uses uppercase letters
)
)
)
Uniform Distribution#
config_builder.add_column(
name="product_rating",
type="uniform",
params={"low": 1, "high": 5},
convert_to="int" # Optional: converts to integer
)
config_builder.add_column(
C.SamplerColumn(
name="product_rating",
type=P.SamplerType.UNIFORM,
params=P.UniformSamplerParams(low=1, high=5),
convert_to="int" # Optional: converts to integer
)
)
Gaussian Distribution#
config_builder.add_column(
name="item_weight",
type="gaussian",
params={"mean": 50, "stddev": 10}
)
config_builder.add_column(
C.SamplerColumn(
name="item_weight",
type=P.SamplerType.GAUSSIAN,
params=P.GaussianSamplerParams(mean=50, stddev=10)
)
)
Poisson Distribution#
config_builder.add_column(
name="number_of_pets",
type="poisson",
params={"mean": 2}
)
config_builder.add_column(
C.SamplerColumn(
name="number_of_pets",
type=P.SamplerType.POISSON,
params=P.PoissonSamplerParams(mean=2)
)
)
Bernoulli Distribution#
config_builder.add_column(
name="is_in_stock",
type="bernoulli",
params={"p": 0.8}
)
config_builder.add_column(
C.SamplerColumn(
name="is_in_stock",
type=P.SamplerType.BERNOULLI,
params=P.BernoulliSamplerParams(p=0.8)
)
)
Bernoulli Mixture Distribution#
config_builder.add_column(
name="bern_exp",
type="bernoulli_mixture",
params={"p": 0.4, "dist_name": "expon", "dist_params": {"scale": 10}}
)
config_builder.add_column(
C.SamplerColumn(
name="bern_exp",
type=P.SamplerType.BERNOULLI_MIXTURE,
params=P.BernoulliMixtureSamplerParams(p=0.4, dist_name="expon", dist_params={"scale": 10})
)
)
Binomial Distribution#
config_builder.add_column(
name="items_returned",
type="binomial",
params={"n": 10, "p": 0.1}
)
config_builder.add_column(
C.SamplerColumn(
name="items_returned",
type=P.SamplerType.BINOMIAL,
params=P.BinomialSamplerParams(n=10, p=0.1)
)
)
SciPy Sampler#
Use this sampler to access any statistical distribution available in scipy.stats:
config_builder.add_column(
name="log_gaussian",
type="scipy",
params={
"dist_name": "lognorm",
"dist_params": {
"s": 0.9, # sigma
"scale": 8, # exp(mean)
}
}
)
config_builder.add_column(
C.SamplerColumn(
name="log_gaussian",
type=P.SamplerType.SCIPY,
params=P.ScipySamplerParams(dist_name="lognorm", dist_params={"s": 0.9, "scale": 8})
)
)
DateTime#
config_builder.add_column(
name="order_date",
type="datetime",
params={"start": "2023-01-01", "end": "2023-12-31"}
)
config_builder.add_column(
C.SamplerColumn(
name="order_date",
type=P.SamplerType.DATETIME,
params=P.DatetimeSamplerParams(start="2023-01-01", end="2023-12-31")
)
)
TimeDelta#
config_builder.add_column(
name="delivery_date",
type="timedelta",
params={
"dt_min": 1, # Minimum days
"dt_max": 7, # Maximum days (exclusive)
"reference_column_name": "order_date" # Reference date column
}
)
config_builder.add_column(
C.SamplerColumn(
name="delivery_date",
type=P.SamplerType.TIMEDELTA,
params=P.TimeDeltaSamplerParams(
dt_min=1, # Minimum days
dt_max=7, # Maximum days (exclusive)
reference_column_name="order_date" # Reference date column
)
)
)
Person#
Defines person samplers that create realistic person entities.
# Define person samplers (note: use with_person_samplers method)
config_builder.with_person_samplers(
{
"customer": {
"sex": "Female", # Optional
"locale": "en_US", # Optional (default: "en_US")
"age_range": [25, 65], # Optional (default: [18, 114])
"state": "CA", # Optional (US only)
"city": ["San Francisco", "Los Angeles"], # Optional
"with_synthetic_personas": True # Optional (default: False)
}
}
)
# Define person samplers (note: use with_person_samplers method)
config_builder.with_person_samplers(
{
"customer": P.PersonSamplerParams(
sex="Female", # Optional
locale="en_US", # Optional (default: "en_US")
age_range=[25, 65], # Optional (default: [18, 114])
state="CA", # Optional (US only)
city=["San Francisco", "Los Angeles"], # Optional
with_synthetic_personas=True # Optional (default: False)
)
}
)
Using Conditional Parameters#
All sampling-based columns support conditional parameters that change based on other column values:
config_builder.add_column(
name="pet_type",
type="category",
params={"values": ["dog", "cat", "fish"], "weights": [0.5, 0.3, 0.2]},
conditional_params={
"number_of_pets == 0": {"values": ["none"]}
}
)
config_builder.add_column(
C.SamplerColumn(
name="pet_type",
type=P.SamplerType.CATEGORY,
params=P.CategorySamplerParams(values=["dog", "cat", "fish"], weights=[0.5, 0.3, 0.2]),
conditional_params={
"number_of_pets == 0": P.CategorySamplerParams(values=["none"])
}
)
)
Reference Table#
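| Simplified API Type | Typed API Equivalent | Description |
|---|---|---|
| `category` | `P.SamplerType.CATEGORY` | Categorical values |
| `subcategory` | `P.SamplerType.SUBCATEGORY` | Dependent categories |
| `uuid` | `P.SamplerType.UUID` | Unique identifiers |
| `uniform` | `P.SamplerType.UNIFORM` | Uniform distribution |
| `gaussian` | `P.SamplerType.GAUSSIAN` | Normal distribution |
| `poisson` | `P.SamplerType.POISSON` | Poisson distribution |
| `bernoulli` | `P.SamplerType.BERNOULLI` | Binary outcomes |
| `bernoulli_mixture` | `P.SamplerType.BERNOULLI_MIXTURE` | Mixed distribution |
| `binomial` | `P.SamplerType.BINOMIAL` | Number of successes |
| `scipy` | `P.SamplerType.SCIPY` | SciPy distributions |
| `datetime` | `P.SamplerType.DATETIME` | Date/time values |
| `timedelta` | `P.SamplerType.TIMEDELTA` | Time intervals |
| dict passed to `with_person_samplers` | `P.PersonSamplerParams` | Person entities |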
Available Person Attributes#
When referencing person samplers in prompt templates or jinja templates, these attributes are available:
- `first_name`: First name
- `last_name`: Last name
- `email`: Email address
- `phone`: Phone number
- `address`: Street address
- `city`: City name
- `state`: State/province
- `zip_code`: Postal code
- `country`: Country
- `date_of_birth`: Date of birth
- `age`: Age (calculated from date of birth)
- `sex`: Gender