modules.dataset_ops

Module Contents

Classes

Functions

API

class modules.dataset_ops.Shuffle(
seed: int | None = None,
npartitions: int | None = None,
partition_to_filename: collections.abc.Callable[[int], str] = default_filename,
filename_col: str = 'file_name',
)

Bases: nemo_curator.modules.base.BaseModule

Initialization

call(
dataset: nemo_curator.datasets.doc_dataset.DocumentDataset,
) -> nemo_curator.datasets.doc_dataset.DocumentDataset
shuffle_deterministic(
dataset: nemo_curator.datasets.doc_dataset.DocumentDataset,
) -> nemo_curator.datasets.doc_dataset.DocumentDataset
shuffle_nondeterministic(
dataset: nemo_curator.datasets.doc_dataset.DocumentDataset,
) -> nemo_curator.datasets.doc_dataset.DocumentDataset
modules.dataset_ops.blend_datasets(
target_size: int,
datasets: list[nemo_curator.datasets.doc_dataset.DocumentDataset],
sampling_weights: list[float],
) -> nemo_curator.datasets.doc_dataset.DocumentDataset
modules.dataset_ops.default_filename(partition_num: int) -> str