modules.dataset_ops
Module Contents
Classes
Functions
API
- class modules.dataset_ops.Shuffle(
- seed: int | None = None,
- npartitions: int | None = None,
- partition_to_filename: collections.abc.Callable[[int], str] = default_filename,
- filename_col: str = 'file_name',
- )
Bases: nemo_curator.modules.base.BaseModule
Initialization
- call(
- dataset: nemo_curator.datasets.doc_dataset.DocumentDataset,
- )
- shuffle_deterministic(
- dataset: nemo_curator.datasets.doc_dataset.DocumentDataset,
- )
- shuffle_nondeterministic(
- dataset: nemo_curator.datasets.doc_dataset.DocumentDataset,
- )
- modules.dataset_ops.blend_datasets(
- target_size: int,
- datasets: list[nemo_curator.datasets.doc_dataset.DocumentDataset],
- sampling_weights: list[float],
- )
- modules.dataset_ops.default_filename(partition_num: int) -> str