datasets.parallel_dataset

Module Contents

Classes

API

class datasets.parallel_dataset.ParallelDataset(dataset_df: dask.dataframe.DataFrame)

Bases: nemo_curator.datasets.doc_dataset.DocumentDataset

Initialization

persist() -> datasets.parallel_dataset.ParallelDataset
classmethod read_simple_bitext(
src_input_files: str | list[str],
tgt_input_files: str | list[str],
src_lang: str,
tgt_lang: str,
backend: str = 'pandas',
add_filename: bool | str = False,
npartitions: int = 16,
) -> datasets.parallel_dataset.ParallelDataset
static read_single_simple_bitext_file_pair(
input_file_pair: tuple[str],
src_lang: str,
tgt_lang: str,
doc_id: str | None = None,
backend: str = 'cudf',
add_filename: bool | str = False,
) -> dask.dataframe.DataFrame | dask_cudf.DataFrame
to_bitext(
output_file_dir: str,
write_to_filename: bool | str = False,
) -> None