nemo_curator.stages.deduplication.io_utils

View as Markdown

Module Contents

Classes

| Name | Description |
| --- | --- |
| DeduplicationIO | - |

API

class nemo_curator.stages.deduplication.io_utils.DeduplicationIO(
id_generator: nemo_curator.stages.deduplication.id_generator.IdGenerator | None,
kwargs = {}
)
nemo_curator.stages.deduplication.io_utils.DeduplicationIO.assign_id(
filepath: str | list[str],
df: cudf.DataFrame
) -> cudf.DataFrame
nemo_curator.stages.deduplication.io_utils.DeduplicationIO.custom_read(
filepath: str | list[str],
read_func: collections.abc.Callable,
assign_id: bool = False,
kwargs = {}
) -> cudf.DataFrame
nemo_curator.stages.deduplication.io_utils.DeduplicationIO.read_jsonl(
filepath: str | list[str],
columns: list[str] | None = None,
assign_id: bool = False,
kwargs = {}
) -> cudf.DataFrame
nemo_curator.stages.deduplication.io_utils.DeduplicationIO.read_parquet(
filepath: str | list[str],
assign_id: bool = False,
kwargs = {}
) -> cudf.DataFrame
nemo_curator.stages.deduplication.io_utils.DeduplicationIO.write_parquet(
df: cudf.DataFrame,
filepath: str,
kwargs = {}
) -> None