nemo_curator.stages.deduplication.io_utils
nemo_curator.stages.deduplication.io_utils
Module Contents
Classes
| Name | Description |
|---|---|
| DeduplicationIO | - |
API
class nemo_curator.stages.deduplication.io_utils.DeduplicationIO( id_generator: nemo_curator.stages.deduplication.id_generator.IdGenerator | None, kwargs = {} )
nemo_curator.stages.deduplication.io_utils.DeduplicationIO.assign_id( filepath: str | list[str], df: cudf.DataFrame ) -> cudf.DataFrame
nemo_curator.stages.deduplication.io_utils.DeduplicationIO.custom_read( filepath: str | list[str], read_func: collections.abc.Callable, assign_id: bool = False, kwargs = {} ) -> cudf.DataFrame
nemo_curator.stages.deduplication.io_utils.DeduplicationIO.read_jsonl( filepath: str | list[str], columns: list[str] | None = None, assign_id: bool = False, kwargs = {} ) -> cudf.DataFrame
nemo_curator.stages.deduplication.io_utils.DeduplicationIO.read_parquet( filepath: str | list[str], assign_id: bool = False, kwargs = {} ) -> cudf.DataFrame
nemo_curator.stages.deduplication.io_utils.DeduplicationIO.write_parquet( df: cudf.DataFrame, filepath: str, kwargs = {} ) -> None