stages.deduplication.io_utils
Module Contents
Classes
API
- class stages.deduplication.io_utils.DeduplicationIO(id_generator: IdGenerator | None, **kwargs)
Initialization
- assign_id(
- filepath: str | list[str],
- df: cudf.DataFrame,
- )
- custom_read(
- filepath: str | list[str],
- read_func: collections.abc.Callable,
- assign_id: bool = False,
- **kwargs,
- )
- read_jsonl(
- filepath: str | list[str],
- columns: list[str] | None = None,
- assign_id: bool = False,
- **kwargs,
- )
- read_parquet(
- filepath: str | list[str],
- assign_id: bool = False,
- **kwargs,
- )
- write_parquet(df: cudf.DataFrame, filepath: str, **kwargs) -> None