stages.deduplication.io_utils

Module Contents

Classes

API

class stages.deduplication.io_utils.DeduplicationIO(id_generator: IdGenerator | None, **kwargs)

Initialization

assign_id(
    filepath: str | list[str],
    df: cudf.DataFrame,
) -> cudf.DataFrame
custom_read(
    filepath: str | list[str],
    read_func: collections.abc.Callable,
    assign_id: bool = False,
    **kwargs,
) -> cudf.DataFrame
read_jsonl(
    filepath: str | list[str],
    columns: list[str] | None = None,
    assign_id: bool = False,
    **kwargs,
) -> cudf.DataFrame
read_parquet(
    filepath: str | list[str],
    assign_id: bool = False,
    **kwargs,
) -> cudf.DataFrame
write_parquet(df: cudf.DataFrame, filepath: str, **kwargs) -> None