utils.fuzzy_dedup_utils.io_utils

Module Contents

Functions

aggregated_anchor_docs_with_bk_read

check_empty_buckets

Inspects the Parquet metadata of the buckets dataset to check whether it is empty.

chunk_files

Chunks files into lists of files whose combined size is less than max_size_mb.

get_bucket_ddf_from_parquet_path

get_file_size

get_frag_size

get_restart_offsets

get_text_ddf_from_json_path_with_blocksize

strip_trailing_sep

Strips a path string of trailing path separators such as /, if any.

update_restart_offsets

API

utils.fuzzy_dedup_utils.io_utils.aggregated_anchor_docs_with_bk_read(
    path: str,
    blocksize: int,
) → dask.dataframe.DataFrame
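A minimal usage sketch. The import assumes the module is importable under the dotted path shown in these signatures (an installed package may use a different top-level prefix); the data path is hypothetical, and the unit of blocksize (bytes vs. megabytes) is an assumption.

```python
from utils.fuzzy_dedup_utils.io_utils import aggregated_anchor_docs_with_bk_read

# Hypothetical location of the aggregated anchor-docs dataset.
anchor_docs_ddf = aggregated_anchor_docs_with_bk_read(
    path="/data/anchor_docs_with_bk",
    blocksize=256_000_000,  # assumption: blocksize is in bytes
)
print(anchor_docs_ddf.npartitions)
```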
utils.fuzzy_dedup_utils.io_utils.check_empty_buckets(bucket_path: str) → bool

Inspects the Parquet metadata of the buckets dataset to check whether it is empty.
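A short sketch of guarding a pipeline on this check; the bucket path is hypothetical and the import path follows the dotted names above.

```python
from utils.fuzzy_dedup_utils.io_utils import check_empty_buckets

if check_empty_buckets("/data/buckets.parquet"):  # hypothetical path
    print("Buckets dataset is empty; skipping deduplication.")
```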

utils.fuzzy_dedup_utils.io_utils.chunk_files(
    file_list: list[str | pyarrow.dataset.Fragment],
    max_size_mb: int,
) → list[list[str]]

Chunks files into lists of files whose combined size is less than max_size_mb.
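A sketch of batching work with chunk_files; the file names are hypothetical, and the reading that each returned sublist's cumulative size stays under the budget follows the docstring above.

```python
from utils.fuzzy_dedup_utils.io_utils import chunk_files

files = ["part-000.parquet", "part-001.parquet", "part-002.parquet"]
for chunk in chunk_files(files, max_size_mb=512):
    # Each chunk's combined size should stay below 512 MB (assumption).
    print(len(chunk), "files in this chunk")
```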

utils.fuzzy_dedup_utils.io_utils.get_bucket_ddf_from_parquet_path(
    input_bucket_path: str,
    num_workers: int,
) → dask.dataframe.DataFrame
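A minimal sketch with a hypothetical path; that num_workers should match the Dask cluster's worker count is an assumption.

```python
from utils.fuzzy_dedup_utils.io_utils import get_bucket_ddf_from_parquet_path

buckets_ddf = get_bucket_ddf_from_parquet_path(
    input_bucket_path="/data/buckets.parquet",  # hypothetical path
    num_workers=4,  # assumption: sized to the Dask cluster
)
```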
utils.fuzzy_dedup_utils.io_utils.get_file_size(file_path: str) → int
utils.fuzzy_dedup_utils.io_utils.get_frag_size(frag: pyarrow.dataset.Fragment) → int
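A sketch of the two size helpers; the paths are hypothetical, and the assumption is that both return sizes in bytes.

```python
import pyarrow.dataset as ds

from utils.fuzzy_dedup_utils.io_utils import get_file_size, get_frag_size

file_bytes = get_file_size("/data/part-000.parquet")  # hypothetical path

# Build a pyarrow dataset and size each of its fragments.
dataset = ds.dataset("/data/buckets.parquet", format="parquet")  # hypothetical
frag_bytes = [get_frag_size(frag) for frag in dataset.get_fragments()]
```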
utils.fuzzy_dedup_utils.io_utils.get_restart_offsets(output_path: str) → tuple[int, int]
utils.fuzzy_dedup_utils.io_utils.get_text_ddf_from_json_path_with_blocksize(
    input_data_paths: list[str],
    num_files: int,
    blocksize: int,
    id_column: str,
    text_column: str,
    input_meta: dict[str, str] | None = None,
) → dask.dataframe.DataFrame
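A sketch with hypothetical inputs; that num_files=-1 means "read all files" and that blocksize is in megabytes are assumptions, as is the input_meta schema shown.

```python
from utils.fuzzy_dedup_utils.io_utils import (
    get_text_ddf_from_json_path_with_blocksize,
)

text_ddf = get_text_ddf_from_json_path_with_blocksize(
    input_data_paths=["/data/jsonl"],  # hypothetical directory of JSONL files
    num_files=-1,                      # assumption: -1 reads all files
    blocksize=256,                     # assumption: size in MB
    id_column="id",
    text_column="text",
    input_meta={"id": "str", "text": "str"},  # hypothetical schema
)
```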
utils.fuzzy_dedup_utils.io_utils.strip_trailing_sep(path: str) → str

Strips a path string of trailing path separators such as /, if any.
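A quick illustration of the behavior described in the docstring above; the paths are hypothetical.

```python
from utils.fuzzy_dedup_utils.io_utils import strip_trailing_sep

print(strip_trailing_sep("/data/buckets/"))  # "/data/buckets"
print(strip_trailing_sep("/data/buckets"))   # unchanged: "/data/buckets"
```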

utils.fuzzy_dedup_utils.io_utils.update_restart_offsets(
    output_path: str,
    bucket_offset: int,
    text_offset: int,
) → None
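A checkpoint round-trip sketch covering get_restart_offsets together with update_restart_offsets; the output path is hypothetical, and the assumption that a fresh run yields (0, 0) offsets is not confirmed by these signatures.

```python
from utils.fuzzy_dedup_utils.io_utils import (
    get_restart_offsets,
    update_restart_offsets,
)

output_path = "/data/dedup_output"  # hypothetical

# Resume from the last recorded position (assumption: (0, 0) on a fresh run).
bucket_offset, text_offset = get_restart_offsets(output_path)

# ... process the next slice of bucket and text partitions here ...

# Record progress so an interrupted job can restart from this point.
update_restart_offsets(output_path, bucket_offset + 1, text_offset + 1)
```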