utils.fuzzy_dedup_utils.merge_utils

Module Contents

Functions

API

utils.fuzzy_dedup_utils.merge_utils.apply_bk_mapping(
    part: utils.fuzzy_dedup_utils.merge_utils.cudf | pandas.DataFrame,
    bk_map: utils.fuzzy_dedup_utils.merge_utils.cudf | pandas.DataFrame,
) -> utils.fuzzy_dedup_utils.merge_utils.cudf | pandas.Series
utils.fuzzy_dedup_utils.merge_utils.blockwise_merge(
    left: dask.dataframe.DataFrame,
    right: dask.dataframe.DataFrame,
    on: str,
    how: str = 'inner',
) -> dask.dataframe.DataFrame
utils.fuzzy_dedup_utils.merge_utils.extract_partitioning_index(
    left_df: dask.dataframe.DataFrame,
    merge_on: str,
    bk_mapping: dask.dataframe.DataFrame,
    parts_per_bucket_batch: int,
    total_bucket_partitions: int,
) -> tuple[dask.dataframe.DataFrame, dask.dataframe.Series]
utils.fuzzy_dedup_utils.merge_utils.filter_text_rows_by_bucket_batch(
    left_df: dask.dataframe.DataFrame,
    global_partitioning_index: dask.dataframe.Series,
    bucket_part_offset: int,
    bucket_part_end_offset: int,
    total_bucket_partitions: int,
) -> dask.dataframe.DataFrame
utils.fuzzy_dedup_utils.merge_utils.merge_left_to_shuffled_right(
    subset_text_df: dask.dataframe.DataFrame,
    subset_bucket_df: dask.dataframe.DataFrame,
    merge_on: str,
) -> dask.dataframe.DataFrame