modules.fuzzy_dedup.connectedcomponents#

Module Contents#

Classes#

API#

class modules.fuzzy_dedup.connectedcomponents.ConnectedComponents(
cache_dir: str,
jaccard_pairs_path: str,
id_column: str = 'id',
jaccard_threshold: float = 0.8,
logger: logging.LoggerAdapter | str = './',
profile_dir: str | None = None,
)

Initialization

cc_workflow(output_path: str) → None
static thresholding(
df: cudf.DataFrame,
threshold: float,
column_to_threshold: str,
) → cudf.DataFrame