modules.fuzzy_dedup.lsh

Module Contents

Classes

LSH

API

class modules.fuzzy_dedup.lsh.LSH(
cache_dir: str,
num_hashes: int,
num_buckets: int,
buckets_per_shuffle: int = 1,
false_positive_check: bool = False,
logger: logging.LoggerAdapter | str = './',
id_fields: str | list = 'id',
minhash_field: str = '_minhash_signature',
profile_dir: str | None = None,
)

Initialization

bucket_id_to_int(
bucket_ddf: dask_cudf.DataFrame,
bucket_col_name: str = 'bucket_id',
start_id: int = 0,
) -> tuple[dask_cudf.DataFrame, int]
lsh(write_path: str, df: dask_cudf.DataFrame) -> bool
minhash_to_buckets(
df: cudf.DataFrame,
bucket_ranges: list[list[int]],
) -> cudf.DataFrame