modules.fuzzy_dedup.lsh
Module Contents
Classes
API
- class modules.fuzzy_dedup.lsh.LSH(
- cache_dir: str,
- num_hashes: int,
- num_buckets: int,
- buckets_per_shuffle: int = 1,
- false_positive_check: bool = False,
- logger: logging.LoggerAdapter | str = './',
- id_fields: str | list = 'id',
- minhash_field: str = '_minhash_signature',
- profile_dir: str | None = None,
- )
Initialization
- bucket_id_to_int(
- bucket_ddf: dask_cudf.DataFrame,
- bucket_col_name: str = 'bucket_id',
- start_id: int = 0,
- )
- lsh(write_path: str, df: dask_cudf.DataFrame) -> bool
- minhash_to_buckets(
- df: cudf.DataFrame,
- bucket_ranges: list[list[int]],