modules.fuzzy_dedup.minhash#
Module Contents#
Classes#
Data#
API#
- modules.fuzzy_dedup.minhash.BIT_WIDTH_32#
32
- modules.fuzzy_dedup.minhash.BIT_WIDTH_64#
64
- class modules.fuzzy_dedup.minhash.MinHash(
- seed: int = 42,
- num_hashes: int = 260,
- char_ngrams: int = 24,
- use_64bit_hash: bool = False,
- logger: logging.LoggerAdapter | str = './',
- id_field: str = 'id',
- text_field: str = 'text',
- profile_dir: str | None = None,
- cache_dir: str | None = None,
- )
Initialization
- generate_hash_permutation_seeds(
- bit_width: int,
- n_permutations: int = 260,
- seed: int = 0,
- )
- minhash32(
- ser: cudf.Series,
- seeds: numpy.ndarray,
- char_ngram: int,
- )
- minhash64(
- ser: cudf.Series,
- seeds: numpy.ndarray,
- char_ngram: int,
- )