modules.fuzzy_dedup.minhash

Module Contents

Classes

Data

API

modules.fuzzy_dedup.minhash.BIT_WIDTH_32

32

modules.fuzzy_dedup.minhash.BIT_WIDTH_64

64

class modules.fuzzy_dedup.minhash.MinHash(
seed: int = 42,
num_hashes: int = 260,
char_ngrams: int = 24,
use_64bit_hash: bool = False,
logger: logging.LoggerAdapter | str = './',
id_field: str = 'id',
text_field: str = 'text',
profile_dir: str | None = None,
cache_dir: str | None = None,
)

Initialization

generate_hash_permutation_seeds(
bit_width: int,
n_permutations: int = 260,
seed: int = 0,
) -> numpy.ndarray
minhash32(
ser: cudf.Series,
seeds: numpy.ndarray,
char_ngram: int,
) -> cudf.Series
minhash64(
ser: cudf.Series,
seeds: numpy.ndarray,
char_ngram: int,
) -> cudf.Series