modules.fuzzy_dedup.minhash#

Module Contents#

Classes#

MinHash

Computes minhash signatures of a document corpus

Data#

API#

modules.fuzzy_dedup.minhash.BIT_WIDTH_32#

32

modules.fuzzy_dedup.minhash.BIT_WIDTH_64#

64

class modules.fuzzy_dedup.minhash.MinHash(
seed: int = 42,
num_hashes: int = 260,
char_ngrams: int = 24,
use_64bit_hash: bool = False,
logger: logging.LoggerAdapter | str = './',
id_field: str = 'id',
text_field: str = 'text',
profile_dir: str | None = None,
cache_dir: str | None = None,
)#

Computes minhash signatures of a document corpus

Initialization

Parameters

- seed: Seed for minhash permutations.
- num_hashes: Length of minhash signature (number of minhash permutations).
- char_ngrams: Width of the text window (in characters) used while computing minhashes.
- use_64bit_hash: Whether to use a 64-bit hash function.
- logger: Existing logger to log to, or a path to a log directory.
- id_field: Column in the Dataset denoting document ID.
- text_field: Column in the Dataset denoting document content.
- profile_dir: str, default None. If specified, the directory to write the Dask profile to.
- cache_dir: str, default None. If specified, will compute and write (id, minhash) pairs to this directory.

generate_hash_permutation_seeds(
bit_width: int,
n_permutations: int = 260,
seed: int = 0,
) -> numpy.ndarray#

Generate seeds for all minhash permutations based on the given seed.

minhash32(
ser: cudf.Series,
seeds: numpy.ndarray,
char_ngram: int,
) -> cudf.Series#

Compute 32-bit minhashes based on the MurmurHash3 algorithm.

minhash64(
ser: cudf.Series,
seeds: numpy.ndarray,
char_ngram: int,
) -> cudf.Series#

Compute 64-bit minhashes based on the MurmurHash3 algorithm.