modules.fuzzy_dedup.jaccardsimilarity

Module Contents

Classes

API

class modules.fuzzy_dedup.jaccardsimilarity.JaccardSimilarity(
id_field: str = 'id',
anchor_id_fields: list[str] | None = None,
text_field: str = 'text',
ngram_width: int = 5,
)

Initialization

jaccard_compute(shuffled_docs_path: str) → cudf.DataFrame