modules.semantic_dedup.semanticclusterleveldedup
Module Contents
Classes
API
- class modules.semantic_dedup.semanticclusterleveldedup.SemanticClusterLevelDedup(
- n_clusters: int = 1000,
- emb_by_clust_dir: str = './clustering_results/embs_by_nearest_center',
- id_column: str = 'id',
- which_to_keep: str = 'hard',
- sim_metric: Literal['cosine', 'l2'] = 'cosine',
- output_dir: str = './clustering_results',
- embedding_column: str = 'embeddings',
- batched_cosine_similarity: int = 1024,
- logger: logging.Logger | str = './',
- profile_dir: str | None = None,
- )
Initialization
- compute_semantic_match_dfs() → None
- extract_dedup_data(
- eps_to_extract: float,