modules.semantic_dedup.semanticclusterleveldedup#

Module Contents#

Classes#

API#

class modules.semantic_dedup.semanticclusterleveldedup.SemanticClusterLevelDedup(
n_clusters: int = 1000,
emb_by_clust_dir: str = './clustering_results/embs_by_nearest_center',
id_column: str = 'id',
which_to_keep: str = 'hard',
sim_metric: Literal['cosine', 'l2'] = 'cosine',
output_dir: str = './clustering_results',
embedding_column: str = 'embeddings',
batched_cosine_similarity: int = 1024,
logger: logging.Logger | str = './',
profile_dir: str | None = None,
)#

Initialization

compute_semantic_match_dfs() → None#
extract_dedup_data(
eps_to_extract: float,
) → nemo_curator.datasets.DocumentDataset#