modules.semantic_dedup.clusteringmodel

Module Contents

Classes

API

class modules.semantic_dedup.clusteringmodel.ClusteringModel(
id_column: str = 'id',
max_iter: int = 100,
n_clusters: int = 1000,
clustering_output_dir: str = './clustering_results',
embedding_column: str = 'embeddings',
random_state: int = 1234,
clustering_input_partition_size: str | None = '2gb',
logger: logging.Logger | str = './',
profile_dir: str | None = None,
keep_all_columns: bool = False,
)

Initialization