modules.config#
Module Contents#
Classes#
API#
- class modules.config.BaseConfig#
- classmethod from_yaml(file_path: str) → modules.config.BaseConfig#
- class modules.config.FuzzyDuplicatesConfig#
Bases:
modules.config.BaseConfig
- bucket_mapping_blocksize: int | None#
None
- bucket_parts_per_worker: int | None#
None
- buckets_per_shuffle: int#
1
- cache_dir: str#
None
- char_ngrams: int#
24
- false_positive_check: bool#
False
- hashes_per_bucket: int#
13
- id_field: str#
'id'
- jaccard_threshold: float | None#
None
- num_anchors: int | None#
None
- num_buckets: int#
20
- parts_per_worker: int | None#
None
- perform_removal: bool#
False
- profile_dir: str | None#
None
- seed: int#
42
- text_field: str#
'text'
- use_64_bit_hash: bool#
False
- class modules.config.SemDedupConfig#
Bases:
modules.config.BaseConfig
- batched_cosine_similarity: bool | int#
1024
- cache_dir: str#
None
- clustering_input_partition_size: str#
'2gb'
- clustering_save_loc: str#
'clustering_results'
- embedding_batch_size: int#
128
- embedding_column: str#
'embeddings'
- embedding_max_mem_gb: int | None#
None
- embedding_model_name_or_path: str#
'sentence-transformers/all-MiniLM-L6-v2'
- embedding_pooling_strategy: str#
'mean_pooling'
- embeddings_save_loc: str#
'embeddings'
- eps_to_extract: float#
0.01
- max_iter: int#
100
- n_clusters: int#
1000
- num_files: int#
None
- profile_dir: str | None#
None
- random_state: int#
1234
- sim_metric: Literal['cosine', 'l2']#
'cosine'
- which_to_keep: Literal['hard', 'easy', 'random']#
'hard'
- write_embeddings_to_disk: bool#
True
- write_to_filename: bool#
False