modules.config#

Module Contents#

Classes#

API#

class modules.config.BaseConfig#
classmethod from_yaml(file_path: str) → modules.config.BaseConfig#
class modules.config.FuzzyDuplicatesConfig#

Bases: modules.config.BaseConfig

bucket_mapping_blocksize: int | None#

None

bucket_parts_per_worker: int | None#

None

buckets_per_shuffle: int#

1

cache_dir: str#

None

char_ngrams: int#

24

false_positive_check: bool#

False

hashes_per_bucket: int#

13

id_field: str#

'id'

jaccard_threshold: float | None#

None

num_anchors: int | None#

None

num_buckets: int#

20

parts_per_worker: int | None#

None

perform_removal: bool#

False

profile_dir: str | None#

None

seed: int#

42

text_field: str#

'text'

use_64_bit_hash: bool#

False

class modules.config.SemDedupConfig#

Bases: modules.config.BaseConfig

batched_cosine_similarity: bool | int#

1024

cache_dir: str#

None

clustering_input_partition_size: str#

'2gb'

clustering_save_loc: str#

'clustering_results'

embedding_batch_size: int#

128

embedding_column: str#

'embeddings'

embedding_max_mem_gb: int | None#

None

embedding_model_name_or_path: str#

'sentence-transformers/all-MiniLM-L6-v2'

embedding_pooling_strategy: str#

'mean_pooling'

embeddings_save_loc: str#

'embeddings'

eps_to_extract: float#

0.01

max_iter: int#

100

n_clusters: int#

1000

num_files: int#

None

profile_dir: str | None#

None

random_state: int#

1234

sim_metric: Literal['cosine', 'l2']#

'cosine'

which_to_keep: Literal['hard', 'easy', 'random']#

'hard'

write_embeddings_to_disk: bool#

True

write_to_filename: bool#

False