modules.semantic_dedup.embeddings#
Module Contents#
Classes#
API#
- class modules.semantic_dedup.embeddings.EmbeddingConfig#
- max_seq_length: int#
None
- model_name_or_path: str#
None
- pooling_strategy: str#
'mean_pooling'
- class modules.semantic_dedup.embeddings.EmbeddingCreator(
- embedding_model_name_or_path: str = 'sentence-transformers/all-MiniLM-L6-v2',
- embedding_batch_size: int = 128,
- embedding_output_dir: str = './embeddings',
- embedding_max_mem_gb: int | None = None,
- embedding_pooling_strategy: str = 'mean_pooling',
- input_column: str = 'text',
- embedding_column: str = 'embeddings',
- write_embeddings_to_disk: bool = True,
- write_to_filename: bool = False,
- logger: logging.Logger | str = './',
- profile_dir: str | None = None,
Initialization
- create_embeddings(
- ddf: dask_cudf.DataFrame,
- input_column: str = 'text',
- class modules.semantic_dedup.embeddings.EmbeddingCrossFitModel(
- config: modules.semantic_dedup.embeddings.EmbeddingConfig,
- max_mem_gb: int | None = None,
Bases:
crossfit.backend.torch.hf.model.HFModel
Initialization
- load_config() transformers.AutoConfig#
- load_model(
- device: str = 'cuda',
- load_tokenizer() transformers.AutoTokenizer#
- max_seq_length() int#