modules.semantic_dedup.embeddings#

Module Contents#

Classes#

API#

class modules.semantic_dedup.embeddings.EmbeddingConfig#
max_seq_length: int#

None

model_name_or_path: str#

None

pooling_strategy: str#

'mean_pooling'

class modules.semantic_dedup.embeddings.EmbeddingCreator(
embedding_model_name_or_path: str = 'sentence-transformers/all-MiniLM-L6-v2',
embedding_batch_size: int = 128,
embedding_output_dir: str = './embeddings',
embedding_max_mem_gb: int | None = None,
embedding_pooling_strategy: str = 'mean_pooling',
input_column: str = 'text',
embedding_column: str = 'embeddings',
write_embeddings_to_disk: bool = True,
write_to_filename: bool = False,
logger: logging.Logger | str = './',
profile_dir: str | None = None,
)#

Initialization

create_embeddings(
ddf: dask_cudf.DataFrame,
input_column: str = 'text',
) → dask_cudf.DataFrame#
class modules.semantic_dedup.embeddings.EmbeddingCrossFitModel(
config: modules.semantic_dedup.embeddings.EmbeddingConfig,
max_mem_gb: int | None = None,
)#

Bases: crossfit.backend.torch.hf.model.HFModel

Initialization

load_config() → transformers.AutoConfig#
load_model(
device: str = 'cuda',
) → modules.semantic_dedup.embeddings.EmbeddingPytorchModel#
load_tokenizer() → transformers.AutoTokenizer#
max_seq_length() → int#
class modules.semantic_dedup.embeddings.EmbeddingPytorchModel(
config: modules.semantic_dedup.embeddings.EmbeddingConfig,
)#

Bases: torch.nn.Module

Initialization

feature(
input_ids: torch.Tensor,
attention_mask: torch.Tensor,
) → torch.Tensor#
forward(batch: dict[str, torch.Tensor]) → torch.Tensor#