modules.semantic_dedup.embeddings#

Module Contents#

Classes#

API#

class modules.semantic_dedup.embeddings.EmbeddingConfig#
max_seq_length: int#

None

model_name_or_path: str#

None

pooling_strategy: str#

'mean_pooling'
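
A minimal construction sketch, assuming the config can be instantiated with keyword arguments for the attributes listed above; the model name and sequence length are illustrative values, not requirements.

```python
from modules.semantic_dedup.embeddings import EmbeddingConfig

# Illustrative values; any Hugging Face model name or local path may be used.
config = EmbeddingConfig(
    model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    max_seq_length=256,
    pooling_strategy="mean_pooling",  # or "last_token"
)
```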

class modules.semantic_dedup.embeddings.EmbeddingCreator(
embedding_model_name_or_path: str = 'sentence-transformers/all-MiniLM-L6-v2',
embedding_batch_size: int = 128,
embedding_output_dir: str = './embeddings',
embedding_max_mem_gb: int | None = None,
embedding_pooling_strategy: str = 'mean_pooling',
input_column: str = 'text',
embedding_column: str = 'embeddings',
write_embeddings_to_disk: bool = True,
write_to_filename: bool = False,
logger: logging.Logger | str = './',
profile_dir: str | None = None,
)#

Initialization

Initializes an EmbeddingCreator for generating embeddings using the specified model configurations.

Args:

embedding_model_name_or_path (str): Model name or path for embeddings. Default is "sentence-transformers/all-MiniLM-L6-v2".

embedding_batch_size (int): Initial batch size for processing embeddings. Default is 128.

embedding_output_dir (str): Location to save embeddings. Default is "./embeddings".

embedding_max_mem_gb (int, optional): Maximum memory usage in GB for the embedding process. If None, it defaults to the available GPU memory minus 4 GB.

embedding_pooling_strategy (str): Strategy for pooling embeddings, either "mean_pooling" or "last_token". Default is "mean_pooling".

input_column (str): Column name from the data to be used for embedding generation. Default is "text".

embedding_column (str): Column name in which the embeddings are stored. Default is "embeddings".

write_embeddings_to_disk (bool): If True, saves the embeddings to disk. We recommend setting this to False when you have a delayed pipeline, although doing so can lead to more memory overhead. Default is True.

write_to_filename (bool): If True, saves the embeddings under the same filenames as the input files. Default is False.

logger (Union[logging.Logger, str]): Existing logger to log to, or a path to a log directory. Default is "./".

profile_dir (Optional[str]): If specified, the directory in which to write the Dask profile. Default is None.
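
For orientation, a minimal construction sketch; the keyword values below simply restate the documented defaults and are illustrative rather than required, and a GPU-enabled RAPIDS/Dask environment is assumed.

```python
from modules.semantic_dedup.embeddings import EmbeddingCreator

# Illustrative values that mirror the defaults documented above.
embedding_creator = EmbeddingCreator(
    embedding_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    embedding_batch_size=128,
    embedding_output_dir="./embeddings",
    embedding_pooling_strategy="mean_pooling",
    input_column="text",
    write_embeddings_to_disk=True,
)
```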

create_embeddings(
ddf: dask_cudf.DataFrame,
input_column: str = 'text',
) → dask_cudf.DataFrame#
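
A minimal end-to-end sketch of this call, assuming a GPU-backed dask_cudf environment; the input path is hypothetical and the dataset is assumed to contain a "text" column.

```python
import dask_cudf

from modules.semantic_dedup.embeddings import EmbeddingCreator

# Hypothetical input path; any dask_cudf.DataFrame with a "text" column works.
ddf = dask_cudf.read_parquet("./data/*.parquet")

creator = EmbeddingCreator(embedding_output_dir="./embeddings")

# Returns a dask_cudf.DataFrame with the "embeddings" column appended.
embeddings_ddf = creator.create_embeddings(ddf, input_column="text")
```
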
class modules.semantic_dedup.embeddings.EmbeddingCrossFitModel(
config: modules.semantic_dedup.embeddings.EmbeddingConfig,
max_mem_gb: int | None = None,
)#

Bases: crossfit.backend.torch.hf.model.HFModel

Initialization

load_config() → transformers.AutoConfig#

load_model(
device: str = 'cuda',
) → modules.semantic_dedup.embeddings.EmbeddingPytorchModel#

load_tokenizer() → transformers.AutoTokenizer#

max_seq_length() → int#
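
A hedged sketch of instantiating the crossfit model wrapper from an EmbeddingConfig and exercising its loader methods; the config values and memory limit are illustrative assumptions, and a CUDA device is assumed for load_model.

```python
from modules.semantic_dedup.embeddings import (
    EmbeddingConfig,
    EmbeddingCrossFitModel,
)

# Illustrative config; the model name is an assumption.
config = EmbeddingConfig(
    model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    max_seq_length=256,
)

crossfit_model = EmbeddingCrossFitModel(config, max_mem_gb=16)

tokenizer = crossfit_model.load_tokenizer()              # transformers.AutoTokenizer
torch_model = crossfit_model.load_model(device="cuda")   # EmbeddingPytorchModel
```
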
class modules.semantic_dedup.embeddings.EmbeddingPytorchModel(
config: modules.semantic_dedup.embeddings.EmbeddingConfig,
)#

Bases: torch.nn.Module

Initialization

feature(
input_ids: torch.Tensor,
attention_mask: torch.Tensor,
) → torch.Tensor#

forward(batch: dict[str, torch.Tensor]) → torch.Tensor#
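
A minimal sketch of calling the model directly on a tokenized batch; the tokenizer, sample text, and the assumption that the wrapped pretrained model is loaded from the config's model_name_or_path are illustrative, not guaranteed by this reference.

```python
import torch
from transformers import AutoTokenizer

from modules.semantic_dedup.embeddings import EmbeddingConfig, EmbeddingPytorchModel

# Illustrative config and input text; assumes the wrapped model is built from the config.
config = EmbeddingConfig(
    model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    max_seq_length=256,
)
model = EmbeddingPytorchModel(config).eval()

tokenizer = AutoTokenizer.from_pretrained(config.model_name_or_path)
batch = tokenizer(
    ["An example document to embed."],
    padding=True,
    truncation=True,
    max_length=config.max_seq_length,
    return_tensors="pt",
)

with torch.no_grad():
    # forward() expects a dict containing "input_ids" and "attention_mask"
    # and returns one pooled embedding per input row.
    embeddings = model(dict(batch))
```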