nemo_curator.stages.text.embedders.base

View as Markdown

Module Contents

Classes

| Name | Description |
|---|---|
| EmbeddingCreatorStage | - |
| EmbeddingModelStage | HuggingFace model stage that produces embeddings with pooling. |
| SentenceTransformerEmbeddingModelStage | - |

API

class nemo_curator.stages.text.embedders.base.EmbeddingCreatorStage(
model_identifier: str = 'sentence-transformers/all-MiniLM-L6-v2',
use_sentence_transformer: bool = True,
text_field: str = 'text',
embedding_field: str = 'embeddings',
cache_dir: str | None = None,
max_chars: int | None = None,
max_seq_length: int | None = None,
padding_side: typing.Literal['left', 'right'] = 'right',
embedding_pooling: typing.Literal['mean_pooling', 'last_token'] = 'mean_pooling',
model_inference_batch_size: int = 1024,
autocast: bool = True,
sort_by_length: bool = True,
hf_token: str | None = None
)
Dataclass

Bases: CompositeStage[DocumentBatch, DocumentBatch]

autocast
bool = True
cache_dir
str | None = None
embedding_field
str = 'embeddings'
embedding_pooling
Literal['mean_pooling', 'last_token'] = 'mean_pooling'
hf_token
str | None = None
max_chars
int | None = None
max_seq_length
int | None = None
model_identifier
str = 'sentence-transformers/all-MiniLM-L6-v2'
model_inference_batch_size
int = 1024
padding_side
Literal['left', 'right'] = 'right'
sort_by_length
bool = True
text_field
str = 'text'
use_sentence_transformer
bool = True
nemo_curator.stages.text.embedders.base.EmbeddingCreatorStage.__post_init__() -> None
nemo_curator.stages.text.embedders.base.EmbeddingCreatorStage.decompose() -> list[nemo_curator.stages.base.ProcessingStage]
class nemo_curator.stages.text.embedders.base.EmbeddingModelStage(
model_identifier: str,
cache_dir: str | None = None,
embedding_field: str = 'embeddings',
pooling: typing.Literal['mean_pooling', 'last_token'] = 'mean_pooling',
hf_token: str | None = None,
model_inference_batch_size: int = 1024,
has_seq_order: bool = True,
padding_side: typing.Literal['left', 'right'] = 'right',
autocast: bool = True
)

Bases: ModelStage

HuggingFace model stage that produces embeddings with pooling.

nemo_curator.stages.text.embedders.base.EmbeddingModelStage._get_last_token(
model_output: torch.Tensor,
attention_mask: torch.Tensor
) -> torch.Tensor
nemo_curator.stages.text.embedders.base.EmbeddingModelStage._mean_pooling(
model_output: torch.Tensor,
attention_mask: torch.Tensor
) -> torch.Tensor
nemo_curator.stages.text.embedders.base.EmbeddingModelStage.collect_outputs(
processed_outputs: list[torch.Tensor]
) -> list[list[float]]
nemo_curator.stages.text.embedders.base.EmbeddingModelStage.create_output_dataframe(
df_cpu: pandas.DataFrame,
collected_output: list[list[float]]
) -> pandas.DataFrame

Create output dataframe with embeddings.

nemo_curator.stages.text.embedders.base.EmbeddingModelStage.outputs() -> tuple[list[str], list[str]]
nemo_curator.stages.text.embedders.base.EmbeddingModelStage.process_model_output(
outputs: torch.Tensor,
model_input_batch: dict[str, torch.Tensor] | None = None
) -> torch.Tensor

Process model outputs to create embeddings.

nemo_curator.stages.text.embedders.base.EmbeddingModelStage.setup(
_: nemo_curator.backends.base.WorkerMetadata | None = None
) -> None

Load the model for inference.

class nemo_curator.stages.text.embedders.base.SentenceTransformerEmbeddingModelStage(
model_identifier: str,
cache_dir: str | None = None,
embedding_field: str = 'embeddings',
hf_token: str | None = None,
model_inference_batch_size: int = 1024,
has_seq_order: bool = True,
padding_side: typing.Literal['left', 'right'] = 'right',
autocast: bool = True
)

Bases: EmbeddingModelStage

nemo_curator.stages.text.embedders.base.SentenceTransformerEmbeddingModelStage.outputs() -> tuple[list[str], list[str]]
nemo_curator.stages.text.embedders.base.SentenceTransformerEmbeddingModelStage.process_model_output(
outputs: torch.Tensor,
model_input_batch: dict[str, torch.Tensor] | None = None
) -> torch.Tensor
nemo_curator.stages.text.embedders.base.SentenceTransformerEmbeddingModelStage.setup(
_: nemo_curator.backends.base.WorkerMetadata | None = None
) -> None

Load the model for inference.