Source code for nemo_retriever.model

# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from nemo_retriever.model.model import BaseModel

# Canonical Hugging Face repo IDs of the vision-language (VL) models.
VL_EMBED_MODEL = "nvidia/llama-nemotron-embed-vl-1b-v2"
VL_RERANK_MODEL = "nvidia/llama-nemotron-rerank-vl-1b-v2"

# Every identifier that is treated as "the VL embedding model".
_VL_EMBED_MODEL_IDS = frozenset(
    (
        VL_EMBED_MODEL,
        "llama-nemotron-embed-vl-1b-v2",
        "llama-3.2-nemoretriever-1b-vlm-embed-v1",
        "nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1",
    )
)

# Every identifier that is treated as "the VL reranker model".
_VL_RERANK_MODEL_IDS = frozenset(
    (
        VL_RERANK_MODEL,
        "llama-nemotron-rerank-vl-1b-v2",
    )
)

# Accepted alias -> repo ID that is substituted for it.
# The three VL aliases all map to the same canonical VL embedding repo.
_EMBED_MODEL_ALIASES: dict[str, str] = {
    "nemo_retriever_v1": "nvidia/llama-nemotron-embed-1b-v2",
    **dict.fromkeys(
        (
            "llama-nemotron-embed-vl-1b-v2",
            "llama-3.2-nemoretriever-1b-vlm-embed-v1",
            "nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1",
        ),
        VL_EMBED_MODEL,
    ),
}

# Model chosen when the caller supplies no model name.
_DEFAULT_EMBED_MODEL = VL_EMBED_MODEL


def resolve_embed_model(model_name: str | None) -> str:
    """Map a model name or alias to the repo ID that should be loaded.

    ``None`` and the empty string select ``_DEFAULT_EMBED_MODEL``. A name that
    appears as a key in ``_EMBED_MODEL_ALIASES`` is replaced by its mapped
    value; any other name is returned unchanged.
    """
    if model_name:
        return _EMBED_MODEL_ALIASES.get(model_name, model_name)
    return _DEFAULT_EMBED_MODEL
def is_vl_embed_model(model_name: str | None) -> bool:
    """Report whether *model_name* designates the VL embedding model.

    The name is first passed through ``resolve_embed_model`` (so aliases and
    the no-name default are handled there) and the resolved ID is then looked
    up in ``_VL_EMBED_MODEL_IDS``.
    """
    resolved_id = resolve_embed_model(model_name)
    return resolved_id in _VL_EMBED_MODEL_IDS
def is_vl_rerank_model(model_name: str | None) -> bool:
    """Report whether *model_name* designates the VL reranker model.

    This is an exact membership test against ``_VL_RERANK_MODEL_IDS``; no
    alias resolution or normalization is applied. A falsy *model_name* is
    treated as the empty string and therefore never matches.
    """
    candidate = model_name if model_name else ""
    return candidate in _VL_RERANK_MODEL_IDS
def create_local_embedder(
    model_name: str | None = None,
    *,
    backend: str = "vllm",
    device: str | None = None,
    hf_cache_dir: str | None = None,
    gpu_memory_utilization: float = 0.45,
    enforce_eager: bool = False,
    dimensions: int | None = None,
    normalize: bool = True,
    max_length: int = 8192,
    query_max_length: int = 128,
) -> Any:
    """Build the local embedding model matching *model_name* and *backend*.

    *model_name* is resolved with ``resolve_embed_model`` and classified with
    ``is_vl_embed_model``. *backend* is stripped and lowercased (a falsy value
    means ``"vllm"``) and must then be ``"vllm"`` or ``"hf"``.

    Dispatch and the options each constructor receives:

    - VL model, ``"hf"``: ``LlamaNemotronEmbedVL1BV2Embedder`` with
      ``device``, ``hf_cache_dir`` and ``model_id``.
    - VL model, ``"vllm"``: ``LlamaNemotronEmbedVL1BV2VLLMEmbedder`` with
      ``model_id``, ``device``, ``hf_cache_dir``, ``gpu_memory_utilization``
      and ``enforce_eager``.
    - non-VL model, ``"hf"``: ``LlamaNemotronEmbed1BV2HFEmbedder`` with
      ``device``, ``hf_cache_dir``, ``normalize``, ``max_length``,
      ``query_max_length`` and ``model_id``.
    - non-VL model, ``"vllm"``: ``LlamaNemotronEmbed1BV2Embedder`` with
      ``model_id``, ``hf_cache_dir``, ``device``, ``gpu_memory_utilization``,
      ``enforce_eager``, ``dimensions``, ``normalize`` and ``max_length``.

    Options not listed for a branch are not forwarded to that constructor.
    ``max_length`` and ``query_max_length`` are coerced with ``int()`` where
    they are forwarded.

    ``device`` is forwarded on every path. Earlier notes for this function
    state that the vLLM embedders treat it as deprecated, ignore it and emit
    ``DeprecationWarning``; that behavior lives in the embedder classes and is
    not visible from this function.

    Raises:
        ValueError: If *backend* is neither ``"vllm"`` nor ``"hf"`` after
            normalization.
    """
    backend_name = (backend or "vllm").strip().lower()
    if backend_name not in {"hf", "vllm"}:
        raise ValueError(f"backend must be 'vllm' or 'hf', got {backend!r}")

    model_id = resolve_embed_model(model_name)
    is_vl = is_vl_embed_model(model_name)
    use_hf = backend_name == "hf"

    # Embedder modules are imported lazily inside the branch that needs them,
    # so only the selected implementation is imported.
    if is_vl and use_hf:
        from nemo_retriever.model.local.llama_nemotron_embed_vl_1b_v2_embedder import (
            LlamaNemotronEmbedVL1BV2Embedder,
        )

        return LlamaNemotronEmbedVL1BV2Embedder(
            device=device,
            hf_cache_dir=hf_cache_dir,
            model_id=model_id,
        )

    if is_vl:
        from nemo_retriever.model.local.llama_nemotron_embed_vl_1b_v2_embedder import (
            LlamaNemotronEmbedVL1BV2VLLMEmbedder,
        )

        return LlamaNemotronEmbedVL1BV2VLLMEmbedder(
            model_id=model_id,
            device=device,
            hf_cache_dir=hf_cache_dir,
            gpu_memory_utilization=gpu_memory_utilization,
            enforce_eager=enforce_eager,
        )

    if use_hf:
        from nemo_retriever.model.local.llama_nemotron_embed_1b_v2_hf_embedder import (
            LlamaNemotronEmbed1BV2HFEmbedder,
        )

        return LlamaNemotronEmbed1BV2HFEmbedder(
            device=device,
            hf_cache_dir=hf_cache_dir,
            normalize=normalize,
            max_length=int(max_length),
            query_max_length=int(query_max_length),
            model_id=model_id,
        )

    from nemo_retriever.model.local.llama_nemotron_embed_1b_v2_embedder import (
        LlamaNemotronEmbed1BV2Embedder,
    )

    return LlamaNemotronEmbed1BV2Embedder(
        model_id=model_id,
        hf_cache_dir=hf_cache_dir,
        device=device,
        gpu_memory_utilization=gpu_memory_utilization,
        enforce_eager=enforce_eager,
        dimensions=dimensions,
        normalize=normalize,
        max_length=int(max_length),
    )
# Backend names accepted by the local factories. The three sets currently hold
# the same values but are kept separate so each can be changed independently.
_LOCAL_QUERY_BACKENDS = frozenset(("hf", "vllm"))
_LOCAL_RERANKER_BACKENDS = frozenset(("hf", "vllm"))
_LOCAL_INGEST_EMBED_BACKENDS = frozenset(("hf", "vllm"))
def normalize_backend(value: str | None, valid: frozenset[str], *, field_name: str, default: str) -> str:
    """Return *value* in canonical form after checking it against *valid*.

    A falsy *value* (``None`` or ``""``) is replaced by *default*. The chosen
    string is then stripped of surrounding whitespace and lowercased, and the
    result must be a member of *valid*.

    Raises:
        ValueError: If the normalized string is not in *valid*. The message
            names *field_name*, lists the allowed values in sorted order, and
            shows the original *value* as passed by the caller.
    """
    chosen = value if value else default
    normalized = chosen.strip().lower()
    if normalized in valid:
        return normalized
    raise ValueError(f"{field_name} must be one of {sorted(valid)}; got {value!r}")
def create_local_query_embedder(
    model_name: str | None = None,
    *,
    backend: str = "hf",
    device: str | None = None,
    hf_cache_dir: str | None = None,
    gpu_memory_utilization: float = 0.45,
    enforce_eager: bool = False,
    dimensions: int | None = None,
    normalize: bool = True,
    max_length: int = 8192,
    query_max_length: int = 128,
) -> Any:
    """Build a local embedder intended for *query* vectors (Retriever / recall).

    This is a thin wrapper around ``create_local_embedder`` whose only
    difference is the default backend: ``"hf"`` here instead of ``"vllm"``.

    *backend* is validated with ``normalize_backend`` against
    ``_LOCAL_QUERY_BACKENDS`` (a falsy value means ``"hf"``). All remaining
    options are forwarded unchanged, except that ``max_length`` and
    ``query_max_length`` are coerced with ``int()``.

    Raises:
        ValueError: If *backend* is not an accepted query backend.
    """
    backend_name = normalize_backend(backend, _LOCAL_QUERY_BACKENDS, field_name="backend", default="hf")

    # Collected after backend validation so an invalid backend is reported
    # before any int() coercion can fail.
    embedder_options = {
        "backend": backend_name,
        "device": device,
        "hf_cache_dir": hf_cache_dir,
        "gpu_memory_utilization": gpu_memory_utilization,
        "enforce_eager": enforce_eager,
        "dimensions": dimensions,
        "normalize": normalize,
        "max_length": int(max_length),
        "query_max_length": int(query_max_length),
    }
    return create_local_embedder(model_name, **embedder_options)
def create_local_reranker(
    model_name: str | None = None,
    *,
    device: str | None = None,
    hf_cache_dir: str | None = None,
    backend: str = "vllm",
    gpu_memory_utilization: float = 0.5,
) -> "BaseModel":
    """Create the appropriate local reranker model (VL or text-only).

    When *model_name* matches a VL reranker ID (see ``is_vl_rerank_model``),
    dispatches on *backend* to ``NemotronRerankVLV2VLLM`` (``"vllm"``, the
    default) or ``NemotronRerankVLV2`` (``"hf"``). Any other *model_name*
    returns the text-only ``NemotronRerankV2``.

    On the VL path the constructor always receives the canonical repo ID
    ``VL_RERANK_MODEL``, even when the caller passed the short alias without
    the ``nvidia/`` namespace. This mirrors how the embedding factories
    resolve aliases before constructing a model.

    Parameters
    ----------
    model_name:
        Reranker name or VL alias. On the text-only path a falsy value falls
        back to ``"nvidia/llama-nemotron-rerank-1b-v2"``; any other value is
        forwarded unchanged.
    device:
        Forwarded unchanged to whichever reranker is constructed.
    hf_cache_dir:
        Forwarded unchanged to whichever reranker is constructed.
    backend:
        ``"vllm"`` (default) or ``"hf"``; stripped and lowercased before use.
        It is validated on every call but only selects an implementation on
        the VL path; the text-only path always returns ``NemotronRerankV2``.
    gpu_memory_utilization:
        Forwarded only to ``NemotronRerankVLV2VLLM`` (VL model with the
        ``"vllm"`` backend).

    Raises
    ------
    ValueError
        If *backend* is not one of the accepted reranker backends.
    """
    b = normalize_backend(backend, _LOCAL_RERANKER_BACKENDS, field_name="backend", default="vllm")

    if not is_vl_rerank_model(model_name):
        from nemo_retriever.model.local.nemotron_rerank_v2 import NemotronRerankV2

        return NemotronRerankV2(
            model_name=model_name or "nvidia/llama-nemotron-rerank-1b-v2",
            device=device,
            hf_cache_dir=hf_cache_dir,
        )

    # Every entry of _VL_RERANK_MODEL_IDS names the same model, so hand the
    # constructor the full repo ID rather than a namespace-less alias that a
    # model loader may not be able to resolve.
    vl_model_name = VL_RERANK_MODEL

    if b == "vllm":
        from nemo_retriever.model.local.nemotron_rerank_vl_v2 import NemotronRerankVLV2VLLM

        return NemotronRerankVLV2VLLM(
            model_name=vl_model_name,
            device=device,
            hf_cache_dir=hf_cache_dir,
            gpu_memory_utilization=gpu_memory_utilization,
        )

    from nemo_retriever.model.local.nemotron_rerank_vl_v2_hf import NemotronRerankVLV2

    return NemotronRerankVLV2(
        model_name=vl_model_name,
        device=device,
        hf_cache_dir=hf_cache_dir,
    )