Source code for nemo_retriever.vdb.adt_vdb

"""Abstract Vector Database (VDB) operator API.

Defines the `VDB` abstract base class — the small interface that custom
vector-database operators implement to plug into NeMo Retriever.

The interface separates ingestion from retrieval so the same ABC works for
both halves of the pipeline:

- `create_index` / `write_to_index` / `run` — index lifecycle and bulk
  ingestion of Nemo Retriever Library (NRL) record batches.
- `retrieval` — nearest-neighbor search over **precomputed query vectors**.
  Query strings are embedded upstream (see `nemo_retriever.Retriever`);
  the VDB only sees vectors.

Methods accept `**kwargs` so backend-specific options (e.g. LanceDB's
`where` predicate for metadata filtering, refinement factors,
hybrid-search flags) flow through without changing the ABC.

See `nemo_retriever/vdb/README.md` for the concrete `LanceDB` backend and
the `IngestVdbOperator` / `RetrieveVdbOperator` wrappers, including the
metadata-filtering section and its reference notebook.
"""

from abc import ABC, abstractmethod


[docs] class VDB(ABC): """Abstract base class for vector-database operators. Subclasses implement the four abstract methods below. The interface is intentionally small; backend-specific options (connection URIs, index tuning, search filters) are passed via `**kwargs`. The reference implementation is `LanceDB` (see `lancedb.py`). For an overview of how `IngestVdbOperator` and `RetrieveVdbOperator` consume this interface, see the package README. """ @abstractmethod def __init__(self, **kwargs): """Initialize the operator. Implementations parse backend-specific connection and index parameters from `kwargs` and set up any client handles. Heavy operations (creating indexes, loading data) belong in `create_index`, not here, so the operator stays cheap to construct in tests. Common kwargs vary by backend. For LanceDB, for example: `uri`, `table_name`, `vector_dim`, `overwrite`, `index_type`, `metric`, `num_partitions`, `num_sub_vectors`, `hybrid`, `on_bad_vectors`. The base class stores all kwargs as attributes on the instance as a convenience; subclasses may rely on that or override. """ self.__dict__.update(kwargs)
[docs] @abstractmethod def create_index(self, **kwargs): """Create the index(es) needed for ingestion and retrieval. Implementations create the table / index with the appropriate vector schema (dimension, distance metric, ANN parameters) and any auxiliary indexes (e.g. an FTS index for hybrid search). Common kwargs: - recreate (bool): drop and recreate even if the index exists. Return value is backend-specific. """ pass
[docs] @abstractmethod def write_to_index(self, records: list, **kwargs): """Write a batch of NRL record batches to the index. `records` is a list of record batches — each batch is a list of record dicts as produced by the NRL pipeline. Implementations transform each record into the table's row format (typically columns `vector`, `text`, `metadata`, `source`) and use the backend's bulk-write API. Sidecar metadata (when supplied via `meta_dataframe` / `meta_source_field` / `meta_fields` at operator construction) is merged into each record's `content_metadata` upstream of this method — implementations only see the merged result. Records missing required fields (vector, text) should be skipped rather than raised, matching the reference `LanceDB` backend's `on_bad_vectors` behavior. Common kwargs: - batch_size (int): documents per bulk request. """ pass
[docs] @abstractmethod def retrieval(self, queries: list, **kwargs): """Run nearest-neighbor search for **precomputed query vectors**. Despite the parameter name `queries` (kept for backward compatibility), this method receives a list of embedding vectors, one per query — *not* raw text. Query text is embedded upstream, typically inside `nemo_retriever.Retriever`, before this method is called. Implementations search the index, apply any post-filtering, and return a list of hit lists aligned with the input (one inner list per input vector). Stored vector columns should be stripped from hits to keep payloads small. Common kwargs: - top_k (int): neighbors per query. - where / _filter (str): a SQL predicate evaluated against table columns. NRL stores `content_metadata` (including sidecar fields) as a **compact JSON string** in the `metadata` column, so JSON filters typically use `LIKE` against a substring of the serialized JSON, e.g. `metadata LIKE '%"meta_a":"alpha"%'`. The `_filter` alias is accepted in addition to `where`. - refine_factor / nprobes / search_kwargs: ANN tuning passed through to the backend. See `nemo_retriever/vdb/README.md` and `examples/nemo_retriever_retriever_query_metadata_filter.ipynb` for the full filter cookbook (sidecar merge, server-side vs client-side filtering, escaping). Hybrid search with precomputed vectors is not implemented by the reference `LanceDB` backend; passing `hybrid=True` raises `NotImplementedError` on that path. """ pass
[docs] @abstractmethod def run(self, records): """Pipeline entry point: ensure the index exists, then ingest. Minimal implementation:: def run(self, records): self.create_index() self.write_to_index(records) Implementers may add metrics, retries, or commit hooks, but `run` should stay a thin orchestration layer so callers can reason about ingestion order. """ pass
[docs] def reindex(self, records: list, **kwargs): """Drop and rebuild the index, then re-ingest `records`. Optional hook for subclasses. Default implementation does nothing; a typical override is:: def reindex(self, records, **kwargs): self.create_index(recreate=True) self.write_to_index(records) """ pass