Using CLIP Embeddings#

A trained or pretrained CLIP model produces three outputs — image embeddings, text embeddings, and a logit_scale value — plus an optional fourth output, logit_bias, for SigLIP-style models. These outputs serve as building blocks for a wide range of downstream applications.

The examples below use onnxruntime to load a combined CLIP ONNX model and run inference directly. For separate vision and text engines, create two InferenceSession objects, one per encoder, and run them independently.

Zero-Shot Image Classification#

Compare an image embedding against a set of class-name text embeddings to classify the image without task-specific training data.

import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

session = ort.InferenceSession("clip_model.onnx")
tokenizer = AutoTokenizer.from_pretrained("clip_tokenizer/")

class_names = ["a cat", "a dog", "a bird"]
tokens = tokenizer(
    class_names, padding="max_length", truncation=True,
    max_length=77, return_tensors="np"
)

image = preprocess_image("photo.jpg")  # shape: (1, 3, H, W), float32

outputs = session.run(None, {
    "image": image,
    "input_ids": tokens["input_ids"].astype(np.int64),
    # attention_mask is required by the graph but its values are ignored internally.
    # np.ones_like(tokens["input_ids"]) produces identical results.
    # Note: attention_mask will be removed as a graph input in a future release.
    "attention_mask": tokens["attention_mask"].astype(np.int64),
})
image_emb, text_emb, logit_scale = outputs[0], outputs[1], outputs[2]

# Compute similarity scores.
# .item() extracts the Python scalar whether logit_scale is a 0-d or a
# size-1 array; float() on an ndim > 0 array is deprecated (NumPy 1.25+).
scores = logit_scale.item() * (image_emb @ text_emb.T)  # shape: (1, 3)
predicted_class = class_names[scores[0].argmax()]
print(f"Predicted: {predicted_class}")

Image-Text Retrieval#

Given a text query, rank a gallery of images by relevance.

import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

session = ort.InferenceSession("clip_model.onnx")
tokenizer = AutoTokenizer.from_pretrained("clip_tokenizer/")

# Embed the gallery one batch at a time.
# Requesting a named output returns only that output — no logit_scale or logit_bias.
batch_embs = [
    session.run(["image_embedding"], {"image": batch})[0]  # (B, D)
    for batch in image_batches  # each batch: (B, 3, H, W)
]
gallery_embeddings = np.concatenate(batch_embs)  # (N, D)

# Embed the text query
query = "a dog running on the beach"
tokens = tokenizer(
    [query], padding="max_length", truncation=True,
    max_length=77, return_tensors="np"
)
text_emb = session.run(["text_embedding"], {
    "input_ids": tokens["input_ids"].astype(np.int64),
    # attention_mask is required by the graph but its values are ignored internally.
    # Note: attention_mask will be removed as a graph input in a future release.
    "attention_mask": tokens["attention_mask"].astype(np.int64),
})[0]  # (1, D)

# Rank the gallery by dot-product similarity (presumably the embeddings are
# L2-normalized, making this cosine similarity — verify against the model).
similarities = (text_emb @ gallery_embeddings.T)[0]  # (N,)
top5 = np.argsort(similarities)[::-1][:5]
print("Top 5 matches:", [image_paths[i] for i in top5])

Content Filtering#

Check whether an image matches any of a set of descriptor texts — for example, to flag images that contain specific content categories.

import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

session = ort.InferenceSession("clip_model.onnx")
tokenizer = AutoTokenizer.from_pretrained("clip_tokenizer/")

descriptors = [
    "contains a vehicle",
    "outdoor scene",
    "contains text or signage",
]
tokens = tokenizer(
    descriptors, padding="max_length", truncation=True,
    max_length=77, return_tensors="np"
)

image = preprocess_image("image.jpg")  # (1, 3, H, W)

outputs = session.run(None, {
    "image": image,
    "input_ids": tokens["input_ids"].astype(np.int64),
    # attention_mask is required by the graph but its values are ignored internally.
    # Note: attention_mask will be removed as a graph input in a future release.
    "attention_mask": tokens["attention_mask"].astype(np.int64),
})
image_emb, text_emb, logit_scale, logit_bias = (
    outputs[0], outputs[1], outputs[2], outputs[3]
)

# .item() extracts the Python scalar whether the output is a 0-d or a
# size-1 array; float() on an ndim > 0 array is deprecated (NumPy 1.25+).
scores = logit_scale.item() * (image_emb @ text_emb.T) + logit_bias.item()
matches = scores[0] > 0.0  # sigmoid(score) > 0.5 with SigLIP-style bias
for desc, matched in zip(descriptors, matches):
    print(f"{'[MATCH]' if matched else '[    ]'} {desc}")

Direct ONNX Runtime Inference#

The following snippet shows the minimal input and output structure for a combined CLIP ONNX model. Use this as a reference when integrating the model into a custom inference pipeline.

import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

session = ort.InferenceSession("clip_model.onnx")
tokenizer = AutoTokenizer.from_pretrained("clip_tokenizer/")

tokens = tokenizer(
    ["a photo of a cat"],
    padding="max_length", truncation=True,
    max_length=77, return_tensors="np"
)
image = preprocess_image("photo.jpg")  # (1, 3, H, W), float32

# attention_mask is required by the graph but its values are ignored internally.
# np.ones_like(tokens["input_ids"]) produces identical results.
# Note: attention_mask will be removed as a graph input in a future release.
feeds = {
    "image": image,
    "input_ids": tokens["input_ids"].astype(np.int64),
    "attention_mask": tokens["attention_mask"].astype(np.int64),
}
image_emb, text_emb, logit_scale, logit_bias = session.run(None, feeds)

Direct TensorRT Engine Inference#

The following snippet shows how to run a combined CLIP TRT engine directly using the TensorRT Python API (TRT 8.5 and later). For TAO Deploy-managed inference, refer to Running Inference with a TensorRT Engine instead.

import numpy as np
import scipy.special
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
from transformers import AutoTokenizer

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Load the serialized TRT engine
with open("clip_model.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

context = engine.create_execution_context()

# Prepare inputs
tokenizer = AutoTokenizer.from_pretrained("clip_tokenizer/")
tokens = tokenizer(
    ["a photo of a cat"],
    padding="max_length", truncation=True,
    max_length=77, return_tensors="np"
)
image          = preprocess_image("photo.jpg")                    # (1, 3, H, W), float32
# NOTE(review): the raw host-to-device copies below assume the engine's
# input_ids/attention_mask tensors are INT64 — confirm with
# engine.get_tensor_dtype before relying on this cast.
input_ids      = tokens["input_ids"].astype(np.int64)             # (1, 77)
attention_mask = tokens["attention_mask"].astype(np.int64)        # (1, 77) — deprecated, values ignored

# Set dynamic input shapes for this batch so output shapes resolve below.
context.set_input_shape("image",          image.shape)
context.set_input_shape("input_ids",      input_ids.shape)
context.set_input_shape("attention_mask", attention_mask.shape)

# Allocate device buffers for all IO tensors and bind their addresses.
stream         = cuda.Stream()
device_buffers = {}
host_outputs   = {}

for idx in range(engine.num_io_tensors):
    name  = engine.get_tensor_name(idx)
    # Query the context (not the engine) so dynamic dims are concrete.
    shape = tuple(context.get_tensor_shape(name))
    dtype = trt.nptype(engine.get_tensor_dtype(name))
    # Compute the byte size directly instead of allocating a throwaway
    # host array just to read its .nbytes.
    nbytes = int(np.prod(shape)) * np.dtype(dtype).itemsize
    buf = cuda.mem_alloc(nbytes)
    context.set_tensor_address(name, int(buf))
    device_buffers[name] = buf
    if engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
        host_outputs[name] = np.empty(shape, dtype=dtype)

# Copy inputs to device
for name, arr in [
    ("image",          image),
    ("input_ids",      input_ids),
    ("attention_mask", attention_mask),
]:
    cuda.memcpy_htod_async(device_buffers[name], np.ascontiguousarray(arr), stream)

# Run inference, then copy outputs back to host once the stream has drained.
context.execute_async_v3(stream.handle)
stream.synchronize()

for name, arr in host_outputs.items():
    cuda.memcpy_dtoh(arr, device_buffers[name])

image_emb = host_outputs["image_embedding"]  # (1, D)
text_emb  = host_outputs["text_embedding"]   # (1, D)
# .item() extracts the Python scalar whether the output is a 0-d or a
# size-1 array; float() on an ndim > 0 array is deprecated (NumPy 1.25+).
logit_scale = host_outputs["logit_scale"].item()
logit_bias  = host_outputs["logit_bias"].item()

# Compute match probability (SigLIP style)
score       = logit_scale * (image_emb @ text_emb.T).item() + logit_bias
probability = scipy.special.expit(score)  # sigmoid(score)
print(f"Match probability: {probability:.3f}")