Using CLIP Embeddings#
A trained or pretrained CLIP model produces three core outputs — image embeddings,
text embeddings, and a logit_scale value — plus, optionally, a logit_bias value.
These outputs serve as building blocks for a wide range of downstream applications.
The examples below use onnxruntime to load a combined CLIP ONNX model
and run inference directly. For separate vision and text engines, create two
InferenceSession objects, one per encoder, and run them independently.
Zero-Shot Image Classification#
Compare an image embedding against a set of class-name text embeddings to classify the image without task-specific training data.
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

# Load the combined ONNX model and the matching tokenizer.
session = ort.InferenceSession("clip_model.onnx")
tokenizer = AutoTokenizer.from_pretrained("clip_tokenizer/")

# One text prompt per candidate class.
class_names = ["a cat", "a dog", "a bird"]
encoded = tokenizer(
    class_names,
    padding="max_length",
    truncation=True,
    max_length=77,
    return_tensors="np",
)

image = preprocess_image("photo.jpg")  # shape: (1, 3, H, W), float32

# attention_mask is required by the graph but its values are ignored
# internally; np.ones_like(encoded["input_ids"]) produces identical
# results. It will be removed as a graph input in a future release.
feeds = {
    "image": image,
    "input_ids": encoded["input_ids"].astype(np.int64),
    "attention_mask": encoded["attention_mask"].astype(np.int64),
}
image_emb, text_emb, logit_scale = session.run(None, feeds)[:3]

# Scale the image/text similarities and pick the best-matching class.
scores = float(logit_scale) * (image_emb @ text_emb.T)  # shape: (1, 3)
predicted_class = class_names[scores[0].argmax()]
print(f"Predicted: {predicted_class}")
Image-Text Retrieval#
Given a text query, rank a gallery of images by relevance.
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

session = ort.InferenceSession("clip_model.onnx")
tokenizer = AutoTokenizer.from_pretrained("clip_tokenizer/")

# Embed the gallery images batch by batch. Requesting a named output
# returns only that output — no logit_scale or logit_bias.
chunks = []
for batch in image_batches:  # each batch: (B, 3, H, W)
    chunks.append(session.run(["image_embedding"], {"image": batch})[0])  # (B, D)
gallery_embeddings = np.concatenate(chunks)  # (N, D)

# Embed the text query.
query = "a dog running on the beach"
encoded = tokenizer(
    [query],
    padding="max_length",
    truncation=True,
    max_length=77,
    return_tensors="np",
)
# attention_mask is required by the graph but its values are ignored
# internally; it will be removed as a graph input in a future release.
text_emb = session.run(
    ["text_embedding"],
    {
        "input_ids": encoded["input_ids"].astype(np.int64),
        "attention_mask": encoded["attention_mask"].astype(np.int64),
    },
)[0]  # (1, D)

# Rank the gallery by cosine similarity and report the five best matches.
scores = (text_emb @ gallery_embeddings.T)[0]  # (N,)
top5 = scores.argsort()[::-1][:5]
print("Top 5 matches:", [image_paths[i] for i in top5])
Content Filtering#
Check whether an image matches any of a set of descriptor texts — for example, to flag images that contain specific content categories.
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

session = ort.InferenceSession("clip_model.onnx")
tokenizer = AutoTokenizer.from_pretrained("clip_tokenizer/")

# Text descriptions of the content categories to screen for.
descriptors = [
    "contains a vehicle",
    "outdoor scene",
    "contains text or signage",
]
encoded = tokenizer(
    descriptors,
    padding="max_length",
    truncation=True,
    max_length=77,
    return_tensors="np",
)
image = preprocess_image("image.jpg")  # (1, 3, H, W)

# attention_mask is required by the graph but its values are ignored
# internally; it will be removed as a graph input in a future release.
outputs = session.run(None, {
    "image": image,
    "input_ids": encoded["input_ids"].astype(np.int64),
    "attention_mask": encoded["attention_mask"].astype(np.int64),
})
image_emb, text_emb, logit_scale, logit_bias = outputs[:4]

# With a SigLIP-style bias, a raw score above 0 corresponds to
# sigmoid(score) > 0.5, which we treat as a match.
scores = float(logit_scale) * (image_emb @ text_emb.T) + float(logit_bias)
matches = scores[0] > 0.0
for desc, matched in zip(descriptors, matches):
    print(f"{'[MATCH]' if matched else '[ ]'} {desc}")
Direct ONNX Runtime Inference#
The following snippet shows the minimal input and output structure for a combined CLIP ONNX model. Use this as a reference when integrating the model into a custom inference pipeline.
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

session = ort.InferenceSession("clip_model.onnx")
tokenizer = AutoTokenizer.from_pretrained("clip_tokenizer/")

encoded = tokenizer(
    ["a photo of a cat"],
    padding="max_length",
    truncation=True,
    max_length=77,
    return_tensors="np",
)
image = preprocess_image("photo.jpg")  # (1, 3, H, W), float32

# attention_mask is required by the graph but its values are ignored
# internally; np.ones_like(encoded["input_ids"]) produces identical
# results. It will be removed as a graph input in a future release.
feeds = {
    "image": image,
    "input_ids": encoded["input_ids"].astype(np.int64),
    "attention_mask": encoded["attention_mask"].astype(np.int64),
}
image_emb, text_emb, logit_scale, logit_bias = session.run(None, feeds)
Direct TensorRT Engine Inference#
The following snippet shows how to run a combined CLIP TRT engine directly using the TensorRT Python API (TRT 8.5 and later). For TAO Deploy-managed inference, refer to Running Inference with a TensorRT Engine instead.
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # side effect: creates and activates a CUDA context
from transformers import AutoTokenizer

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Load the serialized TRT engine
with open("clip_model.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

# Prepare inputs
tokenizer = AutoTokenizer.from_pretrained("clip_tokenizer/")
tokens = tokenizer(
    ["a photo of a cat"],
    padding="max_length", truncation=True,
    max_length=77, return_tensors="np"
)
image = preprocess_image("photo.jpg")  # (1, 3, H, W), float32
input_ids = tokens["input_ids"].astype(np.int64)  # (1, 77)
attention_mask = tokens["attention_mask"].astype(np.int64)  # (1, 77) — deprecated, values ignored

# Set dynamic input shapes for this batch. This must happen before the
# allocation loop below so context.get_tensor_shape returns concrete
# (non-dynamic) shapes for every IO tensor.
context.set_input_shape("image", image.shape)
context.set_input_shape("input_ids", input_ids.shape)
context.set_input_shape("attention_mask", attention_mask.shape)

# Allocate device buffers for all IO tensors (inputs and outputs alike)
# and a host-side array for each output to receive results.
stream = cuda.Stream()
device_buffers = {}
host_outputs = {}
for idx in range(engine.num_io_tensors):
    name = engine.get_tensor_name(idx)
    shape = tuple(context.get_tensor_shape(name))
    dtype = trt.nptype(engine.get_tensor_dtype(name))
    buf = cuda.mem_alloc(np.empty(shape, dtype=dtype).nbytes)
    # Bind the device buffer to the tensor for execute_async_v3.
    context.set_tensor_address(name, int(buf))
    device_buffers[name] = buf
    if engine.get_tensor_mode(name) == trt.TensorIOMode.OUTPUT:
        host_outputs[name] = np.empty(shape, dtype=dtype)

# Copy inputs to device on the same stream used for execution.
for name, arr in [
    ("image", image),
    ("input_ids", input_ids),
    ("attention_mask", attention_mask),
]:
    cuda.memcpy_htod_async(device_buffers[name], np.ascontiguousarray(arr), stream)

# Run inference and copy outputs to host. The synchronize ensures the
# kernel has finished before the blocking device-to-host copies run.
context.execute_async_v3(stream.handle)
stream.synchronize()
for name, arr in host_outputs.items():
    cuda.memcpy_dtoh(arr, device_buffers[name])

image_emb = host_outputs["image_embedding"]  # (1, D)
text_emb = host_outputs["text_embedding"]  # (1, D)
logit_scale = float(host_outputs["logit_scale"])
logit_bias = float(host_outputs["logit_bias"])

# Compute match probability (SigLIP style)
import scipy.special
score = logit_scale * float(image_emb @ text_emb.T) + logit_bias
probability = scipy.special.expit(score)  # sigmoid(score)
print(f"Match probability: {probability:.3f}")
Embedding Index for Similarity Search#
Load embeddings produced by Running Inference and build a lightweight cosine similarity index for fast nearest-neighbor search.
import h5py
import numpy as np

# Load the embeddings produced by TAO inference.
with h5py.File("image_embeddings.h5", "r") as f:
    embeddings = f["embeddings"][:]  # (N, D) float32, already L2-normalized
    image_paths = [p.decode() for p in f["image_paths"][:]]

def search(query_embedding: np.ndarray, top_k: int = 10):
    """Return the top-k most similar images to the query embedding."""
    # Embeddings are L2-normalized, so a dot product is cosine similarity.
    scores = embeddings @ query_embedding
    ranked = scores.argsort()[::-1][:top_k]
    return [(image_paths[i], float(scores[i])) for i in ranked]

# Example: search with a text query embedding
with h5py.File("text_embeddings.h5", "r") as f:
    query_emb = f["embeddings"][0]  # first text prompt
results = search(query_emb, top_k=5)
for path, score in results:
    print(f"{score:.4f} {path}")