Using the Native Runtime API#
This tutorial demonstrates how to build a TensorRT-RTX engine and run inference programmatically using the C++ or Python API. The example uses a simple convolutional neural network to illustrate the core workflow.
Note
The native API is intended for advanced use cases where you need fine-grained control over network construction. Most users should use the ONNX-based workflow instead.
The workflow has two phases:
Build — Define a network, configure the builder, and serialize the engine to a file
Infer — Deserialize the engine, allocate GPU memory, and execute inference
Build the Engine#
Define a simple convolutional network and serialize it to an engine file.
1#include "NvInfer.h"
2#include <cuda_runtime.h>
3#include <fstream>
4#include <iostream>
5#include <memory>
6#include <vector>
7
8using namespace nvinfer1;
9
10// Minimal logger implementation
11class Logger : public ILogger {
12 void log(Severity severity, const char* msg) noexcept override {
13 if (severity <= Severity::kWARNING) std::cout << msg << std::endl;
14 }
15} gLogger;
16
17void buildEngine(const char* fileName) {
18 auto builder = createInferBuilder(gLogger);
19 auto network = builder->createNetworkV2(0U);
20
21 // Input tensor of shape {1, 1, 3, 3}
22 auto input = network->addInput("input", DataType::kFLOAT, Dims4{1, 1, 3, 3});
23
24 // Convolution weights: 3x3 kernel, 1 input channel, 1 output channel
25 std::vector<float> weightValues(9, 1.f);
26 std::vector<float> biasValues(1, 0.f);
27 Weights W{DataType::kFLOAT, weightValues.data(), 9};
28 Weights B{DataType::kFLOAT, biasValues.data(), 1};
29
30 auto conv = network->addConvolution(*input, 1, DimsHW{3, 3}, W, B);
31 conv->setStride(DimsHW{1, 1});
32
33 // Mark network output
34 auto output = conv->getOutput(0);
35 output->setName("output");
36 network->markOutput(*output);
37
38 // Build and serialize
39 builder->setMaxBatchSize(1);
40 auto config = builder->createBuilderConfig();
41 config->setMaxWorkspaceSize(1 << 20);
42 auto engine = builder->buildSerializedNetwork(*network, *config);
43
44 // Save to file
45 std::ofstream out(fileName, std::ios::binary);
46 out.write(static_cast<const char*>(engine->data()), engine->size());
47 out.close();
48
49 // Clean up
50 // Note: While this example uses manual cleanup for clarity,
51 // production code should use smart pointers (e.g., std::unique_ptr
52 // with custom deleters) to ensure proper resource management.
53 delete engine;
54 delete network;
55 delete config;
56 delete builder;
57}
1import tensorrt_rtx as trt_rtx
2import numpy as np
3
4GLOBAL_LOGGER = trt_rtx.Logger(trt_rtx.Logger.WARNING)
5
def build_engine(file_name):
    """Build a single-convolution network and serialize the engine to file_name."""
    builder = trt_rtx.Builder(GLOBAL_LOGGER)
    # 0 = default network creation flags (networks are explicit-batch)
    network = builder.create_network(0)

    # Input tensor of shape (1, 1, 3, 3) (NCHW)
    input_t = network.add_input(
        name="input", dtype=trt_rtx.float32, shape=(1, 1, 3, 3))

    # Convolution: 3x3 kernel, 1 output channel
    w = np.full((1, 1, 3, 3), 1.0, dtype=np.float32)
    b = np.zeros(1, dtype=np.float32)
    conv = network.add_convolution_nd(
        input=input_t, num_output_maps=1,
        kernel_shape=(3, 3), kernel=w, bias=b)
    conv.stride_nd = (1, 1)

    # Mark output
    conv.get_output(0).name = "output"
    network.mark_output(conv.get_output(0))

    # Build and serialize. Builder.max_batch_size no longer exists (all
    # networks are explicit-batch), and the workspace limit is configured
    # via the memory-pool API rather than config.max_workspace_size.
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt_rtx.MemoryPoolType.WORKSPACE, 1 << 20)
    host_mem = builder.build_serialized_network(network, config)

    # IHostMemory supports the buffer protocol, so it can be written directly.
    with open(file_name, "wb") as f:
        f.write(host_mem)
Run Inference#
Load the serialized engine, allocate device memory, run inference, and read back the result.
1void performInference(const char* fileName, const std::vector<float>& inputData) {
2 // Note: Error handling omitted for brevity. Production code should
3 // check return values and handle CUDA errors appropriately.
4 auto runtime = createInferRuntime(gLogger);
5
6 // Deserialize engine from file
7 std::ifstream is(fileName, std::ios::binary);
8 is.seekg(0, is.end);
9 size_t nbBytes = is.tellg();
10 is.seekg(0, is.beg);
11 std::vector<uint8_t> data(nbBytes);
12 is.read(reinterpret_cast<char*>(data.data()), nbBytes);
13 is.close();
14
15 auto engine = runtime->deserializeCudaEngine(data.data(), nbBytes);
16 auto execContext = engine->createExecutionContext();
17
18 // Allocate device memory
19 void* bindings[2];
20 size_t inputNbBytes = inputData.size() * sizeof(float);
21 size_t outputNbBytes = 1 * sizeof(float);
22 cudaMalloc(&bindings[0], inputNbBytes);
23 cudaMalloc(&bindings[1], outputNbBytes);
24
25 // Copy input to device
26 cudaMemcpy(bindings[0], inputData.data(), inputNbBytes, cudaMemcpyHostToDevice);
27
28 // Run inference (synchronous; use enqueueV3() for async)
29 execContext->executeV2(bindings);
30
31 // Copy result back to host
32 float result;
33 cudaMemcpy(&result, bindings[1], outputNbBytes, cudaMemcpyDeviceToHost);
34 std::cout << "Result = " << result << std::endl;
35
36 // Clean up
37 // Note: Production code should use smart pointers and RAII patterns
38 // for automatic resource management. Error handling for CUDA calls
39 // should also be implemented.
40 cudaFree(bindings[0]);
41 cudaFree(bindings[1]);
42 delete execContext;
43 delete engine;
44 delete runtime;
45}
1import pycuda.driver as cuda
2import pycuda.autoprimaryctx # Use autoprimaryctx, not autoinit
3
def perform_inference(file_name, input_data):
    """Deserialize the engine, run inference on input_data, and return the output array."""
    runtime = trt_rtx.Runtime(GLOBAL_LOGGER)

    with open(file_name, "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())

    exec_context = engine.create_execution_context()

    # Allocate device memory
    d_input = cuda.mem_alloc(input_data.nbytes)
    output_data = np.empty((1,), dtype=np.float32)
    d_output = cuda.mem_alloc(output_data.nbytes)

    # Copy input to device and run inference
    cuda.memcpy_htod(d_input, input_data)
    exec_context.execute_v2([int(d_input), int(d_output)])

    # Copy result back to host
    cuda.memcpy_dtoh(output_data, d_output)

    # Clean up: pycuda.driver has no module-level mem_free(); device
    # buffers returned by mem_alloc() are DeviceAllocation objects and
    # are released via their free() method (or automatically on GC).
    d_input.free()
    d_output.free()

    return output_data
Next Steps#
C++ API Documentation — Full C++ API reference
Python API Documentation — Full Python API reference
Working with Dynamic Shapes — Handle variable input dimensions
Working with RTX CUDA Graphs — Reduce launch overhead for repeated inference