Using the Native Runtime API#

This tutorial demonstrates how to build a TensorRT-RTX engine and run inference programmatically using the C++ or Python API. The example uses a simple convolutional neural network to illustrate the core workflow.

Note

The native API is intended for advanced use cases where you need fine-grained control over network construction. Most users should use the ONNX-based workflow instead.

The workflow has two phases:

  1. Build — Define a network, configure the builder, and serialize the engine to a file

  2. Infer — Deserialize the engine, allocate GPU memory, and execute inference

Build the Engine#

Define a simple convolutional network and serialize it to an engine file.

 1#include "NvInfer.h"
 2#include <cuda_runtime.h>
 3#include <fstream>
 4#include <iostream>
 5#include <memory>
 6#include <vector>
 7
 8using namespace nvinfer1;
 9
10// Minimal logger implementation
11class Logger : public ILogger {
12    void log(Severity severity, const char* msg) noexcept override {
13        if (severity <= Severity::kWARNING) std::cout << msg << std::endl;
14    }
15} gLogger;
16
17void buildEngine(const char* fileName) {
18    auto builder = createInferBuilder(gLogger);
19    auto network = builder->createNetworkV2(0U);
20
21    // Input tensor of shape {1, 1, 3, 3}
22    auto input = network->addInput("input", DataType::kFLOAT, Dims4{1, 1, 3, 3});
23
24    // Convolution weights: 3x3 kernel, 1 input channel, 1 output channel
25    std::vector<float> weightValues(9, 1.f);
26    std::vector<float> biasValues(1, 0.f);
27    Weights W{DataType::kFLOAT, weightValues.data(), 9};
28    Weights B{DataType::kFLOAT, biasValues.data(), 1};
29
30    auto conv = network->addConvolution(*input, 1, DimsHW{3, 3}, W, B);
31    conv->setStride(DimsHW{1, 1});
32
33    // Mark network output
34    auto output = conv->getOutput(0);
35    output->setName("output");
36    network->markOutput(*output);
37
38    // Build and serialize
39    builder->setMaxBatchSize(1);
40    auto config = builder->createBuilderConfig();
41    config->setMaxWorkspaceSize(1 << 20);
42    auto engine = builder->buildSerializedNetwork(*network, *config);
43
44    // Save to file
45    std::ofstream out(fileName, std::ios::binary);
46    out.write(static_cast<const char*>(engine->data()), engine->size());
47    out.close();
48
49    // Clean up
50    // Note: While this example uses manual cleanup for clarity,
51    // production code should use smart pointers (e.g., std::unique_ptr
52    // with custom deleters) to ensure proper resource management.
53    delete engine;
54    delete network;
55    delete config;
56    delete builder;
57}
 1import tensorrt_rtx as trt_rtx
 2import numpy as np
 3
# Module-level logger shared by build_engine() and perform_inference();
# the WARNING threshold suppresses informational/verbose messages.
GLOBAL_LOGGER = trt_rtx.Logger(trt_rtx.Logger.WARNING)
 5
 6def build_engine(file_name):
 7    builder = trt_rtx.Builder(GLOBAL_LOGGER)
 8    network = builder.create_network(0)
 9
10    # Input tensor of shape (1, 1, 3, 3)
11    input_t = network.add_input(
12        name="input", dtype=trt_rtx.float32, shape=(1, 1, 3, 3))
13
14    # Convolution: 3x3 kernel, 1 output channel
15    w = np.full((1, 1, 3, 3), 1.0, dtype=np.float32)
16    b = np.zeros(1, dtype=np.float32)
17    conv = network.add_convolution_nd(
18        input=input_t, num_output_maps=1,
19        kernel_shape=(3, 3), kernel=w, bias=b)
20    conv.stride_nd = (1, 1)
21
22    # Mark output
23    conv.get_output(0).name = "output"
24    network.mark_output(conv.get_output(0))
25
26    # Build and serialize
27    builder.max_batch_size = 1
28    config = builder.create_builder_config()
29    config.max_workspace_size = 1 << 20
30    host_mem = builder.build_serialized_network(network, config)
31
32    with open(file_name, "wb") as f:
33        f.write(host_mem)

Run Inference#

Load the serialized engine, allocate device memory, run inference, and read back the result.

 1void performInference(const char* fileName, const std::vector<float>& inputData) {
 2    // Note: Error handling omitted for brevity. Production code should
 3    // check return values and handle CUDA errors appropriately.
 4    auto runtime = createInferRuntime(gLogger);
 5
 6    // Deserialize engine from file
 7    std::ifstream is(fileName, std::ios::binary);
 8    is.seekg(0, is.end);
 9    size_t nbBytes = is.tellg();
10    is.seekg(0, is.beg);
11    std::vector<uint8_t> data(nbBytes);
12    is.read(reinterpret_cast<char*>(data.data()), nbBytes);
13    is.close();
14
15    auto engine = runtime->deserializeCudaEngine(data.data(), nbBytes);
16    auto execContext = engine->createExecutionContext();
17
18    // Allocate device memory
19    void* bindings[2];
20    size_t inputNbBytes = inputData.size() * sizeof(float);
21    size_t outputNbBytes = 1 * sizeof(float);
22    cudaMalloc(&bindings[0], inputNbBytes);
23    cudaMalloc(&bindings[1], outputNbBytes);
24
25    // Copy input to device
26    cudaMemcpy(bindings[0], inputData.data(), inputNbBytes, cudaMemcpyHostToDevice);
27
28    // Run inference (synchronous; use enqueueV3() for async)
29    execContext->executeV2(bindings);
30
31    // Copy result back to host
32    float result;
33    cudaMemcpy(&result, bindings[1], outputNbBytes, cudaMemcpyDeviceToHost);
34    std::cout << "Result = " << result << std::endl;
35
36    // Clean up
37    // Note: Production code should use smart pointers and RAII patterns
38    // for automatic resource management. Error handling for CUDA calls
39    // should also be implemented.
40    cudaFree(bindings[0]);
41    cudaFree(bindings[1]);
42    delete execContext;
43    delete engine;
44    delete runtime;
45}
 1import pycuda.driver as cuda
 2import pycuda.autoprimaryctx  # Use autoprimaryctx, not autoinit
 3
 4def perform_inference(file_name, input_data):
 5    runtime = trt_rtx.Runtime(GLOBAL_LOGGER)
 6
 7    with open(file_name, "rb") as f:
 8        engine = runtime.deserialize_cuda_engine(f.read())
 9
10    exec_context = engine.create_execution_context()
11
12    # Allocate device memory
13    d_input = cuda.mem_alloc(input_data.nbytes)
14    output_data = np.empty((1,), dtype=np.float32)
15    d_output = cuda.mem_alloc(output_data.nbytes)
16
17    # Copy input to device and run inference
18    cuda.memcpy_htod(d_input, input_data)
19    exec_context.execute_v2([int(d_input), int(d_output)])
20
21    # Copy result back to host
22    cuda.memcpy_dtoh(output_data, d_output)
23
24    # Clean up
25    cuda.mem_free(d_input)
26    cuda.mem_free(d_output)
27
28    return output_data

Next Steps#