Using the Native Runtime API#
This tutorial demonstrates how to build a TensorRT-RTX engine and run inference programmatically using the C++ or Python API. The example uses a simple convolutional neural network to illustrate the core workflow.
Note
The native API is intended for advanced use cases where you need fine-grained control over network construction. Most users should use the ONNX-based workflow instead.
The workflow has two phases:
Build — Define a network, configure the builder, and serialize the engine to a file
Infer — Deserialize the engine, allocate GPU memory, and execute inference
Build the Engine#
Define a simple convolutional network and serialize it to an engine file.
1#include "NvInfer.h"
2#include <cuda_runtime.h>
3#include <fstream>
4#include <iostream>
5#include <memory>
6#include <vector>
7
8using namespace nvinfer1;
9
10// Minimal logger implementation
11class Logger : public ILogger {
12 void log(Severity severity, const char* msg) noexcept override {
13 if (severity <= Severity::kWARNING) std::cout << msg << std::endl;
14 }
15} gLogger;
16
17void buildEngine(const char* fileName) {
18 auto builder = createInferBuilder(gLogger);
19 auto network = builder->createNetworkV2(0U);
20
21 // Input tensor of shape {1, 1, 3, 3}
22 auto input = network->addInput("input", DataType::kFLOAT, Dims4{1, 1, 3, 3});
23
24 // Convolution weights: 3x3 kernel, 1 input channel, 1 output channel
25 std::vector<float> weightValues(9, 1.f);
26 std::vector<float> biasValues(1, 0.f);
27 Weights W{DataType::kFLOAT, weightValues.data(), 9};
28 Weights B{DataType::kFLOAT, biasValues.data(), 1};
29
30 auto conv = network->addConvolution(*input, 1, DimsHW{3, 3}, W, B);
31 conv->setStride(DimsHW{1, 1});
32
33 // Mark network output
34 auto output = conv->getOutput(0);
35 output->setName("output");
36 network->markOutput(*output);
37
38 // Build and serialize
39 builder->setMaxBatchSize(1);
40 auto config = builder->createBuilderConfig();
41 config->setMaxWorkspaceSize(1 << 20);
42 auto engine = builder->buildSerializedNetwork(*network, *config);
43
44 // Save to file
45 std::ofstream out(fileName, std::ios::binary);
46 out.write(static_cast<const char*>(engine->data()), engine->size());
47 out.close();
48
49 // Clean up
50 // Note: While this example uses manual cleanup for clarity,
51 // production code should use smart pointers (e.g., std::unique_ptr
52 // with custom deleters) to ensure proper resource management.
53 delete engine;
54 delete network;
55 delete config;
56 delete builder;
57}
1import tensorrt_rtx as trt_rtx
2import numpy as np
3
4GLOBAL_LOGGER = trt_rtx.Logger(trt_rtx.Logger.WARNING)
5
def build_engine(file_name):
    """Build a single-convolution network and serialize the engine to file_name."""
    builder = trt_rtx.Builder(GLOBAL_LOGGER)
    # 0 = default network creation flags (networks are explicit-batch)
    network = builder.create_network(0)

    # Input tensor of shape (1, 1, 3, 3) (NCHW)
    input_t = network.add_input(
        name="input", dtype=trt_rtx.float32, shape=(1, 1, 3, 3))

    # Convolution: 3x3 kernel, 1 output channel
    w = np.full((1, 1, 3, 3), 1.0, dtype=np.float32)
    b = np.zeros(1, dtype=np.float32)
    conv = network.add_convolution_nd(
        input=input_t, num_output_maps=1,
        kernel_shape=(3, 3), kernel=w, bias=b)
    conv.stride_nd = (1, 1)

    # Mark output
    conv.get_output(0).name = "output"
    network.mark_output(conv.get_output(0))

    # Build and serialize. Builder.max_batch_size no longer exists (all
    # networks are explicit-batch), and the workspace limit is configured
    # via the memory-pool API rather than config.max_workspace_size.
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt_rtx.MemoryPoolType.WORKSPACE, 1 << 20)
    host_mem = builder.build_serialized_network(network, config)

    # IHostMemory supports the buffer protocol, so it can be written directly.
    with open(file_name, "wb") as f:
        f.write(host_mem)
Run Inference#
Load the serialized engine, allocate device memory, run inference, and read back the result.
1void performInference(const char* fileName, const std::vector<float>& inputData) {
2 // Note: Error handling omitted for brevity. Production code should
3 // check return values and handle CUDA errors appropriately.
4 auto runtime = createInferRuntime(gLogger);
5
6 // Deserialize engine from file
7 std::ifstream is(fileName, std::ios::binary);
8 is.seekg(0, is.end);
9 size_t nbBytes = is.tellg();
10 is.seekg(0, is.beg);
11 std::vector<uint8_t> data(nbBytes);
12 is.read(reinterpret_cast<char*>(data.data()), nbBytes);
13 is.close();
14
15 auto engine = runtime->deserializeCudaEngine(data.data(), nbBytes);
16 auto execContext = engine->createExecutionContext();
17
18 // Allocate device memory
19 void* bindings[2];
20 size_t inputNbBytes = inputData.size() * sizeof(float);
21 size_t outputNbBytes = 1 * sizeof(float);
22 cudaMalloc(&bindings[0], inputNbBytes);
23 cudaMalloc(&bindings[1], outputNbBytes);
24
25 // Copy input to device
26 cudaMemcpy(bindings[0], inputData.data(), inputNbBytes, cudaMemcpyHostToDevice);
27
28 // Run inference (synchronous; use enqueueV3() for async)
29 execContext->executeV2(bindings);
30
31 // Copy result back to host
32 float result;
33 cudaMemcpy(&result, bindings[1], outputNbBytes, cudaMemcpyDeviceToHost);
34 std::cout << "Result = " << result << std::endl;
35
36 // Clean up
37 // Note: Production code should use smart pointers and RAII patterns
38 // for automatic resource management. Error handling for CUDA calls
39 // should also be implemented.
40 cudaFree(bindings[0]);
41 cudaFree(bindings[1]);
42 delete execContext;
43 delete engine;
44 delete runtime;
45}
1import pycuda.driver as cuda
2import pycuda.autoprimaryctx # Use autoprimaryctx, not autoinit
3
def perform_inference(file_name, input_data):
    """Deserialize the engine, run inference on input_data, and return the output array."""
    runtime = trt_rtx.Runtime(GLOBAL_LOGGER)

    with open(file_name, "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())

    exec_context = engine.create_execution_context()

    # Allocate device memory
    d_input = cuda.mem_alloc(input_data.nbytes)
    output_data = np.empty((1,), dtype=np.float32)
    d_output = cuda.mem_alloc(output_data.nbytes)

    # Copy input to device and run inference
    cuda.memcpy_htod(d_input, input_data)
    exec_context.execute_v2([int(d_input), int(d_output)])

    # Copy result back to host
    cuda.memcpy_dtoh(output_data, d_output)

    # Clean up: pycuda.driver has no module-level mem_free(); device
    # buffers returned by mem_alloc() are DeviceAllocation objects and
    # are released via their free() method (or automatically on GC).
    d_input.free()
    d_output.free()

    return output_data
Next Steps#
C++ API Documentation — Full C++ API reference
Python API Documentation — Full Python API reference
Working with Dynamic Shapes — Handle variable input dimensions
Working with RTX CUDA Graphs — Reduce launch overhead for repeated inference