Using the TensorRT-RTX Runtime API

In this section, we show how to build an engine and run inference programmatically in TensorRT-RTX, using either the C++ or the Python API. For clarity, the examples use a minimal convolutional network consisting of a single 3x3 convolution rather than a modern full-scale architecture, and they omit error handling, memory management via smart pointers, and similar production concerns. The C++ example appears first, followed by its Python equivalent.

#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <fstream>
#include <iostream>
#include <vector>

using namespace nvinfer1;

// Define a simple logger class
class Logger : public ILogger {
    void log(Severity severity, const char* msg) noexcept override {
        if (severity <= Severity::kWARNING) std::cout << msg << std::endl;
    }
} gLogger;

// Build a CUDA engine and save it to a file
void buildEngine(const char* fileName){
    // Create the builder and an explicit-batch network
    auto builder = createInferBuilder(gLogger);
    auto network = builder->createNetworkV2(0U);
    // Input tensor of shape {1, 1, 3, 3}
    auto input = network->addInput("input", DataType::kFLOAT, Dims4{1,1,3,3});

    // Prepare convolution weights for a 3x3 kernel, 1 input, 1 output channel
    std::vector<float> weightValues(9, 1.f);
    std::vector<float> biasValues(1, 0.f);
    Weights W{DataType::kFLOAT, weightValues.data(), 9};
    Weights B{DataType::kFLOAT, biasValues.data(), 1};

    // Add convolution layer
    auto conv = network->addConvolutionNd(*input, 1, DimsHW{3,3}, W, B);
    conv->setStrideNd(DimsHW{1,1});

    // Mark network output
    auto output = conv->getOutput(0);
    output->setName("output");
    network->markOutput(*output);

    // Build the engine and serialize it in one step
    auto config = builder->createBuilderConfig();
    // Optionally cap the builder's workspace memory pool (1 MiB here)
    config->setMemoryPoolLimit(MemoryPoolType::kWORKSPACE, 1 << 20);
    auto serializedEngine = builder->buildSerializedNetwork(*network, *config);

    // Save the serialized engine to a file
    std::ofstream out{fileName, std::ios::binary};
    out.write(static_cast<const char*>(serializedEngine->data()), serializedEngine->size());
    out.close();

    // Clean up
    delete serializedEngine;
    delete network;
    delete config;
    delete builder;
}

// Load the engine and perform inference
void performInference(char const* fileName, const std::vector<float>& inputData){
    auto runtime = createInferRuntime(gLogger);
    // Deserialize the engine from file
    std::ifstream is{fileName, std::ios::binary};
    is.seekg(0, std::ios::end);
    size_t nbBytes = is.tellg();
    is.seekg(0, std::ios::beg);
    std::vector<char> data(nbBytes);
    is.read(data.data(), nbBytes);
    is.close();
    ICudaEngine* engine = runtime->deserializeCudaEngine(data.data(), nbBytes);
    // Create an execution context for inference
    auto execContext = engine->createExecutionContext();
    // Allocate device buffers for the input and output bindings
    void* bindings[2];
    size_t inputSize = inputData.size();
    size_t outputSize = 1;
    size_t inputNbBytes = inputSize * sizeof(float);
    size_t outputNbBytes = outputSize * sizeof(float);
    cudaMalloc(&bindings[0], inputNbBytes);
    cudaMalloc(&bindings[1], outputNbBytes);
    // Populate input binding on device
    cudaMemcpy(bindings[0], inputData.data(), inputNbBytes, cudaMemcpyHostToDevice);
    // Here we show synchronous inference for simplicity; see the enqueueV3()
    // sketch after this example for asynchronous execution
    execContext->executeV2(bindings);
    // Copy result into host memory space
    float result;
    cudaMemcpy(&result, bindings[1], outputNbBytes, cudaMemcpyDeviceToHost);
    std::cout << "Result = " << result << std::endl;
    // Clean up
    cudaFree(bindings[0]);
    cudaFree(bindings[1]);
    delete execContext;
    delete engine;
    delete runtime;
}
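
The synchronous executeV2() call above is the simplest path. For asynchronous execution, the comment in performInference() points to enqueueV3(), which addresses I/O tensors by name rather than through a bindings array. The following is a minimal sketch of that pattern; the function name performInferenceAsync and its parameters are illustrative only, and the tensor names and device buffers are assumed to be the same as in the example above.

// Minimal sketch of asynchronous inference with enqueueV3()
// (illustrative helper; dInput/dOutput are device buffers allocated as above)
void performInferenceAsync(ICudaEngine* engine, void* dInput, void* dOutput){
    auto execContext = engine->createExecutionContext();
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    // enqueueV3() looks up I/O tensors by name instead of using a bindings array
    execContext->setTensorAddress("input", dInput);
    execContext->setTensorAddress("output", dOutput);
    // The call returns as soon as the work is enqueued on the stream
    execContext->enqueueV3(stream);
    // Wait for the GPU to finish before reading the output buffer
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    delete execContext;
}

Combined with cudaMemcpyAsync() on the same stream, this pattern lets host-to-device copies, inference, and device-to-host copies overlap with other work instead of blocking the calling thread.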
The equivalent workflow using the Python API, with pycuda handling device memory, looks as follows:

import tensorrt_rtx as trt_rtx
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit

GLOBAL_LOGGER = trt_rtx.Logger(trt_rtx.Logger.WARNING)

def buildEngine(fileName):
    builder = trt_rtx.Builder(GLOBAL_LOGGER)
    network = builder.create_network(0)
    input_t = network.add_input(name="input", dtype=trt_rtx.float32, shape=(1,1,3,3))
    # 3x3 all-ones kernel, 1 input and 1 output channel, zero bias
    w = np.full((1,1,3,3), 1., dtype=np.float32)
    b = np.zeros(1, dtype=np.float32)
    conv = network.add_convolution_nd(
            input=input_t, num_output_maps=1, kernel_shape=(3,3), kernel=w, bias=b)
    conv.stride_nd = (1,1)
    conv.get_output(0).name = "output"
    network.mark_output(conv.get_output(0))
    config = builder.create_builder_config()
    # Optionally cap the builder's workspace memory pool (1 MiB here)
    config.set_memory_pool_limit(trt_rtx.MemoryPoolType.WORKSPACE, 1 << 20)
    hostMem = builder.build_serialized_network(network, config)
    with open(fileName, "wb") as fileOut:
        fileOut.write(hostMem)

def performInference(fileName, inputData):
    runtime = trt_rtx.Runtime(GLOBAL_LOGGER)
    with open(fileName, "rb") as f:
        fileBytes = f.read()
    engine = runtime.deserialize_cuda_engine(fileBytes)
    execContext = engine.create_execution_context()
    # Allocate device buffers and copy the input to the device
    dInput = cuda.mem_alloc(inputData.nbytes)
    cuda.memcpy_htod(dInput, inputData)
    outputData = np.empty((1,), dtype=np.float32)
    dOutput = cuda.mem_alloc(outputData.nbytes)
    bindings = [int(dInput), int(dOutput)]
    execContext.execute_v2(bindings)
    cuda.memcpy_dtoh(outputData, dOutput)
    dInput.free()
    dOutput.free()
    return outputData
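
In both the C++ and the Python version, the 3x3 all-ones kernel with zero bias simply sums the nine input elements, so an input of nine 1.0 values should yield a result of 9.0. This makes a convenient end-to-end sanity check after building and deserializing the engine.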