Note: SW Release Applicability: This tutorial is applicable to modules in both NVIDIA DriveWorks and NVIDIA DRIVE Software releases.

This code snippet demonstrates how the DNN module with safe DLA enabled is typically used. Note that error handling is left out for clarity.

Initialize network from file.

In order to be able to use safe DLA, the model must be generated using the --useSafeDLA option via the TensorRT Optimizer Tool. The processor type, while initializing DNN, must be either DW_PROCESSOR_TYPE_DLA_0 or DW_PROCESSOR_TYPE_DLA_1, depending on which DLA engine the inference should take place on.

// Create the DNN from a serialized model file. The model file must have been
// generated beforehand with the tensorRT_optimization tool.
dwDNNHandle_t dnn{nullptr};
dwDNN_initializeTensorRTFromFile(&dnn,
                                 "network.dla",
                                 nullptr,
                                 DW_PROCESSOR_TYPE_DLA_0, // run inference on DLA engine 0
                                 contextHandle);

Check that the loaded network has the expected number of inputs and outputs.

// Find out the number of input and output blobs in the network.
uint32_t numInputs = 0;
uint32_t numOutputs = 0;
dwDNN_getInputBlobCount(&numInputs, dnn);
dwDNN_getOutputBlobCount(&numOutputs, dnn);
// The rest of this snippet binds exactly one input and one output tensor,
// so reject any network with a different blob count.
if (numInputs != 1) {
    std::cerr << "Expected a DNN with one input blob." << std::endl;
    return -1;
}
if (numOutputs != 1) {
    std::cerr << "Expected a DNN with one output blob." << std::endl;
    return -1;
}

Ask the DNN about the order of the input and output blobs. The network is assumed to contain the input blob "data_in" and the output blob "data_out1".

// Look up each blob's index by name; the indices are used later to query
// the matching tensor properties.
uint32_t inputIndex{0};
dwDNN_getInputIndex(&inputIndex, "data_in", dnn);

uint32_t output1Index{0};
dwDNN_getOutputIndex(&output1Index, "data_out1", dnn);

Note that safe DLA requires RGBA input with interleaved channels, and it provides outputs in an NCHWx format.

The layout of the NCHWx format is equivalent to a C array with dimensions [N][(C+x-1)/x][H][W][x], with the tensor coordinates (n, c, h, w) mapping to array subscript [n][c/x][h][w][c%x] where:

- N: Batch size
- n: Batch index
- C: Number of planes
- c: Plane index
- H: Height
- h: Vertical index
- W: Width
- w: Horizontal index
- x: Number of interleaved elements

DLA dictates that x is equal to 32 / sizeof(DataType); therefore, for a tensor with FP16 precision, x is 16.

Moreover, the input and output to a safe DLA model are expected to be tensors of type NvMedia. In order to simplify the process of inference, dwDataConditioner and dwDNN modules provide the streaming and conversion functionalities.

In dwDNNTensors, the dimensions for NCHWx are stored as:

dims = {numChannels % x, W, H, (numChannels + x - 1U) / x, N};

Therefore, in order to compute the number of channels, which is needed for conversion to NCHW:

numChannels = (dims[3] - 1) * x + dims[0];

Below, we shall make use of these features:

// Get tensor properties and allocate tensors.
dwDNNTensorHandle_t inputTensor = nullptr;
dwDNNTensorProperties inputProps{};
dwDNN_getInputTensorProperties(&inputProps, inputIndex, dnn);
dwDNNTensor_createNew(&inputTensor, &inputProps, contextHandle);
dwDNNTensorHandle_t outputTensor1 = nullptr;
dwDNNTensorProperties outputProps1{};
dwDNN_getOutputTensorProperties(&outputProps1, output1Index, dnn);
// Notice that by default, the tensor type is NVMEDIA, precision is FP16, layout is NCHWx.
// Change the properties to the format we need. Note that, in order to keep the inference
// asynchronous, only NvMedia or CUDA can be selected as output.
outputProps1.tensorType = DW_DNN_TENSOR_TYPE_CUDA;
outputProps1.tensorLayout = DW_DNN_TENSOR_LAYOUT_NCHW;
outputProps1.precision = DW_PRECISION_FP32;
outputProps1.numDimensions = 4U; // NCHWx has 5 dimensions. NCHW has 4.
// Number of interleaved elements (the "x" in NCHWx): 32 / sizeof(DataType), i.e. 16 for the
// default FP16 precision reported by the DNN.
const uint32_t x = 16U;
// Finally, estimate the number of channels. The NCHWx dimensions are stored as
// {numChannels % x, W, H, (numChannels + x - 1) / x, N}, so the channel-group count is at
// index 3 and the remainder at index 0. This must be computed BEFORE the entries are
// overwritten below.
// NOTE(review): if numChannels is an exact multiple of x and dimensionSize[0] is then 0,
// this formula yields numChannels - x -- confirm how dimensionSize[0] is reported in that case.
const uint32_t numChannels = (outputProps1.dimensionSize[3] - 1U) * x + outputProps1.dimensionSize[0];
// Remap {remainder, W, H, channelGroups, N} -> {W, H, C, N}. The assignment order matters:
// each source entry is read before it is overwritten.
outputProps1.dimensionSize[0] = outputProps1.dimensionSize[1];
outputProps1.dimensionSize[1] = outputProps1.dimensionSize[2];
outputProps1.dimensionSize[2] = numChannels;
outputProps1.dimensionSize[3] = outputProps1.dimensionSize[4];
dwDNNTensor_createNew(&outputTensor1, &outputProps1, contextHandle);
// Alternatively, if the dimensions are known in advance, the step above can be skipped and
// the dimensions can be set manually.
// Create data conditioner to convert an input image to input tensor.
dwDNNMetaData dnnMetaData{};
dwDNN_getMetaData(&dnnMetaData, dnn);
dwDataConditionerHandle_t dataConditioner = nullptr;
dwDataConditioner_initializeFromTensorProperties(&dataConditioner, &inputProps, 1U,
                                                 &dnnMetaData.dataConditionerParams, cudaStream,
                                                 contextHandle);
// Create CPU tensors for outputs.
dwDNNTensorHandle_t outputTensorHost1 = nullptr;
dwDNNTensorProperties outputPropsHost1 = outputProps1;
outputPropsHost1.tensorType = DW_DNN_TENSOR_TYPE_CPU;
dwDNNTensor_createNew(&outputTensorHost1, &outputPropsHost1, contextHandle);
// Create tensor streamers to stream outputs from GPU to CPU if needed.
// The streamer takes the properties of the source (CUDA) tensor and the tensor type to
// stream to (CPU).
dwDNNTensorStreamerHandle_t streamer1 = nullptr;
dwDNNTensorStreamer_initialize(&streamer1, &outputProps1, outputPropsHost1.tensorType, contextHandle);

Convert DNN input from image to tensor, then perform DNN inference and stream results back. All operations are performed asynchronously with the host code.

// Fill the input tensor from the input image, using a region of interest that
// starts at the origin and spans imageWidth x imageHeight.
dwRect fullImageRoi{0U, 0U, imageWidth, imageHeight};
dwDataConditioner_prepareData(inputTensor, &inputImage, 1, &fullImageRoi,
                              cudaAddressModeClamp, dataConditioner);

// Kick off DNN inference in the currently selected CUDA stream.
dwConstDNNTensorHandle_t inferInputs[1U] = {inputTensor};
dwDNNTensorHandle_t inferOutputs[1U]     = {outputTensor1};
dwDNN_infer(inferOutputs, inferInputs, dnn);

// Hand the GPU output to the streamer and pick up its CPU counterpart.
dwDNNTensorStreamer_producerSend(outputTensor1, streamer1);
dwDNNTensorStreamer_consumerReceive(&outputTensorHost1, streamer1);

// Access the host tensor's data between lock and unlock.
void* hostData;
dwDNNTensor_lock(&hostData, outputTensorHost1);
doit(hostData);
dwDNNTensor_unlock(outputTensorHost1);

// Give the streamed tensor back: first on the consumer side, then on the
// producer side (with a timeout argument of 1000).
dwDNNTensorStreamer_consumerReturn(&outputTensorHost1, streamer1);
dwDNNTensorStreamer_producerReturn(nullptr, 1000, streamer1);

Finally, free previously allocated memory.

// Release every resource created by this snippet: the input and output tensors,
// the tensor streamer, the data conditioner and finally the DNN itself.
dwDNNTensor_destroy(inputTensor);
dwDNNTensor_destroy(outputTensor1);
dwDNNTensor_destroy(outputTensorHost1);
dwDNNTensorStreamer_release(streamer1);
dwDataConditioner_release(dataConditioner);
dwDNN_release(dnn);