1 # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
@page dnn_usecase3 DNN with Safe DLA

@note SW Release Applicability: This tutorial is applicable to modules in both **NVIDIA DriveWorks** and **NVIDIA DRIVE Software** releases.

This code snippet demonstrates how the DNN module with safe DLA enabled is typically used. Note that error handling is left out for clarity.

Initialize network from file.

In order to be able to use safe DLA, the model must be generated using the `--useSafeDLA` option via the @ref dwx_tensorRT_tool tool.
The processor type, while initializing DNN, must be either `::DW_PROCESSOR_TYPE_DLA_0` or `::DW_PROCESSOR_TYPE_DLA_1`, depending on the DLA engine on which the inference should take place.
15 // Load the DNN from a file. Note that the DNN model has to be generated with the tensorRT_optimization tool.
16 dwDNNHandle_t dnn = nullptr;
17 dwDNN_initializeTensorRTFromFile(&dnn, "network.dla", nullptr, DW_PROCESSOR_TYPE_DLA_0, contextHandle);
Check that the loaded network has the expected number of inputs and outputs.
23 // Find out the number of input and output blobs in the netowrk
24 uint32_t numInputs = 0;
25 uint32_t numOutputs = 0;
26 dwDNN_getInputBlobCount(&numInputs, dnn);
27 dwDNN_getOutputBlobCount(&numOutputs, dnn);
30 std::cerr << "Expected a DNN with one input blob." << std::endl;
33 if (numOutputs != 1) {
34 std::cerr << "Expected a DNN with one output blobs." << std::endl;
Ask the DNN about the order of the input and output blobs. The network is assumed to contain the input blob "data_in" and output blobs "data_out1" and "data_out2".
42 uint32_t inputIndex = 0;
43 uint32_t output1Index = 0;
45 // Find indices of blobs by their name.
46 dwDNN_getInputIndex(&inputIndex, "data_in", dnn);
47 dwDNN_getOutputIndex(&output1Index, "data_out1", dnn);
Note that safe DLA requires `RGBA` input with interleaved channels, and it provides outputs with `NCHWx` layout.

`NCHWx` format's layout is equivalent to a C array with dimensions
`[N][(C+x-1)/x][H][W][x]`, with the tensor coordinates `(n, c, h, w)`
mapping to array subscript `[n][c/x][h][w][c%x]` where:

- `N`: Number of batches
- `C`: Number of channels
- `H`: Height
- `W`: Width
- `x`: Number of interleaved elements

DLA dictates that `x` is equal to `32 / sizeof(DataType)`; therefore, for a tensor with FP16 precision, `x` is equal to `16`.
Moreover, the input and output to a safe DLA model are expected to be tensors of type NvMedia.
In order to simplify the process of inference, `dwDataConditioner` and `dwDNN` modules
provide the streaming and conversion functionalities.
In dwDNNTensors, the dimensions for NCHWx are stored as:

    dims = {numChannels % x, W, H, (numChannels + x - 1U) / x, N};

Therefore, in order to compute the number of channels, which is needed for conversion to `NCHW` (note that the channel-block count `(numChannels + x - 1U) / x` is stored at index 3 and the remainder at index 0):

    numChannels = (dims[3] - 1) * x + dims[0];
Below, we shall make use of these features:
89 // Get tensor properties and allocate tensors.
90 dwDNNTensorHandle_t inputTensor;
91 dwDNNTensorProperties inputProps;
92 dwDNN_getInputTensorProperties(&inputProps, inputIndex, dnn));
93 dwDNNTensor_createNew(&inputTensor, &inputProps, contextHandle);
95 dwDNNTensorHandle_t outputTensor1;
96 dwDNNTensorProperties outputProps1;
97 dwDNN_getOutputTensorProperties(&outputProps1, output1Index, dnn));
99 // Notice that by default, the tensor type is NVMEDIA,precision is FP16, layout is NCHWx.
100 // Change the properties to the format we need. Note that, in order to keep the inference
101 // asynchronous, only NvMedia or CUDA can be selected as output.
102 outputProps1.tensorType = DW_DNN_TENSOR_TYPE_CUDA;
103 outputProps1.tensorLayout = DW_DNN_TENSOR_LAYOUT_NCHW;
104 outputProps1.precision = DW_PRECISION_FP32;
105 outputProps1.numDimensions = 4U; // NCHWx has 5 dimensions. NCHW has 4.
107 // Finally, estimate the number of channels based on the layout formula mentioned above.
108 uint32_t numChannels = (outputProps1.dims[2] - 1) * x + outputProps1.dims[0];
109 outputProps1.dimensionSize[0] = outputProps1.dimensionSize[1];
110 outputProps1.dimensionSize[1] = outputProps1.dimensionSize[2];
111 outputProps1.dimensionSize[2] = numChannels;
112 outputProps1.dimensionSize[3] = outputProps1.dimensionSize[4];
113 dwDNNTensor_createNew(&outputTensor1, &outputProps1, contextHandle);
115 // Alternatively, if the dimensions are known in advance, the step above can be skipped and
116 // the dimensions can be set manually.
118 // Create data conditioner to convert an input image to input tensor.
119 dwDNNMetaData dnnMetaData;
120 dwDNN_getMetaData(&dnnMetaData, dnn);
121 dwDataConditionerHandle_t dataConditioner;
122 dwDataConditioner_initializeFromTensorProperties(&dataConditioner, &inputProps, 1U,
123 &metadata.dataConditionerParams, cudaStream,
126 // Create CPU tensors for outputs.
127 dwDNNTensorHandle_t outputTensorHost1;
128 dwDNNTensorProperties outputPropsHost1 = outputProps1;
129 outputPropsHost1.tensorType = DW_DNN_TENSOR_TYPE_CPU;
131 dwDNNTensor_createNew(&outputTensorHost1, &outputPropsHost1, contextHandle);
133 // Create tensor streamers to stream outputs from GPU to CPU if needed
134 dwDNNTensorStreamerHandle_t streamer1;
135 dwDNNTensorStreamer_initialize(&streamer1, &outputPropsHost1, outputPropsHost1.tensorType, m_sdk);
Convert DNN input from image to tensor, then perform DNN inference and stream results back. All operations are performed asynchronously with the host code.
141 // Run data conditioner to get input tensor
142 dwRect roi{0U, 0U, imageWidth, imageHeight};
143 dwDataConditioner_prepareData(inputTensor, &inputImage, 1, &roi,
144 cudaAddressModeClamp, dataConditioner);
146 // Begin DNN inference in the currently selected CUDA stream.
147 dwConstDNNTensorHandle_t inputs[1U] = {inputTensor};
148 dwDNNTensorHandle_t outputs[1U] = {outputTensor1};
149 dwDNN_infer(outputs, inputs, dnn);
151 // Stream results from GPU to CPU
152 dwDNNTensorStreamer_producerSend(outputTensor1, streamer1);
153 dwDNNTensorStreamer_consumerReceive(&outputTensorHost1, streamer1);
155 // Work on received output tensors.
157 dwDNNTensor_lock(&data1, outputTensorHost1);
161 dwDNNTensor_unlock(outputTensorHost1);
163 // Return streamed tensors.
164 dwDNNTensorStreamer_consumerReturn(&outputTensorHost1, streamer1);
165 dwDNNTensorStreamer_producerReturn(nullptr, 1000, streamer1);
Finally, free previously allocated memory.
171 dwDNNTensor_destroy(outputTensor1);
172 dwDNNTensor_destroy(outputTensorHost1);
173 dwDNNTensorStreamer_release(streamer1);
174 dwDataConditioner_release(dataconditioner);