DriveWorks SDK Reference
3.5.78 Release
For Test and Development only

dnn/docs/dnn_usecase3.md
Go to the documentation of this file.
1 # Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
2 
3 @page dnn_usecase3 DNN with Safe DLA
4 
5 @note SW Release Applicability: This tutorial is applicable to modules in both **NVIDIA DriveWorks** and **NVIDIA DRIVE Software** releases.
6 
7 This code snippet demonstrates how the DNN module with safe DLA enabled is typically used. Note that error handling is left out for clarity.
8 
9 Initialize network from file.
10 
11 In order to be able to use safe DLA, the model must be generated using `--useSafeDLA` option via @ref dwx_tensorRT_tool tool.
12 The processor type, while initializing DNN, must be either `::DW_PROCESSOR_TYPE_DLA_0` or `::DW_PROCESSOR_TYPE_DLA_1`, depending on the DLA engine on which the inference should take place.
13 
14 ```{.cpp}
15  // Load the DNN from a file. Note that the DNN model has to be generated with the tensorRT_optimization tool.
16  dwDNNHandle_t dnn = nullptr;
17  dwDNN_initializeTensorRTFromFile(&dnn, "network.dla", nullptr, DW_PROCESSOR_TYPE_DLA_0, contextHandle);
18 ```
19 
20 Check that the loaded network has the expected number of inputs and outputs.
21 
22 ```{.cpp}
23  // Find out the number of input and output blobs in the network
24  uint32_t numInputs = 0;
25  uint32_t numOutputs = 0;
26  dwDNN_getInputBlobCount(&numInputs, dnn);
27  dwDNN_getOutputBlobCount(&numOutputs, dnn);
28 
29  if (numInputs != 1) {
30  std::cerr << "Expected a DNN with one input blob." << std::endl;
31  return -1;
32  }
33  if (numOutputs != 1) {
34  std::cerr << "Expected a DNN with one output blob." << std::endl;
35  return -1;
36  }
37 ```
38 
39 Ask the DNN about the order of the input and output blobs. The network is assumed to contain the input blob "data_in" and the output blob "data_out1".
40 
41 ```{.cpp}
42  uint32_t inputIndex = 0;
43  uint32_t output1Index = 0;
44 
45  // Find indices of blobs by their name.
46  dwDNN_getInputIndex(&inputIndex, "data_in", dnn);
47  dwDNN_getOutputIndex(&output1Index, "data_out1", dnn);
48 ```
49 
50 Note that, safe DLA requires `RGBA` input with interleaved channels, and it provides outputs with
51 a `NCHWx` format.
52 
53 `NCHWx` format's layout is equivalent to a C array with dimensions
54 `[N][(C+x-1)/x][H][W][x]`, with the tensor coordinates `(n, c, h, w)`
55 mapping to array subscript `[n][c/x][h][w][c%x]` where:
56 
57 `N`: Batch size
58 `n`: Batch index
59 `C`: Number of planes
60 `c`: Plane index
61 `H`: Height
62 `h`: Vertical index
63 `W`: Width
64 `w`: Horizontal index
65 `x`: Number of interleaved elements
66 
67 DLA dictates that `x` is equal to `32 / sizeof(DataType)`; therefore, for a tensor with FP16 precision,
68 `x` is `16`.
69 
70 Moreover, the input and output to a safe DLA model are expected to be tensors of type NvMedia.
71 In order to simplify the process of inference, `dwDataConditioner` and `dwDNN` modules
72 provide the streaming and conversion functionalities.
73 
74 In dwDNNTensors, the dimensions for NCHWx are stored as:
75 
76 ```{.cpp}
77  dims = {numChannels % x, W, H, (numChannels + x - 1U) / x, N};
78 ```
79 
80 Therefore, in order to compute the number of channels, which is needed for conversion to `NCHW`:
81 
82 ```{.cpp}
83  numChannels = (dims[3] - 1) * x + dims[0];
84 ```
85 
86 Below, we shall make use of these features:
87 
88 ```{.cpp}
89  // Get tensor properties and allocate tensors.
90  dwDNNTensorHandle_t inputTensor;
91  dwDNNTensorProperties inputProps;
92  dwDNN_getInputTensorProperties(&inputProps, inputIndex, dnn);
93  dwDNNTensor_createNew(&inputTensor, &inputProps, contextHandle);
94 
95  dwDNNTensorHandle_t outputTensor1;
96  dwDNNTensorProperties outputProps1;
97  dwDNN_getOutputTensorProperties(&outputProps1, output1Index, dnn);
98 
99  // Notice that by default, the tensor type is NVMEDIA, precision is FP16, layout is NCHWx.
100  // Change the properties to the format we need. Note that, in order to keep the inference
101  // asynchronous, only NvMedia or CUDA can be selected as output.
102  outputProps1.tensorType = DW_DNN_TENSOR_TYPE_CUDA;
103  outputProps1.tensorLayout = DW_DNN_TENSOR_LAYOUT_NCHW;
104  outputProps1.precision = DW_PRECISION_FP32;
105  outputProps1.numDimensions = 4U; // NCHWx has 5 dimensions. NCHW has 4.
106 
107  // Finally, estimate the number of channels based on the layout formula mentioned above.
108  uint32_t numChannels = (outputProps1.dimensionSize[3] - 1) * x + outputProps1.dimensionSize[0];
109  outputProps1.dimensionSize[0] = outputProps1.dimensionSize[1];
110  outputProps1.dimensionSize[1] = outputProps1.dimensionSize[2];
111  outputProps1.dimensionSize[2] = numChannels;
112  outputProps1.dimensionSize[3] = outputProps1.dimensionSize[4];
113  dwDNNTensor_createNew(&outputTensor1, &outputProps1, contextHandle);
114 
115  // Alternatively, if the dimensions are known in advance, the step above can be skipped and
116  // the dimensions can be set manually.
117 
118  // Create data conditioner to convert an input image to input tensor.
119  dwDNNMetaData dnnMetaData;
120  dwDNN_getMetaData(&dnnMetaData, dnn);
121  dwDataConditionerHandle_t dataConditioner;
122  dwDataConditioner_initializeFromTensorProperties(&dataConditioner, &inputProps, 1U,
123  &dnnMetaData.dataConditionerParams, cudaStream,
124  contextHandle);
125 
126  // Create CPU tensors for outputs.
127  dwDNNTensorHandle_t outputTensorHost1;
128  dwDNNTensorProperties outputPropsHost1 = outputProps1;
129  outputPropsHost1.tensorType = DW_DNN_TENSOR_TYPE_CPU;
130 
131  dwDNNTensor_createNew(&outputTensorHost1, &outputPropsHost1, contextHandle);
132 
133  // Create tensor streamers to stream outputs from GPU to CPU if needed
134  dwDNNTensorStreamerHandle_t streamer1;
135  dwDNNTensorStreamer_initialize(&streamer1, &outputProps1, outputPropsHost1.tensorType, contextHandle);
136 ```
137 
138 Convert DNN input from image to tensor, then perform DNN inference and stream results back. All operations are performed asynchronously with the host code.
139 
140 ```{.cpp}
141  // Run data conditioner to get input tensor
142  dwRect roi{0U, 0U, imageWidth, imageHeight};
143  dwDataConditioner_prepareData(inputTensor, &inputImage, 1, &roi,
144  cudaAddressModeClamp, dataConditioner);
145 
146  // Begin DNN inference in the currently selected CUDA stream.
147  dwConstDNNTensorHandle_t inputs[1U] = {inputTensor};
148  dwDNNTensorHandle_t outputs[1U] = {outputTensor1};
149  dwDNN_infer(outputs, inputs, dnn);
150 
151  // Stream results from GPU to CPU
152  dwDNNTensorStreamer_producerSend(outputTensor1, streamer1);
153  dwDNNTensorStreamer_consumerReceive(&outputTensorHost1, 1000, streamer1);
154 
155  // Work on received output tensors.
156  void* data1;
157  dwDNNTensor_lock(&data1, outputTensorHost1);
158 
159  doit(data1);
160 
161  dwDNNTensor_unlock(outputTensorHost1);
162 
163  // Return streamed tensors.
164  dwDNNTensorStreamer_consumerReturn(&outputTensorHost1, streamer1);
165  dwDNNTensorStreamer_producerReturn(nullptr, 1000, streamer1);
166 ```
167 
168 Finally, free previously allocated memory.
169 
170 ```{.cpp}
171  dwDNNTensor_destroy(outputTensor1);
172  dwDNNTensor_destroy(outputTensorHost1);
173  dwDNNTensorStreamer_release(streamer1);
174  dwDataConditioner_release(dataConditioner);
175  dwDNN_release(dnn);
176 ```