Computing gradients via backward propagation¶
The following code example illustrates how to compute the gradients of a tensor network w.r.t. user-selected input tensors via backward propagation. The full code can be found in the NVIDIA/cuQuantum repository (here).
This example uses the network-centric API introduced in cuTensorNet v2.9.0. Key functions include cutensornetCreateNetwork, cutensornetNetworkAppendTensor, cutensornetNetworkSetOutputTensor, cutensornetNetworkPrepareContraction, cutensornetNetworkContract, cutensornetNetworkSetAdjointTensorMemory, cutensornetNetworkSetGradientTensorMemory, cutensornetNetworkPrepareGradientsBackward, and cutensornetNetworkComputeGradientsBackward.
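At a high level, the sample below follows the call sequence sketched here (a condensed outline of the calls used in this example; error handling, workspace setup, and data movement are omitted):

// Condensed call sequence of this sample (details omitted):
//   cutensornetCreate(&handle);
//   cutensornetCreateNetwork(handle, &networkDesc);
//   cutensornetNetworkAppendTensor(...);            // once per input; tag requiresGradient
//   cutensornetNetworkSetOutputTensor(...);
//   cutensornetNetworkSetOptimizerInfo(...);        // attach the contraction path
//   cutensornetNetworkSetInputTensorMemory(...);    // attach device buffers
//   cutensornetNetworkSetOutputTensorMemory(...);
//   cutensornetNetworkPrepareContraction(handle, networkDesc, workDesc);
//   cutensornetNetworkContract(...);                // forward pass (fills the CACHE workspace)
//   cutensornetNetworkSetAdjointTensorMemory(...);
//   cutensornetNetworkSetGradientTensorMemory(...); // once per requested gradient
//   cutensornetNetworkPrepareGradientsBackward(handle, networkDesc, workDesc);
//   cutensornetNetworkComputeGradientsBackward(...);// backward pass
//   ... destroy resources ...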
Headers and data types¶
#include <stdlib.h>
#include <stdio.h>

#include <algorithm>
#include <cstring> // for memset
#include <unordered_map>
#include <vector>
#include <cassert>

#include <cuda_runtime.h>
#include <cutensornet.h>

#define HANDLE_ERROR(x) \
    do { \
        const auto err = x; \
        if (err != CUTENSORNET_STATUS_SUCCESS) \
        { \
            printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
            fflush(stdout); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

#define HANDLE_CUDA_ERROR(x) \
    do { \
        const auto err = x; \
        if (err != cudaSuccess) \
        { \
            printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
            fflush(stdout); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

// Usage: DEV_ATTR(cudaDevAttrClockRate, deviceId)
#define DEV_ATTR(ENUMCONST, DID) \
    ({ int v; \
       HANDLE_CUDA_ERROR(cudaDeviceGetAttribute(&v, ENUMCONST, DID)); \
       v; })

struct GPUTimer
{
    GPUTimer(cudaStream_t stream) : stream_(stream)
    {
        HANDLE_CUDA_ERROR(cudaEventCreate(&start_));
        HANDLE_CUDA_ERROR(cudaEventCreate(&stop_));
    }

    ~GPUTimer()
    {
        HANDLE_CUDA_ERROR(cudaEventDestroy(start_));
        HANDLE_CUDA_ERROR(cudaEventDestroy(stop_));
    }

    void start() { HANDLE_CUDA_ERROR(cudaEventRecord(start_, stream_)); }

    float seconds()
    {
        HANDLE_CUDA_ERROR(cudaEventRecord(stop_, stream_));
        HANDLE_CUDA_ERROR(cudaEventSynchronize(stop_));
        float time;
        HANDLE_CUDA_ERROR(cudaEventElapsedTime(&time, start_, stop_));
        return time * 1e-3;
    }

private:
    cudaEvent_t start_, stop_;
    cudaStream_t stream_;
};

int main()
{
    static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

    bool verbose = true;

    // Check cuTensorNet version
    const size_t cuTensornetVersion = cutensornetGetVersion();
    if (verbose) printf("cuTensorNet version: %ld\n", cuTensornetVersion);

    // Set GPU device
    int numDevices{0};
    HANDLE_CUDA_ERROR(cudaGetDeviceCount(&numDevices));
    const int deviceId = 0;
    HANDLE_CUDA_ERROR(cudaSetDevice(deviceId));
    cudaDeviceProp prop;
    HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId));

    if (verbose)
    {
        printf("===== device info ======\n");
        printf("GPU-local-id:%d\n", deviceId);
        printf("GPU-name:%s\n", prop.name);
        printf("GPU-clock:%d\n", DEV_ATTR(cudaDevAttrClockRate, deviceId));
        printf("GPU-memoryClock:%d\n", DEV_ATTR(cudaDevAttrMemoryClockRate, deviceId));
        printf("GPU-nSM:%d\n", prop.multiProcessorCount);
        printf("GPU-major:%d\n", prop.major);
        printf("GPU-minor:%d\n", prop.minor);
        printf("========================\n");
    }

    typedef float floatType;
    cudaDataType_t typeData = CUDA_R_32F;
    cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;

    if (verbose) printf("Included headers and defined data types\n");
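If double precision is desired instead, the element, data, and compute types can be switched together. The snippet below is a minimal variation of the type definitions above, not part of the sample itself:

    // Hypothetical double-precision variant of the type definitions above
    typedef double floatType;                                        // element type
    cudaDataType_t typeData = CUDA_R_64F;                            // tensor data type
    cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_64F;  // compute type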
Define tensor network and tensor sizes¶
Next, we define the structure of the tensor network (i.e., the modes of the tensors, their extents, and their connectivity), and specify the input tensor IDs whose gradients will be computed.
See also the network definition APIs cutensornetCreateNetwork, cutensornetNetworkAppendTensor, and cutensornetNetworkSetOutputTensor.
    /**********************
     * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m}
     * We will execute the contraction and compute the gradients of input tensors A, B, C
     **********************/

    constexpr int32_t numInputs = 6;
    std::vector<int32_t> gradInputIDs = {0, 1, 2};

    // Create vectors of tensor modes
    std::vector<std::vector<int32_t>> tensorModes{
        {'a', 'b', 'c', 'd'}, // tensor A
        {'b', 'c', 'd', 'e'}, // tensor B
        {'e', 'g', 'h'},      // tensor C
        {'g', 'h', 'i', 'j'}, // tensor D
        {'i', 'j', 'k', 'l'}, // tensor E
        {'k', 'l', 'm'},      // tensor F
        {'a', 'm'}            // tensor O
    };

    // Set mode extents
    int64_t sameExtent = 36; // setting same extent for simplicity. In principle extents can differ.
    std::unordered_map<int32_t, int64_t> extent;
    for (auto& vec : tensorModes)
    {
        for (auto& mode : vec)
        {
            extent[mode] = sameExtent;
        }
    }

    // Create a vector of extents for each tensor
    std::vector<std::vector<int64_t>> tensorExtents; // for input tensors & output tensor
    tensorExtents.resize(numInputs + 1);             // hold inputs + output tensors
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        for (auto& mode : tensorModes[t]) tensorExtents[t].push_back(extent[mode]);
    }

    if (verbose) printf("Defined tensor network, modes, and extents\n");
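As a sanity check on the network definition, one can verify that every contracted mode appears exactly twice among the inputs and that every output mode appears exactly once. The helper below is purely illustrative and not part of the sample:

    // Illustrative consistency check (not part of the sample): every mode that is
    // not an output mode must appear exactly twice among the input tensors.
    {
        std::unordered_map<int32_t, int32_t> occurrence;
        for (int32_t t = 0; t < numInputs; ++t)
            for (auto mode : tensorModes[t]) ++occurrence[mode];
        for (auto mode : tensorModes[numInputs]) // output modes appear once among the inputs
            assert(occurrence[mode] == 1);
        for (auto& kv : occurrence) // all remaining (contracted) modes appear twice
        {
            const bool isOutputMode = std::find(tensorModes[numInputs].begin(),
                                                tensorModes[numInputs].end(), kv.first)
                                      != tensorModes[numInputs].end();
            if (!isOutputMode) assert(kv.second == 2);
        }
    }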
Allocate memory, initialize data, initialize cuTensorNet handle¶
Next, we allocate memory for the tensor network operands and initialize them to random values.
We also allocate memory for the gradient tensors of the selected input tensors, as well as for the adjoint (activation) tensor, which we initialize to ones.
Then, we initialize the cuTensorNet library via cutensornetCreate().
Note that the created library context will be associated with the currently active GPU.
    /**********************
     * Allocating data
     **********************/

    std::vector<size_t> tensorElements(numInputs + 1); // for input tensors & output tensor
    std::vector<size_t> tensorSizes(numInputs + 1);    // for input tensors & output tensor
    size_t totalSize = 0;
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        size_t numElements = 1;
        for (auto& mode : tensorModes[t]) numElements *= extent[mode];
        tensorElements[t] = numElements;

        tensorSizes[t] = sizeof(floatType) * numElements;
        totalSize += tensorSizes[t];
    }

    if (verbose) printf("Total GPU memory used for tensor storage: %.2f GiB\n", (totalSize) / 1024. / 1024. / 1024);

    void* tensorData_d[numInputs + 1]; // for input tensors & output tensor
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        HANDLE_CUDA_ERROR(cudaMalloc((void**)&tensorData_d[t], tensorSizes[t]));
    }
    void* adjoint_d; // hold data of the adjoint/activation tensor
    HANDLE_CUDA_ERROR(cudaMalloc((void**)&adjoint_d, tensorSizes[numInputs]));

    floatType* tensorData_h[numInputs + 1]; // for input tensors & output tensor
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        tensorData_h[t] = (floatType*)malloc(tensorSizes[t]);
        if (tensorData_h[t] == NULL)
        {
            printf("Error: Host memory allocation failed!\n");
            return -1;
        }
    }
    floatType* adjoint_h = (floatType*)malloc(tensorSizes[numInputs]);
    if (adjoint_h == NULL)
    {
        printf("Error: Host memory allocation failed!\n");
        return -1;
    }

    void* gradients_d[numInputs] = {nullptr};
    for (auto i : gradInputIDs)
    {
        HANDLE_CUDA_ERROR(cudaMalloc((void**)&gradients_d[i], tensorSizes[i]));
    }
    void* gradients_h[numInputs] = {nullptr};
    for (auto i : gradInputIDs)
    {
        gradients_h[i] = (floatType*)malloc(tensorSizes[i]);
        if (gradients_h[i] == NULL)
        {
            printf("Error: Host memory allocation failed!\n");
            return -1;
        }
    }

    /*******************
     * Initialize data
     *******************/

    // set output tensor data to all 0s
    memset(tensorData_h[numInputs], 0, tensorSizes[numInputs]);
    // init input tensors data to random values
    for (int32_t t = 0; t < numInputs; ++t)
    {
        for (size_t e = 0; e < tensorElements[t]; ++e) tensorData_h[t][e] = ((floatType)rand()) / RAND_MAX;
    }
    // set activation tensor to all 1s
    for (size_t e = 0; e < tensorElements[numInputs]; ++e) adjoint_h[e] = (floatType)1.0;

    // copy tensors' data to device buffers
    for (int32_t t = 0; t < numInputs; ++t)
    {
        HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_d[t], tensorData_h[t], tensorSizes[t], cudaMemcpyHostToDevice));
    }
    HANDLE_CUDA_ERROR(cudaMemcpy(adjoint_d, adjoint_h, tensorSizes[numInputs], cudaMemcpyHostToDevice));

    /*************************
     * cuTensorNet
     *************************/

    cudaStream_t stream;
    HANDLE_CUDA_ERROR(cudaStreamCreate(&stream));

    cutensornetHandle_t handle;
    HANDLE_ERROR(cutensornetCreate(&handle));

    if (verbose) printf("Allocated GPU memory for data, initialized data, and created library handle\n");
Construct the network¶
We create the network descriptor and append the input tensors with their modes, extents, and data type. Optionally, we can set the output tensor modes (if skipped, the output modes are inferred). To compute gradients with respect to specific input tensors, those tensors must be tagged via their tensor qualifiers (the requiresGradient field).
    /*******************************
     * Create Network
     *******************************/

    // Set up tensor network
    cutensornetNetworkDescriptor_t networkDesc;
    HANDLE_ERROR(cutensornetCreateNetwork(handle, &networkDesc));

    int64_t tensorIDs[numInputs]; // for input tensors
    // attach the input tensors to the network
    for (int32_t t = 0; t < numInputs; ++t)
    {
        cutensornetTensorQualifiers_t qualifiers{0, 0, 0};
        qualifiers.requiresGradient = gradInputIDs.end() != std::find(gradInputIDs.begin(), gradInputIDs.end(), t);
        HANDLE_ERROR(cutensornetNetworkAppendTensor(handle,
                                                    networkDesc,
                                                    tensorModes[t].size(),
                                                    tensorExtents[t].data(),
                                                    tensorModes[t].data(),
                                                    &qualifiers,
                                                    typeData,
                                                    &tensorIDs[t]));
    }

    // set output tensor of the network
    HANDLE_ERROR(cutensornetNetworkSetOutputTensor(handle,
                                                   networkDesc,
                                                   tensorModes[numInputs].size(),
                                                   tensorModes[numInputs].data(),
                                                   typeData));

    // set the network compute type
    HANDLE_ERROR(cutensornetNetworkSetAttribute(handle,
                                                networkDesc,
                                                CUTENSORNET_NETWORK_COMPUTE_TYPE,
                                                &typeCompute,
                                                sizeof(typeCompute)));

    if (verbose) printf("Initialized the cuTensorNet library and created a tensor network\n");
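Besides requiresGradient, the tensor qualifiers also carry isConjugate and isConstant flags. As a hedged sketch (not used in this sample), inputs whose data never change across repeated contractions could additionally be marked constant, which may allow the library to reuse cached intermediates that depend only on them:

    // Hedged sketch (not part of this sample): qualifiers for an input tensor whose
    // data stay fixed across repeated contractions and whose gradient is not needed.
    cutensornetTensorQualifiers_t qualifiersExample{0, 0, 0};
    qualifiersExample.isConjugate      = 0; // use the tensor as-is (no complex conjugation)
    qualifiersExample.isConstant       = 1; // data will not change between contractions
    qualifiersExample.requiresGradient = 0; // no gradient requested for this tensor
    // ... pass &qualifiersExample to cutensornetNetworkAppendTensor() as above ...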
Contraction order¶
In this example, we illustrate using a predetermined contraction path and setting it into the optimizer info object via cutensornetContractionOptimizerInfoSetAttribute(). We also attach the constructed optimizer info object to the network via cutensornetNetworkSetOptimizerInfo().
    /*******************************
     * Choose workspace limit based on available resources.
     *******************************/

    size_t freeMem, totalMem;
    HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeMem, &totalMem));
    uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
    if (verbose) printf("Workspace limit = %lu\n", workspaceLimit);

    /*******************************
     * Set contraction order
     *******************************/

    // Create contraction optimizer info
    cutensornetContractionOptimizerInfo_t optimizerInfo;
    HANDLE_ERROR(cutensornetCreateContractionOptimizerInfo(handle, networkDesc, &optimizerInfo));

    // set a predetermined contraction path
    std::vector<int32_t> path{0, 1, 0, 4, 0, 3, 0, 2, 0, 1};
    const auto numContractions = numInputs - 1;
    cutensornetContractionPath_t contPath;
    contPath.data = reinterpret_cast<cutensornetNodePair_t*>(const_cast<int32_t*>(path.data()));
    contPath.numContractions = numContractions;

    // provide user-specified contPath
    HANDLE_ERROR(cutensornetContractionOptimizerInfoSetAttribute(handle,
                                                                 optimizerInfo,
                                                                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
                                                                 &contPath,
                                                                 sizeof(contPath)));

    // Attach the optimizer info to the network
    HANDLE_ERROR(cutensornetNetworkSetOptimizerInfo(handle,
                                                    networkDesc,
                                                    optimizerInfo));
    int64_t numSlices = 1;

    if (verbose) printf("Set predetermined contraction path into cuTensorNet optimizer\n");
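Instead of supplying a predetermined path, the path can also be searched for by the optimizer. The sketch below assumes that the pathfinder APIs (cutensornetCreateContractionOptimizerConfig / cutensornetContractionOptimize) accept the network descriptor created by cutensornetCreateNetwork, as they do for the classic network descriptor; consult the API reference before relying on this:

    // Hedged alternative (assumption: the pathfinder accepts this network descriptor):
    // let cuTensorNet search for a contraction path under the workspace limit, then
    // query the resulting number of slices instead of hard-coding numSlices = 1.
    cutensornetContractionOptimizerConfig_t optimizerConfig;
    HANDLE_ERROR(cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig));

    HANDLE_ERROR(cutensornetContractionOptimize(handle,
                                                networkDesc,
                                                optimizerConfig,
                                                workspaceLimit,
                                                optimizerInfo)); // overwrites the path set above

    int64_t foundNumSlices = 0;
    HANDLE_ERROR(cutensornetContractionOptimizerInfoGetAttribute(handle,
                                                                 optimizerInfo,
                                                                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
                                                                 &foundNumSlices,
                                                                 sizeof(foundNumSlices)));

    HANDLE_ERROR(cutensornetDestroyContractionOptimizerConfig(optimizerConfig));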
Create workspace descriptor and allocate workspace memory¶
Next, we create a workspace descriptor and compute the minimum workspace sizes needed to contract the network.
To enable gradient computation, we must also provide a CACHE workspace, which stores the intermediate tensors' data consumed by the backward-propagation call.
Thus, we query the sizes of, and allocate device memory for, both kinds of workspaces (CUTENSORNET_WORKSPACE_SCRATCH and CUTENSORNET_WORKSPACE_CACHE) and set them in the workspace descriptor.
The workspace descriptor will be provided to the contraction preparation, contraction computation, and gradient computation APIs.
See also cutensornetWorkspaceSetMemory, cutensornetWorkspaceGetMemorySize, and cutensornetWorkspacePurgeCache.
    /*******************************
     * Create workspace descriptor, allocate workspace, and set it.
     *******************************/

    cutensornetWorkspaceDescriptor_t workDesc;
    HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc));

    // set SCRATCH workspace, which will be used during each network contraction operation, not needed afterwards
    int64_t requiredWorkspaceSizeScratch = 0;
    HANDLE_ERROR(cutensornetWorkspaceComputeContractionSizes(handle, networkDesc, optimizerInfo, workDesc));

    HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_SCRATCH,
                                                   &requiredWorkspaceSizeScratch));

    void* workScratch = nullptr;
    HANDLE_CUDA_ERROR(cudaMalloc(&workScratch, requiredWorkspaceSizeScratch));

    HANDLE_ERROR(cutensornetWorkspaceSetMemory(handle,
                                               workDesc,
                                               CUTENSORNET_MEMSPACE_DEVICE,
                                               CUTENSORNET_WORKSPACE_SCRATCH,
                                               workScratch,
                                               requiredWorkspaceSizeScratch));

    // set CACHE workspace, which will be used across network contraction operations
    int64_t requiredWorkspaceSizeCache = 0;
    HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_CACHE,
                                                   &requiredWorkspaceSizeCache));

    void* workCache = nullptr;
    HANDLE_CUDA_ERROR(cudaMalloc(&workCache, requiredWorkspaceSizeCache));

    HANDLE_ERROR(cutensornetWorkspaceSetMemory(handle,
                                               workDesc,
                                               CUTENSORNET_MEMSPACE_DEVICE,
                                               CUTENSORNET_WORKSPACE_CACHE,
                                               workCache,
                                               requiredWorkspaceSizeCache));

    if (verbose) printf("Allocated and set up the GPU workspace\n");
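Alternatively to explicitly allocated workspaces, a device memory handler can be registered on the library handle via cutensornetSetDeviceMemHandler(), so that cuTensorNet draws workspace from a user-provided pool on demand. The sketch below uses the CUDA stream-ordered allocator; the cutensornetDeviceMemHandler_t field names are an assumption here, so verify them against the installed headers and the memory-management documentation:

    // Hedged sketch (not part of this sample): register a device memory handler backed
    // by the CUDA stream-ordered memory pool. Field names assumed from
    // cutensornetDeviceMemHandler_t; check the installed cutensornet.h.
    auto poolAlloc = [](void* ctx, void** ptr, size_t size, cudaStream_t stream) -> int {
        return static_cast<int>(cudaMallocAsync(ptr, size, stream)); // 0 on success
    };
    auto poolFree = [](void* ctx, void* ptr, size_t size, cudaStream_t stream) -> int {
        return static_cast<int>(cudaFreeAsync(ptr, stream));
    };
    cutensornetDeviceMemHandler_t memHandler{};
    memHandler.ctx          = nullptr;
    memHandler.device_alloc = poolAlloc; // non-capturing lambdas convert to function pointers
    memHandler.device_free  = poolFree;
    strcpy(memHandler.name, "cuda_mempool");
    HANDLE_ERROR(cutensornetSetDeviceMemHandler(handle, &memHandler));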
Contraction preparation and auto-tuning¶
We prepare the tensor network contraction via cutensornetNetworkPrepareContraction().
Optionally, we can auto-tune the contraction via cutensornetNetworkAutotuneContraction(), such that cuTENSOR selects the best kernel for each pairwise contraction.
The prepared network contraction can be reused for many (possibly different) data inputs, avoiding the cost of redundant re-preparation.
Auto-tuning behavior is controlled through cutensornetCreateNetworkAutotunePreference and cutensornetNetworkAutotunePreferenceSetAttribute.
Input and output buffers are attached using cutensornetNetworkSetInputTensorMemory and cutensornetNetworkSetOutputTensorMemory.
    /**********************************
     * Prepare the network contraction.
     **********************************/

    // set the tensors' data buffers and strides
    for (int32_t t = 0; t < numInputs; ++t)
    {
        HANDLE_ERROR(cutensornetNetworkSetInputTensorMemory(handle,
                                                            networkDesc,
                                                            tensorIDs[t],
                                                            tensorData_d[t],
                                                            NULL));
    }
    HANDLE_ERROR(cutensornetNetworkSetOutputTensorMemory(handle,
                                                         networkDesc,
                                                         tensorData_d[numInputs],
                                                         NULL));

    HANDLE_ERROR(cutensornetNetworkPrepareContraction(handle,
                                                      networkDesc,
                                                      workDesc));

    /*******************************
     * Optional: Auto-tune the network's contraction to pick the fastest kernel
     * for each pairwise tensor contraction.
     *******************************/
    cutensornetNetworkAutotunePreference_t autotunePref;
    HANDLE_ERROR(cutensornetCreateNetworkAutotunePreference(handle, &autotunePref));

    const int numAutotuningIterations = 5; // may be 0
    HANDLE_ERROR(cutensornetNetworkAutotunePreferenceSetAttribute(handle,
                                                                  autotunePref,
                                                                  CUTENSORNET_NETWORK_AUTOTUNE_MAX_ITERATIONS,
                                                                  &numAutotuningIterations,
                                                                  sizeof(numAutotuningIterations)));

    // Autotune the network to find the best pair-wise contractions
    HANDLE_ERROR(cutensornetNetworkAutotuneContraction(handle,
                                                       networkDesc,
                                                       workDesc,
                                                       autotunePref,
                                                       stream));

    HANDLE_ERROR(cutensornetDestroyNetworkAutotunePreference(autotunePref));

    if (verbose) printf("Prepared the network contraction for cuTensorNet and optionally auto-tuned it\n");
Tensor network contraction execution and gradient computation¶
Finally, we contract the tensor network via cutensornetNetworkContract().
After contracting the network (which stores the intermediate tensors' data in the CACHE memory), we prepare the gradient computation via cutensornetNetworkPrepareGradientsBackward() and compute the required gradients through backward propagation via cutensornetNetworkComputeGradientsBackward().
When the network is contracted repeatedly with new input data, the CACHE memory must be purged before each contraction so that the contraction call stores the correct intermediate data for that data set.
Gradient and adjoint/activation buffers are attached using cutensornetNetworkSetGradientTensorMemory and cutensornetNetworkSetAdjointTensorMemory.
See also cutensornetNetworkPrepareGradientsBackward, cutensornetNetworkComputeGradientsBackward, and cutensornetWorkspacePurgeCache.
    /**********************
     * Execute the tensor network contraction
     **********************/

    // Create a cutensornetSliceGroup_t object from a range of slice IDs
    cutensornetSliceGroup_t sliceGroup{};
    HANDLE_ERROR(cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup));

    GPUTimer timer{stream};
    // restore the output tensor on GPU
    HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_d[numInputs], tensorData_h[numInputs], tensorSizes[numInputs], cudaMemcpyHostToDevice));
    HANDLE_CUDA_ERROR(cudaDeviceSynchronize());

    /*
     * Contract all slices of the tensor network
     */
    timer.start();

    int32_t accumulateOutput = 0; // output tensor data will be overwritten
    HANDLE_ERROR(cutensornetNetworkContract(handle,
                                            networkDesc,
                                            accumulateOutput,
                                            workDesc,
                                            sliceGroup, // alternatively, NULL can be used to contract over all slices instead of specifying a sliceGroup object
                                            stream));

    // Synchronize and measure timing
    auto time = timer.seconds();

    /**********************
     * Prepare the tensor network gradient computation
     **********************/

    HANDLE_ERROR(cutensornetNetworkSetAdjointTensorMemory(handle, networkDesc, adjoint_d, NULL));

    for (auto gid : gradInputIDs) // for only those tensors that require the gradient
    {
        HANDLE_ERROR(cutensornetNetworkSetGradientTensorMemory(handle,
                                                               networkDesc,
                                                               gid,
                                                               gradients_d[gid], NULL));
    }

    HANDLE_ERROR(cutensornetNetworkPrepareGradientsBackward(handle, networkDesc, workDesc));

    /**********************
     * Execute the tensor network gradient computation
     **********************/
    timer.start();

    HANDLE_ERROR(cutensornetNetworkComputeGradientsBackward(handle,
                                                            networkDesc,
                                                            accumulateOutput,
                                                            workDesc,
                                                            sliceGroup, // alternatively, NULL can be used to process all slices instead of specifying a sliceGroup object
                                                            stream));
    // Synchronize and measure timing
    time += timer.seconds();

    if (verbose) printf("Contracted the tensor network and computed gradients\n");

    // copy the output tensor back to the host
    HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_h[numInputs], tensorData_d[numInputs], tensorSizes[numInputs], cudaMemcpyDeviceToHost));

    for (auto i : gradInputIDs)
    {
        HANDLE_CUDA_ERROR(cudaMemcpy(gradients_h[i], gradients_d[i], tensorSizes[i], cudaMemcpyDeviceToHost));
    }

    /*************************/

    if (verbose)
    {
        printf("Tensor network contraction and back-propagation time (ms) = %.3f\n", time * 1000.f);
    }
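Because the CACHE workspace still holds intermediates from this data set, re-running the same prepared network on new input data requires purging the cache first, as noted above. The following is a hedged sketch of such a re-run (not part of the sample; it reuses the buffers, prepared network, and slice group already set up above):

    // Hedged sketch (not part of this sample): reuse the prepared network for a new
    // data set. The device buffers already attached to the network are refilled in
    // place; the CACHE workspace is purged before re-contracting.
    for (int32_t t = 0; t < numInputs; ++t) // refill inputs with new (here: random) values
    {
        for (size_t e = 0; e < tensorElements[t]; ++e) tensorData_h[t][e] = ((floatType)rand()) / RAND_MAX;
        HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_d[t], tensorData_h[t], tensorSizes[t], cudaMemcpyHostToDevice));
    }

    // drop cached intermediates belonging to the previous data set
    HANDLE_ERROR(cutensornetWorkspacePurgeCache(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE));

    // forward contraction followed by backward propagation, exactly as above
    HANDLE_ERROR(cutensornetNetworkContract(handle, networkDesc, accumulateOutput,
                                            workDesc, sliceGroup, stream));
    HANDLE_ERROR(cutensornetNetworkComputeGradientsBackward(handle, networkDesc, accumulateOutput,
                                                            workDesc, sliceGroup, stream));
    HANDLE_CUDA_ERROR(cudaStreamSynchronize(stream));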
Free resources¶
After the computation, we need to free up all resources.
    /***************
     * Free resources
     ****************/

    // Free cuTensorNet resources
    HANDLE_ERROR(cutensornetDestroySliceGroup(sliceGroup));
    HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
    HANDLE_ERROR(cutensornetDestroyContractionOptimizerInfo(optimizerInfo));
    HANDLE_ERROR(cutensornetDestroyNetwork(networkDesc));
    HANDLE_ERROR(cutensornetDestroy(handle));

    HANDLE_CUDA_ERROR(cudaStreamDestroy(stream));

    // Free Host memory resources
    for (int i = 0; i < numInputs; ++i)
    {
        if (tensorData_h[i]) free(tensorData_h[i]);
        if (gradients_h[i]) free(gradients_h[i]);
    }
    if (tensorData_h[numInputs]) free(tensorData_h[numInputs]);
    if (adjoint_h) free(adjoint_h);

    // Free GPU memory resources
    if (workScratch) cudaFree(workScratch);
    if (workCache) cudaFree(workCache);
    if (adjoint_d) cudaFree(adjoint_d);
    for (int i = 0; i < numInputs; ++i)
    {
        if (tensorData_d[i]) cudaFree(tensorData_d[i]);
        if (gradients_d[i]) cudaFree(gradients_d[i]);
    }
    if (tensorData_d[numInputs]) cudaFree(tensorData_d[numInputs]);
    if (verbose) printf("Freed resources and exited\n");

    return 0;
}