Computing gradients via backward propagation

The following code example illustrates how to compute the gradients of a tensor network w.r.t. user-selected input tensors via backward propagation. The full code can be found in the NVIDIA/cuQuantum repository (here).

This example uses the network-centric API introduced in cuTensorNet v2.9.0. Key functions include cutensornetCreateNetwork, cutensornetNetworkAppendTensor, cutensornetNetworkSetOutputTensor, cutensornetNetworkPrepareContraction, cutensornetNetworkContract, cutensornetNetworkSetAdjointTensorMemory, cutensornetNetworkSetGradientTensorMemory, cutensornetNetworkPrepareGradientsBackward, and cutensornetNetworkComputeGradientsBackward.

Headers and data types

#include <stdlib.h>
#include <stdio.h>
#include <string.h> // for memset

#include <algorithm>
#include <unordered_map>
#include <vector>
#include <cassert>

#include <cuda_runtime.h>
#include <cutensornet.h>

#define HANDLE_ERROR(x)                                                                 \
    do {                                                                                \
        const auto err = x;                                                             \
        if (err != CUTENSORNET_STATUS_SUCCESS)                                          \
        {                                                                               \
            printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
            fflush(stdout);                                                             \
            exit(EXIT_FAILURE);                                                         \
        }                                                                               \
    } while (0)

#define HANDLE_CUDA_ERROR(x)                                                          \
    do {                                                                              \
        const auto err = x;                                                           \
        if (err != cudaSuccess)                                                       \
        {                                                                             \
            printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
            fflush(stdout);                                                           \
            exit(EXIT_FAILURE);                                                       \
        }                                                                             \
    } while (0)

// Usage: DEV_ATTR(cudaDevAttrClockRate, deviceId)
#define DEV_ATTR(ENUMCONST, DID)                                       \
    ({ int v;                                                          \
       HANDLE_CUDA_ERROR(cudaDeviceGetAttribute(&v, ENUMCONST, DID));  \
       v; })

struct GPUTimer
{
    GPUTimer(cudaStream_t stream) : stream_(stream)
    {
        HANDLE_CUDA_ERROR(cudaEventCreate(&start_));
        HANDLE_CUDA_ERROR(cudaEventCreate(&stop_));
    }

    ~GPUTimer()
    {
        HANDLE_CUDA_ERROR(cudaEventDestroy(start_));
        HANDLE_CUDA_ERROR(cudaEventDestroy(stop_));
    }

    void start() { HANDLE_CUDA_ERROR(cudaEventRecord(start_, stream_)); }

    float seconds()
    {
        HANDLE_CUDA_ERROR(cudaEventRecord(stop_, stream_));
        HANDLE_CUDA_ERROR(cudaEventSynchronize(stop_));
        float time;
        HANDLE_CUDA_ERROR(cudaEventElapsedTime(&time, start_, stop_));
        return time * 1e-3;
    }

private:
    cudaEvent_t start_, stop_;
    cudaStream_t stream_;
};

int main()
{
    static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

    bool verbose = true;

    // Check cuTensorNet version
    const size_t cuTensornetVersion = cutensornetGetVersion();
    if (verbose) printf("cuTensorNet version: %ld\n", cuTensornetVersion);

    // Set GPU device
    int numDevices{0};
    HANDLE_CUDA_ERROR(cudaGetDeviceCount(&numDevices));
    const int deviceId = 0;
    HANDLE_CUDA_ERROR(cudaSetDevice(deviceId));
    cudaDeviceProp prop;
    HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId));

    if (verbose)
    {
        printf("===== device info ======\n");
        printf("GPU-local-id:%d\n", deviceId);
        printf("GPU-name:%s\n", prop.name);
        printf("GPU-clock:%d\n", DEV_ATTR(cudaDevAttrClockRate, deviceId));
        printf("GPU-memoryClock:%d\n", DEV_ATTR(cudaDevAttrMemoryClockRate, deviceId));
        printf("GPU-nSM:%d\n", prop.multiProcessorCount);
        printf("GPU-major:%d\n", prop.major);
        printf("GPU-minor:%d\n", prop.minor);
        printf("========================\n");
    }

    typedef float floatType;
    cudaDataType_t typeData              = CUDA_R_32F;
    cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;

    if (verbose) printf("Included headers and defined data types\n");

Define tensor network and tensor sizes

Next, we define the structure of the tensor network (i.e., the modes of the tensors, their extents, and their connectivity), and specify the input tensor IDs whose gradients will be computed.

See also the network definition APIs cutensornetCreateNetwork, cutensornetNetworkAppendTensor, and cutensornetNetworkSetOutputTensor.

    /**********************
     * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m}
     * We will execute the contraction and compute the gradients of input tensors A, B, C
     **********************/

    constexpr int32_t numInputs       = 6;
    std::vector<int32_t> gradInputIDs = {0, 1, 2};

    // Create vectors of tensor modes
    std::vector<std::vector<int32_t>> tensorModes{
        {'a', 'b', 'c', 'd'}, // tensor A
        {'b', 'c', 'd', 'e'}, // tensor B
        {'e', 'g', 'h'},      // tensor C
        {'g', 'h', 'i', 'j'}, // tensor D
        {'i', 'j', 'k', 'l'}, // tensor E
        {'k', 'l', 'm'},      // tensor F
        {'a', 'm'}            // tensor O
    };

    // Set mode extents
    int64_t sameExtent = 36; // setting same extent for simplicity. In principle extents can differ.
    std::unordered_map<int32_t, int64_t> extent;
    for (auto& vec : tensorModes)
    {
        for (auto& mode : vec)
        {
            extent[mode] = sameExtent;
        }
    }

    // Create a vector of extents for each tensor
    std::vector<std::vector<int64_t>> tensorExtents; // for input tensors & output tensor
    tensorExtents.resize(numInputs + 1);             // hold inputs + output tensors
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        for (auto& mode : tensorModes[t]) tensorExtents[t].push_back(extent[mode]);
    }

    if (verbose) printf("Defined tensor network, modes, and extents\n");

Allocate memory, initialize data, initialize cuTensorNet handle

Next, we allocate memory for the tensor network operands and initialize them to random values. We also allocate memory for the gradient tensors corresponding to the selected input tensors, as well as for the adjoint (activation) tensor, which we initialize to ones; seeding backward propagation with an all-ones adjoint yields the gradients of the sum of the output tensor's elements with respect to the selected inputs. Then, we initialize the cuTensorNet library via cutensornetCreate(). Note that the created library context will be associated with the currently active GPU.

    /**********************
     * Allocating data
     **********************/

    std::vector<size_t> tensorElements(numInputs + 1); // for input tensors & output tensor
    std::vector<size_t> tensorSizes(numInputs + 1);    // for input tensors & output tensor
    size_t totalSize = 0;
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        size_t numElements = 1;
        for (auto& mode : tensorModes[t]) numElements *= extent[mode];
        tensorElements[t] = numElements;

        tensorSizes[t] = sizeof(floatType) * numElements;
        totalSize += tensorSizes[t];
    }

    if (verbose) printf("Total GPU memory used for tensor storage: %.2f GiB\n", (totalSize) / 1024. / 1024. / 1024);

    void* tensorData_d[numInputs + 1]; // for input tensors & output tensor
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        HANDLE_CUDA_ERROR(cudaMalloc((void**)&tensorData_d[t], tensorSizes[t]));
    }
    void* adjoint_d; // hold data of the adjoint/activation tensor
    HANDLE_CUDA_ERROR(cudaMalloc((void**)&adjoint_d, tensorSizes[numInputs]));

    floatType* tensorData_h[numInputs + 1]; // for input tensors & output tensor
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        tensorData_h[t] = (floatType*)malloc(tensorSizes[t]);
        if (tensorData_h[t] == NULL)
        {
            printf("Error: Host memory allocation failed!\n");
            return -1;
        }
    }
    floatType* adjoint_h = (floatType*)malloc(tensorSizes[numInputs]);
    if (adjoint_h == NULL)
    {
        printf("Error: Host memory allocation failed!\n");
        return -1;
    }

    void* gradients_d[numInputs] = {nullptr};
    for (auto i : gradInputIDs)
    {
        HANDLE_CUDA_ERROR(cudaMalloc((void**)&gradients_d[i], tensorSizes[i]));
    }
    void* gradients_h[numInputs] = {nullptr};
    for (auto i : gradInputIDs)
    {
        gradients_h[i] = (floatType*)malloc(tensorSizes[i]);
        if (gradients_h[i] == NULL)
        {
            printf("Error: Host memory allocation failed!\n");
            return -1;
        }
    }

    /*******************
     * Initialize data
     *******************/

    // set output tensor data to all 0s
    memset(tensorData_h[numInputs], 0, tensorSizes[numInputs]);
    // init input tensors data to random values
    for (int32_t t = 0; t < numInputs; ++t)
    {
        for (size_t e = 0; e < tensorElements[t]; ++e) tensorData_h[t][e] = ((floatType)rand()) / RAND_MAX;
    }
    // set activation tensor to all 1s
    for (size_t e = 0; e < tensorElements[numInputs]; ++e) adjoint_h[e] = (floatType)1.0;

    // copy tensors' data to device buffers
    for (int32_t t = 0; t < numInputs; ++t)
    {
        HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_d[t], tensorData_h[t], tensorSizes[t], cudaMemcpyHostToDevice));
    }
    HANDLE_CUDA_ERROR(cudaMemcpy(adjoint_d, adjoint_h, tensorSizes[numInputs], cudaMemcpyHostToDevice));

    /*************************
     * cuTensorNet
     *************************/

    cudaStream_t stream;
    HANDLE_CUDA_ERROR(cudaStreamCreate(&stream));

    cutensornetHandle_t handle;
    HANDLE_ERROR(cutensornetCreate(&handle));

    if (verbose) printf("Allocated GPU memory for data, initialized data, and created library handle\n");

Construct the network

We create the network descriptor and append the input tensors with the desired modes, extents, and data type. We can optionally set the output tensor modes (if skipped, the output modes will be inferred). To compute gradients with respect to specific input tensors, those tensors must be tagged via their tensor qualifiers (the requiresGradient field in the code below).

    /*******************************
     * Create Network
     *******************************/

    // Set up tensor network
    cutensornetNetworkDescriptor_t networkDesc;
    HANDLE_ERROR(cutensornetCreateNetwork(handle, &networkDesc));

    int64_t tensorIDs[numInputs]; // for input tensors
    // attach the input tensors to the network
    for (int32_t t = 0; t < numInputs; ++t)
    {
        cutensornetTensorQualifiers_t qualifiers{0, 0, 0};
        qualifiers.requiresGradient = gradInputIDs.end() != std::find(gradInputIDs.begin(), gradInputIDs.end(), t);
        HANDLE_ERROR(cutensornetNetworkAppendTensor(handle,
                                                    networkDesc,
                                                    tensorModes[t].size(),
                                                    tensorExtents[t].data(),
                                                    tensorModes[t].data(),
                                                    &qualifiers,
                                                    typeData,
                                                    &tensorIDs[t]));
    }

    // set output tensor of the network
    HANDLE_ERROR(cutensornetNetworkSetOutputTensor(handle,
                                                   networkDesc,
                                                   tensorModes[numInputs].size(),
                                                   tensorModes[numInputs].data(),
                                                   typeData));

    // set the network compute type
    HANDLE_ERROR(cutensornetNetworkSetAttribute(handle,
                                                networkDesc,
                                                CUTENSORNET_NETWORK_COMPUTE_TYPE,
                                                &typeCompute,
                                                sizeof(typeCompute)));

    if (verbose) printf("Initialized the cuTensorNet library and created a tensor network\n");

Contraction order

In this example, we illustrate using a predetermined contraction path and setting it into the optimizer info object via cutensornetContractionOptimizerInfoSetAttribute(). We then attach the constructed optimizer info object to the network via cutensornetNetworkSetOptimizerInfo().
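
The path in the code below is stored as a flat int32_t vector and reinterpreted as node pairs; following the numpy einsum_path convention, each pair names the two tensors to contract next, with the resulting intermediate appended to the end of the tensor list. As an illustrative alternative (not part of the sample; pathPairs and examplePath are our own names), the same path can be built directly from cutensornetNodePair_t entries, avoiding the reinterpret_cast:

    // Illustrative only: the same contraction path expressed as explicit node pairs
    std::vector<cutensornetNodePair_t> pathPairs{
        {0, 1}, {0, 4}, {0, 3}, {0, 2}, {0, 1}};
    cutensornetContractionPath_t examplePath;
    examplePath.numContractions = static_cast<int32_t>(pathPairs.size());
    examplePath.data            = pathPairs.data();
    // examplePath could then be set via CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
    // exactly as contPath is in the sample below.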

    /*******************************
     * Choose workspace limit based on available resources.
     *******************************/

    size_t freeMem, totalMem;
    HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeMem, &totalMem));
    uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
    if (verbose) printf("Workspace limit = %lu\n", workspaceLimit);

    /*******************************
     * Set contraction order
     *******************************/

    // Create contraction optimizer info
    cutensornetContractionOptimizerInfo_t optimizerInfo;
    HANDLE_ERROR(cutensornetCreateContractionOptimizerInfo(handle, networkDesc, &optimizerInfo));

    // set a predetermined contraction path
    std::vector<int32_t> path{0, 1, 0, 4, 0, 3, 0, 2, 0, 1};
    const auto numContractions = numInputs - 1;
    cutensornetContractionPath_t contPath;
    contPath.data            = reinterpret_cast<cutensornetNodePair_t*>(const_cast<int32_t*>(path.data()));
    contPath.numContractions = numContractions;

    // provide user-specified contPath
    HANDLE_ERROR(cutensornetContractionOptimizerInfoSetAttribute(handle,
                                                                 optimizerInfo,
                                                                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
                                                                 &contPath,
                                                                 sizeof(contPath)));

    // Attach the optimizer info to the network
    HANDLE_ERROR(cutensornetNetworkSetOptimizerInfo(handle,
                                                    networkDesc,
                                                    optimizerInfo));
    int64_t numSlices = 1;

    if (verbose) printf("Set predetermined contraction path into cuTensorNet optimizer\n");

Create workspace descriptor and allocate workspace memory

Next, we create a workspace descriptor, compute the workspace sizes, and query the minimum workspace size needed to contract the network. To enable gradient computation, we need to provide CACHE workspace that will be used to store intermediate tensors’ data necessary for the backward propagation call to consume. Thus, we query sizes and allocate device memory for both kinds of workspaces (CUTENSORNET_WORKSPACE_SCRATCH, and CUTENSORNET_WORKSPACE_CACHE) and set these in the workspace descriptor. The workspace descriptor will be provided to the contraction preparation, contraction computation, and gradient computation APIs.

See also cutensornetWorkspaceSetMemory, cutensornetWorkspaceGetMemorySize, and cutensornetWorkspacePurgeCache.

    /*******************************
     * Create workspace descriptor, allocate workspace, and set it.
     *******************************/

    cutensornetWorkspaceDescriptor_t workDesc;
    HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc));

    // set SCRATCH workspace, which will be used during each network contraction operation and is not needed afterwards
    int64_t requiredWorkspaceSizeScratch = 0;
    HANDLE_ERROR(cutensornetWorkspaceComputeContractionSizes(handle, networkDesc, optimizerInfo, workDesc));

    HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_SCRATCH,
                                                   &requiredWorkspaceSizeScratch));

    void* workScratch = nullptr;
    HANDLE_CUDA_ERROR(cudaMalloc(&workScratch, requiredWorkspaceSizeScratch));

    HANDLE_ERROR(cutensornetWorkspaceSetMemory(handle,
                                               workDesc,
                                               CUTENSORNET_MEMSPACE_DEVICE,
                                               CUTENSORNET_WORKSPACE_SCRATCH,
                                               workScratch,
                                               requiredWorkspaceSizeScratch));

    // set CACHE workspace, which will be used across network contraction operations
    int64_t requiredWorkspaceSizeCache = 0;
    HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_CACHE,
                                                   &requiredWorkspaceSizeCache));

    void* workCache = nullptr;
    HANDLE_CUDA_ERROR(cudaMalloc(&workCache, requiredWorkspaceSizeCache));

    HANDLE_ERROR(cutensornetWorkspaceSetMemory(handle,
                                               workDesc,
                                               CUTENSORNET_MEMSPACE_DEVICE,
                                               CUTENSORNET_WORKSPACE_CACHE,
                                               workCache,
                                               requiredWorkspaceSizeCache));

    if (verbose) printf("Allocated and set up the GPU workspace\n");
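
The code above requests the minimum workspace size (CUTENSORNET_WORKSIZE_PREF_MIN). If enough device memory is available, the recommended size can be queried instead, which generally gives the library more room to optimize; a minimal sketch for the SCRATCH workspace (recommendedScratchSize is our own name, not part of the sample):

    // Illustrative only: query the recommended (rather than minimal) SCRATCH size
    int64_t recommendedScratchSize = 0;
    HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_SCRATCH,
                                                   &recommendedScratchSize));
    // The buffer passed to cutensornetWorkspaceSetMemory would then be allocated
    // with this size instead of requiredWorkspaceSizeScratch.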

Contraction preparation and auto-tuning

We prepare the tensor network contraction via cutensornetNetworkPrepareContraction(). Optionally, we can auto-tune the contraction via cutensornetNetworkAutotuneContraction(), so that cuTENSOR selects the best kernel for each pairwise contraction. The prepared network contraction can be reused for many (possibly different) data inputs, avoiding the cost of redundant re-initialization.

In the network-centric flow, see also cutensornetNetworkAutotuneContraction.

Input and output buffers are attached using cutensornetNetworkSetInputTensorMemory and cutensornetNetworkSetOutputTensorMemory.

    /**********************************
     * Prepare the network contraction.
     **********************************/

    // set the tensors' data buffers and strides
    for (int32_t t = 0; t < numInputs; ++t)
    {
        HANDLE_ERROR(cutensornetNetworkSetInputTensorMemory(handle,
                                                            networkDesc,
                                                            tensorIDs[t],
                                                            tensorData_d[t],
                                                            NULL));
    }
    HANDLE_ERROR(cutensornetNetworkSetOutputTensorMemory(handle,
                                                         networkDesc,
                                                         tensorData_d[numInputs],
                                                         NULL));

    HANDLE_ERROR(cutensornetNetworkPrepareContraction(handle,
                                                      networkDesc,
                                                      workDesc));

    /*******************************
     * Optional: Auto-tune the network's contraction to pick the fastest kernel
     *           for each pairwise tensor contraction.
     *******************************/
    cutensornetNetworkAutotunePreference_t autotunePref;
    HANDLE_ERROR(cutensornetCreateNetworkAutotunePreference(handle, &autotunePref));

    const int numAutotuningIterations = 5; // may be 0
    HANDLE_ERROR(cutensornetNetworkAutotunePreferenceSetAttribute(handle,
                                                                  autotunePref,
                                                                  CUTENSORNET_NETWORK_AUTOTUNE_MAX_ITERATIONS,
                                                                  &numAutotuningIterations,
                                                                  sizeof(numAutotuningIterations)));

    // Autotune the network to find the best pair-wise contractions
    HANDLE_ERROR(cutensornetNetworkAutotuneContraction(handle,
                                                       networkDesc,
                                                       workDesc,
                                                       autotunePref,
                                                       stream));

    HANDLE_ERROR(cutensornetDestroyNetworkAutotunePreference(autotunePref));

    if (verbose) printf("Prepared the network contraction for cuTensorNet and optionally auto-tuned it\n");

Tensor network contraction execution and gradient computation

Finally, we contract the tensor network via cutensornetNetworkContract(). Contracting the network stores intermediate tensors’ data in the CACHE memory; we then prepare the gradient computation via cutensornetNetworkPrepareGradientsBackward() and compute the required gradients through backward propagation via cutensornetNetworkComputeGradientsBackward(). When processing multiple data sets, the CACHE memory must be purged before each new contraction so that the backward pass consumes intermediate data from the correct contraction (a sketch of such reuse follows the code listing below).

Gradient and adjoint/activation buffers are attached using cutensornetNetworkSetGradientTensorMemory and cutensornetNetworkSetAdjointTensorMemory.

See also cutensornetNetworkPrepareGradientsBackward, cutensornetNetworkComputeGradientsBackward, and cutensornetWorkspacePurgeCache.

    /**********************
     * Execute the tensor network contraction
     **********************/

    // Create a cutensornetSliceGroup_t object from a range of slice IDs
    cutensornetSliceGroup_t sliceGroup{};
    HANDLE_ERROR(cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup));

    GPUTimer timer{stream};
    // restore the output tensor on the GPU
    HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_d[numInputs], tensorData_h[numInputs], tensorSizes[numInputs], cudaMemcpyHostToDevice));
    HANDLE_CUDA_ERROR(cudaDeviceSynchronize());

    /*
     * Contract all slices of the tensor network
     */
    timer.start();

    int32_t accumulateOutput = 0; // output tensor data will be overwritten
    HANDLE_ERROR(cutensornetNetworkContract(handle,
                                            networkDesc,
                                            accumulateOutput,
                                            workDesc,
                                            sliceGroup, // alternatively, NULL can be used to contract over all slices instead of specifying a sliceGroup object
                                            stream));

    // Synchronize and measure timing
    auto time = timer.seconds();

    /**********************
     * Prepare the tensor network gradient computation
     **********************/

    HANDLE_ERROR(cutensornetNetworkSetAdjointTensorMemory(handle, networkDesc, adjoint_d, NULL));

    for (auto gid : gradInputIDs) // for only those tensors that require the gradient
    {
        HANDLE_ERROR(cutensornetNetworkSetGradientTensorMemory(handle,
                                                               networkDesc,
                                                               gid,
                                                               gradients_d[gid], NULL));
    }

    HANDLE_ERROR(cutensornetNetworkPrepareGradientsBackward(handle, networkDesc, workDesc));

    /**********************
     * Execute the tensor network gradient computation
     **********************/
    timer.start();

    HANDLE_ERROR(cutensornetNetworkComputeGradientsBackward(handle,
                                                            networkDesc,
                                                            accumulateOutput,
                                                            workDesc,
                                                            sliceGroup, // alternatively, NULL can be used to process all slices instead of specifying a sliceGroup object
                                                            stream));
    // Synchronize and measure timing
    time += timer.seconds();

    if (verbose) printf("Contracted the tensor network and computed gradients\n");

    // copy the output tensor back to the host
    HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_h[numInputs], tensorData_d[numInputs], tensorSizes[numInputs], cudaMemcpyDeviceToHost));

    for (auto i : gradInputIDs)
    {
        HANDLE_CUDA_ERROR(cudaMemcpy(gradients_h[i], gradients_d[i], tensorSizes[i], cudaMemcpyDeviceToHost));
    }

    /*************************/

    if (verbose)
    {
        printf("Tensor network contraction and back-propagation time (ms) = %.3f\n", time * 1000.f);
    }
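
Because the network has already been prepared, it can be reused for additional data sets by refilling the attached device buffers and purging the CACHE workspace before each new contraction. The following is a minimal sketch of such a loop (illustrative only, not part of the sample; it assumes the preparation calls above remain valid because the same buffers, shapes, and contraction path are reused):

    // Illustrative only: reuse the prepared network for several new data sets
    for (int iter = 0; iter < 3; ++iter)
    {
        // refill the input tensors with new values and upload them to the GPU
        for (int32_t t = 0; t < numInputs; ++t)
        {
            for (size_t e = 0; e < tensorElements[t]; ++e) tensorData_h[t][e] = ((floatType)rand()) / RAND_MAX;
            HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_d[t], tensorData_h[t], tensorSizes[t], cudaMemcpyHostToDevice));
        }

        // discard intermediates cached for the previous data set
        HANDLE_ERROR(cutensornetWorkspacePurgeCache(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE));

        // contract, then back-propagate on the fresh CACHE contents
        HANDLE_ERROR(cutensornetNetworkContract(handle, networkDesc, accumulateOutput, workDesc, sliceGroup, stream));
        HANDLE_ERROR(cutensornetNetworkComputeGradientsBackward(handle, networkDesc, accumulateOutput, workDesc, sliceGroup, stream));
        HANDLE_CUDA_ERROR(cudaStreamSynchronize(stream));
    }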

Free resources

After the computation, we need to free up all resources.

    /***************
     * Free resources
     ****************/

    // Free cuTensorNet resources
    HANDLE_ERROR(cutensornetDestroySliceGroup(sliceGroup));
    HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
    HANDLE_ERROR(cutensornetDestroyContractionOptimizerInfo(optimizerInfo));
    HANDLE_ERROR(cutensornetDestroyNetwork(networkDesc));
    HANDLE_ERROR(cutensornetDestroy(handle));

    HANDLE_CUDA_ERROR(cudaStreamDestroy(stream));

    // Free Host memory resources
    for (int i = 0; i < numInputs; ++i)
    {
        if (tensorData_h[i]) free(tensorData_h[i]);
        if (gradients_h[i]) free(gradients_h[i]);
    }
    if (tensorData_h[numInputs]) free(tensorData_h[numInputs]);
    if (adjoint_h) free(adjoint_h);

    // Free GPU memory resources
    if (workScratch) cudaFree(workScratch);
    if (workCache) cudaFree(workCache);
    if (adjoint_d) cudaFree(adjoint_d);
    for (int i = 0; i < numInputs; ++i)
    {
        if (tensorData_d[i]) cudaFree(tensorData_d[i]);
        if (gradients_d[i]) cudaFree(gradients_d[i]);
    }
    if (tensorData_d[numInputs]) cudaFree(tensorData_d[numInputs]);
    if (verbose) printf("Freed resources and exited\n");

    return 0;
}