Computing gradients via backward propagation

The following code example illustrates how to compute the gradients of a tensor network with respect to user-selected input tensors via backward propagation. The full sample code can be found in the NVIDIA/cuQuantum repository.
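Conceptually, the gradients computed here are adjoints: given the network output O and a user-supplied output activation (the gradient of some scalar loss with respect to O), backward propagation returns the derivative of the scalar product of the activation with O for each selected input tensor. For the network contracted in this example, a sketch in index notation (the sample sets the activation to all ones) is

\[
\frac{\partial \langle \bar{O}, O \rangle}{\partial A_{a,b,c,d}}
  = \sum_{e,g,h,i,j,k,l,m} \bar{O}_{a,m}\, B_{b,c,d,e}\, C_{e,g,h}\, D_{g,h,i,j}\, E_{i,j,k,l}\, F_{k,l,m},
\]

where \(\bar{O}\) denotes the output activation; the gradients with respect to B and C are analogous.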

Headers and data types

#include <stdlib.h>
#include <stdio.h>

#include <unordered_map>
#include <vector>
#include <cassert>

#include <cuda_runtime.h>
#include <cutensornet.h>


#define HANDLE_ERROR(x)                                           \
{ const auto err = x;                                             \
  if( err != CUTENSORNET_STATUS_SUCCESS )                         \
  { printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
    fflush(stdout);                                               \
  }                                                               \
};

#define HANDLE_CUDA_ERROR(x)                                      \
{ const auto err = x;                                             \
  if( err != cudaSuccess )                                        \
  { printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
    fflush(stdout);                                               \
  }                                                               \
};


struct GPUTimer
{
    GPUTimer(cudaStream_t stream): stream_(stream)
    {
        HANDLE_CUDA_ERROR(cudaEventCreate(&start_));
        HANDLE_CUDA_ERROR(cudaEventCreate(&stop_));
    }

    ~GPUTimer()
    {
        HANDLE_CUDA_ERROR(cudaEventDestroy(start_));
        HANDLE_CUDA_ERROR(cudaEventDestroy(stop_));
    }

    void start()
    {
        HANDLE_CUDA_ERROR(cudaEventRecord(start_, stream_));
    }

    float seconds()
    {
        HANDLE_CUDA_ERROR(cudaEventRecord(stop_, stream_));
        HANDLE_CUDA_ERROR(cudaEventSynchronize(stop_));
        float time;
        HANDLE_CUDA_ERROR(cudaEventElapsedTime(&time, start_, stop_));
        return time * 1e-3;
    }

    private:
    cudaEvent_t start_, stop_;
    cudaStream_t stream_;
};


int main()
{
    static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

    bool verbose = true;

    // Check cuTensorNet version
    const size_t cuTensornetVersion = cutensornetGetVersion();
    if(verbose)
        printf("cuTensorNet version: %ld\n", cuTensornetVersion);

    // Set GPU device
    int numDevices {0};
    HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) );
    const int deviceId = 0;
    HANDLE_CUDA_ERROR( cudaSetDevice(deviceId) );
    cudaDeviceProp prop;
    HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );

    if(verbose) {
        printf("===== device info ======\n");
        printf("GPU-name:%s\n", prop.name);
        printf("GPU-clock:%d\n", prop.clockRate);
        printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
        printf("GPU-nSM:%d\n", prop.multiProcessorCount);
        printf("GPU-major:%d\n", prop.major);
        printf("GPU-minor:%d\n", prop.minor);
        printf("========================\n");
    }

    typedef float floatType;
    cudaDataType_t typeData = CUDA_R_32F;
    cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;

    if(verbose)
        printf("Included headers and defined data types\n");
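The sample uses single-precision real data. Other type combinations follow the same pattern; for instance, a double-precision complex variant would only swap the three type definitions above (a sketch, not part of the sample; the random host-side initialization further below would also need to be adapted to complex values):

    // Hypothetical variant: double-precision complex tensors
    // (cuDoubleComplex is declared in <cuComplex.h>)
    typedef cuDoubleComplex floatType;
    cudaDataType_t typeData = CUDA_C_64F;
    cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_64F;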

Define tensor network and tensor sizes

Next, we define the structure of the tensor network (i.e., the modes of the tensors, their extents, and their connectivity), and specify the input tensor IDs whose gradients will be computed.

    /**********************
    * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m}
    * We will execute the contraction and compute the gradients of input tensors A, B, C
    **********************/

    constexpr int32_t numInputs = 6;
    std::vector<int32_t> gradInputIDs = {0, 1, 2};

    // Create vectors of tensor modes
    std::vector<std::vector<int32_t>> modesVec {
        {'a','b','c','d'},
        {'b','c','d','e'},
        {'e','g','h'},
        {'g','h','i','j'},
        {'i','j','k','l'},
        {'k','l','m'},
        {'a','m'}
    };

    // Set mode extents
    int64_t sameExtent = 36; // setting same extent for simplicity. In principle extents can differ.
    std::unordered_map<int32_t, int64_t> extent;
    for (auto &vec: modesVec)
    {
        for (auto &mode: vec)
        {
            extent[mode] = sameExtent;
        }
    }

    // Create a vector of extents for each tensor
    std::vector<std::vector<int64_t>> extentVec;
    extentVec.resize(numInputs+1); // hold inputs + output tensors
    for (int i = 0; i < numInputs+1; ++i)
    {
        for (auto mode : modesVec[i])
            extentVec[i].push_back(extent[mode]);
    }

    if(verbose)
        printf("Defined tensor network, modes, and extents\n");

Allocate memory, initialize data, initialize cuTensorNet handle

Next, we allocate memory for the tensor network operands and initialize them to random values. We also allocate memory for the gradient tensors corresponding to the selected input tensors, as well as for the output activation tensor, which we initialize to ones. Then, we initialize the cuTensorNet library via cutensornetCreate().

    /**********************
    * Allocating data
    **********************/

    std::vector<size_t> elementsVec;
    elementsVec.resize(numInputs+1); // hold inputs + output tensors
    for (int i = 0; i < numInputs+1; ++i)
    {
        elementsVec[i] = 1;
        for (auto mode : modesVec[i])
            elementsVec[i] *= extent[mode];
    }

    size_t totalSize = 0;
    std::vector<size_t> sizeVec;
    sizeVec.resize(numInputs+1); // hold inputs + output tensors
    for (int i = 0; i < numInputs+1; ++i)
    {
        sizeVec[i] = sizeof(floatType) * elementsVec[i];
        totalSize += sizeVec[i];
    }
    if(verbose)
        printf("Total GPU memory used for tensor storage: %.2f GiB\n",
                (totalSize) / 1024. / 1024. / 1024);

    void* rawDataIn_d[numInputs];
    void* O_d;
    void* outputActivation_d;
    for (int i = 0; i < numInputs; ++i)
    {
        HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[i], sizeVec[i]) );
    }
    HANDLE_CUDA_ERROR( cudaMalloc((void**) &O_d, sizeVec[numInputs]));
    HANDLE_CUDA_ERROR( cudaMalloc((void**) &outputActivation_d, sizeVec[numInputs]));

    floatType* rawDataIn_h[numInputs];
    for (int i = 0; i < numInputs; ++i)
    {
        rawDataIn_h[i] = (floatType*) malloc(sizeVec[i]);
        if (rawDataIn_h[i] == NULL)
        {
           printf("Error: Host memory allocation failed!\n");
           return -1;
        }
    }
    floatType *O_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
    if (O_h == NULL)
    {
        printf("Error: Host memory allocation failed!\n");
        return -1;
    }
    floatType *outputActivation_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
    if (outputActivation_h == NULL)
    {
        printf("Error: Host memory allocation failed!\n");
        return -1;
    }

    void* gradientsOut_d[numInputs] = {nullptr};
    for (auto i : gradInputIDs)
    {
        HANDLE_CUDA_ERROR( cudaMalloc((void**) &gradientsOut_d[i], sizeVec[i]) );
    }
    void* gradientsOut_h[numInputs] = {nullptr};
    for (auto i : gradInputIDs)
    {
        gradientsOut_h[i] = (floatType*) malloc(sizeVec[i]);
        if (gradientsOut_h[i] == NULL)
        {
           printf("Error: Host memory allocation failed!\n");
           return -1;
        }
    }

    /*******************
    * Initialize data
    *******************/

    memset(O_h, 0, sizeof(floatType) * elementsVec[numInputs]);
    for (int i = 0; i < numInputs; ++i)
    {
        for (size_t e = 0; e < elementsVec[i]; ++e)
            rawDataIn_h[i][e] = ((floatType) rand()) / RAND_MAX;
    }
    for (size_t e = 0; e < elementsVec[numInputs]; ++e)
        outputActivation_h[e] = (floatType) 1.0;

    for (int i = 0; i < numInputs; ++i)
    {
        HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[i], rawDataIn_h[i], sizeVec[i], cudaMemcpyHostToDevice) );
    }
    HANDLE_CUDA_ERROR( cudaMemcpy(outputActivation_d, outputActivation_h, sizeVec[numInputs], cudaMemcpyHostToDevice) );

    if(verbose)
        printf("Allocated GPU memory for data, initialized data, and created the library handle\n");

    /*************************
    * cuTensorNet
    *************************/

    cudaStream_t stream;
    HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );

    cutensornetHandle_t handle;
    HANDLE_ERROR( cutensornetCreate(&handle) );

Create the network descriptor and set gradient tensor IDs

Next, we create the network descriptor with the desired tensor modes, extents, and strides, as well as the data and compute types. We also set, on the network descriptor, the IDs of the input tensors whose gradients will be computed. Note that the created library context will be associated with the currently active GPU.

    /*******************************
    * Create Network Descriptor
    *******************************/

    int32_t* modesIn[numInputs];
    int32_t numModesIn[numInputs];
    int64_t* extentsIn[numInputs];
    int64_t* stridesIn[numInputs];

    for (int i = 0; i < numInputs; ++i)
    {
        modesIn[i] = modesVec[i].data();
        numModesIn[i] = modesVec[i].size();
        extentsIn[i] = extentVec[i].data();
        stridesIn[i] = NULL; // strides are optional; if no stride is provided, cuTensorNet assumes a generalized column-major data layout
    }

    // Set up tensor network
    cutensornetNetworkDescriptor_t descNet;
    HANDLE_ERROR( cutensornetCreateNetworkDescriptor(handle,
                        numInputs, numModesIn, extentsIn, stridesIn, modesIn, NULL,
                        modesVec[numInputs].size(), extentVec[numInputs].data(), /*stridesOut = */NULL, modesVec[numInputs].data(),
                        typeData, typeCompute,
                        &descNet) );

    /*******************************
    * Set input tensor IDs that require gradient
    *******************************/

    cutensornetTensorIDList_t tensorIDList {
        .numTensors = static_cast<int32_t>(gradInputIDs.size()),
        .data = gradInputIDs.data()
    };

    HANDLE_ERROR(cutensornetNetworkSetAttribute(handle,
                                                descNet,
                                                CUTENSORNET_NETWORK_INPUT_TENSORS_REQUIRE_GRAD,
                                                &tensorIDList,
                                                sizeof(cutensornetTensorIDList_t)));

    if(verbose)
        printf("Initialized the cuTensorNet library and created a tensor network descriptor\n");

Contraction order

In this example, we use a predetermined contraction path and set it on the optimizer info object via cutensornetContractionOptimizerInfoSetAttribute().

    /*******************************
    * Choose workspace limit based on available resources.
    *******************************/

    size_t freeMem, totalMem;
    HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) );
    uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
    if(verbose)
        printf("Workspace limit = %lu\n", workspaceLimit);

    /*******************************
    * Set contraction order
    *******************************/

    // Create contraction optimizer info
    cutensornetContractionOptimizerInfo_t optimizerInfo;
    HANDLE_ERROR( cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo) );

    // Set a predetermined contraction path: the flat vector encodes numContractions
    // (first, second) tensor-pair indices, i.e., (0,1), (0,4), (0,3), (0,2), (0,1)
    std::vector<int32_t> path{0,1,0,4,0,3,0,2,0,1};
    const auto numContractions = numInputs - 1;
    cutensornetContractionPath_t contPath;
    contPath.data = reinterpret_cast<cutensornetNodePair_t*>(const_cast<int32_t*>(path.data()));
    contPath.numContractions = numContractions;

    // Provide the user-specified contPath
    HANDLE_ERROR( cutensornetContractionOptimizerInfoSetAttribute(
                    handle,
                    optimizerInfo,
                    CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
                    &contPath,
                    sizeof(contPath)));
    int64_t numSlices = 1;

    if(verbose)
        printf("Set predetermined contraction path into cuTensorNet optimizer\n");
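If a good contraction path is not known in advance, the cuTensorNet pathfinder can be used instead of the SetAttribute call above. A minimal sketch of that alternative, reusing the handle, descNet, workspaceLimit, optimizerInfo, and numSlices variables already defined (not part of this sample):

    // Alternative (sketch): let the cuTensorNet pathfinder determine the contraction path
    cutensornetContractionOptimizerConfig_t optimizerConfig;
    HANDLE_ERROR( cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig) );

    // Find an optimized contraction path (and slicing, if needed) within the workspace limit;
    // the result is stored in optimizerInfo
    HANDLE_ERROR( cutensornetContractionOptimize(handle,
                                                 descNet,
                                                 optimizerConfig,
                                                 workspaceLimit,
                                                 optimizerInfo) );

    // Query how many slices the pathfinder decided to use
    HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(handle,
                    optimizerInfo,
                    CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
                    &numSlices,
                    sizeof(numSlices)) );

    HANDLE_ERROR( cutensornetDestroyContractionOptimizerConfig(optimizerConfig) );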

Create workspace descriptor and allocate workspace memory

Next, we create a workspace descriptor, compute the workspace sizes, and query the minimum workspace size needed to contract the network. To enable gradient computation, we need to provide a CACHE workspace that stores the intermediate tensors' data needed by the backward-propagation call. Thus, we query the sizes and allocate device memory for both kinds of workspaces (CUTENSORNET_WORKSPACE_SCRATCH and CUTENSORNET_WORKSPACE_CACHE) and set them in the workspace descriptor. The workspace descriptor will be provided to the contraction plan creation, contraction execution, and gradient computation APIs.

    /*******************************
    * Create workspace descriptor, allocate workspace, and set it.
    *******************************/

    cutensornetWorkspaceDescriptor_t workDesc;
    HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );

    // Set SCRATCH workspace, which will be used during each network contraction operation and is not needed afterwards
    int64_t requiredWorkspaceSizeScratch = 0;
    HANDLE_ERROR( cutensornetWorkspaceComputeContractionSizes(handle,
                                                            descNet,
                                                            optimizerInfo,
                                                            workDesc) );

    HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                    workDesc,
                                                    CUTENSORNET_WORKSIZE_PREF_MIN,
                                                    CUTENSORNET_MEMSPACE_DEVICE,
                                                    CUTENSORNET_WORKSPACE_SCRATCH,
                                                    &requiredWorkspaceSizeScratch) );

    void* workScratch = nullptr;
    HANDLE_CUDA_ERROR( cudaMalloc(&workScratch, requiredWorkspaceSizeScratch) );

    HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                                workDesc,
                                                CUTENSORNET_MEMSPACE_DEVICE,
                                                CUTENSORNET_WORKSPACE_SCRATCH,
                                                workScratch,
                                                requiredWorkspaceSizeScratch) );

    // Set CACHE workspace, which will be used across network contraction operations
    int64_t requiredWorkspaceSizeCache = 0;
    HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                    workDesc,
                                                    CUTENSORNET_WORKSIZE_PREF_MIN,
                                                    CUTENSORNET_MEMSPACE_DEVICE,
                                                    CUTENSORNET_WORKSPACE_CACHE,
                                                    &requiredWorkspaceSizeCache) );

    void* workCache = nullptr;
    HANDLE_CUDA_ERROR( cudaMalloc(&workCache, requiredWorkspaceSizeCache) );

    HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                                workDesc,
                                                CUTENSORNET_MEMSPACE_DEVICE,
                                                CUTENSORNET_WORKSPACE_CACHE,
                                                workCache,
                                                requiredWorkspaceSizeCache) );

    if(verbose)
        printf("Allocated and set up the GPU workspace\n");
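The minimum workspace sizes are used above. When more device memory is available, querying the recommended size instead generally allows better-performing kernels; a sketch for the SCRATCH workspace (the CACHE query is analogous):

    // Sketch: query the recommended (rather than minimum) SCRATCH workspace size
    int64_t recommendedWorkspaceSizeScratch = 0;
    HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                    workDesc,
                                                    CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
                                                    CUTENSORNET_MEMSPACE_DEVICE,
                                                    CUTENSORNET_WORKSPACE_SCRATCH,
                                                    &recommendedWorkspaceSizeScratch) );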

Contraction plan and auto-tune

We create a tensor network contraction plan holding all pairwise tensor contraction plans for cuTENSOR. Optionally, we can auto-tune the plan such that cuTENSOR selects the best kernel for each pairwise contraction. This contraction plan can be reused for many (possibly different) data inputs, avoiding the cost of initializing this plan redundantly.

    /*******************************
    * Initialize the pairwise contraction plan (for cuTENSOR).
    *******************************/

    cutensornetContractionPlan_t plan;
    HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
                                                    descNet,
                                                    optimizerInfo,
                                                    workDesc,
                                                    &plan) );

    /*******************************
    * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel
    *           for each pairwise tensor contraction.
    *******************************/
    cutensornetContractionAutotunePreference_t autotunePref;
    HANDLE_ERROR( cutensornetCreateContractionAutotunePreference(handle,
                                                        &autotunePref) );

    const int numAutotuningIterations = 5; // may be 0
    HANDLE_ERROR( cutensornetContractionAutotunePreferenceSetAttribute(
                            handle,
                            autotunePref,
                            CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
                            &numAutotuningIterations,
                            sizeof(numAutotuningIterations)) );

    // Modify the plan again to find the best pair-wise contractions
    HANDLE_ERROR( cutensornetContractionAutotune(handle,
                                                    plan,
                                                    rawDataIn_d,
                                                    O_d,
                                                    workDesc,
                                                    autotunePref,
                                                    stream) );

    HANDLE_ERROR( cutensornetDestroyContractionAutotunePreference(autotunePref) );

    if(verbose)
        printf("Created a contraction plan for cuTensorNet and optionally auto-tuned it\n");

Tensor network contraction execution and gradient computation

Finally, we contract the tensor network as many times as needed, possibly with different input data each time. After contracting the network (which stores intermediate tensors' data in the CACHE memory), we compute the required gradients via backward propagation. We also purge the CACHE memory at the end of each iteration so that subsequent calls can reuse it.

    /**********************
    * Execute the tensor network contraction
    **********************/

    // Create a cutensornetSliceGroup_t object from a range of slice IDs
    cutensornetSliceGroup_t sliceGroup{};
    HANDLE_ERROR( cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup) );

    GPUTimer timer {stream};
    double minTimeCUTENSORNET = 1e100;
    const int numRuns = 3; // number of repeats to get stable performance results
    for (int i = 0; i < numRuns; ++i)
    {
        HANDLE_CUDA_ERROR( cudaMemcpy(O_d, O_h, sizeVec[numInputs], cudaMemcpyHostToDevice) ); // restore the output tensor on GPU
        HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );

        /*
        * Contract all slices of the tensor network
        */
        timer.start();

        int32_t accumulateOutput = 0; // output tensor data will be overwritten
        HANDLE_ERROR( cutensornetContractSlices(handle,
                       plan,
                       rawDataIn_d,
                       O_d,
                       accumulateOutput,
                       workDesc,
                       sliceGroup, // alternatively, NULL can also be used to contract over all slices instead of specifying a sliceGroup object
                       stream) );

        HANDLE_ERROR( cutensornetComputeGradientsBackward(handle,
                        plan,
                        rawDataIn_d,
                        outputActivation_d,
                        gradientsOut_d,
                        accumulateOutput,
                        workDesc,
                        stream) );

        // Purge the cache to make room for the next run to use cache memory
        HANDLE_ERROR( cutensornetWorkspacePurgeCache(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE) );

        // Synchronize and measure best timing
        auto time = timer.seconds();
        minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
    }

    if(verbose)
        printf("Contracted the tensor network and computed gradients\n");

    HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) ); // restore the output tensor on Host

    for (auto i : gradInputIDs)
    {
        HANDLE_CUDA_ERROR( cudaMemcpy(gradientsOut_h[i], gradientsOut_d[i], sizeVec[i], cudaMemcpyDeviceToHost) );
    }

    /*************************/

    if(verbose) {
        printf("Tensor network contraction and back-propagation time (ms): %.3f\n", minTimeCUTENSORNET * 1000.f);
    }
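As an optional sanity check (not part of the sample), a few gradient entries can be compared against finite differences of the scalar product of the activation with the output. The sketch below reuses the plan, workspace, and buffers created above and perturbs a single element of tensor A; with single-precision data and random inputs, the agreement is only approximate.

    // Sketch: finite-difference check of one gradient entry (hypothetical, not in the sample)
    auto lossValue = [&]() -> double {
        // Contract the network and compute <outputActivation, O> on the host
        const int32_t accumulate = 0;
        HANDLE_ERROR( cutensornetContractSlices(handle, plan, rawDataIn_d, O_d,
                                                accumulate, workDesc, sliceGroup, stream) );
        HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream) );
        HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) );
        double loss = 0.0;
        for (size_t e = 0; e < elementsVec[numInputs]; ++e)
            loss += (double) outputActivation_h[e] * (double) O_h[e];
        return loss;
    };

    const int tensorId = 0;      // check d<activation,O>/dA at one element of tensor A
    const size_t elemId = 0;
    const floatType step = 1e-2f;

    const double loss0 = lossValue();
    rawDataIn_h[tensorId][elemId] += step; // perturb one element and re-upload
    HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[tensorId], rawDataIn_h[tensorId],
                                  sizeVec[tensorId], cudaMemcpyHostToDevice) );
    const double loss1 = lossValue();

    if(verbose)
        printf("Gradient check: finite difference = %g, backward propagation = %g\n",
               (loss1 - loss0) / step,
               (double) static_cast<floatType*>(gradientsOut_h[tensorId])[elemId]);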

Free resources

After the computation, we need to free up all resources.

    /***************
    * Free resources
    ****************/

    // Free cuTensorNet resources
    HANDLE_ERROR( cutensornetDestroySliceGroup(sliceGroup) );
    HANDLE_ERROR( cutensornetDestroyContractionPlan(plan) );
    HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
    HANDLE_ERROR( cutensornetDestroyContractionOptimizerInfo(optimizerInfo) );
    HANDLE_ERROR( cutensornetDestroyNetworkDescriptor(descNet) );
    HANDLE_ERROR( cutensornetDestroy(handle) );

    // Destroy the CUDA stream
    HANDLE_CUDA_ERROR( cudaStreamDestroy(stream) );

    // Free Host memory resources
    if (O_h) free(O_h);
    if (outputActivation_h) free(outputActivation_h);
    for (int i = 0; i < numInputs; ++i)
    {
        if (rawDataIn_h[i])
            free(rawDataIn_h[i]);
        if (gradientsOut_h[i])
            free(gradientsOut_h[i]);
    }
    // Free GPU memory resources
    if (workScratch) cudaFree(workScratch);
    if (workCache) cudaFree(workCache);
    if (O_d) cudaFree(O_d);
    if (outputActivation_d) cudaFree(outputActivation_d);
    for (int i = 0; i < numInputs; ++i)
    {
        if (rawDataIn_d[i])
            cudaFree(rawDataIn_d[i]);
        if (gradientsOut_d[i])
            cudaFree(gradientsOut_d[i]);
    }
    if(verbose)
        printf("Freed resources and exited\n");

    return 0;
}