Computing gradients via backward propagation

The following code example illustrates how to compute the gradients of a tensor network with respect to user-selected input tensors via backward propagation. The full sample code can be found in the NVIDIA/cuQuantum repository.

Headers and data types

#include <stdlib.h>
#include <stdio.h>

#include <unordered_map>
#include <vector>
#include <cassert>

#include <cuda_runtime.h>
#include <cutensornet.h>


#define HANDLE_ERROR(x)                                           \
{ const auto err = x;                                             \
  if( err != CUTENSORNET_STATUS_SUCCESS )                         \
  { printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
    fflush(stdout);                                               \
  }                                                               \
};

#define HANDLE_CUDA_ERROR(x)                                      \
{ const auto err = x;                                             \
  if( err != cudaSuccess )                                        \
  { printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
    fflush(stdout);                                               \
  }                                                               \
};


struct GPUTimer
{
    GPUTimer(cudaStream_t stream): stream_(stream)
    {
        HANDLE_CUDA_ERROR(cudaEventCreate(&start_));
        HANDLE_CUDA_ERROR(cudaEventCreate(&stop_));
    }

    ~GPUTimer()
    {
        HANDLE_CUDA_ERROR(cudaEventDestroy(start_));
        HANDLE_CUDA_ERROR(cudaEventDestroy(stop_));
    }

    void start()
    {
        HANDLE_CUDA_ERROR(cudaEventRecord(start_, stream_));
    }

    float seconds()
    {
        HANDLE_CUDA_ERROR(cudaEventRecord(stop_, stream_));
        HANDLE_CUDA_ERROR(cudaEventSynchronize(stop_));
        float time;
        HANDLE_CUDA_ERROR(cudaEventElapsedTime(&time, start_, stop_));
        return time * 1e-3;
    }

    private:
    cudaEvent_t start_, stop_;
    cudaStream_t stream_;
};


int main()
{
    static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

    bool verbose = true;

    // Check cuTensorNet version
    const size_t cuTensornetVersion = cutensornetGetVersion();
    if(verbose)
        printf("cuTensorNet version: %ld\n", cuTensornetVersion);

    // Set GPU device
    int numDevices {0};
    HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) );
    const int deviceId = 0;
    HANDLE_CUDA_ERROR( cudaSetDevice(deviceId) );
    cudaDeviceProp prop;
    HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );

    if(verbose) {
        printf("===== device info ======\n");
        printf("GPU-local-id:%d\n", deviceId);
        printf("GPU-name:%s\n", prop.name);
        printf("GPU-clock:%d\n", prop.clockRate);
        printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
        printf("GPU-nSM:%d\n", prop.multiProcessorCount);
        printf("GPU-major:%d\n", prop.major);
        printf("GPU-minor:%d\n", prop.minor);
        printf("========================\n");
    }

    typedef float floatType;
    cudaDataType_t typeData = CUDA_R_32F;
    cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;

    if(verbose)
        printf("Included headers and defined data types\n");

Define tensor network and tensor sizes

Next, we define the structure of the tensor network (i.e., the modes of the tensors, their extents, and their connectivity), and specify the input tensor IDs whose gradients will be computed.

    /**********************
    * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m}
    * We will execute the contraction and compute the gradients of input tensors A, B, C
    **********************/

    constexpr int32_t numInputs = 6;
    std::vector<int32_t> gradInputIDs = {0, 1, 2};

    // Create vectors of tensor modes
    std::vector<std::vector<int32_t>> modesVec {
        {'a','b','c','d'},
        {'b','c','d','e'},
        {'e','g','h'},
        {'g','h','i','j'},
        {'i','j','k','l'},
        {'k','l','m'},
        {'a','m'}
    };

    // Set mode extents
    int64_t sameExtent = 36; // setting same extent for simplicity. In principle extents can differ.
    std::unordered_map<int32_t, int64_t> extent;
    for (auto &vec: modesVec)
    {
        for (auto &mode: vec)
        {
            extent[mode] = sameExtent;
        }
    }

    // Create a vector of extents for each tensor
    std::vector<std::vector<int64_t>> extentVec;
    extentVec.resize(numInputs+1); // hold inputs + output tensors
    for (int i = 0; i < numInputs+1; ++i)
    {
        for (auto mode : modesVec[i])
            extentVec[i].push_back(extent[mode]);
    }

    if(verbose)
        printf("Defined tensor network, modes, and extents\n");
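
For the extents chosen above (all equal to 36), the tensor storage is modest. As a rough count (not printed by the sample itself):

    36^4 = 1,679,616 elements for each of the 4-mode tensors A, B, D, E
    36^3 =    46,656 elements for C and F
    36^2 =     1,296 elements for the output O

i.e., about 6.8 million FP32 elements (roughly 26 MiB) for the inputs plus the output; the gradient buffers for A, B, C and the output activation tensor add roughly another 13 MiB.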

Allocate memory, initialize data, initialize cuTensorNet handle

Next, we allocate memory for the tensor network operands and initialize them to random values. We also allocate memory for the gradients of the selected input tensors, as well as for the output activation (adjoint) tensor, which we initialize to ones. Then, we initialize the cuTensorNet library handle via cutensornetCreate().

    /**********************
    * Allocating data
    **********************/

    std::vector<size_t> elementsVec;
    elementsVec.resize(numInputs+1); // hold inputs + output tensors
    for (int i = 0; i < numInputs+1; ++i)
    {
        elementsVec[i] = 1;
        for (auto mode : modesVec[i])
            elementsVec[i] *= extent[mode];
    }

    size_t totalSize = 0;
    std::vector<size_t> sizeVec;
    sizeVec.resize(numInputs+1); // hold inputs + output tensors
    for (int i = 0; i < numInputs+1; ++i)
    {
        sizeVec[i] = sizeof(floatType) * elementsVec[i];
        totalSize += sizeVec[i];
    }
    if(verbose)
        printf("Total GPU memory used for tensor storage: %.2f GiB\n",
                (totalSize) / 1024. /1024. / 1024);

    void* rawDataIn_d[numInputs];
    void* O_d;
    void* outputActivation_d;
    for (int i = 0; i < numInputs; ++i)
    {
        HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[i], sizeVec[i]) );
    }
    HANDLE_CUDA_ERROR( cudaMalloc((void**) &O_d, sizeVec[numInputs]));
    HANDLE_CUDA_ERROR( cudaMalloc((void**) &outputActivation_d, sizeVec[numInputs]));

    floatType* rawDataIn_h[numInputs];
    for (int i = 0; i < numInputs; ++i)
    {
        rawDataIn_h[i] = (floatType*) malloc(sizeVec[i]);
        if (rawDataIn_h[i] == NULL)
        {
           printf("Error: Host memory allocation failed!\n");
           return -1;
        }
    }
    floatType *O_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
    if (O_h == NULL)
    {
        printf("Error: Host memory allocation failed!\n");
        return -1;
    }
    floatType *outputActivation_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
    if (outputActivation_h == NULL)
    {
        printf("Error: Host memory allocation failed!\n");
        return -1;
    }

    void* gradientsOut_d[numInputs] = {nullptr};
    for (auto i : gradInputIDs)
    {
        HANDLE_CUDA_ERROR( cudaMalloc((void**) &gradientsOut_d[i], sizeVec[i]) );
    }
    void* gradientsOut_h[numInputs] = {nullptr};
    for (auto i : gradInputIDs)
    {
        gradientsOut_h[i] = (floatType*) malloc(sizeVec[i]);
        if (gradientsOut_h[i] == NULL)
        {
           printf("Error: Host memory allocation failed!\n");
           return -1;
        }
    }

    /*******************
    * Initialize data
    *******************/

    memset(O_h, 0, sizeof(floatType) * elementsVec[numInputs]);
    for (int i = 0; i < numInputs; ++i)
    {
        for (size_t e = 0; e < elementsVec[i]; ++e)
            rawDataIn_h[i][e] = ((floatType) rand()) / RAND_MAX;
    }
    for (size_t e = 0; e < elementsVec[numInputs]; ++e)
        outputActivation_h[e] = (floatType) 1.0;

    for (int i = 0; i < numInputs; ++i)
    {
        HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[i], rawDataIn_h[i], sizeVec[i], cudaMemcpyHostToDevice) );
    }
    HANDLE_CUDA_ERROR( cudaMemcpy(outputActivation_d, outputActivation_h, sizeVec[numInputs], cudaMemcpyHostToDevice) );

    if(verbose)
        printf("Allocated GPU memory for data, initialized data, and created the library handle\n");

    /*************************
    * cuTensorNet
    *************************/

    cudaStream_t stream;
    HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );

    cutensornetHandle_t handle;
    HANDLE_ERROR( cutensornetCreate(&handle) );

Create the network descriptor and set gradient tensor IDs

Next, we create the network descriptor with the desired tensor modes, extents, and strides, as well as the data and compute types. We also set, on the network descriptor, the IDs of the input tensors whose gradients will be computed. Note that the created library context is associated with the currently active GPU.

    /*******************************
    * Create Network Descriptor
    *******************************/

    int32_t* modesIn[numInputs];
    int32_t numModesIn[numInputs];
    int64_t* extentsIn[numInputs];
    int64_t* stridesIn[numInputs];

    for (int i = 0; i < numInputs; ++i)
    {
        modesIn[i] = modesVec[i].data();
        numModesIn[i] = modesVec[i].size();
        extentsIn[i] = extentVec[i].data();
        stridesIn[i] = NULL; // strides are optional; if no stride is provided, cuTensorNet assumes a generalized column-major data layout
    }

    // Set up tensor network
    cutensornetNetworkDescriptor_t descNet;
    HANDLE_ERROR( cutensornetCreateNetworkDescriptor(handle,
                        numInputs, numModesIn, extentsIn, stridesIn, modesIn, NULL,
                        modesVec[numInputs].size(), extentVec[numInputs].data(), /*stridesOut = */NULL, modesVec[numInputs].data(),
                        typeData, typeCompute,
                        &descNet) );

    /*******************************
    * Set input tensor IDs that require gradients
    *******************************/

    cutensornetTensorIDList_t tensorIDList {
        .numTensors = static_cast<int32_t>(gradInputIDs.size()),
        .data = gradInputIDs.data()
    };

    HANDLE_ERROR(cutensornetNetworkSetAttribute(handle,
                                                descNet,
                                                CUTENSORNET_NETWORK_INPUT_TENSORS_REQUIRE_GRAD,
                                                &tensorIDList,
                                                sizeof(cutensornetTensorIDList_t)));

    if(verbose)
        printf("Initialized the cuTensorNet library and created a tensor network descriptor\n");

Contraction order

In this example, we use a predetermined contraction path and set it on the optimizer info object via cutensornetContractionOptimizerInfoSetAttribute().
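
The flat path vector used below packs numInputs - 1 node pairs contiguously: {0,1, 0,4, 0,3, 0,2, 0,1} encodes the pairwise contractions (0,1), (0,4), (0,3), (0,2), (0,1), where each pair indexes the current list of tensors (the two contracted tensors are removed and the resulting intermediate is appended, following the usual pairwise-contraction path convention). As a minimal sketch, the same path could also be built from an explicit pair list instead of reinterpreting a flat int32_t array; pairNodes and contPathExplicit are hypothetical names, not part of the sample:

    // Sketch: build the same contraction path from an explicit list of node pairs
    std::vector<cutensornetNodePair_t> pairNodes { {0,1}, {0,4}, {0,3}, {0,2}, {0,1} };
    cutensornetContractionPath_t contPathExplicit;
    contPathExplicit.numContractions = static_cast<int32_t>(pairNodes.size());
    contPathExplicit.data = pairNodes.data();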

    /*******************************
    * Choose workspace limit based on available resources.
    *******************************/

    size_t freeMem, totalMem;
    HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) );
    uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
    if(verbose)
        printf("Workspace limit = %lu\n", workspaceLimit);

    /*******************************
    * Set contraction order
    *******************************/

    // Create contraction optimizer info
    cutensornetContractionOptimizerInfo_t optimizerInfo;
    HANDLE_ERROR( cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo) );

    // Set a predetermined contraction path
    std::vector<int32_t> path{0,1,0,4,0,3,0,2,0,1};
    const auto numContractions = numInputs - 1;
    cutensornetContractionPath_t contPath;
    contPath.data = reinterpret_cast<cutensornetNodePair_t*>(const_cast<int32_t*>(path.data()));
    contPath.numContractions = numContractions;

    // Provide the user-specified contraction path to the optimizer info object
    HANDLE_ERROR( cutensornetContractionOptimizerInfoSetAttribute(
                    handle,
                    optimizerInfo,
                    CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
                    &contPath,
                    sizeof(contPath)));
    int64_t numSlices = 1; // no slicing is used here, so the network forms a single slice

    if(verbose)
        printf("Set predetermined contraction path into cuTensorNet optimizer\n");

Create workspace descriptor and allocate workspace memory

Next, we create a workspace descriptor, compute the workspace sizes, and query the minimum workspace size needed to contract the network. To enable gradient computation, we must also provide a CACHE workspace, which stores the intermediate tensors' data that the subsequent backward-propagation call consumes. Thus, we query the sizes of and allocate device memory for both workspace kinds (CUTENSORNET_WORKSPACE_SCRATCH and CUTENSORNET_WORKSPACE_CACHE) and set them in the workspace descriptor. The workspace descriptor is then provided to the contraction-plan creation, contraction execution, and gradient computation APIs.

    /*******************************
    * Create workspace descriptor, allocate workspace, and set it.
    *******************************/

    cutensornetWorkspaceDescriptor_t workDesc;
    HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );

    // Set the SCRATCH workspace, which is used during each network contraction operation and not needed afterwards
    int64_t requiredWorkspaceSizeScratch = 0;
    HANDLE_ERROR( cutensornetWorkspaceComputeContractionSizes(handle,
                                                            descNet,
                                                            optimizerInfo,
                                                            workDesc) );

    HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                    workDesc,
                                                    CUTENSORNET_WORKSIZE_PREF_MIN,
                                                    CUTENSORNET_MEMSPACE_DEVICE,
                                                    CUTENSORNET_WORKSPACE_SCRATCH,
                                                    &requiredWorkspaceSizeScratch) );

    void* workScratch = nullptr;
    HANDLE_CUDA_ERROR( cudaMalloc(&workScratch, requiredWorkspaceSizeScratch) );

    HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                                workDesc,
                                                CUTENSORNET_MEMSPACE_DEVICE,
                                                CUTENSORNET_WORKSPACE_SCRATCH,
                                                workScratch,
                                                requiredWorkspaceSizeScratch) );

    // Set the CACHE workspace, which persists across network contraction operations
    int64_t requiredWorkspaceSizeCache = 0;
    HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                    workDesc,
                                                    CUTENSORNET_WORKSIZE_PREF_MIN,
                                                    CUTENSORNET_MEMSPACE_DEVICE,
                                                    CUTENSORNET_WORKSPACE_CACHE,
                                                    &requiredWorkspaceSizeCache) );

    void* workCache = nullptr;
    HANDLE_CUDA_ERROR( cudaMalloc(&workCache, requiredWorkspaceSizeCache) );

    HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                                workDesc,
                                                CUTENSORNET_MEMSPACE_DEVICE,
                                                CUTENSORNET_WORKSPACE_CACHE,
                                                workCache,
                                                requiredWorkspaceSizeCache) );

    if(verbose)
        printf("Allocated and set up the GPU workspace\n");

Contraction plan and auto-tune

We create a tensor network contraction plan holding all pairwise tensor contraction plans for cuTENSOR. Optionally, we can auto-tune the plan so that cuTENSOR selects the best kernel for each pairwise contraction. The contraction plan can then be reused for many (possibly different) data inputs, avoiding the cost of re-creating it each time.

    /*******************************
    * Initialize the pairwise contraction plan (for cuTENSOR).
    *******************************/

    cutensornetContractionPlan_t plan;
    HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
                                                    descNet,
                                                    optimizerInfo,
                                                    workDesc,
                                                    &plan) );

    /*******************************
    * Optional: Auto-tune the contraction plan to pick the fastest cuTENSOR kernel
    *           for each pairwise tensor contraction.
    *******************************/
    cutensornetContractionAutotunePreference_t autotunePref;
    HANDLE_ERROR( cutensornetCreateContractionAutotunePreference(handle,
                                                        &autotunePref) );

    const int numAutotuningIterations = 5; // may be 0
    HANDLE_ERROR( cutensornetContractionAutotunePreferenceSetAttribute(
                            handle,
                            autotunePref,
                            CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
                            &numAutotuningIterations,
                            sizeof(numAutotuningIterations)) );

    // Auto-tune the plan to find the best kernel for each pairwise contraction
    HANDLE_ERROR( cutensornetContractionAutotune(handle,
                                                    plan,
                                                    rawDataIn_d,
                                                    O_d,
                                                    workDesc,
                                                    autotunePref,
                                                    stream) );

    HANDLE_ERROR( cutensornetDestroyContractionAutotunePreference(autotunePref) );

    if(verbose)
        printf("Created a contraction plan for cuTensorNet and optionally auto-tuned it\n");

Tensor network contraction execution and gradient computation

Finally, we contract the tensor network as many times as needed, possibly with different input data each time. After contracting the network (which stores the intermediate tensors' data in the CACHE memory), we compute the requested gradients via backward propagation. We also purge the CACHE memory at the end of each iteration so that subsequent iterations can reuse it.
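
Since the output activation tensor holds all ones, the gradients returned by cutensornetComputeGradientsBackward() are, for each selected input tensor T (here A, B, or C), the derivatives of the scalar L = \sum_{a,m} O_{a,m} obtained via the chain rule:

    \frac{\partial L}{\partial T} \;=\; \sum_{a,m} \frac{\partial O_{a,m}}{\partial T}\,\bar{O}_{a,m}, \qquad \bar{O}_{a,m} = 1,

where \bar{O} denotes the activation (adjoint) tensor. Seeding the activation with a different tensor would instead yield the gradients of the weighted sum \sum_{a,m} \bar{O}_{a,m} O_{a,m}.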

    /**********************
    * Execute the tensor network contraction
    **********************/

    // Create a cutensornetSliceGroup_t object from a range of slice IDs
    cutensornetSliceGroup_t sliceGroup{};
    HANDLE_ERROR( cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup) );

    GPUTimer timer {stream};
    double minTimeCUTENSORNET = 1e100;
    const int numRuns = 3; // number of repeats to get stable performance results
    for (int i = 0; i < numRuns; ++i)
    {
        HANDLE_CUDA_ERROR( cudaMemcpy(O_d, O_h, sizeVec[numInputs], cudaMemcpyHostToDevice) ); // reset the output tensor on the GPU
        HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );

        /*
        * Contract all slices of the tensor network
        */
        timer.start();

        int32_t accumulateOutput = 0; // output tensor data will be overwritten
        HANDLE_ERROR( cutensornetContractSlices(handle,
                       plan,
                       rawDataIn_d,
                       O_d,
                       accumulateOutput,
                       workDesc,
                       sliceGroup, // alternatively, NULL can be used to contract over all slices instead of specifying a sliceGroup object
                       stream) );

        HANDLE_ERROR( cutensornetComputeGradientsBackward(handle,
                        plan,
                        rawDataIn_d,
                        outputActivation_d,
                        gradientsOut_d,
                        accumulateOutput,
                        workDesc,
                        stream) );

        // Purge the cache to make room for the next run to use cache memory
        HANDLE_ERROR( cutensornetWorkspacePurgeCache(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE) );

        // Synchronize and keep the best (minimum) timing
        auto time = timer.seconds();
        minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
    }

    if(verbose)
        printf("Contracted the tensor network and computed gradients\n");

    HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) ); // copy the output tensor back to the Host

    for (auto i : gradInputIDs)
    {
        HANDLE_CUDA_ERROR( cudaMemcpy(gradientsOut_h[i], gradientsOut_d[i], sizeVec[i], cudaMemcpyDeviceToHost) );
    }

    /*************************/

    if(verbose) {
        printf("Tensor network contraction and back-propagation time (ms) = %.3f\n", minTimeCUTENSORNET * 1000.f);
    }
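
A simple way to sanity-check the computed gradients (not part of the sample) is a central finite difference of the sum of all output elements with respect to a single element of tensor A: because each input tensor appears exactly once in the network, the output is linear in every individual element of A, so even a large step gives an essentially exact difference quotient (up to FP32 rounding). The sketch below could be appended right before the cleanup section; it reuses the sample's variables, and contractAndSumOutput, probeElement, and step are hypothetical names:

    // Illustrative gradient check (sketch): compare one analytic gradient entry of tensor A
    // against a central finite difference of the sum of all output elements
    auto contractAndSumOutput = [&]() -> double {
        HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[0], rawDataIn_h[0], sizeVec[0], cudaMemcpyHostToDevice) );
        HANDLE_ERROR( cutensornetContractSlices(handle, plan, rawDataIn_d, O_d,
                                                /*accumulateOutput=*/0, workDesc, sliceGroup, stream) );
        HANDLE_ERROR( cutensornetWorkspacePurgeCache(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE) );
        HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream) );
        HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) );
        double sum = 0.0;
        for (size_t e = 0; e < elementsVec[numInputs]; ++e) sum += O_h[e];
        return sum;
    };
    const size_t probeElement = 0; // element of tensor A to probe
    const floatType step = 0.5;    // a large step is fine: the output is linear in each element of A
    const floatType saved = rawDataIn_h[0][probeElement];
    rawDataIn_h[0][probeElement] = saved + step;
    const double lPlus = contractAndSumOutput();
    rawDataIn_h[0][probeElement] = saved - step;
    const double lMinus = contractAndSumOutput();
    rawDataIn_h[0][probeElement] = saved; // restore the original data on the host and device
    HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[0], rawDataIn_h[0], sizeVec[0], cudaMemcpyHostToDevice) );
    printf("Gradient check for A[%zu]: analytic = %.6e, finite-difference = %.6e\n",
           probeElement, ((floatType*)gradientsOut_h[0])[probeElement],
           (lPlus - lMinus) / (2.0 * step));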

Free resources

After the computation, we need to free up all resources.

    /***************
    * Free resources
    ****************/

    // Free cuTensorNet resources
    HANDLE_ERROR( cutensornetDestroySliceGroup(sliceGroup) );
    HANDLE_ERROR( cutensornetDestroyContractionPlan(plan) );
    HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
    HANDLE_ERROR( cutensornetDestroyContractionOptimizerInfo(optimizerInfo) );
    HANDLE_ERROR( cutensornetDestroyNetworkDescriptor(descNet) );
    HANDLE_ERROR( cutensornetDestroy(handle) );

    // Free Host memory resources
    if (O_h) free(O_h);
    if (outputActivation_h) free(outputActivation_h);
    for (int i = 0; i < numInputs; ++i)
    {
        if (rawDataIn_h[i])
            free(rawDataIn_h[i]);
        if (gradientsOut_h[i])
            free(gradientsOut_h[i]);
    }
    // Free GPU memory resources
    if (workScratch) cudaFree(workScratch);
    if (workCache) cudaFree(workCache);
    if (O_d) cudaFree(O_d);
    if (outputActivation_d) cudaFree(outputActivation_d);
    for (int i = 0; i < numInputs; ++i)
    {
        if (rawDataIn_d[i])
            cudaFree(rawDataIn_d[i]);
        if (gradientsOut_d[i])
            cudaFree(gradientsOut_d[i]);
    }
    if(verbose)
        printf("Freed resources and exited\n");

    return 0;
}