Caching/Reusing Constant Intermediate Tensors

The following code example illustrates how to activate caching of constant intermediate tensors to greatly accelerate repeated execution of a tensor network contraction in which only some of the input tensors change their values in each iteration. The full code can be found in the NVIDIA/cuQuantum repository.

Headers and data types

#include <stdlib.h>
#include <stdio.h>

#include <unordered_map>
#include <vector>
#include <cassert>

#include <cuda_runtime.h>
#include <cutensornet.h>


#define HANDLE_ERROR(x)                                           \
{ const auto err = x;                                             \
  if( err != CUTENSORNET_STATUS_SUCCESS )                         \
  { printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
    fflush(stdout);                                               \
  }                                                               \
};

#define HANDLE_CUDA_ERROR(x)                                      \
{ const auto err = x;                                             \
  if( err != cudaSuccess )                                        \
  { printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
    fflush(stdout);                                               \
  }                                                               \
};


struct GPUTimer
{
    GPUTimer(cudaStream_t stream): stream_(stream)
    {
        HANDLE_CUDA_ERROR(cudaEventCreate(&start_));
        HANDLE_CUDA_ERROR(cudaEventCreate(&stop_));
    }

    ~GPUTimer()
    {
        HANDLE_CUDA_ERROR(cudaEventDestroy(start_));
        HANDLE_CUDA_ERROR(cudaEventDestroy(stop_));
    }

    void start()
    {
        HANDLE_CUDA_ERROR(cudaEventRecord(start_, stream_));
    }

    float seconds()
    {
        HANDLE_CUDA_ERROR(cudaEventRecord(stop_, stream_));
        HANDLE_CUDA_ERROR(cudaEventSynchronize(stop_));
        float time;
        HANDLE_CUDA_ERROR(cudaEventElapsedTime(&time, start_, stop_));
        return time * 1e-3;
    }

    private:
    cudaEvent_t start_, stop_;
    cudaStream_t stream_;
};


int main()
{
    static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

    bool verbose = true;

    // Check cuTensorNet version
    const size_t cuTensornetVersion = cutensornetGetVersion();
    if(verbose)
        printf("cuTensorNet version: %ld\n", cuTensornetVersion);

    // Set GPU device
    int numDevices {0};
    HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) );
    const int deviceId = 0;
    HANDLE_CUDA_ERROR( cudaSetDevice(deviceId) );
    cudaDeviceProp prop;
    HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );

    if(verbose) {
        printf("===== device info ======\n");
        printf("GPU-local-id:%d\n", deviceId);
        printf("GPU-name:%s\n", prop.name);
        printf("GPU-clock:%d\n", prop.clockRate);
        printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
        printf("GPU-nSM:%d\n", prop.multiProcessorCount);
        printf("GPU-major:%d\n", prop.major);
        printf("GPU-minor:%d\n", prop.minor);
        printf("========================\n");
    }

    typedef float floatType;
    cudaDataType_t typeData = CUDA_R_32F;
    cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;

    if(verbose)
        printf("Included headers and defined data types\n");

Define tensor network and tensor sizes

Next, we define the structure of the tensor network (i.e., the modes of the tensors, their extents, and their connectivity).

    /**********************
    * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,f,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m}
    * We will execute the contraction a few times, assuming all input tensors are constant except F.
    **********************/

    constexpr int32_t numInputs = 6;

    // Create vectors of tensor modes
    std::vector<std::vector<int32_t>> modesVec {
        {'a','b','c','d'},
        {'b','c','d','e'},
        {'e','f','g','h'},
        {'g','h','i','j'},
        {'i','j','k','l'},
        {'k','l','m'},
        {'a','m'}
    };

    // Set mode extents
    int64_t sameExtent = 36; // use the same extent for simplicity; in principle, extents can differ
    std::unordered_map<int32_t, int64_t> extent;
    for (auto &vec: modesVec)
    {
        for (auto &mode: vec)
        {
            extent[mode] = sameExtent;
        }
    }

    // Create a vector of extents for each tensor
    std::vector<std::vector<int64_t>> extentVec;
    extentVec.resize(numInputs+1); // hold inputs + output tensors
    for (int i = 0; i < numInputs+1; ++i)
    {
        for (auto mode : modesVec[i])
            extentVec[i].push_back(extent[mode]);
    }

    if(verbose)
        printf("Defined tensor network, modes, and extents\n");

Allocate memory, initialize data, initialize cuTensorNet handle

Next, we allocate memory for the tensor network operands and initialize them to random values. Then, we initialize the cuTensorNet library via cutensornetCreate().

    /**********************
    * Allocating data
    **********************/

    std::vector<size_t> elementsVec;
    elementsVec.resize(numInputs+1); // hold inputs + output tensors
    for (int i = 0; i < numInputs+1; ++i)
    {
        elementsVec[i] = 1;
        for (auto mode : modesVec[i])
            elementsVec[i] *= extent[mode];
    }

    size_t totalSize = 0;
    std::vector<size_t> sizeVec;
    sizeVec.resize(numInputs+1); // hold inputs + output tensors
    for (int i = 0; i < numInputs+1; ++i)
    {
        sizeVec[i] = sizeof(floatType) * elementsVec[i];
        totalSize += sizeVec[i];
    }
    if(verbose)
        printf("Total GPU memory used for tensor storage: %.2f GiB\n",
                (totalSize) / 1024. / 1024. / 1024);

    void* rawDataIn_d[numInputs];
    void* O_d;
    for (int i = 0; i < numInputs; ++i)
    {
        HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[i], sizeVec[i]) );
    }
    HANDLE_CUDA_ERROR( cudaMalloc((void**) &O_d, sizeVec[numInputs]));

    floatType* rawDataIn_h[numInputs];
    for (int i = 0; i < numInputs; ++i)
    {
        rawDataIn_h[i] = (floatType*) malloc(sizeof(floatType) * elementsVec[i]);
        if (rawDataIn_h[i] == NULL)
        {
           printf("Error: Host memory allocation failed!\n");
           return -1;
        }
    }
    floatType *O_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
    if (O_h == NULL)
    {
        printf("Error: Host memory allocation failed!\n");
        return -1;
    }

    /*******************
    * Initialize data
    *******************/

    memset(O_h, 0, sizeof(floatType) * elementsVec[numInputs]);
    for (int i = 0; i < numInputs; ++i)
    {
        for (size_t e = 0; e < elementsVec[i]; ++e)
            rawDataIn_h[i][e] = ((floatType) rand()) / RAND_MAX;
    }

    for (int i = 0; i < numInputs; ++i)
    {
        HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[i], rawDataIn_h[i], sizeVec[i], cudaMemcpyHostToDevice) );
    }

    if(verbose)
        printf("Allocated GPU memory for data, initialized data, and created the library handle\n");

    /*************************
    * cuTensorNet
    *************************/

    cudaStream_t stream;
    HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );

    cutensornetHandle_t handle;
    HANDLE_ERROR( cutensornetCreate(&handle) );

Mark constant tensors and create the network descriptor

Next, we specify which input tensors are constant, and create the network descriptor with the desired tensor modes, extents, strides, and qualifiers (e.g., constant) as well as the data and compute types. Note that the created library context will be associated with the currently active GPU.

    /*******************************
    * Set constant input tensors
    *******************************/

    // Specify which input tensors are constant: all inputs except the last one (F) keep their values across contractions
    std::vector<cutensornetTensorQualifiers_t> qualifiersIn;
    qualifiersIn.resize(numInputs);
    for (int i = 0; i < numInputs; ++i)
    {
        if (i < 5)
            qualifiersIn[i].isConstant = 1;
        else
            qualifiersIn[i].isConstant = 0;
    }

    /*******************************
    * Create Network Descriptor
    *******************************/

    int32_t* modesIn[numInputs];
    int32_t numModesIn[numInputs];
    int64_t* extentsIn[numInputs];
    int64_t* stridesIn[numInputs];

    for (int i = 0; i < numInputs; ++i)
    {
        modesIn[i] = modesVec[i].data();
        numModesIn[i] = modesVec[i].size();
        extentsIn[i] = extentVec[i].data();
        stridesIn[i] = NULL; // strides are optional; if no stride is provided, cuTensorNet assumes a generalized column-major data layout
    }

    // Set up tensor network
    cutensornetNetworkDescriptor_t descNet;
    HANDLE_ERROR( cutensornetCreateNetworkDescriptor(handle,
                        numInputs, numModesIn, extentsIn, stridesIn, modesIn, qualifiersIn.data(),
                        modesVec[numInputs].size(), extentVec[numInputs].data(), /*stridesOut = */NULL, modesVec[numInputs].data(),
                        typeData, typeCompute,
                        &descNet) );

    if(verbose)
        printf("Initialized the cuTensorNet library and created a tensor network descriptor\n");

Contraction order and slicing

In this example, we illustrate using a predetermined contraction path and setting it into the optimizer info object via cutensornetContractionOptimizerInfoSetAttribute().

    /*******************************
    * Choose workspace limit based on available resources.
    *******************************/

    size_t freeMem, totalMem;
    HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) );
    uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
    if(verbose)
        printf("Workspace limit = %lu\n", workspaceLimit);

    /*******************************
    * Set contraction order
    *******************************/

    // Create contraction optimizer info
    cutensornetContractionOptimizerInfo_t optimizerInfo;
    HANDLE_ERROR( cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo) );

    // Set a predetermined contraction path: pairs of tensor IDs, one pair per pairwise contraction
    std::vector<int32_t> path{0,1,0,4,0,3,0,2,0,1};
    const auto numContractions = numInputs - 1;
    cutensornetContractionPath_t contPath;
    contPath.data = reinterpret_cast<cutensornetNodePair_t*>(const_cast<int32_t*>(path.data()));
    contPath.numContractions = numContractions;

    // Provide the user-specified contraction path to the optimizer info object
    HANDLE_ERROR( cutensornetContractionOptimizerInfoSetAttribute(
                    handle,
                    optimizerInfo,
                    CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
                    &contPath,
                    sizeof(contPath)));
    int64_t numSlices = 1;

    if(verbose)
        printf("Set predetermined contraction path into cuTensorNet optimizer\n");
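If no good contraction path is known in advance, the cuTensorNet path finder can be used instead of setting CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH. The sketch below is not part of this sample; it shows how one could let cuTensorNet determine the path and the resulting slice count, reusing the handle, descNet, workspaceLimit, optimizerInfo, and numSlices variables defined above.

    // Alternative (sketch): let the cuTensorNet path finder determine the contraction path
    cutensornetContractionOptimizerConfig_t optimizerConfig;
    HANDLE_ERROR( cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig) );

    // Find an optimized contraction path (and slicing, if needed) within the workspace limit
    HANDLE_ERROR( cutensornetContractionOptimize(handle,
                                                 descNet,
                                                 optimizerConfig,
                                                 workspaceLimit,
                                                 optimizerInfo) );

    // Query the number of slices the path finder decided to use
    HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
                    handle,
                    optimizerInfo,
                    CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
                    &numSlices,
                    sizeof(numSlices)) );

    HANDLE_ERROR( cutensornetDestroyContractionOptimizerConfig(optimizerConfig) );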

Create workspace descriptor and allocate workspace memory

Next, we create a workspace descriptor, compute the workspace sizes, and query the minimum workspace size needed to contract the network. To activate intermediate tensor reuse, we need to provide a CACHE workspace that will be used across multiple network contractions. Thus, we query sizes and allocate device memory for both kinds of workspaces (CUTENSORNET_WORKSPACE_SCRATCH and CUTENSORNET_WORKSPACE_CACHE) and set these in the workspace descriptor. The workspace descriptor will be provided to the contraction plan creation and contraction APIs.

    /*******************************
    * Create workspace descriptor, allocate workspace, and set it.
    *******************************/

    cutensornetWorkspaceDescriptor_t workDesc;
    HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );

    // Set the SCRATCH workspace, which is used during each network contraction operation and is not needed afterwards
    int64_t requiredWorkspaceSizeScratch = 0;
    HANDLE_ERROR( cutensornetWorkspaceComputeContractionSizes(handle,
                                                            descNet,
                                                            optimizerInfo,
                                                            workDesc) );

    HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                    workDesc,
                                                    CUTENSORNET_WORKSIZE_PREF_MIN,
                                                    CUTENSORNET_MEMSPACE_DEVICE,
                                                    CUTENSORNET_WORKSPACE_SCRATCH,
                                                    &requiredWorkspaceSizeScratch) );

    void* workScratch = nullptr;
    HANDLE_CUDA_ERROR( cudaMalloc(&workScratch, requiredWorkspaceSizeScratch) );

    HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                                workDesc,
                                                CUTENSORNET_MEMSPACE_DEVICE,
                                                CUTENSORNET_WORKSPACE_SCRATCH,
                                                workScratch,
                                                requiredWorkspaceSizeScratch) );

    // Set the CACHE workspace, which persists and is reused across network contraction operations
    int64_t requiredWorkspaceSizeCache = 0;
    HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                    workDesc,
                                                    CUTENSORNET_WORKSIZE_PREF_MIN,
                                                    CUTENSORNET_MEMSPACE_DEVICE,
                                                    CUTENSORNET_WORKSPACE_CACHE,
                                                    &requiredWorkspaceSizeCache) );

    void* workCache = nullptr;
    HANDLE_CUDA_ERROR( cudaMalloc(&workCache, requiredWorkspaceSizeCache) );

    HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                                workDesc,
                                                CUTENSORNET_MEMSPACE_DEVICE,
                                                CUTENSORNET_WORKSPACE_CACHE,
                                                workCache,
                                                requiredWorkspaceSizeCache) );

    if(verbose)
        printf("Allocated and set up the GPU workspace\n");

Note that it is possible to skip the steps of creating a workspace descriptor and explicitly handling workspace memory by setting a device memory handler, in which case cuTensorNet will implicitly handle workspace memory by allocating/deallocating it from the provided memory pool. See Memory Management API for details.
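For illustration only, a minimal sketch of such a device memory handler is shown below. It assumes stream-ordered allocation via cudaMallocAsync/cudaFreeAsync and fills in the cutensornetDeviceMemHandler_t callback interface; the helper function names are hypothetical and not part of this sample.

// Hypothetical allocator callbacks for the cuTensorNet device memory handler (sketch only)
int deviceAlloc(void* ctx, void** ptr, size_t size, cudaStream_t stream)
{
    (void)ctx;
    return static_cast<int>(cudaMallocAsync(ptr, size, stream)); // 0 (cudaSuccess) indicates success
}

int deviceFree(void* ctx, void* ptr, size_t size, cudaStream_t stream)
{
    (void)ctx; (void)size;
    return static_cast<int>(cudaFreeAsync(ptr, stream));
}

    // ... inside main(), right after cutensornetCreate(&handle):
    cutensornetDeviceMemHandler_t memHandler{};
    memHandler.ctx          = nullptr;
    memHandler.device_alloc = deviceAlloc;
    memHandler.device_free  = deviceFree;
    strcpy(memHandler.name, "cudaMallocAsync handler"); // name is a fixed-size char buffer; requires <cstring>
    HANDLE_ERROR( cutensornetSetDeviceMemHandler(handle, &memHandler) );
    // cuTensorNet can then draw and return SCRATCH/CACHE workspace memory from the pool as needed;
    // see the Memory Management API section for how this interacts with the workspace descriptor.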

Contraction plan and auto-tune

We create a tensor network contraction plan holding all pairwise tensor contraction plans for cuTENSOR. Optionally, we can auto-tune the plan such that cuTENSOR selects the best kernel for each pairwise contraction. This contraction plan can be reused for many (possibly different) data inputs, avoiding the cost of initializing this plan redundantly.

    /*******************************
    * Initialize the pairwise contraction plan (for cuTENSOR).
    *******************************/

    cutensornetContractionPlan_t plan;
    HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
                                                    descNet,
                                                    optimizerInfo,
                                                    workDesc,
                                                    &plan) );

    /*******************************
    * Optional: Auto-tune the contraction plan to pick the fastest kernel
    *           for each pairwise tensor contraction (executed by cuTENSOR).
    *******************************/
    cutensornetContractionAutotunePreference_t autotunePref;
    HANDLE_ERROR( cutensornetCreateContractionAutotunePreference(handle,
                                                        &autotunePref) );

    const int numAutotuningIterations = 5; // may be 0
    HANDLE_ERROR( cutensornetContractionAutotunePreferenceSetAttribute(
                            handle,
                            autotunePref,
                            CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
                            &numAutotuningIterations,
                            sizeof(numAutotuningIterations)) );

    // Modify the plan again to find the best pair-wise contractions
    HANDLE_ERROR( cutensornetContractionAutotune(handle,
                                                    plan,
                                                    rawDataIn_d,
                                                    O_d,
                                                    workDesc,
                                                    autotunePref,
                                                    stream) );

    HANDLE_ERROR( cutensornetDestroyContractionAutotunePreference(autotunePref) );

    if(verbose)
        printf("Created a contraction plan for cuTensorNet and optionally auto-tuned it\n");

Tensor network contraction execution

Finally, we contract the tensor network as many times as needed, possibly with different input each time. Note that the first network contraction call will utilize the provided CACHE workspace to store the constant intermediate tensors. Subsequent network contractions will use the cached data to greatly reduce the computation times.

    /**********************
    * Execute the tensor network contraction
    **********************/

    // Create a cutensornetSliceGroup_t object from a range of slice IDs
    cutensornetSliceGroup_t sliceGroup{};
    HANDLE_ERROR( cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup) );

    GPUTimer timer {stream};
    double minTimeCUTENSORNET = 1e100;
    double firstTimeCUTENSORNET = 1e100;
    const int numRuns = 3; // number of repeats to get stable performance results
    for (int i = 0; i < numRuns; ++i)
    {
        HANDLE_CUDA_ERROR( cudaMemcpy(O_d, O_h, sizeVec[numInputs], cudaMemcpyHostToDevice) ); // restore the output tensor on GPU
        HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );

        /*
        * Contract all slices of the tensor network
        */
        timer.start();

        int32_t accumulateOutput = 0; // output tensor data will be overwritten
        HANDLE_ERROR( cutensornetContractSlices(handle,
                        plan,
                        rawDataIn_d,
                        O_d,
                        accumulateOutput,
                        workDesc,
                        sliceGroup, // alternatively, NULL can be used to contract over all slices instead of specifying a sliceGroup object
                        stream) );

        // Synchronize and measure best timing
        auto time = timer.seconds();
        if (i == 0)
            firstTimeCUTENSORNET = time;
        minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
    }

    if(verbose)
        printf("Contracted the tensor network, each slice used the same contraction plan\n");

    // Print the 1-norm of the output tensor (verification)
    HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream) );
    HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) ); // copy the output tensor back to the Host
    double norm1 = 0.0;
    for (int64_t i = 0; i < elementsVec[numInputs]; ++i) {
        norm1 += std::abs(O_h[i]);
    }
    if(verbose)
        printf("Computed the 1-norm of the output tensor: %e\n", norm1);

    /*************************/

    // Query the total Flop count for the tensor network contraction
    double flops {0.0};
    HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
                        handle,
                        optimizerInfo,
                        CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
                        &flops,
                        sizeof(flops)) );

    if(verbose) {
        printf("Number of tensor network slices = %ld\n", numSlices);
        printf("Network contraction flop cost = %e\n", flops);
        printf("Tensor network contraction time (ms):\n");
        printf("\tfirst run (intermediate tensors get cached) = %.3f\n", firstTimeCUTENSORNET * 1000.f);
        printf("\tsubsequent run (cache reused) = %.3f\n", minTimeCUTENSORNET * 1000.f);
    }
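In a realistic workload, only the non-constant tensor F (the last input) would receive new values between contractions, while the constant intermediate tensors cached in the CACHE workspace remain valid. The following is a minimal sketch of such an update step, not part of the sample above, reusing the variables already defined.

    // Refresh only the non-constant tensor F between contractions (sketch)
    const int fIdx = numInputs - 1; // F is the last input and was not marked constant
    for (size_t e = 0; e < elementsVec[fIdx]; ++e)
        rawDataIn_h[fIdx][e] = ((floatType) rand()) / RAND_MAX; // new values for F
    HANDLE_CUDA_ERROR( cudaMemcpyAsync(rawDataIn_d[fIdx], rawDataIn_h[fIdx], sizeVec[fIdx],
                                       cudaMemcpyHostToDevice, stream) );

    // Re-contract: the cached constant intermediates are reused, so only the part of the
    // network that depends on F is recomputed
    HANDLE_ERROR( cutensornetContractSlices(handle, plan, rawDataIn_d, O_d,
                                            /*accumulateOutput=*/0, workDesc, sliceGroup, stream) );
    HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream) );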

Free resources

After the computation, we need to free up all resources.

    /***************
    * Free resources
    ****************/

    // Free cuTensorNet resources
    HANDLE_ERROR( cutensornetDestroySliceGroup(sliceGroup) );
    HANDLE_ERROR( cutensornetDestroyContractionPlan(plan) );
    HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
    HANDLE_ERROR( cutensornetDestroyContractionOptimizerInfo(optimizerInfo) );
    HANDLE_ERROR( cutensornetDestroyNetworkDescriptor(descNet) );
    HANDLE_ERROR( cutensornetDestroy(handle) );

    // Free Host memory resources
    if (O_h) free(O_h);
    for (int i = 0; i < numInputs; ++i)
    {
        if (rawDataIn_h[i])
            free(rawDataIn_h[i]);
    }
    // Free GPU memory resources
    if (workScratch) cudaFree(workScratch);
    if (workCache) cudaFree(workCache);
    if (O_d) cudaFree(O_d);
    for (int i = 0; i < numInputs; ++i)
    {
        if (rawDataIn_d[i])
            cudaFree(rawDataIn_d[i]);
    }
    if(verbose)
        printf("Freed resources and exited\n");

    return 0;
}