Caching/Reusing Constant Intermediate Tensors

The following code example illustrates how to activate caching of constant intermediate tensors to greatly accelerate repeated execution of a tensor network contraction in which only some of the input tensors change their values between iterations. The full code can be found in the NVIDIA/cuQuantum repository.

Headers and data types

#include <stdlib.h>
#include <stdio.h>

#include <unordered_map>
#include <vector>
#include <cassert>
#include <cstring>  // memset
#include <cmath>    // std::abs

#include <cuda_runtime.h>
#include <cutensornet.h>


#define HANDLE_ERROR(x)                                           \
{ const auto err = x;                                             \
  if( err != CUTENSORNET_STATUS_SUCCESS )                         \
  { printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
    fflush(stdout);                                               \
  }                                                               \
};

#define HANDLE_CUDA_ERROR(x)                                      \
{ const auto err = x;                                             \
  if( err != cudaSuccess )                                        \
  { printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
    fflush(stdout);                                               \
  }                                                               \
};


struct GPUTimer
{
    GPUTimer(cudaStream_t stream): stream_(stream)
    {
        HANDLE_CUDA_ERROR(cudaEventCreate(&start_));
        HANDLE_CUDA_ERROR(cudaEventCreate(&stop_));
    }

    ~GPUTimer()
    {
        HANDLE_CUDA_ERROR(cudaEventDestroy(start_));
        HANDLE_CUDA_ERROR(cudaEventDestroy(stop_));
    }

    void start()
    {
        HANDLE_CUDA_ERROR(cudaEventRecord(start_, stream_));
    }

    float seconds()
    {
        HANDLE_CUDA_ERROR(cudaEventRecord(stop_, stream_));
        HANDLE_CUDA_ERROR(cudaEventSynchronize(stop_));
        float time;
        HANDLE_CUDA_ERROR(cudaEventElapsedTime(&time, start_, stop_));
        return time * 1e-3;
    }

    private:
    cudaEvent_t start_, stop_;
    cudaStream_t stream_;
};


int main()
{
    static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

    bool verbose = true;

    // Check cuTensorNet version
    const size_t cuTensornetVersion = cutensornetGetVersion();
    if(verbose)
        printf("cuTensorNet version: %zu\n", cuTensornetVersion);

    // Set GPU device
    int numDevices {0};
    HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) );
    const int deviceId = 0;
    HANDLE_CUDA_ERROR( cudaSetDevice(deviceId) );
    cudaDeviceProp prop;
    HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );

    if(verbose) {
        printf("===== device info ======\n");
        printf("GPU-name:%s\n", prop.name);
        printf("GPU-clock:%d\n", prop.clockRate);
        printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
        printf("GPU-nSM:%d\n", prop.multiProcessorCount);
        printf("GPU-major:%d\n", prop.major);
        printf("GPU-minor:%d\n", prop.minor);
        printf("========================\n");
    }

    typedef float floatType;
    cudaDataType_t typeData = CUDA_R_32F;
    cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;

    if(verbose)
        printf("Included headers and defined data types\n");

Define tensor network and tensor sizes

Next, we define the structure of the tensor network (i.e., the modes of the tensors, their extents, and their connectivity).

    /**********************
    * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,f,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m}
    * We will execute the contraction a few times, assuming all input tensors are constant except F.
    **********************/

    constexpr int32_t numInputs = 6;

    // Create vectors of tensor modes
    std::vector<std::vector<int32_t>> modesVec {
        {'a','b','c','d'},
        {'b','c','d','e'},
        {'e','f','g','h'},
        {'g','h','i','j'},
        {'i','j','k','l'},
        {'k','l','m'},
        {'a','m'}
    };

    // Set mode extents
    int64_t sameExtent = 36; // setting same extent for simplicity. In principle extents can differ.
    std::unordered_map<int32_t, int64_t> extent;
    for (auto &vec: modesVec)
    {
        for (auto &mode: vec)
        {
            extent[mode] = sameExtent;
        }
    }

    // Create a vector of extents for each tensor
    std::vector<std::vector<int64_t>> extentVec;
    extentVec.resize(numInputs+1); // hold inputs + output tensors
    for (int i = 0; i < numInputs+1; ++i)
    {
        for (auto mode : modesVec[i])
            extentVec[i].push_back(extent[mode]);
    }

    if(verbose)
        printf("Defined tensor network, modes, and extents\n");

Allocate memory, initialize data, initialize cuTensorNet handle

Next, we allocate memory for the tensor network operands and initialize them to random values. Then, we initialize the cuTensorNet library via cutensornetCreate().

    /**********************
    * Allocating data
    **********************/

    std::vector<size_t> elementsVec;
    elementsVec.resize(numInputs+1); // hold inputs + output tensors
    for (int i = 0; i < numInputs+1; ++i)
    {
        elementsVec[i] = 1;
        for (auto mode : modesVec[i])
            elementsVec[i] *= extent[mode];
    }

    size_t totalSize = 0;
    std::vector<size_t> sizeVec;
    sizeVec.resize(numInputs+1); // hold inputs + output tensors
    for (int i = 0; i < numInputs+1; ++i)
    {
        sizeVec[i] = sizeof(floatType) * elementsVec[i];
        totalSize += sizeVec[i];
    }
    if(verbose)
        printf("Total GPU memory used for tensor storage: %.2f GiB\n",
                (totalSize) / 1024. / 1024. / 1024);

    void* rawDataIn_d[numInputs];
    void* O_d;
    for (int i = 0; i < numInputs; ++i)
    {
        HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[i], sizeVec[i]) );
    }
    HANDLE_CUDA_ERROR( cudaMalloc((void**) &O_d, sizeVec[numInputs]));

    floatType* rawDataIn_h[numInputs];
    for (int i = 0; i < numInputs; ++i)
    {
        rawDataIn_h[i] = (floatType*) malloc(sizeof(floatType) * elementsVec[i]);
        if (rawDataIn_h[i] == NULL)
        {
           printf("Error: Host memory allocation failed!\n");
           return -1;
        }
    }
    floatType *O_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
    if (O_h == NULL)
    {
        printf("Error: Host memory allocation failed!\n");
        return -1;
    }

    /*******************
    * Initialize data
    *******************/

    memset(O_h, 0, sizeof(floatType) * elementsVec[numInputs]);
    for (int i = 0; i < numInputs; ++i)
    {
        for (size_t e = 0; e < elementsVec[i]; ++e)
            rawDataIn_h[i][e] = ((floatType) rand()) / RAND_MAX;
    }

    for (int i = 0; i < numInputs; ++i)
    {
        HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[i], rawDataIn_h[i], sizeVec[i], cudaMemcpyHostToDevice) );
    }

    if(verbose)
        printf("Allocated GPU memory for data, initialized data, and created the library handle\n");

    /*************************
    * cuTensorNet
    *************************/

    cudaStream_t stream;
    HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );

    cutensornetHandle_t handle;
    HANDLE_ERROR( cutensornetCreate(&handle) );

Mark constant tensors and create the network descriptor

Next, we specify which input tensors are constant, and create the network descriptor with the desired tensor modes, extents, strides, and qualifiers (e.g., constant) as well as the data and compute types. Note that the created library context will be associated with the currently active GPU.

    /*******************************
    * Set constant input tensors
    *******************************/

    // specify which input tensors are constant
    std::vector<cutensornetTensorQualifiers_t> qualifiersIn;
    qualifiersIn.resize(numInputs);
    for (int i = 0; i < numInputs; ++i)
    {
        if (i < 5)
            qualifiersIn[i].isConstant = 1;
        else
            qualifiersIn[i].isConstant = 0;
    }

    /*******************************
    * Create Network Descriptor
    *******************************/

    int32_t* modesIn[numInputs];
    int32_t numModesIn[numInputs];
    int64_t* extentsIn[numInputs];
    int64_t* stridesIn[numInputs];

    for (int i = 0; i < numInputs; ++i)
    {
        modesIn[i] = modesVec[i].data();
        numModesIn[i] = modesVec[i].size();
        extentsIn[i] = extentVec[i].data();
        stridesIn[i] = NULL; // strides are optional; if no stride is provided, cuTensorNet assumes a generalized column-major data layout
    }

    // Set up tensor network
    cutensornetNetworkDescriptor_t descNet;
    HANDLE_ERROR( cutensornetCreateNetworkDescriptor(handle,
                        numInputs, numModesIn, extentsIn, stridesIn, modesIn, qualifiersIn.data(),
                        modesVec[numInputs].size(), extentVec[numInputs].data(), /*stridesOut = */NULL, modesVec[numInputs].data(),
                        typeData, typeCompute,
                        &descNet) );

    if(verbose)
        printf("Initialized the cuTensorNet library and created a tensor network descriptor\n");

Contraction order and slicing

In this example, we illustrate using a predetermined contraction path and setting it into the optimizer info object via cutensornetContractionOptimizerInfoSetAttribute().

    /*******************************
    * Choose workspace limit based on available resources.
    *******************************/

    size_t freeMem, totalMem;
    HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) );
    uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
    if(verbose)
        printf("Workspace limit = %lu\n", workspaceLimit);

    /*******************************
    * Set contraction order
    *******************************/

    // Create contraction optimizer info
    cutensornetContractionOptimizerInfo_t optimizerInfo;
    HANDLE_ERROR( cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo) );

    // set a predetermined contraction path
    std::vector<int32_t> path{0,1,0,4,0,3,0,2,0,1};
    const auto numContractions = numInputs - 1;
    cutensornetContractionPath_t contPath;
    contPath.data = reinterpret_cast<cutensornetNodePair_t*>(const_cast<int32_t*>(path.data()));
    contPath.numContractions = numContractions;

    // provide user-specified contPath
    HANDLE_ERROR( cutensornetContractionOptimizerInfoSetAttribute(
                    handle,
                    optimizerInfo,
                    CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
                    &contPath,
                    sizeof(contPath)));
    int64_t numSlices = 1;

    if(verbose)
        printf("Set predetermined contraction path into cuTensorNet optimizer\n");

Create workspace descriptor and allocate workspace memory

Next, we create a workspace descriptor, compute the workspace sizes, and query the minimum workspace size needed to contract the network. To activate intermediate tensor reuse, we need to provide a CACHE workspace that will be used across multiple network contractions. Thus, we query the sizes of and allocate device memory for both workspace kinds (CUTENSORNET_WORKSPACE_SCRATCH and CUTENSORNET_WORKSPACE_CACHE) and set them in the workspace descriptor. The workspace descriptor will be provided to the contraction plan creation and contraction APIs.

    /*******************************
    * Create workspace descriptor, allocate workspace, and set it.
    *******************************/

    cutensornetWorkspaceDescriptor_t workDesc;
    HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );

    // set SCRATCH workspace, which will be used during each network contraction operation and is not needed afterwards
    int64_t requiredWorkspaceSizeScratch = 0;
    HANDLE_ERROR( cutensornetWorkspaceComputeContractionSizes(handle,
                                                            descNet,
                                                            optimizerInfo,
                                                            workDesc) );

    HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                    workDesc,
                                                    CUTENSORNET_WORKSIZE_PREF_MIN,
                                                    CUTENSORNET_MEMSPACE_DEVICE,
                                                    CUTENSORNET_WORKSPACE_SCRATCH,
                                                    &requiredWorkspaceSizeScratch) );

    void* workScratch = nullptr;
    HANDLE_CUDA_ERROR( cudaMalloc(&workScratch, requiredWorkspaceSizeScratch) );

    HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                                workDesc,
                                                CUTENSORNET_MEMSPACE_DEVICE,
                                                CUTENSORNET_WORKSPACE_SCRATCH,
                                                workScratch,
                                                requiredWorkspaceSizeScratch) );

    // set CACHE workspace, which will be used across network contraction operations
    int64_t requiredWorkspaceSizeCache = 0;
    HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                    workDesc,
                                                    CUTENSORNET_WORKSIZE_PREF_MIN,
                                                    CUTENSORNET_MEMSPACE_DEVICE,
                                                    CUTENSORNET_WORKSPACE_CACHE,
                                                    &requiredWorkspaceSizeCache) );

    void* workCache = nullptr;
    HANDLE_CUDA_ERROR( cudaMalloc(&workCache, requiredWorkspaceSizeCache) );

    HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                                workDesc,
                                                CUTENSORNET_MEMSPACE_DEVICE,
                                                CUTENSORNET_WORKSPACE_CACHE,
                                                workCache,
                                                requiredWorkspaceSizeCache) );

    if(verbose)
        printf("Allocated and set up the GPU workspace\n");

Note that it is possible to skip the steps of creating a workspace descriptor and explicitly handling workspace memory by setting a device memory handler, in which case cuTensorNet will implicitly handle workspace memory by allocating/deallocating memory from the provided memory pool. See Memory Management API for details.
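
As a rough illustration of that alternative, the sketch below (not part of this sample) registers a stream-ordered memory pool through the device memory handler interface. The callback and handler names are placeholders chosen here, and cudaMallocAsync/cudaFreeAsync require CUDA 11.2 or newer; consult the Memory Management API section for the authoritative usage.

    // A minimal sketch: route workspace allocations through a stream-ordered CUDA memory
    // pool instead of managing workDesc memory explicitly (callback names are placeholders).
    auto myAlloc = [](void* ctx, void** ptr, size_t size, cudaStream_t stream) -> int {
        return static_cast<int>(cudaMallocAsync(ptr, size, stream)); // 0 on success
    };
    auto myFree = [](void* ctx, void* ptr, size_t size, cudaStream_t stream) -> int {
        return static_cast<int>(cudaFreeAsync(ptr, stream));         // 0 on success
    };

    cutensornetDeviceMemHandler_t memHandler{};
    memHandler.ctx          = nullptr;   // no user context needed for this simple pool
    memHandler.device_alloc = myAlloc;   // stream-ordered allocation callback
    memHandler.device_free  = myFree;    // stream-ordered deallocation callback
    strcpy(memHandler.name, "cudaMallocAsync handler");
    HANDLE_ERROR( cutensornetSetDeviceMemHandler(handle, &memHandler) );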

Contraction plan and auto-tune

We create a tensor network contraction plan holding all pairwise tensor contraction plans for cuTENSOR. Optionally, we can auto-tune the plan such that cuTENSOR selects the best kernel for each pairwise contraction. This contraction plan can be reused for many (possibly different) data inputs, avoiding the cost of initializing this plan redundantly.

    /*******************************
    * Initialize the pairwise contraction plan (for cuTENSOR).
    *******************************/

    cutensornetContractionPlan_t plan;
    HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
                                                    descNet,
                                                    optimizerInfo,
                                                    workDesc,
                                                    &plan) );

    /*******************************
    * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel
    *           for each pairwise tensor contraction.
    *******************************/
    cutensornetContractionAutotunePreference_t autotunePref;
    HANDLE_ERROR( cutensornetCreateContractionAutotunePreference(handle,
                                                        &autotunePref) );

    const int numAutotuningIterations = 5; // may be 0
    HANDLE_ERROR( cutensornetContractionAutotunePreferenceSetAttribute(
                            handle,
                            autotunePref,
                            CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
                            &numAutotuningIterations,
                            sizeof(numAutotuningIterations)) );

    // Modify the plan again to find the best pair-wise contractions
    HANDLE_ERROR( cutensornetContractionAutotune(handle,
                                                    plan,
                                                    rawDataIn_d,
                                                    O_d,
                                                    workDesc,
                                                    autotunePref,
                                                    stream) );

    HANDLE_ERROR( cutensornetDestroyContractionAutotunePreference(autotunePref) );

    if(verbose)
        printf("Created a contraction plan for cuTensorNet and optionally auto-tuned it\n");

Tensor network contraction execution

Finally, we contract the tensor network as many times as needed, possibly with different input data each time. The first network contraction call uses the provided CACHE workspace to store the constant intermediate tensors. Subsequent network contractions reuse the cached data, which greatly reduces the computation time.

    /**********************
    * Execute the tensor network contraction
    **********************/

    // Create a cutensornetSliceGroup_t object from a range of slice IDs
    cutensornetSliceGroup_t sliceGroup{};
    HANDLE_ERROR( cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup) );

    GPUTimer timer {stream};
    double minTimeCUTENSORNET = 1e100;
    double firstTimeCUTENSORNET = 1e100;
    const int numRuns = 3; // number of repeats to get stable performance results
    for (int i = 0; i < numRuns; ++i)
    {
        HANDLE_CUDA_ERROR( cudaMemcpy(O_d, O_h, sizeVec[numInputs], cudaMemcpyHostToDevice) ); // restore the output tensor on GPU
        HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );

        /*
        * Contract all slices of the tensor network
        */
        timer.start();

        int32_t accumulateOutput = 0; // output tensor data will be overwritten
        HANDLE_ERROR( cutensornetContractSlices(handle,
                        plan,
                        rawDataIn_d,
                        O_d,
                        accumulateOutput,
                        workDesc,
                        sliceGroup, // alternatively, NULL can be used to contract over all slices instead of specifying a sliceGroup object
                        stream) );

        // Synchronize and measure best timing
        auto time = timer.seconds();
        if (i == 0)
            firstTimeCUTENSORNET = time;
        minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
    }

    if(verbose)
        printf("Contracted the tensor network, each slice used the same contraction plan\n");

    // Print the 1-norm of the output tensor (verification)
    HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream) );
    HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) ); // copy the output tensor back to the Host
    double norm1 = 0.0;
    for (int64_t i = 0; i < elementsVec[numInputs]; ++i) {
        norm1 += std::abs(O_h[i]);
    }
    if(verbose)
        printf("Computed the 1-norm of the output tensor: %e\n", norm1);

    /*************************/

    // Query the total Flop count for the tensor network contraction
    double flops {0.0};
    HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
                        handle,
                        optimizerInfo,
                        CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
                        &flops,
                        sizeof(flops)) );

    if(verbose) {
        printf("Number of tensor network slices = %ld\n", numSlices);
        printf("Network contraction flop cost = %e\n", flops);
        printf("Tensor network contraction time (ms):\n");
        printf("\tfirst run (intermediate tensors get cached) = %.3f\n", firstTimeCUTENSORNET * 1000.f);
        printf("\tsubsequent run (cache reused) = %.3f\n", minTimeCUTENSORNET * 1000.f);
    }
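
In a realistic workload, only the non-constant tensor F (the last input) would receive new values before each contraction; intermediate tensors that depend solely on the constant tensors A–E remain cached and valid. The snippet below is an illustrative sketch of such an update step (not part of the timed loop above), reusing the sample's existing buffers for F.

    // Illustrative only: between contractions, refresh the values of the single
    // non-constant tensor F on the GPU; cached constant intermediates stay valid.
    for (size_t e = 0; e < elementsVec[numInputs-1]; ++e)
        rawDataIn_h[numInputs-1][e] = ((floatType) rand()) / RAND_MAX; // new data for F
    HANDLE_CUDA_ERROR( cudaMemcpyAsync(rawDataIn_d[numInputs-1], rawDataIn_h[numInputs-1],
                                       sizeVec[numInputs-1], cudaMemcpyHostToDevice, stream) );
    // ... then call cutensornetContractSlices() again, as in the loop above.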

Free resources

After the computation, we need to free up all resources.

    /***************
    * Free resources
    ****************/

    // Free cuTensorNet resources
    HANDLE_ERROR( cutensornetDestroySliceGroup(sliceGroup) );
    HANDLE_ERROR( cutensornetDestroyContractionPlan(plan) );
    HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
    HANDLE_ERROR( cutensornetDestroyContractionOptimizerInfo(optimizerInfo) );
    HANDLE_ERROR( cutensornetDestroyNetworkDescriptor(descNet) );
    HANDLE_ERROR( cutensornetDestroy(handle) );

    // Free Host memory resources
    if (O_h) free(O_h);
    for (int i = 0; i < numInputs; ++i)
    {
        if (rawDataIn_h[i])
            free(rawDataIn_h[i]);
    }
    // Free GPU memory resources
    if (workScratch) cudaFree(workScratch);
    if (workCache) cudaFree(workCache);
    if (O_d) cudaFree(O_d);
    for (int i = 0; i < numInputs; ++i)
    {
        if (rawDataIn_d[i])
            cudaFree(rawDataIn_d[i]);
    }
    if(verbose)
        printf("Freed resources and exited\n");

    return 0;
}