Caching/Reusing constant intermediate tensors

The following code example illustrates how to activate caching of constant intermediate tensors to greatly accelerate repeated execution of a tensor network contraction in which only some of the input tensors change values between iterations. The full code can be found in the NVIDIA/cuQuantum repository (here).

Headers and data types

  8#include <stdlib.h>
  9#include <stdio.h>
 10
 11#include <unordered_map>
 12#include <vector>
 13#include <cassert>
 14
 15#include <cuda_runtime.h>
 16#include <cutensornet.h>
 17
 18#define HANDLE_ERROR(x)                                                                 \
 19    do {                                                                                \
 20        const auto err = x;                                                             \
 21        if (err != CUTENSORNET_STATUS_SUCCESS)                                          \
 22        {                                                                               \
 23            printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
 24            fflush(stdout);                                                             \
 25            exit(EXIT_FAILURE);                                                         \
 26        }                                                                               \
 27    } while (0)
 28
 29#define HANDLE_CUDA_ERROR(x)                                                          \
 30    do {                                                                              \
 31        const auto err = x;                                                           \
 32        if (err != cudaSuccess)                                                       \
 33        {                                                                             \
 34            printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
 35            fflush(stdout);                                                           \
 36            exit(EXIT_FAILURE);                                                       \
 37        }                                                                             \
 38    } while (0)
 39
 40// Usage: DEV_ATTR(cudaDevAttrClockRate, deviceId)
 41#define DEV_ATTR(ENUMCONST, DID)                                                   \
 42    ({ int v;                                                                       \
 43       HANDLE_CUDA_ERROR(cudaDeviceGetAttribute(&v, ENUMCONST, DID));               \
 44       v; })
 45
 46struct GPUTimer
 47{
 48    GPUTimer(cudaStream_t stream) : stream_(stream)
 49    {
 50        HANDLE_CUDA_ERROR(cudaEventCreate(&start_));
 51        HANDLE_CUDA_ERROR(cudaEventCreate(&stop_));
 52    }
 53
 54    ~GPUTimer()
 55    {
 56        HANDLE_CUDA_ERROR(cudaEventDestroy(start_));
 57        HANDLE_CUDA_ERROR(cudaEventDestroy(stop_));
 58    }
 59
 60    void start() { HANDLE_CUDA_ERROR(cudaEventRecord(start_, stream_)); }
 61
 62    float seconds()
 63    {
 64        HANDLE_CUDA_ERROR(cudaEventRecord(stop_, stream_));
 65        HANDLE_CUDA_ERROR(cudaEventSynchronize(stop_));
 66        float time;
 67        HANDLE_CUDA_ERROR(cudaEventElapsedTime(&time, start_, stop_));
 68        return time * 1e-3;
 69    }
 70
 71private:
 72    cudaEvent_t start_, stop_;
 73    cudaStream_t stream_;
 74};
 75
 76int main()
 77{
 78    static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");
 79
 80    bool verbose = true;
 81
 82    // Check cuTensorNet version
 83    const size_t cuTensornetVersion = cutensornetGetVersion();
 84    if (verbose) printf("cuTensorNet version: %zu\n", cuTensornetVersion);
 85
 86    // Set GPU device
 87    int numDevices{0};
 88    HANDLE_CUDA_ERROR(cudaGetDeviceCount(&numDevices));
 89    const int deviceId = 0;
 90    HANDLE_CUDA_ERROR(cudaSetDevice(deviceId));
 91    cudaDeviceProp prop;
 92    HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId));
 93
 94    if (verbose)
 95    {
 96        printf("===== device info ======\n");
 97        printf("GPU-local-id:%d\n", deviceId);
 98        printf("GPU-name:%s\n", prop.name);
 99        printf("GPU-clock:%d\n", DEV_ATTR(cudaDevAttrClockRate, deviceId));
100        printf("GPU-memoryClock:%d\n", DEV_ATTR(cudaDevAttrMemoryClockRate, deviceId));
101        printf("GPU-nSM:%d\n", prop.multiProcessorCount);
102        printf("GPU-major:%d\n", prop.major);
103        printf("GPU-minor:%d\n", prop.minor);
104        printf("========================\n");
105    }
106
107    typedef float floatType;
108    cudaDataType_t typeData              = CUDA_R_32F;
109    cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;
110
111    if (verbose) printf("Included headers and defined data types\n");

Define tensor network and tensor sizes

Next, we define the structure of the tensor network (i.e., the modes of the tensors, their extents, and their connectivity).

115    /**********************
116     * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,f,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m}
117     * We will execute the contraction a few times, with all input tensors held constant except F.
118     **********************/
119
120    constexpr int32_t numInputs = 6;
121
122    // Create vectors of tensor modes
123    std::vector<std::vector<int32_t>> tensorModes{ // for input tensors & output tensor
124        // input tensors
125        {'a', 'b', 'c', 'd'}, // tensor A
126        {'b', 'c', 'd', 'e'}, // tensor B
127        {'e', 'f', 'g', 'h'}, // tensor C
128        {'g', 'h', 'i', 'j'}, // tensor D
129        {'i', 'j', 'k', 'l'}, // tensor E
130        {'k', 'l', 'm'},      // tensor F
131        // output tensor
132        {'a', 'm'}}; // tensor O
133
134    // Set mode extents
135    int64_t sameExtent = 36; // using the same extent for all modes for simplicity; in principle, extents can differ
136    std::unordered_map<int32_t, int64_t> extent;
137    for (auto& vec : tensorModes)
138    {
139        for (auto& mode : vec)
140        {
141            extent[mode] = sameExtent;
142        }
143    }
144
145    // Create a vector of extents for each tensor
146    std::vector<std::vector<int64_t>> tensorExtents;
147    tensorExtents.resize(numInputs + 1); // hold inputs + output tensors
148    for (int32_t t = 0; t < numInputs + 1; ++t)
149    {
150        for (auto& mode : tensorModes[t]) tensorExtents[t].push_back(extent[mode]);
151    }
152
153    if (verbose) printf("Defined tensor network, modes, and extents\n");
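
With sameExtent = 36, each of the four-mode tensors A-E holds 36^4 = 1,679,616 elements (about 6.4 MiB in FP32), the three-mode tensor F holds 36^3 = 46,656 elements (about 182 KiB), and the two-mode output O holds 36^2 = 1,296 elements (about 5 KiB), so the tensor storage allocated in the next step totals roughly 32 MiB.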

Allocate memory, initialize data, initialize cuTensorNet handle

Next, we allocate memory for the tensor network operands and initialize them to random values. Then, we initialize the cuTensorNet library via cutensornetCreate().

156    /**********************
157     * Allocating data
158     **********************/
159
160    std::vector<size_t> tensorElements(numInputs + 1);              // hold inputs + output tensors
161    std::vector<size_t> tensorSizes(numInputs + 1);                 // hold inputs + output tensors
162    std::vector<std::vector<int64_t>> tensorStrides(numInputs + 1); // hold inputs + output tensors
163    size_t totalSize = 0;
164    for (int32_t t = 0; t < numInputs + 1; ++t)
165    {
166        size_t numElements = 1;
167        for (auto& mode : tensorModes[t])
168        {
169            tensorStrides[t].push_back(numElements);
170            numElements *= extent[mode];
171        }
172        tensorElements[t] = numElements;
173
174        tensorSizes[t] = sizeof(floatType) * numElements;
175        totalSize += tensorSizes[t];
176    }
177    if (verbose) printf("Total GPU memory used for tensor storage: %.2f GiB\n", (totalSize) / 1024. / 1024. / 1024);
178
179    void* tensorData_d[numInputs + 1]; // for input tensors & output tensor
180    for (int32_t t = 0; t < numInputs + 1; ++t)
181    {
182        HANDLE_CUDA_ERROR(cudaMalloc((void**)&tensorData_d[t], tensorSizes[t]));
183    }
184
185    floatType* tensorData_h[numInputs + 1]; // for input tensors & output tensor
186    for (int32_t t = 0; t < numInputs + 1; ++t)
187    {
188        tensorData_h[t] = (floatType*)malloc(tensorSizes[t]);
189        if (tensorData_h[t] == NULL)
190        {
191            printf("Error: Host memory allocation failed!\n");
192            return -1;
193        }
194    }
195
196    /*******************
197     * Initialize data
198     *******************/
199
200    // init output tensor to all 0s
201    memset(tensorData_h[numInputs], 0, tensorSizes[numInputs]);
202    // init input tensors to random values
203    for (int32_t t = 0; t < numInputs; ++t)
204    {
205        for (size_t e = 0; e < tensorElements[t]; ++e) tensorData_h[t][e] = ((floatType)rand()) / RAND_MAX;
206    }
207    // copy input data to device buffers
208    for (int32_t t = 0; t < numInputs; ++t)
209    {
210        HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_d[t], tensorData_h[t], tensorSizes[t], cudaMemcpyHostToDevice));
211    }
212
213    /*************************
214     * cuTensorNet
215     *************************/
216
217    cudaStream_t stream;
218    HANDLE_CUDA_ERROR(cudaStreamCreate(&stream));
219
220    cutensornetHandle_t handle;
221    HANDLE_ERROR(cutensornetCreate(&handle));
222
223    if (verbose) printf("Allocated GPU memory for data, initialized data, and created library handle\n");

Mark constant tensors and create the network descriptor

Next, we specify which input tensors are constant and build the network by appending the input tensors with their modes, extents, and qualifiers (e.g., constant), setting the output tensor, and choosing the data and compute types. Note that the created library handle is associated with the currently active GPU.

226    /*******************************
227     * Set constant input tensors
228     *******************************/
229
230    // specify which input tensors are constant
231    std::vector<cutensornetTensorQualifiers_t> qualifiersIn(numInputs, cutensornetTensorQualifiers_t{0, 0, 0}); // fields: {isConjugate, isConstant, requiresGradient}
232    for (int i = 0; i < numInputs; ++i)
233    {
234        qualifiersIn[i].isConstant = i < (numInputs - 1) ? 1 : 0;
235    }
236
237    /*******************************
238     * Create Network
239     *******************************/
240
241    // Set up tensor network
242    cutensornetNetworkDescriptor_t networkDesc;
243    HANDLE_ERROR(cutensornetCreateNetwork(handle, &networkDesc));
244
245    int64_t tensorIDs[numInputs]; // for input tensors
246    // attach the input tensors to the network
247    for (int32_t t = 0; t < numInputs; ++t)
248    {
249        HANDLE_ERROR(cutensornetNetworkAppendTensor(handle,
250                                                    networkDesc,
251                                                    tensorModes[t].size(),
252                                                    tensorExtents[t].data(),
253                                                    tensorModes[t].data(),
254                                                    &qualifiersIn[t],
255                                                    typeData,
256                                                    &tensorIDs[t]));
257    }
258
259    // set the output tensor
260    HANDLE_ERROR(cutensornetNetworkSetOutputTensor(handle,
261                                                   networkDesc,
262                                                   tensorModes[numInputs].size(),
263                                                   tensorModes[numInputs].data(),
264                                                   typeData));
265
266    // set the network compute type
267    HANDLE_ERROR(cutensornetNetworkSetAttribute(handle,
268                                                networkDesc,
269                                                CUTENSORNET_NETWORK_COMPUTE_TYPE,
270                                                &typeCompute,
271                                                sizeof(typeCompute)));
272    if (verbose) printf("Initialized the cuTensorNet library and created a tensor network\n");

Contraction order and slicing

In this example, we illustrate the use of a predetermined contraction path, which we set on the optimizer info object via cutensornetContractionOptimizerInfoSetAttribute(). We then attach the optimizer info object to the network via cutensornetNetworkSetOptimizerInfo().

275    /*******************************
276     * Choose workspace limit based on available resources.
277     *******************************/
278
279    size_t freeMem, totalMem;
280    HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeMem, &totalMem));
281    uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
282    if (verbose) printf("Workspace limit = %lu\n", workspaceLimit);
283
284    /*******************************
285     * Set contraction order
286     *******************************/
287
288    // Create contraction optimizer info
289    cutensornetContractionOptimizerInfo_t optimizerInfo;
290    HANDLE_ERROR(cutensornetCreateContractionOptimizerInfo(handle, networkDesc, &optimizerInfo));
291
292    // set a predetermined contraction path
293    std::vector<int32_t> path{0, 1, 0, 4, 0, 3, 0, 2, 0, 1};
294    const auto numContractions = numInputs - 1;
295    cutensornetContractionPath_t contPath;
296    contPath.data            = reinterpret_cast<cutensornetNodePair_t*>(const_cast<int32_t*>(path.data()));
297    contPath.numContractions = numContractions;
298
299    // provide user-specified contPath
300    HANDLE_ERROR(cutensornetContractionOptimizerInfoSetAttribute(handle,
301                                                                 optimizerInfo,
302                                                                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
303                                                                 &contPath,
304                                                                 sizeof(contPath)));
305
306    // Attach the optimizer info to the network 
307    HANDLE_ERROR(cutensornetNetworkSetOptimizerInfo(handle,
308                                                    networkDesc,
309                                                    optimizerInfo));
310    int64_t numSlices = 1; // the network is not sliced in this example, hence a single slice
311
312    if (verbose) printf("Set predetermined contraction path into cuTensorNet optimizer\n");
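
As an alternative to supplying a predetermined path, the path can be found automatically by the cuTensorNet pathfinder, using the workspace limit computed above as a size constraint. The following minimal sketch (not part of this sample) would replace the CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH call above; the resulting optimizer info object would then be attached to the network via cutensornetNetworkSetOptimizerInfo() exactly as shown:

    // Sketch: let the cuTensorNet pathfinder determine the contraction path
    cutensornetContractionOptimizerConfig_t optimizerConfig;
    HANDLE_ERROR(cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig));

    // Find an optimized contraction path subject to the workspace size constraint
    HANDLE_ERROR(cutensornetContractionOptimize(handle,
                                                networkDesc,
                                                optimizerConfig,
                                                workspaceLimit,
                                                optimizerInfo));

    // Automatic pathfinding may slice the network; query the resulting slice count
    HANDLE_ERROR(cutensornetContractionOptimizerInfoGetAttribute(handle,
                                                                 optimizerInfo,
                                                                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
                                                                 &numSlices,
                                                                 sizeof(numSlices)));

    HANDLE_ERROR(cutensornetDestroyContractionOptimizerConfig(optimizerConfig));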

Create workspace descriptor and allocate workspace memory

Next, we create a workspace descriptor, compute the workspace sizes, and query the minimum workspace size needed to contract the network. To activate intermediate tensor reuse, we need to provide a CACHE workspace that persists across multiple network contractions. Thus, we query sizes and allocate device memory for both kinds of workspaces (CUTENSORNET_WORKSPACE_SCRATCH and CUTENSORNET_WORKSPACE_CACHE) and set them in the workspace descriptor. The workspace descriptor is then provided to the contraction preparation and execution APIs.

315    /*******************************
316     * Create workspace descriptor, allocate workspace, and set it.
317     *******************************/
318
319    cutensornetWorkspaceDescriptor_t workDesc;
320    HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc));
321
322    // set SCRATCH workspace, which will be used during each network contraction operation, not needed afterwards
323    int64_t requiredWorkspaceSizeScratch = 0;
324    HANDLE_ERROR(cutensornetWorkspaceComputeContractionSizes(handle, networkDesc, optimizerInfo, workDesc));
325
326    HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(handle,
327                                                   workDesc,
328                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
329                                                   CUTENSORNET_MEMSPACE_DEVICE,
330                                                   CUTENSORNET_WORKSPACE_SCRATCH,
331                                                   &requiredWorkspaceSizeScratch));
332
333    void* workScratch = nullptr;
334    HANDLE_CUDA_ERROR(cudaMalloc(&workScratch, requiredWorkspaceSizeScratch));
335
336    HANDLE_ERROR(cutensornetWorkspaceSetMemory(handle,
337                                               workDesc,
338                                               CUTENSORNET_MEMSPACE_DEVICE,
339                                               CUTENSORNET_WORKSPACE_SCRATCH,
340                                               workScratch,
341                                               requiredWorkspaceSizeScratch));
342
343    // set CACHE workspace, which will be used across network contraction operations
344    int64_t requiredWorkspaceSizeCache = 0;
345    HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(handle,
346                                                   workDesc,
347                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
348                                                   CUTENSORNET_MEMSPACE_DEVICE,
349                                                   CUTENSORNET_WORKSPACE_CACHE,
350                                                   &requiredWorkspaceSizeCache));
351
352    void* workCache = nullptr;
353    HANDLE_CUDA_ERROR(cudaMalloc(&workCache, requiredWorkspaceSizeCache));
354
355    HANDLE_ERROR(cutensornetWorkspaceSetMemory(handle,
356                                               workDesc,
357                                               CUTENSORNET_MEMSPACE_DEVICE,
358                                               CUTENSORNET_WORKSPACE_CACHE,
359                                               workCache,
360                                               requiredWorkspaceSizeCache));
361
362    if (verbose) printf("Allocated and set up the GPU workspace\n");
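
The same query can also be issued with CUTENSORNET_WORKSIZE_PREF_RECOMMENDED when more device memory is available. A minimal sketch of this variant (not part of this sample), shown for the CACHE workspace:

    // Sketch: query the recommended (rather than minimum) CACHE workspace size
    int64_t recommendedCacheSize = 0;
    HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_CACHE,
                                                   &recommendedCacheSize));
    // allocate and set this size instead of requiredWorkspaceSizeCache if it fits into free device memory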

Note that it is possible to skip the steps of creating a workspace descriptor and explicitly handling workspace memory by setting a device memory handler, in which case cuTensorNet will implicitly handle workspace memory by allocating/deallocating memory from the provided memory pool. See Memory management API for details.
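
For illustration, the following minimal sketch (not part of this sample) registers such a handler, assuming the stream-ordered allocator (cudaMallocAsync/cudaFreeAsync) of the current device is a suitable memory pool; the exact callback contract is described in the Memory management API section:

    // Sketch: register a device memory handler so that cuTensorNet can draw
    // SCRATCH/CACHE workspace on demand from the stream-ordered CUDA pool
    cutensornetDeviceMemHandler_t memHandler{};
    memHandler.ctx = nullptr;
    memHandler.device_alloc = [](void* ctx, void** ptr, size_t size, cudaStream_t stream) -> int {
        return static_cast<int>(cudaMallocAsync(ptr, size, stream)); // 0 signals success
    };
    memHandler.device_free = [](void* ctx, void* ptr, size_t size, cudaStream_t stream) -> int {
        return static_cast<int>(cudaFreeAsync(ptr, stream)); // 0 signals success
    };
    snprintf(memHandler.name, sizeof(memHandler.name), "cudaMallocAsync handler");
    HANDLE_ERROR(cutensornetSetDeviceMemHandler(handle, &memHandler));
    // With a handler installed, the explicit cudaMalloc/cutensornetWorkspaceSetMemory
    // calls above could be omitted and cuTensorNet would allocate the workspace itself.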

Contraction preparation and auto-tuning

We prepare the tensor network contraction via cutensornetNetworkPrepareContraction(), and set the tensors' data buffers and strides via cutensornetNetworkSetInputTensorMemory() and cutensornetNetworkSetOutputTensorMemory(). Optionally, we can auto-tune the contraction via cutensornetNetworkAutotuneContraction(), such that cuTENSOR selects the best kernel for each pairwise contraction. The prepared network contraction can be reused with many (possibly different) data inputs, avoiding the cost of redundant re-initialization.

365    /*******************************
366     * Prepare the contraction.
367     *******************************/
368
369    HANDLE_ERROR(cutensornetNetworkPrepareContraction(handle,
370                                                      networkDesc,
371                                                      workDesc));
372
373    // set tensor's data buffers and strides
374    for (int32_t t = 0; t < numInputs; ++t)
375    {
376        HANDLE_ERROR(cutensornetNetworkSetInputTensorMemory(handle,
377                                                            networkDesc,
378                                                            tensorIDs[t],
379                                                            tensorData_d[t],
380                                                            NULL));
381    }
382    HANDLE_ERROR(cutensornetNetworkSetOutputTensorMemory(handle,
383                                                         networkDesc,
384                                                         tensorData_d[numInputs],
385                                                         NULL));
386    /*******************************
387     * Optional: Auto-tune the contraction to pick the fastest kernel
388     *           for each pairwise tensor contraction.
389     *******************************/
390    cutensornetNetworkAutotunePreference_t autotunePref;
391    HANDLE_ERROR(cutensornetCreateNetworkAutotunePreference(handle,
392                                                            &autotunePref));
393
394    const int numAutotuningIterations = 5; // may be 0
395    HANDLE_ERROR(cutensornetNetworkAutotunePreferenceSetAttribute(handle,
396                                                                  autotunePref,
397                                                                  CUTENSORNET_NETWORK_AUTOTUNE_MAX_ITERATIONS,
398                                                                  &numAutotuningIterations,
399                                                                  sizeof(numAutotuningIterations)));
400
401    // Auto-tune the network contraction to find the best kernel for each pair-wise contraction
402    HANDLE_ERROR(cutensornetNetworkAutotuneContraction(handle,
403                                                       networkDesc,
404                                                       workDesc,
405                                                       autotunePref,
406                                                       stream));
407
408    HANDLE_ERROR(cutensornetDestroyNetworkAutotunePreference(autotunePref));
409
410    if (verbose) printf("Prepared the network contraction for cuTensorNet and optionally auto-tuned it\n");

Tensor network contraction execution

Finally, we contract the tensor network as many times as needed, possibly with different input data each time. Note that the first network contraction call uses the provided CACHE workspace to store the constant intermediate tensors. Subsequent network contractions reuse the cached data, greatly reducing the computation time.

413    /**********************
414     * Execute the tensor network contraction
415     **********************/
416
417    // Create a cutensornetSliceGroup_t object from a range of slice IDs
418    cutensornetSliceGroup_t sliceGroup{};
419    HANDLE_ERROR(cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup));
420
421    GPUTimer timer{stream};
422    double minTimeCUTENSORNET   = 1e100;
423    double firstTimeCUTENSORNET = 1e100;
424    const int numRuns           = 3; // number of repeats to get stable performance results
425    for (int i = 0; i < numRuns; ++i)
426    {
427        // restore the output tensor on GPU
428        HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_d[numInputs], tensorData_h[numInputs], tensorSizes[numInputs], cudaMemcpyHostToDevice));
429        HANDLE_CUDA_ERROR(cudaDeviceSynchronize());
430
431        /*
432         * Contract all slices of the tensor network
433         */
434        timer.start();
435
436        int32_t accumulateOutput = 0; // output tensor data will be overwritten
437        HANDLE_ERROR(cutensornetNetworkContract(handle,
438                                                networkDesc,
439                                                accumulateOutput,
440                                                workDesc,
441                                                sliceGroup, // alternatively, NULL can also be used to contract over
442                                                            // all slices instead of specifying a sliceGroup object
443                                                stream));
444
445        // Synchronize and measure best timing
446        auto time = timer.seconds();
447        if (i == 0) firstTimeCUTENSORNET = time;
448        minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
449    }
450
451    if (verbose) printf("Contracted the tensor network, each slice used the same prepared contraction\n");
452
453    // Print the 1-norm of the output tensor (verification)
454    HANDLE_CUDA_ERROR(cudaStreamSynchronize(stream));
455    // restore the output tensor on Host
456    HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_h[numInputs], tensorData_d[numInputs], tensorSizes[numInputs], cudaMemcpyDeviceToHost));
457    double norm1 = 0.0;
458    for (int64_t i = 0; i < tensorElements[numInputs]; ++i)
459    {
460        norm1 += std::abs(tensorData_h[numInputs][i]);
461    }
462    if (verbose) printf("Computed the 1-norm of the output tensor: %e\n", norm1);
463
464    /*************************/
465
466    // Query the total Flop count for the tensor network contraction
467    double flops{0.0};
468    HANDLE_ERROR(cutensornetContractionOptimizerInfoGetAttribute(handle,
469                                                                 optimizerInfo,
470                                                                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
471                                                                 &flops,
472                                                                 sizeof(flops)));
473
474    if (verbose)
475    {
476        printf("Number of tensor network slices = %ld\n", numSlices);
477        printf("Network contraction flop cost = %e\n", flops);
478        printf("Tensor network contraction time (ms):\n");
479        printf("\tfirst run (intermediate tensors get cached) = %.3f\n", firstTimeCUTENSORNET * 1000.f);
480        printf("\tsubsequent run (cache reused) = %.3f\n", minTimeCUTENSORNET * 1000.f);
481    }
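
Because only tensor F is non-constant, a realistic workload would refresh F's data between contractions while the cached constant intermediates keep being reused. The following minimal sketch of one such update step (not part of this sample) reuses the variables defined above; since F's device pointer does not change, no further cuTensorNet set-up calls are needed:

    // Sketch: give the non-constant tensor F (the last input) new values and contract again
    const int32_t fIdx = numInputs - 1;
    for (size_t e = 0; e < tensorElements[fIdx]; ++e)
        tensorData_h[fIdx][e] = ((floatType)rand()) / RAND_MAX; // new data for F only
    HANDLE_CUDA_ERROR(cudaMemcpyAsync(tensorData_d[fIdx], tensorData_h[fIdx],
                                      tensorSizes[fIdx], cudaMemcpyHostToDevice, stream));
    // If F were instead moved to a different device buffer, one would call
    // cutensornetNetworkSetInputTensorMemory() again for tensorIDs[fIdx] before contracting.
    HANDLE_ERROR(cutensornetNetworkContract(handle, networkDesc, /*accumulateOutput=*/0,
                                            workDesc, sliceGroup, stream));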

Free resources

After the computation, we free all resources.

484    /***************
485     * Free resources
486     ****************/
487
488    // Free cuTensorNet resources
489    HANDLE_ERROR(cutensornetDestroySliceGroup(sliceGroup));
490    HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
491    HANDLE_ERROR(cutensornetDestroyContractionOptimizerInfo(optimizerInfo));
492    HANDLE_ERROR(cutensornetDestroyNetwork(networkDesc));
493    HANDLE_ERROR(cutensornetDestroy(handle));
494
495    HANDLE_CUDA_ERROR(cudaStreamDestroy(stream));
496
497    // Free Host and GPU memory resources
498    for (int32_t t = 0; t < numInputs + 1; ++t)
499    {
500        if (tensorData_h[t]) free(tensorData_h[t]);
501        if (tensorData_d[t]) cudaFree(tensorData_d[t]);
502    }
503    // Free GPU memory resources
504    if (workScratch) cudaFree(workScratch);
505    if (workCache) cudaFree(workCache);
506    if (verbose) printf("Freed resources and exited\n");
507
508    return 0;
509}