Code example (serial contraction)#

The following code example illustrates the common steps necessary to use cuTensorNet and introduces typical tensor network operations. Specifically, the example performs the following tensor contraction, in which every mode that appears in more than one tensor is summed over (Einstein summation convention) and only the free modes k and l survive in the output:

\[R_{k,l} = A_{a,b,c,d,e,f} B_{b,g,h,e,i,j} C_{m,a,g,f,i,k} D_{l,c,h,d,j,m}\]

The full sample code can be found in the NVIDIA/cuQuantum repository.

Headers and data types#

#include <stdlib.h>
#include <stdio.h>
#include <string.h> // memset (used to zero-initialize the output tensor below)

#include <cmath>    // std::abs overloads for floating-point types (used for the 1-norm below)
#include <unordered_map>
#include <vector>
#include <cassert>

#include <cuda_runtime.h>
#include <cutensornet.h>

#define HANDLE_ERROR(x)                                                                 \
    do {                                                                                \
        const auto err = x;                                                             \
        if (err != CUTENSORNET_STATUS_SUCCESS)                                          \
        {                                                                               \
            printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
            fflush(stdout);                                                             \
            exit(EXIT_FAILURE);                                                         \
        }                                                                               \
    } while (0)

#define HANDLE_CUDA_ERROR(x)                                                          \
    do {                                                                              \
        const auto err = x;                                                           \
        if (err != cudaSuccess)                                                       \
        {                                                                             \
            printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
            fflush(stdout);                                                           \
            exit(EXIT_FAILURE);                                                       \
        }                                                                             \
    } while (0)

// Usage: DEV_ATTR(cudaDevAttrClockRate, deviceId)
// Note: relies on the GNU statement-expression extension (supported by GCC, Clang, and nvcc on Linux)
#define DEV_ATTR(ENUMCONST, DID)                                                   \
    ({ int v;                                                                       \
       HANDLE_CUDA_ERROR(cudaDeviceGetAttribute(&v, ENUMCONST, DID));               \
       v; })


// Simple CUDA-event-based timer for work submitted to a given stream
struct GPUTimer
{
    GPUTimer(cudaStream_t stream) : stream_(stream)
    {
        HANDLE_CUDA_ERROR(cudaEventCreate(&start_));
        HANDLE_CUDA_ERROR(cudaEventCreate(&stop_));
    }

    ~GPUTimer()
    {
        HANDLE_CUDA_ERROR(cudaEventDestroy(start_));
        HANDLE_CUDA_ERROR(cudaEventDestroy(stop_));
    }

    void start() { HANDLE_CUDA_ERROR(cudaEventRecord(start_, stream_)); }

    float seconds()
    {
        HANDLE_CUDA_ERROR(cudaEventRecord(stop_, stream_));
        HANDLE_CUDA_ERROR(cudaEventSynchronize(stop_));
        float time;
        HANDLE_CUDA_ERROR(cudaEventElapsedTime(&time, start_, stop_));
        return time * 1e-3; // cudaEventElapsedTime reports milliseconds
    }

private:
    cudaEvent_t start_, stop_;
    cudaStream_t stream_;
};

int main()
{
    static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

    bool verbose = true;

    // Check cuTensorNet version
    const size_t cuTensornetVersion = cutensornetGetVersion();
    if (verbose) printf("cuTensorNet version: %zu\n", cuTensornetVersion);

    // Set GPU device
    int numDevices{0};
    HANDLE_CUDA_ERROR(cudaGetDeviceCount(&numDevices));
    const int deviceId = 0;
    HANDLE_CUDA_ERROR(cudaSetDevice(deviceId));
    cudaDeviceProp prop;
    HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId));

    if (verbose)
    {
        printf("===== device info ======\n");
        printf("GPU-local-id:%d\n", deviceId);
        printf("GPU-name:%s\n", prop.name);
        printf("GPU-clock:%d\n", DEV_ATTR(cudaDevAttrClockRate, deviceId));
        printf("GPU-memoryClock:%d\n", DEV_ATTR(cudaDevAttrMemoryClockRate, deviceId));
        printf("GPU-nSM:%d\n", prop.multiProcessorCount);
        printf("GPU-major:%d\n", prop.major);
        printf("GPU-minor:%d\n", prop.minor);
        printf("========================\n");
    }

    typedef float floatType;
    cudaDataType_t typeData              = CUDA_R_32F;
    cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;

    if (verbose) printf("Included headers and defined data types\n");
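If double precision is preferred, only the data-type definitions above need to change; the rest of the example is type-agnostic. A minimal sketch of the FP64 variant (shown for illustration, not part of the sample):

    typedef double floatType;                                       // FP64 storage
    cudaDataType_t typeData              = CUDA_R_64F;              // FP64 data
    cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_64F; // FP64 compute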

Define tensor network and tensor sizes#

Next, we define the topology of the tensor network (i.e., the modes of the tensors, their extents, and their connectivity).

    /**************************************************************************************
     * Computing: R_{k,l} = A_{a,b,c,d,e,f} B_{b,g,h,e,i,j} C_{m,a,g,f,i,k} D_{l,c,h,d,j,m}
     **************************************************************************************/

    constexpr int32_t numInputs = 4;

    // Create vectors of tensor modes
    std::vector<std::vector<int32_t>> tensorModes{ // for input tensors & output tensor
        // input tensors
        {'a', 'b', 'c', 'd', 'e', 'f'}, // tensor A
        {'b', 'g', 'h', 'e', 'i', 'j'}, // tensor B
        {'m', 'a', 'g', 'f', 'i', 'k'}, // tensor C
        {'l', 'c', 'h', 'd', 'j', 'm'}, // tensor D
        // output tensor
        {'k', 'l'}, // tensor R
    };

    // Set mode extents
    int64_t sameExtent = 16; // using the same extent for simplicity; in principle, extents can differ
    std::unordered_map<int32_t, int64_t> extent;
    for (auto& vec : tensorModes)
    {
        for (auto& mode : vec)
        {
            extent[mode] = sameExtent;
        }
    }

    // Create a vector of extents for each tensor
    std::vector<std::vector<int64_t>> tensorExtents; // for input tensors & output tensor
    tensorExtents.resize(numInputs + 1);             // hold inputs + output tensors
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        for (auto& mode : tensorModes[t]) tensorExtents[t].push_back(extent[mode]);
    }

    if (verbose) printf("Defined tensor network, modes, and extents\n");
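As a quick sanity check on what these definitions imply (a worked example, not part of the sample): with every extent equal to 16, each 6-mode input tensor holds 16^6 = 16,777,216 elements, i.e., 64 MiB in FP32, while the 2-mode output tensor R holds only 16^2 = 256 elements; the four inputs together account for the roughly 0.25 GiB reported by the storage printout in the next section. In code form:

    // Hedged sanity check of the sizes implied by sameExtent = 16
    constexpr int64_t ext = 16;
    static_assert(ext * ext * ext * ext * ext * ext == 16777216, "elements per 6-mode input tensor");
    static_assert(ext * ext == 256, "elements in the 2-mode output tensor R");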

Allocate memory and initialize data#

Next, we allocate memory for the tensor network operands and initialize them to random values.

    /*****************
     * Allocating data
     *****************/

    std::vector<size_t> tensorElements(numInputs + 1); // for input tensors & output tensor
    std::vector<size_t> tensorSizes(numInputs + 1);    // for input tensors & output tensor
    size_t totalSize = 0;
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        size_t numElements = 1;
        for (auto& mode : tensorModes[t]) numElements *= extent[mode];
        tensorElements[t] = numElements;

        tensorSizes[t] = sizeof(floatType) * numElements;
        totalSize += tensorSizes[t];
    }

    if (verbose) printf("Total GPU memory used for tensor storage: %.2f GiB\n", totalSize / 1024. / 1024. / 1024.);

    void* tensorData_d[numInputs + 1]; // for input tensors & output tensor
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        HANDLE_CUDA_ERROR(cudaMalloc((void**)&tensorData_d[t], tensorSizes[t]));
    }

    floatType* tensorData_h[numInputs + 1]; // for input tensors & output tensor
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        tensorData_h[t] = (floatType*)malloc(tensorSizes[t]);
        if (tensorData_h[t] == NULL)
        {
            printf("Error: Host memory allocation failed!\n");
            return -1;
        }
    }

    /*****************
     * Initialize data
     *****************/

    // init output tensor to all 0s
    memset(tensorData_h[numInputs], 0, tensorSizes[numInputs]);
    // init input tensors to random values
    for (int32_t t = 0; t < numInputs; ++t)
    {
        for (size_t e = 0; e < tensorElements[t]; ++e) tensorData_h[t][e] = ((floatType)rand()) / RAND_MAX;
    }
    // copy input data to device buffers
    for (int32_t t = 0; t < numInputs; ++t)
    {
        HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_d[t], tensorData_h[t], tensorSizes[t], cudaMemcpyHostToDevice));
    }
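As an optional variant (a sketch, under the assumption that transfer time matters for your workload): page-locked (pinned) host buffers generally speed up host-to-device copies, and cudaMallocHost()/cudaFreeHost() can stand in for the malloc()/free() calls above with no other changes. The buffer name below is hypothetical:

    // Hypothetical pinned-memory replacement for one host buffer (illustrative only)
    floatType* pinnedBuf = nullptr;
    HANDLE_CUDA_ERROR(cudaMallocHost((void**)&pinnedBuf, tensorSizes[0]));
    // ... initialize pinnedBuf and cudaMemcpy() it to the device as above ...
    HANDLE_CUDA_ERROR(cudaFreeHost(pinnedBuf));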

cuTensorNet handle and network descriptor#

Next, we initialize the cuTensorNet library via cutensornetCreate(). Note that the created library context is associated with the currently active GPU. We then create the network descriptor and append the input tensors, specifying the desired modes, extents, and data type of each. Optionally, we can set the output tensor modes (if skipped, the output modes are inferred) and the compute type of the network (if skipped, a default compute type corresponding to the data type is used; refer to cutensornetCreateNetwork()).

    /*************
     * cuTensorNet
     *************/

    cudaStream_t stream;
    HANDLE_CUDA_ERROR(cudaStreamCreate(&stream));

    cutensornetHandle_t handle;
    HANDLE_ERROR(cutensornetCreate(&handle));

    if (verbose) printf("Allocated GPU memory for data, initialized data, and created library handle\n");

    /****************
     * Create Network
     ****************/

    // Set up tensor network
    cutensornetNetworkDescriptor_t networkDesc;
    HANDLE_ERROR(cutensornetCreateNetwork(handle, &networkDesc));

    int64_t tensorIDs[numInputs]; // for input tensors
    // attach the input tensors to the network
    for (int32_t t = 0; t < numInputs; ++t)
    {
        HANDLE_ERROR(cutensornetNetworkAppendTensor(handle,
                                                    networkDesc,
                                                    tensorModes[t].size(),
                                                    tensorExtents[t].data(),
                                                    tensorModes[t].data(),
                                                    NULL,
                                                    typeData,
                                                    &tensorIDs[t]));
    }

    // set the output tensor
    HANDLE_ERROR(cutensornetNetworkSetOutputTensor(handle,
                                                   networkDesc,
                                                   tensorModes[numInputs].size(),
                                                   tensorModes[numInputs].data(),
                                                   typeData));

    // set the network compute type
    HANDLE_ERROR(cutensornetNetworkSetAttribute(handle,
                                                networkDesc,
                                                CUTENSORNET_NETWORK_COMPUTE_TYPE,
                                                &typeCompute,
                                                sizeof(typeCompute)));
    if (verbose) printf("Initialized the cuTensorNet library and created a tensor network descriptor\n");

Optimal contraction order and slicing#

At this stage, we can deploy the cuTensorNet optimizer to find an optimized contraction path and slicing combination. We choose a limit for the workspace needed to perform the contraction based on the available memory resources, and provide it to the optimizer as a constraint. We then create an optimizer configuration object of type cutensornetContractionOptimizerConfig_t to specify various optimizer options and provide it to the optimizer, which is invoked via cutensornetContractionOptimize(). The results from the optimizer will be returned in an optimizer info object of type cutensornetContractionOptimizerInfo_t.

    /******************************************************
     * Choose workspace limit based on available resources.
     ******************************************************/

    size_t freeMem, totalMem;
    HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeMem, &totalMem));
    uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
    if (verbose) printf("Workspace limit = %lu\n", workspaceLimit);

    /***********************************************
     * Find "optimal" contraction order and slicing
     ***********************************************/

    cutensornetContractionOptimizerConfig_t optimizerConfig;
    HANDLE_ERROR(cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig));

    // Set the desired number of hyper-samples (defaults to 0)
    int32_t num_hypersamples = 8;
    HANDLE_ERROR(cutensornetContractionOptimizerConfigSetAttribute(handle,
                                                                   optimizerConfig,
                                                                   CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_HYPER_NUM_SAMPLES,
                                                                   &num_hypersamples,
                                                                   sizeof(num_hypersamples)));

    // Create contraction optimizer info and find an optimized contraction path
    cutensornetContractionOptimizerInfo_t optimizerInfo;
    HANDLE_ERROR(cutensornetCreateContractionOptimizerInfo(handle, networkDesc, &optimizerInfo));

    HANDLE_ERROR(cutensornetContractionOptimize(handle,
                                                networkDesc,
                                                optimizerConfig,
                                                workspaceLimit,
                                                optimizerInfo));

    // Query the number of slices the tensor network execution will be split into
    int64_t numSlices = 0;
    HANDLE_ERROR(cutensornetContractionOptimizerInfoGetAttribute(handle,
                                                                 optimizerInfo,
                                                                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
                                                                 &numSlices,
                                                                 sizeof(numSlices)));
    assert(numSlices > 0);

    if (verbose) printf("Found an optimized contraction path using cuTensorNet optimizer\n");
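Other optimizer-info attributes can be queried in the same way. As one example (a sketch; the attribute below reports the element count of the largest intermediate tensor, which is useful for estimating peak memory pressure):

    // Query the size (in elements) of the largest intermediate tensor
    double largestTensor{0.0};
    HANDLE_ERROR(cutensornetContractionOptimizerInfoGetAttribute(handle,
                                                                 optimizerInfo,
                                                                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_LARGEST_TENSOR,
                                                                 &largestTensor,
                                                                 sizeof(largestTensor)));
    if (verbose) printf("Largest intermediate tensor (elements) = %e\n", largestTensor);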

It is also possible to bypass the cuTensorNet optimizer and import a predetermined contraction path, together with slicing information, directly into the optimizer info object via cutensornetContractionOptimizerInfoSetAttribute(), and then attach it to the network via cutensornetNetworkSetOptimizerInfo().
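For illustration, a minimal sketch of importing a user-defined path (hedged: the pairwise order below is arbitrary rather than optimal, and each (first, second) pair indexes the current list of tensors, with the produced intermediate appended to the end, as in the numpy/opt_einsum path convention):

    // Contract (A,B), then (C,D), then the two intermediates (illustrative order)
    cutensornetNodePair_t pairs[numInputs - 1] = {{0, 1}, {0, 1}, {0, 1}};
    cutensornetContractionPath_t path{numInputs - 1, pairs};
    HANDLE_ERROR(cutensornetContractionOptimizerInfoSetAttribute(handle,
                                                                 optimizerInfo,
                                                                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
                                                                 &path,
                                                                 sizeof(path)));
    HANDLE_ERROR(cutensornetNetworkSetOptimizerInfo(handle, networkDesc, optimizerInfo));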

Create workspace descriptor and allocate workspace memory#

Next, we create a workspace descriptor, compute the workspace sizes, and query the minimum workspace size needed to contract the network. We then allocate device memory for the workspace and set it in the workspace descriptor. The workspace descriptor will be provided to the contraction preparation and computation calls.

    /***************************************************************
     * Create workspace descriptor, allocate workspace, and set it.
     ***************************************************************/

    cutensornetWorkspaceDescriptor_t workDesc;
    HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc));

    int64_t requiredWorkspaceSize = 0;
    HANDLE_ERROR(cutensornetWorkspaceComputeContractionSizes(handle,
                                                             networkDesc,
                                                             optimizerInfo,
                                                             workDesc));

    HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_SCRATCH,
                                                   &requiredWorkspaceSize));

    void* work = nullptr;
    HANDLE_CUDA_ERROR(cudaMalloc(&work, requiredWorkspaceSize));

    HANDLE_ERROR(cutensornetWorkspaceSetMemory(handle,
                                               workDesc,
                                               CUTENSORNET_MEMSPACE_DEVICE,
                                               CUTENSORNET_WORKSPACE_SCRATCH,
                                               work,
                                               requiredWorkspaceSize));

    if (verbose) printf("Allocated and set up the GPU workspace\n");
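Alternatively, a device memory handler can be registered with the library handle, after which cuTensorNet can draw scratch workspace from a memory pool on demand and the explicit cudaMalloc() above can be dropped. A minimal sketch (hedged: it assumes CUDA's stream-ordered allocator, and the callback names are illustrative):

    // Illustrative file-scope callbacks mapping the handler interface onto
    // CUDA's stream-ordered allocator (return 0 on success)
    static int poolAlloc(void* ctx, void** ptr, size_t size, cudaStream_t stream)
    {
        return static_cast<int>(cudaMallocAsync(ptr, size, stream));
    }
    static int poolFree(void* ctx, void* ptr, size_t size, cudaStream_t stream)
    {
        return static_cast<int>(cudaFreeAsync(ptr, stream));
    }

    // Inside main(), after creating the library handle:
    cutensornetDeviceMemHandler_t memHandler{};
    memHandler.ctx          = nullptr;
    memHandler.device_alloc = poolAlloc;
    memHandler.device_free  = poolFree;
    snprintf(memHandler.name, CUTENSORNET_ALLOCATOR_NAME_LEN, "cudaMallocAsync");
    HANDLE_ERROR(cutensornetSetDeviceMemHandler(handle, &memHandler));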

Contraction preparation and auto-tuning#

We prepare the tensor network contraction via cutensornetNetworkPrepareContraction() and set the tensors' data buffers and strides via cutensornetNetworkSetInputTensorMemory() and cutensornetNetworkSetOutputTensorMemory(). Optionally, we can auto-tune the contraction via cutensornetNetworkAutotuneContraction(), so that cuTENSOR selects the best kernel for each pairwise contraction. Once prepared, the network contraction can be reused for many (possibly different) data inputs, avoiding the cost of redundant initialization.

    /**************************
     * Prepare the contraction.
     **************************/

    HANDLE_ERROR(cutensornetNetworkPrepareContraction(handle,
                                                      networkDesc,
                                                      workDesc));

    // set the tensors' data buffers and strides (NULL strides select the default dense layout)
    for (int32_t t = 0; t < numInputs; ++t)
    {
        HANDLE_ERROR(cutensornetNetworkSetInputTensorMemory(handle,
                                                            networkDesc,
                                                            tensorIDs[t],
                                                            tensorData_d[t],
                                                            NULL));
    }
    HANDLE_ERROR(cutensornetNetworkSetOutputTensorMemory(handle,
                                                         networkDesc,
                                                         tensorData_d[numInputs],
                                                         NULL));

    /****************************************************************
     * Optional: Auto-tune the contraction to pick the fastest kernel
     *           for each pairwise tensor contraction.
     ****************************************************************/
    cutensornetNetworkAutotunePreference_t autotunePref;
    HANDLE_ERROR(cutensornetCreateNetworkAutotunePreference(handle,
                                                            &autotunePref));

    const int numAutotuningIterations = 5; // may be 0
    HANDLE_ERROR(cutensornetNetworkAutotunePreferenceSetAttribute(handle,
                                                                  autotunePref,
                                                                  CUTENSORNET_NETWORK_AUTOTUNE_MAX_ITERATIONS,
                                                                  &numAutotuningIterations,
                                                                  sizeof(numAutotuningIterations)));

    // Auto-tune the network to find the best kernel for each pair-wise contraction
    HANDLE_ERROR(cutensornetNetworkAutotuneContraction(handle,
                                                       networkDesc,
                                                       workDesc,
                                                       autotunePref,
                                                       stream));

    HANDLE_ERROR(cutensornetDestroyNetworkAutotunePreference(autotunePref));

    if (verbose) printf("Prepared the network contraction for cuTensorNet and optionally auto-tuned it\n");
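Because the preparation step is independent of the actual tensor data, fresh input buffers can be bound to the same prepared network before a later contraction call. A short sketch (newData_d is a hypothetical replacement device buffer with the same extents as tensor A):

    // Rebind new data for the first input tensor; the prepared plan is reused as-is
    HANDLE_ERROR(cutensornetNetworkSetInputTensorMemory(handle,
                                                        networkDesc,
                                                        tensorIDs[0],
                                                        newData_d, // hypothetical buffer
                                                        NULL));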

Tensor network contraction execution#

Finally, we contract the tensor network as many times as needed via cutensornetNetworkContract(), possibly with different input data each time (rebind the data buffers and strides via cutensornetNetworkSetInputTensorMemory()). Tensor network slices, captured as a cutensornetSliceGroup_t object, are computed using the same prepared network contraction. For convenience, NULL can be passed to cutensornetNetworkContract() instead of a slice group when the goal is to contract all slices of the network. Lastly, we clean up and free all allocated resources.

    /****************************************
     * Execute the tensor network contraction
     ****************************************/

    // Create a cutensornetSliceGroup_t object from a range of slice IDs
    cutensornetSliceGroup_t sliceGroup{};
    HANDLE_ERROR(cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup));

    GPUTimer timer{stream};
    double minTimeCUTENSORNET = 1e100;
    const int numRuns         = 3; // number of repeats to get stable performance results
    for (int i = 0; i < numRuns; ++i)
    {
        // reset the output tensor data
        HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_d[numInputs], tensorData_h[numInputs], tensorSizes[numInputs], cudaMemcpyHostToDevice));
        HANDLE_CUDA_ERROR(cudaDeviceSynchronize());

        /*
         * Contract all slices of the tensor network
         */
        timer.start();

        int32_t accumulateOutput = 0; // output tensor data will be overwritten
        HANDLE_ERROR(cutensornetNetworkContract(handle,
                                                networkDesc,
                                                accumulateOutput,
                                                workDesc,
                                                sliceGroup, // alternatively, NULL can be used to contract over all slices instead of specifying a sliceGroup object
                                                stream));

        // Synchronize and keep the best (smallest) timing
        auto time          = timer.seconds();
        minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
    }

    if (verbose) printf("Contracted the tensor network, each slice used the same prepared contraction\n");

    // Print the 1-norm of the output tensor (verification)
    HANDLE_CUDA_ERROR(cudaStreamSynchronize(stream));
    // restore the output tensor on Host
    HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_h[numInputs], tensorData_d[numInputs], tensorSizes[numInputs], cudaMemcpyDeviceToHost));
    double norm1 = 0.0;
    for (size_t i = 0; i < tensorElements[numInputs]; ++i)
    {
        norm1 += std::abs(tensorData_h[numInputs][i]);
    }
    if (verbose) printf("Computed the 1-norm of the output tensor: %e\n", norm1);

    /*************************/

    // Query the total Flop count for the tensor network contraction
    double flops{0.0};
    HANDLE_ERROR(cutensornetContractionOptimizerInfoGetAttribute(handle,
                                                                 optimizerInfo,
                                                                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
                                                                 &flops,
                                                                 sizeof(flops)));

    if (verbose)
    {
        printf("Number of tensor network slices = %ld\n", numSlices);
        printf("Tensor network contraction time (ms) = %.3f\n", minTimeCUTENSORNET * 1000.f);
    }

    /****************
     * Free resources
     ****************/

    // Free cuTensorNet resources
    HANDLE_ERROR(cutensornetDestroySliceGroup(sliceGroup));
    HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
    HANDLE_ERROR(cutensornetDestroyContractionOptimizerInfo(optimizerInfo));
    HANDLE_ERROR(cutensornetDestroyContractionOptimizerConfig(optimizerConfig));
    HANDLE_ERROR(cutensornetDestroyNetwork(networkDesc));
    HANDLE_ERROR(cutensornetDestroy(handle));

    HANDLE_CUDA_ERROR(cudaStreamDestroy(stream));

    // Free Host and GPU memory resources
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        if (tensorData_h[t]) free(tensorData_h[t]);
        if (tensorData_d[t]) cudaFree(tensorData_d[t]);
    }
    if (work) cudaFree(work);

    if (verbose) printf("Freed resources and exited\n");

    return 0;
}
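The Flop count queried above and the best measured time can also be combined into an effective contraction rate. A small illustrative addition that would go just before the resources are freed:

    // Effective rate of the best run (flops from the optimizer-info query,
    // minTimeCUTENSORNET in seconds)
    if (verbose && minTimeCUTENSORNET > 0.0)
        printf("Effective contraction rate (GFLOP/s) = %.3f\n", flops / minTimeCUTENSORNET / 1e9);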

Recall that the full sample code can be found in the NVIDIA/cuQuantum repository.