Getting Started

In this section, we show how to contract a tensor network using cuTensorNet. First, we describe how to install the library and how to compile a sample code. Then, we present the example code used to perform common steps in cuTensorNet. In this example, we perform the following tensor contraction:

\[D_{m,x,n,y} = A_{m,h,k,n} B_{u,k,h} C_{x,u,y}\]

We build the code up step by step, each step adding code at the end. The steps are separated by succinct multi-line comment blocks.

It is recommended that the reader refers to Overview and cuTENSOR documentation for familiarity with the nomenclature and cuTENSOR operations.

Installation and Compilation

Download the cuQuantum package (which cuTensorNet is part of) from https://developer.nvidia.com/cuQuantum-downloads, and the cuTENSOR package from https://developer.nvidia.com/cutensor.

Linux

Assuming cuQuantum has been extracted in CUQUANTUM_ROOT and cuTENSOR in CUTENSOR_ROOT we update the library path as follows:

export LD_LIBRARY_PATH=${CUQUANTUM_ROOT}/lib:${CUTENSOR_ROOT}/lib/11:${LD_LIBRARY_PATH}

Depending on your CUDA Toolkit, you might have to choose a different library version (e.g., ${CUTENSOR_ROOT}/lib/11.0).

The sample code discussed below (tensornet_example.cu) can be compiled via the following command:

nvcc tensornet_example.cu -I${CUQUANTUM_ROOT}/include -I${CUTENSOR_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUTENSOR_ROOT}/lib/11 -lcutensornet -lcutensor -o tensornet_example

For statically linking against the cuTensorNet library, use the following command (note that libmetis_static.a needs to be explicitly linked against, assuming it is installed through the NVIDIA CUDA Toolkit and accessible through $LIBRARY_PATH):

nvcc tensornet_example.cu -I${CUQUANTUM_ROOT}/include -I${CUTENSOR_ROOT}/include ${CUQUANTUM_ROOT}/lib/libcutensornet_static.a -L${CUTENSOR_DIR}/lib/11 -lcutensor libmetis_static.a -o tensornet_example

Note

Depending on the source of the cuQuantum package, you may need to replace lib above by lib64.

Code Example

The following code example shows the common steps necessary to use cuTensorNet and performs some typical operations in tensor network contraction. The full sample code can be found in the NVIDIA/cuQuantum repository (here).

Headers and Data Types

 8#include <stdlib.h>
 9#include <stdio.h>
10
11#include <unordered_map>
12#include <vector>
13#include <cassert>
14
15#include <cuda_runtime.h>
16#include <cutensornet.h>
17#include <cutensor.h>
18
19#define HANDLE_ERROR(x)                                               \
20{ const auto err = x;                                                 \
21if( err != CUTENSORNET_STATUS_SUCCESS )                                \
22{ printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); return err; } \
23};
24
25#define HANDLE_CUDA_ERROR(x)                                      \
26{  const auto err = x;                                             \
27   if( err != cudaSuccess )                                        \
28   { printf("Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); return err; } \
29};
30
31struct GPUTimer
32{
33   GPUTimer(cudaStream_t stream): stream_(stream)
34   {
35      cudaEventCreate(&start_);
36      cudaEventCreate(&stop_);
37   }
38
39   ~GPUTimer()
40   {
41      cudaEventDestroy(start_);
42      cudaEventDestroy(stop_);
43   }
44
45   void start()
46   {
47      cudaEventRecord(start_, stream_);
48   }
49
50   float seconds()
51   {
52      cudaEventRecord(stop_, stream_);
53      cudaEventSynchronize(stop_);
54      float time;
55      cudaEventElapsedTime(&time, start_, stop_);
56      return time * 1e-3;
57   }
58
59   private:
60   cudaEvent_t start_, stop_;
61   cudaStream_t stream_;
62};
63
64
65int main()
66{
67   const size_t cuTensornetVersion = cutensornetGetVersion();
68   printf("cuTensorNet-vers:%ld\n",cuTensornetVersion);
69
70   cudaDeviceProp prop;
71   int32_t deviceId = -1;
72   HANDLE_CUDA_ERROR( cudaGetDevice(&deviceId) );
73   HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );
74
75   printf("===== device info ======\n");
76   printf("GPU-name:%s\n", prop.name);
77   printf("GPU-clock:%d\n", prop.clockRate);
78   printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
79   printf("GPU-nSM:%d\n", prop.multiProcessorCount);
80   printf("GPU-major:%d\n", prop.major);
81   printf("GPU-minor:%d\n", prop.minor);
82   printf("========================\n");
83
84   typedef float floatType;
85   cudaDataType_t typeData = CUDA_R_32F;
86   cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;
87
88   printf("Include headers and define data types\n");

Define Tensor Network and Tensor Sizes

Next, we define the topology of the tensor network (i.e., the modes of the tensors, their extents, and their connectivity).

 91   /**********************
 92   * Computing: D_{m,x,n,y} = A_{m,h,k,n} B_{u,k,h} C_{x,u,y}
 93   **********************/
 94
 95   constexpr int32_t numInputs = 3;
 96
 97   // Create vector of modes
 98   std::vector<int32_t> modesA{'m','h','k','n'};
 99   std::vector<int32_t> modesB{'u','k','h'};
100   std::vector<int32_t> modesC{'x','u','y'};
101   std::vector<int32_t> modesD{'m','x','n','y'};
102
103   // Extents
104   std::unordered_map<int32_t, int64_t> extent;
105   extent['m'] = 96;
106   extent['n'] = 96;
107   extent['u'] = 96;
108   extent['h'] = 64;
109   extent['k'] = 64;
110   extent['x'] = 64;
111   extent['y'] = 64;
112
113   // Create a vector of extents for each tensor
114   std::vector<int64_t> extentA;
115   for (auto mode : modesA)
116      extentA.push_back(extent[mode]);
117   std::vector<int64_t> extentB;
118   for (auto mode : modesB)
119      extentB.push_back(extent[mode]);
120   std::vector<int64_t> extentC;
121   for (auto mode : modesC)
122      extentC.push_back(extent[mode]);
123   std::vector<int64_t> extentD;
124   for (auto mode : modesD)
125      extentD.push_back(extent[mode]);
126
127   printf("Define network, modes, and extents\n");

Allocate memory and initialize data

Next, we allocate memory for data and workspace, and randomly initialize the data.

130   /**********************
131   * Allocating data
132   **********************/
133
134   size_t elementsA = 1;
135   for (auto mode : modesA)
136      elementsA *= extent[mode];
137   size_t elementsB = 1;
138   for (auto mode : modesB)
139      elementsB *= extent[mode];
140   size_t elementsC = 1;
141   for (auto mode : modesC)
142      elementsC *= extent[mode];
143   size_t elementsD = 1;
144   for (auto mode : modesD)
145      elementsD *= extent[mode];
146
147   size_t sizeA = sizeof(floatType) * elementsA;
148   size_t sizeB = sizeof(floatType) * elementsB;
149   size_t sizeC = sizeof(floatType) * elementsC;
150   size_t sizeD = sizeof(floatType) * elementsD;
151   printf("Total memory: %.2f GiB\n", (sizeA + sizeB + sizeC + sizeD)/1024./1024./1024);
152
153   void* rawDataIn_d[numInputs];
154   void* D_d;
155   HANDLE_CUDA_ERROR(cudaMalloc((void**) &rawDataIn_d[0], sizeA));
156   HANDLE_CUDA_ERROR(cudaMalloc((void**) &rawDataIn_d[1], sizeB));
157   HANDLE_CUDA_ERROR(cudaMalloc((void**) &rawDataIn_d[2], sizeC));
158   HANDLE_CUDA_ERROR(cudaMalloc((void**) &D_d, sizeD));
159
160   floatType *A = (floatType*) malloc(sizeof(floatType) * elementsA);
161   floatType *B = (floatType*) malloc(sizeof(floatType) * elementsB);
162   floatType *C = (floatType*) malloc(sizeof(floatType) * elementsC);
163   floatType *D = (floatType*) malloc(sizeof(floatType) * elementsD);
164
165   if (A == NULL || B == NULL || C == NULL || D == NULL)
166   {
167      printf("Error: Host allocation of A or C.\n");
168      return -1;
169
170   }
171   /**********************
172   * Allocate workspace
173   **********************/
174
175   size_t freeMem, totalMem;
176   HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem ));
177
178   uint64_t worksize = freeMem * 0.9;
179
180   void *work = nullptr;
181   HANDLE_CUDA_ERROR( cudaMalloc(&work, worksize) );
182
183   /*******************
184   * Initialize data
185   *******************/
186
187   for (uint64_t i = 0; i < elementsA; i++)
188      A[i] = ((float) rand())/RAND_MAX;
189   for (uint64_t i = 0; i < elementsB; i++)
190      B[i] = ((float) rand())/RAND_MAX;
191   for (uint64_t i = 0; i < elementsC; i++)
192      C[i] = ((float) rand())/RAND_MAX;
193   memset(D, 0, sizeof(floatType) * elementsD);
194
195   HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[0], A, sizeA, cudaMemcpyHostToDevice));
196   HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[1], B, sizeB, cudaMemcpyHostToDevice));
197   HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[2], C, sizeC, cudaMemcpyHostToDevice));
198
199   printf("Allocate memory for data and workspace, and initialize data.\n");

cuTensorNet handle and Network Descriptor

Next, we initialize the cuTensorNet library via cutensornetCreate() and create the network descriptor with the desired tensor modes, extents, and strides, as well as the data and compute types.

202   /*************************
203   * cuTensorNet
204   *************************/
205
206   cudaStream_t stream;
207   cudaStreamCreate(&stream);
208
209   cutensornetHandle_t handle;
210   HANDLE_ERROR(cutensornetCreate(&handle));
211
212   const int32_t nmodeA = modesA.size();
213   const int32_t nmodeB = modesB.size();
214   const int32_t nmodeC = modesC.size();
215   const int32_t nmodeD = modesD.size();
216
217   /*******************************
218   * Create Network Descriptor
219   *******************************/
220
221   const int32_t* modesIn[] = {modesA.data(), modesB.data(), modesC.data()};
222   int32_t const numModesIn[] = {nmodeA, nmodeB, nmodeC};
223   const int64_t* extentsIn[] = {extentA.data(), extentB.data(), extentC.data()};
224   const int64_t* stridesIn[] = {NULL, NULL, NULL}; // strides are optional; if no stride is provided, then cuTensorNet assumes a generalized column-major data layout
225
226   // Notice that pointers are allocated via cudaMalloc are aligned to 256 byte
227   // boundaries by default; however here we're checking the pointer alignment explicitly
228   // to demonstrate how one would check the alginment for arbitrary pointers.
229
230   auto getMaximalPointerAlignment = [](const void* ptr) {
231      const uint64_t ptrAddr  = reinterpret_cast<uint64_t>(ptr);
232      uint32_t alignment = 1;
233      while(ptrAddr % alignment == 0 &&
234            alignment < 256) // at the latest we terminate once the alignment reached 256 bytes (we could be going, but any alignment larger or equal to 256 is equally fine)
235      {
236         alignment *= 2;
237      }
238      return alignment;
239   };
240   const uint32_t alignmentsIn[] = {getMaximalPointerAlignment(rawDataIn_d[0]),
241                                    getMaximalPointerAlignment(rawDataIn_d[1]),
242                                    getMaximalPointerAlignment(rawDataIn_d[2])};
243   const uint32_t alignmentOut = getMaximalPointerAlignment(D_d);
244
245   // setup tensor network
246   cutensornetNetworkDescriptor_t descNet;
247   HANDLE_ERROR (cutensornetCreateNetworkDescriptor(handle,
248                                                numInputs, numModesIn, extentsIn, stridesIn, modesIn, alignmentsIn,
249                                                nmodeD, extentD.data(), /*stridesOut = */NULL, modesD.data(), alignmentOut,
250                                                typeData, typeCompute,
251                                                &descNet));
252
253   printf("Initialize the cuTensorNet library and create a network descriptor.\n");

Optimal contraction order and slicing

At this stage, we can deploy the cuTensorNet optimizer to find an optimized contraction path and slicing combination. It is also possible to feed a pre-determined path into the configuration. We use the cutensornetContractionOptimize() function after creating a config and info structures, which allow us to specify various options to the optimizer.

256   /*******************************
257   * Find "optimal" contraction order and slicing
258   *******************************/
259
260   cutensornetContractionOptimizerConfig_t optimizerConfig;
261   HANDLE_ERROR (cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig));
262
263   // Set the value of the partitioner imbalance factor, if desired
264   int imbalance_factor = 30;
265   HANDLE_ERROR(cutensornetContractionOptimizerConfigSetAttribute(
266                                                               handle,
267                                                               optimizerConfig,
268                                                               CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_IMBALANCE_FACTOR,
269                                                               &imbalance_factor,
270                                                               sizeof(imbalance_factor)));
271
272   cutensornetContractionOptimizerInfo_t optimizerInfo;
273   HANDLE_ERROR (cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo));
274
275   HANDLE_ERROR (cutensornetContractionOptimize(handle,
276                                             descNet,
277                                             optimizerConfig,
278                                             worksize,
279                                             optimizerInfo));
280
281   int64_t numSlices = 0;
282   HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
283               handle,
284               optimizerInfo,
285               CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
286               &numSlices,
287               sizeof(numSlices)));
288
289   assert(numSlices > 0);
290
291   printf("Find an optimized contraction path with cuTensorNet optimizer.\n");

Contraction plan and auto-tune

We create a contraction plan holding pair-wise contraction plans for cuTENSOR. Optionally, we can auto-tune the plan such that cuTENSOR selects the best kernel for each contraction. This contraction plan can be reused for many (possibly different) data inputs, avoiding the cost of initializing this plan redundantly. We also create a workspace descriptor, compute and query the minimum workspace size needed to contract the network, and provide this to the contraction plan.

294   /*******************************
295   * Initialize all pair-wise contraction plans (for cuTENSOR)
296   *******************************/
297   cutensornetContractionPlan_t plan;
298
299   cutensornetWorkspaceDescriptor_t workDesc;
300   HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc));
301
302   uint64_t requiredWorkspaceSize = 0;
303   HANDLE_ERROR(cutensornetWorkspaceComputeSizes(handle,
304                                          descNet,
305                                          optimizerInfo,
306                                          workDesc));
307
308   HANDLE_ERROR(cutensornetWorkspaceGetSize(handle,
309                                         workDesc,
310                                         CUTENSORNET_WORKSIZE_PREF_MIN,
311                                         CUTENSORNET_MEMSPACE_DEVICE,
312                                         &requiredWorkspaceSize));
313   if (worksize < requiredWorkspaceSize)
314   {
315      printf("Not enough workspace memory is available.");
316   }
317   else
318   {
319      HANDLE_ERROR (cutensornetWorkspaceSet(handle,
320                                            workDesc,
321                                            CUTENSORNET_MEMSPACE_DEVICE,
322                                            work,
323                                            worksize));
324                                          
325      HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
326                                                     descNet,
327                                                     optimizerInfo,
328                                                     workDesc,
329                                                     &plan) );
330
331      /*******************************
332      * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel
333      *******************************/
334      cutensornetContractionAutotunePreference_t autotunePref;
335      HANDLE_ERROR(cutensornetCreateContractionAutotunePreference(handle,
336                              &autotunePref));
337
338      const int numAutotuningIterations = 5; // may be 0
339      HANDLE_ERROR(cutensornetContractionAutotunePreferenceSetAttribute(
340                              handle,
341                              autotunePref,
342                              CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
343                              &numAutotuningIterations,
344                              sizeof(numAutotuningIterations)));
345
346      // modify the plan again to find the best pair-wise contractions
347      HANDLE_ERROR(cutensornetContractionAutotune(handle,
348                              plan,
349                              rawDataIn_d,
350                              D_d,
351                              workDesc,
352                              autotunePref,
353                              stream));
354
355      HANDLE_ERROR(cutensornetDestroyContractionAutotunePreference(autotunePref));
356
357      printf("Create a contraction plan for cuTENSOR and optionally auto-tune it.\n");

Network contraction execution

Finally, we contract the network as many times as needed, possibly with different data. Network slices can be executed using the same contraction plan, potentially on different devices. We also clean up and free allocated resources.

360      /**********************
361      * Run
362      **********************/
363      GPUTimer timer{stream};
364      double minTimeCUTENSOR = 1e100;
365      const int numRuns = 3; // to get stable perf results
366      for (int i=0; i < numRuns; ++i)
367      {
368         cudaMemcpy(D_d, D, sizeD, cudaMemcpyHostToDevice); // restore output
369         cudaDeviceSynchronize();
370
371         /*
372         * Contract over all slices.
373         *
374         * A user may choose to parallelize this loop across multiple devices.
375         * (Note, however, that as of cuTensorNet v1.0.0 the contraction must
376         * start from slice 0, see the cutensornetContraction documentation at
377         * https://docs.nvidia.com/cuda/cuquantum/cutensornet/api/functions.html#cutensornetcontraction )
378         */
379         for(int64_t sliceId=0; sliceId < numSlices; ++sliceId)
380         {
381            timer.start();
382
383            HANDLE_ERROR(cutensornetContraction(handle,
384                                    plan,
385                                    rawDataIn_d,
386                                    D_d,
387                                    workDesc, sliceId, stream));
388
389            // Synchronize and measure timing
390            auto time = timer.seconds();
391            minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
392         }
393      }
394
395      printf("Contract the network, each slice uses the same contraction plan.\n");
396
397      /*************************/
398
399      double flops = -1;
400
401      HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
402                  handle,
403                  optimizerInfo,
404                  CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
405                  &flops,
406                  sizeof(flops)));
407
408      printf("numSlices: %ld\n", numSlices);
409      printf("%.2f ms / slice\n", minTimeCUTENSOR * 1000.f);
410      printf("%.2f GFLOPS/s\n", flops/1e9/minTimeCUTENSOR );
411   }
412
413   HANDLE_ERROR(cutensornetDestroy(handle));
414   HANDLE_ERROR(cutensornetDestroyNetworkDescriptor(descNet));
415   HANDLE_ERROR(cutensornetDestroyContractionPlan(plan));
416   HANDLE_ERROR(cutensornetDestroyContractionOptimizerConfig(optimizerConfig));
417   HANDLE_ERROR(cutensornetDestroyContractionOptimizerInfo(optimizerInfo));
418   HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
419
420   if (A) free(A);
421   if (B) free(B);
422   if (C) free(C);
423   if (D) free(D);
424   if (rawDataIn_d[0]) cudaFree(rawDataIn_d[0]);
425   if (rawDataIn_d[1]) cudaFree(rawDataIn_d[1]);
426   if (rawDataIn_d[2]) cudaFree(rawDataIn_d[2]);
427   if (D_d) cudaFree(D_d);
428   if (work) cudaFree(work);
429
430   printf("Free resource and exit.\n");
431
432   return 0;
433}

Recall that the full sample code can be found in the NVIDIA/cuQuantum repository (here).

Useful tips

  • For debugging, the environment variable CUTENSORNET_LOG_LEVEL=n can be set. The level n = 0, 1, …, 5 corresponds to the logger level as described and used in cutensornetLoggerSetLevel(). The environment variable CUTENSORNET_LOG_FILE=<filepath> can be used to direct the log output to a custom file at <filepath> instead of stdout.