Getting Started

In this section, we show how to contract a tensor network using cuTensorNet. First, we describe how to install the library and how to compile a sample code. Then, we present an example code that performs the common steps in cuTensorNet. In this example, we perform the following tensor contraction:

\[D_{m,x,n,y} = A_{m,h,k,n} B_{u,k,h} C_{x,u,y}\]

We build the code up step by step, with each step appending code to the end of the previous one. The steps are separated by comments consisting of multiple stars.

It is recommended to refer to the Overview section and the cuTENSOR documentation to become familiar with the nomenclature and the cuTENSOR operations.

Installation and Compilation

Download the cuQuantum package (which cuTensorNet is part of) from https://developer.nvidia.com/cuQuantum-downloads, and the cuTENSOR package from https://developer.nvidia.com/cutensor.

Linux

Assuming cuQuantum has been extracted in CUQUANTUM_ROOT and cuTENSOR in CUTENSOR_ROOT, we update the library path as follows:

export LD_LIBRARY_PATH=${CUQUANTUM_ROOT}/lib:${CUTENSOR_ROOT}/lib/11:${LD_LIBRARY_PATH}

Depending on your CUDA Toolkit, you might have to choose a different library version (e.g., ${CUTENSOR_ROOT}/lib/11.0).

The sample code discussed below (tensornet_example.cu) can be compiled via the following command:

nvcc tensornet_example.cu -I${CUQUANTUM_ROOT}/include -I${CUTENSOR_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUTENSOR_ROOT}/lib/11 -lcutensornet -lcutensor -o tensornet_example

For statically linking against the cuTensorNet library, use the following command (libmetis_static.a needs to be explicitly linked against, assuming it is installed through the NVIDIA CUDA Toolkit and accessible through $LIBRARY_PATH):

nvcc tensornet_example.cu -I${CUQUANTUM_ROOT}/include -I${CUTENSOR_ROOT}/include ${CUQUANTUM_ROOT}/lib/libcutensornet_static.a -L${CUTENSOR_ROOT}/lib/11 -lcutensor libmetis_static.a -o tensornet_example

Note

Depending on the source of the cuQuantum package, you may need to replace lib above by lib64.

Code Example

The following code example shows the common steps to use cuTensorNet and performs some typical operations in tensor network contraction. The full sample code can be found in the NVIDIA/cuQuantum repository (here).

Headers and Data Types

 31#include <stdlib.h>
 32#include <stdio.h>
 33
 34#include <unordered_map>
 35#include <vector>
 36#include <cassert>
 37
 38#include <cuda_runtime.h>
 39#include <cutensornet.h>
 40#include <cutensor.h>
 41
 42#define HANDLE_ERROR(x)                                               \
 43{ const auto err = x;                                                 \
 44if( err != CUTENSORNET_STATUS_SUCCESS )                                \
 45{ printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); return err; } \
 46};
 47
 48#define HANDLE_CUDA_ERROR(x)                                      \
 49{  const auto err = x;                                             \
 50   if( err != cudaSuccess )                                        \
 51   { printf("Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); return err; } \
 52};
 53
 54struct GPUTimer
 55{
 56   GPUTimer(cudaStream_t stream): stream_(stream)
 57   {
 58      cudaEventCreate(&start_);
 59      cudaEventCreate(&stop_);
 60   }
 61
 62   ~GPUTimer()
 63   {
 64      cudaEventDestroy(start_);
 65      cudaEventDestroy(stop_);
 66   }
 67
 68   void start()
 69   {
 70      cudaEventRecord(start_, stream_);
 71   }
 72
 73   float seconds()
 74   {
 75      cudaEventRecord(stop_, stream_);
 76      cudaEventSynchronize(stop_);
 77      float time;
 78      cudaEventElapsedTime(&time, start_, stop_);
 79      return time * 1e-3;
 80   }
 81
 82   private:
 83   cudaEvent_t start_, stop_;
 84   cudaStream_t stream_;
 85};
 86
 87
 88int main()
 89{
 90   const size_t cuTensornetVersion = cutensornetGetVersion();
 91   printf("cuTensorNet-vers:%ld\n",cuTensornetVersion);
 92
 93   cudaDeviceProp prop;
 94   int32_t deviceId = -1;
 95   HANDLE_CUDA_ERROR( cudaGetDevice(&deviceId) );
 96   HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );
 97
 98   printf("===== device info ======\n");
 99   printf("GPU-name:%s\n", prop.name);
100   printf("GPU-clock:%d\n", prop.clockRate);
101   printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
102   printf("GPU-nSM:%d\n", prop.multiProcessorCount);
103   printf("GPU-major:%d\n", prop.major);
104   printf("GPU-minor:%d\n", prop.minor);
105   printf("========================\n");
106
107   typedef float floatType;
108   cudaDataType_t typeData = CUDA_R_32F;
109   cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;
110
111   printf("Include headers and define data types\n");

Define Tensor Network and Tensor Sizes

Next, we define the topology of the tensor network (i.e., the modes of the tensors, their extents and connectivity).

114   /**********************
115   * Computing: D_{m,x,n,y} = A_{m,h,k,n} B_{u,k,h} C_{x,u,y}
116   **********************/
117
118   constexpr int32_t numInputs = 3;
119
120   // Create vector of modes
121   std::vector<int32_t> modesA{'m','h','k','n'};
122   std::vector<int32_t> modesB{'u','k','h'};
123   std::vector<int32_t> modesC{'x','u','y'};
124   std::vector<int32_t> modesD{'m','x','n','y'};
125
126   // Extents
127   std::unordered_map<int32_t, int64_t> extent;
128   extent['m'] = 96;
129   extent['n'] = 96;
130   extent['u'] = 96;
131   extent['h'] = 64;
132   extent['k'] = 64;
133   extent['x'] = 64;
134   extent['y'] = 64;
135
136   // Create a vector of extents for each tensor
137   std::vector<int64_t> extentA;
138   for (auto mode : modesA)
139      extentA.push_back(extent[mode]);
140   std::vector<int64_t> extentB;
141   for (auto mode : modesB)
142      extentB.push_back(extent[mode]);
143   std::vector<int64_t> extentC;
144   for (auto mode : modesC)
145      extentC.push_back(extent[mode]);
146   std::vector<int64_t> extentD;
147   for (auto mode : modesD)
148      extentD.push_back(extent[mode]);
149
150   printf("Define network, modes, and extents\n");

Allocate memory and initialize data

Next, we allocate memory for data and workspace, and randomly initialize data.

153   /**********************
154   * Allocating data
155   **********************/
156
157   size_t elementsA = 1;
158   for (auto mode : modesA)
159      elementsA *= extent[mode];
160   size_t elementsB = 1;
161   for (auto mode : modesB)
162      elementsB *= extent[mode];
163   size_t elementsC = 1;
164   for (auto mode : modesC)
165      elementsC *= extent[mode];
166   size_t elementsD = 1;
167   for (auto mode : modesD)
168      elementsD *= extent[mode];
169
170   size_t sizeA = sizeof(floatType) * elementsA;
171   size_t sizeB = sizeof(floatType) * elementsB;
172   size_t sizeC = sizeof(floatType) * elementsC;
173   size_t sizeD = sizeof(floatType) * elementsD;
174   printf("Total memory: %.2f GiB\n", (sizeA + sizeB + sizeC + sizeD)/1024./1024./1024);
175
176   void* rawDataIn_d[numInputs];
177   void* D_d;
178   HANDLE_CUDA_ERROR(cudaMalloc((void**) &rawDataIn_d[0], sizeA));
179   HANDLE_CUDA_ERROR(cudaMalloc((void**) &rawDataIn_d[1], sizeB));
180   HANDLE_CUDA_ERROR(cudaMalloc((void**) &rawDataIn_d[2], sizeC));
181   HANDLE_CUDA_ERROR(cudaMalloc((void**) &D_d, sizeD));
182
183   floatType *A = (floatType*) malloc(sizeof(floatType) * elementsA);
184   floatType *B = (floatType*) malloc(sizeof(floatType) * elementsB);
185   floatType *C = (floatType*) malloc(sizeof(floatType) * elementsC);
186   floatType *D = (floatType*) malloc(sizeof(floatType) * elementsD);
187
188   if (A == NULL || B == NULL || C == NULL || D == NULL)
189   {
190      printf("Error: Host allocation of A or C.\n");
191      return -1;
192
193   }
194   /**********************
195   * Allocate workspace
196   **********************/
197
198   size_t freeMem, totalMem;
199   HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem ));
200
201   uint64_t worksize = freeMem * 0.9;
202
203   void *work = nullptr;
204   HANDLE_CUDA_ERROR( cudaMalloc(&work, worksize) );
205
206   /*******************
207   * Initialize data
208   *******************/
209
210   for (uint64_t i = 0; i < elementsA; i++)
211      A[i] = ((float) rand())/RAND_MAX;
212   for (uint64_t i = 0; i < elementsB; i++)
213      B[i] = ((float) rand())/RAND_MAX;
214   for (uint64_t i = 0; i < elementsC; i++)
215      C[i] = ((float) rand())/RAND_MAX;
216   memset(D, 0, sizeof(floatType) * elementsD);
217
218   HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[0], A, sizeA, cudaMemcpyHostToDevice));
219   HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[1], B, sizeB, cudaMemcpyHostToDevice));
220   HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[2], C, sizeC, cudaMemcpyHostToDevice));
221
222   printf("Allocate memory for data and workspace, and initialize data.\n");

cuTensorNet handle and Network Descriptor

Next, we initialize the cuTensorNet library via cutensornetCreate() and create the network descriptor with the desired tensor modes, extents, and strides, as well as the data and compute types.

225   /*************************
226   * cuTensorNet
227   *************************/
228
229   cudaStream_t stream;
230   cudaStreamCreate(&stream);
231
232   cutensornetHandle_t handle;
233   HANDLE_ERROR(cutensornetCreate(&handle));
234
235   const int32_t nmodeA = modesA.size();
236   const int32_t nmodeB = modesB.size();
237   const int32_t nmodeC = modesC.size();
238   const int32_t nmodeD = modesD.size();
239
240   /*******************************
241   * Create Network Descriptor
242   *******************************/
243
244   const int32_t* modesIn[] = {modesA.data(), modesB.data(), modesC.data()};
245   int32_t const numModesIn[] = {nmodeA, nmodeB, nmodeC};
246   const int64_t* extentsIn[] = {extentA.data(), extentB.data(), extentC.data()};
247   const int64_t* stridesIn[] = {NULL, NULL, NULL}; // strides are optional; if no stride is provided, then cuTensorNet assumes a generalized column-major data layout
248
249   // Notice that pointers are allocated via cudaMalloc are aligned to 256 byte
250   // boundaries by default; however here we're checking the pointer alignment explicitly
251   // to demonstrate how one would check the alginment for arbitrary pointers.
252
253   auto getMaximalPointerAlignment = [](const void* ptr) {
254      const uint64_t ptrAddr  = reinterpret_cast<uint64_t>(ptr);
255      uint32_t alignment = 1;
256      while(ptrAddr % alignment == 0 &&
257            alignment < 256) // at the latest we terminate once the alignment reached 256 bytes (we could be going, but any alignment larger or equal to 256 is equally fine)
258      {
259         alignment *= 2;
260      }
261      return alignment;
262   };
263   const uint32_t alignmentsIn[] = {getMaximalPointerAlignment(rawDataIn_d[0]),
264                                    getMaximalPointerAlignment(rawDataIn_d[1]),
265                                    getMaximalPointerAlignment(rawDataIn_d[2])};
266   const uint32_t alignmentOut = getMaximalPointerAlignment(D_d);
267
268   // setup tensor network
269   cutensornetNetworkDescriptor_t descNet;
270   HANDLE_ERROR (cutensornetCreateNetworkDescriptor(handle,
271                                                numInputs, numModesIn, extentsIn, stridesIn, modesIn, alignmentsIn,
272                                                nmodeD, extentD.data(), /*stridesOut = */NULL, modesD.data(), alignmentOut,
273                                                typeData, typeCompute,
274                                                &descNet));
275
276   printf("Initialize the cuTensorNet library and create a network descriptor.\n");

Optimal contraction order and slicing

At this stage, we can deploy the cuTensorNet optimizer to find an optimized contraction path and slicing combination. Alternatively, a pre-determined path can be provided to the optimizer config. We use the cutensornetContractionOptimize() function after creating the config and info structures, which allow us to specify various options to the optimizer.

279   /*******************************
280   * Find "optimal" contraction order and slicing
281   *******************************/
282
283   cutensornetContractionOptimizerConfig_t optimizerConfig;
284   HANDLE_ERROR (cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig));
285
286   // Set the value of the partitioner imbalance factor, if desired
287   int imbalance_factor = 30;
288   HANDLE_ERROR(cutensornetContractionOptimizerConfigSetAttribute(
289                                                               handle,
290                                                               optimizerConfig,
291                                                               CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_IMBALANCE_FACTOR,
292                                                               &imbalance_factor,
293                                                               sizeof(imbalance_factor)));
294
295   cutensornetContractionOptimizerInfo_t optimizerInfo;
296   HANDLE_ERROR (cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo));
297
298   HANDLE_ERROR (cutensornetContractionOptimize(handle,
299                                             descNet,
300                                             optimizerConfig,
301                                             worksize,
302                                             optimizerInfo));
303
304   int64_t numSlices = 0;
305   HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
306               handle,
307               optimizerInfo,
308               CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
309               &numSlices,
310               sizeof(numSlices)));
311
312   assert(numSlices > 0);
313
314   printf("Find an optimized contraction path with cuTensorNet optimizer.\n");

Contraction plan and auto-tune

We create a contraction plan holding the pair-wise contraction plans for cuTENSOR. Optionally, we can auto-tune the plan so that cuTENSOR selects the best kernel for each pair-wise contraction. This contraction plan can be reused for many (possibly different) data inputs, avoiding the cost of redundantly initializing it. We also create a workspace descriptor, compute and query the minimum workspace size needed to contract the network, and provide that size to the contraction plan.

317   /*******************************
318   * Initialize all pair-wise contraction plans (for cuTENSOR)
319   *******************************/
320   cutensornetContractionPlan_t plan;
321
322   cutensornetWorkspaceDescriptor_t workDesc;
323   HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc));
324
325   uint64_t requiredWorkspaceSize = 0;
326   HANDLE_ERROR(cutensornetWorkspaceComputeSizes(handle,
327                                          descNet,
328                                          optimizerInfo,
329                                          workDesc));
330
331   HANDLE_ERROR(cutensornetWorkspaceGetSize(handle,
332                                         workDesc,
333                                         CUTENSORNET_WORKSIZE_PREF_MIN,
334                                         CUTENSORNET_MEMSPACE_DEVICE,
335                                         &requiredWorkspaceSize));
336   if (worksize < requiredWorkspaceSize)
337   {
338      printf("Not enough workspace memory is available.");
339   }
340   else
341   {
342      HANDLE_ERROR (cutensornetWorkspaceSet(handle,
343                                            workDesc,
344                                            CUTENSORNET_MEMSPACE_DEVICE,
345                                            work,
346                                            worksize));
347                                          
348      HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
349                                                     descNet,
350                                                     optimizerInfo,
351                                                     workDesc,
352                                                     &plan) );
353
354      /*******************************
355      * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel
356      *******************************/
357      cutensornetContractionAutotunePreference_t autotunePref;
358      HANDLE_ERROR(cutensornetCreateContractionAutotunePreference(handle,
359                              &autotunePref));
360
361      const int numAutotuningIterations = 5; // may be 0
362      HANDLE_ERROR(cutensornetContractionAutotunePreferenceSetAttribute(
363                              handle,
364                              autotunePref,
365                              CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
366                              &numAutotuningIterations,
367                              sizeof(numAutotuningIterations)));
368
369      // modify the plan again to find the best pair-wise contractions
370      HANDLE_ERROR(cutensornetContractionAutotune(handle,
371                              plan,
372                              rawDataIn_d,
373                              D_d,
374                              workDesc,
375                              autotunePref,
376                              stream));
377
378      HANDLE_ERROR(cutensornetDestroyContractionAutotunePreference(autotunePref));
379
380      printf("Create a contraction plan for cuTENSOR and optionally auto-tune it.\n");

Network contraction execution

Finally, we contract the network as many times as needed, possibly with different data. Network slices can be executed using the same contraction plan, possibly on different devices. We also clean up and free allocated resources.

383      /**********************
384      * Run
385      **********************/
386      GPUTimer timer{stream};
387      double minTimeCUTENSOR = 1e100;
388      const int numRuns = 3; // to get stable perf results
389      for (int i=0; i < numRuns; ++i)
390      {
391         cudaMemcpy(D_d, D, sizeD, cudaMemcpyHostToDevice); // restore output
392         cudaDeviceSynchronize();
393
394         /*
395         * Contract over all slices.
396         *
397         * A user may choose to parallelize this loop across multiple devices.
398         * (Note, however, that as of cuTensorNet v1.0.0 the contraction must
399         * start from slice 0, see the cutensornetContraction documentation at
400         * https://docs.nvidia.com/cuda/cuquantum/cutensornet/api/functions.html#cutensornetcontraction )
401         */
402         for(int64_t sliceId=0; sliceId < numSlices; ++sliceId)
403         {
404            timer.start();
405
406            HANDLE_ERROR(cutensornetContraction(handle,
407                                    plan,
408                                    rawDataIn_d,
409                                    D_d,
410                                    workDesc, sliceId, stream));
411
412            // Synchronize and measure timing
413            auto time = timer.seconds();
414            minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
415         }
416      }
417
418      printf("Contract the network, each slice uses the same contraction plan.\n");
419
420      /*************************/
421
422      double flops = -1;
423
424      HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
425                  handle,
426                  optimizerInfo,
427                  CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
428                  &flops,
429                  sizeof(flops)));
430
431      printf("numSlices: %ld\n", numSlices);
432      printf("%.2f ms / slice\n", minTimeCUTENSOR * 1000.f);
433      printf("%.2f GFLOPS/s\n", flops/1e9/minTimeCUTENSOR );
434   }
435
436   HANDLE_ERROR(cutensornetDestroy(handle));
437   HANDLE_ERROR(cutensornetDestroyNetworkDescriptor(descNet));
438   HANDLE_ERROR(cutensornetDestroyContractionPlan(plan));
439   HANDLE_ERROR(cutensornetDestroyContractionOptimizerConfig(optimizerConfig));
440   HANDLE_ERROR(cutensornetDestroyContractionOptimizerInfo(optimizerInfo));
441   HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
442
443   if (A) free(A);
444   if (B) free(B);
445   if (C) free(C);
446   if (D) free(D);
447   if (rawDataIn_d[0]) cudaFree(rawDataIn_d[0]);
448   if (rawDataIn_d[1]) cudaFree(rawDataIn_d[1]);
449   if (rawDataIn_d[2]) cudaFree(rawDataIn_d[2]);
450   if (D_d) cudaFree(D_d);
451   if (work) cudaFree(work);
452
453   printf("Free resource and exit.\n");
454
455   return 0;
456}

Recall that the full sample code can be found in the NVIDIA/cuQuantum repository (here).

Useful tips

  • For debugging, the environment variable CUTENSORNET_LOG_LEVEL=n can be set. The level n = 0, 1, …, 5 corresponds to the logger level as described and used in cutensornetLoggerSetLevel(). The environment variable CUTENSORNET_LOG_FILE=<filepath> can be used to direct the log output to a custom file at <filepath> instead of stdout.