Getting Started¶
In this section, we show how to contract a tensor network using cuTensorNet. First, we describe how to install the library and how to compile a sample code. Then, we present the example code to perform common steps in cuTensorNet. In this example, we perform the following tensor contraction:
We build the code up step by step, each step adding code at the end. The steps are separated by comments consisting of multiple stars.
It is recommended to refer to Overview and cuTENSOR documentation for familiarity with the nomenclature and cuTENSOR operations.
Installation and Compilation¶
Download the cuQuantum package (which cuTensorNet is part of) from https://developer.nvidia.com/cuQuantum-downloads, and the cuTENSOR package from https://developer.nvidia.com/cutensor.
Linux¶
Assuming cuQuantum has been extracted in CUQUANTUM_ROOT and cuTENSOR in CUTENSOR_ROOT, we update the library path as follows:
export LD_LIBRARY_PATH=${CUQUANTUM_ROOT}/lib:${CUTENSOR_ROOT}/lib/11:${LD_LIBRARY_PATH}
Depending on your CUDA Toolkit, you might have to choose a different library version (e.g., ${CUTENSOR_ROOT}/lib/11.0).
The sample code discussed below (tensornet_example.cu) can be compiled via the following command:
nvcc tensornet_example.cu -I${CUQUANTUM_ROOT}/include -I${CUTENSOR_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUTENSOR_ROOT}/lib/11 -lcutensornet -lcutensor -o tensornet_example
For statically linking against the cuTensorNet library, use the following command (libmetis_static.a needs to be explicitly linked against, assuming it is installed through the NVIDIA CUDA Toolkit and accessible through $LIBRARY_PATH):
nvcc tensornet_example.cu -I${CUQUANTUM_ROOT}/include -I${CUTENSOR_ROOT}/include ${CUQUANTUM_ROOT}/lib/libcutensornet_static.a -L${CUTENSOR_ROOT}/lib/11 -lcutensor libmetis_static.a -o tensornet_example
Note
Depending on the source of the cuQuantum package, you may need to replace lib above with lib64.
Code Example¶
The following code example shows the common steps to use cuTensorNet and performs some typical operations in tensor network contraction. The full sample code can be found in the NVIDIA/cuQuantum repository (here).
Headers and Data Types¶
31#include <stdlib.h>
32#include <stdio.h>
33
34#include <unordered_map>
35#include <vector>
36#include <cassert>
37
38#include <cuda_runtime.h>
39#include <cutensornet.h>
40#include <cutensor.h>
41
42#define HANDLE_ERROR(x) \
43{ const auto err = x; \
44if( err != CUTENSORNET_STATUS_SUCCESS ) \
45{ printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); return err; } \
46};
47
48#define HANDLE_CUDA_ERROR(x) \
49{ const auto err = x; \
50 if( err != cudaSuccess ) \
51 { printf("Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); return err; } \
52};
53
54struct GPUTimer
55{
56 GPUTimer(cudaStream_t stream): stream_(stream)
57 {
58 cudaEventCreate(&start_);
59 cudaEventCreate(&stop_);
60 }
61
62 ~GPUTimer()
63 {
64 cudaEventDestroy(start_);
65 cudaEventDestroy(stop_);
66 }
67
68 void start()
69 {
70 cudaEventRecord(start_, stream_);
71 }
72
73 float seconds()
74 {
75 cudaEventRecord(stop_, stream_);
76 cudaEventSynchronize(stop_);
77 float time;
78 cudaEventElapsedTime(&time, start_, stop_);
79 return time * 1e-3;
80 }
81
82 private:
83 cudaEvent_t start_, stop_;
84 cudaStream_t stream_;
85};
86
87
88int main()
89{
90 const size_t cuTensornetVersion = cutensornetGetVersion();
91 printf("cuTensorNet-vers:%ld\n",cuTensornetVersion);
92
93 cudaDeviceProp prop;
94 int32_t deviceId = -1;
95 HANDLE_CUDA_ERROR( cudaGetDevice(&deviceId) );
96 HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );
97
98 printf("===== device info ======\n");
99 printf("GPU-name:%s\n", prop.name);
100 printf("GPU-clock:%d\n", prop.clockRate);
101 printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
102 printf("GPU-nSM:%d\n", prop.multiProcessorCount);
103 printf("GPU-major:%d\n", prop.major);
104 printf("GPU-minor:%d\n", prop.minor);
105 printf("========================\n");
106
107 typedef float floatType;
108 cudaDataType_t typeData = CUDA_R_32F;
109 cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;
110
111 printf("Include headers and define data types\n");
Define Tensor Network and Tensor Sizes¶
Next, we define the topology of the tensor network (i.e., the modes of the tensors, their extents and connectivity).
114 /**********************
115 * Computing: D_{m,x,n,y} = A_{m,h,k,n} B_{u,k,h} C_{x,u,y}
116 **********************/
117
118 constexpr int32_t numInputs = 3;
119
120 // Create vector of modes
121 std::vector<int32_t> modesA{'m','h','k','n'};
122 std::vector<int32_t> modesB{'u','k','h'};
123 std::vector<int32_t> modesC{'x','u','y'};
124 std::vector<int32_t> modesD{'m','x','n','y'};
125
126 // Extents
127 std::unordered_map<int32_t, int64_t> extent;
128 extent['m'] = 96;
129 extent['n'] = 96;
130 extent['u'] = 96;
131 extent['h'] = 64;
132 extent['k'] = 64;
133 extent['x'] = 64;
134 extent['y'] = 64;
135
136 // Create a vector of extents for each tensor
137 std::vector<int64_t> extentA;
138 for (auto mode : modesA)
139 extentA.push_back(extent[mode]);
140 std::vector<int64_t> extentB;
141 for (auto mode : modesB)
142 extentB.push_back(extent[mode]);
143 std::vector<int64_t> extentC;
144 for (auto mode : modesC)
145 extentC.push_back(extent[mode]);
146 std::vector<int64_t> extentD;
147 for (auto mode : modesD)
148 extentD.push_back(extent[mode]);
149
150 printf("Define network, modes, and extents\n");
Allocate memory and initialize data¶
Next, we allocate memory for data and workspace, and randomly initialize data.
153 /**********************
154 * Allocating data
155 **********************/
156
157 size_t elementsA = 1;
158 for (auto mode : modesA)
159 elementsA *= extent[mode];
160 size_t elementsB = 1;
161 for (auto mode : modesB)
162 elementsB *= extent[mode];
163 size_t elementsC = 1;
164 for (auto mode : modesC)
165 elementsC *= extent[mode];
166 size_t elementsD = 1;
167 for (auto mode : modesD)
168 elementsD *= extent[mode];
169
170 size_t sizeA = sizeof(floatType) * elementsA;
171 size_t sizeB = sizeof(floatType) * elementsB;
172 size_t sizeC = sizeof(floatType) * elementsC;
173 size_t sizeD = sizeof(floatType) * elementsD;
174 printf("Total memory: %.2f GiB\n", (sizeA + sizeB + sizeC + sizeD)/1024./1024./1024);
175
176 void* rawDataIn_d[numInputs];
177 void* D_d;
178 HANDLE_CUDA_ERROR(cudaMalloc((void**) &rawDataIn_d[0], sizeA));
179 HANDLE_CUDA_ERROR(cudaMalloc((void**) &rawDataIn_d[1], sizeB));
180 HANDLE_CUDA_ERROR(cudaMalloc((void**) &rawDataIn_d[2], sizeC));
181 HANDLE_CUDA_ERROR(cudaMalloc((void**) &D_d, sizeD));
182
183 floatType *A = (floatType*) malloc(sizeof(floatType) * elementsA);
184 floatType *B = (floatType*) malloc(sizeof(floatType) * elementsB);
185 floatType *C = (floatType*) malloc(sizeof(floatType) * elementsC);
186 floatType *D = (floatType*) malloc(sizeof(floatType) * elementsD);
187
188 if (A == NULL || B == NULL || C == NULL || D == NULL)
189 {
190 printf("Error: Host allocation of A or C.\n");
191 return -1;
192
193 }
194 /**********************
195 * Allocate workspace
196 **********************/
197
198 size_t freeMem, totalMem;
199 HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem ));
200
201 uint64_t worksize = freeMem * 0.9;
202
203 void *work = nullptr;
204 HANDLE_CUDA_ERROR( cudaMalloc(&work, worksize) );
205
206 /*******************
207 * Initialize data
208 *******************/
209
210 for (uint64_t i = 0; i < elementsA; i++)
211 A[i] = ((float) rand())/RAND_MAX;
212 for (uint64_t i = 0; i < elementsB; i++)
213 B[i] = ((float) rand())/RAND_MAX;
214 for (uint64_t i = 0; i < elementsC; i++)
215 C[i] = ((float) rand())/RAND_MAX;
216 memset(D, 0, sizeof(floatType) * elementsD);
217
218 HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[0], A, sizeA, cudaMemcpyHostToDevice));
219 HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[1], B, sizeB, cudaMemcpyHostToDevice));
220 HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[2], C, sizeC, cudaMemcpyHostToDevice));
221
222 printf("Allocate memory for data and workspace, and initialize data.\n");
cuTensorNet handle and Network Descriptor¶
Next, we initialize the cuTensorNet library via cutensornetCreate() and create the network descriptor with the desired tensor modes, extents, and strides, as well as the data and compute types.
225 /*************************
226 * cuTensorNet
227 *************************/
228
229 cudaStream_t stream;
230 cudaStreamCreate(&stream);
231
232 cutensornetHandle_t handle;
233 HANDLE_ERROR(cutensornetCreate(&handle));
234
235 const int32_t nmodeA = modesA.size();
236 const int32_t nmodeB = modesB.size();
237 const int32_t nmodeC = modesC.size();
238 const int32_t nmodeD = modesD.size();
239
240 /*******************************
241 * Create Network Descriptor
242 *******************************/
243
244 const int32_t* modesIn[] = {modesA.data(), modesB.data(), modesC.data()};
245 int32_t const numModesIn[] = {nmodeA, nmodeB, nmodeC};
246 const int64_t* extentsIn[] = {extentA.data(), extentB.data(), extentC.data()};
247 const int64_t* stridesIn[] = {NULL, NULL, NULL}; // strides are optional; if no stride is provided, then cuTensorNet assumes a generalized column-major data layout
248
249 // Notice that pointers allocated via cudaMalloc are aligned to 256 byte
250 // boundaries by default; however here we're checking the pointer alignment explicitly
251 // to demonstrate how one would check the alignment for arbitrary pointers.
252
253 auto getMaximalPointerAlignment = [](const void* ptr) {
254 const uint64_t ptrAddr = reinterpret_cast<uint64_t>(ptr);
255 uint32_t alignment = 1;
256 while(ptrAddr % alignment == 0 &&
257 alignment < 256) // at the latest we terminate once the alignment reaches 256 bytes (we could keep going, but any alignment greater than or equal to 256 is equally fine)
258 {
259 alignment *= 2;
260 }
261 return alignment;
262 };
263 const uint32_t alignmentsIn[] = {getMaximalPointerAlignment(rawDataIn_d[0]),
264 getMaximalPointerAlignment(rawDataIn_d[1]),
265 getMaximalPointerAlignment(rawDataIn_d[2])};
266 const uint32_t alignmentOut = getMaximalPointerAlignment(D_d);
267
268 // setup tensor network
269 cutensornetNetworkDescriptor_t descNet;
270 HANDLE_ERROR (cutensornetCreateNetworkDescriptor(handle,
271 numInputs, numModesIn, extentsIn, stridesIn, modesIn, alignmentsIn,
272 nmodeD, extentD.data(), /*stridesOut = */NULL, modesD.data(), alignmentOut,
273 typeData, typeCompute,
274 &descNet));
275
276 printf("Initialize the cuTensorNet library and create a network descriptor.\n");
Optimal contraction order and slicing¶
At this stage, we can deploy the cuTensorNet optimizer to find an optimized contraction path and slicing combination.
It is also possible to feed a predetermined path into the config.
We use the cutensornetContractionOptimize() function after creating the config and info structures, which allow us to specify various options to the optimizer.
279 /*******************************
280 * Find "optimal" contraction order and slicing
281 *******************************/
282
283 cutensornetContractionOptimizerConfig_t optimizerConfig;
284 HANDLE_ERROR (cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig));
285
286 // Set the value of the partitioner imbalance factor, if desired
287 int imbalance_factor = 30;
288 HANDLE_ERROR(cutensornetContractionOptimizerConfigSetAttribute(
289 handle,
290 optimizerConfig,
291 CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_IMBALANCE_FACTOR,
292 &imbalance_factor,
293 sizeof(imbalance_factor)));
294
295 cutensornetContractionOptimizerInfo_t optimizerInfo;
296 HANDLE_ERROR (cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo));
297
298 HANDLE_ERROR (cutensornetContractionOptimize(handle,
299 descNet,
300 optimizerConfig,
301 worksize,
302 optimizerInfo));
303
304 int64_t numSlices = 0;
305 HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
306 handle,
307 optimizerInfo,
308 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
309 &numSlices,
310 sizeof(numSlices)));
311
312 assert(numSlices > 0);
313
314 printf("Find an optimized contraction path with cuTensorNet optimizer.\n");
Contraction plan and auto-tune¶
We create a contraction plan holding pair-wise contraction plans for cuTENSOR. Optionally, we can auto-tune the plan such that cuTENSOR selects the best kernel for each contraction. This contraction plan can be reused for many (possibly different) data inputs, avoiding the cost to initialize this plan redundantly. We also create a workspace descriptor, compute and query the minimum workspace size needed to contract the network, and provide that to the contraction plan.
317 /*******************************
318 * Initialize all pair-wise contraction plans (for cuTENSOR)
319 *******************************/
320 cutensornetContractionPlan_t plan;
321
322 cutensornetWorkspaceDescriptor_t workDesc;
323 HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc));
324
325 uint64_t requiredWorkspaceSize = 0;
326 HANDLE_ERROR(cutensornetWorkspaceComputeSizes(handle,
327 descNet,
328 optimizerInfo,
329 workDesc));
330
331 HANDLE_ERROR(cutensornetWorkspaceGetSize(handle,
332 workDesc,
333 CUTENSORNET_WORKSIZE_PREF_MIN,
334 CUTENSORNET_MEMSPACE_DEVICE,
335 &requiredWorkspaceSize));
336 if (worksize < requiredWorkspaceSize)
337 {
338 printf("Not enough workspace memory is available.");
339 }
340 else
341 {
342 HANDLE_ERROR (cutensornetWorkspaceSet(handle,
343 workDesc,
344 CUTENSORNET_MEMSPACE_DEVICE,
345 work,
346 worksize));
347
348 HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
349 descNet,
350 optimizerInfo,
351 workDesc,
352 &plan) );
353
354 /*******************************
355 * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel
356 *******************************/
357 cutensornetContractionAutotunePreference_t autotunePref;
358 HANDLE_ERROR(cutensornetCreateContractionAutotunePreference(handle,
359 &autotunePref));
360
361 const int numAutotuningIterations = 5; // may be 0
362 HANDLE_ERROR(cutensornetContractionAutotunePreferenceSetAttribute(
363 handle,
364 autotunePref,
365 CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
366 &numAutotuningIterations,
367 sizeof(numAutotuningIterations)));
368
369 // modify the plan again to find the best pair-wise contractions
370 HANDLE_ERROR(cutensornetContractionAutotune(handle,
371 plan,
372 rawDataIn_d,
373 D_d,
374 workDesc,
375 autotunePref,
376 stream));
377
378 HANDLE_ERROR(cutensornetDestroyContractionAutotunePreference(autotunePref));
379
380 printf("Create a contraction plan for cuTENSOR and optionally auto-tune it.\n");
Network contraction execution¶
Finally, we contract the network as many times as needed, possibly with different data. Network slices can be executed using the same contraction plan, possibly on different devices. We also clean up and free allocated resources.
383 /**********************
384 * Run
385 **********************/
386 GPUTimer timer{stream};
387 double minTimeCUTENSOR = 1e100;
388 const int numRuns = 3; // to get stable perf results
389 for (int i=0; i < numRuns; ++i)
390 {
391 cudaMemcpy(D_d, D, sizeD, cudaMemcpyHostToDevice); // restore output
392 cudaDeviceSynchronize();
393
394 /*
395 * Contract over all slices.
396 *
397 * A user may choose to parallelize this loop across multiple devices.
398 * (Note, however, that as of cuTensorNet v1.0.0 the contraction must
399 * start from slice 0, see the cutensornetContraction documentation at
400 * https://docs.nvidia.com/cuda/cuquantum/cutensornet/api/functions.html#cutensornetcontraction )
401 */
402 for(int64_t sliceId=0; sliceId < numSlices; ++sliceId)
403 {
404 timer.start();
405
406 HANDLE_ERROR(cutensornetContraction(handle,
407 plan,
408 rawDataIn_d,
409 D_d,
410 workDesc, sliceId, stream));
411
412 // Synchronize and measure timing
413 auto time = timer.seconds();
414 minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
415 }
416 }
417
418 printf("Contract the network, each slice uses the same contraction plan.\n");
419
420 /*************************/
421
422 double flops = -1;
423
424 HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
425 handle,
426 optimizerInfo,
427 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
428 &flops,
429 sizeof(flops)));
430
431 printf("numSlices: %ld\n", numSlices);
432 printf("%.2f ms / slice\n", minTimeCUTENSOR * 1000.f);
433 printf("%.2f GFLOPS/s\n", flops/1e9/minTimeCUTENSOR );
434 }
435
436 HANDLE_ERROR(cutensornetDestroy(handle));
437 HANDLE_ERROR(cutensornetDestroyNetworkDescriptor(descNet));
438 HANDLE_ERROR(cutensornetDestroyContractionPlan(plan));
439 HANDLE_ERROR(cutensornetDestroyContractionOptimizerConfig(optimizerConfig));
440 HANDLE_ERROR(cutensornetDestroyContractionOptimizerInfo(optimizerInfo));
441 HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
442
443 if (A) free(A);
444 if (B) free(B);
445 if (C) free(C);
446 if (D) free(D);
447 if (rawDataIn_d[0]) cudaFree(rawDataIn_d[0]);
448 if (rawDataIn_d[1]) cudaFree(rawDataIn_d[1]);
449 if (rawDataIn_d[2]) cudaFree(rawDataIn_d[2]);
450 if (D_d) cudaFree(D_d);
451 if (work) cudaFree(work);
452
453 printf("Free resource and exit.\n");
454
455 return 0;
456}
Recall that the full sample code can be found in the NVIDIA/cuQuantum repository (here).
Useful tips¶
For debugging, the environment variable CUTENSORNET_LOG_LEVEL=n can be set. The level n = 0, 1, …, 5 corresponds to the logger level as described and used in cutensornetLoggerSetLevel(). The environment variable CUTENSORNET_LOG_FILE=<filepath> can be used to direct the log output to a custom file at <filepath> instead of stdout.