Code example (serial contraction)#
The following code example illustrates the common steps necessary to use cuTensorNet and also introduces typical tensor network operations. Specifically, the example performs the following tensor contraction:

R_{k,l} = A_{a,b,c,d,e,f} B_{b,g,h,e,i,j} C_{m,a,g,f,i,k} D_{l,c,h,d,j,m}
The full sample code can be found in the NVIDIA/cuQuantum repository (here).
Headers and data types#
#include <stdlib.h>
#include <stdio.h>
#include <string.h>   // memset

#include <unordered_map>
#include <vector>
#include <cassert>
#include <cmath>      // std::abs

#include <cuda_runtime.h>
#include <cutensornet.h>

#define HANDLE_ERROR(x) \
    do { \
        const auto err = x; \
        if (err != CUTENSORNET_STATUS_SUCCESS) \
        { \
            printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
            fflush(stdout); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

#define HANDLE_CUDA_ERROR(x) \
    do { \
        const auto err = x; \
        if (err != cudaSuccess) \
        { \
            printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
            fflush(stdout); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

// Usage: DEV_ATTR(cudaDevAttrClockRate, deviceId)
#define DEV_ATTR(ENUMCONST, DID) \
    ({ int v; \
       HANDLE_CUDA_ERROR(cudaDeviceGetAttribute(&v, ENUMCONST, DID)); \
       v; })


struct GPUTimer
{
    GPUTimer(cudaStream_t stream) : stream_(stream)
    {
        HANDLE_CUDA_ERROR(cudaEventCreate(&start_));
        HANDLE_CUDA_ERROR(cudaEventCreate(&stop_));
    }

    ~GPUTimer()
    {
        HANDLE_CUDA_ERROR(cudaEventDestroy(start_));
        HANDLE_CUDA_ERROR(cudaEventDestroy(stop_));
    }

    void start() { HANDLE_CUDA_ERROR(cudaEventRecord(start_, stream_)); }

    float seconds()
    {
        HANDLE_CUDA_ERROR(cudaEventRecord(stop_, stream_));
        HANDLE_CUDA_ERROR(cudaEventSynchronize(stop_));
        float time;
        HANDLE_CUDA_ERROR(cudaEventElapsedTime(&time, start_, stop_));
        return time * 1e-3;
    }

private:
    cudaEvent_t start_, stop_;
    cudaStream_t stream_;
};

int main()
{
    static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

    bool verbose = true;

    // Check cuTensorNet version
    const size_t cuTensornetVersion = cutensornetGetVersion();
    if (verbose) printf("cuTensorNet version: %ld\n", cuTensornetVersion);

    // Set GPU device
    int numDevices{0};
    HANDLE_CUDA_ERROR(cudaGetDeviceCount(&numDevices));
    const int deviceId = 0;
    HANDLE_CUDA_ERROR(cudaSetDevice(deviceId));
    cudaDeviceProp prop;
    HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId));

    if (verbose)
    {
        printf("===== device info ======\n");
        printf("GPU-local-id:%d\n", deviceId);
        printf("GPU-name:%s\n", prop.name);
        printf("GPU-clock:%d\n", DEV_ATTR(cudaDevAttrClockRate, deviceId));
        printf("GPU-memoryClock:%d\n", DEV_ATTR(cudaDevAttrMemoryClockRate, deviceId));
        printf("GPU-nSM:%d\n", prop.multiProcessorCount);
        printf("GPU-major:%d\n", prop.major);
        printf("GPU-minor:%d\n", prop.minor);
        printf("========================\n");
    }

    typedef float floatType;
    cudaDataType_t typeData = CUDA_R_32F;
    cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;

    if (verbose) printf("Included headers and defined data types\n");
Define tensor network and tensor sizes#
Next, we define the topology of the tensor network (i.e., the modes of the tensors, their extents, and their connectivity).
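In einsum notation, this network corresponds to the subscript string abcdef,bgheij,magfik,lchdjm->kl, which is exactly what the mode vectors below encode.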
    /**************************************************************************************
     * Computing: R_{k,l} = A_{a,b,c,d,e,f} B_{b,g,h,e,i,j} C_{m,a,g,f,i,k} D_{l,c,h,d,j,m}
     **************************************************************************************/

    constexpr int32_t numInputs = 4;

    // Create vectors of tensor modes
    std::vector<std::vector<int32_t>> tensorModes{ // for input tensors & output tensor
        // input tensors
        {'a', 'b', 'c', 'd', 'e', 'f'}, // tensor A
        {'b', 'g', 'h', 'e', 'i', 'j'}, // tensor B
        {'m', 'a', 'g', 'f', 'i', 'k'}, // tensor C
        {'l', 'c', 'h', 'd', 'j', 'm'}, // tensor D
        // output tensor
        {'k', 'l'}, // tensor R
    };

    // Set mode extents
    int64_t sameExtent = 16; // setting same extent for simplicity. In principle extents can differ.
    std::unordered_map<int32_t, int64_t> extent;
    for (auto& vec : tensorModes)
    {
        for (auto& mode : vec)
        {
            extent[mode] = sameExtent;
        }
    }

    // Create a vector of extents for each tensor
    std::vector<std::vector<int64_t>> tensorExtents; // for input tensors & output tensor
    tensorExtents.resize(numInputs + 1);             // hold inputs + output tensors
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        for (auto& mode : tensorModes[t]) tensorExtents[t].push_back(extent[mode]);
    }

    if (verbose) printf("Defined tensor network, modes, and extents\n");
Allocate memory and initialize data#
Next, we allocate memory for the tensor network operands and initialize them to random values.
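Since every mode extent is 16, each rank-6 input tensor holds 16^6 = 16,777,216 FP32 elements (64 MiB) and the rank-2 output tensor holds 16^2 = 256 elements (1 KiB), so the five tensors together occupy roughly 0.25 GiB of GPU memory, which is the figure reported by the code below.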
    /*****************
     * Allocating data
     *****************/

    std::vector<size_t> tensorElements(numInputs + 1); // for input tensors & output tensor
    std::vector<size_t> tensorSizes(numInputs + 1);    // for input tensors & output tensor
    size_t totalSize = 0;
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        size_t numElements = 1;
        for (auto& mode : tensorModes[t]) numElements *= extent[mode];
        tensorElements[t] = numElements;

        tensorSizes[t] = sizeof(floatType) * numElements;
        totalSize += tensorSizes[t];
    }

    if (verbose) printf("Total GPU memory used for tensor storage: %.2f GiB\n", (totalSize) / 1024. / 1024. / 1024);

    void* tensorData_d[numInputs + 1]; // for input tensors & output tensor
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        HANDLE_CUDA_ERROR(cudaMalloc((void**)&tensorData_d[t], tensorSizes[t]));
    }

    floatType* tensorData_h[numInputs + 1]; // for input tensors & output tensor
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        tensorData_h[t] = (floatType*)malloc(tensorSizes[t]);
        if (tensorData_h[t] == NULL)
        {
            printf("Error: Host memory allocation failed!\n");
            return -1;
        }
    }

    /*****************
     * Initialize data
     *****************/

    // init output tensor to all 0s
    memset(tensorData_h[numInputs], 0, tensorSizes[numInputs]);
    // init input tensors to random values
    for (int32_t t = 0; t < numInputs; ++t)
    {
        for (size_t e = 0; e < tensorElements[t]; ++e) tensorData_h[t][e] = ((floatType)rand()) / RAND_MAX;
    }
    // copy input data to device buffers
    for (int32_t t = 0; t < numInputs; ++t)
    {
        HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_d[t], tensorData_h[t], tensorSizes[t], cudaMemcpyHostToDevice));
    }
cuTensorNet handle and network descriptor#
Next, we initialize the cuTensorNet library via cutensornetCreate().
Note that the created library context will be associated with the currently active GPU.
We create the network descriptor, and append the input tensors with the desired tensor modes and extents, as well as the data type.
Optionally, we can set the output tensor modes (if skipped, the output modes will be inferred).
We can also, optionally, set the compute type on the network (if skipped, a default compute type corresponding to the data type will be used; refer to cutensornetCreateNetwork()).
    /*************
     * cuTensorNet
     *************/

    cudaStream_t stream;
    HANDLE_CUDA_ERROR(cudaStreamCreate(&stream));

    cutensornetHandle_t handle;
    HANDLE_ERROR(cutensornetCreate(&handle));

    if (verbose) printf("Allocated GPU memory for data, initialized data, and created library handle\n");

    /****************
     * Create Network
     ****************/

    // Set up tensor network
    cutensornetNetworkDescriptor_t networkDesc;
    HANDLE_ERROR(cutensornetCreateNetwork(handle, &networkDesc));

    int64_t tensorIDs[numInputs]; // for input tensors
    // attach the input tensors to the network
    for (int32_t t = 0; t < numInputs; ++t)
    {
        HANDLE_ERROR(cutensornetNetworkAppendTensor(handle,
                                                    networkDesc,
                                                    tensorModes[t].size(),
                                                    tensorExtents[t].data(),
                                                    tensorModes[t].data(),
                                                    NULL,
                                                    typeData,
                                                    &tensorIDs[t]));
    }

    // set the output tensor
    HANDLE_ERROR(cutensornetNetworkSetOutputTensor(handle,
                                                   networkDesc,
                                                   tensorModes[numInputs].size(),
                                                   tensorModes[numInputs].data(),
                                                   typeData));

    // set the network compute type
    HANDLE_ERROR(cutensornetNetworkSetAttribute(handle,
                                                networkDesc,
                                                CUTENSORNET_NETWORK_COMPUTE_TYPE,
                                                &typeCompute,
                                                sizeof(typeCompute)));

    if (verbose) printf("Initialized the cuTensorNet library and created a tensor network descriptor\n");
Optimal contraction order and slicing#
At this stage, we can deploy the cuTensorNet optimizer to find an optimized contraction path and slicing combination.
We choose a limit for the workspace needed to perform the contraction based on the available memory resources, and provide it
to the optimizer as a constraint. We then create an optimizer configuration object of type cutensornetContractionOptimizerConfig_t
to specify various optimizer options and provide it to the optimizer, which is invoked via cutensornetContractionOptimize().
The results from the optimizer will be returned in an optimizer info object of type cutensornetContractionOptimizerInfo_t.
    /******************************************************
     * Choose workspace limit based on available resources.
     ******************************************************/

    size_t freeMem, totalMem;
    HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeMem, &totalMem));
    uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
    if (verbose) printf("Workspace limit = %lu\n", workspaceLimit);

    /***********************************************
     * Find "optimal" contraction order and slicing
     ***********************************************/

    cutensornetContractionOptimizerConfig_t optimizerConfig;
    HANDLE_ERROR(cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig));

    // Set the desired number of hyper-samples (defaults to 0)
    int32_t num_hypersamples = 8;
    HANDLE_ERROR(cutensornetContractionOptimizerConfigSetAttribute(handle,
                                                                   optimizerConfig,
                                                                   CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_HYPER_NUM_SAMPLES,
                                                                   &num_hypersamples,
                                                                   sizeof(num_hypersamples)));

    // Create contraction optimizer info and find an optimized contraction path
    cutensornetContractionOptimizerInfo_t optimizerInfo;
    HANDLE_ERROR(cutensornetCreateContractionOptimizerInfo(handle, networkDesc, &optimizerInfo));

    HANDLE_ERROR(cutensornetContractionOptimize(handle,
                                                networkDesc,
                                                optimizerConfig,
                                                workspaceLimit,
                                                optimizerInfo));

    // Query the number of slices the tensor network execution will be split into
    int64_t numSlices = 0;
    HANDLE_ERROR(cutensornetContractionOptimizerInfoGetAttribute(handle,
                                                                 optimizerInfo,
                                                                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
                                                                 &numSlices,
                                                                 sizeof(numSlices)));
    assert(numSlices > 0);

    if (verbose) printf("Found an optimized contraction path using cuTensorNet optimizer\n");
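The hyper-optimizer can also be made reproducible by fixing its random seed on the optimizer config object before calling cutensornetContractionOptimize(). Below is a minimal sketch only, assuming the seed is exposed through an optimizer config attribute named CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SEED (check the optimizer config attribute enum in the cuTensorNet headers for the exact name):

    // Sketch: fix the hyper-optimizer seed for reproducible path finding
    // (attribute name assumed; set it right after creating optimizerConfig).
    int32_t optimizerSeed = 42;
    HANDLE_ERROR(cutensornetContractionOptimizerConfigSetAttribute(handle,
                                                                   optimizerConfig,
                                                                   CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_SEED,
                                                                   &optimizerSeed,
                                                                   sizeof(optimizerSeed)));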
It is also possible to bypass the cuTensorNet optimizer and import a pre-determined contraction path, as well as slicing information,
directly into the optimizer info object via cutensornetContractionOptimizerInfoSetAttribute(),
and then attach it to the network via cutensornetNetworkSetOptimizerInfo().
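For illustration only, a manual path import might look like the following sketch. It assumes the cutensornetContractionPath_t fields numContractions and data, the linear path format (pairs of positions in the list of remaining tensors, with each intermediate appended to the end, as in numpy.einsum_path), and that cutensornetNetworkSetOptimizerInfo() takes the handle, the network descriptor, and the optimizer info object; consult the API reference for the exact conventions.

    // Sketch: import a user-chosen contraction path instead of running the optimizer.
    // Pair convention assumed to follow the linear (numpy.einsum_path-like) format:
    // contract (A,B), then (C,D), then the two intermediates.
    cutensornetNodePair_t pathPairs[] = { {0, 1}, {0, 1}, {0, 1} };
    cutensornetContractionPath_t path;
    path.numContractions = 3; // numInputs - 1 pairwise contractions
    path.data = pathPairs;

    HANDLE_ERROR(cutensornetContractionOptimizerInfoSetAttribute(handle,
                                                                 optimizerInfo,
                                                                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
                                                                 &path,
                                                                 sizeof(path)));

    // Attach the manually populated optimizer info to the network (assumed argument order).
    HANDLE_ERROR(cutensornetNetworkSetOptimizerInfo(handle, networkDesc, optimizerInfo));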
Create workspace descriptor and allocate workspace memory#
Next, we create a workspace descriptor, compute the workspace sizes, and query the minimum workspace size needed to contract the network. We then allocate device memory for the workspace and set this in the workspace descriptor. The workspace descriptor will be provided to the contraction preparation and computation calls.
    /*******************************
     * Create workspace descriptor, allocate workspace, and set it.
     *******************************/

    cutensornetWorkspaceDescriptor_t workDesc;
    HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc));

    int64_t requiredWorkspaceSize = 0;
    HANDLE_ERROR(cutensornetWorkspaceComputeContractionSizes(handle,
                                                             networkDesc,
                                                             optimizerInfo,
                                                             workDesc));

    HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_SCRATCH,
                                                   &requiredWorkspaceSize));

    void* work = nullptr;
    HANDLE_CUDA_ERROR(cudaMalloc(&work, requiredWorkspaceSize));

    HANDLE_ERROR(cutensornetWorkspaceSetMemory(handle,
                                               workDesc,
                                               CUTENSORNET_MEMSPACE_DEVICE,
                                               CUTENSORNET_WORKSPACE_SCRATCH,
                                               work,
                                               requiredWorkspaceSize));

    if (verbose) printf("Allocated and set up the GPU workspace\n");
Contraction preparation and auto-tuning#
We prepare the tensor network contraction via cutensornetNetworkPrepareContraction(),
and set the tensors' data buffers and strides via cutensornetNetworkSetInputTensorMemory() and cutensornetNetworkSetOutputTensorMemory().
Optionally, we can auto-tune the contraction, via cutensornetNetworkAutotuneContraction(),
such that cuTENSOR selects the best kernel for each pairwise contraction.
This prepared network contraction can be reused for many (possibly different) data inputs, avoiding
the cost of initializing it redundantly.
    /**************************
     * Prepare the contraction.
     **************************/

    HANDLE_ERROR(cutensornetNetworkPrepareContraction(handle,
                                                      networkDesc,
                                                      workDesc));

    // set tensor's data buffers and strides
    for (int32_t t = 0; t < numInputs; ++t)
    {
        HANDLE_ERROR(cutensornetNetworkSetInputTensorMemory(handle,
                                                            networkDesc,
                                                            tensorIDs[t],
                                                            tensorData_d[t],
                                                            NULL));
    }
    HANDLE_ERROR(cutensornetNetworkSetOutputTensorMemory(handle,
                                                         networkDesc,
                                                         tensorData_d[numInputs],
                                                         NULL));

    /****************************************************************
     * Optional: Auto-tune the contraction to pick the fastest kernel
     * for each pairwise tensor contraction.
     ****************************************************************/
    cutensornetNetworkAutotunePreference_t autotunePref;
    HANDLE_ERROR(cutensornetCreateNetworkAutotunePreference(handle,
                                                            &autotunePref));

    const int numAutotuningIterations = 5; // may be 0
    HANDLE_ERROR(cutensornetNetworkAutotunePreferenceSetAttribute(handle,
                                                                  autotunePref,
                                                                  CUTENSORNET_NETWORK_AUTOTUNE_MAX_ITERATIONS,
                                                                  &numAutotuningIterations,
                                                                  sizeof(numAutotuningIterations)));

    // Modify the network again to find the best pair-wise contractions
    HANDLE_ERROR(cutensornetNetworkAutotuneContraction(handle,
                                                       networkDesc,
                                                       workDesc,
                                                       autotunePref,
                                                       stream));

    HANDLE_ERROR(cutensornetDestroyNetworkAutotunePreference(autotunePref));

    if (verbose) printf("Prepared the network contraction for cuTensorNet and optionally auto-tuned it\n");
Tensor network contraction execution#
Finally, we contract the tensor network as many times as needed, via cutensornetNetworkContract(),
possibly with different input data each time (rebinding the data buffers and strides via cutensornetNetworkSetInputTensorMemory()).
Tensor network slices, captured as a cutensornetSliceGroup_t object, are computed using the same prepared network contraction.
For convenience, NULL can be provided to the cutensornetNetworkContract() function instead of a slice group when
the goal is to contract all the slices in the network. We also clean up and free allocated resources.
    /****************************************
     * Execute the tensor network contraction
     ****************************************/

    // Create a cutensornetSliceGroup_t object from a range of slice IDs
    cutensornetSliceGroup_t sliceGroup{};
    HANDLE_ERROR(cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup));

    GPUTimer timer{stream};
    double minTimeCUTENSORNET = 1e100;
    const int numRuns = 3; // number of repeats to get stable performance results
    for (int i = 0; i < numRuns; ++i)
    {
        // reset the output tensor data
        HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_d[numInputs], tensorData_h[numInputs], tensorSizes[numInputs], cudaMemcpyHostToDevice));
        HANDLE_CUDA_ERROR(cudaDeviceSynchronize());

        /*
         * Contract all slices of the tensor network
         */
        timer.start();

        int32_t accumulateOutput = 0; // output tensor data will be overwritten
        HANDLE_ERROR(cutensornetNetworkContract(handle,
                                                networkDesc,
                                                accumulateOutput,
                                                workDesc,
                                                sliceGroup, // alternatively, NULL can also be used to contract over all slices instead of specifying a sliceGroup object
                                                stream));

        // Synchronize and measure best timing
        auto time = timer.seconds();
        minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
    }

    if (verbose) printf("Contracted the tensor network, each slice used the same prepared contraction\n");

    // Print the 1-norm of the output tensor (verification)
    HANDLE_CUDA_ERROR(cudaStreamSynchronize(stream));
    // restore the output tensor on Host
    HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_h[numInputs], tensorData_d[numInputs], tensorSizes[numInputs], cudaMemcpyDeviceToHost));
    double norm1 = 0.0;
    for (int64_t i = 0; i < tensorElements[numInputs]; ++i)
    {
        norm1 += std::abs(tensorData_h[numInputs][i]);
    }
    if (verbose) printf("Computed the 1-norm of the output tensor: %e\n", norm1);

    /*************************/

    // Query the total Flop count for the tensor network contraction
    double flops{0.0};
    HANDLE_ERROR(cutensornetContractionOptimizerInfoGetAttribute(handle,
                                                                 optimizerInfo,
                                                                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
                                                                 &flops,
                                                                 sizeof(flops)));

    if (verbose)
    {
        printf("Number of tensor network slices = %ld\n", numSlices);
        printf("Tensor network contraction time (ms) = %.3f\n", minTimeCUTENSORNET * 1000.f);
    }

    /****************
     * Free resources
     ****************/

    // Free cuTensorNet resources
    HANDLE_ERROR(cutensornetDestroySliceGroup(sliceGroup));
    HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
    HANDLE_ERROR(cutensornetDestroyContractionOptimizerInfo(optimizerInfo));
    HANDLE_ERROR(cutensornetDestroyContractionOptimizerConfig(optimizerConfig));
    HANDLE_ERROR(cutensornetDestroyNetwork(networkDesc));
    HANDLE_ERROR(cutensornetDestroy(handle));

    HANDLE_CUDA_ERROR(cudaStreamDestroy(stream));

    // Free Host and GPU memory resources
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        if (tensorData_h[t]) free(tensorData_h[t]);
        if (tensorData_d[t]) cudaFree(tensorData_d[t]);
    }
    if (work) cudaFree(work);

    if (verbose) printf("Freed resources and exited\n");

    return 0;
}
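As described above, the prepared contraction can be reused with new operand data without re-planning or re-preparation. The following is a minimal sketch only (it would be placed before the cleanup code), where newA_d is a hypothetical device buffer with the same extents as tensor A:

    // Sketch: rebind tensor A (tensorIDs[0]) to a different device buffer and contract again;
    // the prepared network, contraction path, and workspace are reused as-is.
    HANDLE_ERROR(cutensornetNetworkSetInputTensorMemory(handle,
                                                        networkDesc,
                                                        tensorIDs[0],
                                                        newA_d, // hypothetical buffer, same extents as tensor A
                                                        NULL)); // default (dense) strides
    int32_t accumulate = 0; // overwrite the output tensor
    HANDLE_ERROR(cutensornetNetworkContract(handle,
                                            networkDesc,
                                            accumulate,
                                            workDesc,
                                            NULL, // NULL => contract all slices
                                            stream));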
Recall that the full sample code can be found in the NVIDIA/cuQuantum repository (here).