Caching/Reusing constant intermediate tensors
The following code example illustrates how to activate caching of constant intermediate tensors in order to greatly accelerate repeated execution of a tensor network contraction in which only some of the input tensors change their values between iterations. The full code can be found in the NVIDIA/cuQuantum repository.
Headers and data types
#include <stdlib.h>
#include <stdio.h>
#include <string.h> // memset

#include <unordered_map>
#include <vector>
#include <cassert>
#include <cmath> // std::abs

#include <cuda_runtime.h>
#include <cutensornet.h>

#define HANDLE_ERROR(x)                                                                     \
    do {                                                                                    \
        const auto err = x;                                                                 \
        if (err != CUTENSORNET_STATUS_SUCCESS)                                              \
        {                                                                                   \
            printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__);     \
            fflush(stdout);                                                                 \
            exit(EXIT_FAILURE);                                                             \
        }                                                                                   \
    } while (0)

#define HANDLE_CUDA_ERROR(x)                                                                \
    do {                                                                                    \
        const auto err = x;                                                                 \
        if (err != cudaSuccess)                                                             \
        {                                                                                   \
            printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__);       \
            fflush(stdout);                                                                 \
            exit(EXIT_FAILURE);                                                             \
        }                                                                                   \
    } while (0)

// Usage: DEV_ATTR(cudaDevAttrClockRate, deviceId)
#define DEV_ATTR(ENUMCONST, DID)                                      \
    ({ int v;                                                         \
       HANDLE_CUDA_ERROR(cudaDeviceGetAttribute(&v, ENUMCONST, DID)); \
       v; })

struct GPUTimer
{
    GPUTimer(cudaStream_t stream) : stream_(stream)
    {
        HANDLE_CUDA_ERROR(cudaEventCreate(&start_));
        HANDLE_CUDA_ERROR(cudaEventCreate(&stop_));
    }

    ~GPUTimer()
    {
        HANDLE_CUDA_ERROR(cudaEventDestroy(start_));
        HANDLE_CUDA_ERROR(cudaEventDestroy(stop_));
    }

    void start() { HANDLE_CUDA_ERROR(cudaEventRecord(start_, stream_)); }

    float seconds()
    {
        HANDLE_CUDA_ERROR(cudaEventRecord(stop_, stream_));
        HANDLE_CUDA_ERROR(cudaEventSynchronize(stop_));
        float time;
        HANDLE_CUDA_ERROR(cudaEventElapsedTime(&time, start_, stop_));
        return time * 1e-3;
    }

private:
    cudaEvent_t start_, stop_;
    cudaStream_t stream_;
};

int main()
{
    static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

    bool verbose = true;

    // Check cuTensorNet version
    const size_t cuTensornetVersion = cutensornetGetVersion();
    if (verbose) printf("cuTensorNet version: %ld\n", cuTensornetVersion);

    // Set GPU device
    int numDevices{0};
    HANDLE_CUDA_ERROR(cudaGetDeviceCount(&numDevices));
    const int deviceId = 0;
    HANDLE_CUDA_ERROR(cudaSetDevice(deviceId));
    cudaDeviceProp prop;
    HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId));

    if (verbose)
    {
        printf("===== device info ======\n");
        printf("GPU-local-id:%d\n", deviceId);
        printf("GPU-name:%s\n", prop.name);
        printf("GPU-clock:%d\n", DEV_ATTR(cudaDevAttrClockRate, deviceId));
        printf("GPU-memoryClock:%d\n", DEV_ATTR(cudaDevAttrMemoryClockRate, deviceId));
        printf("GPU-nSM:%d\n", prop.multiProcessorCount);
        printf("GPU-major:%d\n", prop.major);
        printf("GPU-minor:%d\n", prop.minor);
        printf("========================\n");
    }

    typedef float floatType;
    cudaDataType_t typeData = CUDA_R_32F;
    cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;

    if (verbose) printf("Included headers and defined data types\n");
Define tensor network and tensor sizes
Next, we define the structure of the tensor network (i.e., the modes of the tensors, their extents, and their connectivity).
    /**********************
     * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,f,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m}
     * We will execute the contraction a few times, assuming all input tensors are constant except F.
     **********************/

    constexpr int32_t numInputs = 6;

    // Create vectors of tensor modes
    std::vector<std::vector<int32_t>> tensorModes{ // for input tensors & output tensor
        // input tensors
        {'a', 'b', 'c', 'd'}, // tensor A
        {'b', 'c', 'd', 'e'}, // tensor B
        {'e', 'f', 'g', 'h'}, // tensor C
        {'g', 'h', 'i', 'j'}, // tensor D
        {'i', 'j', 'k', 'l'}, // tensor E
        {'k', 'l', 'm'},      // tensor F
        // output tensor
        {'a', 'm'}};          // tensor O

    // Set mode extents
    int64_t sameExtent = 36; // setting the same extent for simplicity; in principle, extents can differ
    std::unordered_map<int32_t, int64_t> extent;
    for (auto& vec : tensorModes)
    {
        for (auto& mode : vec)
        {
            extent[mode] = sameExtent;
        }
    }

    // Create a vector of extents for each tensor
    std::vector<std::vector<int64_t>> tensorExtents;
    tensorExtents.resize(numInputs + 1); // hold inputs + output tensors
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        for (auto& mode : tensorModes[t]) tensorExtents[t].push_back(extent[mode]);
    }

    if (verbose) printf("Defined tensor network, modes, and extents\n");
Allocate memory, initialize data, initialize cuTensorNet handle
Next, we allocate memory for the tensor network operands and initialize them to random values. Then, we initialize the cuTensorNet library via cutensornetCreate().
    /**********************
     * Allocating data
     **********************/

    std::vector<size_t> tensorElements(numInputs + 1);              // hold inputs + output tensors
    std::vector<size_t> tensorSizes(numInputs + 1);                 // hold inputs + output tensors
    std::vector<std::vector<int64_t>> tensorStrides(numInputs + 1); // hold inputs + output tensors
    size_t totalSize = 0;
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        size_t numElements = 1;
        for (auto& mode : tensorModes[t])
        {
            tensorStrides[t].push_back(numElements);
            numElements *= extent[mode];
        }
        tensorElements[t] = numElements;

        tensorSizes[t] = sizeof(floatType) * numElements;
        totalSize += tensorSizes[t];
    }
    if (verbose) printf("Total GPU memory used for tensor storage: %.2f GiB\n", (totalSize) / 1024. / 1024. / 1024);

    void* tensorData_d[numInputs + 1]; // for input tensors & output tensor
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        HANDLE_CUDA_ERROR(cudaMalloc((void**)&tensorData_d[t], tensorSizes[t]));
    }

    floatType* tensorData_h[numInputs + 1]; // for input tensors & output tensor
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        tensorData_h[t] = (floatType*)malloc(tensorSizes[t]);
        if (tensorData_h[t] == NULL)
        {
            printf("Error: Host memory allocation failed!\n");
            return -1;
        }
    }

    /*******************
     * Initialize data
     *******************/

    // init output tensor to all 0s
    memset(tensorData_h[numInputs], 0, tensorSizes[numInputs]);
    // init input tensors to random values
    for (int32_t t = 0; t < numInputs; ++t)
    {
        for (size_t e = 0; e < tensorElements[t]; ++e) tensorData_h[t][e] = ((floatType)rand()) / RAND_MAX;
    }
    // copy input data to device buffers
    for (int32_t t = 0; t < numInputs; ++t)
    {
        HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_d[t], tensorData_h[t], tensorSizes[t], cudaMemcpyHostToDevice));
    }

    /*************************
     * cuTensorNet
     *************************/

    cudaStream_t stream;
    HANDLE_CUDA_ERROR(cudaStreamCreate(&stream));

    cutensornetHandle_t handle;
    HANDLE_ERROR(cutensornetCreate(&handle));

    if (verbose) printf("Allocated GPU memory for data, initialized data, and created library handle\n");
Mark constant tensors and create the network descriptor
Next, we specify which input tensors are constant, and create the network descriptor with the desired tensor modes, extents, strides, and qualifiers (e.g., constant) as well as the data and compute types. Note that the created library context will be associated with the currently active GPU.
    /*******************************
     * Set constant input tensors
     *******************************/

    // specify which input tensors are constant
    std::vector<cutensornetTensorQualifiers_t> qualifiersIn(numInputs, cutensornetTensorQualifiers_t{0, 0, 0});
    for (int i = 0; i < numInputs; ++i)
    {
        qualifiersIn[i].isConstant = i < (numInputs - 1) ? 1 : 0;
    }

    /*******************************
     * Create Network
     *******************************/

    // Set up tensor network
    cutensornetNetworkDescriptor_t networkDesc;
    HANDLE_ERROR(cutensornetCreateNetwork(handle, &networkDesc));

    int64_t tensorIDs[numInputs]; // for input tensors
    // attach the input tensors to the network
    for (int32_t t = 0; t < numInputs; ++t)
    {
        HANDLE_ERROR(cutensornetNetworkAppendTensor(handle,
                                                    networkDesc,
                                                    tensorModes[t].size(),
                                                    tensorExtents[t].data(),
                                                    tensorModes[t].data(),
                                                    &qualifiersIn[t],
                                                    typeData,
                                                    &tensorIDs[t]));
    }

    // set the output tensor
    HANDLE_ERROR(cutensornetNetworkSetOutputTensor(handle,
                                                   networkDesc,
                                                   tensorModes[numInputs].size(),
                                                   tensorModes[numInputs].data(),
                                                   typeData));

    // set the network compute type
    HANDLE_ERROR(cutensornetNetworkSetAttribute(handle,
                                                networkDesc,
                                                CUTENSORNET_NETWORK_COMPUTE_TYPE,
                                                &typeCompute,
                                                sizeof(typeCompute)));
    if (verbose) printf("Initialized the cuTensorNet library and created a tensor network\n");
Contraction order and slicing
In this example, we illustrate the use of a predetermined contraction path, which we set on the optimizer info object via cutensornetContractionOptimizerInfoSetAttribute(). We then attach the constructed optimizer info object to the network via cutensornetNetworkSetOptimizerInfo().
    /*******************************
     * Choose workspace limit based on available resources.
     *******************************/

    size_t freeMem, totalMem;
    HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeMem, &totalMem));
    uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
    if (verbose) printf("Workspace limit = %lu\n", workspaceLimit);

    /*******************************
     * Set contraction order
     *******************************/

    // Create contraction optimizer info
    cutensornetContractionOptimizerInfo_t optimizerInfo;
    HANDLE_ERROR(cutensornetCreateContractionOptimizerInfo(handle, networkDesc, &optimizerInfo));

    // set a predetermined contraction path; the flat int32_t vector is reinterpreted below
    // as consecutive cutensornetNodePair_t {first, second} pairs: (0,1), (0,4), (0,3), (0,2), (0,1)
    std::vector<int32_t> path{0, 1, 0, 4, 0, 3, 0, 2, 0, 1};
    const auto numContractions = numInputs - 1;
    cutensornetContractionPath_t contPath;
    contPath.data = reinterpret_cast<cutensornetNodePair_t*>(const_cast<int32_t*>(path.data()));
    contPath.numContractions = numContractions;

    // provide user-specified contPath
    HANDLE_ERROR(cutensornetContractionOptimizerInfoSetAttribute(handle,
                                                                 optimizerInfo,
                                                                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
                                                                 &contPath,
                                                                 sizeof(contPath)));

    // Attach the optimizer info to the network
    HANDLE_ERROR(cutensornetNetworkSetOptimizerInfo(handle,
                                                    networkDesc,
                                                    optimizerInfo));
    int64_t numSlices = 1;

    if (verbose) printf("Set predetermined contraction path into cuTensorNet optimizer\n");
Create workspace descriptor and allocate workspace memory
Next, we create a workspace descriptor, compute the workspace sizes, and query the minimum workspace size needed to contract the network. To activate intermediate tensor reuse, we need to provide a CACHE workspace that will be used across multiple network contractions. Thus, we query the sizes of, and allocate device memory for, both kinds of workspaces (CUTENSORNET_WORKSPACE_SCRATCH and CUTENSORNET_WORKSPACE_CACHE) and set them in the workspace descriptor. The workspace descriptor will be provided to the contraction preparation and execution APIs.
    /*******************************
     * Create workspace descriptor, allocate workspace, and set it.
     *******************************/

    cutensornetWorkspaceDescriptor_t workDesc;
    HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc));

    // set SCRATCH workspace, which will be used during each network contraction operation but is not needed afterwards
    int64_t requiredWorkspaceSizeScratch = 0;
    HANDLE_ERROR(cutensornetWorkspaceComputeContractionSizes(handle, networkDesc, optimizerInfo, workDesc));

    HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_SCRATCH,
                                                   &requiredWorkspaceSizeScratch));

    void* workScratch = nullptr;
    HANDLE_CUDA_ERROR(cudaMalloc(&workScratch, requiredWorkspaceSizeScratch));

    HANDLE_ERROR(cutensornetWorkspaceSetMemory(handle,
                                               workDesc,
                                               CUTENSORNET_MEMSPACE_DEVICE,
                                               CUTENSORNET_WORKSPACE_SCRATCH,
                                               workScratch,
                                               requiredWorkspaceSizeScratch));

    // set CACHE workspace, which will be used across network contraction operations
    int64_t requiredWorkspaceSizeCache = 0;
    HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_CACHE,
                                                   &requiredWorkspaceSizeCache));

    void* workCache = nullptr;
    HANDLE_CUDA_ERROR(cudaMalloc(&workCache, requiredWorkspaceSizeCache));

    HANDLE_ERROR(cutensornetWorkspaceSetMemory(handle,
                                               workDesc,
                                               CUTENSORNET_MEMSPACE_DEVICE,
                                               CUTENSORNET_WORKSPACE_CACHE,
                                               workCache,
                                               requiredWorkspaceSizeCache));

    if (verbose) printf("Allocated and set up the GPU workspace\n");
Note that it is possible to skip the steps of creating a workspace descriptor and explicitly handling workspace memory by setting a device memory handler, in which case cuTensorNet will implicitly handle workspace memory by allocating/deallocating it from the provided memory pool. See Memory management API for details.
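As a minimal sketch of that alternative (not part of the sample above), the snippet below registers a device memory handler backed by the stream-ordered CUDA allocator (cudaMallocAsync/cudaFreeAsync, available since CUDA 11.2). The callback names myDeviceAlloc/myDeviceFree and the pool label are illustrative choices, the error handling is deliberately simplified, and std::strncpy comes from <cstring>.
// Hypothetical helpers: allocate/free device memory in stream order.
// Returning 0 signals success to cuTensorNet; non-zero signals failure.
int myDeviceAlloc(void* ctx, void** ptr, size_t size, cudaStream_t stream)
{
    (void)ctx;
    return (cudaMallocAsync(ptr, size, stream) == cudaSuccess) ? 0 : 1;
}

int myDeviceFree(void* ctx, void* ptr, size_t size, cudaStream_t stream)
{
    (void)ctx; (void)size;
    return (cudaFreeAsync(ptr, stream) == cudaSuccess) ? 0 : 1;
}

// ... after cutensornetCreate(&handle):
cutensornetDeviceMemHandler_t memHandler{};
memHandler.ctx          = nullptr;       // no user context needed for this sketch
memHandler.device_alloc = myDeviceAlloc;
memHandler.device_free  = myDeviceFree;
std::strncpy(memHandler.name, "cudaMallocAsyncPool", CUTENSORNET_ALLOCATOR_NAME_LEN - 1);
HANDLE_ERROR(cutensornetSetDeviceMemHandler(handle, &memHandler));
// With the handler in place, cuTensorNet can draw workspace memory from this pool implicitly,
// making the explicit cudaMalloc/cutensornetWorkspaceSetMemory calls above optional
// (see the Memory management API section for the exact rules).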
Contraction preparation and auto-tuning
We prepare the tensor network contraction via cutensornetNetworkPrepareContraction(), and set the tensors' data buffers and strides via cutensornetNetworkSetInputTensorMemory() and cutensornetNetworkSetOutputTensorMemory(). Optionally, we can auto-tune the contraction via cutensornetNetworkAutotuneContraction(), such that cuTENSOR selects the best kernel for each pairwise contraction. The prepared network contraction can be reused for many (possibly different) data inputs, avoiding the cost of redundant initialization.
    /*******************************
     * Prepare the contraction.
     *******************************/

    HANDLE_ERROR(cutensornetNetworkPrepareContraction(handle,
                                                      networkDesc,
                                                      workDesc));

    // set the tensors' data buffers and strides
    for (int32_t t = 0; t < numInputs; ++t)
    {
        HANDLE_ERROR(cutensornetNetworkSetInputTensorMemory(handle,
                                                            networkDesc,
                                                            tensorIDs[t],
                                                            tensorData_d[t],
                                                            NULL));
    }
    HANDLE_ERROR(cutensornetNetworkSetOutputTensorMemory(handle,
                                                         networkDesc,
                                                         tensorData_d[numInputs],
                                                         NULL));
    /*******************************
     * Optional: Auto-tune the contraction to pick the fastest kernel
     *           for each pairwise tensor contraction.
     *******************************/
    cutensornetNetworkAutotunePreference_t autotunePref;
    HANDLE_ERROR(cutensornetCreateNetworkAutotunePreference(handle,
                                                            &autotunePref));

    const int numAutotuningIterations = 5; // may be 0
    HANDLE_ERROR(cutensornetNetworkAutotunePreferenceSetAttribute(handle,
                                                                  autotunePref,
                                                                  CUTENSORNET_NETWORK_AUTOTUNE_MAX_ITERATIONS,
                                                                  &numAutotuningIterations,
                                                                  sizeof(numAutotuningIterations)));

    // Contract the network repeatedly to find the best kernel for each pair-wise contraction
    HANDLE_ERROR(cutensornetNetworkAutotuneContraction(handle,
                                                       networkDesc,
                                                       workDesc,
                                                       autotunePref,
                                                       stream));

    HANDLE_ERROR(cutensornetDestroyNetworkAutotunePreference(autotunePref));

    if (verbose) printf("Prepared the network contraction for cuTensorNet and optionally auto-tuned it\n");
Tensor network contraction execution
Finally, we contract the tensor network as many times as needed, possibly with different input data each time. Note that the first network contraction call will use the provided CACHE workspace to store the constant intermediate tensors. Subsequent network contractions will reuse the cached data, greatly reducing computation time.
    /**********************
     * Execute the tensor network contraction
     **********************/

    // Create a cutensornetSliceGroup_t object from a range of slice IDs
    cutensornetSliceGroup_t sliceGroup{};
    HANDLE_ERROR(cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup));

    GPUTimer timer{stream};
    double minTimeCUTENSORNET = 1e100;
    double firstTimeCUTENSORNET = 1e100;
    const int numRuns = 3; // number of repeats to get stable performance results
    for (int i = 0; i < numRuns; ++i)
    {
        // restore the output tensor on GPU
        HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_d[numInputs], tensorData_h[numInputs], tensorSizes[numInputs], cudaMemcpyHostToDevice));
        HANDLE_CUDA_ERROR(cudaDeviceSynchronize());

        /*
         * Contract all slices of the tensor network
         */
        timer.start();

        int32_t accumulateOutput = 0; // output tensor data will be overwritten
        HANDLE_ERROR(cutensornetNetworkContract(handle,
                                                networkDesc,
                                                accumulateOutput,
                                                workDesc,
                                                sliceGroup, // alternatively, NULL can be used to contract over
                                                            // all slices instead of specifying a sliceGroup object
                                                stream));

        // Synchronize and measure best timing
        auto time = timer.seconds();
        if (i == 0) firstTimeCUTENSORNET = time;
        minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
    }

    if (verbose) printf("Contracted the tensor network, each slice used the same prepared contraction\n");

    // Print the 1-norm of the output tensor (verification)
    HANDLE_CUDA_ERROR(cudaStreamSynchronize(stream));
    // restore the output tensor on Host
    HANDLE_CUDA_ERROR(cudaMemcpy(tensorData_h[numInputs], tensorData_d[numInputs], tensorSizes[numInputs], cudaMemcpyDeviceToHost));
    double norm1 = 0.0;
    for (size_t i = 0; i < tensorElements[numInputs]; ++i)
    {
        norm1 += std::abs(tensorData_h[numInputs][i]);
    }
    if (verbose) printf("Computed the 1-norm of the output tensor: %e\n", norm1);

    /*************************/

    // Query the total Flop count for the tensor network contraction
    double flops{0.0};
    HANDLE_ERROR(cutensornetContractionOptimizerInfoGetAttribute(handle,
                                                                 optimizerInfo,
                                                                 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
                                                                 &flops,
                                                                 sizeof(flops)));

    if (verbose)
    {
        printf("Number of tensor network slices = %ld\n", numSlices);
        printf("Network contraction flop cost = %e\n", flops);
        printf("Tensor network contraction time (ms):\n");
        printf("\tfirst run (intermediate tensors get cached) = %.3f\n", firstTimeCUTENSORNET * 1000.f);
        printf("\tsubsequent run (cache reused) = %.3f\n", minTimeCUTENSORNET * 1000.f);
    }
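In a real application, the non-constant tensor F would receive new data between contractions. The loop below is a hypothetical sketch (not part of the repository sample) showing how only F's device buffer is refreshed before calling cutensornetNetworkContract() again, so the cached constant intermediates are reused; it relies only on the buffers and handles defined above. If the values of any constant tensor ever change, the cached intermediates must be invalidated before the network is contracted again.
    // Hypothetical update loop: only the non-constant tensor F (index numInputs - 1) changes.
    for (int iter = 0; iter < 10; ++iter)
    {
        // produce new values for F on the host (placeholder for application data)
        for (size_t e = 0; e < tensorElements[numInputs - 1]; ++e)
            tensorData_h[numInputs - 1][e] = ((floatType)rand()) / RAND_MAX;

        // refresh only F's device buffer; the constant tensors A..E stay untouched
        HANDLE_CUDA_ERROR(cudaMemcpyAsync(tensorData_d[numInputs - 1], tensorData_h[numInputs - 1],
                                          tensorSizes[numInputs - 1], cudaMemcpyHostToDevice, stream));

        // re-contract: constant intermediates are read from the CACHE workspace
        HANDLE_ERROR(cutensornetNetworkContract(handle, networkDesc, /*accumulateOutput=*/0,
                                                workDesc, sliceGroup, stream));
    }
    HANDLE_CUDA_ERROR(cudaStreamSynchronize(stream));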
Free resources
After the computation, we need to free up all resources.
    /***************
     * Free resources
     ****************/

    // Free cuTensorNet resources
    HANDLE_ERROR(cutensornetDestroySliceGroup(sliceGroup));
    HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
    HANDLE_ERROR(cutensornetDestroyContractionOptimizerInfo(optimizerInfo));
    HANDLE_ERROR(cutensornetDestroyNetwork(networkDesc));
    HANDLE_ERROR(cutensornetDestroy(handle));

    HANDLE_CUDA_ERROR(cudaStreamDestroy(stream));

    // Free Host and GPU memory resources
    for (int32_t t = 0; t < numInputs + 1; ++t)
    {
        if (tensorData_h[t]) free(tensorData_h[t]);
        if (tensorData_d[t]) cudaFree(tensorData_d[t]);
    }
    // Free GPU memory resources
    if (workScratch) cudaFree(workScratch);
    if (workCache) cudaFree(workCache);
    if (verbose) printf("Freed resources and exited\n");

    return 0;
}