Caching/Reusing Constant Intermediate Tensors¶
The following code example illustrates how to activate caching of constant intermediate tensors in order to greatly accelerate the repeated execution of a tensor network contraction where only some of the input tensors change their values in each iteration. The full code can be found in the NVIDIA/cuQuantum repository (here).
Headers and data types¶
8#include <stdlib.h>
9#include <stdio.h>
10
11#include <unordered_map>
12#include <vector>
13#include <cassert>
14
15#include <cuda_runtime.h>
16#include <cutensornet.h>
17
18
19#define HANDLE_ERROR(x) \
20{ const auto err = x; \
21 if( err != CUTENSORNET_STATUS_SUCCESS ) \
22 { printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
23 fflush(stdout); \
24 } \
25};
26
27#define HANDLE_CUDA_ERROR(x) \
28{ const auto err = x; \
29 if( err != cudaSuccess ) \
30 { printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
31 fflush(stdout); \
32 } \
33};
34
35
36struct GPUTimer
37{
38 GPUTimer(cudaStream_t stream): stream_(stream)
39 {
40 cudaEventCreate(&start_);
41 cudaEventCreate(&stop_);
42 }
43
44 ~GPUTimer()
45 {
46 cudaEventDestroy(start_);
47 cudaEventDestroy(stop_);
48 }
49
50 void start()
51 {
52 cudaEventRecord(start_, stream_);
53 }
54
55 float seconds()
56 {
57 cudaEventRecord(stop_, stream_);
58 cudaEventSynchronize(stop_);
59 float time;
60 cudaEventElapsedTime(&time, start_, stop_);
61 return time * 1e-3;
62 }
63
64 private:
65 cudaEvent_t start_, stop_;
66 cudaStream_t stream_;
67};
68
69
70int main()
71{
72 static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");
73
74 bool verbose = true;
75
76 // Check cuTensorNet version
77 const size_t cuTensornetVersion = cutensornetGetVersion();
78 if(verbose)
79 printf("cuTensorNet version: %ld\n", cuTensornetVersion);
80
81 // Set GPU device
82 int numDevices {0};
83 HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) );
84 const int deviceId = 0;
85 HANDLE_CUDA_ERROR( cudaSetDevice(deviceId) );
86 cudaDeviceProp prop;
87 HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );
88
89 if(verbose) {
90 printf("===== device info ======\n");
91 printf("GPU-name:%s\n", prop.name);
92 printf("GPU-clock:%d\n", prop.clockRate);
93 printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
94 printf("GPU-nSM:%d\n", prop.multiProcessorCount);
95 printf("GPU-major:%d\n", prop.major);
96 printf("GPU-minor:%d\n", prop.minor);
97 printf("========================\n");
98 }
99
100 typedef float floatType;
101 cudaDataType_t typeData = CUDA_R_32F;
102 cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;
103
104 if(verbose)
105 printf("Included headers and defined data types\n");
Define tensor network and tensor sizes¶
Next, we define the structure of the tensor network (i.e., the modes of the tensors, their extents, and their connectivity).
109 /**********************
110 * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,f,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m}
111 * We will execute the contraction a few times assuming all input tensors being constant except F.
112 **********************/
113
114 constexpr int32_t numInputs = 6;
115
116 // Create vectors of tensor modes
117 std::vector<std::vector<int32_t>> modesVec {
118 {'a','b','c','d'},
119 {'b','c','d','e'},
120 {'e','f','g','h'},
121 {'g','h','i','j'},
122 {'i','j','k','l'},
123 {'k','l','m'},
124 {'a','m'}
125 };
126
127 // Set mode extents
128 int64_t sameExtent = 36; // setting same extent for simplicity. In principle extents can differ.
129 std::unordered_map<int32_t, int64_t> extent;
130 for (auto &vec: modesVec)
131 {
132 for (auto &mode: vec)
133 {
134 extent[mode] = sameExtent;
135 }
136 }
137
138 // Create a vector of extents for each tensor
139 std::vector<std::vector<int64_t>> extentVec;
140 extentVec.resize(numInputs+1); // hold inputs + output tensors
141 for (int i = 0; i < numInputs+1; ++i)
142 {
143 for (auto mode : modesVec[i])
144 extentVec[i].push_back(extent[mode]);
145 }
146
147 if(verbose)
148 printf("Defined tensor network, modes, and extents\n");
Allocate memory, initialize data, initialize cuTensorNet handle¶
Next, we allocate memory for the tensor network operands and initialize them to random values.
Then, we initialize the cuTensorNet library via cutensornetCreate()
.
151 /**********************
152 * Allocating data
153 **********************/
154
155 std::vector<size_t> elementsVec;
156 elementsVec.resize(numInputs+1); // hold inputs + output tensors
157 for (int i = 0; i < numInputs+1; ++i)
158 {
159 elementsVec[i] = 1;
160 for (auto mode : modesVec[i])
161 elementsVec[i] *= extent[mode];
162 }
163
164 size_t totalSize = 0;
165 std::vector<size_t> sizeVec;
166 sizeVec.resize(numInputs+1); // hold inputs + output tensors
167 for (int i = 0; i < numInputs+1; ++i)
168 {
169 sizeVec[i] = sizeof(floatType) * elementsVec[i];
170 totalSize += sizeVec[i];
171 }
172 if(verbose)
173 printf("Total GPU memory used for tensor storage: %.2f GiB\n",
174 (totalSize) / 1024. /1024. / 1024);
175
176 void* rawDataIn_d[numInputs];
177 void* O_d;
178 for (int i = 0; i < numInputs; ++i)
179 {
180 HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[i], sizeVec[i]) );
181 }
182 HANDLE_CUDA_ERROR( cudaMalloc((void**) &O_d, sizeVec[numInputs]));
183
184 floatType* rawDataIn_h[numInputs];
185 for (int i = 0; i < numInputs; ++i)
186 {
187 rawDataIn_h[i] = (floatType*) malloc(sizeof(floatType) * elementsVec[i]);
188 if (rawDataIn_h[i] == NULL)
189 {
190 printf("Error: Host memory allocation failed!\n");
191 return -1;
192 }
193 }
194 floatType *O_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
195 if (O_h == NULL)
196 {
197 printf("Error: Host memory allocation failed!\n");
198 return -1;
199 }
200
201 /*******************
202 * Initialize data
203 *******************/
204
205 memset(O_h, 0, sizeof(floatType) * elementsVec[numInputs]);
206 for (int i = 0; i < numInputs; ++i)
207 {
208 for (size_t e = 0; e < elementsVec[i]; ++e)
209 rawDataIn_h[i][e] = ((floatType) rand()) / RAND_MAX;
210 }
211
212 for (int i = 0; i < numInputs; ++i)
213 {
214 HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[i], rawDataIn_h[i], sizeVec[i], cudaMemcpyHostToDevice) );
215 }
216
217 if(verbose)
218 printf("Allocated GPU memory for data, initialize data, and create library handle\n");
219
220 /*************************
221 * cuTensorNet
222 *************************/
223
224 cudaStream_t stream;
225 cudaStreamCreate(&stream);
226
227 cutensornetHandle_t handle;
228 HANDLE_ERROR( cutensornetCreate(&handle) );
Mark constant tensors and create the network descriptor¶
Next, we specify which input tensors are constant, and create the network descriptor with the desired tensor modes, extents, strides, and qualifiers (e.g., constant) as well as the data and compute types. Note that the created library context will be associated with the currently active GPU.
231 /*******************************
232 * Set constant input tensors
233 *******************************/
234
235 // specify which input tensors are constant
236 std::vector<cutensornetTensorQualifiers_t> qualifiersIn;
237 qualifiersIn.resize(numInputs);
238 for (int i = 0; i < numInputs; ++i)
239 {
240 if (i < 5)
241 qualifiersIn[i].isConstant = 1;
242 else
243 qualifiersIn[i].isConstant = 0;
244 }
245
246 /*******************************
247 * Create Network Descriptor
248 *******************************/
249
250 int32_t* modesIn[numInputs];
251 int32_t numModesIn[numInputs];
252 int64_t* extentsIn[numInputs];
253 int64_t* stridesIn[numInputs];
254
255 for (int i = 0; i < numInputs; ++i)
256 {
257 modesIn[i] = modesVec[i].data();
258 numModesIn[i] = modesVec[i].size();
259 extentsIn[i] = extentVec[i].data();
260 stridesIn[i] = NULL; // strides are optional; if no stride is provided, cuTensorNet assumes a generalized column-major data layout
261 }
262
263 // Set up tensor network
264 cutensornetNetworkDescriptor_t descNet;
265 HANDLE_ERROR( cutensornetCreateNetworkDescriptor(handle,
266 numInputs, numModesIn, extentsIn, stridesIn, modesIn, qualifiersIn.data(),
267 modesVec[numInputs].size(), extentVec[numInputs].data(), /*stridesOut = */NULL, modesVec[numInputs].data(),
268 typeData, typeCompute,
269 &descNet) );
270
271 if(verbose)
272 printf("Initialized the cuTensorNet library and created a tensor network descriptor\n");
Contraction order and slicing¶
In this example, we illustrate using a predetermined contraction path and
setting it into the optimizer info object via cutensornetContractionOptimizerInfoSetAttribute()
.
275 /*******************************
276 * Choose workspace limit based on available resources.
277 *******************************/
278
279 size_t freeMem, totalMem;
280 HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) );
281 uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
282 if(verbose)
283 printf("Workspace limit = %lu\n", workspaceLimit);
284
285 /*******************************
286 * Set contraction order
287 *******************************/
288
289 // Create contraction optimizer info
290 cutensornetContractionOptimizerInfo_t optimizerInfo;
291 HANDLE_ERROR( cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo) );
292
293 // set a predetermined contraction path
294 std::vector<int32_t> path{0,1,0,4,0,3,0,2,0,1};
295 const auto numContractions = numInputs - 1;
296 cutensornetContractionPath_t contPath;
297 contPath.data = reinterpret_cast<cutensornetNodePair_t*>(const_cast<int32_t*>(path.data()));
298 contPath.numContractions = numContractions;
299
300 // provide user-specified contPath
301 HANDLE_ERROR( cutensornetContractionOptimizerInfoSetAttribute(
302 handle,
303 optimizerInfo,
304 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
305 &contPath,
306 sizeof(contPath)));
307 int64_t numSlices = 1;
308
309 if(verbose)
310 printf("Set predetermined contraction path into cuTensorNet optimizer\n");
Create workspace descriptor and allocate workspace memory¶
Next, we create a workspace descriptor, compute the workspace sizes, and query the minimum workspace size needed
to contract the network.
To activate intermediate tensor reuse, we need to provide CACHE workspace that will be used across multiple network contractions.
Thus, we query sizes and allocate device memory for both kinds of workspaces
(CUTENSORNET_WORKSPACE_SCRATCH
, and CUTENSORNET_WORKSPACE_CACHE
) and set these in the workspace descriptor.
The workspace descriptor will be provided to the contraction plan creation and contraction APIs.
313 /*******************************
314 * Create workspace descriptor, allocate workspace, and set it.
315 *******************************/
316
317 cutensornetWorkspaceDescriptor_t workDesc;
318 HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
319
320 // set SCRATCH workspace, which will be used during each network contraction operation, not needed afterwords
321 int64_t requiredWorkspaceSizeScratch = 0;
322 HANDLE_ERROR( cutensornetWorkspaceComputeContractionSizes(handle,
323 descNet,
324 optimizerInfo,
325 workDesc) );
326
327 HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
328 workDesc,
329 CUTENSORNET_WORKSIZE_PREF_MIN,
330 CUTENSORNET_MEMSPACE_DEVICE,
331 CUTENSORNET_WORKSPACE_SCRATCH,
332 &requiredWorkspaceSizeScratch) );
333
334 void* workScratch = nullptr;
335 HANDLE_CUDA_ERROR( cudaMalloc(&workScratch, requiredWorkspaceSizeScratch) );
336
337 HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
338 workDesc,
339 CUTENSORNET_MEMSPACE_DEVICE,
340 CUTENSORNET_WORKSPACE_SCRATCH,
341 workScratch,
342 requiredWorkspaceSizeScratch) );
343
344 // set CACHE workspace, which will be used across network contraction operations
345 int64_t requiredWorkspaceSizeCache = 0;
346 HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
347 workDesc,
348 CUTENSORNET_WORKSIZE_PREF_MIN,
349 CUTENSORNET_MEMSPACE_DEVICE,
350 CUTENSORNET_WORKSPACE_CACHE,
351 &requiredWorkspaceSizeCache) );
352
353 void* workCache = nullptr;
354 HANDLE_CUDA_ERROR( cudaMalloc(&workCache, requiredWorkspaceSizeCache) );
355
356 HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
357 workDesc,
358 CUTENSORNET_MEMSPACE_DEVICE,
359 CUTENSORNET_WORKSPACE_CACHE,
360 workCache,
361 requiredWorkspaceSizeCache) );
362
363 if(verbose)
364 printf("Allocated and set up the GPU workspace\n");
Note that, it is possible to skip the steps of creating a workspace descriptor and explicitly handling workspace memory by setting a device memory handler, in which case cuTensorNet will implicitly handle workspace memory by allocating/deallocating memory from the provided memory pool. See Memory Management API for details.
Contraction plan and auto-tune¶
We create a tensor network contraction plan holding all pairwise tensor contraction plans for cuTENSOR. Optionally, we can auto-tune the plan such that cuTENSOR selects the best kernel for each pairwise contraction. This contraction plan can be reused for many (possibly different) data inputs, avoiding the cost of initializing this plan redundantly.
367 /*******************************
368 * Initialize the pairwise contraction plan (for cuTENSOR).
369 *******************************/
370
371 cutensornetContractionPlan_t plan;
372 HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
373 descNet,
374 optimizerInfo,
375 workDesc,
376 &plan) );
377
378 /*******************************
379 * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel
380 * for each pairwise tensor contraction.
381 *******************************/
382 cutensornetContractionAutotunePreference_t autotunePref;
383 HANDLE_ERROR( cutensornetCreateContractionAutotunePreference(handle,
384 &autotunePref) );
385
386 const int numAutotuningIterations = 5; // may be 0
387 HANDLE_ERROR( cutensornetContractionAutotunePreferenceSetAttribute(
388 handle,
389 autotunePref,
390 CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
391 &numAutotuningIterations,
392 sizeof(numAutotuningIterations)) );
393
394 // Modify the plan again to find the best pair-wise contractions
395 HANDLE_ERROR( cutensornetContractionAutotune(handle,
396 plan,
397 rawDataIn_d,
398 O_d,
399 workDesc,
400 autotunePref,
401 stream) );
402
403 HANDLE_ERROR( cutensornetDestroyContractionAutotunePreference(autotunePref) );
404
405 if(verbose)
406 printf("Created a contraction plan for cuTensorNet and optionally auto-tuned it\n");
Tensor network contraction execution¶
Finally, we contract the tensor network as many times as needed, possibly with different input each time. Note that the first network contraction call will utilize the provided CACHE workspace to store the constant intermediate tensors. Subsequent network contractions will use the cached data to greatly reduce the computation times.
409 /**********************
410 * Execute the tensor network contraction
411 **********************/
412
413 // Create a cutensornetSliceGroup_t object from a range of slice IDs
414 cutensornetSliceGroup_t sliceGroup{};
415 HANDLE_ERROR( cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup) );
416
417 GPUTimer timer {stream};
418 double minTimeCUTENSORNET = 1e100;
419 double firstTimeCUTENSORNET = 1e100;
420 const int numRuns = 3; // number of repeats to get stable performance results
421 for (int i = 0; i < numRuns; ++i)
422 {
423 HANDLE_CUDA_ERROR( cudaMemcpy(O_d, O_h, sizeVec[numInputs], cudaMemcpyHostToDevice) ); // restore the output tensor on GPU
424 HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );
425
426 /*
427 * Contract all slices of the tensor network
428 */
429 timer.start();
430
431 int32_t accumulateOutput = 0; // output tensor data will be overwritten
432 HANDLE_ERROR( cutensornetContractSlices(handle,
433 plan,
434 rawDataIn_d,
435 O_d,
436 accumulateOutput,
437 workDesc,
438 sliceGroup, // slternatively, NULL can also be used to contract over all slices instead of specifying a sliceGroup object
439 stream) );
440
441 // Synchronize and measure best timing
442 auto time = timer.seconds();
443 if (i == 0)
444 firstTimeCUTENSORNET = time;
445 minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
446 }
447
448 if(verbose)
449 printf("Contracted the tensor network, each slice used the same contraction plan\n");
450
451 // Print the 1-norm of the output tensor (verification)
452 HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream) );
453 HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) ); // restore the output tensor on Host
454 double norm1 = 0.0;
455 for (int64_t i = 0; i < elementsVec[numInputs]; ++i) {
456 norm1 += std::abs(O_h[i]);
457 }
458 if(verbose)
459 printf("Computed the 1-norm of the output tensor: %e\n", norm1);
460
461 /*************************/
462
463 // Query the total Flop count for the tensor network contraction
464 double flops {0.0};
465 HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
466 handle,
467 optimizerInfo,
468 CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
469 &flops,
470 sizeof(flops)) );
471
472 if(verbose) {
473 printf("Number of tensor network slices = %ld\n", numSlices);
474 printf("Network contraction flop cost = %e\n", flops);
475 printf("Tensor network contraction time (ms):\n");
476 printf("\tfirst run (intermediate tensors get cached) = %.3f\n", firstTimeCUTENSORNET * 1000.f);
477 printf("\tsubsequent run (cache reused) = %.3f\n", minTimeCUTENSORNET * 1000.f);
478 }
Free resources¶
After the computation, we need to free up all resources.
481 /***************
482 * Free resources
483 ****************/
484
485 // Free cuTensorNet resources
486 HANDLE_ERROR( cutensornetDestroySliceGroup(sliceGroup) );
487 HANDLE_ERROR( cutensornetDestroyContractionPlan(plan) );
488 HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
489 HANDLE_ERROR( cutensornetDestroyContractionOptimizerInfo(optimizerInfo) );
490 HANDLE_ERROR( cutensornetDestroyNetworkDescriptor(descNet) );
491 HANDLE_ERROR( cutensornetDestroy(handle) );
492
493 // Free Host memory resources
494 if (O_h) free(O_h);
495 for (int i = 0; i < numInputs; ++i)
496 {
497 if (rawDataIn_h[i])
498 free(rawDataIn_h[i]);
499 }
500 // Free GPU memory resources
501 if (workScratch) cudaFree(workScratch);
502 if (workCache) cudaFree(workCache);
503 if (O_d) cudaFree(O_d);
504 for (int i = 0; i < numInputs; ++i)
505 {
506 if (rawDataIn_d[i])
507 cudaFree(rawDataIn_d[i]);
508 }
509 if(verbose)
510 printf("Freed resources and exited\n");
511
512 return 0;
513}