Caching/Reusing Constant Intermediate Tensors
The following code example illustrates how to activate caching of constant intermediate tensors in order to accelerate repeated execution of a tensor network contraction in which only some of the input tensors change their values between iterations. The full sample code can be found in the NVIDIA/cuQuantum repository.
Headers and data types
#include <stdlib.h>
#include <stdio.h>

#include <cstring>        // memset
#include <cmath>          // std::abs
#include <unordered_map>
#include <vector>
#include <cassert>

#include <cuda_runtime.h>
#include <cutensornet.h>


#define HANDLE_ERROR(x) \
{ const auto err = x; \
  if( err != CUTENSORNET_STATUS_SUCCESS ) \
  { printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
    fflush(stdout); \
  } \
};

#define HANDLE_CUDA_ERROR(x) \
{ const auto err = x; \
  if( err != cudaSuccess ) \
  { printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
    fflush(stdout); \
  } \
};


struct GPUTimer
{
   GPUTimer(cudaStream_t stream): stream_(stream)
   {
      HANDLE_CUDA_ERROR(cudaEventCreate(&start_));
      HANDLE_CUDA_ERROR(cudaEventCreate(&stop_));
   }

   ~GPUTimer()
   {
      HANDLE_CUDA_ERROR(cudaEventDestroy(start_));
      HANDLE_CUDA_ERROR(cudaEventDestroy(stop_));
   }

   void start()
   {
      HANDLE_CUDA_ERROR(cudaEventRecord(start_, stream_));
   }

   float seconds()
   {
      HANDLE_CUDA_ERROR(cudaEventRecord(stop_, stream_));
      HANDLE_CUDA_ERROR(cudaEventSynchronize(stop_));
      float time;
      HANDLE_CUDA_ERROR(cudaEventElapsedTime(&time, start_, stop_));
      return time * 1e-3; // convert milliseconds to seconds
   }

   private:
   cudaEvent_t start_, stop_;
   cudaStream_t stream_;
};


int main()
{
   static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

   bool verbose = true;

   // Check cuTensorNet version
   const size_t cuTensornetVersion = cutensornetGetVersion();
   if(verbose)
      printf("cuTensorNet version: %ld\n", cuTensornetVersion);

   // Set GPU device
   int numDevices {0};
   HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) );
   const int deviceId = 0;
   HANDLE_CUDA_ERROR( cudaSetDevice(deviceId) );
   cudaDeviceProp prop;
   HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );

   if(verbose) {
      printf("===== device info ======\n");
      printf("GPU-local-id:%d\n", deviceId);
      printf("GPU-name:%s\n", prop.name);
      printf("GPU-clock:%d\n", prop.clockRate);
      printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
      printf("GPU-nSM:%d\n", prop.multiProcessorCount);
      printf("GPU-major:%d\n", prop.major);
      printf("GPU-minor:%d\n", prop.minor);
      printf("========================\n");
   }

   typedef float floatType;
   cudaDataType_t typeData = CUDA_R_32F;
   cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;

   if(verbose)
      printf("Included headers and defined data types\n");
Define tensor network and tensor sizes
Next, we define the structure of the tensor network (i.e., the modes of the tensors, their extents, and their connectivity).
   /**********************
   * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,f,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m}
   * We will execute the contraction a few times, assuming all input tensors are constant except F.
   **********************/

   constexpr int32_t numInputs = 6;

   // Create vectors of tensor modes
   std::vector<std::vector<int32_t>> modesVec {
      {'a','b','c','d'},
      {'b','c','d','e'},
      {'e','f','g','h'},
      {'g','h','i','j'},
      {'i','j','k','l'},
      {'k','l','m'},
      {'a','m'}
   };

   // Set mode extents
   int64_t sameExtent = 36; // setting the same extent for simplicity; in principle, extents can differ
   std::unordered_map<int32_t, int64_t> extent;
   for (auto &vec: modesVec)
   {
      for (auto &mode: vec)
      {
         extent[mode] = sameExtent;
      }
   }

   // Create a vector of extents for each tensor
   std::vector<std::vector<int64_t>> extentVec;
   extentVec.resize(numInputs+1); // hold inputs + output tensors
   for (int i = 0; i < numInputs+1; ++i)
   {
      for (auto mode : modesVec[i])
         extentVec[i].push_back(extent[mode]);
   }

   if(verbose)
      printf("Defined tensor network, modes, and extents\n");
Allocate memory, initialize data, initialize cuTensorNet handle
Next, we allocate memory for the tensor network operands and initialize them to random values. Then, we initialize the cuTensorNet library via cutensornetCreate().
   /**********************
   * Allocating data
   **********************/

   std::vector<size_t> elementsVec;
   elementsVec.resize(numInputs+1); // hold inputs + output tensors
   for (int i = 0; i < numInputs+1; ++i)
   {
      elementsVec[i] = 1;
      for (auto mode : modesVec[i])
         elementsVec[i] *= extent[mode];
   }

   size_t totalSize = 0;
   std::vector<size_t> sizeVec;
   sizeVec.resize(numInputs+1); // hold inputs + output tensors
   for (int i = 0; i < numInputs+1; ++i)
   {
      sizeVec[i] = sizeof(floatType) * elementsVec[i];
      totalSize += sizeVec[i];
   }
   if(verbose)
      printf("Total GPU memory used for tensor storage: %.2f GiB\n",
             (totalSize) / 1024. / 1024. / 1024.);

   void* rawDataIn_d[numInputs];
   void* O_d;
   for (int i = 0; i < numInputs; ++i)
   {
      HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[i], sizeVec[i]) );
   }
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &O_d, sizeVec[numInputs]));

   floatType* rawDataIn_h[numInputs];
   for (int i = 0; i < numInputs; ++i)
   {
      rawDataIn_h[i] = (floatType*) malloc(sizeof(floatType) * elementsVec[i]);
      if (rawDataIn_h[i] == NULL)
      {
         printf("Error: Host memory allocation failed!\n");
         return -1;
      }
   }
   floatType *O_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
   if (O_h == NULL)
   {
      printf("Error: Host memory allocation failed!\n");
      return -1;
   }

   /*******************
   * Initialize data
   *******************/

   memset(O_h, 0, sizeof(floatType) * elementsVec[numInputs]);
   for (int i = 0; i < numInputs; ++i)
   {
      for (size_t e = 0; e < elementsVec[i]; ++e)
         rawDataIn_h[i][e] = ((floatType) rand()) / RAND_MAX;
   }

   for (int i = 0; i < numInputs; ++i)
   {
      HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[i], rawDataIn_h[i], sizeVec[i], cudaMemcpyHostToDevice) );
   }

   if(verbose)
      printf("Allocated GPU memory for data, initialized data, and created the library handle\n");

   /*************************
   * cuTensorNet
   *************************/

   cudaStream_t stream;
   HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );

   cutensornetHandle_t handle;
   HANDLE_ERROR( cutensornetCreate(&handle) );
Mark constant tensors and create the network descriptor
Next, we specify which input tensors are constant, and create the network descriptor with the desired tensor modes, extents, strides, and qualifiers (e.g., constant) as well as the data and compute types. Note that the created library context will be associated with the currently active GPU.
   /*******************************
   * Set constant input tensors
   *******************************/

   // Specify which input tensors are constant: A..E (indices 0..4) are constant,
   // while F (index 5) changes its values between contractions
   std::vector<cutensornetTensorQualifiers_t> qualifiersIn;
   qualifiersIn.resize(numInputs);
   for (int i = 0; i < numInputs; ++i)
   {
      if (i < 5)
         qualifiersIn[i].isConstant = 1;
      else
         qualifiersIn[i].isConstant = 0;
   }

   /*******************************
   * Create Network Descriptor
   *******************************/

   int32_t* modesIn[numInputs];
   int32_t numModesIn[numInputs];
   int64_t* extentsIn[numInputs];
   int64_t* stridesIn[numInputs];

   for (int i = 0; i < numInputs; ++i)
   {
      modesIn[i] = modesVec[i].data();
      numModesIn[i] = modesVec[i].size();
      extentsIn[i] = extentVec[i].data();
      stridesIn[i] = NULL; // strides are optional; if no stride is provided, cuTensorNet assumes a generalized column-major data layout
   }

   // Set up tensor network
   cutensornetNetworkDescriptor_t descNet;
   HANDLE_ERROR( cutensornetCreateNetworkDescriptor(handle,
                     numInputs, numModesIn, extentsIn, stridesIn, modesIn, qualifiersIn.data(),
                     modesVec[numInputs].size(), extentVec[numInputs].data(), /*stridesOut = */NULL, modesVec[numInputs].data(),
                     typeData, typeCompute,
                     &descNet) );

   if(verbose)
      printf("Initialized the cuTensorNet library and created a tensor network descriptor\n");
Contraction order and slicing
In this example, we illustrate using a predetermined contraction path and setting it into the optimizer info object via cutensornetContractionOptimizerInfoSetAttribute().
   /*******************************
   * Choose workspace limit based on available resources.
   *******************************/

   size_t freeMem, totalMem;
   HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) );
   uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
   if(verbose)
      printf("Workspace limit = %lu\n", workspaceLimit);

   /*******************************
   * Set contraction order
   *******************************/

   // Create contraction optimizer info
   cutensornetContractionOptimizerInfo_t optimizerInfo;
   HANDLE_ERROR( cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo) );

   // Set a predetermined contraction path: the flat int32 array encodes the node pairs
   // (0,1), (0,4), (0,3), (0,2), (0,1), contracted one after another
   std::vector<int32_t> path{0,1,0,4,0,3,0,2,0,1};
   const auto numContractions = numInputs - 1;
   cutensornetContractionPath_t contPath;
   contPath.data = reinterpret_cast<cutensornetNodePair_t*>(const_cast<int32_t*>(path.data()));
   contPath.numContractions = numContractions;

   // Provide the user-specified contPath
   HANDLE_ERROR( cutensornetContractionOptimizerInfoSetAttribute(
                     handle,
                     optimizerInfo,
                     CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
                     &contPath,
                     sizeof(contPath)));
   int64_t numSlices = 1; // this small network is not sliced, so there is exactly one slice

   if(verbose)
      printf("Set predetermined contraction path into cuTensorNet optimizer\n");
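If a predetermined contraction path is not available, the path (and any required slicing) can instead be found by cuTensorNet's built-in path optimizer, as done in the other cuTensorNet samples. The following is a minimal sketch of that alternative, reusing the handle, descNet, workspaceLimit, and optimizerInfo objects defined above:

   // Sketch (not part of the sample above): let the optimizer determine the contraction path
   cutensornetContractionOptimizerConfig_t optimizerConfig;
   HANDLE_ERROR( cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig) );

   // Populates optimizerInfo with the found path, slicing information, flop count, etc.,
   // subject to the workspace limit computed earlier
   HANDLE_ERROR( cutensornetContractionOptimize(handle,
                                                descNet,
                                                optimizerConfig,
                                                workspaceLimit,
                                                optimizerInfo) );

   HANDLE_ERROR( cutensornetDestroyContractionOptimizerConfig(optimizerConfig) );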
Create workspace descriptor and allocate workspace memory
Next, we create a workspace descriptor, compute the workspace sizes, and query the minimum workspace size needed to contract the network. To activate intermediate tensor reuse, we need to provide a CACHE workspace that will be used across multiple network contractions. Thus, we query the sizes and allocate device memory for both workspace kinds (CUTENSORNET_WORKSPACE_SCRATCH and CUTENSORNET_WORKSPACE_CACHE) and set them in the workspace descriptor. The workspace descriptor will be provided to the contraction plan creation and contraction APIs.
   /*******************************
   * Create workspace descriptor, allocate workspace, and set it.
   *******************************/

   cutensornetWorkspaceDescriptor_t workDesc;
   HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );

   // Set the SCRATCH workspace, which is used during each network contraction operation and is not needed afterwards
   int64_t requiredWorkspaceSizeScratch = 0;
   HANDLE_ERROR( cutensornetWorkspaceComputeContractionSizes(handle,
                                                             descNet,
                                                             optimizerInfo,
                                                             workDesc) );

   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_SCRATCH,
                                                   &requiredWorkspaceSizeScratch) );

   void* workScratch = nullptr;
   HANDLE_CUDA_ERROR( cudaMalloc(&workScratch, requiredWorkspaceSizeScratch) );

   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                               workDesc,
                                               CUTENSORNET_MEMSPACE_DEVICE,
                                               CUTENSORNET_WORKSPACE_SCRATCH,
                                               workScratch,
                                               requiredWorkspaceSizeScratch) );

   // Set the CACHE workspace, which is used across network contraction operations
   int64_t requiredWorkspaceSizeCache = 0;
   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_CACHE,
                                                   &requiredWorkspaceSizeCache) );

   void* workCache = nullptr;
   HANDLE_CUDA_ERROR( cudaMalloc(&workCache, requiredWorkspaceSizeCache) );

   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                               workDesc,
                                               CUTENSORNET_MEMSPACE_DEVICE,
                                               CUTENSORNET_WORKSPACE_CACHE,
                                               workCache,
                                               requiredWorkspaceSizeCache) );

   if(verbose)
      printf("Allocated and set up the GPU workspace\n");
Note that it is possible to skip the steps of creating a workspace descriptor and explicitly handling workspace memory by setting a device memory handler, in which case cuTensorNet implicitly handles workspace memory by allocating/deallocating it from the provided memory pool. See Memory Management API for details.
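As an illustration, a minimal sketch of such a device memory handler is shown below. The callback names (myDeviceAlloc, myDeviceFree) are illustrative, and the callbacks here simply forward to the stream-ordered CUDA allocator (CUDA 11.2+); consult the Memory Management API section for the exact semantics and which workspace kinds are served from the pool.

   // Sketch: stream-ordered allocation callbacks (illustrative names)
   static int myDeviceAlloc(void* ctx, void** ptr, size_t size, cudaStream_t stream)
   {
      (void)ctx;
      return (int) cudaMallocAsync(ptr, size, stream); // 0 (cudaSuccess) on success
   }

   static int myDeviceFree(void* ctx, void* ptr, size_t size, cudaStream_t stream)
   {
      (void)ctx; (void)size;
      return (int) cudaFreeAsync(ptr, stream);
   }

   // ... inside main(), after cutensornetCreate(&handle):
   cutensornetDeviceMemHandler_t memHandler{};
   memHandler.ctx          = nullptr;
   memHandler.device_alloc = myDeviceAlloc;
   memHandler.device_free  = myDeviceFree;
   snprintf(memHandler.name, sizeof(memHandler.name), "cuda_mempool_handler");
   HANDLE_ERROR( cutensornetSetDeviceMemHandler(handle, &memHandler) );
   // With the handler set, cuTensorNet can draw workspace memory from this pool instead of
   // requiring explicitly allocated workspace buffers; see Memory Management API for details.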
Contraction plan and auto-tune
We create a tensor network contraction plan holding all pairwise tensor contraction plans for cuTENSOR. Optionally, we can auto-tune the plan such that cuTENSOR selects the best kernel for each pairwise contraction. This contraction plan can be reused for many (possibly different) data inputs, avoiding the cost of initializing this plan redundantly.
   /*******************************
   * Initialize the pairwise contraction plan (for cuTENSOR).
   *******************************/

   cutensornetContractionPlan_t plan;
   HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
                                                  descNet,
                                                  optimizerInfo,
                                                  workDesc,
                                                  &plan) );

   /*******************************
   * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel
   *           for each pairwise tensor contraction.
   *******************************/
   cutensornetContractionAutotunePreference_t autotunePref;
   HANDLE_ERROR( cutensornetCreateContractionAutotunePreference(handle,
                                                                &autotunePref) );

   const int numAutotuningIterations = 5; // may be 0
   HANDLE_ERROR( cutensornetContractionAutotunePreferenceSetAttribute(
                     handle,
                     autotunePref,
                     CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
                     &numAutotuningIterations,
                     sizeof(numAutotuningIterations)) );

   // Modify the plan again to find the best pairwise contractions
   HANDLE_ERROR( cutensornetContractionAutotune(handle,
                                                plan,
                                                rawDataIn_d,
                                                O_d,
                                                workDesc,
                                                autotunePref,
                                                stream) );

   HANDLE_ERROR( cutensornetDestroyContractionAutotunePreference(autotunePref) );

   if(verbose)
      printf("Created a contraction plan for cuTensorNet and optionally auto-tuned it\n");
Tensor network contraction execution
Finally, we contract the tensor network as many times as needed, possibly with different input data each time. Note that the first network contraction call will use the provided CACHE workspace to store the constant intermediate tensors. Subsequent network contractions will reuse the cached data, greatly reducing the computation time.
   /**********************
   * Execute the tensor network contraction
   **********************/

   // Create a cutensornetSliceGroup_t object from a range of slice IDs
   cutensornetSliceGroup_t sliceGroup{};
   HANDLE_ERROR( cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup) );

   GPUTimer timer {stream};
   double minTimeCUTENSORNET = 1e100;
   double firstTimeCUTENSORNET = 1e100;
   const int numRuns = 3; // number of repeats to get stable performance results
   for (int i = 0; i < numRuns; ++i)
   {
      HANDLE_CUDA_ERROR( cudaMemcpy(O_d, O_h, sizeVec[numInputs], cudaMemcpyHostToDevice) ); // restore the output tensor on GPU
      HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );

      /*
      * Contract all slices of the tensor network
      */
      timer.start();

      int32_t accumulateOutput = 0; // output tensor data will be overwritten
      HANDLE_ERROR( cutensornetContractSlices(handle,
                                              plan,
                                              rawDataIn_d,
                                              O_d,
                                              accumulateOutput,
                                              workDesc,
                                              sliceGroup, // alternatively, NULL can be used to contract over all slices instead of specifying a sliceGroup object
                                              stream) );

      // Synchronize and measure best timing
      auto time = timer.seconds();
      if (i == 0)
         firstTimeCUTENSORNET = time;
      minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
   }

   if(verbose)
      printf("Contracted the tensor network, each slice used the same contraction plan\n");

   // Print the 1-norm of the output tensor (verification)
   HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream) );
   HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) ); // restore the output tensor on Host
   double norm1 = 0.0;
   for (int64_t i = 0; i < elementsVec[numInputs]; ++i) {
      norm1 += std::abs(O_h[i]);
   }
   if(verbose)
      printf("Computed the 1-norm of the output tensor: %e\n", norm1);

   /*************************/

   // Query the total Flop count for the tensor network contraction
   double flops {0.0};
   HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(
                     handle,
                     optimizerInfo,
                     CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
                     &flops,
                     sizeof(flops)) );

   if(verbose) {
      printf("Number of tensor network slices = %ld\n", numSlices);
      printf("Network contraction flop cost = %e\n", flops);
      printf("Tensor network contraction time (ms):\n");
      printf("\tfirst run (intermediate tensors get cached) = %.3f\n", firstTimeCUTENSORNET * 1000.f);
      printf("\tsubsequent run (cache reused) = %.3f\n", minTimeCUTENSORNET * 1000.f);
   }
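If the values of one of the constant tensors do change at some point, the cached intermediates become stale and must be invalidated before the next contraction. A hedged sketch of how this could look, assuming the cutensornetWorkspacePurgeCache() API available in recent cuTensorNet versions (please check the API reference of your version):

   // Sketch (assumption, not part of the sample): after updating a *constant* tensor's data
   // on the GPU, purge the cached intermediate tensors so that the next contraction
   // recomputes (and re-caches) them from the new data
   HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[0], rawDataIn_h[0], sizeVec[0], cudaMemcpyHostToDevice) );
   HANDLE_ERROR( cutensornetWorkspacePurgeCache(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE) );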
Free resources
After the computation, we need to free up all resources.
   /***************
   * Free resources
   ****************/

   // Free cuTensorNet resources
   HANDLE_ERROR( cutensornetDestroySliceGroup(sliceGroup) );
   HANDLE_ERROR( cutensornetDestroyContractionPlan(plan) );
   HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
   HANDLE_ERROR( cutensornetDestroyContractionOptimizerInfo(optimizerInfo) );
   HANDLE_ERROR( cutensornetDestroyNetworkDescriptor(descNet) );
   HANDLE_ERROR( cutensornetDestroy(handle) );

   // Free Host memory resources
   if (O_h) free(O_h);
   for (int i = 0; i < numInputs; ++i)
   {
      if (rawDataIn_h[i])
         free(rawDataIn_h[i]);
   }
   // Free GPU memory resources
   if (workScratch) cudaFree(workScratch);
   if (workCache) cudaFree(workCache);
   if (O_d) cudaFree(O_d);
   for (int i = 0; i < numInputs; ++i)
   {
      if (rawDataIn_d[i])
         cudaFree(rawDataIn_d[i]);
   }
   if(verbose)
      printf("Freed resources and exited\n");

   return 0;
}