Computing gradients via backward propagation
The following code example illustrates how to compute the gradients of a tensor network w.r.t. user-selected input tensors via backward propagation. The full code can be found in the NVIDIA/cuQuantum repository (here).
Headers and data types
#include <stdlib.h>
#include <stdio.h>

#include <unordered_map>
#include <vector>
#include <cassert>
#include <cstring>   // for memset

#include <cuda_runtime.h>
#include <cutensornet.h>


#define HANDLE_ERROR(x) \
{ const auto err = x; \
  if( err != CUTENSORNET_STATUS_SUCCESS ) \
  { printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
    fflush(stdout); \
  } \
};

#define HANDLE_CUDA_ERROR(x) \
{ const auto err = x; \
  if( err != cudaSuccess ) \
  { printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
    fflush(stdout); \
  } \
};


struct GPUTimer
{
   GPUTimer(cudaStream_t stream): stream_(stream)
   {
      HANDLE_CUDA_ERROR(cudaEventCreate(&start_));
      HANDLE_CUDA_ERROR(cudaEventCreate(&stop_));
   }

   ~GPUTimer()
   {
      HANDLE_CUDA_ERROR(cudaEventDestroy(start_));
      HANDLE_CUDA_ERROR(cudaEventDestroy(stop_));
   }

   void start()
   {
      HANDLE_CUDA_ERROR(cudaEventRecord(start_, stream_));
   }

   float seconds()
   {
      HANDLE_CUDA_ERROR(cudaEventRecord(stop_, stream_));
      HANDLE_CUDA_ERROR(cudaEventSynchronize(stop_));
      float time;
      HANDLE_CUDA_ERROR(cudaEventElapsedTime(&time, start_, stop_));
      return time * 1e-3;
   }

   private:
   cudaEvent_t start_, stop_;
   cudaStream_t stream_;
};


int main()
{
   static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

   bool verbose = true;

   // Check cuTensorNet version
   const size_t cuTensornetVersion = cutensornetGetVersion();
   if(verbose)
      printf("cuTensorNet version: %zu\n", cuTensornetVersion);

   // Set GPU device
   int numDevices {0};
   HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) );
   const int deviceId = 0;
   HANDLE_CUDA_ERROR( cudaSetDevice(deviceId) );
   cudaDeviceProp prop;
   HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );

   if(verbose) {
      printf("===== device info ======\n");
      printf("GPU-local-id:%d\n", deviceId);
      printf("GPU-name:%s\n", prop.name);
      printf("GPU-clock:%d\n", prop.clockRate);
      printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
      printf("GPU-nSM:%d\n", prop.multiProcessorCount);
      printf("GPU-major:%d\n", prop.major);
      printf("GPU-minor:%d\n", prop.minor);
      printf("========================\n");
   }

   typedef float floatType;
   cudaDataType_t typeData = CUDA_R_32F;
   cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;

   if(verbose)
      printf("Included headers and defined data types\n");
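If double precision is preferred, the data and compute types defined above can be switched accordingly; a minimal variation (not part of the sample):

   // Hypothetical double-precision variant of the type definitions above.
   typedef double floatType;
   cudaDataType_t typeData = CUDA_R_64F;
   cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_64F;

Since the rest of the sample is written in terms of floatType, no further changes should be required.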
Define tensor network and tensor sizes
Next, we define the structure of the tensor network (i.e., the modes of the tensors, their extents, and their connectivity), and specify the input tensor IDs whose gradients will be computed.
   /**********************
   * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m}
   * We will execute the contraction and compute the gradients of input tensors A, B, C
   **********************/

   constexpr int32_t numInputs = 6;
   std::vector<int32_t> gradInputIDs = {0, 1, 2};

   // Create vectors of tensor modes
   std::vector<std::vector<int32_t>> modesVec {
      {'a','b','c','d'},
      {'b','c','d','e'},
      {'e','g','h'},
      {'g','h','i','j'},
      {'i','j','k','l'},
      {'k','l','m'},
      {'a','m'}
   };

   // Set mode extents
   int64_t sameExtent = 36; // setting same extent for simplicity. In principle extents can differ.
   std::unordered_map<int32_t, int64_t> extent;
   for (auto &vec: modesVec)
   {
      for (auto &mode: vec)
      {
         extent[mode] = sameExtent;
      }
   }

   // Create a vector of extents for each tensor
   std::vector<std::vector<int64_t>> extentVec;
   extentVec.resize(numInputs+1); // hold inputs + output tensors
   for (int i = 0; i < numInputs+1; ++i)
   {
      for (auto mode : modesVec[i])
         extentVec[i].push_back(extent[mode]);
   }

   if(verbose)
      printf("Defined tensor network, modes, and extents\n");
Allocate memory, initialize data, initialize cuTensorNet handle
Next, we allocate memory for the tensor network operands and initialize them to random values. We also allocate memory for the gradient tensors corresponding to the input tensors selected for gradient computation, as well as for the output activation tensor, which we initialize to ones (the loss function this corresponds to is written out after the code below). Then, we initialize the cuTensorNet library via cutensornetCreate().
   /**********************
   * Allocating data
   **********************/

   std::vector<size_t> elementsVec;
   elementsVec.resize(numInputs+1); // hold inputs + output tensors
   for (int i = 0; i < numInputs+1; ++i)
   {
      elementsVec[i] = 1;
      for (auto mode : modesVec[i])
         elementsVec[i] *= extent[mode];
   }

   size_t totalSize = 0;
   std::vector<size_t> sizeVec;
   sizeVec.resize(numInputs+1); // hold inputs + output tensors
   for (int i = 0; i < numInputs+1; ++i)
   {
      sizeVec[i] = sizeof(floatType) * elementsVec[i];
      totalSize += sizeVec[i];
   }
   if(verbose)
      printf("Total GPU memory used for tensor storage: %.2f GiB\n",
             (totalSize) / 1024. / 1024. / 1024);

   void* rawDataIn_d[numInputs];
   void* O_d;
   void* outputActivation_d;
   for (int i = 0; i < numInputs; ++i)
   {
      HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[i], sizeVec[i]) );
   }
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &O_d, sizeVec[numInputs]));
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &outputActivation_d, sizeVec[numInputs]));

   floatType* rawDataIn_h[numInputs];
   for (int i = 0; i < numInputs; ++i)
   {
      rawDataIn_h[i] = (floatType*) malloc(sizeVec[i]);
      if (rawDataIn_h[i] == NULL)
      {
         printf("Error: Host memory allocation failed!\n");
         return -1;
      }
   }
   floatType *O_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
   if (O_h == NULL)
   {
      printf("Error: Host memory allocation failed!\n");
      return -1;
   }
   floatType *outputActivation_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
   if (outputActivation_h == NULL)
   {
      printf("Error: Host memory allocation failed!\n");
      return -1;
   }

   void* gradientsOut_d[numInputs] = {nullptr};
   for (auto i : gradInputIDs)
   {
      HANDLE_CUDA_ERROR( cudaMalloc((void**) &gradientsOut_d[i], sizeVec[i]) );
   }
   void* gradientsOut_h[numInputs] = {nullptr};
   for (auto i : gradInputIDs)
   {
      gradientsOut_h[i] = (floatType*) malloc(sizeVec[i]);
      if (gradientsOut_h[i] == NULL)
      {
         printf("Error: Host memory allocation failed!\n");
         return -1;
      }
   }

   /*******************
   * Initialize data
   *******************/

   memset(O_h, 0, sizeof(floatType) * elementsVec[numInputs]);
   for (int i = 0; i < numInputs; ++i)
   {
      for (size_t e = 0; e < elementsVec[i]; ++e)
         rawDataIn_h[i][e] = ((floatType) rand()) / RAND_MAX;
   }
   for (size_t e = 0; e < elementsVec[numInputs]; ++e)
      outputActivation_h[e] = (floatType) 1.0;

   for (int i = 0; i < numInputs; ++i)
   {
      HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[i], rawDataIn_h[i], sizeVec[i], cudaMemcpyHostToDevice) );
   }
   HANDLE_CUDA_ERROR( cudaMemcpy(outputActivation_d, outputActivation_h, sizeVec[numInputs], cudaMemcpyHostToDevice) );

   if(verbose)
      printf("Allocated GPU memory for data, initialize data, and create library handle\n");

   /*************************
   * cuTensorNet
   *************************/

   cudaStream_t stream;
   HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );

   cutensornetHandle_t handle;
   HANDLE_ERROR( cutensornetCreate(&handle) );
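Initializing the output activation (adjoint) tensor with ones corresponds to differentiating the scalar loss \(L = \sum_{a,m} O_{a,m}\), i.e., \(\bar{O}_{a,m} = \partial L / \partial O_{a,m} = 1\). The gradient of L with respect to an input tensor is then itself a tensor network contraction; for tensor A, in the same index notation as the code comments (a sketch of the underlying math, not produced by the sample):

\frac{\partial L}{\partial A_{a,b,c,d}} = \sum_{e,g,h,i,j,k,l,m} \bar{O}_{a,m} \, B_{b,c,d,e} \, C_{e,g,h} \, D_{g,h,i,j} \, E_{i,j,k,l} \, F_{k,l,m}

With the all-ones activation used here, this should be the quantity that the backward-propagation call later computes for tensor A.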
Create the network descriptor and set gradient tensor IDs
Next, we create the network descriptor with the desired tensor modes, extents, and strides, as well as the data and compute types. Strides are optional; passing NULL selects a generalized column-major layout (a helper for building explicit strides is sketched after the code below). We also set, on the network descriptor, the IDs of the input tensors whose gradients will be computed. Note that the created library context will be associated with the currently active GPU.
   /*******************************
   * Create Network Descriptor
   *******************************/

   int32_t* modesIn[numInputs];
   int32_t numModesIn[numInputs];
   int64_t* extentsIn[numInputs];
   int64_t* stridesIn[numInputs];

   for (int i = 0; i < numInputs; ++i)
   {
      modesIn[i] = modesVec[i].data();
      numModesIn[i] = modesVec[i].size();
      extentsIn[i] = extentVec[i].data();
      stridesIn[i] = NULL; // strides are optional; if no stride is provided, cuTensorNet assumes a generalized column-major data layout
   }

   // Set up tensor network
   cutensornetNetworkDescriptor_t descNet;
   HANDLE_ERROR( cutensornetCreateNetworkDescriptor(handle,
                     numInputs, numModesIn, extentsIn, stridesIn, modesIn, NULL,
                     modesVec[numInputs].size(), extentVec[numInputs].data(), /*stridesOut = */NULL, modesVec[numInputs].data(),
                     typeData, typeCompute,
                     &descNet) );

   /*******************************
   * Set input tensor ids that require gradient
   *******************************/

   cutensornetTensorIDList_t tensorIDList {
      .numTensors = static_cast<int32_t>(gradInputIDs.size()),
      .data = gradInputIDs.data()
   };

   HANDLE_ERROR( cutensornetNetworkSetAttribute(handle,
                     descNet,
                     CUTENSORNET_NETWORK_INPUT_TENSORS_REQUIRE_GRAD,
                     &tensorIDList,
                     sizeof(cutensornetTensorIDList_t)) );

   if(verbose)
      printf("Initialized the cuTensorNet library and created a tensor network descriptor\n");
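Because stridesIn and the output strides are passed as NULL above, cuTensorNet assumes a generalized column-major layout for every tensor. If explicit strides were needed (e.g., for non-contiguous data), they could be built with a small helper such as the hypothetical one below (not part of the sample); for a dense column-major tensor the first mode varies fastest:

   // Hypothetical helper: dense column-major strides for a tensor with the given extents.
   std::vector<int64_t> columnMajorStrides(const std::vector<int64_t>& extents)
   {
      std::vector<int64_t> strides(extents.size());
      int64_t stride = 1;
      for (size_t k = 0; k < extents.size(); ++k)
      {
         strides[k] = stride;    // stride of mode k = product of all preceding extents
         stride *= extents[k];
      }
      return strides;
   }

The returned vectors would have to remain valid when cutensornetCreateNetworkDescriptor() is called, with stridesIn[i] pointing at their data.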
Contraction order
In this example, we illustrate the use of a predetermined contraction path, which is set into the optimizer info object via cutensornetContractionOptimizerInfoSetAttribute(). (An alternative that lets cuTensorNet's pathfinder determine the path is sketched after the code below.)
   /*******************************
   * Choose workspace limit based on available resources.
   *******************************/

   size_t freeMem, totalMem;
   HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) );
   uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
   if(verbose)
      printf("Workspace limit = %lu\n", workspaceLimit);

   /*******************************
   * Set contraction order
   *******************************/

   // Create contraction optimizer info
   cutensornetContractionOptimizerInfo_t optimizerInfo;
   HANDLE_ERROR( cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo) );

   // set a predetermined contraction path
   std::vector<int32_t> path{0,1,0,4,0,3,0,2,0,1};
   const auto numContractions = numInputs - 1;
   cutensornetContractionPath_t contPath;
   contPath.data = reinterpret_cast<cutensornetNodePair_t*>(const_cast<int32_t*>(path.data()));
   contPath.numContractions = numContractions;

   // provide user-specified contPath
   HANDLE_ERROR( cutensornetContractionOptimizerInfoSetAttribute(
                     handle,
                     optimizerInfo,
                     CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
                     &contPath,
                     sizeof(contPath)));
   int64_t numSlices = 1;

   if(verbose)
      printf("Set predetermined contraction path into cuTensorNet optimizer\n");
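If no contraction path is known in advance, cuTensorNet's pathfinder can populate the optimizer info object instead. A minimal sketch (not part of this sample), assuming the handle, descNet, optimizerInfo, and workspaceLimit defined above; note that the rest of this sample assumes a single slice (numSlices = 1), so a sliced path found by the optimizer would require using the queried slice count downstream:

   // Alternative: let the pathfinder choose the contraction path (and possibly slicing).
   cutensornetContractionOptimizerConfig_t optimizerConfig;
   HANDLE_ERROR( cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig) );

   HANDLE_ERROR( cutensornetContractionOptimize(handle,
                                                descNet,
                                                optimizerConfig,
                                                workspaceLimit,
                                                optimizerInfo) );

   // Query how many slices the optimizer decided to use.
   int64_t numSlicesFound = 0;
   HANDLE_ERROR( cutensornetContractionOptimizerInfoGetAttribute(handle,
                     optimizerInfo,
                     CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
                     &numSlicesFound,
                     sizeof(numSlicesFound)) );

   HANDLE_ERROR( cutensornetDestroyContractionOptimizerConfig(optimizerConfig) );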
Create workspace descriptor and allocate workspace memory
Next, we create a workspace descriptor, compute the contraction workspace sizes, and query the minimum workspace size needed to contract the network. To enable gradient computation, we must also provide a CACHE workspace, which stores the intermediate tensors' data produced during the forward contraction for the backward-propagation call to consume. We therefore query the sizes of, and allocate device memory for, both kinds of workspaces (CUTENSORNET_WORKSPACE_SCRATCH and CUTENSORNET_WORKSPACE_CACHE) and set them in the workspace descriptor. The workspace descriptor is provided to the contraction-plan creation, contraction execution, and gradient-computation APIs. (A variation that requests the recommended rather than the minimum workspace size is sketched after the code below.)
   /*******************************
   * Create workspace descriptor, allocate workspace, and set it.
   *******************************/

   cutensornetWorkspaceDescriptor_t workDesc;
   HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );

   // set SCRATCH workspace, which will be used during each network contraction operation, not needed afterwards
   int64_t requiredWorkspaceSizeScratch = 0;
   HANDLE_ERROR( cutensornetWorkspaceComputeContractionSizes(handle,
                     descNet,
                     optimizerInfo,
                     workDesc) );

   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                     workDesc,
                     CUTENSORNET_WORKSIZE_PREF_MIN,
                     CUTENSORNET_MEMSPACE_DEVICE,
                     CUTENSORNET_WORKSPACE_SCRATCH,
                     &requiredWorkspaceSizeScratch) );

   void* workScratch = nullptr;
   HANDLE_CUDA_ERROR( cudaMalloc(&workScratch, requiredWorkspaceSizeScratch) );

   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                     workDesc,
                     CUTENSORNET_MEMSPACE_DEVICE,
                     CUTENSORNET_WORKSPACE_SCRATCH,
                     workScratch,
                     requiredWorkspaceSizeScratch) );

   // set CACHE workspace, which will be used across network contraction operations
   int64_t requiredWorkspaceSizeCache = 0;
   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                     workDesc,
                     CUTENSORNET_WORKSIZE_PREF_MIN,
                     CUTENSORNET_MEMSPACE_DEVICE,
                     CUTENSORNET_WORKSPACE_CACHE,
                     &requiredWorkspaceSizeCache) );

   void* workCache = nullptr;
   HANDLE_CUDA_ERROR( cudaMalloc(&workCache, requiredWorkspaceSizeCache) );

   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                     workDesc,
                     CUTENSORNET_MEMSPACE_DEVICE,
                     CUTENSORNET_WORKSPACE_CACHE,
                     workCache,
                     requiredWorkspaceSizeCache) );

   if(verbose)
      printf("Allocated and set up the GPU workspace\n");
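A possible variation (not part of the sample) is to request the recommended workspace size rather than the minimum, falling back to the minimum if the larger allocation fails. A sketch for the SCRATCH workspace, reusing the handle, workDesc, and requiredWorkspaceSizeScratch from above:

   // Hedged variation: prefer the RECOMMENDED scratch size, fall back to the MIN size on allocation failure.
   int64_t recommendedScratchSize = 0;
   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                     workDesc,
                     CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
                     CUTENSORNET_MEMSPACE_DEVICE,
                     CUTENSORNET_WORKSPACE_SCRATCH,
                     &recommendedScratchSize) );

   void* scratchPtr = nullptr;
   if (cudaMalloc(&scratchPtr, recommendedScratchSize) != cudaSuccess)
   {
      // The recommended size did not fit; clear the allocation error and fall back to the minimum size.
      cudaGetLastError();
      recommendedScratchSize = requiredWorkspaceSizeScratch;
      HANDLE_CUDA_ERROR( cudaMalloc(&scratchPtr, recommendedScratchSize) );
   }

   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                     workDesc,
                     CUTENSORNET_MEMSPACE_DEVICE,
                     CUTENSORNET_WORKSPACE_SCRATCH,
                     scratchPtr,
                     recommendedScratchSize) );

The same pattern applies to the CACHE workspace; a larger CACHE generally allows more intermediate tensors to be kept for the backward pass.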
Contraction plan and auto-tune
We create a tensor network contraction plan holding all pairwise tensor contraction plans for cuTENSOR. Optionally, we can auto-tune the plan such that cuTENSOR selects the best kernel for each pairwise contraction. This contraction plan can be reused for many (possibly different) data inputs, avoiding the cost of initializing this plan redundantly.
   /*******************************
   * Initialize the pairwise contraction plan (for cuTENSOR).
   *******************************/

   cutensornetContractionPlan_t plan;
   HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
                     descNet,
                     optimizerInfo,
                     workDesc,
                     &plan) );

   /*******************************
   * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel
   * for each pairwise tensor contraction.
   *******************************/
   cutensornetContractionAutotunePreference_t autotunePref;
   HANDLE_ERROR( cutensornetCreateContractionAutotunePreference(handle,
                     &autotunePref) );

   const int numAutotuningIterations = 5; // may be 0
   HANDLE_ERROR( cutensornetContractionAutotunePreferenceSetAttribute(
                     handle,
                     autotunePref,
                     CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
                     &numAutotuningIterations,
                     sizeof(numAutotuningIterations)) );

   // Modify the plan again to find the best pair-wise contractions
   HANDLE_ERROR( cutensornetContractionAutotune(handle,
                     plan,
                     rawDataIn_d,
                     O_d,
                     workDesc,
                     autotunePref,
                     stream) );

   HANDLE_ERROR( cutensornetDestroyContractionAutotunePreference(autotunePref) );

   if(verbose)
      printf("Created a contraction plan for cuTensorNet and optionally auto-tuned it\n");
Tensor network contraction execution and gradient computation
Finally, we contract the tensor network as many times as needed, possibly with different input data each time. Contracting the network stores the intermediate tensors' data in the CACHE memory, which the subsequent backward-propagation call consumes to compute the requested gradients. We also purge the CACHE memory at the end of each iteration so that subsequent iterations can reuse it. (An optional finite-difference check of the computed gradients is sketched after the code below.)
   /**********************
   * Execute the tensor network contraction
   **********************/

   // Create a cutensornetSliceGroup_t object from a range of slice IDs
   cutensornetSliceGroup_t sliceGroup{};
   HANDLE_ERROR( cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup) );

   GPUTimer timer {stream};
   double minTimeCUTENSORNET = 1e100;
   const int numRuns = 3; // number of repeats to get stable performance results
   for (int i = 0; i < numRuns; ++i)
   {
      HANDLE_CUDA_ERROR( cudaMemcpy(O_d, O_h, sizeVec[numInputs], cudaMemcpyHostToDevice) ); // restore the output tensor on GPU
      HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );

      /*
       * Contract all slices of the tensor network
       */
      timer.start();

      int32_t accumulateOutput = 0; // output tensor data will be overwritten
      HANDLE_ERROR( cutensornetContractSlices(handle,
                        plan,
                        rawDataIn_d,
                        O_d,
                        accumulateOutput,
                        workDesc,
                        sliceGroup, // alternatively, NULL can also be used to contract over all slices instead of specifying a sliceGroup object
                        stream) );

      HANDLE_ERROR( cutensornetComputeGradientsBackward(handle,
                        plan,
                        rawDataIn_d,
                        outputActivation_d,
                        gradientsOut_d,
                        accumulateOutput,
                        workDesc,
                        stream) );

      // Purge the cache to make room for the next run to use cache memory
      HANDLE_ERROR( cutensornetWorkspacePurgeCache(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE) );

      // Synchronize and measure best timing
      auto time = timer.seconds();
      minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
   }

   if(verbose)
      printf("Contracted the tensor network and computed gradients\n");

   HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) ); // restore the output tensor on Host

   for (auto i : gradInputIDs)
   {
      HANDLE_CUDA_ERROR( cudaMemcpy(gradientsOut_h[i], gradientsOut_d[i], sizeVec[i], cudaMemcpyDeviceToHost) );
   }

   /*************************/

   if(verbose) {
      printf("Tensor network contraction and back-propagation time (ms) = %.3f\n", minTimeCUTENSORNET * 1000.f);
   }
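As an optional sanity check (not part of the sample), one could compare a single backward-propagated gradient entry against a central finite difference of the scalar loss L = Σ_{a,m} O_{a,m}, which is what the all-ones activation tensor corresponds to. The sketch below reuses the handles and buffers defined above; note that at the extents and FP32 precision used in this sample the finite difference is dominated by round-off, so the check is only meaningful with smaller extents and/or double precision:

   // Hypothetical gradient check: perturb one element of tensor A and re-contract.
   auto contractLoss = [&]() -> double
   {
      HANDLE_ERROR( cutensornetContractSlices(handle, plan, rawDataIn_d, O_d,
                                              /*accumulateOutput=*/0, workDesc,
                                              sliceGroup, stream) );
      HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream) );
      HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) );
      double loss = 0.0;
      for (size_t e = 0; e < elementsVec[numInputs]; ++e)
         loss += O_h[e];
      return loss;
   };

   const size_t probe = 0;       // element of A to check (arbitrary choice)
   const floatType eps = 1e-2;   // finite-difference step (tune for the data type)
   const floatType saved = rawDataIn_h[0][probe];

   rawDataIn_h[0][probe] = saved + eps;
   HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[0], rawDataIn_h[0], sizeVec[0], cudaMemcpyHostToDevice) );
   const double lossPlus = contractLoss();

   rawDataIn_h[0][probe] = saved - eps;
   HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[0], rawDataIn_h[0], sizeVec[0], cudaMemcpyHostToDevice) );
   const double lossMinus = contractLoss();

   rawDataIn_h[0][probe] = saved;  // restore the original data on host and device
   HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[0], rawDataIn_h[0], sizeVec[0], cudaMemcpyHostToDevice) );

   const double fdGrad = (lossPlus - lossMinus) / (2.0 * eps);
   printf("dL/dA[%zu]: finite difference = %e, backward propagation = %e\n",
          probe, fdGrad, (double)(((floatType*)gradientsOut_h[0])[probe]));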
Free resources
After the computation, we need to free up all resources.
   /***************
   * Free resources
   ****************/

   // Free cuTensorNet resources
   HANDLE_ERROR( cutensornetDestroySliceGroup(sliceGroup) );
   HANDLE_ERROR( cutensornetDestroyContractionPlan(plan) );
   HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
   HANDLE_ERROR( cutensornetDestroyContractionOptimizerInfo(optimizerInfo) );
   HANDLE_ERROR( cutensornetDestroyNetworkDescriptor(descNet) );
   HANDLE_ERROR( cutensornetDestroy(handle) );

   // Free Host memory resources
   if (O_h) free(O_h);
   if (outputActivation_h) free(outputActivation_h);
   for (int i = 0; i < numInputs; ++i)
   {
      if (rawDataIn_h[i])
         free(rawDataIn_h[i]);
      if (gradientsOut_h[i])
         free(gradientsOut_h[i]);
   }
   // Free GPU memory resources
   if (workScratch) cudaFree(workScratch);
   if (workCache) cudaFree(workCache);
   if (O_d) cudaFree(O_d);
   if (outputActivation_d) cudaFree(outputActivation_d);
   for (int i = 0; i < numInputs; ++i)
   {
      if (rawDataIn_d[i])
         cudaFree(rawDataIn_d[i]);
      if (gradientsOut_d[i])
         cudaFree(gradientsOut_d[i]);
   }

   // Destroy the CUDA stream
   HANDLE_CUDA_ERROR( cudaStreamDestroy(stream) );

   if(verbose)
      printf("Freed resources and exited\n");

   return 0;
}