Computing gradients via backward propagation
The following code example illustrates how to compute the gradients of a tensor network w.r.t. user-selected input tensors via backward propagation. The full code can be found in the NVIDIA/cuQuantum repository.
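Conceptually, the backward pass applies the chain rule through the contraction tree. As a brief sketch in our own notation (not part of the sample): for a scalar loss L, the "activation" tensor supplied to the backward call is the output adjoint \bar{O} = \partial L / \partial O. Since the network is multilinear in its inputs, the gradient w.r.t. an input, say A, is obtained by contracting \bar{O} with all remaining input tensors:

\bar{A}_{a,b,c,d} = \frac{\partial L}{\partial A_{a,b,c,d}}
                  = \sum_{e,g,h,i,j,k,l,m} \bar{O}_{a,m}\, B_{b,c,d,e}\, C_{e,g,h}\, D_{g,h,i,j}\, E_{i,j,k,l}\, F_{k,l,m}

With the all-ones activation used in this sample, \bar{A} is therefore the gradient of the sum of all output elements.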
Headers and data types
#include <stdlib.h>
#include <stdio.h>

#include <unordered_map>
#include <vector>
#include <cassert>

#include <cuda_runtime.h>
#include <cutensornet.h>


#define HANDLE_ERROR(x) \
{ const auto err = x; \
  if( err != CUTENSORNET_STATUS_SUCCESS ) \
  { printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
    fflush(stdout); \
  } \
};

#define HANDLE_CUDA_ERROR(x) \
{ const auto err = x; \
  if( err != cudaSuccess ) \
  { printf("CUDA Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); \
    fflush(stdout); \
  } \
};


struct GPUTimer
{
   GPUTimer(cudaStream_t stream): stream_(stream)
   {
      cudaEventCreate(&start_);
      cudaEventCreate(&stop_);
   }

   ~GPUTimer()
   {
      cudaEventDestroy(start_);
      cudaEventDestroy(stop_);
   }

   void start()
   {
      cudaEventRecord(start_, stream_);
   }

   float seconds()
   {
      cudaEventRecord(stop_, stream_);
      cudaEventSynchronize(stop_);
      float time;
      cudaEventElapsedTime(&time, start_, stop_);
      return time * 1e-3;
   }

   private:
   cudaEvent_t start_, stop_;
   cudaStream_t stream_;
};


int main()
{
   static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

   bool verbose = true;

   // Check cuTensorNet version
   const size_t cuTensornetVersion = cutensornetGetVersion();
   if(verbose)
      printf("cuTensorNet version: %zu\n", cuTensornetVersion);

   // Set GPU device
   int numDevices {0};
   HANDLE_CUDA_ERROR( cudaGetDeviceCount(&numDevices) );
   const int deviceId = 0;
   HANDLE_CUDA_ERROR( cudaSetDevice(deviceId) );
   cudaDeviceProp prop;
   HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );

   if(verbose) {
      printf("===== device info ======\n");
      printf("GPU-name:%s\n", prop.name);
      printf("GPU-clock:%d\n", prop.clockRate);
      printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
      printf("GPU-nSM:%d\n", prop.multiProcessorCount);
      printf("GPU-major:%d\n", prop.major);
      printf("GPU-minor:%d\n", prop.minor);
      printf("========================\n");
   }

   typedef float floatType;
   cudaDataType_t typeData = CUDA_R_32F;
   cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;

   if(verbose)
      printf("Included headers and defined data types\n");
Define tensor network and tensor sizes
Next, we define the structure of the tensor network (i.e., the modes of the tensors, their extents, and their connectivity), and specify the IDs of the input tensors whose gradients will be computed.
   /**********************
   * Computing: O_{a,m} = A_{a,b,c,d} B_{b,c,d,e} C_{e,g,h} D_{g,h,i,j} E_{i,j,k,l} F_{k,l,m}
   * We will execute the contraction and compute the gradients of input tensors A, B, C
   **********************/

   constexpr int32_t numInputs = 6;
   std::vector<int32_t> gradInputIDs = {0, 1, 2};

   // Create vectors of tensor modes
   std::vector<std::vector<int32_t>> modesVec {
      {'a','b','c','d'},
      {'b','c','d','e'},
      {'e','g','h'},
      {'g','h','i','j'},
      {'i','j','k','l'},
      {'k','l','m'},
      {'a','m'}
   };

   // Set mode extents
   int64_t sameExtent = 36; // setting same extent for simplicity. In principle extents can differ.
   std::unordered_map<int32_t, int64_t> extent;
   for (auto &vec: modesVec)
   {
      for (auto &mode: vec)
      {
         extent[mode] = sameExtent;
      }
   }

   // Create a vector of extents for each tensor
   std::vector<std::vector<int64_t>> extentVec;
   extentVec.resize(numInputs+1); // hold inputs + output tensors
   for (int i = 0; i < numInputs+1; ++i)
   {
      for (auto mode : modesVec[i])
         extentVec[i].push_back(extent[mode]);
   }

   if(verbose)
      printf("Defined tensor network, modes, and extents\n");
Allocate memory, initialize data, initialize cuTensorNet handle
Next, we allocate memory for the tensor network operands and initialize them to random values. We also allocate memory for the gradient tensors corresponding to the input tensors selected for gradient computation, as well as for the output activation tensor, which we initialize to ones. Then, we initialize the cuTensorNet library via cutensornetCreate().
   /**********************
   * Allocating data
   **********************/

   std::vector<size_t> elementsVec;
   elementsVec.resize(numInputs+1); // hold inputs + output tensors
   for (int i = 0; i < numInputs+1; ++i)
   {
      elementsVec[i] = 1;
      for (auto mode : modesVec[i])
         elementsVec[i] *= extent[mode];
   }

   size_t totalSize = 0;
   std::vector<size_t> sizeVec;
   sizeVec.resize(numInputs+1); // hold inputs + output tensors
   for (int i = 0; i < numInputs+1; ++i)
   {
      sizeVec[i] = sizeof(floatType) * elementsVec[i];
      totalSize += sizeVec[i];
   }
   if(verbose)
      printf("Total GPU memory used for tensor storage: %.2f GiB\n",
             (totalSize) / 1024. / 1024. / 1024.);

   void* rawDataIn_d[numInputs];
   void* O_d;
   void* outputActivation_d;
   for (int i = 0; i < numInputs; ++i)
   {
      HANDLE_CUDA_ERROR( cudaMalloc((void**) &rawDataIn_d[i], sizeVec[i]) );
   }
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &O_d, sizeVec[numInputs]) );
   HANDLE_CUDA_ERROR( cudaMalloc((void**) &outputActivation_d, sizeVec[numInputs]) );

   floatType* rawDataIn_h[numInputs];
   for (int i = 0; i < numInputs; ++i)
   {
      rawDataIn_h[i] = (floatType*) malloc(sizeVec[i]);
      if (rawDataIn_h[i] == NULL)
      {
         printf("Error: Host memory allocation failed!\n");
         return -1;
      }
   }
   floatType *O_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
   if (O_h == NULL)
   {
      printf("Error: Host memory allocation failed!\n");
      return -1;
   }
   floatType *outputActivation_h = (floatType*) malloc(sizeof(floatType) * elementsVec[numInputs]);
   if (outputActivation_h == NULL)
   {
      printf("Error: Host memory allocation failed!\n");
      return -1;
   }

   void* gradientsOut_d[numInputs] = {nullptr};
   for (auto i : gradInputIDs)
   {
      HANDLE_CUDA_ERROR( cudaMalloc((void**) &gradientsOut_d[i], sizeVec[i]) );
   }
   void* gradientsOut_h[numInputs] = {nullptr};
   for (auto i : gradInputIDs)
   {
      gradientsOut_h[i] = (floatType*) malloc(sizeVec[i]);
      if (gradientsOut_h[i] == NULL)
      {
         printf("Error: Host memory allocation failed!\n");
         return -1;
      }
   }

   /*******************
   * Initialize data
   *******************/

   memset(O_h, 0, sizeof(floatType) * elementsVec[numInputs]);
   for (int i = 0; i < numInputs; ++i)
   {
      for (size_t e = 0; e < elementsVec[i]; ++e)
         rawDataIn_h[i][e] = ((floatType) rand()) / RAND_MAX;
   }
   for (size_t e = 0; e < elementsVec[numInputs]; ++e)
      outputActivation_h[e] = (floatType) 1.0;

   for (int i = 0; i < numInputs; ++i)
   {
      HANDLE_CUDA_ERROR( cudaMemcpy(rawDataIn_d[i], rawDataIn_h[i], sizeVec[i], cudaMemcpyHostToDevice) );
   }
   HANDLE_CUDA_ERROR( cudaMemcpy(outputActivation_d, outputActivation_h, sizeVec[numInputs], cudaMemcpyHostToDevice) );

   if(verbose)
      printf("Allocated GPU memory for data, initialized data, and created a library handle\n");

   /*************************
   * cuTensorNet
   *************************/

   cudaStream_t stream;
   HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );

   cutensornetHandle_t handle;
   HANDLE_ERROR( cutensornetCreate(&handle) );
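The repetitive allocate-and-check pattern above could be condensed with small checked-allocation helpers. A minimal sketch of our own (not part of the sample); these would be defined at file scope and used as, e.g., rawDataIn_h[i] = (floatType*) hostAlloc(sizeVec[i]):

   // Hypothetical helpers (not part of the sample): allocate device/host buffers
   // and abort on failure, so each call site stays a single line.
   static void* deviceAlloc(size_t bytes)
   {
      void* ptr = nullptr;
      if (cudaMalloc(&ptr, bytes) != cudaSuccess)
      {
         printf("Error: GPU memory allocation of %zu bytes failed!\n", bytes);
         exit(EXIT_FAILURE);
      }
      return ptr;
   }

   static void* hostAlloc(size_t bytes)
   {
      void* ptr = malloc(bytes);
      if (ptr == NULL)
      {
         printf("Error: Host memory allocation of %zu bytes failed!\n", bytes);
         exit(EXIT_FAILURE);
      }
      return ptr;
   }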
Create the network descriptor and set gradient tensor IDs
Next, we create the network descriptor with the desired tensor modes, extents, and strides, as well as the data and compute types. We also set, on the network descriptor, the IDs of the input tensors whose gradients will be computed. Note that the created library context will be associated with the currently active GPU.
   /*******************************
   * Create Network Descriptor
   *******************************/

   int32_t* modesIn[numInputs];
   int32_t numModesIn[numInputs];
   int64_t* extentsIn[numInputs];
   int64_t* stridesIn[numInputs];

   for (int i = 0; i < numInputs; ++i)
   {
      modesIn[i] = modesVec[i].data();
      numModesIn[i] = modesVec[i].size();
      extentsIn[i] = extentVec[i].data();
      stridesIn[i] = NULL; // strides are optional; if no stride is provided, cuTensorNet assumes a generalized column-major data layout
   }

   // Set up tensor network
   cutensornetNetworkDescriptor_t descNet;
   HANDLE_ERROR( cutensornetCreateNetworkDescriptor(handle,
                     numInputs, numModesIn, extentsIn, stridesIn, modesIn, NULL,
                     modesVec[numInputs].size(), extentVec[numInputs].data(), /*stridesOut = */NULL, modesVec[numInputs].data(),
                     typeData, typeCompute,
                     &descNet) );

   /*******************************
   * Set the input tensor IDs that require gradients
   *******************************/

   cutensornetTensorIDList_t tensorIDList {
      .numTensors = static_cast<int32_t>(gradInputIDs.size()),
      .data = gradInputIDs.data()
   };

   HANDLE_ERROR( cutensornetNetworkSetAttribute(handle,
                     descNet,
                     CUTENSORNET_NETWORK_INPUT_TENSORS_REQUIRE_GRAD,
                     &tensorIDList,
                     sizeof(cutensornetTensorIDList_t)) );

   if(verbose)
      printf("Initialized the cuTensorNet library and created a tensor network descriptor\n");
Contraction order
In this example, we illustrate the use of a predetermined contraction path, which we set on the optimizer info object via cutensornetContractionOptimizerInfoSetAttribute().
   /*******************************
   * Choose workspace limit based on available resources.
   *******************************/

   size_t freeMem, totalMem;
   HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) );
   uint64_t workspaceLimit = (uint64_t)((double)freeMem * 0.9);
   if(verbose)
      printf("Workspace limit = %lu\n", workspaceLimit);

   /*******************************
   * Set contraction order
   *******************************/

   // Create contraction optimizer info
   cutensornetContractionOptimizerInfo_t optimizerInfo;
   HANDLE_ERROR( cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo) );

   // Set a predetermined contraction path
   std::vector<int32_t> path{0,1,0,4,0,3,0,2,0,1};
   const auto numContractions = numInputs - 1;
   cutensornetContractionPath_t contPath;
   contPath.data = reinterpret_cast<cutensornetNodePair_t*>(const_cast<int32_t*>(path.data()));
   contPath.numContractions = numContractions;

   // Provide the user-specified contPath
   HANDLE_ERROR( cutensornetContractionOptimizerInfoSetAttribute(
                     handle,
                     optimizerInfo,
                     CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
                     &contPath,
                     sizeof(contPath)) );
   int64_t numSlices = 1;

   if(verbose)
      printf("Set predetermined contraction path into cuTensorNet optimizer\n");
Create workspace descriptor and allocate workspace memory
Next, we create a workspace descriptor, compute the workspace sizes, and query the minimum workspace size needed to contract the network. To enable gradient computation, we must also provide a CACHE workspace, which stores the intermediate tensors' data that the backward propagation call consumes. Thus, we query the sizes of, and allocate device memory for, both workspace kinds (CUTENSORNET_WORKSPACE_SCRATCH and CUTENSORNET_WORKSPACE_CACHE) and set them in the workspace descriptor. The workspace descriptor will be provided to the contraction plan creation, contraction execution, and gradient computation APIs.
   /*******************************
   * Create workspace descriptor, allocate workspace, and set it.
   *******************************/

   cutensornetWorkspaceDescriptor_t workDesc;
   HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );

   // Set the SCRATCH workspace, which is used during each network contraction operation and not needed afterwards
   int64_t requiredWorkspaceSizeScratch = 0;
   HANDLE_ERROR( cutensornetWorkspaceComputeContractionSizes(handle,
                     descNet,
                     optimizerInfo,
                     workDesc) );

   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                     workDesc,
                     CUTENSORNET_WORKSIZE_PREF_MIN,
                     CUTENSORNET_MEMSPACE_DEVICE,
                     CUTENSORNET_WORKSPACE_SCRATCH,
                     &requiredWorkspaceSizeScratch) );

   void* workScratch = nullptr;
   HANDLE_CUDA_ERROR( cudaMalloc(&workScratch, requiredWorkspaceSizeScratch) );

   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                     workDesc,
                     CUTENSORNET_MEMSPACE_DEVICE,
                     CUTENSORNET_WORKSPACE_SCRATCH,
                     workScratch,
                     requiredWorkspaceSizeScratch) );

   // Set the CACHE workspace, which is used across network contraction operations
   int64_t requiredWorkspaceSizeCache = 0;
   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                     workDesc,
                     CUTENSORNET_WORKSIZE_PREF_MIN,
                     CUTENSORNET_MEMSPACE_DEVICE,
                     CUTENSORNET_WORKSPACE_CACHE,
                     &requiredWorkspaceSizeCache) );

   void* workCache = nullptr;
   HANDLE_CUDA_ERROR( cudaMalloc(&workCache, requiredWorkspaceSizeCache) );

   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                     workDesc,
                     CUTENSORNET_MEMSPACE_DEVICE,
                     CUTENSORNET_WORKSPACE_CACHE,
                     workCache,
                     requiredWorkspaceSizeCache) );

   if(verbose)
      printf("Allocated and set up the GPU workspace\n");
Contraction plan and auto-tune
We create a tensor network contraction plan holding all pairwise tensor contraction plans for cuTENSOR. Optionally, we can auto-tune the plan so that cuTENSOR selects the best kernel for each pairwise contraction. This contraction plan can be reused for many (possibly different) data inputs, avoiding the cost of initializing the plan redundantly.
   /*******************************
   * Initialize the pairwise contraction plan (for cuTENSOR).
   *******************************/

   cutensornetContractionPlan_t plan;
   HANDLE_ERROR( cutensornetCreateContractionPlan(handle,
                     descNet,
                     optimizerInfo,
                     workDesc,
                     &plan) );

   /*******************************
   * Optional: Auto-tune the cuTENSOR contraction plan to pick the fastest kernel
   *           for each pairwise tensor contraction.
   *******************************/

   cutensornetContractionAutotunePreference_t autotunePref;
   HANDLE_ERROR( cutensornetCreateContractionAutotunePreference(handle,
                     &autotunePref) );

   const int numAutotuningIterations = 5; // may be 0
   HANDLE_ERROR( cutensornetContractionAutotunePreferenceSetAttribute(
                     handle,
                     autotunePref,
                     CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
                     &numAutotuningIterations,
                     sizeof(numAutotuningIterations)) );

   // Modify the plan again to find the best pairwise contractions
   HANDLE_ERROR( cutensornetContractionAutotune(handle,
                     plan,
                     rawDataIn_d,
                     O_d,
                     workDesc,
                     autotunePref,
                     stream) );

   HANDLE_ERROR( cutensornetDestroyContractionAutotunePreference(autotunePref) );

   if(verbose)
      printf("Created a contraction plan for cuTensorNet and optionally auto-tuned it\n");
Tensor network contraction execution and gradient computation
Finally, we contract the tensor network as many times as needed, possibly with different input data each time. After contracting the network (which stores the intermediate tensors' data in the CACHE memory), we compute the requested gradients via backward propagation. We also purge the CACHE memory at the end of each iteration so that subsequent calls can use the available CACHE memory.
   /**********************
   * Execute the tensor network contraction
   **********************/

   // Create a cutensornetSliceGroup_t object from a range of slice IDs
   cutensornetSliceGroup_t sliceGroup{};
   HANDLE_ERROR( cutensornetCreateSliceGroupFromIDRange(handle, 0, numSlices, 1, &sliceGroup) );

   GPUTimer timer {stream};
   double minTimeCUTENSORNET = 1e100;
   const int numRuns = 3; // number of repeats to get stable performance results
   for (int i = 0; i < numRuns; ++i)
   {
      HANDLE_CUDA_ERROR( cudaMemcpy(O_d, O_h, sizeVec[numInputs], cudaMemcpyHostToDevice) ); // restore the output tensor on GPU
      HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );

      /*
      * Contract all slices of the tensor network
      */
      timer.start();

      int32_t accumulateOutput = 0; // output tensor data will be overwritten
      HANDLE_ERROR( cutensornetContractSlices(handle,
                        plan,
                        rawDataIn_d,
                        O_d,
                        accumulateOutput,
                        workDesc,
                        sliceGroup, // alternatively, NULL can be used to contract over all slices instead of specifying a sliceGroup object
                        stream) );

      HANDLE_ERROR( cutensornetComputeGradientsBackward(handle,
                        plan,
                        rawDataIn_d,
                        outputActivation_d,
                        gradientsOut_d,
                        accumulateOutput,
                        workDesc,
                        stream) );

      // Purge the cache to make room for the next run to use cache memory
      HANDLE_ERROR( cutensornetWorkspacePurgeCache(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE) );

      // Synchronize and keep the best timing
      auto time = timer.seconds();
      minTimeCUTENSORNET = (time > minTimeCUTENSORNET) ? minTimeCUTENSORNET : time;
   }

   if(verbose)
      printf("Contracted the tensor network and computed gradients\n");

   HANDLE_CUDA_ERROR( cudaMemcpy(O_h, O_d, sizeVec[numInputs], cudaMemcpyDeviceToHost) ); // copy the output tensor back to the host

   for (auto i : gradInputIDs)
   {
      HANDLE_CUDA_ERROR( cudaMemcpy(gradientsOut_h[i], gradientsOut_d[i], sizeVec[i], cudaMemcpyDeviceToHost) );
   }

   /*************************/

   if(verbose) {
      printf("Tensor network contraction and back-propagation time (ms) = %.3f\n", minTimeCUTENSORNET * 1000.f);
   }
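As a quick sanity check, one might inspect a few gradient entries on the host; a minimal sketch of our own (not part of the sample). With the all-ones activation used above, these are gradients of the sum of all output elements:

   // Hypothetical sanity check: print the first few gradient elements of each requested tensor.
   for (auto i : gradInputIDs)
   {
      const floatType* grad = static_cast<const floatType*>(gradientsOut_h[i]);
      printf("Gradient of input tensor %d, first elements:", i);
      for (size_t e = 0; e < 4 && e < elementsVec[i]; ++e)
         printf(" %g", (double) grad[e]);
      printf("\n");
   }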
Free resources
After the computation, we need to free up all resources.
   /***************
   * Free resources
   ****************/

   // Free cuTensorNet resources
   HANDLE_ERROR( cutensornetDestroySliceGroup(sliceGroup) );
   HANDLE_ERROR( cutensornetDestroyContractionPlan(plan) );
   HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
   HANDLE_ERROR( cutensornetDestroyContractionOptimizerInfo(optimizerInfo) );
   HANDLE_ERROR( cutensornetDestroyNetworkDescriptor(descNet) );
   HANDLE_ERROR( cutensornetDestroy(handle) );

   // Free Host memory resources
   if (O_h) free(O_h);
   if (outputActivation_h) free(outputActivation_h);
   for (int i = 0; i < numInputs; ++i)
   {
      if (rawDataIn_h[i])
         free(rawDataIn_h[i]);
      if (gradientsOut_h[i])
         free(gradientsOut_h[i]);
   }

   // Free GPU memory resources
   if (workScratch) cudaFree(workScratch);
   if (workCache) cudaFree(workCache);
   if (O_d) cudaFree(O_d);
   if (outputActivation_d) cudaFree(outputActivation_d);
   for (int i = 0; i < numInputs; ++i)
   {
      if (rawDataIn_d[i])
         cudaFree(rawDataIn_d[i]);
      if (gradientsOut_d[i])
         cudaFree(gradientsOut_d[i]);
   }

   if(verbose)
      printf("Freed resources and exited\n");

   return 0;
}