Getting Started¶
In this section, we show how to contract a tensor network using cuTensorNet. First, we describe how to install the library and compile a sample code. Then, we present the example code used to perform common steps in cuTensorNet. In this example, we perform the following tensor contraction:

D_{m,x,n,y} = A_{m,h,k,n} B_{u,k,h} C_{x,u,y}
We build the code up step by step, with each step appending code to the end of the previous one. The steps are separated by succinct multi-line comment blocks.
It is recommended that the reader refer to the Overview and the cuTENSOR documentation to become familiar with the nomenclature and with cuTENSOR operations.
Installation and Compilation¶
Download the cuQuantum package (which cuTensorNet is part of) from https://developer.nvidia.com/cuQuantum-downloads, and the cuTENSOR package from https://developer.nvidia.com/cutensor.
Linux¶
Assuming cuQuantum has been extracted in CUQUANTUM_ROOT and cuTENSOR in CUTENSOR_ROOT, we update the library path as follows:

export LD_LIBRARY_PATH=${CUQUANTUM_ROOT}/lib:${CUTENSOR_ROOT}/lib/11:${LD_LIBRARY_PATH}

Depending on your CUDA Toolkit, you might have to choose a different library version (e.g., ${CUTENSOR_ROOT}/lib/11.0).
The sample code discussed below (tensornet_example.cu) can be compiled via the following command:
nvcc tensornet_example.cu -I${CUQUANTUM_ROOT}/include -I${CUTENSOR_ROOT}/include -L${CUQUANTUM_ROOT}/lib -L${CUTENSOR_ROOT}/lib/11 -lcutensornet -lcutensor -o tensornet_example
To link statically against the cuTensorNet library, use the following command (note that libmetis_static.a needs to be linked against explicitly; it is assumed to be installed through the NVIDIA CUDA Toolkit and accessible through $LIBRARY_PATH):
nvcc tensornet_example.cu -I${CUQUANTUM_ROOT}/include -I${CUTENSOR_ROOT}/include ${CUQUANTUM_ROOT}/lib/libcutensornet_static.a -L${CUTENSOR_ROOT}/lib/11 -lcutensor libmetis_static.a -o tensornet_example
Note

Depending on the source of the cuQuantum package, you may need to replace lib above with lib64.
Code Example¶
The following code example shows the common steps necessary to use cuTensorNet and performs some typical operations in tensor network contraction. The full sample code can be found in the NVIDIA/cuQuantum repository (here).
Headers and Data Types¶
#include <stdlib.h>
#include <stdio.h>

#include <unordered_map>
#include <vector>
#include <cassert>

#include <cuda_runtime.h>
#include <cutensornet.h>
#include <cutensor.h>

#define HANDLE_ERROR(x) \
{ const auto err = x; \
  if( err != CUTENSORNET_STATUS_SUCCESS ) \
  { printf("Error: %s in line %d\n", cutensornetGetErrorString(err), __LINE__); return err; } \
};

#define HANDLE_CUDA_ERROR(x) \
{ const auto err = x; \
  if( err != cudaSuccess ) \
  { printf("Error: %s in line %d\n", cudaGetErrorString(err), __LINE__); return err; } \
};

struct GPUTimer
{
   GPUTimer(cudaStream_t stream): stream_(stream)
   {
      cudaEventCreate(&start_);
      cudaEventCreate(&stop_);
   }

   ~GPUTimer()
   {
      cudaEventDestroy(start_);
      cudaEventDestroy(stop_);
   }

   void start()
   {
      cudaEventRecord(start_, stream_);
   }

   // Returns the time elapsed since the last start(), in seconds
   float seconds()
   {
      cudaEventRecord(stop_, stream_);
      cudaEventSynchronize(stop_);
      float time;
      cudaEventElapsedTime(&time, start_, stop_);
      return time * 1e-3;
   }

   private:
   cudaEvent_t start_, stop_;
   cudaStream_t stream_;
};


int main()
{
   const size_t cuTensornetVersion = cutensornetGetVersion();
   printf("cuTensorNet-vers:%ld\n", cuTensornetVersion);

   cudaDeviceProp prop;
   int32_t deviceId = -1;
   HANDLE_CUDA_ERROR( cudaGetDevice(&deviceId) );
   HANDLE_CUDA_ERROR( cudaGetDeviceProperties(&prop, deviceId) );

   printf("===== device info ======\n");
   printf("GPU-name:%s\n", prop.name);
   printf("GPU-clock:%d\n", prop.clockRate);
   printf("GPU-memoryClock:%d\n", prop.memoryClockRate);
   printf("GPU-nSM:%d\n", prop.multiProcessorCount);
   printf("GPU-major:%d\n", prop.major);
   printf("GPU-minor:%d\n", prop.minor);
   printf("========================\n");

   typedef float floatType;
   cudaDataType_t typeData = CUDA_R_32F;
   cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F;

   printf("Include headers and define data types\n");
Define Tensor Network and Tensor Sizes¶
Next, we define the topology of the tensor network (i.e., the modes of the tensors, their extents, and their connectivity).
   /**********************
   * Computing: D_{m,x,n,y} = A_{m,h,k,n} B_{u,k,h} C_{x,u,y}
   **********************/

   constexpr int32_t numInputs = 3;

   // Create vector of modes
   std::vector<int32_t> modesA{'m','h','k','n'};
   std::vector<int32_t> modesB{'u','k','h'};
   std::vector<int32_t> modesC{'x','u','y'};
   std::vector<int32_t> modesD{'m','x','n','y'};

   // Extents
   std::unordered_map<int32_t, int64_t> extent;
   extent['m'] = 96;
   extent['n'] = 96;
   extent['u'] = 96;
   extent['h'] = 64;
   extent['k'] = 64;
   extent['x'] = 64;
   extent['y'] = 64;

   // Create a vector of extents for each tensor
   std::vector<int64_t> extentA;
   for (auto mode : modesA)
      extentA.push_back(extent[mode]);
   std::vector<int64_t> extentB;
   for (auto mode : modesB)
      extentB.push_back(extent[mode]);
   std::vector<int64_t> extentC;
   for (auto mode : modesC)
      extentC.push_back(extent[mode]);
   std::vector<int64_t> extentD;
   for (auto mode : modesD)
      extentD.push_back(extent[mode]);

   printf("Define network, modes, and extents\n");
Allocate memory and initialize data¶
Next, we allocate memory for data and workspace, and randomly initialize the data.
   /**********************
   * Allocating data
   **********************/

   size_t elementsA = 1;
   for (auto mode : modesA)
      elementsA *= extent[mode];
   size_t elementsB = 1;
   for (auto mode : modesB)
      elementsB *= extent[mode];
   size_t elementsC = 1;
   for (auto mode : modesC)
      elementsC *= extent[mode];
   size_t elementsD = 1;
   for (auto mode : modesD)
      elementsD *= extent[mode];

   size_t sizeA = sizeof(floatType) * elementsA;
   size_t sizeB = sizeof(floatType) * elementsB;
   size_t sizeC = sizeof(floatType) * elementsC;
   size_t sizeD = sizeof(floatType) * elementsD;
   printf("Total memory: %.2f GiB\n", (sizeA + sizeB + sizeC + sizeD)/1024./1024./1024);

   void* rawDataIn_d[numInputs];
   void* D_d;
   HANDLE_CUDA_ERROR(cudaMalloc((void**) &rawDataIn_d[0], sizeA));
   HANDLE_CUDA_ERROR(cudaMalloc((void**) &rawDataIn_d[1], sizeB));
   HANDLE_CUDA_ERROR(cudaMalloc((void**) &rawDataIn_d[2], sizeC));
   HANDLE_CUDA_ERROR(cudaMalloc((void**) &D_d, sizeD));

   floatType *A = (floatType*) malloc(sizeof(floatType) * elementsA);
   floatType *B = (floatType*) malloc(sizeof(floatType) * elementsB);
   floatType *C = (floatType*) malloc(sizeof(floatType) * elementsC);
   floatType *D = (floatType*) malloc(sizeof(floatType) * elementsD);

   if (A == NULL || B == NULL || C == NULL || D == NULL)
   {
      printf("Error: Host allocation of A, B, C, or D failed.\n");
      return -1;
   }

   /**********************
   * Allocate workspace
   **********************/

   size_t freeMem, totalMem;
   HANDLE_CUDA_ERROR( cudaMemGetInfo(&freeMem, &totalMem) );

   uint64_t worksize = freeMem * 0.9;

   void *work = nullptr;
   HANDLE_CUDA_ERROR( cudaMalloc(&work, worksize) );

   /*******************
   * Initialize data
   *******************/

   for (uint64_t i = 0; i < elementsA; i++)
      A[i] = ((float) rand())/RAND_MAX;
   for (uint64_t i = 0; i < elementsB; i++)
      B[i] = ((float) rand())/RAND_MAX;
   for (uint64_t i = 0; i < elementsC; i++)
      C[i] = ((float) rand())/RAND_MAX;
   memset(D, 0, sizeof(floatType) * elementsD);

   HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[0], A, sizeA, cudaMemcpyHostToDevice));
   HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[1], B, sizeB, cudaMemcpyHostToDevice));
   HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[2], C, sizeC, cudaMemcpyHostToDevice));

   printf("Allocate memory for data and workspace, and initialize data.\n");
cuTensorNet handle and Network Descriptor¶
Next, we initialize the cuTensorNet library via cutensornetCreate()
and
create the network descriptor with the desired tensor modes, extents, and
strides, as well as the data and compute types.
   /*************************
   * cuTensorNet
   *************************/

   cudaStream_t stream;
   cudaStreamCreate(&stream);

   cutensornetHandle_t handle;
   HANDLE_ERROR(cutensornetCreate(&handle));

   const int32_t nmodeA = modesA.size();
   const int32_t nmodeB = modesB.size();
   const int32_t nmodeC = modesC.size();
   const int32_t nmodeD = modesD.size();

   /*******************************
   * Create Network Descriptor
   *******************************/

   const int32_t* modesIn[] = {modesA.data(), modesB.data(), modesC.data()};
   int32_t const numModesIn[] = {nmodeA, nmodeB, nmodeC};
   const int64_t* extentsIn[] = {extentA.data(), extentB.data(), extentC.data()};
   const int64_t* stridesIn[] = {NULL, NULL, NULL}; // strides are optional; if no stride is provided, cuTensorNet assumes a generalized column-major data layout

   // Note that pointers allocated via cudaMalloc are aligned to 256-byte
   // boundaries by default; here we check the pointer alignment explicitly
   // to demonstrate how one would determine the alignment of arbitrary pointers.

   auto getMaximalPointerAlignment = [](const void* ptr) {
      const uint64_t ptrAddr = reinterpret_cast<uint64_t>(ptr);
      uint32_t alignment = 1;
      while(ptrAddr % alignment == 0 &&
            alignment < 256) // we terminate once the alignment reaches 256 bytes (we could keep going, but any alignment of 256 bytes or larger is equally fine)
      {
         alignment *= 2;
      }
      return alignment;
   };
   const uint32_t alignmentsIn[] = {getMaximalPointerAlignment(rawDataIn_d[0]),
                                    getMaximalPointerAlignment(rawDataIn_d[1]),
                                    getMaximalPointerAlignment(rawDataIn_d[2])};
   const uint32_t alignmentOut = getMaximalPointerAlignment(D_d);

   // setup tensor network
   cutensornetNetworkDescriptor_t descNet;
   HANDLE_ERROR(cutensornetCreateNetworkDescriptor(handle,
                     numInputs, numModesIn, extentsIn, stridesIn, modesIn, alignmentsIn,
                     nmodeD, extentD.data(), /*stridesOut = */NULL, modesD.data(), alignmentOut,
                     typeData, typeCompute,
                     &descNet));

   printf("Initialize the cuTensorNet library and create a network descriptor.\n");
Optimal contraction order and slicing¶
At this stage, we can deploy the cuTensorNet optimizer to find an optimized contraction path and slicing combination.
It is also possible to feed a pre-determined path into the configuration.
We use the cutensornetContractionOptimize()
function after creating a config and info structures,
which allow us to specify various options to the optimizer.
   /*******************************
   * Find "optimal" contraction order and slicing
   *******************************/

   cutensornetContractionOptimizerConfig_t optimizerConfig;
   HANDLE_ERROR(cutensornetCreateContractionOptimizerConfig(handle, &optimizerConfig));

   // Set the value of the partitioner imbalance factor, if desired
   int imbalance_factor = 30;
   HANDLE_ERROR(cutensornetContractionOptimizerConfigSetAttribute(
                     handle,
                     optimizerConfig,
                     CUTENSORNET_CONTRACTION_OPTIMIZER_CONFIG_GRAPH_IMBALANCE_FACTOR,
                     &imbalance_factor,
                     sizeof(imbalance_factor)));

   cutensornetContractionOptimizerInfo_t optimizerInfo;
   HANDLE_ERROR(cutensornetCreateContractionOptimizerInfo(handle, descNet, &optimizerInfo));

   HANDLE_ERROR(cutensornetContractionOptimize(handle,
                                               descNet,
                                               optimizerConfig,
                                               worksize,
                                               optimizerInfo));

   int64_t numSlices = 0;
   HANDLE_ERROR(cutensornetContractionOptimizerInfoGetAttribute(
                     handle,
                     optimizerInfo,
                     CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_NUM_SLICES,
                     &numSlices,
                     sizeof(numSlices)));

   assert(numSlices > 0);

   printf("Find an optimized contraction path with cuTensorNet optimizer.\n");
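For reference, the following is a minimal sketch (not part of the sample code above) of how a pre-determined contraction path could be supplied to the optimizer info structure instead of calling cutensornetContractionOptimize(). It assumes the pairwise-replacement path convention, in which each contracted pair of tensors is removed from the list of tensors and the intermediate result is appended at its end; for our three-tensor network, the path {(0,1), (0,1)} first contracts A and B, then contracts the remaining C with the intermediate.

   // Sketch: feed a pre-determined contraction path (pairwise-replacement
   // convention assumed) into the optimizer info, instead of running the optimizer.
   cutensornetNodePair_t pairs[] = { {0, 1}, {0, 1} }; // (A,B) -> AB; then (C, AB) -> D
   cutensornetContractionPath_t path{2, pairs};
   HANDLE_ERROR(cutensornetContractionOptimizerInfoSetAttribute(
                     handle,
                     optimizerInfo,
                     CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_PATH,
                     &path,
                     sizeof(path)));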
Contraction plan and auto-tune¶
We create a contraction plan holding pair-wise contraction plans for cuTENSOR. Optionally, we can auto-tune the plan such that cuTENSOR selects the best kernel for each contraction. This contraction plan can be reused for many (possibly different) data inputs, avoiding the cost of initializing this plan redundantly. We also create a workspace descriptor, compute and query the minimum workspace size needed to contract the network, and provide this to the contraction plan.
   /*******************************
   * Initialize all pair-wise contraction plans (for cuTENSOR)
   *******************************/
   cutensornetContractionPlan_t plan;

   cutensornetWorkspaceDescriptor_t workDesc;
   HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc));

   uint64_t requiredWorkspaceSize = 0;
   HANDLE_ERROR(cutensornetWorkspaceComputeSizes(handle,
                                                 descNet,
                                                 optimizerInfo,
                                                 workDesc));

   HANDLE_ERROR(cutensornetWorkspaceGetSize(handle,
                                            workDesc,
                                            CUTENSORNET_WORKSIZE_PREF_MIN,
                                            CUTENSORNET_MEMSPACE_DEVICE,
                                            &requiredWorkspaceSize));
   if (worksize < requiredWorkspaceSize)
   {
      printf("Not enough workspace memory is available.\n");
      return -1; // bail out: without sufficient workspace, no contraction plan is created below
   }
   else
   {
      HANDLE_ERROR(cutensornetWorkspaceSet(handle,
                                           workDesc,
                                           CUTENSORNET_MEMSPACE_DEVICE,
                                           work,
                                           worksize));

      HANDLE_ERROR(cutensornetCreateContractionPlan(handle,
                                                    descNet,
                                                    optimizerInfo,
                                                    workDesc,
                                                    &plan));

      /*******************************
      * Optional: Auto-tune cuTENSOR's cutensorContractionPlan to pick the fastest kernel
      *******************************/
      cutensornetContractionAutotunePreference_t autotunePref;
      HANDLE_ERROR(cutensornetCreateContractionAutotunePreference(handle,
                                                                  &autotunePref));

      const int numAutotuningIterations = 5; // may be 0
      HANDLE_ERROR(cutensornetContractionAutotunePreferenceSetAttribute(
                        handle,
                        autotunePref,
                        CUTENSORNET_CONTRACTION_AUTOTUNE_MAX_ITERATIONS,
                        &numAutotuningIterations,
                        sizeof(numAutotuningIterations)));

      // modify the plan again to find the best pair-wise contractions
      HANDLE_ERROR(cutensornetContractionAutotune(handle,
                                                  plan,
                                                  rawDataIn_d,
                                                  D_d,
                                                  workDesc,
                                                  autotunePref,
                                                  stream));

      HANDLE_ERROR(cutensornetDestroyContractionAutotunePreference(autotunePref));

      printf("Create a contraction plan for cuTENSOR and optionally auto-tune it.\n");
Network contraction execution¶
Finally, we contract the network as many times as needed, possibly with different data (a reuse sketch appears after the code below). Network slices can be executed using the same contraction plan, potentially on different devices. We also clean up and free allocated resources.
      /**********************
      * Run
      **********************/
      GPUTimer timer{stream};
      double minTimeCUTENSOR = 1e100;
      const int numRuns = 3; // to get stable perf results
      for (int i=0; i < numRuns; ++i)
      {
         cudaMemcpy(D_d, D, sizeD, cudaMemcpyHostToDevice); // restore the output
         cudaDeviceSynchronize();

         /*
          * Contract over all slices.
          *
          * A user may choose to parallelize this loop across multiple devices.
          * (Note, however, that as of cuTensorNet v1.0.0 the contraction must
          * start from slice 0, see the cutensornetContraction documentation at
          * https://docs.nvidia.com/cuda/cuquantum/cutensornet/api/functions.html#cutensornetcontraction )
          */
         for (int64_t sliceId=0; sliceId < numSlices; ++sliceId)
         {
            timer.start();

            HANDLE_ERROR(cutensornetContraction(handle,
                                                plan,
                                                rawDataIn_d,
                                                D_d,
                                                workDesc, sliceId, stream));

            // Synchronize and measure timing
            auto time = timer.seconds();
            minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
         }
      }

      printf("Contract the network, each slice uses the same contraction plan.\n");

      /*************************/

      double flops = -1;

      HANDLE_ERROR(cutensornetContractionOptimizerInfoGetAttribute(
                        handle,
                        optimizerInfo,
                        CUTENSORNET_CONTRACTION_OPTIMIZER_INFO_FLOP_COUNT,
                        &flops,
                        sizeof(flops)));

      printf("numSlices: %ld\n", numSlices);
      printf("%.2f ms / slice\n", minTimeCUTENSOR * 1000.f);
      printf("%.2f GFLOPS/s\n", flops/1e9/minTimeCUTENSOR);
   }

   HANDLE_ERROR(cutensornetDestroy(handle));
   HANDLE_ERROR(cutensornetDestroyNetworkDescriptor(descNet));
   HANDLE_ERROR(cutensornetDestroyContractionPlan(plan));
   HANDLE_ERROR(cutensornetDestroyContractionOptimizerConfig(optimizerConfig));
   HANDLE_ERROR(cutensornetDestroyContractionOptimizerInfo(optimizerInfo));
   HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));

   if (A) free(A);
   if (B) free(B);
   if (C) free(C);
   if (D) free(D);
   if (rawDataIn_d[0]) cudaFree(rawDataIn_d[0]);
   if (rawDataIn_d[1]) cudaFree(rawDataIn_d[1]);
   if (rawDataIn_d[2]) cudaFree(rawDataIn_d[2]);
   if (D_d) cudaFree(D_d);
   if (work) cudaFree(work);

   printf("Free resources and exit.\n");

   return 0;
}
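As mentioned above, the contraction plan (including any auto-tuning results) can be reused for different input data of the same network. The following is a minimal sketch of such a reuse, assuming hypothetical fresh host buffers A2, B2, and C2 with the same extents as A, B, and C:

   // Hypothetical new host inputs A2, B2, C2 (same extents as A, B, C):
   HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[0], A2, sizeA, cudaMemcpyHostToDevice));
   HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[1], B2, sizeB, cudaMemcpyHostToDevice));
   HANDLE_CUDA_ERROR(cudaMemcpy(rawDataIn_d[2], C2, sizeC, cudaMemcpyHostToDevice));

   // The same plan and workspace descriptor remain valid for the new data:
   for (int64_t sliceId = 0; sliceId < numSlices; ++sliceId)
   {
      HANDLE_ERROR(cutensornetContraction(handle, plan, rawDataIn_d, D_d,
                                          workDesc, sliceId, stream));
   }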
Recall that the full sample code can be found in the NVIDIA/cuQuantum repository (here).
Useful tips¶
For debugging, the environment variable CUTENSORNET_LOG_LEVEL=n can be set. The level n = 0, 1, …, 5 corresponds to the logger level as described and used in cutensornetLoggerSetLevel(). The environment variable CUTENSORNET_LOG_FILE=<filepath> can be used to redirect the log output to a custom file at <filepath> instead of stdout.
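For example, to enable the most verbose logging and write it to a file (the file path below is only an illustration), one could run the sample as follows:

export CUTENSORNET_LOG_LEVEL=5
export CUTENSORNET_LOG_FILE=/tmp/cutensornet.log
./tensornet_example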