The following code example illustrates how to integrate cuTensorNet functionalities to perform a tensor QR operation. The full code can be found in the NVIDIA/cuQuantum repository under the cuTensorNet samples.
Define QR decomposition¶
We first define the QR decomposition to perform with the data type, modes partition, and the extents.
96 /**********************************************
97 * Tensor QR: T_{i,j,m,n} -> Q_{i,x,m} R_{n,x,j}
98 ***********************************************/
99
100 typedef float floatType;
101 cudaDataType_t typeData = CUDA_R_32F;
102
103 // Create vector of modes
// 'x' is the shared (contracted) mode that connects the Q and R outputs.
104 int32_t sharedMode = 'x';
105
106 std::vector<int32_t> modesT{'i','j','m','n'}; // input
107 std::vector<int32_t> modesQ{'i', sharedMode,'m'};
108 std::vector<int32_t> modesR{'n', sharedMode,'j'}; // QR output
109
110 // Extents
111 std::unordered_map<int32_t, int64_t> extentMap;
112 extentMap['i'] = 16;
113 extentMap['j'] = 16;
114 extentMap['m'] = 16;
115 extentMap['n'] = 16;
116
// NOTE(review): computeCombinedExtent is defined outside this excerpt;
// presumably it multiplies the extents of the non-shared modes in the given
// mode list to form the matricized row/column extent — confirm against its
// definition in the full sample.
117 int64_t rowExtent = computeCombinedExtent(extentMap, modesQ);
118 int64_t colExtent = computeCombinedExtent(extentMap, modesR);
119
120 // cuTensorNet tensor QR operates in reduced mode expecting k = min(m, n)
121 extentMap[sharedMode] = rowExtent <= colExtent? rowExtent: colExtent;
122
123 // Create a vector of extents for each tensor
// Extents are listed in the same order as each tensor's mode vector.
124 std::vector<int64_t> extentT;
125 for (auto mode : modesT)
126 extentT.push_back(extentMap[mode]);
127 std::vector<int64_t> extentQ;
128 for (auto mode : modesQ)
129 extentQ.push_back(extentMap[mode]);
130 std::vector<int64_t> extentR;
131 for (auto mode : modesR)
132 extentR.push_back(extentMap[mode]);
Note
cutensornetTensorQR()
operates in reduced mode, expecting the shared extent for the output to be the minimum of
the combined row extent and the combined column extent.
Allocate memory and initialize data¶
Next, we allocate memory for the input and output tensor operands. The input operand is initialized to random values.
135 /***********************************
136 * Allocating data on host and device
137 ************************************/
138
// Element counts are the products of each tensor's mode extents.
139 size_t elementsT = 1;
140 for (auto mode : modesT)
141 elementsT *= extentMap[mode];
142 size_t elementsQ = 1;
143 for (auto mode : modesQ)
144 elementsQ *= extentMap[mode];
145 size_t elementsR = 1;
146 for (auto mode : modesR)
147 elementsR *= extentMap[mode];
148
149 size_t sizeT = sizeof(floatType) * elementsT;
150 size_t sizeQ = sizeof(floatType) * elementsQ;
151 size_t sizeR = sizeof(floatType) * elementsR;
152
// The first two divisors are doubles, so the whole expression is evaluated
// in double precision (bytes -> GiB).
153 printf("Total memory: %.2f GiB\n", (sizeT + sizeQ + sizeR)/1024./1024./1024);
154
155 floatType *T = (floatType*) malloc(sizeT);
156 floatType *Q = (floatType*) malloc(sizeQ);
157 floatType *R = (floatType*) malloc(sizeR);
158
159 if (T == NULL || Q==NULL || R==NULL )
160 {
161 printf("Error: Host allocation of input T or output Q/R.\n");
162 return -1;
163 }
164
165 void* D_T;
166 void* D_Q;
167 void* D_R;
168
169 HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeT) );
170 HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_Q, sizeQ) );
171 HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_R, sizeR) );
172
173 /****************
174 * Initialize data
175 *****************/
176
// NOTE(review): rand() is never seeded in this excerpt, so the input tensor
// is identical on every run — acceptable (arguably desirable) for an example.
177 for (uint64_t i = 0; i < elementsT; i++)
178 T[i] = ((floatType) rand())/RAND_MAX;
179
// Blocking host-to-device copy of the input tensor.
180 HANDLE_CUDA_ERROR( cudaMemcpy(D_T, T, sizeT, cudaMemcpyHostToDevice) );
181 printf("Allocate memory for data, and initialize data.\n");
Initialize cuTensorNet and create tensor descriptors¶
Then we initialize the library handle of cuTensorNet and create a cutensornetTensorDescriptor_t
for each of the input and output tensors.
184 /******************
185 * cuTensorNet
186 *******************/
187
188 cudaStream_t stream;
189 HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );
190
191 cutensornetHandle_t handle;
192 HANDLE_ERROR( cutensornetCreate(&handle) );
193
194 /***************************
195 * Create tensor descriptors
196 ****************************/
197
// Descriptors capture only metadata (modes, extents, strides, data type);
// the actual device data pointers are supplied later at the
// cutensornetTensorQR() call.
198 cutensornetTensorDescriptor_t descTensorIn;
199 cutensornetTensorDescriptor_t descTensorQ;
200 cutensornetTensorDescriptor_t descTensorR;
201
202 const int32_t numModesIn = modesT.size();
203 const int32_t numModesQ = modesQ.size();
204 const int32_t numModesR = modesR.size();
205
206 const int64_t* strides = NULL; // assuming fortran layout for all tensors
207
208 HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) );
209 HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesQ, extentQ.data(), strides, modesQ.data(), typeData, &descTensorQ) );
210 HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesR, extentR.data(), strides, modesR.data(), typeData, &descTensorR) );
211
212 printf("Initialize the cuTensorNet library and create all tensor descriptors.\n");
Query and allocate required workspace¶
Once all tensor descriptors are created, we can query the required workspace size with cutensornetWorkspaceComputeQRSizes().
215 /********************************************
216 * Query and allocate required workspace sizes
217 *********************************************/
218
219 cutensornetWorkspaceDescriptor_t workDesc;
220 HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
221 HANDLE_ERROR( cutensornetWorkspaceComputeQRSizes(handle, descTensorIn, descTensorQ, descTensorR, workDesc) );
222 int64_t hostWorkspaceSize, deviceWorkspaceSize;
223
224 // for tensor QR, it does not matter which cutensornetWorksizePref_t we pick
225 HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
226 workDesc,
227 CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
228 CUTENSORNET_MEMSPACE_DEVICE,
229 CUTENSORNET_WORKSPACE_SCRATCH,
230 &deviceWorkspaceSize) );
231 HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
232 workDesc,
233 CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
234 CUTENSORNET_MEMSPACE_HOST,
235 CUTENSORNET_WORKSPACE_SCRATCH,
236 &hostWorkspaceSize) );
237
// Either size may be zero, in which case the corresponding pointer stays
// nullptr and is registered below with size 0.
238 void *devWork = nullptr, *hostWork = nullptr;
239 if (deviceWorkspaceSize > 0) {
240 HANDLE_CUDA_ERROR( cudaMalloc(&devWork, deviceWorkspaceSize) );
241 }
242 if (hostWorkspaceSize > 0) {
243 hostWork = malloc(hostWorkspaceSize);
244 }
// NOTE(review): a failed host malloc here (hostWork == NULL with
// hostWorkspaceSize > 0) is not checked before being handed to the library —
// worth confirming whether cutensornetWorkspaceSetMemory rejects it.
245 HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
246 workDesc,
247 CUTENSORNET_MEMSPACE_DEVICE,
248 CUTENSORNET_WORKSPACE_SCRATCH,
249 devWork,
250 deviceWorkspaceSize) );
251 HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
252 workDesc,
253 CUTENSORNET_MEMSPACE_HOST,
254 CUTENSORNET_WORKSPACE_SCRATCH,
255 hostWork,
256 hostWorkspaceSize) );
Execution¶
At this stage, we can perform the QR decomposition by calling cutensornetTensorQR().
259 /**********
260 * Execution
261 ***********/
262
263 GPUTimer timer{stream};
264 double minTimeCUTENSOR = 1e100;
265 const int numRuns = 3; // to get stable perf results
266 for (int i=0; i < numRuns; ++i)
267 {
268 // restore output
// NOTE(review): the return codes of these two cudaMemsetAsync calls and of
// cudaDeviceSynchronize are not checked, unlike the other CUDA calls in this
// example which go through HANDLE_CUDA_ERROR.
269 cudaMemsetAsync(D_Q, 0, sizeQ, stream);
270 cudaMemsetAsync(D_R, 0, sizeR, stream);
271 cudaDeviceSynchronize();
272
273 timer.start();
274 HANDLE_ERROR( cutensornetTensorQR(handle,
275 descTensorIn, D_T,
276 descTensorQ, D_Q,
277 descTensorR, D_R,
278 workDesc,
279 stream) );
280 // Synchronize and measure timing
// (GPUTimer::seconds presumably synchronizes on the stream before reading
// the elapsed time — its definition is outside this excerpt.)
281 auto time = timer.seconds();
282 minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
283 }
284
285 printf("Performing QR\n");
286
// NOTE(review): these cudaMemcpyAsync calls omit the stream argument, so they
// run on the default stream rather than `stream`; legacy default-stream
// semantics make them synchronize with other streams, but verify this is
// intentional before the cudaDeviceSynchronize below.
287 HANDLE_CUDA_ERROR( cudaMemcpyAsync(Q, D_Q, sizeQ, cudaMemcpyDeviceToHost) );
288 HANDLE_CUDA_ERROR( cudaMemcpyAsync(R, D_R, sizeR, cudaMemcpyDeviceToHost) );
289
290 cudaDeviceSynchronize(); // device synchronization.
291 printf("%.2f ms\n", minTimeCUTENSOR * 1000.f);
Free resources¶
After the computation, we need to free up all resources.
294 /***************
295 * Free resources
296 ****************/
297
// Library objects are destroyed before the handle, and the handle before the
// raw allocations are released.
298 HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) );
299 HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorQ) );
300 HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorR) );
301 HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
302 HANDLE_ERROR( cutensornetDestroy(handle) );
303
// The device pointers are non-NULL at this point because every cudaMalloc
// above was error-checked; the guards are harmless belt-and-braces.
304 if (T) free(T);
305 if (Q) free(Q);
306 if (R) free(R);
307 if (D_T) cudaFree(D_T);
308 if (D_Q) cudaFree(D_Q);
309 if (D_R) cudaFree(D_R);
310 if (devWork) cudaFree(devWork);
311 if (hostWork) free(hostWork);
312
313 printf("Free resource and exit.\n");
314
// Closing brace ends the enclosing function (its opening is outside this
// excerpt).
315 return 0;
316}