The following code example illustrates how to use cuTensorNet to perform a tensor QR operation. The full code can be found in the NVIDIA/cuQuantum repository.
Define QR decomposition
We first define the QR decomposition to perform, specifying the data type, the mode partition, and the extents.
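In einsum notation, the operation below factorizes the rank-4 input tensor into two rank-3 tensors that share the newly introduced mode x:

$$T_{i,j,m,n} = \sum_{x} Q_{i,x,m} \, R_{n,x,j}$$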
/**********************************************
* Tensor QR: T_{i,j,m,n} -> Q_{i,x,m} R_{n,x,j}
***********************************************/

typedef float floatType;
cudaDataType_t typeData = CUDA_R_32F;

// Create vector of modes
int32_t sharedMode = 'x';

std::vector<int32_t> modesT{'i','j','m','n'}; // input
std::vector<int32_t> modesQ{'i', sharedMode, 'm'};
std::vector<int32_t> modesR{'n', sharedMode, 'j'}; // QR output

// Extents
std::unordered_map<int32_t, int64_t> extentMap;
extentMap['i'] = 16;
extentMap['j'] = 16;
extentMap['m'] = 16;
extentMap['n'] = 16;

int64_t rowExtent = computeCombinedExtent(extentMap, modesQ);
int64_t colExtent = computeCombinedExtent(extentMap, modesR);

// cuTensorNet tensor QR operates in reduced mode, expecting k = min(m, n)
extentMap[sharedMode] = rowExtent <= colExtent ? rowExtent : colExtent;

// Create a vector of extents for each tensor
std::vector<int64_t> extentT;
for (auto mode : modesT)
    extentT.push_back(extentMap[mode]);
std::vector<int64_t> extentQ;
for (auto mode : modesQ)
    extentQ.push_back(extentMap[mode]);
std::vector<int64_t> extentR;
for (auto mode : modesR)
    extentR.push_back(extentMap[mode]);
Note

cutensornetTensorQR() operates in reduced mode, expecting the shared extent of the output tensors to be the minimum of the combined row extent and the combined column extent.
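The helper computeCombinedExtent used above belongs to the sample's supporting code rather than the cuTensorNet API. A minimal sketch, assuming it multiplies the extents of all modes in the vector that are already present in the map (so the not-yet-inserted shared mode is skipped):

int64_t computeCombinedExtent(const std::unordered_map<int32_t, int64_t>& extentMap,
                              const std::vector<int32_t>& modes)
{
    int64_t combinedExtent{1};
    for (auto mode : modes)
    {
        // Skip modes with no recorded extent (e.g., the shared mode 'x'
        // before it is inserted into the map).
        auto it = extentMap.find(mode);
        if (it != extentMap.end())
            combinedExtent *= it->second;
    }
    return combinedExtent;
}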
Allocate memory and initialize data
Next, we allocate memory for the input and output tensor operands. The input operand is initialized to random values.
/***********************************
* Allocating data on host and device
************************************/

size_t elementsT = 1;
for (auto mode : modesT)
    elementsT *= extentMap[mode];
size_t elementsQ = 1;
for (auto mode : modesQ)
    elementsQ *= extentMap[mode];
size_t elementsR = 1;
for (auto mode : modesR)
    elementsR *= extentMap[mode];

size_t sizeT = sizeof(floatType) * elementsT;
size_t sizeQ = sizeof(floatType) * elementsQ;
size_t sizeR = sizeof(floatType) * elementsR;

printf("Total memory: %.2f GiB\n", (sizeT + sizeQ + sizeR) / 1024. / 1024. / 1024.);

floatType *T = (floatType*) malloc(sizeT);
floatType *Q = (floatType*) malloc(sizeQ);
floatType *R = (floatType*) malloc(sizeR);

if (T == NULL || Q == NULL || R == NULL)
{
    printf("Error: Host allocation of input T or output Q/R failed.\n");
    return -1;
}

void* D_T;
void* D_Q;
void* D_R;

HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeT) );
HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_Q, sizeQ) );
HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_R, sizeR) );

/****************
* Initialize data
*****************/

for (uint64_t i = 0; i < elementsT; i++)
    T[i] = ((floatType) rand()) / RAND_MAX;

HANDLE_CUDA_ERROR( cudaMemcpy(D_T, T, sizeT, cudaMemcpyHostToDevice) );
printf("Allocate memory for data and initialize data.\n");
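HANDLE_ERROR and HANDLE_CUDA_ERROR are convenience macros defined in the sample's source, not part of the library. A minimal sketch of what they may look like, assuming they print the error string and bail out:

#define HANDLE_ERROR(x)                                                \
{                                                                      \
    const auto err = x;                                                \
    if (err != CUTENSORNET_STATUS_SUCCESS)                             \
    {                                                                  \
        printf("cuTensorNet error: %s in line %d\n",                   \
               cutensornetGetErrorString(err), __LINE__);              \
        return -1;                                                     \
    }                                                                  \
}

#define HANDLE_CUDA_ERROR(x)                                           \
{                                                                      \
    const auto err = x;                                                \
    if (err != cudaSuccess)                                            \
    {                                                                  \
        printf("CUDA error: %s in line %d\n",                          \
               cudaGetErrorString(err), __LINE__);                     \
        return -1;                                                     \
    }                                                                  \
}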
Initialize cuTensorNet and create tensor descriptors
Then we initialize the cuTensorNet library handle and create a cutensornetTensorDescriptor_t for each of the input and output tensors.
/******************
* cuTensorNet
*******************/

cudaStream_t stream;
HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );

cutensornetHandle_t handle;
HANDLE_ERROR( cutensornetCreate(&handle) );

/***************************
* Create tensor descriptors
****************************/

cutensornetTensorDescriptor_t descTensorIn;
cutensornetTensorDescriptor_t descTensorQ;
cutensornetTensorDescriptor_t descTensorR;

const int32_t numModesIn = modesT.size();
const int32_t numModesQ = modesQ.size();
const int32_t numModesR = modesR.size();

const int64_t* strides = NULL; // assuming Fortran (column-major) layout for all tensors

HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) );
HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesQ, extentQ.data(), strides, modesQ.data(), typeData, &descTensorQ) );
HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesR, extentR.data(), strides, modesR.data(), typeData, &descTensorR) );

printf("Initialize the cuTensorNet library and create all tensor descriptors.\n");
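Passing strides = NULL tells cuTensorNet to assume a densely packed column-major (Fortran) layout. If an operand uses a different layout, explicit strides can be supplied instead; for reference, a sketch of the packed Fortran strides that NULL implies (an illustrative helper, not part of the sample):

std::vector<int64_t> packedFortranStrides(const std::vector<int64_t>& extents)
{
    std::vector<int64_t> strides(extents.size());
    int64_t stride = 1;
    for (size_t i = 0; i < extents.size(); ++i)
    {
        strides[i] = stride;   // the first mode is the fastest-varying one
        stride *= extents[i];
    }
    return strides;
}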
Query and allocate required workspace
Once all tensor descriptors are created, we can query the required workspace sizes with cutensornetWorkspaceComputeQRSizes().
/********************************************
* Query and allocate required workspace sizes
*********************************************/

cutensornetWorkspaceDescriptor_t workDesc;
HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
HANDLE_ERROR( cutensornetWorkspaceComputeQRSizes(handle, descTensorIn, descTensorQ, descTensorR, workDesc) );
int64_t hostWorkspaceSize, deviceWorkspaceSize;

// for tensor QR, it does not matter which cutensornetWorksizePref_t we pick
HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                workDesc,
                                                CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
                                                CUTENSORNET_MEMSPACE_DEVICE,
                                                CUTENSORNET_WORKSPACE_SCRATCH,
                                                &deviceWorkspaceSize) );
HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                workDesc,
                                                CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
                                                CUTENSORNET_MEMSPACE_HOST,
                                                CUTENSORNET_WORKSPACE_SCRATCH,
                                                &hostWorkspaceSize) );

void *devWork = nullptr, *hostWork = nullptr;
if (deviceWorkspaceSize > 0) {
    HANDLE_CUDA_ERROR( cudaMalloc(&devWork, deviceWorkspaceSize) );
}
if (hostWorkspaceSize > 0) {
    hostWork = malloc(hostWorkspaceSize);
}
HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                            workDesc,
                                            CUTENSORNET_MEMSPACE_DEVICE,
                                            CUTENSORNET_WORKSPACE_SCRATCH,
                                            devWork,
                                            deviceWorkspaceSize) );
HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                            workDesc,
                                            CUTENSORNET_MEMSPACE_HOST,
                                            CUTENSORNET_WORKSPACE_SCRATCH,
                                            hostWork,
                                            hostWorkspaceSize) );
Execution
At this stage, we can perform the QR decomposition by calling cutensornetTensorQR().
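The GPUTimer used below is another helper from the sample's supporting code. A minimal sketch based on CUDA events, assuming start() records on the given stream and seconds() synchronizes before reading the elapsed time:

struct GPUTimer
{
    explicit GPUTimer(cudaStream_t stream) : stream_(stream)
    {
        cudaEventCreate(&start_);
        cudaEventCreate(&stop_);
    }
    ~GPUTimer()
    {
        cudaEventDestroy(start_);
        cudaEventDestroy(stop_);
    }
    void start() { cudaEventRecord(start_, stream_); }
    double seconds()
    {
        cudaEventRecord(stop_, stream_);
        cudaEventSynchronize(stop_);   // wait for the timed work to finish
        float ms;
        cudaEventElapsedTime(&ms, start_, stop_);
        return ms * 1e-3;
    }
private:
    cudaStream_t stream_;
    cudaEvent_t start_, stop_;
};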
/**********
* Execution
***********/

GPUTimer timer{stream};
double minTimeCUTENSOR = 1e100;
const int numRuns = 3; // to get stable perf results
for (int i = 0; i < numRuns; ++i)
{
    // restore the output operands
    cudaMemsetAsync(D_Q, 0, sizeQ, stream);
    cudaMemsetAsync(D_R, 0, sizeR, stream);
    cudaDeviceSynchronize();

    timer.start();
    HANDLE_ERROR( cutensornetTensorQR(handle,
                                      descTensorIn, D_T,
                                      descTensorQ, D_Q,
                                      descTensorR, D_R,
                                      workDesc,
                                      stream) );
    // Synchronize and measure timing
    auto time = timer.seconds();
    minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
}

printf("Performing QR\n");

HANDLE_CUDA_ERROR( cudaMemcpyAsync(Q, D_Q, sizeQ, cudaMemcpyDeviceToHost) );
HANDLE_CUDA_ERROR( cudaMemcpyAsync(R, D_R, sizeR, cudaMemcpyDeviceToHost) );

cudaDeviceSynchronize(); // wait for the asynchronous copies to finish
printf("%.2f ms\n", minTimeCUTENSOR * 1000.f);
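The sample does not verify the result, but with the host copies of Q and R in hand one can check the einsum relation above directly. A sketch of such a check (not part of the sample), assuming the packed column-major layout used throughout and <cmath> for std::abs:

const int64_t ei = extentMap['i'], ej = extentMap['j'];
const int64_t em = extentMap['m'], en = extentMap['n'];
const int64_t ex = extentMap[sharedMode];
double maxErr = 0.0;
for (int64_t n = 0; n < en; ++n)
    for (int64_t m = 0; m < em; ++m)
        for (int64_t j = 0; j < ej; ++j)
            for (int64_t i = 0; i < ei; ++i)
            {
                // accumulate sum over the shared mode x: Q_{i,x,m} * R_{n,x,j}
                double acc = 0.0;
                for (int64_t x = 0; x < ex; ++x)
                    acc += (double) Q[i + ei * (x + ex * m)] * R[n + en * (x + ex * j)];
                double err = std::abs(acc - T[i + ei * (j + ej * (m + em * n))]);
                maxErr = (err > maxErr) ? err : maxErr;
            }
printf("Max |sum_x Q*R - T| = %.3e\n", maxErr);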
Free resources
After the computation, we need to free up all resources.
/***************
* Free resources
****************/

HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) );
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorQ) );
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorR) );
HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
HANDLE_ERROR( cutensornetDestroy(handle) );

if (T) free(T);
if (Q) free(Q);
if (R) free(R);
if (D_T) cudaFree(D_T);
if (D_Q) cudaFree(D_Q);
if (D_R) cudaFree(D_R);
if (devWork) cudaFree(devWork);
if (hostWork) free(hostWork);

printf("Free resources and exit.\n");

return 0;
}
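To build the example, compile with nvcc and link against libcutensornet; an illustrative command (the source file name and paths are placeholders, adjust to your installation):

nvcc tensor_qr_example.cu -I${CUQUANTUM_ROOT}/include -L${CUQUANTUM_ROOT}/lib -lcutensornet -o tensor_qr_example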