The following code example illustrates how to use the cuTensorNet API to perform a tensor QR decomposition. The full code can be found in the NVIDIA/cuQuantum repository.
Define QR decomposition
We first define the QR decomposition to perform, specifying the data type, the mode partition, and the extents.
/**********************************************
* Tensor QR: T_{i,j,m,n} -> Q_{i,x,m} R_{n,x,j}
***********************************************/

typedef float floatType;
cudaDataType_t typeData = CUDA_R_32F;

// Create vector of modes
int32_t sharedMode = 'x';

std::vector<int32_t> modesT{'i','j','m','n'}; // input
std::vector<int32_t> modesQ{'i', sharedMode,'m'};
std::vector<int32_t> modesR{'n', sharedMode,'j'}; // QR output

// Extents
std::unordered_map<int32_t, int64_t> extentMap;
extentMap['i'] = 16;
extentMap['j'] = 16;
extentMap['m'] = 16;
extentMap['n'] = 16;

int64_t rowExtent = computeCombinedExtent(extentMap, modesQ);
int64_t colExtent = computeCombinedExtent(extentMap, modesR);

// cuTensorNet tensor QR operates in reduced mode, expecting k = min(m, n)
extentMap[sharedMode] = rowExtent <= colExtent ? rowExtent : colExtent;

// Create a vector of extents for each tensor
std::vector<int64_t> extentT;
for (auto mode : modesT)
    extentT.push_back(extentMap[mode]);
std::vector<int64_t> extentQ;
for (auto mode : modesQ)
    extentQ.push_back(extentMap[mode]);
std::vector<int64_t> extentR;
for (auto mode : modesR)
    extentR.push_back(extentMap[mode]);
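The helper computeCombinedExtent() used above comes from the utilities of the full sample. A minimal sketch of what it needs to do here, assuming it simply multiplies the extents of all requested modes that are already present in the map (so the not-yet-inserted shared mode 'x' is skipped):

int64_t computeCombinedExtent(const std::unordered_map<int32_t, int64_t>& extentMap,
                              const std::vector<int32_t>& modes)
{
    int64_t combinedExtent{1};
    for (auto mode : modes)
    {
        // Modes not yet present in the map (e.g., the shared mode 'x') contribute nothing.
        auto it = extentMap.find(mode);
        if (it != extentMap.end())
            combinedExtent *= it->second;
    }
    return combinedExtent;
}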
Note

cutensornetTensorQR() operates in reduced mode, expecting the shared extent of the output tensors to be the minimum of the combined row extent and the combined column extent.
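In this example, both combined extents equal 16 × 16 = 256: the row extent from modes 'i' and 'm' of Q, and the column extent from modes 'n' and 'j' of R. The shared mode 'x' is therefore assigned extent min(256, 256) = 256.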
Allocate memory and initialize data
Next, we allocate memory for the input and output tensor operands. The input operand is initialized to random values.
/***********************************
* Allocating data on host and device
************************************/

size_t elementsT = 1;
for (auto mode : modesT)
    elementsT *= extentMap[mode];
size_t elementsQ = 1;
for (auto mode : modesQ)
    elementsQ *= extentMap[mode];
size_t elementsR = 1;
for (auto mode : modesR)
    elementsR *= extentMap[mode];

size_t sizeT = sizeof(floatType) * elementsT;
size_t sizeQ = sizeof(floatType) * elementsQ;
size_t sizeR = sizeof(floatType) * elementsR;

printf("Total memory: %.2f GiB\n", (sizeT + sizeQ + sizeR) / 1024. / 1024. / 1024);

floatType *T = (floatType*) malloc(sizeT);
floatType *Q = (floatType*) malloc(sizeQ);
floatType *R = (floatType*) malloc(sizeR);

if (T == NULL || Q == NULL || R == NULL)
{
    printf("Error: Host allocation of input T or output Q/R failed.\n");
    return -1;
}

void* D_T;
void* D_Q;
void* D_R;

HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeT) );
HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_Q, sizeQ) );
HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_R, sizeR) );

/****************
* Initialize data
*****************/

for (uint64_t i = 0; i < elementsT; i++)
    T[i] = ((floatType) rand()) / RAND_MAX;

HANDLE_CUDA_ERROR( cudaMemcpy(D_T, T, sizeT, cudaMemcpyHostToDevice) );
printf("Allocated memory and initialized data.\n");
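Throughout the example, HANDLE_ERROR and HANDLE_CUDA_ERROR check the cuTensorNet and CUDA return codes; their exact definitions live in the full sample. A minimal sketch, assuming we simply print the error string and bail out of the enclosing function:

#include <cstdio>
#include <cuda_runtime.h>
#include <cutensornet.h>

// Minimal sketches of the error-checking macros used throughout this example;
// the full sample in the repository defines its own (similar) versions.
#define HANDLE_ERROR(x)                                               \
{                                                                     \
    const auto err = x;                                               \
    if (err != CUTENSORNET_STATUS_SUCCESS)                            \
    {                                                                 \
        printf("cuTensorNet error: %s in line %d\n",                  \
               cutensornetGetErrorString(err), __LINE__);             \
        return -1;                                                    \
    }                                                                 \
}

#define HANDLE_CUDA_ERROR(x)                                          \
{                                                                     \
    const auto err = x;                                               \
    if (err != cudaSuccess)                                           \
    {                                                                 \
        printf("CUDA error: %s in line %d\n",                         \
               cudaGetErrorString(err), __LINE__);                    \
        return -1;                                                    \
    }                                                                 \
}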
Initialize cuTensorNet and create tensor descriptors
Then we initialize the cuTensorNet library handle and create a cutensornetTensorDescriptor_t for each of the input and output tensors.
/******************
* cuTensorNet
*******************/

cudaStream_t stream;
HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );

cutensornetHandle_t handle;
HANDLE_ERROR( cutensornetCreate(&handle) );

/***************************
* Create tensor descriptors
****************************/

cutensornetTensorDescriptor_t descTensorIn;
cutensornetTensorDescriptor_t descTensorQ;
cutensornetTensorDescriptor_t descTensorR;

const int32_t numModesIn = modesT.size();
const int32_t numModesQ = modesQ.size();
const int32_t numModesR = modesR.size();

const int64_t* strides = NULL; // assuming Fortran layout for all tensors

HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) );
HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesQ, extentQ.data(), strides, modesQ.data(), typeData, &descTensorQ) );
HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesR, extentR.data(), strides, modesR.data(), typeData, &descTensorR) );

printf("Initialize the cuTensorNet library and create all tensor descriptors.\n");
Query and allocate required workspace
Once all tensor descriptors are created, we can query the required workspace sizes with cutensornetWorkspaceComputeQRSizes(), then allocate the scratch buffers and attach them to the workspace descriptor with cutensornetWorkspaceSetMemory().
/********************************************
* Query and allocate required workspace sizes
*********************************************/

cutensornetWorkspaceDescriptor_t workDesc;
HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
HANDLE_ERROR( cutensornetWorkspaceComputeQRSizes(handle, descTensorIn, descTensorQ, descTensorR, workDesc) );
int64_t hostWorkspaceSize, deviceWorkspaceSize;

// for tensor QR, it does not matter which cutensornetWorksizePref_t we pick
HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                workDesc,
                                                CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
                                                CUTENSORNET_MEMSPACE_DEVICE,
                                                CUTENSORNET_WORKSPACE_SCRATCH,
                                                &deviceWorkspaceSize) );
HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                workDesc,
                                                CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
                                                CUTENSORNET_MEMSPACE_HOST,
                                                CUTENSORNET_WORKSPACE_SCRATCH,
                                                &hostWorkspaceSize) );

void *devWork = nullptr, *hostWork = nullptr;
if (deviceWorkspaceSize > 0) {
    HANDLE_CUDA_ERROR( cudaMalloc(&devWork, deviceWorkspaceSize) );
}
if (hostWorkspaceSize > 0) {
    hostWork = malloc(hostWorkspaceSize);
}
HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                            workDesc,
                                            CUTENSORNET_MEMSPACE_DEVICE,
                                            CUTENSORNET_WORKSPACE_SCRATCH,
                                            devWork,
                                            deviceWorkspaceSize) );
HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                            workDesc,
                                            CUTENSORNET_MEMSPACE_HOST,
                                            CUTENSORNET_WORKSPACE_SCRATCH,
                                            hostWork,
                                            hostWorkspaceSize) );
Execution
At this stage, we can perform the QR decomposition by calling cutensornetTensorQR().
/**********
* Execution
***********/

GPUTimer timer{stream};
double minTimeCUTENSOR = 1e100;
const int numRuns = 3; // to get stable perf results
for (int i = 0; i < numRuns; ++i)
{
    // restore the output
    HANDLE_CUDA_ERROR( cudaMemsetAsync(D_Q, 0, sizeQ, stream) );
    HANDLE_CUDA_ERROR( cudaMemsetAsync(D_R, 0, sizeR, stream) );
    HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );

    timer.start();
    HANDLE_ERROR( cutensornetTensorQR(handle,
                                      descTensorIn, D_T,
                                      descTensorQ, D_Q,
                                      descTensorR, D_R,
                                      workDesc,
                                      stream) );
    // Synchronize and measure timing
    auto time = timer.seconds();
    minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
}

printf("Performing QR\n");

HANDLE_CUDA_ERROR( cudaMemcpyAsync(Q, D_Q, sizeQ, cudaMemcpyDeviceToHost) );
HANDLE_CUDA_ERROR( cudaMemcpyAsync(R, D_R, sizeR, cudaMemcpyDeviceToHost) );

HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );
printf("%.2f ms\n", minTimeCUTENSOR * 1000.f);
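Since Q and R now reside on the host, one can optionally sanity-check the result; this step is not part of the original sample. A minimal sketch that verifies the columns of Q (indexed by the shared mode 'x') are orthonormal, assuming the Fortran (column-major) layout implied by strides = NULL:

// Hypothetical host-side check: for a valid QR, sum_{i,m} Q[i,x1,m] * Q[i,x2,m]
// should equal delta(x1, x2). With column-major layout, element (i, x, m) of Q
// lives at offset i + ei * (x + ex * m). Requires #include <cmath> for std::fabs.
const int64_t ei = extentMap['i'];
const int64_t ex = extentMap[sharedMode];
const int64_t em = extentMap['m'];
bool orthonormal = true;
for (int64_t x1 = 0; x1 < ex && orthonormal; ++x1)
{
    for (int64_t x2 = x1; x2 < ex && orthonormal; ++x2)
    {
        double dot = 0.0;
        for (int64_t m = 0; m < em; ++m)
            for (int64_t i = 0; i < ei; ++i)
                dot += (double) Q[i + ei * (x1 + ex * m)] * (double) Q[i + ei * (x2 + ex * m)];
        const double expected = (x1 == x2) ? 1.0 : 0.0;
        if (std::fabs(dot - expected) > 1e-3)
            orthonormal = false;
    }
}
printf("Q columns orthonormal: %s\n", orthonormal ? "yes" : "no");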
Free resources
After the computation, we need to free up all resources.
/***************
* Free resources
****************/

HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) );
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorQ) );
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorR) );
HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
HANDLE_ERROR( cutensornetDestroy(handle) );

if (T) free(T);
if (Q) free(Q);
if (R) free(R);
if (D_T) cudaFree(D_T);
if (D_Q) cudaFree(D_Q);
if (D_R) cudaFree(D_R);
if (devWork) cudaFree(devWork);
if (hostWork) free(hostWork);

printf("Free resources and exit.\n");

return 0;
}