The following code example illustrates how to integrate cuTensorNet functionalities to perform a tensor QR operation. The full code can be found in the NVIDIA/cuQuantum repository (here).

Define QR decomposition

We first define the tensor QR decomposition to perform, specifying the data type, the mode partition, and the extents.

103   /**********************************************
104   * Tensor QR: T_{i,j,m,n} -> Q_{i,x,m} R_{n,x,j}  
105   ***********************************************/
106
107   typedef float floatType;
108   cudaDataType_t typeData = CUDA_R_32F;
109
110   // Create vector of modes
111   int32_t sharedMode = 'x';
112
113   std::vector<int32_t> modesT{'i','j','m','n'}; // input
114   std::vector<int32_t> modesQ{'i', sharedMode,'m'};
115   std::vector<int32_t> modesR{'n', sharedMode,'j'};  // QR output
116
117   // Extents
118   std::unordered_map<int32_t, int64_t> extentMap;
119   extentMap['i'] = 16;
120   extentMap['j'] = 16;
121   extentMap['m'] = 16;
122   extentMap['n'] = 16;
123
124   int64_t rowExtent = computeCombinedExtent(extentMap, modesQ);
125   int64_t colExtent = computeCombinedExtent(extentMap, modesR);
126   
127   // cuTensorNet tensor QR operates in reduced mode expecting k = min(m, n)
128   extentMap[sharedMode] = rowExtent <= colExtent? rowExtent: colExtent;
129
130   // Create a vector of extents for each tensor
131   std::vector<int64_t> extentT;
132   for (auto mode : modesT)
133      extentT.push_back(extentMap[mode]);
134   std::vector<int64_t> extentQ;
135   for (auto mode : modesQ)
136      extentQ.push_back(extentMap[mode]);
137   std::vector<int64_t> extentR;
138   for (auto mode : modesR)
139      extentR.push_back(extentMap[mode]);

Note

cutensornetTensorQR() operates in reduced mode, expecting the shared extent for the output to be the minimum of the combined row extent and the combined column extent.
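
The helper computeCombinedExtent() used above is defined elsewhere in the full sample. Based on how it is called here, it multiplies the extents of the given modes that are already present in the extent map; a minimal sketch under that assumption (the repository version may differ in details):

   // Sketch (assumption): product of the extents of all modes in `modes` that are
   // already present in extentMap. Modes not yet in the map (here the shared mode
   // 'x', which is inserted only afterwards) are simply skipped.
   int64_t computeCombinedExtent(const std::unordered_map<int32_t, int64_t>& extentMap,
                                 const std::vector<int32_t>& modes)
   {
      int64_t combinedExtent{1};
      for (auto mode : modes)
      {
         auto it = extentMap.find(mode);
         if (it != extentMap.end())
            combinedExtent *= it->second;
      }
      return combinedExtent;
   }

With all four input extents equal to 16, both the combined row extent (modes i and m) and the combined column extent (modes n and j) evaluate to 256, so the shared mode x also receives extent 256.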

Allocate memory and initialize data

Next, we allocate memory for the input and output tensor operands. The input operand is initialized to random values.

142   /***********************************
143   * Allocating data on host and device
144   ************************************/
145
146   size_t elementsT = 1;
147   for (auto mode : modesT)
148      elementsT *= extentMap[mode];
149   size_t elementsQ = 1;
150   for (auto mode : modesQ)
151      elementsQ *= extentMap[mode];
152   size_t elementsR = 1;
153   for (auto mode : modesR)
154      elementsR *= extentMap[mode];
155
156   size_t sizeT = sizeof(floatType) * elementsT;
157   size_t sizeQ = sizeof(floatType) * elementsQ;
158   size_t sizeR = sizeof(floatType) * elementsR;
159
160   printf("Total memory: %.2f GiB\n", (sizeT + sizeQ + sizeR)/1024./1024./1024);
161
162   floatType *T = (floatType*) malloc(sizeT);
163   floatType *Q = (floatType*) malloc(sizeQ);
164   floatType *R = (floatType*) malloc(sizeR);
165
166   if (T == NULL || Q==NULL || R==NULL )
167   {
168      printf("Error: Host allocation of input T or output Q/R.\n");
169      return -1;
170   }
171
172   void* D_T;
173   void* D_Q;
174   void* D_R;
175   
176   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeT) );
177   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_Q, sizeQ) );
178   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_R, sizeR) );
179
180   /****************
181   * Initialize data
182   *****************/
183
184   for (uint64_t i = 0; i < elementsT; i++)
185      T[i] = ((floatType) rand())/RAND_MAX;
186
187   HANDLE_CUDA_ERROR( cudaMemcpy(D_T, T, sizeT, cudaMemcpyHostToDevice) );
188   printf("Allocate memory for data, and initialize data.\n");
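
The HANDLE_CUDA_ERROR and HANDLE_ERROR macros used throughout these snippets are defined near the top of the full sample. A minimal sketch of such status-checking wrappers (the exact messages and behavior in the repository may differ):

   // Sketch (assumption): abort on any CUDA or cuTensorNet error.
   #define HANDLE_CUDA_ERROR(x)                                                \
   {  const auto err = x;                                                      \
      if (err != cudaSuccess)                                                  \
      {  printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); \
         exit(-1);                                                             \
      }                                                                        \
   }

   #define HANDLE_ERROR(x)                                                     \
   {  const auto err = x;                                                      \
      if (err != CUTENSORNET_STATUS_SUCCESS)                                   \
      {  printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
         exit(-1);                                                             \
      }                                                                        \
   }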

Initialize cuTensorNet and create tensor descriptors

Then we initialize the cuTensorNet library handle and create a cutensornetTensorDescriptor_t for each of the input and output tensors.

191   /******************
192   * cuTensorNet
193   *******************/
194
195   cudaStream_t stream;
196   HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );
197
198   cutensornetHandle_t handle;
199   HANDLE_ERROR( cutensornetCreate(&handle) );
200
201   /***************************
202   * Create tensor descriptors
203   ****************************/
204
205   cutensornetTensorDescriptor_t descTensorIn;
206   cutensornetTensorDescriptor_t descTensorQ;
207   cutensornetTensorDescriptor_t descTensorR;
208
209   const int32_t numModesIn = modesT.size();
210   const int32_t numModesQ = modesQ.size();
211   const int32_t numModesR = modesR.size();
212
213   const int64_t* strides = NULL; // assuming fortran layout for all tensors
214
215   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) );
216   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesQ, extentQ.data(), strides, modesQ.data(), typeData, &descTensorQ) );
217   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesR, extentR.data(), strides, modesR.data(), typeData, &descTensorR) );
218    
219   printf("Initialize the cuTensorNet library and create all tensor descriptors.\n");
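
Passing strides = NULL tells cuTensorNet to assume a dense Fortran (column-major) layout, i.e., the first listed mode varies fastest in memory. If a tensor were laid out differently, explicit strides could be supplied instead; as a hypothetical illustration, the element strides equivalent to the default column-major layout for T would be computed as:

   // Hypothetical illustration: explicit element strides matching the dense
   // column-major layout that strides == NULL implies for tensor T.
   std::vector<int64_t> stridesT(extentT.size());
   int64_t currentStride = 1;
   for (size_t k = 0; k < extentT.size(); ++k)
   {
      stridesT[k] = currentStride;   // stride of the k-th mode, in elements
      currentStride *= extentT[k];   // the next mode varies more slowly
   }
   // stridesT.data() could then be passed to cutensornetCreateTensorDescriptor()
   // in place of the NULL strides argument.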

Query and allocate required workspace

Once all tensor descriptors are created, we can query the required workspace sizes with cutensornetWorkspaceComputeQRSizes().

222   /********************************************
223   * Query and allocate required workspace sizes
224   *********************************************/
225
226   cutensornetWorkspaceDescriptor_t workDesc;
227   HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
228   HANDLE_ERROR( cutensornetWorkspaceComputeQRSizes(handle, descTensorIn, descTensorQ, descTensorR, workDesc) );
229   int64_t hostWorkspaceSize, deviceWorkspaceSize;
230
231   // for tensor QR, it does not matter which cutensornetWorksizePref_t we pick
232   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
233                                                   workDesc,
234                                                   CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
235                                                   CUTENSORNET_MEMSPACE_DEVICE,
236                                                   CUTENSORNET_WORKSPACE_SCRATCH,
237                                                   &deviceWorkspaceSize) );
238   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
239                                                   workDesc,
240                                                   CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
241                                                   CUTENSORNET_MEMSPACE_HOST,
242                                                   CUTENSORNET_WORKSPACE_SCRATCH,
243                                                   &hostWorkspaceSize) );
244
245   void *devWork = nullptr, *hostWork = nullptr;
246   if (deviceWorkspaceSize > 0) {
247      HANDLE_CUDA_ERROR( cudaMalloc(&devWork, deviceWorkspaceSize) );
248   }
249   if (hostWorkspaceSize > 0) {
250      hostWork = malloc(hostWorkspaceSize);
251   }
252   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
253                                               workDesc,
254                                               CUTENSORNET_MEMSPACE_DEVICE,
255                                               CUTENSORNET_WORKSPACE_SCRATCH,
256                                               devWork,
257                                               deviceWorkspaceSize) );
258   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
259                                               workDesc,
260                                               CUTENSORNET_MEMSPACE_HOST,
261                                               CUTENSORNET_WORKSPACE_SCRATCH,
262                                               hostWork,
263                                               hostWorkspaceSize) );
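
As the comment above notes, the workspace-size preference does not matter for tensor QR; other preferences can nevertheless be queried in the same way. A small sketch (assumption: identical call, only the preference enum changes) querying the minimal device scratch size:

   // Sketch (assumption): query the minimal rather than the recommended
   // device scratch size; for tensor QR the choice of preference is irrelevant.
   int64_t minDeviceWorkspaceSize{0};
   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_SCRATCH,
                                                   &minDeviceWorkspaceSize) );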

Execution

At this stage, we can perform the QR decomposition by calling cutensornetTensorQR().

266   /**********
267   * Execution
268   ***********/
269  
270   GPUTimer timer{stream};
271   double minTimeCUTENSOR = 1e100;
272   const int numRuns = 3; // to get stable perf results
273   for (int i=0; i < numRuns; ++i)
274   {  
275      // restore output
276      cudaMemsetAsync(D_Q, 0, sizeQ, stream);
277      cudaMemsetAsync(D_R, 0, sizeR, stream);
278      cudaDeviceSynchronize();
279
280      timer.start();
281      HANDLE_ERROR( cutensornetTensorQR(handle, 
282                        descTensorIn, D_T, 
283                        descTensorQ, D_Q, 
284                        descTensorR, D_R, 
285                        workDesc,
286                        stream) );
287      // Synchronize and measure timing
288      auto time = timer.seconds();
289      minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
290   }
291
292   printf("Performing QR\n");
293
294   HANDLE_CUDA_ERROR( cudaMemcpyAsync(Q, D_Q, sizeQ, cudaMemcpyDeviceToHost) );
295   HANDLE_CUDA_ERROR( cudaMemcpyAsync(R, D_R, sizeR, cudaMemcpyDeviceToHost) );
296
297   cudaDeviceSynchronize(); // device synchronization.
298   printf("%.2f ms\n", minTimeCUTENSOR * 1000.f);
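
The GPUTimer helper used above is defined in the full sample. An illustrative sketch consistent with how it is used here (constructed with a stream, start(), and seconds()), based on CUDA events; the repository version may differ:

   // Sketch (assumption): CUDA-event-based timer; seconds() synchronizes on the
   // stop event, so it also acts as the synchronization point after the QR call.
   struct GPUTimer
   {
      GPUTimer(cudaStream_t stream) : stream_(stream)
      {
         cudaEventCreate(&start_);
         cudaEventCreate(&stop_);
      }
      ~GPUTimer()
      {
         cudaEventDestroy(start_);
         cudaEventDestroy(stop_);
      }
      void start() { cudaEventRecord(start_, stream_); }
      double seconds()
      {
         cudaEventRecord(stop_, stream_);
         cudaEventSynchronize(stop_);              // wait for the recorded work
         float ms{0.0f};
         cudaEventElapsedTime(&ms, start_, stop_); // elapsed time in milliseconds
         return ms * 1e-3;
      }
      cudaStream_t stream_;
      cudaEvent_t start_, stop_;
   };

Once Q and R are back on the host, one could optionally verify the decomposition by contracting them over the shared mode x and comparing against T. This check is not part of the sample; it assumes the dense Fortran layout used above (and <cmath> for fabs):

   // Optional sanity check (not in the sample): T_{i,j,m,n} ~= sum_x Q_{i,x,m} * R_{n,x,j}
   const int64_t d  = 16;                    // extent of modes i, j, m, n
   const int64_t dx = extentMap[sharedMode]; // extent of the shared mode x (256 here)
   double maxAbsErr = 0.0;
   for (int64_t n = 0; n < d; ++n)
    for (int64_t m = 0; m < d; ++m)
     for (int64_t j = 0; j < d; ++j)
      for (int64_t i = 0; i < d; ++i)
      {
         double acc = 0.0;
         for (int64_t x = 0; x < dx; ++x)    // column-major offsets: Q{i,x,m}, R{n,x,j}
            acc += (double) Q[i + d*x + d*dx*m] * R[n + d*x + d*dx*j];
         double err = fabs(acc - T[i + d*j + d*d*m + d*d*d*n]); // T{i,j,m,n}
         if (err > maxAbsErr) maxAbsErr = err;
      }
   printf("Max |Q*R - T| = %e\n", maxAbsErr);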

Free resources

After the computation, we need to free up all resources.

301   /***************
302   * Free resources
303   ****************/
304   
305   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) );
306   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorQ) );
307   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorR) );
308   HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
309   HANDLE_ERROR( cutensornetDestroy(handle) );
310
311   if (T) free(T);
312   if (Q) free(Q);
313   if (R) free(R);
314   if (D_T) cudaFree(D_T);
315   if (D_Q) cudaFree(D_Q);
316   if (D_R) cudaFree(D_R);
317   if (devWork) cudaFree(devWork);
318   if (hostWork) free(hostWork);
319
320   printf("Free resource and exit.\n");
321
322   return 0;
323}