The following code example illustrates how to integrate cuTensorNet functionalities to perform a tensor QR operation. The full code can be found in the NVIDIA/cuQuantum repository (here).

Define QR decomposition

We first define the tensor QR decomposition to perform by specifying the data type, the mode partition, and the extents of each tensor.

 97   /**********************************************
 98   * Tensor QR: T_{i,j,m,n} -> Q_{i,x,m} R_{n,x,j}  
 99   ***********************************************/
100
101   typedef float floatType;
102   cudaDataType_t typeData = CUDA_R_32F;
103
104   // Create vector of modes
105   int32_t sharedMode = 'x';
106
107   std::vector<int32_t> modesT{'i','j','m','n'}; // input
108   std::vector<int32_t> modesQ{'i', sharedMode,'m'};
109   std::vector<int32_t> modesR{'n', sharedMode,'j'};  // QR output
110
111   // Extents
112   std::unordered_map<int32_t, int64_t> extentMap;
113   extentMap['i'] = 16;
114   extentMap['j'] = 16;
115   extentMap['m'] = 16;
116   extentMap['n'] = 16;
117
118   int64_t rowExtent = computeCombinedExtent(extentMap, modesQ);
119   int64_t colExtent = computeCombinedExtent(extentMap, modesR);
120   
121   // cuTensorNet tensor QR operates in reduced mode expecting k = min(m, n)
122   extentMap[sharedMode] = rowExtent <= colExtent? rowExtent: colExtent;
123
124   // Create a vector of extents for each tensor
125   std::vector<int64_t> extentT;
126   for (auto mode : modesT)
127      extentT.push_back(extentMap[mode]);
128   std::vector<int64_t> extentQ;
129   for (auto mode : modesQ)
130      extentQ.push_back(extentMap[mode]);
131   std::vector<int64_t> extentR;
132   for (auto mode : modesR)
133      extentR.push_back(extentMap[mode]);

Note

cutensornetTensorQR() operates in reduced mode, expecting the shared extent of the outputs to be the minimum of the combined row extent and the combined column extent. In this example, both combined extents are 16 x 16 = 256, so the shared mode x has extent 256.
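The computeCombinedExtent() helper used above is defined in the full sample; it multiplies the extents of all modes of a tensor that already have an entry in the extent map, skipping modes (such as the shared mode x) that have not been assigned yet. A minimal sketch consistent with that usage:

   int64_t computeCombinedExtent(const std::unordered_map<int32_t, int64_t>& extentMap,
                                 const std::vector<int32_t>& modes)
   {
      // Multiply the extents of all modes already present in the map;
      // the shared mode 'x' is skipped because it has no entry yet.
      int64_t combinedExtent{1};
      for (auto mode : modes)
      {
         auto it = extentMap.find(mode);
         if (it != extentMap.end())
            combinedExtent *= it->second;
      }
      return combinedExtent;
   }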

Allocate memory and initialize data

Next, we allocate host and device memory for the input and output tensor operands. The input operand is initialized to random values on the host and copied to the device.

136   /***********************************
137   * Allocating data on host and device
138   ************************************/
139
140   size_t elementsT = 1;
141   for (auto mode : modesT)
142      elementsT *= extentMap[mode];
143   size_t elementsQ = 1;
144   for (auto mode : modesQ)
145      elementsQ *= extentMap[mode];
146   size_t elementsR = 1;
147   for (auto mode : modesR)
148      elementsR *= extentMap[mode];
149
150   size_t sizeT = sizeof(floatType) * elementsT;
151   size_t sizeQ = sizeof(floatType) * elementsQ;
152   size_t sizeR = sizeof(floatType) * elementsR;
153
154   printf("Total memory: %.2f GiB\n", (sizeT + sizeQ + sizeR)/1024./1024./1024);
155
156   floatType *T = (floatType*) malloc(sizeT);
157   floatType *Q = (floatType*) malloc(sizeQ);
158   floatType *R = (floatType*) malloc(sizeR);
159
160   if (T == NULL || Q==NULL || R==NULL )
161   {
162      printf("Error: Host allocation of input T or output Q/R.\n");
163      return -1;
164   }
165
166   void* D_T;
167   void* D_Q;
168   void* D_R;
169   
170   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeT) );
171   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_Q, sizeQ) );
172   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_R, sizeR) );
173
174   /****************
175   * Initialize data
176   *****************/
177
178   for (uint64_t i = 0; i < elementsT; i++)
179      T[i] = ((floatType) rand())/RAND_MAX;
180
181   HANDLE_CUDA_ERROR( cudaMemcpy(D_T, T, sizeT, cudaMemcpyHostToDevice) );
182   printf("Allocate memory for data, and initialize data.\n");
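The HANDLE_CUDA_ERROR and HANDLE_ERROR macros used throughout this example check the status codes returned by CUDA and cuTensorNet calls; the full sample provides its own definitions. A minimal sketch of such macros (the exact definitions may differ):

   #define HANDLE_CUDA_ERROR(x)                                          \
   {  const auto err = x;                                                \
      if (err != cudaSuccess)                                            \
      {                                                                  \
         printf("CUDA error: %s in line %d\n",                          \
                cudaGetErrorString(err), __LINE__);                      \
         return -1;                                                      \
      }                                                                  \
   };

   #define HANDLE_ERROR(x)                                               \
   {  const auto err = x;                                                \
      if (err != CUTENSORNET_STATUS_SUCCESS)                             \
      {                                                                  \
         printf("cuTensorNet error: %s in line %d\n",                    \
                cutensornetGetErrorString(err), __LINE__);               \
         return -1;                                                      \
      }                                                                  \
   };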

Initialize cuTensorNet and create tensor descriptors

Then we initialize the cuTensorNet library handle and create a cutensornetTensorDescriptor_t for the input tensor and each of the output tensors.

185   /******************
186   * cuTensorNet
187   *******************/
188
189   cudaStream_t stream;
190   HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );
191
192   cutensornetHandle_t handle;
193   HANDLE_ERROR( cutensornetCreate(&handle) );
194
195   /***************************
196   * Create tensor descriptors
197   ****************************/
198
199   cutensornetTensorDescriptor_t descTensorIn;
200   cutensornetTensorDescriptor_t descTensorQ;
201   cutensornetTensorDescriptor_t descTensorR;
202
203   const int32_t numModesIn = modesT.size();
204   const int32_t numModesQ = modesQ.size();
205   const int32_t numModesR = modesR.size();
206
207   const int64_t* strides = NULL; // NULL strides imply the default Fortran (column-major) layout for all tensors
208
209   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) );
210   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesQ, extentQ.data(), strides, modesQ.data(), typeData, &descTensorQ) );
211   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesR, extentR.data(), strides, modesR.data(), typeData, &descTensorR) );
212    
213   printf("Initialize the cuTensorNet library and create all tensor descriptors.\n");

Query and allocate required workspace

Once all tensor descriptors are created, we can query the required host and device workspace sizes with cutensornetWorkspaceComputeQRSizes(), allocate the corresponding buffers, and pass them to the workspace descriptor with cutensornetWorkspaceSetMemory().

216   /********************************************
217   * Query and allocate required workspace sizes
218   *********************************************/
219
220   cutensornetWorkspaceDescriptor_t workDesc;
221   HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
222   HANDLE_ERROR( cutensornetWorkspaceComputeQRSizes(handle, descTensorIn, descTensorQ, descTensorR, workDesc) );
223   int64_t hostWorkspaceSize, deviceWorkspaceSize;
224
225   // for tensor QR, it does not matter which cutensornetWorksizePref_t we pick
226   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
227                                                   workDesc,
228                                                   CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
229                                                   CUTENSORNET_MEMSPACE_DEVICE,
230                                                   CUTENSORNET_WORKSPACE_SCRATCH,
231                                                   &deviceWorkspaceSize) );
232   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
233                                                   workDesc,
234                                                   CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
235                                                   CUTENSORNET_MEMSPACE_HOST,
236                                                   CUTENSORNET_WORKSPACE_SCRATCH,
237                                                   &hostWorkspaceSize) );
238
239   void *devWork = nullptr, *hostWork = nullptr;
240   if (deviceWorkspaceSize > 0) {
241      HANDLE_CUDA_ERROR( cudaMalloc(&devWork, deviceWorkspaceSize) );
242   }
243   if (hostWorkspaceSize > 0) {
244      hostWork = malloc(hostWorkspaceSize);
245   }
246   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
247                                               workDesc,
248                                               CUTENSORNET_MEMSPACE_DEVICE,
249                                               CUTENSORNET_WORKSPACE_SCRATCH,
250                                               devWork,
251                                               deviceWorkspaceSize) );
252   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
253                                               workDesc,
254                                               CUTENSORNET_MEMSPACE_HOST,
255                                               CUTENSORNET_WORKSPACE_SCRATCH,
256                                               hostWork,
257                                               hostWorkspaceSize) );

Execution

At this stage, we can perform the QR decomposition by calling cutensornetTensorQR(). The decomposition is executed several times to obtain stable timing results.

260   /**********
261   * Execution
262   ***********/
263  
264   GPUTimer timer{stream};
265   double minTimeCUTENSOR = 1e100;
266   const int numRuns = 3; // to get stable perf results
267   for (int i=0; i < numRuns; ++i)
268   {  
269      // restore output
270      cudaMemsetAsync(D_Q, 0, sizeQ, stream);
271      cudaMemsetAsync(D_R, 0, sizeR, stream);
272      cudaDeviceSynchronize();
273
274      timer.start();
275      HANDLE_ERROR( cutensornetTensorQR(handle, 
276                        descTensorIn, D_T, 
277                        descTensorQ, D_Q, 
278                        descTensorR, D_R, 
279                        workDesc,
280                        stream) );
281      // Synchronize and measure timing
282      auto time = timer.seconds();
283      minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
284   }
285
286   printf("Performing QR\n");
287
288   HANDLE_CUDA_ERROR( cudaMemcpyAsync(Q, D_Q, sizeQ, cudaMemcpyDeviceToHost) );
289   HANDLE_CUDA_ERROR( cudaMemcpyAsync(R, D_R, sizeR, cudaMemcpyDeviceToHost) );
290
291   cudaDeviceSynchronize(); // device synchronization.
292   printf("%.2f ms\n", minTimeCUTENSOR * 1000.f);
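The sample itself ends here, but an optional host-side sanity check can confirm that contracting Q and R over the shared mode x reproduces T. The sketch below is illustrative, not part of the sample; it assumes the default Fortran (column-major) layout implied by the NULL strides and the extents defined earlier:

   // Optional check (not part of the sample): reconstruct
   // T_{i,j,m,n} = sum_x Q_{i,x,m} * R_{n,x,j} and report the largest deviation.
   const int64_t di = extentMap['i'], dj = extentMap['j'];
   const int64_t dm = extentMap['m'], dn = extentMap['n'];
   const int64_t dx = extentMap[sharedMode];
   double maxErr = 0.0;
   for (int64_t n = 0; n < dn; ++n)
      for (int64_t m = 0; m < dm; ++m)
         for (int64_t j = 0; j < dj; ++j)
            for (int64_t i = 0; i < di; ++i)
            {
               double acc = 0.0;
               for (int64_t x = 0; x < dx; ++x)
                  acc += (double) Q[i + di * (x + dx * m)]
                       * (double) R[n + dn * (x + dx * j)];
               double diff = acc - (double) T[i + di * (j + dj * (m + dm * n))];
               if (diff < 0.0) diff = -diff;
               if (diff > maxErr) maxErr = diff;
            }
   printf("Max |Q*R - T| = %.3e\n", maxErr);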

Free resources

After the computation, we need to free up all resources.

295   /***************
296   * Free resources
297   ****************/
298   
299   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) );
300   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorQ) );
301   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorR) );
302   HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
303   HANDLE_ERROR( cutensornetDestroy(handle) );
304
305   if (T) free(T);
306   if (Q) free(Q);
307   if (R) free(R);
308   if (D_T) cudaFree(D_T);
309   if (D_Q) cudaFree(D_Q);
310   if (D_R) cudaFree(D_R);
311   if (devWork) cudaFree(devWork);
312   if (hostWork) free(hostWork);
313
314   printf("Free resource and exit.\n");
315
316   return 0;
317}