The following code example illustrates how to integrate cuTensorNet functionalities to perform a tensor QR operation. The full code can be found in the NVIDIA/cuQuantum repository.

Define QR decomposition

We first define the tensor QR decomposition to perform by specifying the data type, the mode partition, and the extents of each mode.

 96   /**********************************************
 97   * Tensor QR: T_{i,j,m,n} -> Q_{i,x,m} R_{n,x,j}  
 98   ***********************************************/
 99
100   typedef float floatType;
101   cudaDataType_t typeData = CUDA_R_32F;
102
103   // Create vector of modes
104   int32_t sharedMode = 'x';
105
106   std::vector<int32_t> modesT{'i','j','m','n'}; // input
107   std::vector<int32_t> modesQ{'i', sharedMode,'m'};
108   std::vector<int32_t> modesR{'n', sharedMode,'j'};  // QR output
109
110   // Extents
111   std::unordered_map<int32_t, int64_t> extentMap;
112   extentMap['i'] = 16;
113   extentMap['j'] = 16;
114   extentMap['m'] = 16;
115   extentMap['n'] = 16;
116
117   int64_t rowExtent = computeCombinedExtent(extentMap, modesQ);
118   int64_t colExtent = computeCombinedExtent(extentMap, modesR);
119   
120   // cuTensorNet tensor QR operates in reduced mode, expecting the shared extent to be min(rowExtent, colExtent)
121   extentMap[sharedMode] = rowExtent <= colExtent ? rowExtent : colExtent;
122
123   // Create a vector of extents for each tensor
124   std::vector<int64_t> extentT;
125   for (auto mode : modesT)
126      extentT.push_back(extentMap[mode]);
127   std::vector<int64_t> extentQ;
128   for (auto mode : modesQ)
129      extentQ.push_back(extentMap[mode]);
130   std::vector<int64_t> extentR;
131   for (auto mode : modesR)
132      extentR.push_back(extentMap[mode]);

Note

cutensornetTensorQR() operates in reduced mode, expecting the shared extent for the output to be the minimum of the combined row extent and the combined column extent.
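The computeCombinedExtent() helper used above is defined in the full sample; it simply multiplies the extents of the modes that are already registered in the extent map (the shared mode is skipped because it has not been inserted yet). A minimal sketch, which may differ in detail from the version in the repository, could look like this:

   // Multiply the extents of all modes already present in the extent map;
   // modes not yet registered (e.g. the shared mode 'x') are skipped.
   int64_t computeCombinedExtent(const std::unordered_map<int32_t, int64_t>& extentMap,
                                 const std::vector<int32_t>& modes)
   {
      int64_t combinedExtent{1};
      for (auto mode : modes)
      {
         auto it = extentMap.find(mode);
         if (it != extentMap.end())
            combinedExtent *= it->second;
      }
      return combinedExtent;
   }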

Allocate memory and initialize data

Next, we allocate memory for the input and output tensor operands. The input operand is initialized to random values.

135   /***********************************
136   * Allocating data on host and device
137   ************************************/
138
139   size_t elementsT = 1;
140   for (auto mode : modesT)
141      elementsT *= extentMap[mode];
142   size_t elementsQ = 1;
143   for (auto mode : modesQ)
144      elementsQ *= extentMap[mode];
145   size_t elementsR = 1;
146   for (auto mode : modesR)
147      elementsR *= extentMap[mode];
148
149   size_t sizeT = sizeof(floatType) * elementsT;
150   size_t sizeQ = sizeof(floatType) * elementsQ;
151   size_t sizeR = sizeof(floatType) * elementsR;
152
153   printf("Total memory: %.2f GiB\n", (sizeT + sizeQ + sizeR)/1024./1024./1024);
154
155   floatType *T = (floatType*) malloc(sizeT);
156   floatType *Q = (floatType*) malloc(sizeQ);
157   floatType *R = (floatType*) malloc(sizeR);
158
159   if (T == NULL || Q == NULL || R == NULL)
160   {
161      printf("Error: Host allocation of input T or output Q/R failed.\n");
162      return -1;
163   }
164
165   void* D_T;
166   void* D_Q;
167   void* D_R;
168   
169   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeT) );
170   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_Q, sizeQ) );
171   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_R, sizeR) );
172
173   /****************
174   * Initialize data
175   *****************/
176
177   for (uint64_t i = 0; i < elementsT; i++)
178      T[i] = ((floatType) rand())/RAND_MAX;
179
180   HANDLE_CUDA_ERROR( cudaMemcpy(D_T, T, sizeT, cudaMemcpyHostToDevice) );
181   printf("Allocate memory for data and initialize data.\n");
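The HANDLE_CUDA_ERROR and HANDLE_ERROR macros used throughout this example are defined in the full sample; they check the returned status and abort on failure. A minimal sketch, which may differ from the repository version, is shown below:

   // Abort with a diagnostic message if a CUDA runtime call fails.
   #define HANDLE_CUDA_ERROR(x)                                      \
   {                                                                 \
      const auto err = x;                                            \
      if (err != cudaSuccess)                                        \
      {                                                              \
         printf("CUDA error %s in line %d\n",                        \
                cudaGetErrorString(err), __LINE__);                  \
         exit(-1);                                                   \
      }                                                              \
   };

   // Abort with a diagnostic message if a cuTensorNet call fails.
   #define HANDLE_ERROR(x)                                           \
   {                                                                 \
      const auto err = x;                                            \
      if (err != CUTENSORNET_STATUS_SUCCESS)                         \
      {                                                              \
         printf("cuTensorNet error %s in line %d\n",                 \
                cutensornetGetErrorString(err), __LINE__);           \
         exit(-1);                                                   \
      }                                                              \
   };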

Initialize cuTensorNet and create tensor descriptors

Then we initialize the cuTensorNet library handle and create a cutensornetTensorDescriptor_t for each of the input and output tensors.

184   /******************
185   * cuTensorNet
186   *******************/
187
188   cudaStream_t stream;
189   HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );
190
191   cutensornetHandle_t handle;
192   HANDLE_ERROR( cutensornetCreate(&handle) );
193
194   /***************************
195   * Create tensor descriptors
196   ****************************/
197
198   cutensornetTensorDescriptor_t descTensorIn;
199   cutensornetTensorDescriptor_t descTensorQ;
200   cutensornetTensorDescriptor_t descTensorR;
201
202   const int32_t numModesIn = modesT.size();
203   const int32_t numModesQ = modesQ.size();
204   const int32_t numModesR = modesR.size();
205
206   const int64_t* strides = NULL; // assuming Fortran (column-major) layout for all tensors
207
208   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) );
209   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesQ, extentQ.data(), strides, modesQ.data(), typeData, &descTensorQ) );
210   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesR, extentR.data(), strides, modesR.data(), typeData, &descTensorR) );
211    
212   printf("Initialize the cuTensorNet library and create all tensor descriptors.\n");
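Passing strides = NULL instructs cuTensorNet to assume a dense Fortran (column-major) layout for each tensor. If the operands were strided views instead, explicit strides (in units of elements) could be supplied; a hypothetical sketch for the input tensor, not part of the sample, is:

   // Hypothetical explicit column-major strides for T: the first mode varies
   // fastest, so each stride is the product of the preceding extents.
   std::vector<int64_t> stridesT(extentT.size());
   int64_t stride = 1;
   for (size_t d = 0; d < extentT.size(); ++d)
   {
      stridesT[d] = stride;
      stride *= extentT[d];
   }
   // Passing stridesT.data() instead of NULL would describe the same dense layout.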

Query and allocate required workspace

Once all tensor descriptors are created, we can query the required workspace sizes with cutensornetWorkspaceComputeQRSizes(), allocate the device and host scratch buffers, and attach them to the workspace descriptor with cutensornetWorkspaceSetMemory().

215   /********************************************
216   * Query and allocate required workspace sizes
217   *********************************************/
218
219   cutensornetWorkspaceDescriptor_t workDesc;
220   HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
221   HANDLE_ERROR( cutensornetWorkspaceComputeQRSizes(handle, descTensorIn, descTensorQ, descTensorR, workDesc) );
222   int64_t hostWorkspaceSize, deviceWorkspaceSize;
223
224   // for tensor QR, it does not matter which cutensornetWorksizePref_t we pick
225   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
226                                                   workDesc,
227                                                   CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
228                                                   CUTENSORNET_MEMSPACE_DEVICE,
229                                                   CUTENSORNET_WORKSPACE_SCRATCH,
230                                                   &deviceWorkspaceSize) );
231   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
232                                                   workDesc,
233                                                   CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
234                                                   CUTENSORNET_MEMSPACE_HOST,
235                                                   CUTENSORNET_WORKSPACE_SCRATCH,
236                                                   &hostWorkspaceSize) );
237
238   void *devWork = nullptr, *hostWork = nullptr;
239   if (deviceWorkspaceSize > 0) {
240      HANDLE_CUDA_ERROR( cudaMalloc(&devWork, deviceWorkspaceSize) );
241   }
242   if (hostWorkspaceSize > 0) {
243      hostWork = malloc(hostWorkspaceSize);
244   }
245   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
246                                               workDesc,
247                                               CUTENSORNET_MEMSPACE_DEVICE,
248                                               CUTENSORNET_WORKSPACE_SCRATCH,
249                                               devWork,
250                                               deviceWorkspaceSize) );
251   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
252                                               workDesc,
253                                               CUTENSORNET_MEMSPACE_HOST,
254                                               CUTENSORNET_WORKSPACE_SCRATCH,
255                                               hostWork,
256                                               hostWorkspaceSize) );

Execution

At this stage, we can perform the QR decomposition by calling cutensornetTensorQR().
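The timing below relies on a small GPUTimer helper defined in the full sample. A minimal cudaEvent-based sketch, whose details may differ from the repository version, could look like this:

   // Event-based GPU timer: records events on the given stream and
   // reports the elapsed time between start() and seconds() in seconds.
   struct GPUTimer
   {
      GPUTimer(cudaStream_t stream) : stream_(stream)
      {
         cudaEventCreate(&start_);
         cudaEventCreate(&stop_);
      }
      ~GPUTimer()
      {
         cudaEventDestroy(start_);
         cudaEventDestroy(stop_);
      }
      void start() { cudaEventRecord(start_, stream_); }
      float seconds()
      {
         cudaEventRecord(stop_, stream_);
         cudaEventSynchronize(stop_);
         float ms;
         cudaEventElapsedTime(&ms, start_, stop_);
         return ms * 1e-3f;
      }
      cudaStream_t stream_;
      cudaEvent_t start_, stop_;
   };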

259   /**********
260   * Execution
261   ***********/
262  
263   GPUTimer timer{stream};
264   double minTimeCUTENSOR = 1e100;
265   const int numRuns = 3; // to get stable perf results
266   for (int i=0; i < numRuns; ++i)
267   {  
268      // zero out the output buffers before each timed run
269      HANDLE_CUDA_ERROR( cudaMemsetAsync(D_Q, 0, sizeQ, stream) );
270      HANDLE_CUDA_ERROR( cudaMemsetAsync(D_R, 0, sizeR, stream) );
271      HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );
272
273      timer.start();
274      HANDLE_ERROR( cutensornetTensorQR(handle, 
275                        descTensorIn, D_T, 
276                        descTensorQ, D_Q, 
277                        descTensorR, D_R, 
278                        workDesc,
279                        stream) );
280      // Synchronize and measure timing
281      auto time = timer.seconds();
282      minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
283   }
284
285   printf("Performing QR\n");
286
287   HANDLE_CUDA_ERROR( cudaMemcpyAsync(Q, D_Q, sizeQ, cudaMemcpyDeviceToHost) );
288   HANDLE_CUDA_ERROR( cudaMemcpyAsync(R, D_R, sizeR, cudaMemcpyDeviceToHost) );
289
290   cudaDeviceSynchronize(); // device synchronization.
291   printf("%.2f ms\n", minTimeCUTENSOR * 1000.f);
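Since QR is an exact factorization, contracting Q and R over the shared mode x reconstructs the input, T_{i,j,m,n} = sum_x Q_{i,x,m} R_{n,x,j}. The sample does not verify the result, but a host-side check assuming the dense column-major layout used above could be sketched as follows (fabs requires <cmath>):

   // Optional correctness check (not part of the sample): contract Q and R over
   // the shared mode and track the largest absolute deviation from T.
   const int64_t ei = extentMap['i'], ej = extentMap['j'];
   const int64_t em = extentMap['m'], en = extentMap['n'];
   const int64_t k  = extentMap[sharedMode];
   double maxErr = 0.0;
   for (int64_t n = 0; n < en; ++n)
    for (int64_t m = 0; m < em; ++m)
     for (int64_t j = 0; j < ej; ++j)
      for (int64_t i = 0; i < ei; ++i)
      {
         double acc = 0.0;
         for (int64_t x = 0; x < k; ++x)
            acc += (double) Q[i + ei*(x + k*m)] * (double) R[n + en*(x + k*j)];
         const double err = fabs(acc - (double) T[i + ei*(j + ej*(m + em*n))]);
         if (err > maxErr) maxErr = err;
      }
   printf("Max |Q*R - T| = %.3e\n", maxErr);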

Free resources

After the computation, we need to free up all resources.

294   /***************
295   * Free resources
296   ****************/
297   
298   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) );
299   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorQ) );
300   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorR) );
301   HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
302   HANDLE_ERROR( cutensornetDestroy(handle) );
303
304   if (T) free(T);
305   if (Q) free(Q);
306   if (R) free(R);
307   if (D_T) cudaFree(D_T);
308   if (D_Q) cudaFree(D_Q);
309   if (D_R) cudaFree(D_R);
310   if (devWork) cudaFree(devWork);
311   if (hostWork) free(hostWork);
312
313   printf("Free resources and exit.\n");
314
315   return 0;
316}