The following code example illustrates how to use cuTensorNet to perform a tensor QR operation. The full code can be found in the NVIDIA/cuQuantum repository.

Define QR decomposition

We first define the QR decomposition to perform, specifying the data type, the mode partition, and the extents of all tensors.

/**********************************************
* Tensor QR: T_{i,j,m,n} -> Q_{i,x,m} R_{n,x,j}
***********************************************/

typedef float floatType;
cudaDataType_t typeData = CUDA_R_32F;

// Create vector of modes
int32_t sharedMode = 'x';

std::vector<int32_t> modesT{'i','j','m','n'}; // input
std::vector<int32_t> modesQ{'i', sharedMode,'m'};
std::vector<int32_t> modesR{'n', sharedMode,'j'}; // QR output

// Extents
std::unordered_map<int32_t, int64_t> extentMap;
extentMap['i'] = 16;
extentMap['j'] = 16;
extentMap['m'] = 16;
extentMap['n'] = 16;

int64_t rowExtent = computeCombinedExtent(extentMap, modesQ);
int64_t colExtent = computeCombinedExtent(extentMap, modesR);

// cuTensorNet tensor QR operates in reduced mode, expecting k = min(m, n)
extentMap[sharedMode] = rowExtent <= colExtent ? rowExtent : colExtent;

// Create a vector of extents for each tensor
std::vector<int64_t> extentT;
for (auto mode : modesT)
   extentT.push_back(extentMap[mode]);
std::vector<int64_t> extentQ;
for (auto mode : modesQ)
   extentQ.push_back(extentMap[mode]);
std::vector<int64_t> extentR;
for (auto mode : modesR)
   extentR.push_back(extentMap[mode]);
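The helper computeCombinedExtent() is defined elsewhere in the full sample. A minimal sketch of it, assuming it simply multiplies the extents of all modes already present in the map (modes not yet in the map, such as the shared mode at this point, are skipped), could look like this:

// Hedged sketch of computeCombinedExtent(): product of the extents of all
// modes found in the map; modes absent from the map are skipped.
int64_t computeCombinedExtent(const std::unordered_map<int32_t, int64_t>& extentMap,
                              const std::vector<int32_t>& modes)
{
   int64_t combinedExtent{1};
   for (auto mode : modes)
   {
      auto it = extentMap.find(mode);
      if (it != extentMap.end())
         combinedExtent *= it->second;
   }
   return combinedExtent;
}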

Note

cutensornetTensorQR() operates in reduced mode: the shared extent of the output must equal the minimum of the combined row extent and the combined column extent. In this example, the combined row extent (modes i and m) and the combined column extent (modes n and j) are both 16 × 16 = 256, so the shared mode x gets extent 256.

Allocate memory and initialize data

Next, we allocate memory for the input and output tensor operands. The input operand is initialized to random values.

/***********************************
* Allocating data on host and device
************************************/

size_t elementsT = 1;
for (auto mode : modesT)
   elementsT *= extentMap[mode];
size_t elementsQ = 1;
for (auto mode : modesQ)
   elementsQ *= extentMap[mode];
size_t elementsR = 1;
for (auto mode : modesR)
   elementsR *= extentMap[mode];

size_t sizeT = sizeof(floatType) * elementsT;
size_t sizeQ = sizeof(floatType) * elementsQ;
size_t sizeR = sizeof(floatType) * elementsR;

printf("Total memory: %.2f GiB\n", (sizeT + sizeQ + sizeR) / 1024. / 1024. / 1024.);

floatType *T = (floatType*) malloc(sizeT);
floatType *Q = (floatType*) malloc(sizeQ);
floatType *R = (floatType*) malloc(sizeR);

if (T == NULL || Q == NULL || R == NULL)
{
   printf("Error: Host allocation of input T or output Q/R.\n");
   return -1;
}

void* D_T;
void* D_Q;
void* D_R;

HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeT) );
HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_Q, sizeQ) );
HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_R, sizeR) );

/****************
* Initialize data
*****************/

for (uint64_t i = 0; i < elementsT; i++)
   T[i] = ((floatType) rand()) / RAND_MAX;

HANDLE_CUDA_ERROR( cudaMemcpy(D_T, T, sizeT, cudaMemcpyHostToDevice) );
printf("Allocate memory for data, and initialize data.\n");
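The HANDLE_ERROR and HANDLE_CUDA_ERROR macros wrap every cuTensorNet and CUDA runtime call in this example. Their exact definitions live in the full sample; a minimal sketch, assuming they print the error string and abort on failure, could be:

// Hedged sketch of the sample's error-handling macros.
#define HANDLE_ERROR(x)                                               \
{                                                                     \
   const auto err = x;                                                \
   if (err != CUTENSORNET_STATUS_SUCCESS)                             \
   {                                                                  \
      printf("cuTensorNet error: %s in line %d\n",                    \
             cutensornetGetErrorString(err), __LINE__);               \
      exit(-1);                                                       \
   }                                                                  \
}

#define HANDLE_CUDA_ERROR(x)                                          \
{                                                                     \
   const auto err = x;                                                \
   if (err != cudaSuccess)                                            \
   {                                                                  \
      printf("CUDA error: %s in line %d\n",                           \
             cudaGetErrorString(err), __LINE__);                      \
      exit(-1);                                                       \
   }                                                                  \
}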

Initialize cuTensorNet and create tensor descriptors

Then we initialize the cuTensorNet library handle and create a cutensornetTensorDescriptor_t for each of the input and output tensors.

/******************
* cuTensorNet
*******************/

cudaStream_t stream;
HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );

cutensornetHandle_t handle;
HANDLE_ERROR( cutensornetCreate(&handle) );

/***************************
* Create tensor descriptors
****************************/

cutensornetTensorDescriptor_t descTensorIn;
cutensornetTensorDescriptor_t descTensorQ;
cutensornetTensorDescriptor_t descTensorR;

const int32_t numModesIn = modesT.size();
const int32_t numModesQ = modesQ.size();
const int32_t numModesR = modesR.size();

const int64_t* strides = NULL; // assuming Fortran (column-major) layout for all tensors

HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) );
HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesQ, extentQ.data(), strides, modesQ.data(), typeData, &descTensorQ) );
HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesR, extentR.data(), strides, modesR.data(), typeData, &descTensorR) );

printf("Initialize the cuTensorNet library and create all tensor descriptors.\n");
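Passing strides = NULL tells cuTensorNet to assume a dense Fortran (column-major) layout. For illustration, a hypothetical helper (not part of the sample) that computes the explicit strides this implies might look like:

// Hypothetical helper: the dense column-major strides implied by strides == NULL.
// For T with extents {16, 16, 16, 16} this yields {1, 16, 256, 4096}.
std::vector<int64_t> fortranStrides(const std::vector<int64_t>& extents)
{
   std::vector<int64_t> strides(extents.size());
   int64_t stride = 1;
   for (size_t k = 0; k < extents.size(); ++k)
   {
      strides[k] = stride;   // stride of mode k
      stride *= extents[k];  // the next mode varies more slowly
   }
   return strides;
}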

Query and allocate required workspace

Once all tensor descriptors are created, we can query the required workspace sizes with cutensornetWorkspaceComputeQRSizes(), then allocate and set the host and device workspaces.

/********************************************
* Query and allocate required workspace sizes
*********************************************/

cutensornetWorkspaceDescriptor_t workDesc;
HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
HANDLE_ERROR( cutensornetWorkspaceComputeQRSizes(handle, descTensorIn, descTensorQ, descTensorR, workDesc) );
uint64_t hostWorkspaceSize, deviceWorkspaceSize;

// for tensor QR, it does not matter which cutensornetWorksizePref_t we pick
HANDLE_ERROR( cutensornetWorkspaceGetSize(handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, CUTENSORNET_MEMSPACE_DEVICE, &deviceWorkspaceSize) );
HANDLE_ERROR( cutensornetWorkspaceGetSize(handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, CUTENSORNET_MEMSPACE_HOST, &hostWorkspaceSize) );

void *devWork = nullptr, *hostWork = nullptr;
if (deviceWorkspaceSize > 0) {
   HANDLE_CUDA_ERROR( cudaMalloc(&devWork, deviceWorkspaceSize) );
}
if (hostWorkspaceSize > 0) {
   hostWork = malloc(hostWorkspaceSize);
}
HANDLE_ERROR( cutensornetWorkspaceSet(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, devWork, deviceWorkspaceSize) );
HANDLE_ERROR( cutensornetWorkspaceSet(handle, workDesc, CUTENSORNET_MEMSPACE_HOST, hostWork, hostWorkspaceSize) );

Execution

At this stage, we can perform the QR decomposition by calling cutensornetTensorQR(). The operation is run several times to obtain a stable timing measurement.
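The loop below times the stream with a small GPUTimer utility shipped with the full sample. Its exact implementation is not shown here; a minimal CUDA-event-based sketch with the same interface might be:

// Hedged sketch of the sample's GPUTimer: times work on a given stream with
// CUDA events; seconds() synchronizes before reading the elapsed time.
struct GPUTimer
{
   GPUTimer(cudaStream_t stream) : stream_(stream)
   {
      cudaEventCreate(&start_);
      cudaEventCreate(&stop_);
   }
   ~GPUTimer()
   {
      cudaEventDestroy(start_);
      cudaEventDestroy(stop_);
   }
   void start() { cudaEventRecord(start_, stream_); }
   double seconds()
   {
      cudaEventRecord(stop_, stream_);
      cudaEventSynchronize(stop_);      // wait until the stream reaches stop_
      float ms{0.f};
      cudaEventElapsedTime(&ms, start_, stop_);
      return ms * 1e-3;                 // milliseconds -> seconds
   }
private:
   cudaEvent_t start_, stop_;
   cudaStream_t stream_;
};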

/**********
* Execution
***********/

GPUTimer timer{stream};
double minTimeCUTENSOR = 1e100;
const int numRuns = 3; // to get stable perf results
for (int i = 0; i < numRuns; ++i)
{
   // reset the output tensors
   HANDLE_CUDA_ERROR( cudaMemsetAsync(D_Q, 0, sizeQ, stream) );
   HANDLE_CUDA_ERROR( cudaMemsetAsync(D_R, 0, sizeR, stream) );
   HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );

   timer.start();
   HANDLE_ERROR( cutensornetTensorQR(handle,
                     descTensorIn, D_T,
                     descTensorQ, D_Q,
                     descTensorR, D_R,
                     workDesc,
                     stream) );
   // synchronize and measure timing
   auto time = timer.seconds();
   minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
}

printf("Performing QR\n");

HANDLE_CUDA_ERROR( cudaMemcpyAsync(Q, D_Q, sizeQ, cudaMemcpyDeviceToHost) );
HANDLE_CUDA_ERROR( cudaMemcpyAsync(R, D_R, sizeR, cudaMemcpyDeviceToHost) );

HANDLE_CUDA_ERROR( cudaDeviceSynchronize() ); // wait for the copies to finish
printf("%.2f ms\n", minTimeCUTENSOR * 1000.f);
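With Q and R back on the host, one can optionally sanity-check the decomposition by contracting them and comparing against T. This check is not part of the sample; the sketch below assumes the dense column-major layout established above (extent 16 for modes i, j, m, n and 256 for the shared mode x):

// Hedged sketch: verify T_{i,j,m,n} ~= sum_x Q_{i,x,m} * R_{n,x,j},
// indexing with the column-major strides implied by strides == NULL.
double maxAbsErr = 0.;
for (int64_t n = 0; n < 16; ++n)
for (int64_t m = 0; m < 16; ++m)
for (int64_t j = 0; j < 16; ++j)
for (int64_t i = 0; i < 16; ++i)
{
   double acc = 0.;
   for (int64_t x = 0; x < 256; ++x)
      acc += (double) Q[i + 16*x + 4096*m] * R[n + 16*x + 4096*j];
   double err = fabs(acc - (double) T[i + 16*(j + 16*(m + 16*n))]);
   if (err > maxAbsErr) maxAbsErr = err;
}
printf("max |T - Q*R| = %e\n", maxAbsErr);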

Free resources

After the computation, we need to free up all resources.

/***************
* Free resources
****************/

HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) );
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorQ) );
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorR) );
HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
HANDLE_ERROR( cutensornetDestroy(handle) );
HANDLE_CUDA_ERROR( cudaStreamDestroy(stream) );

if (T) free(T);
if (Q) free(Q);
if (R) free(R);
if (D_T) cudaFree(D_T);
if (D_Q) cudaFree(D_Q);
if (D_R) cudaFree(D_R);
if (devWork) cudaFree(devWork);
if (hostWork) free(hostWork);

printf("Free resources and exit.\n");

return 0;
}
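To build the full sample, a compile line along the following lines should work (the source file name and install paths are placeholders; the cuTENSOR library that cuTensorNet depends on must also be discoverable at run time, e.g. via LD_LIBRARY_PATH):

nvcc tensor_qr_example.cu -I${CUTENSORNET_ROOT}/include -L${CUTENSORNET_ROOT}/lib -lcutensornet -o tensor_qr_example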