The following code example illustrates how to integrate cuTensorNet functionalities to perform a tensor QR operation. The full sample code can be found in the NVIDIA/cuQuantum repository.

# Define QR decomposition¶

We first define the QR decomposition to perform, specifying the data type, the mode partition, and the extents.

``` 96   /**********************************************
97   * Tensor QR: T_{i,j,m,n} -> Q_{i,x,m} R_{n,x,j}
98   ***********************************************/
99
100   typedef float floatType;
102
103   // Create vector of modes
104   int32_t sharedMode = 'x';
105
106   std::vector<int32_t> modesT{'i','j','m','n'}; // input
107   std::vector<int32_t> modesQ{'i', sharedMode,'m'};
108   std::vector<int32_t> modesR{'n', sharedMode,'j'};  // QR output
109
110   // Extents
111   std::unordered_map<int32_t, int64_t> extentMap;
112   extentMap['i'] = 16;
113   extentMap['j'] = 16;
114   extentMap['m'] = 16;
115   extentMap['n'] = 16;
116
117   int64_t rowExtent = computeCombinedExtent(extentMap, modesQ);
118   int64_t colExtent = computeCombinedExtent(extentMap, modesR);
119
120   // cuTensorNet tensor QR operates in reduced mode expecting k = min(m, n)
121   extentMap[sharedMode] = rowExtent <= colExtent? rowExtent: colExtent;
122
123   // Create a vector of extents for each tensor
124   std::vector<int64_t> extentT;
125   for (auto mode : modesT)
126      extentT.push_back(extentMap[mode]);
127   std::vector<int64_t> extentQ;
128   for (auto mode : modesQ)
129      extentQ.push_back(extentMap[mode]);
130   std::vector<int64_t> extentR;
131   for (auto mode : modesR)
132      extentR.push_back(extentMap[mode]);
```

Note

`cutensornetTensorQR()` operates in reduced mode, expecting the shared extent for the output to be the minimum of the combined row extent and the combined column extent.

# Allocate memory and initialize data¶

Next, we allocate memory for the input and output tensor operands. The input operand is initialized to random values.

```135   /***********************************
136   * Allocating data on host and device
137   ************************************/
138
139   size_t elementsT = 1;
140   for (auto mode : modesT)
141      elementsT *= extentMap[mode];
142   size_t elementsQ = 1;
143   for (auto mode : modesQ)
144      elementsQ *= extentMap[mode];
145   size_t elementsR = 1;
146   for (auto mode : modesR)
147      elementsR *= extentMap[mode];
148
149   size_t sizeT = sizeof(floatType) * elementsT;
150   size_t sizeQ = sizeof(floatType) * elementsQ;
151   size_t sizeR = sizeof(floatType) * elementsR;
152
153   printf("Total memory: %.2f GiB\n", (sizeT + sizeQ + sizeR)/1024./1024./1024);
154
155   floatType *T = (floatType*) malloc(sizeT);
156   floatType *Q = (floatType*) malloc(sizeQ);
157   floatType *R = (floatType*) malloc(sizeR);
158
159   if (T == NULL || Q==NULL || R==NULL )
160   {
161      printf("Error: Host allocation of input T or output Q/R.\n");
162      return -1;
163   }
164
165   void* D_T;
166   void* D_Q;
167   void* D_R;
168
169   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeT) );
170   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_Q, sizeQ) );
171   HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_R, sizeR) );
172
173   /****************
174   * Initialize data
175   *****************/
176
177   for (uint64_t i = 0; i < elementsT; i++)
178      T[i] = ((floatType) rand())/RAND_MAX;
179
180   HANDLE_CUDA_ERROR( cudaMemcpy(D_T, T, sizeT, cudaMemcpyHostToDevice) );
181   printf("Allocate memory for data, and initialize data.\n");
```

# Initialize cuTensorNet and create tensor descriptors¶

Then we initialize the library handle of cuTensorNet and create a `cutensornetTensorDescriptor_t` for each of the input and output tensors.

```184   /******************
185   * cuTensorNet
186   *******************/
187
188   cudaStream_t stream;
189   HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );
190
191   cutensornetHandle_t handle;
192   HANDLE_ERROR( cutensornetCreate(&handle) );
193
194   /***************************
195   * Create tensor descriptors
196   ****************************/
197
198   cutensornetTensorDescriptor_t descTensorIn;
199   cutensornetTensorDescriptor_t descTensorQ;
200   cutensornetTensorDescriptor_t descTensorR;
201
202   const int32_t numModesIn = modesT.size();
203   const int32_t numModesQ = modesQ.size();
204   const int32_t numModesR = modesR.size();
205
206   const int64_t* strides = NULL; // assuming fortran layout for all tensors
207
208   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) );
209   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesQ, extentQ.data(), strides, modesQ.data(), typeData, &descTensorQ) );
210   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesR, extentR.data(), strides, modesR.data(), typeData, &descTensorR) );
211
212   printf("Initialize the cuTensorNet library and create all tensor descriptors.\n");
```

# Query and allocate required workspace¶

Once all tensor descriptors are created, we can query the required workspace size with `cutensornetWorkspaceComputeQRSizes()`.

```215   /********************************************
216   * Query and allocate required workspace sizes
217   *********************************************/
218
219   cutensornetWorkspaceDescriptor_t workDesc;
220   HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
221   HANDLE_ERROR( cutensornetWorkspaceComputeQRSizes(handle, descTensorIn, descTensorQ, descTensorR, workDesc) );
222   uint64_t hostWorkspaceSize, deviceWorkspaceSize;
223
224   // for tensor QR, it does not matter which cutensornetWorksizePref_t we pick
225   HANDLE_ERROR( cutensornetWorkspaceGetSize(handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, CUTENSORNET_MEMSPACE_DEVICE, &deviceWorkspaceSize) );
226   HANDLE_ERROR( cutensornetWorkspaceGetSize(handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, CUTENSORNET_MEMSPACE_HOST, &hostWorkspaceSize) );
227
228   void *devWork = nullptr, *hostWork = nullptr;
229   if (deviceWorkspaceSize > 0) {
230      HANDLE_CUDA_ERROR( cudaMalloc(&devWork, deviceWorkspaceSize) );
231   }
232   if (hostWorkspaceSize > 0) {
233      hostWork = malloc(hostWorkspaceSize);
234   }
235   HANDLE_ERROR( cutensornetWorkspaceSet(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, devWork, deviceWorkspaceSize) );
236   HANDLE_ERROR( cutensornetWorkspaceSet(handle, workDesc, CUTENSORNET_MEMSPACE_HOST, hostWork, hostWorkspaceSize) );
```

# Execution¶

At this stage, we can perform the QR decomposition by calling `cutensornetTensorQR()`.

```239   /**********
240   * Execution
241   ***********/
242
243   GPUTimer timer{stream};
244   double minTimeCUTENSOR = 1e100;
245   const int numRuns = 3; // to get stable perf results
246   for (int i=0; i < numRuns; ++i)
247   {
248      // restore output
249      cudaMemsetAsync(D_Q, 0, sizeQ, stream);
250      cudaMemsetAsync(D_R, 0, sizeR, stream);
252
253      timer.start();
254      HANDLE_ERROR( cutensornetTensorQR(handle,
255                        descTensorIn, D_T,
256                        descTensorQ, D_Q,
257                        descTensorR, D_R,
258                        workDesc,
259                        stream) );
260      // Synchronize and measure timing
261      auto time = timer.seconds();
262      minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
263   }
264
265   printf("Performing QR\n");
266
267   HANDLE_CUDA_ERROR( cudaMemcpyAsync(Q, D_Q, sizeQ, cudaMemcpyDeviceToHost) );
268   HANDLE_CUDA_ERROR( cudaMemcpyAsync(R, D_R, sizeR, cudaMemcpyDeviceToHost) );
269
271   printf("%.2f ms\n", minTimeCUTENSOR * 1000.f);
```

# Free resources¶

After the computation, we need to free up all resources.

```274   /***************
275   * Free resources
276   ****************/
277
278   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) );
279   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorQ) );
280   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorR) );
281   HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
282   HANDLE_ERROR( cutensornetDestroy(handle) );
283
284   if (T) free(T);
285   if (Q) free(Q);
286   if (R) free(R);
287   if (D_T) cudaFree(D_T);
288   if (D_Q) cudaFree(D_Q);
289   if (D_R) cudaFree(D_R);
290   if (devWork) cudaFree(devWork);
291   if (hostWork) free(hostWork);
292
293   printf("Free resource and exit.\n");
294
295   return 0;
296}
```