Performing tensor SVD using cuTensorNet adopts a very similar workflow as the QR example. Here, we highlight the notable differences between the two APIs. The full code can be found in the NVIDIA/cuQuantum repository.

Define SVD decomposition

As with QR decomposition, we first define the SVD decomposition to perform by specifying the data type, the partition of modes, and the extents.

 96   /******************************************************
 97   * Tensor SVD: T_{i,j,m,n} -> U_{i,x,m} S_{x} V_{n,x,j}  
 98   *******************************************************/
 99
100   typedef float floatType;
101   cudaDataType_t typeData = CUDA_R_32F;
102
103   // Create vector of modes
104   int32_t sharedMode = 'x';
105
106   std::vector<int32_t> modesT{'i','j','m','n'}; // input
107   std::vector<int32_t> modesU{'i', sharedMode,'m'};
108   std::vector<int32_t> modesV{'n', sharedMode,'j'};  // SVD output
109
110   // Extents
111   std::unordered_map<int32_t, int64_t> extentMap;
112   extentMap['i'] = 16;
113   extentMap['j'] = 16;
114   extentMap['m'] = 16;
115   extentMap['n'] = 16;
116
117   int64_t rowExtent = computeCombinedExtent(extentMap, modesU);
118   int64_t colExtent = computeCombinedExtent(extentMap, modesV);
119   // cuTensorNet tensor SVD operates in reduced mode expecting k <= min(m, n)
120   int64_t fullSharedExtent = rowExtent <= colExtent? rowExtent: colExtent;
 121   const int64_t maxExtent = fullSharedExtent / 2;  // fixed extent truncation with half of the singular values trimmed out
122   extentMap[sharedMode] = maxExtent;
123
124   // Create a vector of extents for each tensor
125   std::vector<int64_t> extentT;
126   for (auto mode : modesT)
127      extentT.push_back(extentMap[mode]);
128   std::vector<int64_t> extentU;
129   for (auto mode : modesU)
130      extentU.push_back(extentMap[mode]);
131   std::vector<int64_t> extentV;
132   for (auto mode : modesV)
133      extentV.push_back(extentMap[mode]);

Note

To perform fixed extent truncation, we directly set maxExtent to half of the full shared extent (the extent that an exact, untruncated SVD would produce), so half of the singular values are trimmed out.

Setup SVD truncation parameters

Once the SVD decomposition is defined, we can follow the same workflow as the QR example for data allocation and tensor descriptor initialization. Before querying workspace, we can choose different SVD options in cutensornetTensorSVDConfig_t. We can also create cutensornetTensorSVDInfo_t to keep track of runtime truncation information.

220   /**********************************************
221   * Setup SVD algorithm and truncation parameters
222   ***********************************************/
223
224   cutensornetTensorSVDConfig_t svdConfig;
225   HANDLE_ERROR( cutensornetCreateTensorSVDConfig(handle, &svdConfig) );
226
227   // set up truncation parameters
228   double absCutoff = 1e-2;
229   HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, 
230                                          svdConfig, 
231                                          CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF, 
232                                          &absCutoff, 
233                                          sizeof(absCutoff)) );
234   double relCutoff = 4e-2;
235   HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, 
236                                          svdConfig, 
237                                          CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF, 
238                                          &relCutoff, 
239                                          sizeof(relCutoff)) );
240   
241   // optional: choose gesvdj algorithm with customized parameters. Default is gesvd.
242   cutensornetTensorSVDAlgo_t svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ;
243   HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, 
244                                          svdConfig, 
245                                          CUTENSORNET_TENSOR_SVD_CONFIG_ALGO, 
246                                          &svdAlgo, 
247                                          sizeof(svdAlgo)) );
248   cutensornetGesvdjParams_t gesvdjParams{/*tol=*/1e-12, /*maxSweeps=*/80};
249   HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, 
250                                          svdConfig, 
251                                          CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS, 
252                                          &gesvdjParams, 
253                                          sizeof(gesvdjParams)) );
254   printf("Set up SVDConfig to use GESVDJ algorithm with truncation\n");
255   
256   /********************************************************
257   * Create SVDInfo to record runtime SVD truncation details
258   *********************************************************/
259
260   cutensornetTensorSVDInfo_t svdInfo; 
261   HANDLE_ERROR( cutensornetCreateTensorSVDInfo(handle, &svdInfo)) ;

Execution

Next, we can query and allocate the workspace with cutensornetWorkspaceComputeSVDSizes(), which is very similar to its QR counterpart. At this stage, we can perform the SVD decomposition by calling cutensornetTensorSVD().

307   /**********
308   * Execution
309   ***********/
310  
311   GPUTimer timer{stream};
312   double minTimeCUTENSOR = 1e100;
313   const int numRuns = 3; // to get stable perf results
314   for (int i=0; i < numRuns; ++i)
315   {  
316      // restore output
317      cudaMemsetAsync(D_U, 0, sizeU, stream);
318      cudaMemsetAsync(D_S, 0, sizeS, stream);
319      cudaMemsetAsync(D_V, 0, sizeV, stream);
320      cudaDeviceSynchronize();
321      
322      // With value-based truncation, `cutensornetTensorSVD` can potentially update the shared extent in descTensorU/V.
323      // We here restore descTensorU/V to the original problem.
324      HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) );
325      HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) );
326      HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) );
327      HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) );
328
329      timer.start();
330      HANDLE_ERROR( cutensornetTensorSVD(handle, 
331                        descTensorIn, D_T, 
332                        descTensorU, D_U, 
333                        D_S, 
334                        descTensorV, D_V, 
335                        svdConfig, 
336                        svdInfo,
337                        workDesc,
338                        stream) );
339      // Synchronize and measure timing
340      auto time = timer.seconds();
341      minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
342   }
343
344   printf("Performing SVD\n");
345
346   HANDLE_CUDA_ERROR( cudaMemcpyAsync(U, D_U, sizeU, cudaMemcpyDeviceToHost) );
347   HANDLE_CUDA_ERROR( cudaMemcpyAsync(S, D_S, sizeS, cudaMemcpyDeviceToHost) );
348   HANDLE_CUDA_ERROR( cudaMemcpyAsync(V, D_V, sizeV, cudaMemcpyDeviceToHost) );

Note

Since we turned on value-based truncation options in this example, we need to restore the tensor descriptors for U and V if we wish to perform the same computation multiple times.

After the computation, we still need to free up all resources.