Performing tensor SVD using cuTensorNet adopts a very similar workflow to the QR example. Here, we highlight the notable differences between the two APIs. The full code can be found in the NVIDIA/cuQuantum repository.

Define SVD decomposition

As with QR decomposition, we first define the SVD decomposition to perform with the data type, modes partition, and the extents.

 97   /******************************************************
 98   * Tensor SVD: T_{i,j,m,n} -> U_{i,x,m} S_{x} V_{n,x,j}  
 99   *******************************************************/
100
101   typedef float floatType;
102   cudaDataType_t typeData = CUDA_R_32F;
103
104   // Create vector of modes
105   int32_t sharedMode = 'x';
106
107   std::vector<int32_t> modesT{'i','j','m','n'}; // input
108   std::vector<int32_t> modesU{'i', sharedMode,'m'};
109   std::vector<int32_t> modesV{'n', sharedMode,'j'};  // SVD output
110
111   // Extents
112   std::unordered_map<int32_t, int64_t> extentMap;
113   extentMap['i'] = 16;
114   extentMap['j'] = 16;
115   extentMap['m'] = 16;
116   extentMap['n'] = 16;
117
118   int64_t rowExtent = computeCombinedExtent(extentMap, modesU);
119   int64_t colExtent = computeCombinedExtent(extentMap, modesV);
120   // cuTensorNet tensor SVD operates in reduced mode expecting k <= min(m, n)
121   int64_t fullSharedExtent = rowExtent <= colExtent? rowExtent: colExtent;
122   const int64_t maxExtent = fullSharedExtent / 2;  //fixed extent truncation with half of the singular values trimmed out
123   extentMap[sharedMode] = maxExtent;
124
125   // Create a vector of extents for each tensor
126   std::vector<int64_t> extentT;
127   for (auto mode : modesT)
128      extentT.push_back(extentMap[mode]);
129   std::vector<int64_t> extentU;
130   for (auto mode : modesU)
131      extentU.push_back(extentMap[mode]);
132   std::vector<int64_t> extentV;
133   for (auto mode : modesV)
134      extentV.push_back(extentMap[mode]);

Note

To perform fixed extent truncation, we directly set maxExtent to half of the full extent that corresponds to an exact SVD.

Setup SVD truncation parameters

Once the SVD decomposition is defined, we can follow the same workflow as the QR example for data allocation and tensor descriptor initialization. Before querying workspace, we can choose different SVD options in cutensornetTensorSVDConfig_t. Meanwhile, we can create cutensornetTensorSVDInfo_t to keep track of runtime truncation information.

221   /**********************************************
222   * Setup SVD algorithm and truncation parameters
223   ***********************************************/
224
225   cutensornetTensorSVDConfig_t svdConfig;
226   HANDLE_ERROR( cutensornetCreateTensorSVDConfig(handle, &svdConfig) );
227
228   // set up truncation parameters
229   double absCutoff = 1e-2;
230   HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, 
231                                          svdConfig, 
232                                          CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF, 
233                                          &absCutoff, 
234                                          sizeof(absCutoff)) );
235   double relCutoff = 4e-2;
236   HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, 
237                                          svdConfig, 
238                                          CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF, 
239                                          &relCutoff, 
240                                          sizeof(relCutoff)) );
241   
242   // optional: choose gesvdj algorithm with customized parameters. Default is gesvd.
243   cutensornetTensorSVDAlgo_t svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ;
244   HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, 
245                                          svdConfig, 
246                                          CUTENSORNET_TENSOR_SVD_CONFIG_ALGO, 
247                                          &svdAlgo, 
248                                          sizeof(svdAlgo)) );
249   cutensornetGesvdjParams_t gesvdjParams{/*tol=*/1e-12, /*maxSweeps=*/80};
250   HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, 
251                                          svdConfig, 
252                                          CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS, 
253                                          &gesvdjParams, 
254                                          sizeof(gesvdjParams)) );
255   printf("Set up SVDConfig to use GESVDJ algorithm with truncation\n");
256   
257   /********************************************************
258   * Create SVDInfo to record runtime SVD truncation details
259   *********************************************************/
260
261   cutensornetTensorSVDInfo_t svdInfo; 
262   HANDLE_ERROR( cutensornetCreateTensorSVDInfo(handle, &svdInfo)) ;

Execution

Next, we can query and allocate the workspace with cutensornetWorkspaceComputeSVDSizes(), which is very similar to its QR counterpart. At this stage, we can perform the SVD decomposition by calling cutensornetTensorSVD().

308   /**********
309   * Execution
310   ***********/
311  
312   GPUTimer timer{stream};
313   double minTimeCUTENSOR = 1e100;
314   const int numRuns = 3; // to get stable perf results
315   for (int i=0; i < numRuns; ++i)
316   {  
317      // restore output
318      cudaMemsetAsync(D_U, 0, sizeU, stream);
319      cudaMemsetAsync(D_S, 0, sizeS, stream);
320      cudaMemsetAsync(D_V, 0, sizeV, stream);
321      cudaDeviceSynchronize();
322      
323      // With value-based truncation, `cutensornetTensorSVD` can potentially update the shared extent in descTensorU/V.
324      // We here restore descTensorU/V to the original problem.
325      HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) );
326      HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) );
327      HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) );
328      HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) );
329
330      timer.start();
331      HANDLE_ERROR( cutensornetTensorSVD(handle, 
332                        descTensorIn, D_T, 
333                        descTensorU, D_U, 
334                        D_S, 
335                        descTensorV, D_V, 
336                        svdConfig, 
337                        svdInfo,
338                        workDesc,
339                        stream) );
340      // Synchronize and measure timing
341      auto time = timer.seconds();
342      minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
343   }
344
345   printf("Performing SVD\n");
346
347   HANDLE_CUDA_ERROR( cudaMemcpyAsync(U, D_U, sizeU, cudaMemcpyDeviceToHost) );
348   HANDLE_CUDA_ERROR( cudaMemcpyAsync(S, D_S, sizeS, cudaMemcpyDeviceToHost) );
349   HANDLE_CUDA_ERROR( cudaMemcpyAsync(V, D_V, sizeV, cudaMemcpyDeviceToHost) );

Note

Since we turned on value-based truncation options in this example, we need to restore the tensor descriptors for U and V if we wish to perform the same computation multiple times.

After the computation, we still need to free up all resources.