Performing a tensor SVD with cuTensorNet follows a workflow very similar to the QR example. Here, we highlight the notable differences between the two APIs. The full code can be found in the NVIDIA/cuQuantum repository (here).

Define SVD decomposition

As with QR decomposition, we first define the SVD decomposition to perform by specifying the data type, the mode partition, and the extents.

 96   /******************************************************
 97   * Tensor SVD: T_{i,j,m,n} -> U_{i,x,m} S_{x} V_{n,x,j}  
 98   *******************************************************/
 99
100   typedef float floatType;
101   cudaDataType_t typeData = CUDA_R_32F;
102
103   // Create vector of modes
104   int32_t sharedMode = 'x';
105
106   std::vector<int32_t> modesT{'i','j','m','n'}; // input
107   std::vector<int32_t> modesU{'i', sharedMode,'m'};
108   std::vector<int32_t> modesV{'n', sharedMode,'j'};  // SVD output
109
110   // Extents
111   std::unordered_map<int32_t, int64_t> extentMap;
112   extentMap['i'] = 16;
113   extentMap['j'] = 16;
114   extentMap['m'] = 16;
115   extentMap['n'] = 16;
116
117   int64_t rowExtent = computeCombinedExtent(extentMap, modesU);
118   int64_t colExtent = computeCombinedExtent(extentMap, modesV);
119   // cuTensorNet tensor SVD operates in reduced mode expecting k <= min(m, n)
120   int64_t fullSharedExtent = rowExtent <= colExtent? rowExtent: colExtent;
121   const int64_t maxExtent = fullSharedExtent / 2;  // fixed extent truncation: trim out half of the singular values
122   extentMap[sharedMode] = maxExtent;
123
124   // Create a vector of extents for each tensor
125   std::vector<int64_t> extentT;
126   for (auto mode : modesT)
127      extentT.push_back(extentMap[mode]);
128   std::vector<int64_t> extentU;
129   for (auto mode : modesU)
130      extentU.push_back(extentMap[mode]);
131   std::vector<int64_t> extentV;
132   for (auto mode : modesV)
133      extentV.push_back(extentMap[mode]);

Note

To perform fixed extent truncation, we directly set maxExtent to half of the full shared extent that an exact SVD would yield.
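The helper computeCombinedExtent() used above is defined in the full sample. A minimal sketch of such a helper, assuming it simply multiplies the extents of the given modes that are already present in the extent map (the shared mode has no extent yet at that point and is skipped), is shown below:

   // Hypothetical helper: multiply the extents of all modes in `modes`
   // that already have an entry in `extentMap` (modes without an entry,
   // such as the shared mode, are skipped).
   int64_t computeCombinedExtent(const std::unordered_map<int32_t, int64_t> &extentMap,
                                 const std::vector<int32_t> &modes)
   {
      int64_t combinedExtent{1};
      for (auto mode : modes)
      {
         auto it = extentMap.find(mode);
         if (it != extentMap.end())
            combinedExtent *= it->second;
      }
      return combinedExtent;
   }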

Setup SVD truncation parameters

Once the SVD decomposition is defined, we follow the same workflow as the QR example for data allocation and tensor descriptor initialization (a sketch is shown below). Before querying the workspace size, we can set various SVD options in cutensornetTensorSVDConfig_t. We also create a cutensornetTensorSVDInfo_t object to keep track of runtime truncation information.
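As a condensed sketch of that allocation and descriptor step (the byte sizes sizeT, sizeU, sizeS, sizeV are assumed to be computed from the extents as in the QR example):

   // Number of modes per tensor.
   const int32_t numModesIn = modesT.size();
   const int32_t numModesU  = modesU.size();
   const int32_t numModesV  = modesV.size();

   // Allocate device buffers for the input tensor and the SVD outputs.
   void *D_T, *D_U, *D_S, *D_V;
   HANDLE_CUDA_ERROR( cudaMalloc(&D_T, sizeT) );
   HANDLE_CUDA_ERROR( cudaMalloc(&D_U, sizeU) );
   HANDLE_CUDA_ERROR( cudaMalloc(&D_S, sizeS) );
   HANDLE_CUDA_ERROR( cudaMalloc(&D_V, sizeV) );

   // Create tensor descriptors; strides = NULL selects the default
   // generalized column-major layout.
   const int64_t* strides = NULL;
   cutensornetTensorDescriptor_t descTensorIn, descTensorU, descTensorV;
   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) );
   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) );
   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) );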

220   /********************************
221   * Setup SVD truncation parameters
222   *********************************/
223
224   cutensornetTensorSVDConfig_t svdConfig;
225   HANDLE_ERROR( cutensornetCreateTensorSVDConfig(handle, &svdConfig) );
226   double absCutoff = 1e-2;
227   HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, 
228                                          svdConfig, 
229                                          CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF, 
230                                          &absCutoff, 
231                                          sizeof(absCutoff)) );
232   double relCutoff = 4e-2;
233   HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle, 
234                                          svdConfig, 
235                                          CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF, 
236                                          &relCutoff, 
237                                          sizeof(relCutoff)) );
238   
239   /********************************************************
240   * Create SVDInfo to record runtime SVD truncation details
241   *********************************************************/
242
243   cutensornetTensorSVDInfo_t svdInfo; 
244   HANDLE_ERROR( cutensornetCreateTensorSVDInfo(handle, &svdInfo)) ;
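Beyond the cutoff-based truncation set above, cutensornetTensorSVDConfig_t exposes further options. As an illustration (not enabled in this example; a sketch whose enumerators should be checked against the cuTensorNet headers of your release), the singular values could be partitioned equally onto U and V instead of being returned separately:

   // Optionally absorb the singular values into the U and V outputs
   // (split equally between the two) rather than returning them in S.
   cutensornetTensorSVDPartition_t partition = CUTENSORNET_TENSOR_SVD_PARTITION_UV_EQUAL;
   HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle,
                                          svdConfig,
                                          CUTENSORNET_TENSOR_SVD_CONFIG_S_PARTITION,
                                          &partition,
                                          sizeof(partition)) );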

Execution

Next, we query and allocate the workspace with cutensornetWorkspaceComputeSVDSizes(), very much like its QR counterpart. With the workspace in place, we perform the SVD decomposition by calling cutensornetTensorSVD().
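A condensed sketch of that workspace step (the workspace helper functions below follow recent cuTensorNet releases and may differ in older versions):

   // Query the required scratch size for the SVD and bind a device buffer to workDesc.
   cutensornetWorkspaceDescriptor_t workDesc;
   HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
   HANDLE_ERROR( cutensornetWorkspaceComputeSVDSizes(handle, descTensorIn, descTensorU, descTensorV, svdConfig, workDesc) );
   int64_t requiredWorkspaceSize{0};
   HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                   workDesc,
                                                   CUTENSORNET_WORKSIZE_PREF_MIN,
                                                   CUTENSORNET_MEMSPACE_DEVICE,
                                                   CUTENSORNET_WORKSPACE_SCRATCH,
                                                   &requiredWorkspaceSize) );
   void *devWork = nullptr;
   HANDLE_CUDA_ERROR( cudaMalloc(&devWork, requiredWorkspaceSize) );
   HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                               workDesc,
                                               CUTENSORNET_MEMSPACE_DEVICE,
                                               CUTENSORNET_WORKSPACE_SCRATCH,
                                               devWork,
                                               requiredWorkspaceSize) );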

290   /**********
291   * Execution
292   ***********/
293  
294   GPUTimer timer{stream};
295   double minTimeCUTENSOR = 1e100;
296   const int numRuns = 3; // to get stable perf results
297   for (int i=0; i < numRuns; ++i)
298   {  
299      // restore output
300      cudaMemsetAsync(D_U, 0, sizeU, stream);
301      cudaMemsetAsync(D_S, 0, sizeS, stream);
302      cudaMemsetAsync(D_V, 0, sizeV, stream);
303      cudaDeviceSynchronize();
304      
305      // With value-based truncation, `cutensornetTensorSVD` can potentially update the shared extent in descTensorU/V.
306      // Here we restore descTensorU/V to the original problem before each run.
307      HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) );
308      HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) );
309      HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) );
310      HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) );
311
312      timer.start();
313      HANDLE_ERROR( cutensornetTensorSVD(handle, 
314                        descTensorIn, D_T, 
315                        descTensorU, D_U, 
316                        D_S, 
317                        descTensorV, D_V, 
318                        svdConfig, 
319                        svdInfo,
320                        workDesc,
321                        stream) );
322      // Synchronize and measure timing
323      auto time = timer.seconds();
324      minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
325   }
326
327   printf("Performing SVD\n");
328
329   HANDLE_CUDA_ERROR( cudaMemcpyAsync(U, D_U, sizeU, cudaMemcpyDeviceToHost) );
330   HANDLE_CUDA_ERROR( cudaMemcpyAsync(S, D_S, sizeS, cudaMemcpyDeviceToHost) );
331   HANDLE_CUDA_ERROR( cudaMemcpyAsync(V, D_V, sizeV, cudaMemcpyDeviceToHost) );

Note

Since we enabled value-based truncation options in this example, we need to restore the tensor descriptors for U and V if we wish to perform the same computation multiple times.
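The runtime truncation details recorded in svdInfo can then be read back with cutensornetTensorSVDInfoGetAttribute(). A short sketch (attribute names as in the cuTensorNet headers):

   // Query how much truncation actually happened at runtime.
   int64_t reducedExtent{0};
   double discardedWeight{0.0};
   HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute(handle, svdInfo,
                     CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT, &reducedExtent, sizeof(reducedExtent)) );
   HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute(handle, svdInfo,
                     CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT, &discardedWeight, sizeof(discardedWeight)) );
   printf("Reduced shared extent: %ld, discarded weight: %.6f\n", (long)reducedExtent, discardedWeight);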

After the computation, we still need to free up all resources.
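A sketch of that teardown, mirroring the QR example and assuming the device buffers and workspace named above, might look like:

   // Destroy the cuTensorNet objects and free the device buffers.
   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) );
   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) );
   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) );
   HANDLE_ERROR( cutensornetDestroyTensorSVDConfig(svdConfig) );
   HANDLE_ERROR( cutensornetDestroyTensorSVDInfo(svdInfo) );
   HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
   HANDLE_ERROR( cutensornetDestroy(handle) );
   HANDLE_CUDA_ERROR( cudaFree(D_T) );
   HANDLE_CUDA_ERROR( cudaFree(D_U) );
   HANDLE_CUDA_ERROR( cudaFree(D_S) );
   HANDLE_CUDA_ERROR( cudaFree(D_V) );
   HANDLE_CUDA_ERROR( cudaFree(devWork) );
   // (the host buffers T, U, S, V from the full sample would be freed here as well)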