Performing a tensor SVD with cuTensorNet follows a workflow very similar to the QR example. Here we highlight the notable differences between the two APIs. The full code can be found in the NVIDIA/cuQuantum repository (here).

Define SVD decomposition

As with QR decomposition, we first define the SVD decomposition to perform by specifying the data type, the mode partition, and the extents.

/******************************************************
* Tensor SVD: T_{i,j,m,n} -> U_{i,x,m} S_{x} V_{n,x,j}
*******************************************************/

typedef float floatType;
cudaDataType_t typeData = CUDA_R_32F;

// Create vectors of modes
int32_t sharedMode = 'x';

std::vector<int32_t> modesT{'i','j','m','n'};      // input
std::vector<int32_t> modesU{'i', sharedMode,'m'};  // SVD output
std::vector<int32_t> modesV{'n', sharedMode,'j'};  // SVD output

// Extents
std::unordered_map<int32_t, int64_t> extentMap;
extentMap['i'] = 16;
extentMap['j'] = 16;
extentMap['m'] = 16;
extentMap['n'] = 16;

int64_t rowExtent = computeCombinedExtent(extentMap, modesU);
int64_t colExtent = computeCombinedExtent(extentMap, modesV);
// cuTensorNet tensor SVD operates in reduced mode, expecting k <= min(m, n)
int64_t fullSharedExtent = rowExtent <= colExtent ? rowExtent : colExtent;
const int64_t maxExtent = fullSharedExtent / 2;  // fixed extent truncation: half of the singular values are trimmed out
extentMap[sharedMode] = maxExtent;

// Create a vector of extents for each tensor
std::vector<int64_t> extentT;
for (auto mode : modesT)
   extentT.push_back(extentMap[mode]);
std::vector<int64_t> extentU;
for (auto mode : modesU)
   extentU.push_back(extentMap[mode]);
std::vector<int64_t> extentV;
for (auto mode : modesV)
   extentV.push_back(extentMap[mode]);

Note

To perform fixed extent truncation, we directly set maxExtent to half of the full extent that an exact SVD would yield.
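
The helper computeCombinedExtent used above is defined in the full sample. A minimal sketch of its intended behavior (the product of the extents of the given modes, skipping any mode not yet present in the map, such as the shared mode) could look like the following; this is an illustrative implementation, not the sample's exact code.

// Sketch: combine the extents of the given modes into the matricized
// row/column dimension. Modes absent from the map (e.g. the shared mode
// before its extent is set) are skipped.
int64_t computeCombinedExtent(const std::unordered_map<int32_t, int64_t>& extentMap,
                              const std::vector<int32_t>& modes)
{
   int64_t combined = 1;
   for (auto mode : modes)
   {
      auto it = extentMap.find(mode);
      if (it != extentMap.end())
         combined *= it->second;
   }
   return combined;
}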

Setup SVD truncation parameters

Once the SVD decomposition is defined, we can follow the same workflow as in the QR example for data allocation and tensor descriptor initialization. Before querying the workspace size, we can choose different SVD options via cutensornetTensorSVDConfig_t. We also create a cutensornetTensorSVDInfo_t object to keep track of runtime truncation information.

/********************************
* Setup SVD truncation parameters
*********************************/

cutensornetTensorSVDConfig_t svdConfig;
HANDLE_ERROR( cutensornetCreateTensorSVDConfig(handle, &svdConfig) );
double absCutoff = 1e-2;
HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle,
                                       svdConfig,
                                       CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF,
                                       &absCutoff,
                                       sizeof(absCutoff)) );
double relCutoff = 4e-2;
HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle,
                                       svdConfig,
                                       CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF,
                                       &relCutoff,
                                       sizeof(relCutoff)) );

/********************************************************
* Create SVDInfo to record runtime SVD truncation details
*********************************************************/

cutensornetTensorSVDInfo_t svdInfo;
HANDLE_ERROR( cutensornetCreateTensorSVDInfo(handle, &svdInfo) );

Execution

Next, we query the required workspace sizes with cutensornetWorkspaceComputeSVDSizes(), which is very similar to its QR counterpart, and allocate the workspace accordingly. We can then perform the SVD decomposition by calling cutensornetTensorSVD().
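
The workspace preparation itself is not part of the excerpt below. A rough sketch of that step is shown here; the call sequence follows the general cuTensorNet workspace API, and the scratch buffer name devWork is an assumption rather than something taken from the sample.

// Sketch of the workspace query/allocation step (not shown in the excerpt below).
HANDLE_ERROR( cutensornetWorkspaceComputeSVDSizes(handle, descTensorIn, descTensorU, descTensorV, svdConfig, workDesc) );
int64_t requiredWorkspaceSize = 0;
HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                workDesc,
                                                CUTENSORNET_WORKSIZE_PREF_MIN,
                                                CUTENSORNET_MEMSPACE_DEVICE,
                                                CUTENSORNET_WORKSPACE_SCRATCH,
                                                &requiredWorkspaceSize) );
void* devWork = nullptr;  // assumed name for the device scratch buffer
HANDLE_CUDA_ERROR( cudaMalloc(&devWork, requiredWorkspaceSize) );
HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                            workDesc,
                                            CUTENSORNET_MEMSPACE_DEVICE,
                                            CUTENSORNET_WORKSPACE_SCRATCH,
                                            devWork,
                                            requiredWorkspaceSize) );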

/**********
* Execution
***********/

GPUTimer timer{stream};
double minTimeCUTENSOR = 1e100;
const int numRuns = 3; // to get stable perf results
for (int i = 0; i < numRuns; ++i)
{
   // Reset the output buffers
   cudaMemsetAsync(D_U, 0, sizeU, stream);
   cudaMemsetAsync(D_S, 0, sizeS, stream);
   cudaMemsetAsync(D_V, 0, sizeV, stream);
   cudaDeviceSynchronize();

   // With value-based truncation, `cutensornetTensorSVD` can potentially update the shared extent in descTensorU/V.
   // Here we restore descTensorU/V to the original problem.
   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) );
   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) );
   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) );
   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) );

   timer.start();
   HANDLE_ERROR( cutensornetTensorSVD(handle,
                     descTensorIn, D_T,
                     descTensorU, D_U,
                     D_S,
                     descTensorV, D_V,
                     svdConfig,
                     svdInfo,
                     workDesc,
                     stream) );
   // Synchronize and measure timing
   auto time = timer.seconds();
   minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
}

printf("Performing SVD\n");

HANDLE_CUDA_ERROR( cudaMemcpyAsync(U, D_U, sizeU, cudaMemcpyDeviceToHost) );
HANDLE_CUDA_ERROR( cudaMemcpyAsync(S, D_S, sizeS, cudaMemcpyDeviceToHost) );
HANDLE_CUDA_ERROR( cudaMemcpyAsync(V, D_V, sizeV, cudaMemcpyDeviceToHost) );

Note

Since we turned on value-based truncation options in this example, we need to restore the tensor descriptors for U and V if we wish to perform the same computation multiple times.
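
The runtime truncation details recorded in svdInfo can also be read back after the call with cutensornetTensorSVDInfoGetAttribute(). The brief sketch below assumes the attribute names shown; treat them as assumptions and check the headers for the exact enum values.

// Sketch: read back runtime truncation details recorded in svdInfo
// (attribute names assumed; verify against cutensornetTensorSVDInfoAttributes_t).
int64_t fullExtent = 0, reducedExtent = 0;
double discardedWeight = 0.0;
HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute(handle, svdInfo,
                  CUTENSORNET_TENSOR_SVD_INFO_FULL_EXTENT, &fullExtent, sizeof(fullExtent)) );
HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute(handle, svdInfo,
                  CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT, &reducedExtent, sizeof(reducedExtent)) );
HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute(handle, svdInfo,
                  CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT, &discardedWeight, sizeof(discardedWeight)) );
printf("Shared extent: full %ld, reduced to %ld (discarded weight %.6f)\n",
       (long)fullExtent, (long)reducedExtent, discardedWeight);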

After the computation, we still need to free up all resources.
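
A sketch of that cleanup, using the handles and buffers named in the excerpts above, is shown below; any buffer not shown in the excerpts (for example, the workspace scratch allocation) would be freed the same way.

// Sketch of resource cleanup, assuming the variable names used in the excerpts above.
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) );
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) );
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) );
HANDLE_ERROR( cutensornetDestroyTensorSVDConfig(svdConfig) );
HANDLE_ERROR( cutensornetDestroyTensorSVDInfo(svdInfo) );
HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
HANDLE_ERROR( cutensornetDestroy(handle) );

HANDLE_CUDA_ERROR( cudaFree(D_T) );
HANDLE_CUDA_ERROR( cudaFree(D_U) );
HANDLE_CUDA_ERROR( cudaFree(D_S) );
HANDLE_CUDA_ERROR( cudaFree(D_V) );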