Performing a tensor SVD with cuTensorNet follows a workflow very similar to the QR example. Here we highlight the notable differences between the two APIs. The full code can be found in the NVIDIA/cuQuantum repository (here).

Define SVD decomposition

As with QR decomposition, we first define the SVD decomposition to perform by specifying the data type, the mode partition, and the extents.

/******************************************************
* Tensor SVD: T_{i,j,m,n} -> U_{i,x,m} S_{x} V_{n,x,j}
*******************************************************/

typedef float floatType;
cudaDataType_t typeData = CUDA_R_32F;

// Create vectors of modes
int32_t sharedMode = 'x';

std::vector<int32_t> modesT{'i','j','m','n'};      // input
std::vector<int32_t> modesU{'i', sharedMode,'m'};  // SVD output
std::vector<int32_t> modesV{'n', sharedMode,'j'};  // SVD output

// Extents
std::unordered_map<int32_t, int64_t> extentMap;
extentMap['i'] = 16;
extentMap['j'] = 16;
extentMap['m'] = 16;
extentMap['n'] = 16;

int64_t rowExtent = computeCombinedExtent(extentMap, modesU);
int64_t colExtent = computeCombinedExtent(extentMap, modesV);
// cuTensorNet tensor SVD operates in reduced mode, expecting k <= min(m, n)
int64_t fullSharedExtent = rowExtent <= colExtent ? rowExtent : colExtent;
const int64_t maxExtent = fullSharedExtent / 2;  // fixed extent truncation: half of the singular values are trimmed out
extentMap[sharedMode] = maxExtent;

// Create a vector of extents for each tensor
std::vector<int64_t> extentT;
for (auto mode : modesT)
   extentT.push_back(extentMap[mode]);
std::vector<int64_t> extentU;
for (auto mode : modesU)
   extentU.push_back(extentMap[mode]);
std::vector<int64_t> extentV;
for (auto mode : modesV)
   extentV.push_back(extentMap[mode]);

Note

To perform fixed extent truncation, we directly set maxExtent to half of the full extent that an exact SVD would yield.
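
The helper computeCombinedExtent used above is defined in the full sample. A minimal sketch of its intended behavior (the product of the extents of the given modes, skipping any mode not yet present in the map, such as the shared mode) could look like the following; this is an illustrative implementation, not the sample's exact code.

// Sketch: combine the extents of the given modes into the matricized
// row/column dimension. Modes absent from the map (e.g. the shared mode
// before its extent is set) are skipped.
int64_t computeCombinedExtent(const std::unordered_map<int32_t, int64_t>& extentMap,
                              const std::vector<int32_t>& modes)
{
   int64_t combined = 1;
   for (auto mode : modes)
   {
      auto it = extentMap.find(mode);
      if (it != extentMap.end())
         combined *= it->second;
   }
   return combined;
}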

Setup SVD truncation parameters

Once the SVD decomposition is defined, we can follow the same workflow as in the QR example for data allocation and tensor descriptor initialization. Before querying the workspace size, we can choose different SVD options via cutensornetTensorSVDConfig_t. We also create a cutensornetTensorSVDInfo_t object to keep track of runtime truncation information.

/********************************
* Setup SVD truncation parameters
*********************************/

cutensornetTensorSVDConfig_t svdConfig;
HANDLE_ERROR( cutensornetCreateTensorSVDConfig(handle, &svdConfig) );
double absCutoff = 1e-2;
HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle,
                                       svdConfig,
                                       CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF,
                                       &absCutoff,
                                       sizeof(absCutoff)) );
double relCutoff = 4e-2;
HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle,
                                       svdConfig,
                                       CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF,
                                       &relCutoff,
                                       sizeof(relCutoff)) );

/********************************************************
* Create SVDInfo to record runtime SVD truncation details
*********************************************************/

cutensornetTensorSVDInfo_t svdInfo;
HANDLE_ERROR( cutensornetCreateTensorSVDInfo(handle, &svdInfo) );

Execution

Next, we query the required workspace sizes with cutensornetWorkspaceComputeSVDSizes(), which is very similar to its QR counterpart, and allocate the workspace accordingly. We can then perform the SVD decomposition by calling cutensornetTensorSVD().
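
The workspace preparation itself is not part of the excerpt below. A rough sketch of that step is shown here; the call sequence follows the general cuTensorNet workspace API, and the scratch buffer name devWork is an assumption rather than something taken from the sample.

// Sketch of the workspace query/allocation step (not shown in the excerpt below).
HANDLE_ERROR( cutensornetWorkspaceComputeSVDSizes(handle, descTensorIn, descTensorU, descTensorV, svdConfig, workDesc) );
int64_t requiredWorkspaceSize = 0;
HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                                workDesc,
                                                CUTENSORNET_WORKSIZE_PREF_MIN,
                                                CUTENSORNET_MEMSPACE_DEVICE,
                                                CUTENSORNET_WORKSPACE_SCRATCH,
                                                &requiredWorkspaceSize) );
void* devWork = nullptr;  // assumed name for the device scratch buffer
HANDLE_CUDA_ERROR( cudaMalloc(&devWork, requiredWorkspaceSize) );
HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                            workDesc,
                                            CUTENSORNET_MEMSPACE_DEVICE,
                                            CUTENSORNET_WORKSPACE_SCRATCH,
                                            devWork,
                                            requiredWorkspaceSize) );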

/**********
* Execution
***********/

GPUTimer timer{stream};
double minTimeCUTENSOR = 1e100;
const int numRuns = 3; // to get stable perf results
for (int i = 0; i < numRuns; ++i)
{
   // Reset the output buffers
   cudaMemsetAsync(D_U, 0, sizeU, stream);
   cudaMemsetAsync(D_S, 0, sizeS, stream);
   cudaMemsetAsync(D_V, 0, sizeV, stream);
   cudaDeviceSynchronize();

   // With value-based truncation, `cutensornetTensorSVD` can potentially update the shared extent in descTensorU/V.
   // Here we restore descTensorU/V to the original problem.
   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) );
   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) );
   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) );
   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) );

   timer.start();
   HANDLE_ERROR( cutensornetTensorSVD(handle,
                     descTensorIn, D_T,
                     descTensorU, D_U,
                     D_S,
                     descTensorV, D_V,
                     svdConfig,
                     svdInfo,
                     workDesc,
                     stream) );
   // Synchronize and measure timing
   auto time = timer.seconds();
   minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
}

printf("Performing SVD\n");

HANDLE_CUDA_ERROR( cudaMemcpyAsync(U, D_U, sizeU, cudaMemcpyDeviceToHost) );
HANDLE_CUDA_ERROR( cudaMemcpyAsync(S, D_S, sizeS, cudaMemcpyDeviceToHost) );
HANDLE_CUDA_ERROR( cudaMemcpyAsync(V, D_V, sizeV, cudaMemcpyDeviceToHost) );

Note

Since we turned on value-based truncation options in this example, we need to restore the tensor descriptors for U and V if we wish to perform the same computation multiple times.
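
The runtime truncation details recorded in svdInfo can also be read back after the call with cutensornetTensorSVDInfoGetAttribute(). The brief sketch below assumes the attribute names shown; treat them as assumptions and check the headers for the exact enum values.

// Sketch: read back runtime truncation details recorded in svdInfo
// (attribute names assumed; verify against cutensornetTensorSVDInfoAttributes_t).
int64_t fullExtent = 0, reducedExtent = 0;
double discardedWeight = 0.0;
HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute(handle, svdInfo,
                  CUTENSORNET_TENSOR_SVD_INFO_FULL_EXTENT, &fullExtent, sizeof(fullExtent)) );
HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute(handle, svdInfo,
                  CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT, &reducedExtent, sizeof(reducedExtent)) );
HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute(handle, svdInfo,
                  CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT, &discardedWeight, sizeof(discardedWeight)) );
printf("Shared extent: full %ld, reduced to %ld (discarded weight %.6f)\n",
       (long)fullExtent, (long)reducedExtent, discardedWeight);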

After the computation, we still need to free up all resources.
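
A sketch of that cleanup, using the handles and buffers named in the excerpts above, is shown below; any buffer not shown in the excerpts (for example, the workspace scratch allocation) would be freed the same way.

// Sketch of resource cleanup, assuming the variable names used in the excerpts above.
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) );
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) );
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) );
HANDLE_ERROR( cutensornetDestroyTensorSVDConfig(svdConfig) );
HANDLE_ERROR( cutensornetDestroyTensorSVDInfo(svdInfo) );
HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
HANDLE_ERROR( cutensornetDestroy(handle) );

HANDLE_CUDA_ERROR( cudaFree(D_T) );
HANDLE_CUDA_ERROR( cudaFree(D_U) );
HANDLE_CUDA_ERROR( cudaFree(D_S) );
HANDLE_CUDA_ERROR( cudaFree(D_V) );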