Performing a gate split operation with cuTensorNet follows a workflow very similar to that of the QR example and the SVD example. Here, we highlight the notable differences between the two APIs. The full code can be found in the NVIDIA/cuQuantum repository (here).

Define tensor operands

As with QR/SVD decomposition, we first define all the tensor operands by specifying the data type, the mode partition, and the extents. In this process, we choose to perform fixed-extent truncation to a size of 16.

 83   /************************************************************************************
 84   * Gate Split: A_{i,j,k,l} B_{k,o,p,q} G_{m,n,l,o}-> A'_{i,j,x,m} S_{x} B'_{x,n,p,q}  
 85   *************************************************************************************/
 86   typedef float floatType; // element type alias; single precision, consistent with CUDA_R_32F below
 87   cudaDataType_t typeData = CUDA_R_32F; // data type passed to the tensor descriptors
 88   cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_32F; // compute type passed to the gate split call
 89
 90   // Create vector of modes (each mode is identified by a character label)
 91   std::vector<int32_t> modesAIn{'i','j','k','l'}; // input A; shares 'k' with B and 'l' with G
 92   std::vector<int32_t> modesBIn{'k','o','p','q'}; // input B; shares 'k' with A and 'o' with G
 93   std::vector<int32_t> modesGIn{'m','n','l','o'}; // input, G is the gate operator; 'm' and 'n' reappear in the outputs
 94
 95   std::vector<int32_t> modesAOut{'i','j','x','m'}; // output A'; 'x' is the new mode shared with B'
 96   std::vector<int32_t> modesBOut{'x','n','p','q'}; // SVD output B'; 'x' is the new mode shared with A'
 97
 98   // Extents: size of every mode, keyed by its mode label
 99   std::unordered_map<int32_t, int64_t> extent;
100   extent['i'] = 16;
101   extent['j'] = 16;
102   extent['k'] = 16;
103   extent['l'] = 2;
104   extent['m'] = 2;
105   extent['n'] = 2;
106   extent['o'] = 2;
107   extent['p'] = 16;
108   extent['q'] = 16;
109   
110   const int64_t maxExtent = 16; // truncate the shared output mode to a maximal extent of 16
111   extent['x'] = maxExtent; // 'x' is the mode shared by the outputs A' and B' (and the index of S)
112
113   // Create a vector of extents for each tensor, in the same order as that tensor's mode vector
114   std::vector<int64_t> extentAIn;
115   for (auto mode : modesAIn)
116      extentAIn.push_back(extent[mode]);
117   std::vector<int64_t> extentBIn;
118   for (auto mode : modesBIn)
119      extentBIn.push_back(extent[mode]);
120   std::vector<int64_t> extentGIn;
121   for (auto mode : modesGIn)
122      extentGIn.push_back(extent[mode]);
123   std::vector<int64_t> extentAOut;
124   for (auto mode : modesAOut)
125      extentAOut.push_back(extent[mode]);
126   std::vector<int64_t> extentBOut;
127   for (auto mode : modesBOut)
128      extentBOut.push_back(extent[mode]);
129   

Execution

Similar to the SVD example, we can specify the SVD options in cutensornetTensorSVDConfig_t. The workspace size query is performed by calling cutensornetWorkspaceComputeGateSplitSizes() with the provided cutensornetGateSplitAlgo_t. Finally, we can execute the gate split computation by calling cutensornetGateSplit().

296   /**********************
297   * Execution
298   **********************/
299
300   GPUTimer timer{stream}; // timer constructed with the same stream the gate split is submitted to
301   double minTimeCUTENSOR = 1e100; // smallest time observed so far; starts at a huge sentinel value
302   const int numRuns = 3; // to get stable perf results
303   for (int i=0; i < numRuns; ++i)
304   {  
305      // restore output: zero the three output buffers on `stream` (NOTE(review): these return codes are not checked)
306      cudaMemsetAsync(D_AOut, 0, sizeAOut, stream);
307      cudaMemsetAsync(D_S, 0, sizeS, stream);
308      cudaMemsetAsync(D_BOut, 0, sizeBOut, stream);
309
310      // With value-based truncation, `cutensornetGateSplit` can potentially update the shared extent in descTensorA/BOut.
311      // We here restore descTensorA/BOut to the original problem by destroying and re-creating both descriptors
312      // from the original extentAOut/extentBOut vectors.
312      HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorAOut) );
313      HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorBOut) );
314      HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesAOut, extentAOut.data(), strides, modesAOut.data(), typeData, &descTensorAOut) );
315      HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesBOut, extentBOut.data(), strides, modesBOut.data(), typeData, &descTensorBOut) );
316
317      cudaDeviceSynchronize(); // wait for all pending device work (e.g. the memsets above) before starting the timer
318      timer.start();
319      HANDLE_ERROR( cutensornetGateSplit(handle, 
320                                         descTensorAIn, D_AIn,
321                                         descTensorBIn, D_BIn,
322                                         descTensorGIn, D_GIn,
323                                         descTensorAOut, D_AOut,
324                                         D_S,
325                                         descTensorBOut, D_BOut,
326                                         gateAlgo,
327                                         svdConfig, typeCompute, svdInfo, 
328                                         workDesc, stream) );
329      // Synchronize and measure timing
330      auto time = timer.seconds();
331      minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time; // keep the fastest of the runs
332   }
333
334   printf("Performing Gate Split\n");

Note

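As with cutensornetTensorSVD(), because weighted (value-based) truncation options are turned on in this example, cutensornetGateSplit() can potentially update the shared extent in the tensor descriptors for outputs A and B. These descriptors therefore need to be restored if we wish to perform the same computation multiple times.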

After the computation, we always need to free up all resources.