The following code example illustrates how to integrate cuTensorNet functionalities to perform basic MPS simulation. The workflow is encapsulated in an MPSHelper class. The full code can be found in the NVIDIA/cuQuantum repository (here).

Define MPSHelper class¶

We first define an MPSHelper class to keep track of the modes and extents of all physical and virtual bonds. The simulation settings are also stored in this class. Once the object goes out of scope, all resources owned by this class will be freed.

class MPSHelper
{
   public:
      /**
       * \brief Construct an MPSHelper object for gate splitting algorithm.
       *        i       j       k
       *     -------A-------B-------                      i        j        k
       *           p|       |q            ------->     -------A`-------B`-------
       *            GGGGGGGGG                                r|        |s
       *           r|       |s
       * \param[in] numSites The number of sites in the MPS
       * \param[in] physExtent The extent for the physical mode where the gate tensors are acted on. 
       * \param[in] maxVirtualExtent The maximal extent allowed for the virtual mode shared between adjacent MPS tensors. 
       * \param[in] initialVirtualExtents A vector of size \p numSites-1 where the ith element denotes the extent of the shared mode for site i and site i+1 in the beginning of the simulation.
       * \param[in] typeData The data type for all tensors and gates
       * \param[in] typeCompute The compute type for all gate splitting process
       */
      MPSHelper(int32_t numSites, 
                int64_t physExtent,
                int64_t maxVirtualExtent,
                const std::vector<int64_t>& initialVirtualExtents,
                cudaDataType_t typeData, 
                cutensornetComputeType_t typeCompute);

      /**
       * \brief Copying is disabled: the object owns raw cuTensorNet resources that are
       *        released in the destructor, so a copy would release them twice.
       */
      MPSHelper(const MPSHelper&) = delete;
      MPSHelper& operator=(const MPSHelper&) = delete;
      
      /**
       * \brief Initialize the MPS metadata and cutensornet library.
       */
      cutensornetStatus_t initialize();

      /**
       * \brief Compute the maximal number of elements for each site.
       */
      std::vector<size_t> getMaxTensorElements() const;

      /**
       * \brief Update the SVD truncation setting.
       * \param[in] absCutoff The cutoff value for absolute singular value truncation.
       * \param[in] relCutoff The cutoff value for relative singular value truncation.
       * \param[in] renorm The option for renormalization of the truncated singular values.
       * \param[in] partition The option for partitioning of the singular values. 
       */
      cutensornetStatus_t setSVDConfig(double absCutoff, 
                                       double relCutoff, 
                                       cutensornetTensorSVDNormalization_t renorm,
                                       cutensornetTensorSVDPartition_t partition);

      /**
       * \brief Update the algorithm to use for the gating process.
       * \param[in] gateAlgo The gate algorithm to use for MPS simulation.
       */
      void setGateAlgorithm(cutensornetGateSplitAlgo_t gateAlgo) {gateAlgo_ = gateAlgo;}

      /**
       * \brief Compute the maximal workspace needed for MPS gating algorithm.
       * \param[out] workspaceSize The required workspace size on the device. 
       */
      cutensornetStatus_t computeMaxWorkspaceSizes(int64_t* workspaceSize);

      /**
       * \brief Set the workspace buffer to be used by the MPS gating algorithm.
       * \param[in] work Pointer to the allocated workspace.
       * \param[in] workspaceSize The size of the allocated workspace on the device. 
       */
      cutensornetStatus_t setWorkspace(void* work, int64_t workspaceSize);

      /**
       * \brief In-place execution of the apply gate algorithm on \p siteA and \p siteB.
       * \param[in] siteA The first site where the gate is applied to.
       * \param[in] siteB The second site where the gate is applied to. Must be adjacent to \p siteA.
       * \param[in,out] dataInA The data for the MPS tensor at \p siteA. The input will be overwritten with output mps tensor data.
       * \param[in,out] dataInB The data for the MPS tensor at \p siteB. The input will be overwritten with output mps tensor data.
       * \param[in] dataInG The input data for the gate tensor. 
       * \param[in] verbose Whether to print out the runtime information regarding truncation. 
       * \param[in] stream The CUDA stream on which the computation is performed.
       */
      cutensornetStatus_t applyGate(uint32_t siteA, 
                                    uint32_t siteB, 
                                    void* dataInA, 
                                    void* dataInB, 
                                    const void* dataInG, 
                                    bool verbose,
                                    cudaStream_t stream);
      
      /**
       * \brief Free all cuTensorNet objects owned by this helper: the tensor descriptors,
       *        the workspace descriptor, the SVD config/info objects and the library handle.
       */
      ~MPSHelper()
      {
         // Objects tied to the library handle are released first; the handle goes last.
         if (inited_)
         {
            for (auto& descTensor: descTensors_)
            {
               cutensornetDestroyTensorDescriptor(descTensor);
            }
            cutensornetDestroyWorkspaceDescriptor(workDesc_);
         }
         if (svdConfig_ != nullptr)
         {
            cutensornetDestroyTensorSVDConfig(svdConfig_);
         }
         if (svdInfo_ != nullptr)
         {
            cutensornetDestroyTensorSVDInfo(svdInfo_);
         }
         if (inited_)
         {
            cutensornetDestroy(handle_);
         }
      }

   private:
      int32_t numSites_; ///< Number of sites in the MPS
      int64_t physExtent_; ///< Extent for the physical index 
      int64_t maxVirtualExtent_{0}; ///< The maximal extent allowed for the virtual dimension
      cudaDataType_t typeData_; ///< The data type for all tensors and gates
      cutensornetComputeType_t typeCompute_; ///< The compute type for all gate splitting process
      
      bool inited_{false}; ///< Guards destruction of the handle, the tensor descriptors and the workspace descriptor
      std::vector<int32_t> physModes_; ///< A vector of length \p numSites_ storing the physical mode of each site.
      std::vector<int32_t> virtualModes_; ///< A vector of length \p numSites_+1; For site i, virtualModes_[i] and virtualModes_[i+1] represents the left and right virtual mode.
      std::vector<int64_t> extentsPerSite_; ///< A vector of length \p numSites_+1; For site i, extentsPerSite_[i] and extentsPerSite_[i+1] represents the left and right virtual extent. 

      cutensornetHandle_t handle_{nullptr}; ///< The cutensornet library handle
      std::vector<cutensornetTensorDescriptor_t> descTensors_; ///< A vector of length \p numSites_ storing the cutensornetTensorDescriptor_t for each site
      cutensornetWorkspaceDescriptor_t workDesc_{nullptr};
      cutensornetTensorSVDConfig_t svdConfig_{nullptr};
      cutensornetTensorSVDInfo_t svdInfo_{nullptr};
      cutensornetGateSplitAlgo_t gateAlgo_{CUTENSORNET_GATE_SPLIT_ALGO_DIRECT};
      int32_t nextMode_{0}; ///< The next mode label to use for labelling site tensors and gates.
};

Note

For full definition of all the methods, please refer to the sample here.

Setup MPS simulation setting¶

Next, in the main function, we need to choose the simulation setting for the MPS simulation (i.e., the number of sites, the initial extents, and the data type).

   /***********************************
   * Step 1: basic MPS setup
   ************************************/

   // numeric configuration: host-side element type plus the matching data/compute type enums
   using complexType = std::complex<double>;
   cudaDataType_t typeData = CUDA_C_64F;
   cutensornetComputeType_t typeCompute = CUTENSORNET_COMPUTE_64F;

   // MPS geometry: number of sites, physical extent and the cap on the virtual extent
   int32_t numSites = 16;
   int64_t physExtent = 2;
   int64_t maxVirtualExtent = 12;

   // every shared (virtual) mode between neighbouring sites starts with extent 1
   const std::vector<int64_t> initialVirtualExtents(numSites-1, 1);

   // the helper owns the tensor metadata and updates it as the simulation proceeds
   MPSHelper mpsHelper(numSites, physExtent, maxVirtualExtent, initialVirtualExtents, typeData, typeCompute);
   HANDLE_ERROR( mpsHelper.initialize() );

The MPS metadata and all cuTensorNet library objects will be managed by the MPSHelper while the data pointers are explicitly managed in the main function.

Allocate memory and initialize data¶

Next, we allocate memory for the MPS operands and four 2-qubit gate tensors. The largest tensor size for each MPS tensor can be queried through the MPSHelper class. The MPS tensors are initialized to a state corresponding to |00..000> and the gate tensors are filled with random values.

   /***********************************
   * Step 2: data allocation 
   ************************************/

   // query largest tensor sizes for the MPS
   const std::vector<size_t> maxElementsPerSite = mpsHelper.getMaxTensorElements();
   std::vector<void*> tensors_h;
   std::vector<void*> tensors_d;
   for (int32_t i=0; i<numSites; i++)
   {
      // each site buffer is sized for the largest tensor that site can hold
      size_t maxSize = sizeof(complexType) * maxElementsPerSite.at(i);
      void* data_h = malloc(maxSize);
      if (data_h == nullptr)
      {
         // bail out before memset / the write below touch a null pointer
         std::cerr << "Failed to allocate " << maxSize << " bytes of host memory for MPS tensor " << i << std::endl;
         return EXIT_FAILURE;
      }
      memset(data_h, 0, maxSize);
      // initialize state to |0000..0000>
      *(complexType*)(data_h) = complexType(1,0);  
      void* data_d;
      HANDLE_CUDA_ERROR( cudaMalloc(&data_d, maxSize) );
      // data transfer from host to device
      HANDLE_CUDA_ERROR( cudaMemcpy(data_d, data_h, maxSize, cudaMemcpyHostToDevice) );
      tensors_h.push_back(data_h);
      tensors_d.push_back(data_d);
   }

   // initialize 4 random gate tensors on host and copy them to device
   const int32_t numRandomGates = 4;
   const int64_t numGateElements = physExtent * physExtent * physExtent * physExtent;  // shape (2, 2, 2, 2)
   size_t gateSize = sizeof(complexType) * numGateElements;
   complexType* gates_h[numRandomGates];
   void* gates_d[numRandomGates];
   
   for (int i=0; i<numRandomGates; i++)
   {
      gates_h[i] = (complexType*) malloc(gateSize);
      if (gates_h[i] == nullptr)
      {
         // bail out before the fill loop below writes through a null pointer
         std::cerr << "Failed to allocate " << gateSize << " bytes of host memory for gate tensor " << i << std::endl;
         return EXIT_FAILURE;
      }
      HANDLE_CUDA_ERROR( cudaMalloc((void**) &gates_d[i], gateSize) );
      // index type matches numGateElements to avoid a mixed-width comparison
      for (int64_t j=0; j<numGateElements; j++)
      {
         gates_h[i][j] = complexType(((float) rand())/RAND_MAX, ((float) rand())/RAND_MAX);
      }
      HANDLE_CUDA_ERROR( cudaMemcpy(gates_d[i], gates_h[i], gateSize, cudaMemcpyHostToDevice) );
   }
   
   

Setup gate split options¶

Then we set up the SVD truncation parameters and the algorithm cutensornetGateSplitAlgo_t to use for the gate split process.

   /*****************************************
   * Step 3: setup options for gate operation
   ******************************************/

   // SVD truncation thresholds: absolute and relative singular value cutoffs
   const double absCutoff = 1e-2;
   const double relCutoff = 1e-2;
   // after truncation, renormalize the L2 norm of the singular values to 1
   const cutensornetTensorSVDNormalization_t renorm = CUTENSORNET_TENSOR_SVD_NORMALIZATION_L2;
   // split the singular values equally between U and V
   const cutensornetTensorSVDPartition_t partition = CUTENSORNET_TENSOR_SVD_PARTITION_UV_EQUAL;
   HANDLE_ERROR( mpsHelper.setSVDConfig(absCutoff, relCutoff, renorm, partition));

   // select the gate split algorithm used by the helper
   const cutensornetGateSplitAlgo_t gateAlgo = CUTENSORNET_GATE_SPLIT_ALGO_REDUCED;
   mpsHelper.setGateAlgorithm(gateAlgo);

Query and allocate required workspace¶

Once all simulation settings are set, we can query the required workspace size. Inside the MPSHelper, the required workspace size is estimated based on the largest tensor sizes involved in the simulation.

   /********************************************
   * Step 4: workspace size query and allocation
   *********************************************/

   // ask the helper for the maximal device workspace size and report it
   int64_t workspaceSize;
   HANDLE_ERROR( mpsHelper.computeMaxWorkspaceSizes(&workspaceSize) );
   std::cout << "Maximal workspace size required: " << workspaceSize << std::endl;

   // allocate the device buffer and hand it over to the helper
   void *work = nullptr;
   HANDLE_CUDA_ERROR( cudaMalloc(&work, workspaceSize) );
   HANDLE_ERROR( mpsHelper.setWorkspace(work, workspaceSize));
   
   

Execution¶

At this stage, we can execute the simulation by iterating over all the gate tensors. All the metadata of the MPS will be managed and updated inside the MPSHelper.

   /***********************************
   * Step 5: execution
   ************************************/

   cudaStream_t stream;
   HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );
   uint32_t numLayers = 10; // 10 layers of gate
   for (uint32_t i=0; i<numLayers; i++)
   {
      // even layers start at site 0, odd layers at site 1, so successive layers act on alternating pairs
      uint32_t start_site = i % 2;
      std::cout << "Cycle " << i << ":" << std::endl;
      // only the last layer prints the truncation information
      bool verbose = (i == numLayers - 1);
      // "j+1 < numSites" keeps the comparison unsigned and does not wrap around when numSites is 0
      for (uint32_t j=start_site; j+1<static_cast<uint32_t>(numSites); j=j+2)
      {
         uint32_t gateIdx = rand() % numRandomGates; // pick a random gate tensor
         std::cout << "apply gate " << gateIdx << " on " << j << " and " << j+1<< std::endl;
         void *dataA = tensors_d[j];
         void *dataB = tensors_d[j+1];
         void *dataG = gates_d[gateIdx];
         HANDLE_ERROR( mpsHelper.applyGate(j, j+1, dataA, dataB, dataG, verbose, stream) );
      }
   }

   // wait for all gate applications queued on the stream to finish
   HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream) );
   // the stream is not used after this point; destroy it so it is not leaked
   HANDLE_CUDA_ERROR( cudaStreamDestroy(stream) );

Free resources¶

After the simulation, we free up all the data pointers allocated in the main function.

   /***********************************
   * Step 6: free resources
   ************************************/
   
   std::cout << "Free all resources" << std::endl;

   for (int i=0; i<numRandomGates; i++)
   {
      free(gates_h[i]);
      HANDLE_CUDA_ERROR( cudaFree(gates_d[i]) );
   }

   for (int32_t i=0; i<numSites; i++)
   {
      free(tensors_h.at(i));
      HANDLE_CUDA_ERROR( cudaFree(tensors_d.at(i)) );
   }

   HANDLE_CUDA_ERROR( cudaFree(work) );
   // The MPSHelper destructor will free all internal resources when out of scope
   return 0;   
}

All cuTensorNet library objects owned by the MPSHelper will be freed once out of scope.