# Computing Matrix Product State Marginal Distribution Tensor¶

The following code example illustrates how to define a tensor network state, compute its Matrix Product State (MPS) factorization, and then compute a marginal distribution tensor for the MPS-factorized state. The full code can be found in the NVIDIA/cuQuantum repository.

``` 7#include <cstdlib>
8#include <cstdio>
9#include <cassert>
10#include <complex>
11#include <vector>
12#include <bitset>
13#include <iostream>
14
15#include <cuda_runtime.h>
16#include <cutensornet.h>
17
18
19#define HANDLE_CUDA_ERROR(x) \
20{ const auto err = x; \
21  if( err != cudaSuccess ) \
22  { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \
23};
24
25#define HANDLE_CUTN_ERROR(x) \
26{ const auto err = x; \
27  if( err != CUTENSORNET_STATUS_SUCCESS ) \
28  { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \
29};
30
31
32int main()
33{
34  static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");
35
36  constexpr std::size_t fp64size = sizeof(double);
```

# Define the tensor network state and the desired marginal distribution tensor¶

Let’s define a tensor network state corresponding to a 16-qubit quantum circuit and request the marginal distribution tensor for qubits 0 and 1.

```40  // Quantum state configuration
41  constexpr int32_t numQubits = 16;
42  const std::vector<int64_t> qubitDims(numQubits,2); // qubit dimensions
43  constexpr int32_t numMarginalModes = 2; // rank of the marginal (reduced density matrix)
44  const std::vector<int32_t> marginalModes({0,1}); // open qubits (must be in ascending order)
45  std::cout << "Quantum circuit: " << numQubits << " qubits\n";
```

# Initialize the cuTensorNet library handle¶

```49  // Initialize the cuTensorNet library
50  HANDLE_CUDA_ERROR(cudaSetDevice(0));
51  cutensornetHandle_t cutnHandle;
52  HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle));
53  std::cout << "Initialized cuTensorNet library on GPU 0\n";
```

# Define quantum gates on GPU¶

```57  // Define necessary quantum gate tensors in Host memory
58  const double invsq2 = 1.0 / std::sqrt(2.0);
59  //  Hadamard gate
60  const std::vector<std::complex<double>> h_gateH {{invsq2, 0.0},  {invsq2, 0.0},
61                                                   {invsq2, 0.0}, {-invsq2, 0.0}};
62  //  CX gate
63  const std::vector<std::complex<double>> h_gateCX {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
64                                                    {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
65                                                    {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0},
66                                                    {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}};
67
68  // Copy quantum gates to Device memory
69  void *d_gateH{nullptr}, *d_gateCX{nullptr};
70  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size)));
71  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateCX, 16 * (2 * fp64size)));
72  std::cout << "Allocated quantum gate memory on GPU\n";
73  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice));
74  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCX, h_gateCX.data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice));
75  std::cout << "Copied quantum gates to GPU memory\n";
```

# Allocate MPS tensors¶

Here we set the shapes of MPS tensors and allocate GPU memory for their storage.

```79  // Determine the MPS representation and allocate buffers for the MPS tensors
80  const int64_t maxExtent = 2; // GHZ state can be exactly represented with max bond dimension of 2
81  std::vector<std::vector<int64_t>> extents;
82  std::vector<int64_t*> extentsPtr(numQubits);
83  std::vector<void*> d_mpsTensors(numQubits, nullptr);
84  for (int32_t i = 0; i < numQubits; i++) {
85    if (i == 0) { // left boundary MPS tensor
86      extents.push_back({2, maxExtent});
87      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size));
88    }
89    else if (i == numQubits-1) { // right boundary MPS tensor
90      extents.push_back({maxExtent, 2});
91      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size));
92    }
93    else { // middle MPS tensors
94      extents.push_back({maxExtent, 2, maxExtent});
95      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * maxExtent * 2 * fp64size));
96    }
97    extentsPtr[i] = extents[i].data();
98  }
```

# Allocate the marginal distribution tensor on GPU¶

Here we allocate the marginal distribution tensor, that is, the reduced density matrix for qubits 0 and 1, on GPU.

```102  // Allocate the specified quantum circuit reduced density matrix (marginal) in Device memory
103  void *d_rdm{nullptr};
104  std::size_t rdmDim = 1;
105  for(const auto & mode: marginalModes) rdmDim *= qubitDims[mode];
106  const std::size_t rdmSize = rdmDim * rdmDim;
107  HANDLE_CUDA_ERROR(cudaMalloc(&d_rdm, rdmSize * (2 * fp64size)));
```

# Allocate the scratch buffer on GPU¶

```111  // Query the free memory on Device
112  std::size_t freeSize{0}, totalSize{0};
113  HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
114  const std::size_t scratchSize = (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment
115  void *d_scratch{nullptr};
116  HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize));
117  std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU\n";
```

# Create a pure tensor network state¶

Now let’s create a pure tensor network state for a 16-qubit quantum circuit.

```121  // Create the initial quantum state
122  cutensornetState_t quantumState;
123  HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(),
124                    CUDA_C_64F, &quantumState));
125  std::cout << "Created the initial quantum state\n";
```

# Apply quantum gates¶

Let’s construct the GHZ quantum circuit by applying the corresponding quantum gates.

```129  // Construct the final quantum circuit state (apply quantum gates) for the GHZ circuit
130  int64_t id;
131  HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 1, std::vector<int32_t>{{0}}.data(),
132                    d_gateH, nullptr, 1, 0, 1, &id));
133  for(int32_t i = 1; i < numQubits; ++i) {
134    HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 2, std::vector<int32_t>{{i-1,i}}.data(),
135                      d_gateCX, nullptr, 1, 0, 1, &id));
136  }
137  std::cout << "Applied quantum gates\n";
```

# Request MPS factorization for the final quantum circuit state¶

Here we express our intent to factorize the final quantum circuit state using MPS factorization. The provided shapes of the MPS tensors refer to their maximal size limit during the MPS renormalization procedure. The actually computed shapes of the final MPS tensors may be smaller. No computation is done here yet.

```141  // Specify the final target MPS representation (use default fortran strides)
142  HANDLE_CUTN_ERROR(cutensornetStateFinalizeMPS(cutnHandle, quantumState,
143                    CUTENSORNET_BOUNDARY_CONDITION_OPEN, extentsPtr.data(), /*strides=*/nullptr));
144  std::cout << "Requested the final MPS factorization of the quantum circuit state\n";
```

# Configure MPS factorization procedure¶

After expressing our intent to perform MPS factorization of the final quantum circuit state, we can also configure the MPS factorization procedure by resetting different options, for example, the SVD algorithm.

```148  // Optional, set up the SVD method for MPS truncation.
149  cutensornetTensorSVDAlgo_t algo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ;
150  HANDLE_CUTN_ERROR(cutensornetStateConfigure(cutnHandle, quantumState,
151                    CUTENSORNET_STATE_CONFIG_MPS_SVD_ALGO, &algo, sizeof(algo)));
152  std::cout << "Configured the MPS factorization computation\n";
```

# Prepare the computation of MPS factorization¶

Let’s create a workspace descriptor and prepare the computation of MPS factorization.

```156  // Prepare the MPS computation and attach workspace
157  cutensornetWorkspaceDescriptor_t workDesc;
158  HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc));
159  std::cout << "Created the workspace descriptor\n";
160  HANDLE_CUTN_ERROR(cutensornetStatePrepare(cutnHandle, quantumState, scratchSize, workDesc, 0x0));
161  std::cout << "Prepared the computation of the quantum circuit state\n";
162  double flops {0.0};
163  HANDLE_CUTN_ERROR(cutensornetStateGetInfo(cutnHandle, quantumState,
164                    CUTENSORNET_STATE_INFO_FLOPS, &flops, sizeof(flops)));
165  if(flops > 0.0) {
166    std::cout << "Total flop count = " << (flops/1e9) << " GFlop\n";
167  }else if(flops < 0.0) {
168    std::cout << "ERROR: Negative Flop count!\n";
169    std::abort();
170  }
171
172  int64_t worksize {0};
173  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
174                                                      workDesc,
175                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
176                                                      CUTENSORNET_MEMSPACE_DEVICE,
177                                                      CUTENSORNET_WORKSPACE_SCRATCH,
178                                                      &worksize));
179  std::cout << "Scratch GPU workspace size (bytes) for MPS computation = " << worksize << std::endl;
180  if(worksize <= scratchSize) {
181    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
182                      CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
183  }else{
184    std::cout << "ERROR: Insufficient workspace size on Device!\n";
185    std::abort();
186  }
187  std::cout << "Set the workspace buffer for the MPS factorization computation\n";
```

# Compute MPS factorization¶

Once the MPS factorization procedure has been configured and prepared, let’s compute the MPS factorization of the final quantum circuit state.

```191  // Execute MPS computation
192  HANDLE_CUTN_ERROR(cutensornetStateCompute(cutnHandle, quantumState,
193                    workDesc, extentsPtr.data(), /*strides=*/nullptr, d_mpsTensors.data(), 0));
194  std::cout << "Computed the MPS factorization\n";
```

# Create the marginal distribution object¶

Once the quantum circuit has been constructed, let’s create the marginal distribution object that will compute the marginal distribution tensor (reduced density matrix) for qubits 0 and 1.

```198  // Specify the desired reduced density matrix (marginal)
199  cutensornetStateMarginal_t marginal;
200  HANDLE_CUTN_ERROR(cutensornetCreateMarginal(cutnHandle, quantumState, numMarginalModes, marginalModes.data(),
201                    0, nullptr, std::vector<int64_t>{{1,2,4,8}}.data(), &marginal)); // using explicit strides
202  std::cout << "Created the specified quantum circuit reduced density matrix (marginal)\n";
```

# Configure the marginal distribution object¶

Now we can configure the marginal distribution object by setting the number of hyper-samples to be used by the tensor network contraction path finder.

```206  // Configure the computation of the specified quantum circuit reduced density matrix (marginal)
207  const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder
208  HANDLE_CUTN_ERROR(cutensornetMarginalConfigure(cutnHandle, marginal,
209                    CUTENSORNET_MARGINAL_CONFIG_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples)));
210  std::cout << "Configured the specified quantum circuit reduced density matrix (marginal) computation\n";
```

# Prepare the computation of the marginal distribution tensor¶

Let’s prepare the computation of the marginal distribution tensor.

```214  // Prepare the specified quantum circuit reduced density matrix (marginal)
215  HANDLE_CUTN_ERROR(cutensornetMarginalPrepare(cutnHandle, marginal, scratchSize, workDesc, 0x0));
216  std::cout << "Prepared the specified quantum circuit reduced density matrix (marginal)\n";
217  flops = 0.0;
218  HANDLE_CUTN_ERROR(cutensornetMarginalGetInfo(cutnHandle, marginal,
219                    CUTENSORNET_MARGINAL_INFO_FLOPS, &flops, sizeof(flops)));
220  std::cout << "Total flop count = " << (flops/1e9) << " GFlop\n";
221  if(flops <= 0.0) {
222    std::cout << "ERROR: Invalid Flop count!\n";
223    std::abort();
224  }
```

# Set up the workspace¶

Now we can set up the required workspace buffer.

```228  // Attach the workspace buffer
229  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
230                                                      workDesc,
231                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
232                                                      CUTENSORNET_MEMSPACE_DEVICE,
233                                                      CUTENSORNET_WORKSPACE_SCRATCH,
234                                                      &worksize));
235  std::cout << "Required scratch GPU workspace size (bytes) for marginal computation = " << worksize << std::endl;
236  if(worksize <= scratchSize) {
237    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
238                      CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
239  }else{
240    std::cout << "ERROR: Insufficient workspace size on Device!\n";
241    std::abort();
242  }
243  std::cout << "Set the workspace buffer\n";
244
```

# Compute the marginal distribution tensor¶

Once everything has been set up, we compute the requested marginal distribution tensor (reduced density matrix) for qubits 0 and 1, copy it back to Host memory, and print it.

```247  // Compute the specified quantum circuit reduced density matrix (marginal)
248  HANDLE_CUTN_ERROR(cutensornetMarginalCompute(cutnHandle, marginal, nullptr, workDesc, d_rdm, 0));
249  std::cout << "Computed the specified quantum circuit reduced density matrix (marginal)\n";
250  std::vector<std::complex<double>> h_rdm(rdmSize);
251  HANDLE_CUDA_ERROR(cudaMemcpy(h_rdm.data(), d_rdm, rdmSize * (2 * fp64size), cudaMemcpyDeviceToHost));
252  std::cout << "Reduced density matrix for " << numMarginalModes << " qubits:\n";
253  for(std::size_t i = 0; i < rdmDim; ++i) {
254    for(std::size_t j = 0; j < rdmDim; ++j) {
255      std::cout << " " << h_rdm[i + j * rdmDim];
256    }
257    std::cout << std::endl;
258  }
```

# Free resources¶

```262  // Destroy the workspace descriptor
263  HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
264  std::cout << "Destroyed the workspace descriptor\n";
265
266  // Destroy the quantum circuit reduced density matrix
267  HANDLE_CUTN_ERROR(cutensornetDestroyMarginal(marginal));
268  std::cout << "Destroyed the quantum circuit state reduced density matrix (marginal)\n";
269
270  // Destroy the quantum circuit state
271  HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState));
272  std::cout << "Destroyed the quantum circuit state\n";
273
274  for (int32_t i = 0; i < numQubits; i++) {
275    HANDLE_CUDA_ERROR(cudaFree(d_mpsTensors[i]));
276  }
277  HANDLE_CUDA_ERROR(cudaFree(d_scratch));
278  HANDLE_CUDA_ERROR(cudaFree(d_rdm));
279  HANDLE_CUDA_ERROR(cudaFree(d_gateCX));
280  HANDLE_CUDA_ERROR(cudaFree(d_gateH));
281  std::cout << "Freed memory on GPU\n";
282
283  // Finalize the cuTensorNet library
284  HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle));
285  std::cout << "Finalized the cuTensorNet library\n";
286
287  return 0;
288}
```