Sampling the Matrix Product State

The following code example illustrates how to define a tensor network state for a given quantum circuit, then compute its Matrix Product State (MPS) factorization, and, finally, sample the MPS-factorized state. The full code can be found in the NVIDIA/cuQuantum repository (here).

Headers and error handling

 7#include <cstdlib>
 8#include <cstdio>
 9#include <cassert>
10#include <complex>
11#include <vector>
12#include <iostream>
13
14#include <cuda_runtime.h>
15#include <cutensornet.h>
16
17#define HANDLE_CUDA_ERROR(x) \
18{ const auto err = x; \
19  if( err != cudaSuccess ) \
20  { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \
21};
22
23#define HANDLE_CUTN_ERROR(x) \
24{ const auto err = x; \
25  if( err != CUTENSORNET_STATUS_SUCCESS ) \
26  { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \
27};
28
29
30int main(int argc, char **argv)
31{
32  static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");
33
34  constexpr std::size_t fp64size = sizeof(double);

Define the tensor network state and the desired number of output samples to generate

Let’s define a tensor network state corresponding to a 16-qubit quantum circuit and request to produce 100 output samples for the full qubit register.

38  // Quantum state configuration
39  const int64_t numSamples = 100;
40  const int32_t numQubits = 16;
41  const std::vector<int64_t> qubitDims(numQubits, 2); // qubit size
42  std::cout << "Quantum circuit: " << numQubits << " qubits; " << numSamples << " samples\n";

Initialize the cuTensorNet library handle

46  // Initialize the cuTensorNet library
47  HANDLE_CUDA_ERROR(cudaSetDevice(0));
48  cutensornetHandle_t cutnHandle;
49  HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle));
50  std::cout << "Initialized cuTensorNet library on GPU 0\n";

Define quantum gates on GPU

54  // Define necessary quantum gate tensors in Host memory
55  const double invsq2 = 1.0 / std::sqrt(2.0);
56  //  Hadamard gate
57  const std::vector<std::complex<double>> h_gateH {{invsq2, 0.0},  {invsq2, 0.0},
58                                                   {invsq2, 0.0}, {-invsq2, 0.0}};
59  //  CX gate
60  const std::vector<std::complex<double>> h_gateCX {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
61                                                    {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
62                                                    {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0},
63                                                    {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}};
64
65  // Copy quantum gates to Device memory
66  void *d_gateH{nullptr}, *d_gateCX{nullptr};
67  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size)));
68  std::cout << "H gate buffer allocated on GPU: " << d_gateH << std::endl; //debug
69  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateCX, 16 * (2 * fp64size)));
70  std::cout << "CX gate buffer allocated on GPU: " << d_gateCX << std::endl; //debug
71  std::cout << "Allocated quantum gate memory on GPU\n";
72  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice));
73  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCX, h_gateCX.data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice));
74  std::cout << "Copied quantum gates to GPU memory\n";

Allocate MPS tensors

Here we set the shapes of MPS tensors and allocate GPU memory for their storage.

78  // Determine the MPS representation and allocate buffer for the MPS tensors
79  const int64_t maxExtent = 2; // GHZ state can be exactly represented with max bond dimension of 2
80  std::vector<std::vector<int64_t>> extents;
81  std::vector<int64_t*> extentsPtr(numQubits); 
82  std::vector<void*> d_mpsTensors(numQubits, nullptr);
83  for (int32_t i = 0; i < numQubits; i++) {
84    if (i == 0) { // left boundary MPS tensor
85      extents.push_back({2, maxExtent});
86      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size));
87    } 
88    else if (i == numQubits-1) { // right boundary MPS tensor
89      extents.push_back({maxExtent, 2});
90      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size));
91    }
92    else { // middle MPS tensors
93      extents.push_back({maxExtent, 2, maxExtent});
94      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * maxExtent * 2 * fp64size));
95    }
96    extentsPtr[i] = extents[i].data();
97  }

Allocate the scratch buffer on GPU

101  // Query the free memory on Device
102  std::size_t freeSize {0}, totalSize {0};
103  HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
104  const std::size_t scratchSize = (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment
105  void *d_scratch {nullptr};
106  HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize));
107  std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU: "
108            << "[" << d_scratch << ":" << (void*)(((char*)(d_scratch))  + scratchSize) << ")\n";

Create a pure tensor network state

Now let’s create a pure tensor network state for a 16-qubit quantum circuit.

112  // Create the initial quantum state
113  cutensornetState_t quantumState;
114  HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(),
115                    CUDA_C_64F, &quantumState));
116  std::cout << "Created the initial quantum state\n";

Apply quantum gates

Let’s construct the GHZ quantum circuit by applying the corresponding quantum gates.

120  // Construct the quantum circuit state (apply quantum gates)
121  int64_t id;
122  HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 1, std::vector<int32_t>{{0}}.data(),
123                    d_gateH, nullptr, 1, 0, 1, &id));
124  for(int32_t i = 1; i < numQubits; ++i) {
125    HANDLE_CUTN_ERROR(cutensornetStateApplyTensor(cutnHandle, quantumState, 2, std::vector<int32_t>{{i-1,i}}.data(),
126                      d_gateCX, nullptr, 1, 0, 1, &id));
127  }
128  std::cout << "Applied quantum gates\n";

Request MPS factorization for the final quantum circuit state

Here we express our intent to factorize the final quantum circuit state using MPS factorization. The provided shapes of the MPS tensors refer to their maximal size limit during the MPS renormalization procedure. The actually computed shapes of the final MPS tensors may be smaller. No computation is done here yet.

132  // Specify the final target MPS representation (use default fortran strides)
133  HANDLE_CUTN_ERROR(cutensornetStateFinalizeMPS(cutnHandle, quantumState, 
134                    CUTENSORNET_BOUNDARY_CONDITION_OPEN, extentsPtr.data(), /*strides=*/nullptr ));

Configure MPS factorization procedure

After expressing our intent to perform MPS factorization of the final quantum circuit state, we can also configure the MPS factorization procedure by resetting different options, for example, the SVD algorithm.

138  // Optional, set up the SVD method for truncation.
139  cutensornetTensorSVDAlgo_t algo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; 
140  HANDLE_CUTN_ERROR(cutensornetStateConfigure(cutnHandle, quantumState, 
141                    CUTENSORNET_STATE_MPS_SVD_CONFIG_ALGO, &algo, sizeof(algo)));
142  std::cout << "Configured the MPS computation\n";

Prepare the computation of MPS factorization

Let’s create a workspace descriptor and prepare the computation of MPS factorization.

146  // Prepare the MPS computation and attach workspace
147  cutensornetWorkspaceDescriptor_t workDesc;
148  HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc));
149  std::cout << "Created the workspace descriptor\n";
150  HANDLE_CUTN_ERROR(cutensornetStatePrepare(cutnHandle, quantumState, scratchSize, workDesc, 0x0));
151  int64_t worksize {0};
152  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
153                                                      workDesc,
154                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
155                                                      CUTENSORNET_MEMSPACE_DEVICE,
156                                                      CUTENSORNET_WORKSPACE_SCRATCH,
157                                                      &worksize));
158  std::cout << "Scratch GPU workspace size (bytes) for MPS computation = " << worksize << std::endl;
159  if(worksize <= scratchSize) {
160    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
161                      CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
162  }else{
163    std::cout << "ERROR: Insufficient workspace size on Device!\n";
164    std::abort();
165  }
166  std::cout << "Set the workspace buffer for MPS computation\n";

Compute MPS factorization

Once the MPS factorization procedure has been configured and prepared, let’s compute the MPS factorization of the final quantum circuit state.

170  // Execute MPS computation
171  HANDLE_CUTN_ERROR(cutensornetStateCompute(cutnHandle, quantumState, 
172                    workDesc, extentsPtr.data(), /*strides=*/nullptr, d_mpsTensors.data(), 0));
173  

Create the tensor network state sampler

Once the quantum circuit has been constructed and factorized using the MPS representation, let’s create the tensor network state sampler for the full qubit register (all qubits).

176  // Create the quantum circuit sampler
177  cutensornetStateSampler_t sampler;
178  HANDLE_CUTN_ERROR(cutensornetCreateSampler(cutnHandle, quantumState, numQubits, nullptr, &sampler));
179  std::cout << "Created the quantum circuit sampler\n";

Configure the tensor network state sampler

Now we can configure the tensor network state sampler by setting the number of hyper-samples to be used by the tensor network contraction path finder.

183  // Configure the quantum circuit sampler
184  const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder
185  HANDLE_CUTN_ERROR(cutensornetSamplerConfigure(cutnHandle, sampler,
186                    CUTENSORNET_SAMPLER_OPT_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples)));

Prepare the tensor network state sampler

Let’s prepare the tensor network state sampler.

190  // Prepare the quantum circuit sampler
191  HANDLE_CUTN_ERROR(cutensornetSamplerPrepare(cutnHandle, sampler, scratchSize, workDesc, 0x0));
192  std::cout << "Prepared the quantum circuit state sampler\n";

Set up the workspace

Now we can set up the required workspace buffer.

196  // Attach the workspace buffer
197  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
198                                                      workDesc,
199                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
200                                                      CUTENSORNET_MEMSPACE_DEVICE,
201                                                      CUTENSORNET_WORKSPACE_SCRATCH,
202                                                      &worksize));
203  std::cout << "Scratch GPU workspace size (bytes) for MPS Sampling = " << worksize << std::endl;
204  assert(worksize > 0);
205  if(worksize <= scratchSize) {
206    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
207                      CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
208  }else{
209    std::cout << "ERROR: Insufficient workspace size on Device!\n";
210    std::abort();
211  }
212  std::cout << "Set the workspace buffer\n";

Perform sampling of the final quantum circuit state

Once everything had been set up, we perform sampling of the quantum circuit state and print the output samples.

216  // Sample the quantum circuit state
217  std::vector<int64_t> samples(numQubits * numSamples); // samples[SampleId][QubitId] reside in Host memory
218  HANDLE_CUTN_ERROR(cutensornetSamplerSample(cutnHandle, sampler, numSamples, workDesc, samples.data(), 0));
219  std::cout << "Performed quantum circuit state sampling\n";
220  std::cout << "Bit-string samples:\n";
221  for(int64_t i = 0; i < numSamples; ++i) {
222    for(int64_t j = 0; j < numQubits; ++j) std::cout << " " << samples[i * numQubits + j];
223    std::cout << std::endl;
224  }

Free resources

228  // Destroy the workspace descriptor
229  HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
230  std::cout << "Destroyed the workspace descriptor\n";
231
232  // Destroy the quantum circuit sampler
233  HANDLE_CUTN_ERROR(cutensornetDestroySampler(sampler));
234  std::cout << "Destroyed the quantum circuit state sampler\n";
235
236  // Destroy the quantum circuit state
237  HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState));
238  std::cout << "Destroyed the quantum circuit state\n";
239
240  for (int32_t i = 0; i < numQubits; i++) {
241    HANDLE_CUDA_ERROR(cudaFree(d_mpsTensors[i]));
242  }
243  HANDLE_CUDA_ERROR(cudaFree(d_scratch));
244  HANDLE_CUDA_ERROR(cudaFree(d_gateCX));
245  HANDLE_CUDA_ERROR(cudaFree(d_gateH));
246  std::cout << "Freed memory on GPU\n";
247
248  // Finalize the cuTensorNet library
249  HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle));
250  std::cout << "Finalized the cuTensorNet library\n";
251
252  return 0;
253}