Sampling the Matrix Product State

The following code example illustrates how to define a tensor network state for a given quantum circuit, then compute its Matrix Product State (MPS) factorization, and, finally, sample the MPS-factorized state. The full code can be found in the NVIDIA/cuQuantum repository (here).

Headers and error handling

 7#include <cstdlib>
 8#include <cstdio>
 9#include <cassert>
10#include <complex>
11#include <vector>
12#include <iostream>
13
14#include <cuda_runtime.h>
15#include <cutensornet.h>
16
17#define HANDLE_CUDA_ERROR(x) \
18{ const auto err = x; \
19  if( err != cudaSuccess ) \
20  { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \
21};
22
23#define HANDLE_CUTN_ERROR(x) \
24{ const auto err = x; \
25  if( err != CUTENSORNET_STATUS_SUCCESS ) \
26  { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \
27};
28
29
30int main()
31{
32  static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");
33
34  constexpr std::size_t fp64size = sizeof(double);

Define the tensor network state and the desired number of output samples to generate

Let’s define a tensor network state corresponding to a 16-qubit quantum circuit and request to produce 100 output samples for the full qubit register.

38  // Quantum state configuration
39  const int64_t numSamples = 100;
40  const int32_t numQubits = 16;
41  const std::vector<int64_t> qubitDims(numQubits, 2); // qubit size
42  std::cout << "Quantum circuit: " << numQubits << " qubits; " << numSamples << " samples\n";

Initialize the cuTensorNet library handle

46  // Initialize the cuTensorNet library
47  HANDLE_CUDA_ERROR(cudaSetDevice(0));
48  cutensornetHandle_t cutnHandle;
49  HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle));
50  std::cout << "Initialized cuTensorNet library on GPU 0\n";

Define quantum gates on GPU

54  // Define necessary quantum gate tensors in Host memory
55  const double invsq2 = 1.0 / std::sqrt(2.0);
56  //  Hadamard gate
57  const std::vector<std::complex<double>> h_gateH {{invsq2, 0.0},  {invsq2, 0.0},
58                                                   {invsq2, 0.0}, {-invsq2, 0.0}};
59  //  CX gate
60  const std::vector<std::complex<double>> h_gateCX {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
61                                                    {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
62                                                    {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0},
63                                                    {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}};
64
65  // Copy quantum gates to Device memory
66  void *d_gateH{nullptr}, *d_gateCX{nullptr};
67  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size)));
68  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateCX, 16 * (2 * fp64size)));
69  std::cout << "Allocated GPU memory for quantum gates\n";
70  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice));
71  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCX, h_gateCX.data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice));
72  std::cout << "Copied quantum gates to GPU memory\n";

Allocate MPS tensors

Here we set the shapes of MPS tensors and allocate GPU memory for their storage.

76  // Determine the MPS representation and allocate buffer for the MPS tensors
77  const int64_t maxExtent = 2; // GHZ state can be exactly represented with max bond dimension of 2
78  std::vector<std::vector<int64_t>> extents;
79  std::vector<int64_t*> extentsPtr(numQubits); 
80  std::vector<void*> d_mpsTensors(numQubits, nullptr);
81  for (int32_t i = 0; i < numQubits; i++) {
82    if (i == 0) { // left boundary MPS tensor
83      extents.push_back({2, maxExtent});
84      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size));
85    } 
86    else if (i == numQubits-1) { // right boundary MPS tensor
87      extents.push_back({maxExtent, 2});
88      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size));
89    }
90    else { // middle MPS tensors
91      extents.push_back({maxExtent, 2, maxExtent});
92      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * maxExtent * 2 * fp64size));
93    }
94    extentsPtr[i] = extents[i].data();
95  }

Allocate the scratch buffer on GPU

 99  // Query the free memory on Device
100  std::size_t freeSize {0}, totalSize {0};
101  HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
102  const std::size_t scratchSize = (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment
103  void *d_scratch {nullptr};
104  HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize));
105  std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU: "
106            << "[" << d_scratch << ":" << (void*)(((char*)(d_scratch))  + scratchSize) << ")\n";

Create a pure tensor network state

Now let’s create a pure tensor network state for a 16-qubit quantum circuit.

110  // Create the initial quantum state
111  cutensornetState_t quantumState;
112  HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(),
113                    CUDA_C_64F, &quantumState));
114  std::cout << "Created the initial quantum state\n";

Apply quantum gates

Let’s construct the GHZ quantum circuit by applying the corresponding quantum gates.

118  // Construct the quantum circuit state (apply quantum gates)
119  int64_t id;
120  HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 1, std::vector<int32_t>{{0}}.data(),
121                    d_gateH, nullptr, 1, 0, 1, &id));
122  for(int32_t i = 1; i < numQubits; ++i) {
123    HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 2, std::vector<int32_t>{{i-1,i}}.data(),
124                      d_gateCX, nullptr, 1, 0, 1, &id));
125  }
126  std::cout << "Applied quantum gates\n";

Request MPS factorization for the final quantum circuit state

Here we express our intent to factorize the final quantum circuit state using MPS factorization. The provided shapes of the MPS tensors refer to their maximal size limit during the MPS renormalization procedure. The actually computed shapes of the final MPS tensors may be smaller. No computation is done here yet.

130  // Specify the final target MPS representation (use default fortran strides)
131  HANDLE_CUTN_ERROR(cutensornetStateFinalizeMPS(cutnHandle, quantumState, 
132                    CUTENSORNET_BOUNDARY_CONDITION_OPEN, extentsPtr.data(), /*strides=*/nullptr ));

Configure MPS factorization procedure

After expressing our intent to perform MPS factorization of the final quantum circuit state, we can also configure the MPS factorization procedure by resetting different options, for example, the SVD algorithm.

136  // Optional, set up the SVD method for truncation.
137  cutensornetTensorSVDAlgo_t algo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; 
138  HANDLE_CUTN_ERROR(cutensornetStateConfigure(cutnHandle, quantumState, 
139                    CUTENSORNET_STATE_CONFIG_MPS_SVD_ALGO, &algo, sizeof(algo)));
140  std::cout << "Configured the MPS computation\n";

Prepare the computation of MPS factorization

Let’s create a workspace descriptor and prepare the computation of MPS factorization.

144  // Prepare the MPS computation and attach workspace
145  cutensornetWorkspaceDescriptor_t workDesc;
146  HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc));
147  std::cout << "Created the workspace descriptor\n";
148  HANDLE_CUTN_ERROR(cutensornetStatePrepare(cutnHandle, quantumState, scratchSize, workDesc, 0x0));
149  std::cout << "Prepared the computation of the quantum circuit state\n";
150  double flops {0.0};
151  HANDLE_CUTN_ERROR(cutensornetStateGetInfo(cutnHandle, quantumState,
152                    CUTENSORNET_STATE_INFO_FLOPS, &flops, sizeof(flops)));
153  if(flops > 0.0) {
154    std::cout << "Total flop count = " << (flops/1e9) << " GFlop\n";
155  }else if(flops < 0.0) {
156    std::cout << "ERROR: Negative Flop count!\n";
157    std::abort();
158  }
159
160  int64_t worksize {0};
161  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
162                                                      workDesc,
163                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
164                                                      CUTENSORNET_MEMSPACE_DEVICE,
165                                                      CUTENSORNET_WORKSPACE_SCRATCH,
166                                                      &worksize));
167  std::cout << "Scratch GPU workspace size (bytes) for MPS computation = " << worksize << std::endl;
168  if(worksize <= scratchSize) {
169    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
170                      CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
171  }else{
172    std::cout << "ERROR: Insufficient workspace size on Device!\n";
173    std::abort();
174  }
175  std::cout << "Set the workspace buffer for MPS computation\n";

Compute MPS factorization

Once the MPS factorization procedure has been configured and prepared, let’s compute the MPS factorization of the final quantum circuit state.

179  // Execute MPS computation
180  HANDLE_CUTN_ERROR(cutensornetStateCompute(cutnHandle, quantumState, 
181                    workDesc, extentsPtr.data(), /*strides=*/nullptr, d_mpsTensors.data(), 0));
182  std::cout << "Computed MPS factorization\n";

Create the tensor network state sampler

Once the quantum circuit has been constructed and factorized using the MPS representation, let’s create the tensor network state sampler for the full qubit register (all qubits).

186  // Create the quantum circuit sampler
187  cutensornetStateSampler_t sampler;
188  HANDLE_CUTN_ERROR(cutensornetCreateSampler(cutnHandle, quantumState, numQubits, nullptr, &sampler));
189  std::cout << "Created the quantum circuit sampler\n";

Configure the tensor network state sampler

Now we can configure the tensor network state sampler by setting the number of hyper-samples to be used by the tensor network contraction path finder.

193  // Configure the quantum circuit sampler
194  const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder
195  HANDLE_CUTN_ERROR(cutensornetSamplerConfigure(cutnHandle, sampler,
196                    CUTENSORNET_SAMPLER_CONFIG_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples)));
197  std::cout << "Configured the quantum circuit sampler\n";

Prepare the tensor network state sampler

Let’s prepare the tensor network state sampler.

201  // Prepare the quantum circuit sampler
202  HANDLE_CUTN_ERROR(cutensornetSamplerPrepare(cutnHandle, sampler, scratchSize, workDesc, 0x0));
203  std::cout << "Prepared the quantum circuit state sampler\n";
204  flops = 0.0;
205  HANDLE_CUTN_ERROR(cutensornetSamplerGetInfo(cutnHandle, sampler,
206                    CUTENSORNET_SAMPLER_INFO_FLOPS, &flops, sizeof(flops)));
207  std::cout << "Total flop count per sample = " << (flops/1e9) << " GFlop\n";
208  if(flops <= 0.0) {
209    std::cout << "ERROR: Invalid Flop count!\n";
210    std::abort();
211  }

Set up the workspace

Now we can set up the required workspace buffer.

215  // Attach the workspace buffer
216  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
217                                                      workDesc,
218                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
219                                                      CUTENSORNET_MEMSPACE_DEVICE,
220                                                      CUTENSORNET_WORKSPACE_SCRATCH,
221                                                      &worksize));
222  std::cout << "Scratch GPU workspace size (bytes) for MPS Sampling = " << worksize << std::endl;
223  assert(worksize > 0);
224  if(worksize <= scratchSize) {
225    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
226                      CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
227  }else{
228    std::cout << "ERROR: Insufficient workspace size on Device!\n";
229    std::abort();
230  }
231  std::cout << "Set the workspace buffer\n";

Perform sampling of the final quantum circuit state

Once everything had been set up, we perform sampling of the quantum circuit state and print the output samples.

235  // Sample the quantum circuit state
236  std::vector<int64_t> samples(numQubits * numSamples); // samples[SampleId][QubitId] reside in Host memory
237  HANDLE_CUTN_ERROR(cutensornetSamplerSample(cutnHandle, sampler, numSamples, workDesc, samples.data(), 0));
238  std::cout << "Performed quantum circuit state sampling\n";
239  std::cout << "Bit-string samples:\n";
240  for(int64_t i = 0; i < numSamples; ++i) {
241    for(int64_t j = 0; j < numQubits; ++j) std::cout << " " << samples[i * numQubits + j];
242    std::cout << std::endl;
243  }

Free resources

247  // Destroy the workspace descriptor
248  HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
249  std::cout << "Destroyed the workspace descriptor\n";
250
251  // Destroy the quantum circuit sampler
252  HANDLE_CUTN_ERROR(cutensornetDestroySampler(sampler));
253  std::cout << "Destroyed the quantum circuit state sampler\n";
254
255  // Destroy the quantum circuit state
256  HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState));
257  std::cout << "Destroyed the quantum circuit state\n";
258
259  for (int32_t i = 0; i < numQubits; i++) {
260    HANDLE_CUDA_ERROR(cudaFree(d_mpsTensors[i]));
261  }
262  HANDLE_CUDA_ERROR(cudaFree(d_scratch));
263  HANDLE_CUDA_ERROR(cudaFree(d_gateCX));
264  HANDLE_CUDA_ERROR(cudaFree(d_gateH));
265  std::cout << "Freed memory on GPU\n";
266
267  // Finalize the cuTensorNet library
268  HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle));
269  std::cout << "Finalized the cuTensorNet library\n";
270
271  return 0;
272}