Sampling the Matrix Product State (QFT Circuit)

The following code example illustrates how to define a tensor network state for a given quantum circuit (QFT), then compute its Matrix Product State (MPS) factorization, and, finally, sample the MPS-factorized state. The full code can be found in the NVIDIA/cuQuantum repository (here).

Headers and error handling

 7#include <cstdlib>
 8#include <cstdio>
 9#include <cassert>
10#include <complex>
11#include <cmath>
12#include <vector>
13#include <iostream>
14
15#include <cuda_runtime.h>
16#include <cutensornet.h>
17
18#define HANDLE_CUDA_ERROR(x) \
19{ const auto err = x; \
20  if( err != cudaSuccess ) \
21  { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \
22};
23
24#define HANDLE_CUTN_ERROR(x) \
25{ const auto err = x; \
26  if( err != CUTENSORNET_STATUS_SUCCESS ) \
27  { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \
28};
29
30
31int main()
32{
33  static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");
34
35  constexpr std::size_t fp64size = sizeof(double);

Define the tensor network state and the desired number of output samples to generate

Let’s define a tensor network state for a quantum circuit with the given number of qubits and request to produce a given number of output samples for the full qubit register.

39  // Quantum state configuration
40  const int64_t numSamples = 128; // number of samples to generate
41  const int32_t numQubits = 16;   // number of qubits
42  const std::vector<int64_t> qubitDims(numQubits, 2); // qubit dimensions
43  std::cout << "Quantum circuit: " << numQubits << " qubits; " << numSamples << " samples\n";

Initialize the cuTensorNet library handle

47  // Initialize the cuTensorNet library
48  HANDLE_CUDA_ERROR(cudaSetDevice(0));
49  cutensornetHandle_t cutnHandle;
50  HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle));
51  std::cout << "Initialized cuTensorNet library on GPU 0\n";

Define quantum gates in GPU memory

55  // Define necessary quantum gate tensors in Host memory
56  const double invsq2 = 1.0 / std::sqrt(2.0);
57  const double pi = 3.14159265358979323846;
58  using GateData = std::vector<std::complex<double>>;
59  //  CR(k) gate generator
60  auto cRGate = [&pi] (int32_t k) {
61    const double phi = pi * 2.0 / std::exp2(k);
62    const GateData cr {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
63                       {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
64                       {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0},
65                       {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {std::cos(phi), std::sin(phi)}};
66    return cr;
67  };
68  //  Hadamard gate
69  const GateData h_gateH {{invsq2, 0.0}, {invsq2, 0.0},
70                          {invsq2, 0.0}, {-invsq2, 0.0}};
71  //  CR(k) gates (controlled rotations)
72  std::vector<GateData> h_gateCR(numQubits);
73  for(int32_t k = 0; k < numQubits; ++k) {
74    h_gateCR[k] = cRGate(k+1);
75  }
76
77  // Copy quantum gates into Device memory
78  void *d_gateH {nullptr};
79  std::vector<void*> d_gateCR(numQubits, nullptr);
80  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size)));
81  for(int32_t k = 0; k < numQubits; ++k) {
82    HANDLE_CUDA_ERROR(cudaMalloc(&(d_gateCR[k]), 16 * (2 * fp64size)));
83  }
84  std::cout << "Allocated GPU memory for quantum gates\n";
85  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice));
86  for(int32_t k = 0; k < numQubits; ++k) {
87    HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCR[k], h_gateCR[k].data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice));
88  }
89  std::cout << "Copied quantum gates into GPU memory\n";

Allocate MPS tensors in GPU memory

Here we set the shapes of MPS tensors and allocate GPU memory for their storage.

 93  // Define the MPS representation and allocate memory buffers for the MPS tensors
 94  const int64_t maxExtent = 8; // max bond dimension (level of low-rank MPS approximation)
 95  std::vector<std::vector<int64_t>> extents;
 96  std::vector<int64_t*> extentsPtr(numQubits);
 97  std::vector<void*> d_mpsTensors(numQubits, nullptr);
 98  for (int32_t i = 0; i < numQubits; ++i) {
 99    if (i == 0) { // left boundary MPS tensor
100      extents.push_back({2, maxExtent});
101      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * (2 * fp64size)));
102    } 
103    else if (i == numQubits-1) { // right boundary MPS tensor
104      extents.push_back({maxExtent, 2});
105      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * (2 * fp64size)));
106    }
107    else { // middle MPS tensors
108      extents.push_back({maxExtent, 2, maxExtent});
109      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * maxExtent * (2 * fp64size)));
110    }
111    extentsPtr[i] = extents[i].data();
112  }
113  std::cout << "Allocated GPU memory for MPS tensors\n";

Allocate the scratch buffer on GPU

117  // Query free memory on Device and allocate a scratch buffer
118  std::size_t freeSize {0}, totalSize {0};
119  HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
120  const std::size_t scratchSize = (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment
121  void *d_scratch {nullptr};
122  HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize));
123  std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU: "
124            << "[" << d_scratch << ":" << (void*)(((char*)(d_scratch))  + scratchSize) << ")\n";

Create a pure tensor network state

Now let’s create a pure tensor network state for a quantum circuit with the given number of qubits.

128  // Create the initial quantum state
129  cutensornetState_t quantumState;
130  HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(),
131                    CUDA_C_64F, &quantumState));
132  std::cout << "Created the initial (vacuum) quantum state\n";

Apply quantum gates

Let’s construct the QFT quantum circuit with no bit reversal by applying the corresponding quantum gates.

136  // Construct the QFT quantum circuit state (apply quantum gates)
137  //  Example of a QFT circuit with 3 qubits (with no bit reversal):
138  //  Q0--H--CR2--CR3-------------
139  //         |    |
140  //  Q1-----o----|----H--CR2-----
141  //              |       |
142  //  Q2----------o-------o----H--
143  int64_t id;
144  for(int32_t i = 0; i < numQubits; ++i) {
145    HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 1, std::vector<int32_t>{{i}}.data(),
146                      d_gateH, nullptr, 1, 0, 1, &id));
147    for(int32_t j = (i+1); j < numQubits; ++j) {
148      HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 2, std::vector<int32_t>{{j,i}}.data(),
149                        d_gateCR[j-i], nullptr, 1, 0, 1, &id));
150    }
151  }
152  std::cout << "Applied quantum gates\n";

Request MPS factorization for the final quantum circuit state

Here we express our intent to factorize the final quantum circuit state using MPS factorization. The provided shapes (mode extents) of the MPS tensors refer to their maximal size limit during the MPS renormalization procedure. The actually computed shapes (mode extents) of the final MPS tensors may be smaller than their limits. Note that no computation is done here yet.

156  // Specify the target MPS representation (use default strides)
157  HANDLE_CUTN_ERROR(cutensornetStateFinalizeMPS(cutnHandle, quantumState, 
158                    CUTENSORNET_BOUNDARY_CONDITION_OPEN, extentsPtr.data(), /*strides=*/nullptr ));
159  std::cout << "Finalized the form of the MPS representation\n";

Configure MPS factorization procedure

After expressing our intent to perform MPS factorization of the final quantum circuit state, we can also configure the MPS factorization procedure by resetting different options, for example, the SVD algorithm.

163  // Set up the SVD method for bonds truncation (optional)
164  cutensornetTensorSVDAlgo_t algo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; 
165  HANDLE_CUTN_ERROR(cutensornetStateConfigure(cutnHandle, quantumState, 
166                    CUTENSORNET_STATE_MPS_SVD_CONFIG_ALGO, &algo, sizeof(algo)));
167  std::cout << "Configured the MPS computation\n";

Prepare the computation of MPS factorization

Let’s create a workspace descriptor and prepare the computation of the MPS factorization of the final quantum circuit state.

171  // Prepare the MPS computation and attach a workspace buffer
172  cutensornetWorkspaceDescriptor_t workDesc;
173  HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc));
174  std::cout << "Created the workspace descriptor\n";
175  HANDLE_CUTN_ERROR(cutensornetStatePrepare(cutnHandle, quantumState, scratchSize, workDesc, 0x0));
176  int64_t worksize {0};
177  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
178                                                      workDesc,
179                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
180                                                      CUTENSORNET_MEMSPACE_DEVICE,
181                                                      CUTENSORNET_WORKSPACE_SCRATCH,
182                                                      &worksize));
183  std::cout << "Scratch GPU workspace size (bytes) for MPS computation = " << worksize << std::endl;
184  if(worksize <= scratchSize) {
185    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
186                      CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
187  }else{
188    std::cout << "ERROR: Insufficient workspace size on Device!\n";
189    std::abort();
190  }
191  std::cout << "Set the workspace buffer for MPS computation\n";

Compute MPS factorization

Once the MPS factorization procedure has been configured and prepared, let’s compute the MPS factorization of the final quantum circuit state.

195  // Compute the MPS factorization of the quantum circuit state
196  HANDLE_CUTN_ERROR(cutensornetStateCompute(cutnHandle, quantumState, 
197                    workDesc, extentsPtr.data(), /*strides=*/nullptr, d_mpsTensors.data(), 0));
198  std::cout << "Computed the MPS factorization of the quantum circuit state\n";
199  

Create the tensor network state sampler

Once the quantum circuit state has been constructed and factorized using the MPS representation, let’s create the tensor network state sampler for the full qubit register (all qubits).

202  // Create the quantum circuit sampler
203  cutensornetStateSampler_t sampler;
204  HANDLE_CUTN_ERROR(cutensornetCreateSampler(cutnHandle, quantumState, numQubits, nullptr, &sampler));
205  std::cout << "Created the quantum circuit sampler\n";

Configure the tensor network state sampler

Optionally, we can configure the tensor network state sampler by setting the number of hyper-samples to be used by the tensor network contraction path finder.

209  // Configure the quantum circuit sampler
210  const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder
211  HANDLE_CUTN_ERROR(cutensornetSamplerConfigure(cutnHandle, sampler,
212                    CUTENSORNET_SAMPLER_OPT_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples)));

Prepare the tensor network state sampler

Now let’s prepare the tensor network state sampler.

216  // Prepare the quantum circuit sampler
217  HANDLE_CUTN_ERROR(cutensornetSamplerPrepare(cutnHandle, sampler, scratchSize, workDesc, 0x0));
218  std::cout << "Prepared the quantum circuit state sampler\n";

Set up the workspace

Now we can set up the required workspace buffer.

222  // Attach the workspace buffer
223  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
224                                                      workDesc,
225                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
226                                                      CUTENSORNET_MEMSPACE_DEVICE,
227                                                      CUTENSORNET_WORKSPACE_SCRATCH,
228                                                      &worksize));
229  std::cout << "Scratch GPU workspace size (bytes) for MPS Sampling = " << worksize << std::endl;
230  assert(worksize > 0);
231  if(worksize <= scratchSize) {
232    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
233                      CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
234  }else{
235    std::cout << "ERROR: Insufficient workspace size on Device!\n";
236    std::abort();
237  }
238  std::cout << "Set the workspace buffer\n";

Perform sampling of the final quantum circuit state

Once everything had been set up, we perform sampling of the quantum circuit state and print the output samples.

242  // Sample the quantum circuit state
243  std::vector<int64_t> samples(numQubits * numSamples); // samples[SampleId][QubitId] reside in Host memory
244  HANDLE_CUTN_ERROR(cutensornetSamplerSample(cutnHandle, sampler, numSamples, workDesc, samples.data(), 0));
245  std::cout << "Performed quantum circuit state sampling\n";
246  std::cout << "Bit-string samples:\n";
247  for(int64_t i = 0; i < numSamples; ++i) {
248    for(int64_t j = 0; j < numQubits; ++j) std::cout << " " << samples[i * numQubits + j];
249    std::cout << std::endl;
250  }

Free resources

254  // Destroy the workspace descriptor
255  HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
256  std::cout << "Destroyed the workspace descriptor\n";
257
258  // Destroy the quantum circuit sampler
259  HANDLE_CUTN_ERROR(cutensornetDestroySampler(sampler));
260  std::cout << "Destroyed the quantum circuit state sampler\n";
261
262  // Destroy the quantum circuit state
263  HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState));
264  std::cout << "Destroyed the quantum circuit state\n";
265
266  // Free GPU buffers
267  for (int32_t i = 0; i < numQubits; i++) {
268    HANDLE_CUDA_ERROR(cudaFree(d_mpsTensors[i]));
269  }
270  HANDLE_CUDA_ERROR(cudaFree(d_scratch));
271  for(auto ptr: d_gateCR) HANDLE_CUDA_ERROR(cudaFree(ptr));
272  HANDLE_CUDA_ERROR(cudaFree(d_gateH));
273  std::cout << "Freed memory on GPU\n";
274
275  // Finalize the cuTensorNet library
276  HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle));
277  std::cout << "Finalized the cuTensorNet library\n";
278
279  return 0;
280}