Computing Matrix Product State (MPS) Amplitudes

The following code example illustrates how to define a tensor network state, factorize it as a Matrix Product State (MPS), and then compute a slice of amplitudes of the factorized MPS state. The full code can be found in the NVIDIA/cuQuantum repository (here).

Headers and error handling

 7#include <cstdlib>
 8#include <cstdio>
 9#include <cassert>
10#include <complex>
11#include <vector>
12#include <bitset>
13#include <iostream>
14
15#include <cuda_runtime.h>
16#include <cutensornet.h>
17
18
19#define HANDLE_CUDA_ERROR(x) \
20{ const auto err = x; \
21  if( err != cudaSuccess ) \
22  { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \
23};
24
25#define HANDLE_CUTN_ERROR(x) \
26{ const auto err = x; \
27  if( err != CUTENSORNET_STATUS_SUCCESS ) \
28  { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \
29};
30
31
32int main()
33{
34  static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");
35
36  constexpr std::size_t fp64size = sizeof(double);

Define the tensor network state and the desired slice of state amplitudes

Let’s define a tensor network state corresponding to a 6-qubit quantum circuit and request a slice of state amplitudes where qubits 0 and 1 are fixed at value 1.

40  // Quantum state configuration
41  constexpr int32_t numQubits = 6; // number of qubits
42  const std::vector<int64_t> qubitDims(numQubits,2); // qubit dimensions
43  const std::vector<int32_t> fixedModes({0,1}); // fixed modes in the output amplitude tensor (must be in acsending order)
44  const std::vector<int64_t> fixedValues({1,1}); // values of the fixed modes in the output amplitude tensor
45  const int32_t numFixedModes = fixedModes.size(); // number of fixed modes in the output amplitude tensor
46  std::cout << "Quantum circuit: " << numQubits << " qubits\n";

Initialize the cuTensorNet library handle

50  // Initialize the cuTensorNet library
51  HANDLE_CUDA_ERROR(cudaSetDevice(0));
52  cutensornetHandle_t cutnHandle;
53  HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle));
54  std::cout << "Initialized cuTensorNet library on GPU 0\n";

Define quantum gates on GPU

58  // Define necessary quantum gate tensors in Host memory
59  const double invsq2 = 1.0 / std::sqrt(2.0);
60  //  Hadamard gate
61  const std::vector<std::complex<double>> h_gateH {{invsq2, 0.0},  {invsq2, 0.0},
62                                                   {invsq2, 0.0}, {-invsq2, 0.0}};
63  //  CX gate
64  const std::vector<std::complex<double>> h_gateCX {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
65                                                    {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
66                                                    {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0},
67                                                    {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}};
68
69  // Copy quantum gates to Device memory
70  void *d_gateH{nullptr}, *d_gateCX{nullptr};
71  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size)));
72  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateCX, 16 * (2 * fp64size)));
73  std::cout << "Allocated quantum gate memory on GPU\n";
74  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice));
75  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCX, h_gateCX.data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice));
76  std::cout << "Copied quantum gates to GPU memory\n";

Allocate MPS tensors

Here we set the shapes of MPS tensors and allocate GPU memory for their storage.

80  // Determine the MPS representation and allocate buffers for the MPS tensors
81  const int64_t maxExtent = 2; // GHZ state can be exactly represented with max bond dimension of 2
82  std::vector<std::vector<int64_t>> extents;
83  std::vector<int64_t*> extentsPtr(numQubits); 
84  std::vector<void*> d_mpsTensors(numQubits, nullptr);
85  for (int32_t i = 0; i < numQubits; i++) {
86    if (i == 0) { // left boundary MPS tensor
87      extents.push_back({2, maxExtent});
88      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size));
89    }
90    else if (i == numQubits-1) { // right boundary MPS tensor
91      extents.push_back({maxExtent, 2});
92      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * 2 * fp64size));
93    }
94    else { // middle MPS tensors
95      extents.push_back({maxExtent, 2, maxExtent});
96      HANDLE_CUDA_ERROR(cudaMalloc(&d_mpsTensors[i], 2 * maxExtent * maxExtent * 2 * fp64size));
97    }
98    extentsPtr[i] = extents[i].data();
99  }

Allocate the amplitudes slice tensor on GPU

Here we allocate GPU memory for the requested amplitudes slice tensor.

103  // Allocate Device memory for the specified slice of the quantum circuit amplitudes tensor
104  void *d_amp{nullptr};
105  std::size_t ampSize = 1;
106  for(const auto & qubitDim: qubitDims) ampSize *= qubitDim; // all state modes (full size)
107  for(const auto & fixedMode: fixedModes) ampSize /= qubitDims[fixedMode]; // fixed state modes reduce the slice size
108  HANDLE_CUDA_ERROR(cudaMalloc(&d_amp, ampSize * (2 * fp64size)));
109  std::cout << "Allocated memory for the specified slice of the quantum circuit amplitude tensor of size "
110            << ampSize << " elements\n";

Allocate the scratch buffer on GPU

114  // Query the free memory on Device
115  std::size_t freeSize{0}, totalSize{0};
116  HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
117  const std::size_t scratchSize = (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment
118  void *d_scratch{nullptr};
119  HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize));
120  std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU\n";

Create a pure tensor network state

Now let’s create a pure tensor network state for a 6-qubit quantum circuit.

124  // Create the initial quantum state
125  cutensornetState_t quantumState;
126  HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(),
127                    CUDA_C_64F, &quantumState));
128  std::cout << "Created the initial quantum state\n";

Apply quantum gates

Let’s construct the GHZ quantum circuit by applying the corresponding quantum gates.

132  // Construct the final quantum circuit state (apply quantum gates) for the GHZ circuit
133  int64_t id;
134  HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 1, std::vector<int32_t>{{0}}.data(),
135                    d_gateH, nullptr, 1, 0, 1, &id));
136  for(int32_t i = 1; i < numQubits; ++i) {
137    HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 2, std::vector<int32_t>{{i-1,i}}.data(),
138                      d_gateCX, nullptr, 1, 0, 1, &id));
139  }
140  std::cout << "Applied quantum gates\n";

Request MPS factorization for the final quantum circuit state

Here we express our intent to factorize the final quantum circuit state using MPS factorization. The provided shapes of the MPS tensors refer to their maximal size limit during the MPS renormalization procedure. The actually computed shapes of the final MPS tensors may be smaller. No computation is done here yet.

144  // Specify the final target MPS representation (use default fortran strides)
145  HANDLE_CUTN_ERROR(cutensornetStateFinalizeMPS(cutnHandle, quantumState, 
146                    CUTENSORNET_BOUNDARY_CONDITION_OPEN, extentsPtr.data(), /*strides=*/nullptr));
147  std::cout << "Requested the final MPS factorization of the quantum circuit state\n";

Configure MPS factorization procedure

After expressing our intent to perform MPS factorization of the final quantum circuit state, we can also configure the MPS factorization procedure by resetting different options, for example, the SVD algorithm.

151  // Optional, set up the SVD method for MPS truncation.
152  cutensornetTensorSVDAlgo_t algo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; 
153  HANDLE_CUTN_ERROR(cutensornetStateConfigure(cutnHandle, quantumState, 
154                    CUTENSORNET_STATE_CONFIG_MPS_SVD_ALGO, &algo, sizeof(algo)));
155  std::cout << "Configured the MPS factorization computation\n";

Prepare the computation of MPS factorization

Let’s create a workspace descriptor and prepare the computation of MPS factorization.

159  // Prepare the MPS computation and attach workspace
160  cutensornetWorkspaceDescriptor_t workDesc;
161  HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc));
162  std::cout << "Created the workspace descriptor\n";
163  HANDLE_CUTN_ERROR(cutensornetStatePrepare(cutnHandle, quantumState, scratchSize, workDesc, 0x0));
164  std::cout << "Prepared the computation of the quantum circuit state\n";
165  double flops {0.0};
166  HANDLE_CUTN_ERROR(cutensornetStateGetInfo(cutnHandle, quantumState,
167                    CUTENSORNET_STATE_INFO_FLOPS, &flops, sizeof(flops)));
168  if(flops > 0.0) {
169    std::cout << "Total flop count = " << (flops/1e9) << " GFlop\n";
170  }else if(flops < 0.0) {
171    std::cout << "ERROR: Negative Flop count!\n";
172    std::abort();
173  }
174
175  int64_t worksize {0};
176  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
177                                                      workDesc,
178                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
179                                                      CUTENSORNET_MEMSPACE_DEVICE,
180                                                      CUTENSORNET_WORKSPACE_SCRATCH,
181                                                      &worksize));
182  std::cout << "Scratch GPU workspace size (bytes) for MPS computation = " << worksize << std::endl;
183  if(worksize <= scratchSize) {
184    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
185                      CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
186  }else{
187    std::cout << "ERROR: Insufficient workspace size on Device!\n";
188    std::abort();
189  }
190  std::cout << "Set the workspace buffer for the MPS factorization computation\n";

Compute MPS factorization

Once the MPS factorization procedure has been configured and prepared, let’s compute the MPS factorization of the final quantum circuit state.

194  // Execute MPS computation
195  HANDLE_CUTN_ERROR(cutensornetStateCompute(cutnHandle, quantumState, 
196                    workDesc, extentsPtr.data(), /*strides=*/nullptr, d_mpsTensors.data(), 0));
197  std::cout << "Computed the MPS factorization\n";

Create the state amplitudes accessor

Once the factorized MPS representation of the final quantum circuit state has been computed, let’s create the amplitudes accessor object that will compute the requested slice of state amplitudes.

201  // Specify the quantum circuit amplitudes accessor
202  cutensornetStateAccessor_t accessor;
203  HANDLE_CUTN_ERROR(cutensornetCreateAccessor(cutnHandle, quantumState, numFixedModes, fixedModes.data(),
204                    nullptr, &accessor)); // using default strides
205  std::cout << "Created the specified quantum circuit amplitudes accessor\n";

Configure the state amplitudes accessor

Now we can configure the state amplitudes accessor object by setting the number of hyper-samples to be used by the tensor network contraction path finder.

209  // Configure the computation of the slice of the specified quantum circuit amplitudes tensor
210  const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder
211  HANDLE_CUTN_ERROR(cutensornetAccessorConfigure(cutnHandle, accessor,
212                    CUTENSORNET_ACCESSOR_CONFIG_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples)));

Prepare the computation of the amplitudes slice tensor

Let’s prepare the computation of the amplitudes slice tensor.

216  // Prepare the computation of the specified slice of the quantum circuit amplitudes tensor
217  HANDLE_CUTN_ERROR(cutensornetAccessorPrepare(cutnHandle, accessor, scratchSize, workDesc, 0x0));
218  std::cout << "Prepared the computation of the specified slice of the quantum circuit amplitudes tensor\n";
219  flops = 0.0;
220  HANDLE_CUTN_ERROR(cutensornetAccessorGetInfo(cutnHandle, accessor,
221                    CUTENSORNET_ACCESSOR_INFO_FLOPS, &flops, sizeof(flops)));
222  std::cout << "Total flop count = " << (flops/1e9) << " GFlop\n";
223  if(flops <= 0.0) {
224    std::cout << "ERROR: Invalid Flop count!\n";
225    std::abort();
226  }

Set up the workspace

Now we can set up the required workspace buffer.

230  // Attach the workspace buffer
231  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
232                                                      workDesc,
233                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
234                                                      CUTENSORNET_MEMSPACE_DEVICE,
235                                                      CUTENSORNET_WORKSPACE_SCRATCH,
236                                                      &worksize));
237  std::cout << "Required scratch GPU workspace size (bytes) = " << worksize << std::endl;
238  if(worksize <= scratchSize) {
239    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
240                      CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
241  }else{
242    std::cout << "ERROR: Insufficient workspace size on Device!\n";
243    std::abort();
244  }
245  std::cout << "Set the workspace buffer\n";

Compute the specified slice of state amplitudes

Once everything has been set up, we compute the requested slice of state amplitudes, copy it back to Host memory, and print it.

249  // Compute the specified slice of the quantum circuit amplitudes tensor
250  std::complex<double> stateNorm2{0.0,0.0};
251  HANDLE_CUTN_ERROR(cutensornetAccessorCompute(cutnHandle, accessor, fixedValues.data(),
252                    workDesc, d_amp, static_cast<void*>(&stateNorm2), 0x0));
253  std::cout << "Computed the specified slice of the quantum circuit amplitudes tensor\n";
254  std::vector<std::complex<double>> h_amp(ampSize);
255  HANDLE_CUDA_ERROR(cudaMemcpy(h_amp.data(), d_amp, ampSize * (2 * fp64size), cudaMemcpyDeviceToHost));
256  std::cout << "Amplitudes slice for " << (numQubits - numFixedModes) << " qubits:\n";
257  for(std::size_t i = 0; i < ampSize; ++i) {
258    std::cout << " " << h_amp[i] << std::endl;
259  }
260  std::cout << "Squared 2-norm of the state = (" << stateNorm2.real() << ", " << stateNorm2.imag() << ")\n";

Free resources

264  // Destroy the workspace descriptor
265  HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
266  std::cout << "Destroyed the workspace descriptor\n";
267
268  // Destroy the quantum circuit amplitudes accessor
269  HANDLE_CUTN_ERROR(cutensornetDestroyAccessor(accessor));
270  std::cout << "Destroyed the quantum circuit amplitudes accessor\n";
271
272  // Destroy the quantum circuit state
273  HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState));
274  std::cout << "Destroyed the quantum circuit state\n";
275
276  for (int32_t i = 0; i < numQubits; i++) {
277    HANDLE_CUDA_ERROR(cudaFree(d_mpsTensors[i]));
278  }
279  HANDLE_CUDA_ERROR(cudaFree(d_scratch));
280  HANDLE_CUDA_ERROR(cudaFree(d_amp));
281  HANDLE_CUDA_ERROR(cudaFree(d_gateCX));
282  HANDLE_CUDA_ERROR(cudaFree(d_gateH));
283  std::cout << "Freed memory on GPU\n";
284
285  // Finalize the cuTensorNet library
286  HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle));
287  std::cout << "Finalized the cuTensorNet library\n";
288
289  return 0;
290}