Sampling the tensor network state

The following code example illustrates how to sample the tensor network state (e.g., sampling the final quantum circuit state). The full code can be found in the NVIDIA/cuQuantum repository (here).

Headers and error handling

 7#include <cstdlib>
 8#include <cstdio>
 9#include <cassert>
10#include <complex>
11#include <vector>
12#include <iostream>
13
14#include <cuda_runtime.h>
15#include <cutensornet.h>
16
17#define HANDLE_CUDA_ERROR(x) \
18{ const auto err = x; \
19  if( err != cudaSuccess ) \
20  { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \
21};
22
23#define HANDLE_CUTN_ERROR(x) \
24{ const auto err = x; \
25  if( err != CUTENSORNET_STATUS_SUCCESS ) \
26  { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \
27};
28
29
30int main()
31{
32  static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");
33
34  constexpr std::size_t fp64size = sizeof(double);

Define the tensor network state and the desired number of output samples to generate

Let’s define a tensor network state corresponding to a 16-qubit quantum circuit and request to produce 100 output samples for the full qubit register.

38  // Quantum state configuration
39  const int64_t numSamples = 100;
40  const int32_t numQubits = 16;
41  const std::vector<int64_t> qubitDims(numQubits, 2); // qubit size
42  std::cout << "Quantum circuit: " << numQubits << " qubits; " << numSamples << " samples\n";

Initialize the cuTensorNet library handle

46  // Initialize the cuTensorNet library
47  HANDLE_CUDA_ERROR(cudaSetDevice(0));
48  cutensornetHandle_t cutnHandle;
49  HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle));
50  std::cout << "Initialized cuTensorNet library on GPU 0\n";

Define quantum gates on GPU

54  // Define necessary quantum gate tensors in Host memory
55  const double invsq2 = 1.0 / std::sqrt(2.0);
56  //  Hadamard gate
57  const std::vector<std::complex<double>> h_gateH {{invsq2, 0.0},  {invsq2, 0.0},
58                                                   {invsq2, 0.0}, {-invsq2, 0.0}};
59  //  CX gate
60  const std::vector<std::complex<double>> h_gateCX {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
61                                                    {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
62                                                    {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0},
63                                                    {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}};
64
65  // Copy quantum gates to Device memory
66  void *d_gateH{nullptr}, *d_gateCX{nullptr};
67  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size)));
68  std::cout << "H gate buffer allocated on GPU: " << d_gateH << std::endl; //debug
69  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateCX, 16 * (2 * fp64size)));
70  std::cout << "CX gate buffer allocated on GPU: " << d_gateCX << std::endl; //debug
71  std::cout << "Allocated quantum gate memory on GPU\n";
72  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice));
73  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCX, h_gateCX.data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice));
74  std::cout << "Copied quantum gates to GPU memory\n";

Create a pure tensor network state

Now let’s create a pure tensor network state for a 16-qubit quantum circuit.

78  // Create the initial quantum state
79  cutensornetState_t quantumState;
80  HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(),
81                    CUDA_C_64F, &quantumState));
82  std::cout << "Created the initial quantum state\n";

Apply quantum gates

Let’s construct the GHZ quantum circuit by applying the corresponding quantum gates.

86  // Construct the quantum circuit state (apply quantum gates)
87  int64_t id;
88  HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 1, std::vector<int32_t>{{0}}.data(),
89                    d_gateH, nullptr, 1, 0, 1, &id));
90  for(int32_t i = 1; i < numQubits; ++i) {
91    HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 2, std::vector<int32_t>{{i-1,i}}.data(),
92                      d_gateCX, nullptr, 1, 0, 1, &id));
93  }
94  std::cout << "Applied quantum gates\n";

Create the tensor network state sampler

Once the quantum circuit has been constructed, let’s create the tensor network state sampler for the full qubit register (all qubits).

 98  // Create the quantum circuit sampler
 99  cutensornetStateSampler_t sampler;
100  HANDLE_CUTN_ERROR(cutensornetCreateSampler(cutnHandle, quantumState, numQubits, nullptr, &sampler));
101  std::cout << "Created the quantum circuit sampler\n";

Configure the tensor network state sampler

Now we can configure the tensor network state sampler by setting the number of hyper-samples to be used by the tensor network contraction path finder.

105  // Configure the quantum circuit sampler
106  const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder
107  HANDLE_CUTN_ERROR(cutensornetSamplerConfigure(cutnHandle, sampler,
108    CUTENSORNET_SAMPLER_CONFIG_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples)));
109  const int32_t rndSeed = 13; // explicit random seed to get the same results each run
110  HANDLE_CUTN_ERROR(cutensornetSamplerConfigure(cutnHandle, sampler,
111    CUTENSORNET_SAMPLER_CONFIG_DETERMINISTIC, &rndSeed, sizeof(rndSeed)));

Prepare the tensor network state sampler

Let’s create a workspace descriptor and prepare the tensor network state sampler.

115  // Query the free memory on Device
116  std::size_t freeSize {0}, totalSize {0};
117  HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
118  const std::size_t workSizeAvailable = (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment
119
120  // Prepare the quantum circuit sampler
121  cutensornetWorkspaceDescriptor_t workDesc;
122  HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc));
123  HANDLE_CUTN_ERROR(cutensornetSamplerPrepare(cutnHandle, sampler, workSizeAvailable, workDesc, 0x0));
124  std::cout << "Prepared the quantum circuit state sampler\n";
125  double flops {0.0};
126  HANDLE_CUTN_ERROR(cutensornetSamplerGetInfo(cutnHandle, sampler,
127                    CUTENSORNET_SAMPLER_INFO_FLOPS, &flops, sizeof(flops)));
128  std::cout << "Total flop count per sample = " << (flops/1e9) << " GFlop\n";

Allocate the workspace buffer on GPU and setup the workspace

Allocate the required scratch workspace buffer, and provide extra buffer for cache workspace.

132  // Attach the workspace buffer
133  int64_t worksize {0};
134  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
135                                                      workDesc,
136                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
137                                                      CUTENSORNET_MEMSPACE_DEVICE,
138                                                      CUTENSORNET_WORKSPACE_SCRATCH,
139                                                      &worksize));
140  assert(worksize > 0);
141
142  void *d_scratch {nullptr};
143  if(worksize <= workSizeAvailable) {
144    HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, worksize));
145    std::cout << "Allocated " << worksize << " bytes of scratch memory on GPU: "
146              << "[" << d_scratch << ":" << (void*)(((char*)(d_scratch))  + worksize) << ")\n";
147
148    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
149                      CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
150  }else{
151    std::cout << "ERROR: Insufficient workspace size on Device!\n";
152    std::abort();
153  }
154
155  int64_t reqCacheSize {0};
156  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
157                                                      workDesc,
158                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
159                                                      CUTENSORNET_MEMSPACE_DEVICE,
160                                                      CUTENSORNET_WORKSPACE_CACHE,
161                                                      &reqCacheSize));
162
163  //query the free size again
164  HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
165  // grab the minimum of [required size, or 90% of the free memory (to avoid oversubscribing)]
166  const std::size_t cacheSizeAvailable = std::min(static_cast<size_t>(reqCacheSize), size_t(freeSize * 0.9) - (size_t(freeSize * 0.9) % 4096));
167  void *d_cache {nullptr};
168  HANDLE_CUDA_ERROR(cudaMalloc(&d_cache, cacheSizeAvailable));
169  std::cout << "Allocated " << cacheSizeAvailable << " bytes of cache memory on GPU: "
170            << "[" << d_cache << ":" << (void*)(((char*)(d_cache))  + cacheSizeAvailable) << ")\n";
171  HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle,
172                                                  workDesc,
173                                                  CUTENSORNET_MEMSPACE_DEVICE,
174                                                  CUTENSORNET_WORKSPACE_CACHE,
175                                                  d_cache,
176                                                  cacheSizeAvailable));
177  std::cout << "Set the workspace buffer\n";

Perform sampling of the final quantum circuit state

Once everything had been set up, we perform sampling of the quantum circuit state and print the output samples.

181  // Sample the quantum circuit state
182  std::vector<int64_t> samples(numQubits * numSamples); // samples[SampleId][QubitId] reside in Host memory
183  HANDLE_CUTN_ERROR(cutensornetSamplerSample(cutnHandle, sampler, numSamples, workDesc, samples.data(), 0));
184  std::cout << "Performed quantum circuit state sampling\n";
185  std::cout << "Bit-string samples:\n";
186  for(int64_t i = 0; i < numSamples; ++i) {
187    for(int64_t j = 0; j < numQubits; ++j) std::cout << " " << samples[i * numQubits + j];
188    std::cout << std::endl;
189  }

Free resources

193  // Destroy the workspace descriptor
194  HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
195  std::cout << "Destroyed the workspace descriptor\n";
196
197  // Destroy the quantum circuit sampler
198  HANDLE_CUTN_ERROR(cutensornetDestroySampler(sampler));
199  std::cout << "Destroyed the quantum circuit state sampler\n";
200
201  // Destroy the quantum circuit state
202  HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState));
203  std::cout << "Destroyed the quantum circuit state\n";
204
205  HANDLE_CUDA_ERROR(cudaFree(d_scratch));
206  HANDLE_CUDA_ERROR(cudaFree(d_cache));
207  HANDLE_CUDA_ERROR(cudaFree(d_gateCX));
208  HANDLE_CUDA_ERROR(cudaFree(d_gateH));
209  std::cout << "Freed memory on GPU\n";
210
211  // Finalize the cuTensorNet library
212  HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle));
213  std::cout << "Finalized the cuTensorNet library\n";
214
215  return 0;
216}