Sampling the Tensor Network State

The following code example illustrates how to sample the tensor network state (e.g., sampling the final quantum circuit state). The full code can be found in the NVIDIA/cuQuantum repository (here).

Headers and error handling

 7#include <cstdlib>
 8#include <cstdio>
 9#include <cassert>
10#include <complex>
11#include <vector>
12#include <iostream>
13
14#include <cuda_runtime.h>
15#include <cutensornet.h>
16
17#define HANDLE_CUDA_ERROR(x) \
18{ const auto err = x; \
19  if( err != cudaSuccess ) \
20  { printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \
21};
22
23#define HANDLE_CUTN_ERROR(x) \
24{ const auto err = x; \
25  if( err != CUTENSORNET_STATUS_SUCCESS ) \
26  { printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); fflush(stdout); std::abort(); } \
27};
28
29
30int main()
31{
32  static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");
33
34  constexpr std::size_t fp64size = sizeof(double);

Define the tensor network state and the desired number of output samples to generate

Let’s define a tensor network state corresponding to a 16-qubit quantum circuit and request to produce 100 output samples for the full qubit register.

38  // Quantum state configuration
39  const int64_t numSamples = 100;
40  const int32_t numQubits = 16;
41  const std::vector<int64_t> qubitDims(numQubits, 2); // qubit size
42  std::cout << "Quantum circuit: " << numQubits << " qubits; " << numSamples << " samples\n";

Initialize the cuTensorNet library handle

46  // Initialize the cuTensorNet library
47  HANDLE_CUDA_ERROR(cudaSetDevice(0));
48  cutensornetHandle_t cutnHandle;
49  HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle));
50  std::cout << "Initialized cuTensorNet library on GPU 0\n";

Define quantum gates on GPU

54  // Define necessary quantum gate tensors in Host memory
55  const double invsq2 = 1.0 / std::sqrt(2.0);
56  //  Hadamard gate
57  const std::vector<std::complex<double>> h_gateH {{invsq2, 0.0},  {invsq2, 0.0},
58                                                   {invsq2, 0.0}, {-invsq2, 0.0}};
59  //  CX gate
60  const std::vector<std::complex<double>> h_gateCX {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
61                                                    {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
62                                                    {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0},
63                                                    {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}};
64
65  // Copy quantum gates to Device memory
66  void *d_gateH{nullptr}, *d_gateCX{nullptr};
67  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size)));
68  std::cout << "H gate buffer allocated on GPU: " << d_gateH << std::endl; //debug
69  HANDLE_CUDA_ERROR(cudaMalloc(&d_gateCX, 16 * (2 * fp64size)));
70  std::cout << "CX gate buffer allocated on GPU: " << d_gateCX << std::endl; //debug
71  std::cout << "Allocated quantum gate memory on GPU\n";
72  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice));
73  HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCX, h_gateCX.data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice));
74  std::cout << "Copied quantum gates to GPU memory\n";

Create a pure tensor network state

Now let’s create a pure tensor network state for a 16-qubit quantum circuit.

78  // Create the initial quantum state
79  cutensornetState_t quantumState;
80  HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(),
81                    CUDA_C_64F, &quantumState));
82  std::cout << "Created the initial quantum state\n";

Apply quantum gates

Let’s construct the GHZ quantum circuit by applying the corresponding quantum gates.

86  // Construct the quantum circuit state (apply quantum gates)
87  int64_t id;
88  HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 1, std::vector<int32_t>{{0}}.data(),
89                    d_gateH, nullptr, 1, 0, 1, &id));
90  for(int32_t i = 1; i < numQubits; ++i) {
91    HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 2, std::vector<int32_t>{{i-1,i}}.data(),
92                      d_gateCX, nullptr, 1, 0, 1, &id));
93  }
94  std::cout << "Applied quantum gates\n";

Create the tensor network state sampler

Once the quantum circuit has been constructed, let’s create the tensor network state sampler for the full qubit register (all qubits).

 98  // Create the quantum circuit sampler
 99  cutensornetStateSampler_t sampler;
100  HANDLE_CUTN_ERROR(cutensornetCreateSampler(cutnHandle, quantumState, numQubits, nullptr, &sampler));
101  std::cout << "Created the quantum circuit sampler\n";

Configure the tensor network state sampler

Now we can configure the tensor network state sampler by setting the number of hyper-samples to be used by the tensor network contraction path finder.

105  // Configure the quantum circuit sampler
106  const int32_t numHyperSamples = 8; // desired number of hyper samples used in the tensor network contraction path finder
107  HANDLE_CUTN_ERROR(cutensornetSamplerConfigure(cutnHandle, sampler,
108    CUTENSORNET_SAMPLER_CONFIG_NUM_HYPER_SAMPLES, &numHyperSamples, sizeof(numHyperSamples)));

Prepare the tensor network state sampler

Let’s create a workspace descriptor and prepare the tensor network state sampler.

112  // Query the free memory on Device
113  std::size_t freeSize {0}, totalSize {0};
114  HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
115  const std::size_t workSizeAvailable = (freeSize - (freeSize % 4096)) / 2; // use half of available memory with alignment
116
117  // Prepare the quantum circuit sampler
118  cutensornetWorkspaceDescriptor_t workDesc;
119  HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc));
120  HANDLE_CUTN_ERROR(cutensornetSamplerPrepare(cutnHandle, sampler, workSizeAvailable, workDesc, 0x0));
121  std::cout << "Prepared the quantum circuit state sampler\n";
122  double flops {0.0};
123  HANDLE_CUTN_ERROR(cutensornetSamplerGetInfo(cutnHandle, sampler,
124                    CUTENSORNET_SAMPLER_INFO_FLOPS, &flops, sizeof(flops)));
125  std::cout << "Total flop count per sample = " << (flops/1e9) << " GFlop\n";

Allocate the workspace buffer on GPU and setup the workspace

Allocate the required scratch workspace buffer, and provide extra buffer for cache workspace.

129  // Attach the workspace buffer
130  int64_t worksize {0};
131  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
132                                                      workDesc,
133                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
134                                                      CUTENSORNET_MEMSPACE_DEVICE,
135                                                      CUTENSORNET_WORKSPACE_SCRATCH,
136                                                      &worksize));
137  assert(worksize > 0);
138
139  void *d_scratch {nullptr};
140  if(worksize <= workSizeAvailable) {
141    HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, worksize));
142    std::cout << "Allocated " << worksize << " bytes of scratch memory on GPU: "
143              << "[" << d_scratch << ":" << (void*)(((char*)(d_scratch))  + worksize) << ")\n";
144
145    HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
146                      CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
147  }else{
148    std::cout << "ERROR: Insufficient workspace size on Device!\n";
149    std::abort();
150  }
151
152  int64_t reqCacheSize {0};
153  HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle,
154                                                      workDesc,
155                                                      CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
156                                                      CUTENSORNET_MEMSPACE_DEVICE,
157                                                      CUTENSORNET_WORKSPACE_CACHE,
158                                                      &reqCacheSize));
159
160  //query the free size again
161  HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
162  // grab the minimum of [required size, or 90% of the free memory (to avoid oversubscribing)]
163  const std::size_t cacheSizeAvailable = std::min(static_cast<size_t>(reqCacheSize), size_t(freeSize * 0.9) - (size_t(freeSize * 0.9) % 4096));
164  void *d_cache {nullptr};
165  HANDLE_CUDA_ERROR(cudaMalloc(&d_cache, cacheSizeAvailable));
166  std::cout << "Allocated " << cacheSizeAvailable << " bytes of cache memory on GPU: "
167            << "[" << d_cache << ":" << (void*)(((char*)(d_cache))  + cacheSizeAvailable) << ")\n";
168  HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle,
169                                                  workDesc,
170                                                  CUTENSORNET_MEMSPACE_DEVICE,
171                                                  CUTENSORNET_WORKSPACE_CACHE,
172                                                  d_cache,
173                                                  cacheSizeAvailable));
174  std::cout << "Set the workspace buffer\n";

Perform sampling of the final quantum circuit state

Once everything had been set up, we perform sampling of the quantum circuit state and print the output samples.

178  // Sample the quantum circuit state
179  std::vector<int64_t> samples(numQubits * numSamples); // samples[SampleId][QubitId] reside in Host memory
180  HANDLE_CUTN_ERROR(cutensornetSamplerSample(cutnHandle, sampler, numSamples, workDesc, samples.data(), 0));
181  std::cout << "Performed quantum circuit state sampling\n";
182  std::cout << "Bit-string samples:\n";
183  for(int64_t i = 0; i < numSamples; ++i) {
184    for(int64_t j = 0; j < numQubits; ++j) std::cout << " " << samples[i * numQubits + j];
185    std::cout << std::endl;
186  }

Free resources

190  // Destroy the workspace descriptor
191  HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
192  std::cout << "Destroyed the workspace descriptor\n";
193
194  // Destroy the quantum circuit sampler
195  HANDLE_CUTN_ERROR(cutensornetDestroySampler(sampler));
196  std::cout << "Destroyed the quantum circuit state sampler\n";
197
198  // Destroy the quantum circuit state
199  HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState));
200  std::cout << "Destroyed the quantum circuit state\n";
201
202  HANDLE_CUDA_ERROR(cudaFree(d_scratch));
203  HANDLE_CUDA_ERROR(cudaFree(d_cache));
204  HANDLE_CUDA_ERROR(cudaFree(d_gateCX));
205  HANDLE_CUDA_ERROR(cudaFree(d_gateH));
206  std::cout << "Freed memory on GPU\n";
207
208  // Finalize the cuTensorNet library
209  HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle));
210  std::cout << "Finalized the cuTensorNet library\n";
211
212  return 0;
213}