Projection MPS Circuit DMRG

The following code example illustrates how to use the state projection MPS API to perform circuit simulation with DMRG optimization, following the method described by [Ayral et al., 2023]. The full code can be found in the NVIDIA/cuQuantum repository.
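
Circuit DMRG variationally maximizes the fidelity between a bond-dimension-limited MPS ansatz \(|\phi\rangle\) and the state produced by applying the circuit \(U\) to the initial state (our notation, following the cited paper):

\[\max_{|\phi\rangle}\; \bigl|\langle \phi | U | 0 \cdots 0 \rangle\bigr|^2,\]

sweeping back and forth over the MPS tensors and optimizing one site tensor at a time while the others are held fixed.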

Headers, error handling, and helper functions

#include <cstdlib>
#include <cstdio>
#include <cassert>
#include <complex>
#include <cmath>
#include <vector>
#include <iostream>
#include <algorithm> // std::min

#include <cuda_runtime.h>
#include <cutensornet.h>

#define HANDLE_CUDA_ERROR(x)                                                         \
    {                                                                                \
        const auto err = x;                                                          \
        if (err != cudaSuccess)                                                      \
        {                                                                            \
            printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); \
            fflush(stdout);                                                          \
            std::abort();                                                            \
        }                                                                            \
    };

#define HANDLE_CUTN_ERROR(x)                                                                       \
    {                                                                                              \
        const auto err = x;                                                                        \
        if (err != CUTENSORNET_STATUS_SUCCESS)                                                     \
        {                                                                                          \
            printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
            fflush(stdout);                                                                        \
            std::abort();                                                                          \
        }                                                                                          \
    };

// Helper function to compute the maximum possible bond extents of an exact MPS,
// optionally capped at maxdim
std::vector<int64_t> getMaxBondExtents(const std::vector<int64_t>& stateModeExtents, int64_t maxdim = -1)
{
    const int32_t numQubits = stateModeExtents.size();
    std::vector<int64_t> cumprodLeft(numQubits), cumprodRight(numQubits);

    // Compute cumulative products of the mode extents from the left and from the right
    cumprodLeft[0] = stateModeExtents[0];
    for (int32_t i = 1; i < numQubits; ++i)
    {
        cumprodLeft[i] = cumprodLeft[i - 1] * stateModeExtents[i];
    }

    cumprodRight[numQubits - 1] = stateModeExtents[numQubits - 1];
    for (int32_t i = numQubits - 2; i >= 0; --i)
    {
        cumprodRight[i] = cumprodRight[i + 1] * stateModeExtents[i];
    }

    std::vector<int64_t> maxBondExtents(numQubits - 1);
    for (int32_t i = 0; i < numQubits - 1; ++i)
    {
        // Bond i cannot exceed the Hilbert-space dimension of either side of the cut
        int64_t minVal    = std::min(cumprodLeft[i], cumprodRight[i + 1]);
        maxBondExtents[i] = (maxdim > 0) ? std::min(minVal, maxdim) : minVal;
    }

    return maxBondExtents;
}
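// Worked example (hypothetical input): for 6 qubits of dimension 2 and maxdim = 8,
// getMaxBondExtents({2, 2, 2, 2, 2, 2}, 8) returns {2, 4, 8, 4, 2}: bond i is
// bounded by min(2^(i+1), 2^(numQubits-i-1)) and additionally capped at maxdim.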

int main()
{
    static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

    constexpr std::size_t fp64size = sizeof(double);

Define the tensor network state and quantum circuit configuration

Let’s define a tensor network state corresponding to a 16-qubit quantum Fourier transform (QFT) circuit.
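
For reference, the quantum Fourier transform on \(n\) qubits acts on computational basis states as

\[\mathrm{QFT}\,|x\rangle = \frac{1}{\sqrt{2^n}} \sum_{y=0}^{2^n - 1} e^{2\pi i x y / 2^n}\,|y\rangle,\]

and is realized below with Hadamard and controlled-rotation gates.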

    // Quantum state configuration
    const int32_t numQubits = 16;
    const std::vector<int64_t> qubitDims(numQubits, 2);
    std::cout << "DMRG quantum circuit with " << numQubits << " qubits\n";

Initialize the cuTensorNet library handle

    // Initialize the cuTensorNet library
    HANDLE_CUDA_ERROR(cudaSetDevice(0));
    cutensornetHandle_t cutnHandle;
    HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle));
    std::cout << "Initialized cuTensorNet library on GPU 0\n";

Define quantum gates on GPU
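
The controlled-rotation gate constructed below is diagonal; in matrix form, with \(\phi = \pi / 2^k\),

\[CR(k) = \begin{pmatrix} 1 & 0 & 0 & 0 \\ 0 & 1 & 0 & 0 \\ 0 & 0 & 1 & 0 \\ 0 & 0 & 0 & e^{i\phi} \end{pmatrix}.\]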

    // Define necessary quantum gate tensors in Host memory
    const double invsq2 = 1.0 / std::sqrt(2.0);
    const double pi     = 3.14159265358979323846;
    using GateData      = std::vector<std::complex<double>>;

    // CR(k) gate generator
    auto cRGate = [&pi](int32_t k)
    {
        const double phi = pi / std::pow(2.0, k);
        const GateData cr{{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
                          {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
                          {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0},
                          {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {std::cos(phi), std::sin(phi)}};
        return cr;
    };

    // Hadamard gate
    const GateData h_gateH{{invsq2, 0.0}, {invsq2, 0.0}, {invsq2, 0.0}, {-invsq2, 0.0}};

    // CR(k) gates (controlled rotations)
    std::vector<GateData> h_gateCR(numQubits);
    for (int32_t k = 0; k < numQubits; ++k)
    {
        h_gateCR[k] = cRGate(k + 1);
    }

    // Copy quantum gates into Device memory
    void* d_gateH{nullptr};
    std::vector<void*> d_gateCR(numQubits, nullptr);
    HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size)));
    for (int32_t k = 0; k < numQubits; ++k)
    {
        HANDLE_CUDA_ERROR(cudaMalloc(&(d_gateCR[k]), 16 * (2 * fp64size)));
    }

    HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice));
    for (int32_t k = 0; k < numQubits; ++k)
    {
        HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCR[k], h_gateCR[k].data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice));
    }
    std::cout << "Copied quantum gates into GPU memory\n";

Create initial quantum state and apply gates

Here we query the available GPU memory and allocate a scratch buffer for computations, then create the pure quantum state and apply the quantum gates.

    // Query free memory on Device and allocate a scratch buffer
    std::size_t freeSize{0}, totalSize{0};
    HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
    // Use half of the free memory, rounded down to a multiple of 4096 bytes
    const std::size_t scratchSize = (freeSize - (freeSize % 4096)) / 2;
    void* d_scratch{nullptr};
    HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize));
    std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU\n";

    // Create the initial quantum state
    cutensornetState_t quantumState;
    HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(),
                                             CUDA_C_64F, &quantumState));
    std::cout << "Created the initial quantum state\n";

    // Construct the quantum circuit state (apply quantum gates)
    int64_t id;
    for (int32_t i = 0; i < numQubits; ++i)
    {
        // Hadamard on qubit i (immutable = 1, adjoint = 0, unitary = 1)
        HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(
            cutnHandle, quantumState, 1, std::vector<int32_t>{{i}}.data(), d_gateH, nullptr, 1, 0, 1, &id));
        for (int32_t j = (i + 1); j < numQubits; ++j)
        {
            // Controlled rotation CR(j - i + 1) acting on qubits j and i
            HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 2,
                                                                  std::vector<int32_t>{{j, i}}.data(), d_gateCR[j - i],
                                                                  nullptr, 1, 0, 1, &id));
        }
    }
    std::cout << "Applied quantum gates\n";

Set up and specify the MPS representation of the output state

Now let’s set up the MPS representation of the output state for our 16-qubit quantum circuit.
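
Recall that an open-boundary MPS expresses the state amplitudes as a product of site tensors,

\[\langle s_1 s_2 \cdots s_n | \phi \rangle = A_1^{s_1} A_2^{s_2} \cdots A_n^{s_n},\]

where the bond extents connecting neighboring site tensors are capped here at maxExtent = 8.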

    // Define the MPS representation and allocate memory buffers for the MPS tensors
    const int64_t maxExtent = 8;
    std::vector<int64_t> dimExtents(numQubits, 2);
    std::vector<int64_t> maxBondExtents = getMaxBondExtents(dimExtents, maxExtent);

    std::vector<std::vector<int64_t>> projMPSTensorExtents;
    std::vector<const int64_t*> projMPSTensorExtentsPtr(numQubits);
    std::vector<void*> d_projMPSTensors(numQubits, nullptr);
    std::vector<int64_t> numElements(numQubits);
    std::vector<void*> d_envTensorsExtract(numQubits, nullptr);
    std::vector<void*> d_envTensorsInsert(numQubits, nullptr);

Initialize the MPS in the vacuum state

For simplicity and reproducibility, we initialize the MPS in the vacuum state. The loop below also caps each bond extent so that the MPS does not have overcomplete bond dimensions: each bond is limited both by the Hilbert-space dimension of the smaller side of the cut (via getMaxBondExtents) and by maxExtent.
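
In this representation, the vacuum state \(|0 \cdots 0\rangle\) amounts to setting the first element of each site tensor to one and all other elements to zero, as the initialization loop below does:

\[A_i[\alpha, s, \beta] = \delta_{\alpha 0}\,\delta_{s 0}\,\delta_{\beta 0}.\]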

    for (int32_t i = 0; i < numQubits; ++i)
    {
        if (i == 0)
        {
            // Left boundary MPS tensor: modes (physical, right bond)
            auto extent2 = std::min(maxBondExtents[i], maxExtent);
            projMPSTensorExtents.push_back({2, extent2});
            numElements[i] = 2 * extent2;
            HANDLE_CUDA_ERROR(cudaMalloc(&d_projMPSTensors[i], numElements[i] * (2 * fp64size)));
        }
        else if (i == numQubits - 1)
        {
            // Right boundary MPS tensor: modes (left bond, physical)
            auto extent1 = std::min(maxBondExtents[i - 1], maxExtent);
            projMPSTensorExtents.push_back({extent1, 2});
            numElements[i] = extent1 * 2;
            HANDLE_CUDA_ERROR(cudaMalloc(&d_projMPSTensors[i], numElements[i] * (2 * fp64size)));
        }
        else
        {
            // Middle MPS tensors: modes (left bond, physical, right bond)
            auto extent1 = std::min(maxBondExtents[i - 1], maxExtent);
            auto extent3 = std::min(maxBondExtents[i], maxExtent);
            projMPSTensorExtents.push_back({extent1, 2, extent3});
            numElements[i] = extent1 * 2 * extent3;
            HANDLE_CUDA_ERROR(cudaMalloc(&d_projMPSTensors[i], numElements[i] * (2 * fp64size)));
        }
        projMPSTensorExtentsPtr[i] = projMPSTensorExtents[i].data();

        HANDLE_CUDA_ERROR(cudaMalloc(&d_envTensorsExtract[i], numElements[i] * (2 * fp64size)));
        HANDLE_CUDA_ERROR(cudaMalloc(&d_envTensorsInsert[i], numElements[i] * (2 * fp64size)));
    }

    // Initialize the vacuum state: first element one, all others zero
    for (int32_t i = 0; i < numQubits; ++i)
    {
        std::vector<std::complex<double>> hostTensor(numElements[i]); // zero-initialized
        hostTensor[0] = {1.0, 0.0};
        HANDLE_CUDA_ERROR(
            cudaMemcpy(d_projMPSTensors[i], hostTensor.data(), numElements[i] * (2 * fp64size), cudaMemcpyHostToDevice));
    }
    std::cout << "Allocated GPU memory for projection MPS tensors\n";

Specify the inputs for the state projection MPS

Here we specify the inputs for the state projection MPS API: the circuit state to be approximated (with unit coefficient), a single-site environment specification for each qubit, and the initial orthogonality specification, which spans the full chain.

    // Prepare the state projection MPS inputs
    std::vector<cutensornetState_t> states    = {quantumState};
    std::vector<cuDoubleComplex> coefficients = {{1.0, 0.0}};

    // Environment specifications: single-site environment (i - 1, i + 1) for each site i
    std::vector<cutensornetMPSEnvBounds_t> envSpecs;
    for (int32_t i = 0; i < numQubits; ++i)
    {
        cutensornetMPSEnvBounds_t spec;
        spec.lowerBound = i - 1;
        spec.upperBound = i + 1;
        envSpecs.push_back(spec);
    }

    // Initial orthogonality specification spanning the full chain
    cutensornetMPSEnvBounds_t initialOrthoSpec;
    initialOrthoSpec.lowerBound = -1;
    initialOrthoSpec.upperBound = numQubits;

Create state projection MPS

Now we create the cutensornetStateProjectionMPS_t object that will be used for DMRG optimization. Many MPS optimization algorithms, including circuit DMRG, exploit the gauge freedom of the MPS and enforce orthogonality conditions on the MPS tensors, so that the optimized tensors constitute the coefficients of the quantum state in an orthonormal basis defined by the other MPS tensors. By default, the cutensornetStateProjectionMPSExtract API enforces these orthogonality conditions by orthogonalizing the MPS tensors towards the environment for which the extraction happens.
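
For example, a site tensor \(A\) to the left of the environment window satisfies the left-orthogonality condition (our notation)

\[\sum_{\alpha, s} \bar{A}[\alpha, s, \beta]\, A[\alpha, s, \beta'] = \delta_{\beta \beta'},\]

with the mirrored condition holding on the right; together these guarantee that the extracted site tensor holds the coefficients of the state in an orthonormal basis.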

The following figure illustrates the main idea behind the state projection MPS API for the four-qubit case.

[Figure: circuit DMRG environment network for the four-qubit case]

The environment network is the initial MPS (left) contracted with the circuit gates (middle) and the target MPS (right), with the tensor to be variationally optimized removed from the network. Contracting this network yields a single-site tensor, which we call the environment tensor. Note the indexing convention for the environment tensors: the single-site environment for tensor \(i\) is specified by the environment specification \((i-1, i+1)\).
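
At the chain boundaries this convention yields bounds lying just outside the physical sites; concretely, the envSpecs loop above produces:

    site 0:             lowerBound = -1,            upperBound = 1
    site numQubits - 1: lowerBound = numQubits - 2, upperBound = numQubits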

    // Create the state projection MPS
    cutensornetStateProjectionMPS_t projectionMps;
    HANDLE_CUTN_ERROR(cutensornetCreateStateProjectionMPS(
        cutnHandle, states.size(), states.data(), coefficients.data(), false, envSpecs.size(), envSpecs.data(),
        CUTENSORNET_BOUNDARY_CONDITION_OPEN, numQubits, 0, projMPSTensorExtentsPtr.data(), 0,
        d_projMPSTensors.data(), &initialOrthoSpec, &projectionMps));
    std::cout << "Created state projection MPS\n";

    // Prepare the state projection MPS computation
    cutensornetWorkspaceDescriptor_t workDesc;
    HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc));
    HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSPrepare(cutnHandle, projectionMps, scratchSize, workDesc, 0x0));

    int64_t worksize{0};
    HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
                                                        CUTENSORNET_MEMSPACE_DEVICE, CUTENSORNET_WORKSPACE_SCRATCH,
                                                        &worksize));
    std::cout << "Workspace size for MPS projection = " << worksize << " bytes\n";

    if (worksize <= static_cast<int64_t>(scratchSize))
    {
        HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
                                                        CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
    }
    else
    {
        std::cout << "ERROR: Insufficient workspace size on Device!\n";

        // Cleanup
        HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
        HANDLE_CUTN_ERROR(cutensornetDestroyStateProjectionMPS(projectionMps));
        HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState));

        // Free GPU buffers
        for (int32_t i = 0; i < numQubits; i++)
        {
            HANDLE_CUDA_ERROR(cudaFree(d_projMPSTensors[i]));
            HANDLE_CUDA_ERROR(cudaFree(d_envTensorsExtract[i]));
            HANDLE_CUDA_ERROR(cudaFree(d_envTensorsInsert[i]));
        }
        HANDLE_CUDA_ERROR(cudaFree(d_scratch));
        for (auto ptr : d_gateCR) HANDLE_CUDA_ERROR(cudaFree(ptr));
        HANDLE_CUDA_ERROR(cudaFree(d_gateH));
        std::cout << "Freed memory on GPU\n";

        // Finalize the cuTensorNet library
        HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle));

        std::abort();
    }

Perform DMRG optimization

Once the state projection MPS has been prepared, we perform DMRG optimization through forward and backward sweeps over the MPS tensors. For each site, we extract the site tensor, compute the corresponding environment tensor, and insert the environment tensor back into the MPS as the updated site tensor.
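
The reason the environment tensor serves as the updated site tensor (a step the sample does not spell out) is that the overlap is linear in the conjugated site tensor,

\[\langle \phi | U | 0 \cdots 0 \rangle = \sum_{\alpha, s, \beta} \bar{A}_i[\alpha, s, \beta]\, E_i[\alpha, s, \beta],\]

so over normalized site tensors it is maximized by choosing the new \(A_i\) proportional to \(E_i\).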

    // DMRG iterations
    const int32_t numIterations = 5;
    std::cout << "Starting DMRG iterations\n";

    for (int32_t iter = 0; iter < numIterations; ++iter)
    {
        std::cout << "DMRG iteration " << iter + 1 << "/" << numIterations << std::endl;

        // Forward sweep
        for (int32_t i = 0; i < numQubits; ++i)
        {
            // Environment bounds for site i
            cutensornetMPSEnvBounds_t envBounds;
            envBounds.lowerBound = i - 1;
            envBounds.upperBound = i + 1;

            // Extract the site tensor (orthogonalizes the MPS towards site i)
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSExtractTensor(
                cutnHandle, projectionMps, &envBounds, nullptr, d_envTensorsExtract[i], workDesc, 0x0));

            // Compute the environment tensor
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSComputeTensorEnv(cutnHandle, projectionMps, &envBounds, 0, 0,
                                                                            0, d_envTensorsInsert[i], 0,
                                                                            0, workDesc, 0x0));

            // Insert the environment tensor back as the updated site tensor
            cutensornetMPSEnvBounds_t orthoSpec = envBounds;
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSInsertTensor(cutnHandle, projectionMps, &envBounds,
                                                                        &orthoSpec, 0,
                                                                        d_envTensorsInsert[i], workDesc, 0x0));
        }

        // Backward sweep
        for (int32_t i = numQubits - 1; i >= 0; --i)
        {
            // Environment bounds for site i
            cutensornetMPSEnvBounds_t envBounds;
            envBounds.lowerBound = i - 1;
            envBounds.upperBound = i + 1;

            // Extract the site tensor (orthogonalizes the MPS towards site i)
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSExtractTensor(
                cutnHandle, projectionMps, &envBounds, nullptr, d_envTensorsExtract[i], workDesc, 0x0));

            // Compute the environment tensor
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSComputeTensorEnv(cutnHandle, projectionMps, &envBounds, 0, 0,
                                                                            0, d_envTensorsInsert[i], 0,
                                                                            0, workDesc, 0x0));

            // Insert the environment tensor back as the updated site tensor
            cutensornetMPSEnvBounds_t orthoSpec = envBounds;
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSInsertTensor(cutnHandle, projectionMps, &envBounds,
                                                                        &orthoSpec, 0,
                                                                        d_envTensorsInsert[i], workDesc, 0x0));
        }

        std::cout << "Completed DMRG iteration " << iter + 1 << std::endl;
    }

    std::cout << "DMRG optimization completed\n";

Free resources

    // Cleanup
    HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
    HANDLE_CUTN_ERROR(cutensornetDestroyStateProjectionMPS(projectionMps));
    HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState));

    // Free GPU buffers
    for (int32_t i = 0; i < numQubits; i++)
    {
        HANDLE_CUDA_ERROR(cudaFree(d_projMPSTensors[i]));
        HANDLE_CUDA_ERROR(cudaFree(d_envTensorsExtract[i]));
        HANDLE_CUDA_ERROR(cudaFree(d_envTensorsInsert[i]));
    }
    HANDLE_CUDA_ERROR(cudaFree(d_scratch));
    for (auto ptr : d_gateCR) HANDLE_CUDA_ERROR(cudaFree(ptr));
    HANDLE_CUDA_ERROR(cudaFree(d_gateH));
    std::cout << "Freed memory on GPU\n";

    // Finalize the cuTensorNet library
    HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle));
    std::cout << "Finalized the cuTensorNet library\n";

    return 0;
}

References

Ayral, Thomas, Thibaud Louvet, Yiqing Zhou, Cyprien Lambert, E. Miles Stoudenmire, and Xavier Waintal. 2023. “Density-Matrix Renormalization Group Algorithm for Simulating Quantum Circuits with a Finite Fidelity.” PRX Quantum 4 (April): 020304. https://doi.org/10.1103/PRXQuantum.4.020304.