Projection MPS Circuit DMRG
The following code example illustrates how to use the state projection MPS API to perform circuit simulation using DMRG optimization, following the method described by [Ayral et al., 2023]. The full code can be found in the NVIDIA/cuQuantum repository.
Headers, error handling, and helper functions
#include <cstdlib>
#include <cstdio>
#include <cassert>
#include <complex>
#include <cmath>
#include <vector>
#include <iostream>
#include <algorithm> // for std::min

#include <cuda_runtime.h>
#include <cutensornet.h>

#define HANDLE_CUDA_ERROR(x) \
    { \
        const auto err = x; \
        if (err != cudaSuccess) \
        { \
            printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); \
            fflush(stdout); \
            std::abort(); \
        } \
    };

#define HANDLE_CUTN_ERROR(x) \
    { \
        const auto err = x; \
        if (err != CUTENSORNET_STATUS_SUCCESS) \
        { \
            printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
            fflush(stdout); \
            std::abort(); \
        } \
    };

// Helper function to compute maximum bond extents
std::vector<int64_t> getMaxBondExtents(const std::vector<int64_t>& stateModeExtents, int64_t maxdim = -1)
{
    const int32_t numQubits = stateModeExtents.size();
    std::vector<int64_t> cumprodLeft(numQubits), cumprodRight(numQubits);

    // Compute cumulative products from left and right
    cumprodLeft[0] = stateModeExtents[0];
    for (int32_t i = 1; i < numQubits; ++i)
    {
        cumprodLeft[i] = cumprodLeft[i - 1] * stateModeExtents[i];
    }

    cumprodRight[numQubits - 1] = stateModeExtents[numQubits - 1];
    for (int32_t i = numQubits - 2; i >= 0; --i)
    {
        cumprodRight[i] = cumprodRight[i + 1] * stateModeExtents[i];
    }

    std::vector<int64_t> maxBondExtents(numQubits - 1);
    for (int32_t i = 0; i < numQubits - 1; ++i)
    {
        int64_t minVal = std::min(cumprodLeft[i], cumprodRight[i + 1]);
        maxBondExtents[i] = (maxdim > 0) ? std::min(minVal, maxdim) : minVal;
    }

    return maxBondExtents;
}
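// For illustration: with the values used later in this sample (numQubits = 16,
// all qubit extents equal to 2, maxdim = 8), the helper above returns
//   maxBondExtents = {2, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 2},
// i.e. each bond is capped by both the exact Schmidt-rank limit (the smaller of the
// left and right subsystem dimensions) and the requested maximum bond dimension.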

int main()
{
    static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

    constexpr std::size_t fp64size = sizeof(double);
Define the tensor network state and quantum circuit configuration
Let's define a tensor network state corresponding to a 16-qubit quantum Fourier transform (QFT) circuit.
    // Quantum state configuration
    const int32_t numQubits = 16;
    const std::vector<int64_t> qubitDims(numQubits, 2);
    std::cout << "DMRG quantum circuit with " << numQubits << " qubits\n";
Initialize the cuTensorNet library handle
    // Initialize the cuTensorNet library
    HANDLE_CUDA_ERROR(cudaSetDevice(0));
    cutensornetHandle_t cutnHandle;
    HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle));
    std::cout << "Initialized cuTensorNet library on GPU 0\n";
Define quantum gates on GPU
    // Define necessary quantum gate tensors in Host memory
    const double invsq2 = 1.0 / std::sqrt(2.0);
    const double pi = 3.14159265358979323846;
    using GateData = std::vector<std::complex<double>>;

    // CR(k) gate generator
    auto cRGate = [&pi](int32_t k)
    {
        const double phi = pi / std::pow(2.0, k);
        const GateData cr{{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
                          {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
                          {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0},
                          {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {std::cos(phi), std::sin(phi)}};
        return cr;
    };

    // Hadamard gate
    const GateData h_gateH{{invsq2, 0.0}, {invsq2, 0.0}, {invsq2, 0.0}, {-invsq2, 0.0}};

    // CR(k) gates (controlled rotations)
    std::vector<GateData> h_gateCR(numQubits);
    for (int32_t k = 0; k < numQubits; ++k)
    {
        h_gateCR[k] = cRGate(k + 1);
    }

    // Copy quantum gates into Device memory
    void* d_gateH{nullptr};
    std::vector<void*> d_gateCR(numQubits, nullptr);
    HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size)));
    for (int32_t k = 0; k < numQubits; ++k)
    {
        HANDLE_CUDA_ERROR(cudaMalloc(&(d_gateCR[k]), 16 * (2 * fp64size)));
    }

    HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice));
    for (int32_t k = 0; k < numQubits; ++k)
    {
        HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCR[k], h_gateCR[k].data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice));
    }
    std::cout << "Copied quantum gates into GPU memory\n";
Create initial quantum state and apply gates
Here we query the available GPU memory and allocate a scratch buffer for computations. We then create the pure quantum state and apply the quantum gates to it.
    // Query free memory on Device and allocate a scratch buffer
    std::size_t freeSize{0}, totalSize{0};
    HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
    const std::size_t scratchSize = (freeSize - (freeSize % 4096)) / 2;
    void* d_scratch{nullptr};
    HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize));
    std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU\n";

    // Create the initial quantum state
    cutensornetState_t quantumState;
    HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(),
                                             CUDA_C_64F, &quantumState));
    std::cout << "Created the initial quantum state\n";

    // Construct the quantum circuit state (apply quantum gates)
    int64_t id;
    for (int32_t i = 0; i < numQubits; ++i)
    {
        HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(
            cutnHandle, quantumState, 1, std::vector<int32_t>{{i}}.data(), d_gateH, nullptr, 1, 0, 1, &id));
        for (int32_t j = (i + 1); j < numQubits; ++j)
        {
            HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 2,
                                                                  std::vector<int32_t>{{j, i}}.data(), d_gateCR[j - i],
                                                                  nullptr, 1, 0, 1, &id));
        }
    }
    std::cout << "Applied quantum gates\n";
Set up and specify the MPS representation of the output state
Now let’s set up the MPS representation of the output state for our 16-qubit quantum circuit.
    // Define the MPS representation and allocate memory buffers for the MPS tensors
    const int64_t maxExtent = 8;
    std::vector<int64_t> dimExtents(numQubits, 2);
    std::vector<int64_t> maxBondExtents = getMaxBondExtents(dimExtents, maxExtent);

    std::vector<std::vector<int64_t>> projMPSTensorExtents;
    std::vector<const int64_t*> projMPSTensorExtentsPtr(numQubits);
    std::vector<void*> d_projMPSTensors(numQubits, nullptr);
    std::vector<int64_t> numElements(numQubits);
    std::vector<void*> d_envTensorsExtract(numQubits, nullptr);
    std::vector<void*> d_envTensorsInsert(numQubits, nullptr);
Initialize the MPS in the vacuum state
For simplicity and reproducibility, we initialize the MPS in the vacuum state. The bond extents are capped so that the MPS does not have overcomplete bond dimensions (see the concrete extents listed after the code below).
    for (int32_t i = 0; i < numQubits; ++i)
    {
        if (i == 0)
        {
            // left boundary MPS tensor
            auto extent2 = std::min(maxBondExtents[i], maxExtent);
            projMPSTensorExtents.push_back({2, extent2});
            numElements[i] = 2 * extent2;
            HANDLE_CUDA_ERROR(cudaMalloc(&d_projMPSTensors[i], numElements[i] * (2 * fp64size)));
        }
        else if (i == numQubits - 1)
        {
            // right boundary MPS tensor
            auto extent1 = std::min(maxBondExtents[i - 1], maxExtent);
            projMPSTensorExtents.push_back({extent1, 2});
            numElements[i] = extent1 * 2;
            HANDLE_CUDA_ERROR(cudaMalloc(&d_projMPSTensors[i], numElements[i] * (2 * fp64size)));
        }
        else
        {
            // middle MPS tensors
            auto extent1 = std::min(maxBondExtents[i - 1], maxExtent);
            auto extent3 = std::min(maxBondExtents[i], maxExtent);
            projMPSTensorExtents.push_back({extent1, 2, extent3});
            numElements[i] = extent1 * 2 * extent3;
            HANDLE_CUDA_ERROR(cudaMalloc(&d_projMPSTensors[i], numElements[i] * (2 * fp64size)));
        }
        projMPSTensorExtentsPtr[i] = projMPSTensorExtents[i].data();

        HANDLE_CUDA_ERROR(cudaMalloc(&d_envTensorsExtract[i], numElements[i] * (2 * fp64size)));
        HANDLE_CUDA_ERROR(cudaMalloc(&d_envTensorsInsert[i], numElements[i] * (2 * fp64size)));
    }

    // Initialize the vacuum state
    for (int32_t i = 0; i < numQubits; ++i)
    {
        std::vector<std::complex<double>> hostTensor(numElements[i]);
        for (int64_t j = 0; j < numElements[i]; ++j)
        {
            hostTensor[j] = {0.0, 0.0};
        }
        hostTensor[0] = {1.0, 0.0};
        HANDLE_CUDA_ERROR(
            cudaMemcpy(d_projMPSTensors[i], hostTensor.data(), numElements[i] * (2 * fp64size), cudaMemcpyHostToDevice));
    }
    std::cout << "Allocated GPU memory for projection MPS tensors\n";
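As a concrete illustration of the bond-dimension capping above: with numQubits = 16 and maxExtent = 8, the projection MPS tensors created by this loop have extents {2, 2}, {2, 2, 4}, {4, 2, 8}, {8, 2, 8}, …, {8, 2, 8}, {8, 2, 4}, {4, 2, 2}, {2, 2}, so the bond dimensions grow away from the boundaries only as fast as the exact Schmidt rank allows and never exceed the requested maximum of 8.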
Specify the input state and environment bounds for the projection MPS
Here we specify the circuit state to be approximated (with unit coefficient), the single-site environment specification for each site, and the initial orthogonality specification that will be used when creating the state projection MPS in the next step.
    // Prepare state projection MPS
    std::vector<cutensornetState_t> states = {quantumState};
    std::vector<cuDoubleComplex> coefficients = {{1.0, 0.0}};

    // Environment specifications
    std::vector<cutensornetMPSEnvBounds_t> envSpecs;
    for (int32_t i = 0; i < numQubits; ++i)
    {
        cutensornetMPSEnvBounds_t spec;
        spec.lowerBound = i - 1;
        spec.upperBound = i + 1;
        envSpecs.push_back(spec);
    }

    cutensornetMPSEnvBounds_t initialOrthoSpec;
    initialOrthoSpec.lowerBound = -1;
    initialOrthoSpec.upperBound = numQubits;
Create state projection MPS
Now we create the cutensornetStateProjectionMPS_t object that will be used for the DMRG optimization.
Many MPS optimization algorithms, including circuit DMRG, make use of the gauge degree of freedom of the MPS and enforce orthogonality conditions on the MPS tensors, such that the optimized tensors constitute coefficients of the quantum state in an orthonormal basis defined by the other MPS tensors.
By default, the cutensornetStateProjectionMPSExtractTensor API enforces these orthogonality conditions by orthogonalizing the MPS tensors towards the environment for which the extraction happens.
The following figure illustrates the main idea behind the state projection MPS API for the four-qubit case.
The environment network consists of the initial MPS (left) contracted with the circuit gates (middle) and the target MPS (right), with the tensor to be variationally optimized removed from the network. Contracting this network yields a single-site tensor, which we will call the environment tensor. Note that we use the following convention for indexing the environment tensors: the single-site environment for tensor \(i\) is specified by the environment specification \((i-1, i+1)\).
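For example, in the four-qubit case shown in the figure, the single-site environment specifications (lowerBound, upperBound) for sites 0 through 3 are (-1, 1), (0, 2), (1, 3), and (2, 4), which is exactly how the envSpecs vector is populated in the code above.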
    // Create state projection MPS
    cutensornetStateProjectionMPS_t projectionMps;
    HANDLE_CUTN_ERROR(cutensornetCreateStateProjectionMPS(
        cutnHandle, states.size(), states.data(), coefficients.data(), false, envSpecs.size(), envSpecs.data(),
        CUTENSORNET_BOUNDARY_CONDITION_OPEN, numQubits, 0, projMPSTensorExtentsPtr.data(), 0,
        d_projMPSTensors.data(), &initialOrthoSpec, &projectionMps));
    std::cout << "Created state projection MPS\n";

    // Prepare the state projection MPS computation
    cutensornetWorkspaceDescriptor_t workDesc;
    HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc));
    HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSPrepare(cutnHandle, projectionMps, scratchSize, workDesc, 0x0));

    int64_t worksize{0};
    HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
                                                        CUTENSORNET_MEMSPACE_DEVICE, CUTENSORNET_WORKSPACE_SCRATCH,
                                                        &worksize));
    std::cout << "Workspace size for MPS projection = " << worksize << " bytes\n";

    if (worksize <= scratchSize)
    {
        HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
                                                        CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
    }
    else
    {
        std::cout << "ERROR: Insufficient workspace size on Device!\n";

        // Cleanup
        HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
        HANDLE_CUTN_ERROR(cutensornetDestroyStateProjectionMPS(projectionMps));
        HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState));

        // Free GPU buffers
        for (int32_t i = 0; i < numQubits; i++)
        {
            HANDLE_CUDA_ERROR(cudaFree(d_projMPSTensors[i]));
            HANDLE_CUDA_ERROR(cudaFree(d_envTensorsExtract[i]));
            HANDLE_CUDA_ERROR(cudaFree(d_envTensorsInsert[i]));
        }
        HANDLE_CUDA_ERROR(cudaFree(d_scratch));
        for (auto ptr : d_gateCR) HANDLE_CUDA_ERROR(cudaFree(ptr));
        HANDLE_CUDA_ERROR(cudaFree(d_gateH));
        std::cout << "Freed memory on GPU\n";

        // Finalize the cuTensorNet library
        HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle));

        std::abort();
    }
Perform DMRG optimization
Once the state projection MPS has been prepared, we perform the DMRG optimization through forward and backward sweeps over the sites. At each site, we extract the current site tensor, compute the corresponding single-site environment tensor, and insert the result back into the MPS as the updated site tensor.
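Schematically, the reason this update works is that the overlap \(\mathcal{F} = \langle \Psi_{\mathrm{MPS}} \vert \hat{C} \vert 0 \rangle\) between the MPS and the circuit state is a linear function of the complex conjugate of each individual site tensor (all others held fixed and orthogonalized towards that site), so maximizing \(|\mathcal{F}|\) over one site tensor yields, up to normalization, exactly the corresponding single-site environment tensor. This is why the computed environment tensor can be inserted back directly, and it is the essence of the circuit DMRG method of [Ayral et al., 2023].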
    // DMRG iterations
    const int32_t numIterations = 5;
    std::cout << "Starting DMRG iterations\n";

    for (int32_t iter = 0; iter < numIterations; ++iter)
    {
        std::cout << "DMRG iteration " << iter + 1 << "/" << numIterations << std::endl;

        // Forward sweep
        for (int32_t i = 0; i < numQubits; ++i)
        {
            // Environment bounds for site i
            cutensornetMPSEnvBounds_t envBounds;
            envBounds.lowerBound = i - 1;
            envBounds.upperBound = i + 1;

            // Extract site tensor
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSExtractTensor(
                cutnHandle, projectionMps, &envBounds, nullptr, d_envTensorsExtract[i], workDesc, 0x0));

            // Compute environment tensor
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSComputeTensorEnv(cutnHandle, projectionMps, &envBounds, 0, 0,
                                                                            0, d_envTensorsInsert[i], 0,
                                                                            0, workDesc, 0x0));

            // Insert updated tensor
            cutensornetMPSEnvBounds_t orthoSpec = envBounds;
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSInsertTensor(cutnHandle, projectionMps, &envBounds,
                                                                        &orthoSpec, 0,
                                                                        d_envTensorsInsert[i], workDesc, 0x0));
        }

        // Backward sweep
        for (int32_t i = numQubits - 1; i >= 0; --i)
        {
            // Environment bounds for site i
            cutensornetMPSEnvBounds_t envBounds;
            envBounds.lowerBound = i - 1;
            envBounds.upperBound = i + 1;

            // Extract site tensor
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSExtractTensor(
                cutnHandle, projectionMps, &envBounds, 0, d_envTensorsExtract[i], workDesc, 0x0));

            // Compute environment tensor
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSComputeTensorEnv(cutnHandle, projectionMps, &envBounds, 0, 0,
                                                                            0, d_envTensorsInsert[i], 0,
                                                                            0, workDesc, 0x0));

            // Insert updated tensor
            cutensornetMPSEnvBounds_t orthoSpec = envBounds;
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSInsertTensor(cutnHandle, projectionMps, &envBounds,
                                                                        &orthoSpec, 0,
                                                                        d_envTensorsInsert[i], workDesc, 0x0));
        }

        std::cout << "Completed DMRG iteration " << iter + 1 << std::endl;
    }

    std::cout << "DMRG optimization completed\n";
Free resources
    // Cleanup
    HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
    HANDLE_CUTN_ERROR(cutensornetDestroyStateProjectionMPS(projectionMps));
    HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState));

    // Free GPU buffers
    for (int32_t i = 0; i < numQubits; i++)
    {
        HANDLE_CUDA_ERROR(cudaFree(d_projMPSTensors[i]));
        HANDLE_CUDA_ERROR(cudaFree(d_envTensorsExtract[i]));
        HANDLE_CUDA_ERROR(cudaFree(d_envTensorsInsert[i]));
    }
    HANDLE_CUDA_ERROR(cudaFree(d_scratch));
    for (auto ptr : d_gateCR) HANDLE_CUDA_ERROR(cudaFree(ptr));
    HANDLE_CUDA_ERROR(cudaFree(d_gateH));
    std::cout << "Freed memory on GPU\n";

    // Finalize the cuTensorNet library
    HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle));
    std::cout << "Finalized the cuTensorNet library\n";

    return 0;
}
References
Ayral, Thomas, Thibaud Louvet, Yiqing Zhou, Cyprien Lambert, E. Miles Stoudenmire, and Xavier Waintal. 2023. “Density-Matrix Renormalization Group Algorithm for Simulating Quantum Circuits with a Finite Fidelity.” PRX Quantum 4 (April): 020304. https://doi.org/10.1103/PRXQuantum.4.020304.