Projection MPS Circuit DMRG¶
The following code example illustrates how to use the state projection MPS API to perform circuit simulation with DMRG optimization, following the method described by [Ayral et al., 2023]. The full code can be found in the NVIDIA/cuQuantum repository on GitHub.
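At a high level, circuit DMRG casts circuit simulation as a variational problem: rather than applying the gates one by one to an MPS and truncating after each gate, one searches for the MPS of fixed maximal bond dimension \(\chi\) that best approximates the circuit output. Schematically (a sketch of the method of [Ayral et al., 2023], not code from this sample),

\[ \max_{|\Phi\rangle \in \mathrm{MPS}(\chi)} \bigl| \langle \Phi | \hat U | \Psi_{\mathrm{in}} \rangle \bigr|^2 , \]

where \(\hat U\) is the circuit unitary and \(|\Psi_{\mathrm{in}}\rangle\) is the initial state. The optimization proceeds by sweeping over the MPS site tensors one at a time, which is what the state projection MPS API enables below.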
Headers, error handling, and helper functions¶
#include <cstdlib>
#include <cstdio>
#include <cassert>
#include <complex>
#include <cmath>
#include <algorithm>
#include <vector>
#include <limits>
#include <iostream>

#include <cuda_runtime.h>
#include <cutensornet.h>
#include <cublas_v2.h>

#define HANDLE_CUDA_ERROR(x) \
    { \
        const auto err = x; \
        if (err != cudaSuccess) \
        { \
            printf("CUDA error %s in line %d\n", cudaGetErrorString(err), __LINE__); \
            fflush(stdout); \
            std::abort(); \
        } \
    };

#define HANDLE_CUTN_ERROR(x) \
    { \
        const auto err = x; \
        if (err != CUTENSORNET_STATUS_SUCCESS) \
        { \
            printf("cuTensorNet error %s in line %d\n", cutensornetGetErrorString(err), __LINE__); \
            fflush(stdout); \
            std::abort(); \
        } \
    };

#define HANDLE_CUBLAS_ERROR(x) \
    { \
        const auto err = x; \
        if (err != CUBLAS_STATUS_SUCCESS) \
        { \
            printf("cuBLAS error (status code %d) in line %d\n", static_cast<int>(err), __LINE__); \
            fflush(stdout); \
            std::abort(); \
        } \
    };

// Helper function to compute maximum bond extents
std::vector<int64_t> getMaxBondExtents(const std::vector<int64_t>& stateModeExtents, int64_t maxdim = -1)
{
    const int32_t numQubits = stateModeExtents.size();
    std::vector<int64_t> cumprodLeft(numQubits), cumprodRight(numQubits);

    // Compute cumulative products from left and right
    cumprodLeft[0] = stateModeExtents[0];
    for (int32_t i = 1; i < numQubits; ++i)
    {
        cumprodLeft[i] = cumprodLeft[i - 1] * stateModeExtents[i];
    }

    cumprodRight[numQubits - 1] = stateModeExtents[numQubits - 1];
    for (int32_t i = numQubits - 2; i >= 0; --i)
    {
        cumprodRight[i] = cumprodRight[i + 1] * stateModeExtents[i];
    }

    std::vector<int64_t> maxBondExtents(numQubits - 1);
    for (int32_t i = 0; i < numQubits - 1; ++i)
    {
        int64_t minVal = std::min(cumprodLeft[i], cumprodRight[i + 1]);
        maxBondExtents[i] = (maxdim > 0) ? std::min(minVal, maxdim) : minVal;
    }

    return maxBondExtents;
}
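// Illustrative example (not used by the computation below): for a chain of
// 4 qubits with local dimension 2 and maxdim = 8, cumprodLeft = {2, 4, 8, 16}
// and cumprodRight = {16, 8, 4, 2}, so getMaxBondExtents returns {2, 4, 2}.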

int main()
{
    static_assert(sizeof(size_t) == sizeof(int64_t), "Please build this sample on a 64-bit architecture!");

    constexpr std::size_t fp64size = sizeof(double);
Define the tensor network state and quantum circuit configuration¶
Let’s define a tensor network state corresponding to a 16-qubit QFT quantum circuit.
    // Quantum state configuration
    const int32_t numQubits = 16;
    const std::vector<int64_t> qubitDims(numQubits, 2);
    std::cout << "DMRG quantum circuit with " << numQubits << " qubits\n";
Initialize the cuTensorNet library handle¶
    // Initialize the cuTensorNet library
    HANDLE_CUDA_ERROR(cudaSetDevice(0));
    cutensornetHandle_t cutnHandle;
    HANDLE_CUTN_ERROR(cutensornetCreate(&cutnHandle));
    std::cout << "Initialized cuTensorNet library on GPU 0\n";

    cublasHandle_t cublasHandle;
    HANDLE_CUBLAS_ERROR(cublasCreate(&cublasHandle));
Define quantum gates on GPU¶
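The two gate types used in this QFT circuit are the Hadamard gate and the controlled phase rotation CR(k); the flattened arrays defined below store the matrices

\[ H = \frac{1}{\sqrt{2}} \begin{pmatrix} 1 & 1 \\ 1 & -1 \end{pmatrix}, \qquad CR(k) = \mathrm{diag}\!\left(1,\,1,\,1,\,e^{\,i\pi/2^{k}}\right). \]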
    // Define necessary quantum gate tensors in Host memory
    const double invsq2 = 1.0 / std::sqrt(2.0);
    const double pi = 3.14159265358979323846;
    using GateData = std::vector<std::complex<double>>;

    // CR(k) gate generator
    auto cRGate = [&pi](int32_t k)
    {
        const double phi = pi / std::pow(2.0, k);
        const GateData cr{{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
                          {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},
                          {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0},
                          {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {std::cos(phi), std::sin(phi)}};
        return cr;
    };

    // Hadamard gate
    const GateData h_gateH{{invsq2, 0.0}, {invsq2, 0.0}, {invsq2, 0.0}, {-invsq2, 0.0}};

    // CR(k) gates (controlled rotations)
    std::vector<GateData> h_gateCR(numQubits);
    for (int32_t k = 0; k < numQubits; ++k)
    {
        h_gateCR[k] = cRGate(k + 1);
    }

    // Copy quantum gates into Device memory
    void* d_gateH{nullptr};
    std::vector<void*> d_gateCR(numQubits, nullptr);
    HANDLE_CUDA_ERROR(cudaMalloc(&d_gateH, 4 * (2 * fp64size)));
    for (int32_t k = 0; k < numQubits; ++k)
    {
        HANDLE_CUDA_ERROR(cudaMalloc(&(d_gateCR[k]), 16 * (2 * fp64size)));
    }

    HANDLE_CUDA_ERROR(cudaMemcpy(d_gateH, h_gateH.data(), 4 * (2 * fp64size), cudaMemcpyHostToDevice));
    for (int32_t k = 0; k < numQubits; ++k)
    {
        HANDLE_CUDA_ERROR(cudaMemcpy(d_gateCR[k], h_gateCR[k].data(), 16 * (2 * fp64size), cudaMemcpyHostToDevice));
    }
    std::cout << "Copied quantum gates into GPU memory\n";
Create initial quantum state and apply gates¶
Here we query the available GPU memory and allocate a scratch buffer for computations, then create the pure quantum state and apply the quantum gates to it.
    // Query free memory on Device and allocate a scratch buffer
    std::size_t freeSize{0}, totalSize{0};
    HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeSize, &totalSize));
    const std::size_t scratchSize = (freeSize - (freeSize % 4096)) / 2;
    void* d_scratch{nullptr};
    HANDLE_CUDA_ERROR(cudaMalloc(&d_scratch, scratchSize));
    std::cout << "Allocated " << scratchSize << " bytes of scratch memory on GPU\n";

    // Create the initial quantum state
    cutensornetState_t quantumState;
    HANDLE_CUTN_ERROR(cutensornetCreateState(cutnHandle, CUTENSORNET_STATE_PURITY_PURE, numQubits, qubitDims.data(),
                                             CUDA_C_64F, &quantumState));
    std::cout << "Created the initial quantum state\n";

    // Construct the quantum circuit state (apply quantum gates)
    int64_t id;
    for (int32_t i = 0; i < numQubits; ++i)
    {
        HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(
            cutnHandle, quantumState, 1, std::vector<int32_t>{{i}}.data(), d_gateH, nullptr, 1, 0, 1, &id));
        for (int32_t j = (i + 1); j < numQubits; ++j)
        {
            HANDLE_CUTN_ERROR(cutensornetStateApplyTensorOperator(cutnHandle, quantumState, 2,
                                                                  std::vector<int32_t>{{j, i}}.data(), d_gateCR[j - i],
                                                                  nullptr, 1, 0, 1, &id));
        }
    }
    std::cout << "Applied quantum gates\n";
Set up and specify the MPS representation of the output state¶
Now let’s set up the MPS representation of the output state for our 16-qubit quantum circuit.
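With qubit dimension 2 and a cap of maxExtent = 8, the getMaxBondExtents helper limits the bond between sites \(i\) and \(i+1\) (zero-based) to

\[ \chi_i = \min\!\left(2^{\,i+1},\; 2^{\,n-1-i},\; 8\right), \qquad n = 16, \]

so the bond dimensions grow as 2, 4, 8 from either boundary and saturate at 8 in the bulk of the chain.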
    // Define the MPS representation and allocate memory buffers for the MPS tensors
    const int64_t maxExtent = 8;
    std::vector<int64_t> dimExtents(numQubits, 2);
    std::vector<int64_t> maxBondExtents = getMaxBondExtents(dimExtents, maxExtent);

    std::vector<std::vector<int64_t>> projMPSTensorExtents;
    std::vector<const int64_t*> projMPSTensorExtentsPtr(numQubits);
    std::vector<void*> d_projMPSTensors(numQubits, nullptr);
    std::vector<int64_t> numElements(numQubits);
    std::vector<void*> d_envTensorsExtract(numQubits, nullptr);
    std::vector<void*> d_envTensorsInsert(numQubits, nullptr);
Initialize the MPS in the vacuum state¶
For simplicity and reproducibility, we initialize the MPS in the vacuum state. The allocation logic below also caps each bond dimension, since we require the MPS not to have overcomplete bonds.
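Concretely, the initialization loop sets the first element of every site tensor to one and all remaining elements to zero; contracting the resulting chain gives

\[ |\Psi_{\mathrm{init}}\rangle = |0\rangle^{\otimes 16}, \]

i.e. the vacuum state, independent of the allocated bond dimensions.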
    for (int32_t i = 0; i < numQubits; ++i)
    {
        if (i == 0)
        {
            // left boundary MPS tensor
            auto extent2 = std::min(maxBondExtents[i], maxExtent);
            projMPSTensorExtents.push_back({2, extent2});
            numElements[i] = 2 * extent2;
            HANDLE_CUDA_ERROR(cudaMalloc(&d_projMPSTensors[i], numElements[i] * (2 * fp64size)));
            //projMPSTensorStrides.push_back({1, 2});
        }
        else if (i == numQubits - 1)
        {
            // right boundary MPS tensor
            auto extent1 = std::min(maxBondExtents[i - 1], maxExtent);
            projMPSTensorExtents.push_back({extent1, 2});
            numElements[i] = extent1 * 2;
            HANDLE_CUDA_ERROR(cudaMalloc(&d_projMPSTensors[i], numElements[i] * (2 * fp64size)));
        }
        else
        {
            // middle MPS tensors
            auto extent1 = std::min(maxBondExtents[i - 1], maxExtent);
            auto extent3 = std::min(maxBondExtents[i], maxExtent);
            projMPSTensorExtents.push_back({extent1, 2, extent3});
            numElements[i] = extent1 * 2 * extent3;
            HANDLE_CUDA_ERROR(cudaMalloc(&d_projMPSTensors[i], numElements[i] * (2 * fp64size)));
        }
        projMPSTensorExtentsPtr[i] = projMPSTensorExtents[i].data();

        HANDLE_CUDA_ERROR(cudaMalloc(&d_envTensorsExtract[i], numElements[i] * (2 * fp64size)));
        HANDLE_CUDA_ERROR(cudaMalloc(&d_envTensorsInsert[i], numElements[i] * (2 * fp64size)));
    }

    // Initialize the vacuum state
    for (int32_t i = 0; i < numQubits; ++i)
    {
        std::vector<std::complex<double>> hostTensor(numElements[i]);
        for (int64_t j = 0; j < numElements[i]; ++j)
        {
            hostTensor[j] = {0.0, 0.0};
        }
        hostTensor[0] = {1.0, 0.0};
        HANDLE_CUDA_ERROR(
            cudaMemcpy(d_projMPSTensors[i], hostTensor.data(), numElements[i] * (2 * fp64size), cudaMemcpyHostToDevice));
    }
    std::cout << "Allocated GPU memory for projection MPS tensors\n";
Define MPS representation and allocate projection MPS tensors¶
Having allocated the projection MPS tensors and environment tensors above, we now gather the remaining inputs for the state projection MPS API: the circuit state with its coefficient, a single-site environment specification for each site, and the initial orthogonality specification.
    // Prepare state projection MPS
    std::vector<cutensornetState_t> states = {quantumState};
    std::vector<cuDoubleComplex> coefficients = {{1.0, 0.0}};

    // Environment specifications
    std::vector<cutensornetMPSEnvBounds_t> envSpecs;
    for (int32_t i = 0; i < numQubits; ++i)
    {
        cutensornetMPSEnvBounds_t spec;
        spec.lowerBound = i - 1;
        spec.upperBound = i + 1;
        envSpecs.push_back(spec);
    }

    cutensornetMPSEnvBounds_t initialOrthoSpec;
    initialOrthoSpec.lowerBound = -1;
    initialOrthoSpec.upperBound = numQubits;
Create state projection MPS¶
Now we create the cutensornetStateProjectionMPS_t object that will be used for DMRG optimization. Many MPS optimization algorithms, including circuit DMRG, make use of the gauge degree of freedom of the MPS and enforce orthogonality conditions on the MPS tensors, such that the optimized tensors constitute coefficients of the quantum state in an orthonormal basis defined by the other MPS tensors. By default, the cutensornetStateProjectionMPSExtractTensor API enforces these orthogonality conditions by orthogonalizing the MPS tensors towards the environment for which the extraction happens.
The following figure illustrates the main idea behind the state projection MPS API for the four-qubit case. The environment network is the initial MPS (left) contracted with the circuit gates (middle) and the target MPS (right), where the tensor to be variationally optimized is removed from the network. Contracting this network yields a single-site tensor, which we call the environment tensor. Note the following convention for indexing environment tensors: the single-site environment for tensor \(i\) is specified by the environment specification \((i-1, i+1)\).
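In formulas (a sketch consistent with the code below rather than a statement of the library's internals): denoting by \(E_i\) the environment tensor obtained by contracting the overlap network with the site-\(i\) tensor removed, each single-site update of a sweep replaces the site tensor by the normalized environment tensor,

\[ A_i \;\leftarrow\; \frac{E_i}{\lVert E_i \rVert_2}. \]

The norm \(\lVert E_i \rVert_2\) is computed with cublasDznrm2 and applied with cublasZscal in the DMRG loop below, where it appears as the partial-fidelity scaling factor f_tau_sqrt.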
    // Create state projection MPS
    cutensornetStateProjectionMPS_t projectionMps;
    HANDLE_CUTN_ERROR(cutensornetCreateStateProjectionMPS(
        cutnHandle, states.size(), states.data(), coefficients.data(), false, envSpecs.size(), envSpecs.data(),
        CUTENSORNET_BOUNDARY_CONDITION_OPEN, numQubits, 0, projMPSTensorExtentsPtr.data(), 0,
        d_projMPSTensors.data(), &initialOrthoSpec, &projectionMps));
    std::cout << "Created state projection MPS\n";

    // Prepare the state projection MPS computation
    cutensornetWorkspaceDescriptor_t workDesc;
    HANDLE_CUTN_ERROR(cutensornetCreateWorkspaceDescriptor(cutnHandle, &workDesc));
    HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSPrepare(cutnHandle, projectionMps, scratchSize, workDesc, 0x0));

    int64_t worksize{0};
    HANDLE_CUTN_ERROR(cutensornetWorkspaceGetMemorySize(cutnHandle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
                                                        CUTENSORNET_MEMSPACE_DEVICE, CUTENSORNET_WORKSPACE_SCRATCH,
                                                        &worksize));
    std::cout << "Workspace size for MPS projection = " << worksize << " bytes\n";

    if (worksize <= scratchSize)
    {
        HANDLE_CUTN_ERROR(cutensornetWorkspaceSetMemory(cutnHandle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
                                                        CUTENSORNET_WORKSPACE_SCRATCH, d_scratch, worksize));
    }
    else
    {
        std::cout << "ERROR: Insufficient workspace size on Device!\n";

        // Cleanup
        HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
        HANDLE_CUTN_ERROR(cutensornetDestroyStateProjectionMPS(projectionMps));
        HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState));

        // Free GPU buffers
        for (int32_t i = 0; i < numQubits; i++)
        {
            HANDLE_CUDA_ERROR(cudaFree(d_projMPSTensors[i]));
            HANDLE_CUDA_ERROR(cudaFree(d_envTensorsExtract[i]));
            HANDLE_CUDA_ERROR(cudaFree(d_envTensorsInsert[i]));
        }
        HANDLE_CUDA_ERROR(cudaFree(d_scratch));
        for (auto ptr : d_gateCR) HANDLE_CUDA_ERROR(cudaFree(ptr));
        HANDLE_CUDA_ERROR(cudaFree(d_gateH));
        std::cout << "Freed memory on GPU\n";

        // Finalize the cuTensorNet library
        HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle));

        std::abort();
    }
Perform DMRG optimization¶
Once the state projection MPS has been prepared, we run the DMRG optimization as a series of forward and backward sweeps over the MPS. At each site, we extract the site tensor, compute the corresponding environment tensor, normalize it, and insert the resulting tensor back into the MPS.
    // DMRG iterations
    const int32_t numIterations = 5;
    std::cout << "Starting DMRG iterations\n";

    for (int32_t iter = 0; iter < numIterations; ++iter)
    {
        std::cout << "DMRG iteration " << iter + 1 << "/" << numIterations << std::endl;

        // Forward sweep
        for (int32_t i = 0; i < numQubits; ++i)
        {
            // Environment bounds for site i
            cutensornetMPSEnvBounds_t envBounds;
            envBounds.lowerBound = i - 1;
            envBounds.upperBound = i + 1;

            // Extract site tensor
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSExtractTensor(
                cutnHandle, projectionMps, &envBounds, nullptr, d_envTensorsExtract[i], workDesc, 0x0));

            // Compute environment tensor
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSComputeTensorEnv(cutnHandle, projectionMps, &envBounds, 0, 0,
                                                                            0, d_envTensorsInsert[i], 0,
                                                                            0, workDesc, 0x0));

            // Apply partial-fidelity scaling factor to the environment tensor
            double f_tau_sqrt;
            HANDLE_CUBLAS_ERROR(cublasDznrm2(cublasHandle, numElements[i],
                                             static_cast<const cuDoubleComplex*>(d_envTensorsInsert[i]), 1,
                                             &f_tau_sqrt));
            HANDLE_CUDA_ERROR(cudaStreamSynchronize(0));

            if (f_tau_sqrt < std::sqrt(std::numeric_limits<double>::epsilon()))
            {
                std::cout << "ERROR: Scaling factor is zero!\n";
                std::abort();
            }
            cuDoubleComplex scaling_factor = {1.0 / f_tau_sqrt, 0.0};

            HANDLE_CUBLAS_ERROR(cublasZscal(cublasHandle, numElements[i],
                                            &scaling_factor,
                                            static_cast<cuDoubleComplex*>(d_envTensorsInsert[i]), 1));

            // Insert updated tensor
            cutensornetMPSEnvBounds_t orthoSpec = envBounds;
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSInsertTensor(cutnHandle, projectionMps, &envBounds,
                                                                        &orthoSpec, 0,
                                                                        d_envTensorsInsert[i], workDesc, 0x0));
        }

        // Backward sweep
        for (int32_t i = numQubits - 1; i >= 0; --i)
        {
            // Environment bounds for site i
            cutensornetMPSEnvBounds_t envBounds;
            envBounds.lowerBound = i - 1;
            envBounds.upperBound = i + 1;

            // Extract site tensor
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSExtractTensor(
                cutnHandle, projectionMps, &envBounds, 0, d_envTensorsExtract[i], workDesc, 0x0));

            // Compute environment tensor
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSComputeTensorEnv(cutnHandle, projectionMps, &envBounds, 0, 0,
                                                                            0, d_envTensorsInsert[i], 0,
                                                                            0, workDesc, 0x0));

            // Apply partial-fidelity scaling factor to the environment tensor
            double f_tau_sqrt;
            HANDLE_CUBLAS_ERROR(cublasDznrm2(cublasHandle, numElements[i],
                                             static_cast<const cuDoubleComplex*>(d_envTensorsInsert[i]), 1,
                                             &f_tau_sqrt));
            HANDLE_CUDA_ERROR(cudaStreamSynchronize(0));

            if (f_tau_sqrt < std::sqrt(std::numeric_limits<double>::epsilon()))
            {
                std::cout << "ERROR: Scaling factor is zero!\n";
                std::abort();
            }
            cuDoubleComplex scaling_factor = {1.0 / f_tau_sqrt, 0.0};

            HANDLE_CUBLAS_ERROR(cublasZscal(cublasHandle, numElements[i],
                                            &scaling_factor,
                                            static_cast<cuDoubleComplex*>(d_envTensorsInsert[i]), 1));

            // Insert updated tensor
            cutensornetMPSEnvBounds_t orthoSpec = envBounds;
            HANDLE_CUTN_ERROR(cutensornetStateProjectionMPSInsertTensor(cutnHandle, projectionMps, &envBounds,
                                                                        &orthoSpec, 0,
                                                                        d_envTensorsInsert[i], workDesc, 0x0));
        }

        std::cout << "Completed DMRG iteration " << iter + 1 << std::endl;
    }

    std::cout << "DMRG optimization completed\n";
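A possible extension of the loop above (hypothetical, not part of this sample): since each update rescales the environment tensor by 1 / f_tau_sqrt, one could collect these per-site factors during a sweep and combine them into a single fidelity-like figure of merit, in the spirit of the partial fidelities of [Ayral et al., 2023]. A minimal sketch, assuming the factors have been stored in a std::vector<double>:

#include <cmath>
#include <vector>

// Hypothetical helper (not part of the sample): combine the per-site scaling
// factors f_tau_sqrt collected during one DMRG sweep into a product of the
// squared factors. The interpretation as a fidelity estimate follows the
// naming used in the sample; the exact normalization may differ.
double sweepFidelityEstimate(const std::vector<double>& fTauSqrtPerSite)
{
    double logF = 0.0;
    for (double fSqrt : fTauSqrtPerSite)
    {
        logF += 2.0 * std::log(fSqrt); // f_tau = f_tau_sqrt * f_tau_sqrt
    }
    return std::exp(logF); // product of all f_tau over the sweep
}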
Free resources¶
    // Cleanup
    HANDLE_CUTN_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
    HANDLE_CUTN_ERROR(cutensornetDestroyStateProjectionMPS(projectionMps));
    HANDLE_CUTN_ERROR(cutensornetDestroyState(quantumState));

    // Free GPU buffers
    for (int32_t i = 0; i < numQubits; i++)
    {
        HANDLE_CUDA_ERROR(cudaFree(d_projMPSTensors[i]));
        HANDLE_CUDA_ERROR(cudaFree(d_envTensorsExtract[i]));
        HANDLE_CUDA_ERROR(cudaFree(d_envTensorsInsert[i]));
    }
    HANDLE_CUDA_ERROR(cudaFree(d_scratch));
    for (auto ptr : d_gateCR) HANDLE_CUDA_ERROR(cudaFree(ptr));
    HANDLE_CUDA_ERROR(cudaFree(d_gateH));
    std::cout << "Freed memory on GPU\n";

    HANDLE_CUBLAS_ERROR(cublasDestroy(cublasHandle));

    // Finalize the cuTensorNet library
    HANDLE_CUTN_ERROR(cutensornetDestroy(cutnHandle));
    std::cout << "Finalized the cuTensorNet library\n";

    return 0;
}
References¶
Ayral, Thomas, Thibaud Louvet, Yiqing Zhou, Cyprien Lambert, E. Miles Stoudenmire, and Xavier Waintal. 2023. “Density-Matrix Renormalization Group Algorithm for Simulating Quantum Circuits with a Finite Fidelity.” PRX Quantum 4 (April): 020304. https://doi.org/10.1103/PRXQuantum.4.020304.