Performing tensor SVD using cuTensorNet adopts a very similar workflow as QR example. Here, we highlight the notable differences between the two APIs. The full code can be found in the NVIDIA/cuQuantum repository (here).
Define SVD decomposition¶
As with QR decomposition, we first define the SVD decomposition to perform with the data type, modes partition, and the extents.
96 /******************************************************
97 * Tensor SVD: T_{i,j,m,n} -> U_{i,x,m} S_{x} V_{n,x,j}
98 *******************************************************/
99
100 typedef float floatType;
101 cudaDataType_t typeData = CUDA_R_32F;
102
103 // Create vector of modes
104 int32_t sharedMode = 'x';
105
106 std::vector<int32_t> modesT{'i','j','m','n'}; // input
107 std::vector<int32_t> modesU{'i', sharedMode,'m'};
108 std::vector<int32_t> modesV{'n', sharedMode,'j'}; // SVD output
109
110 // Extents
111 std::unordered_map<int32_t, int64_t> extentMap;
112 extentMap['i'] = 16;
113 extentMap['j'] = 16;
114 extentMap['m'] = 16;
115 extentMap['n'] = 16;
116
117 int64_t rowExtent = computeCombinedExtent(extentMap, modesU);
118 int64_t colExtent = computeCombinedExtent(extentMap, modesV);
119 // cuTensorNet tensor SVD operates in reduced mode expecting k <= min(m, n)
120 int64_t fullSharedExtent = rowExtent <= colExtent? rowExtent: colExtent;
121 const int64_t maxExtent = fullSharedExtent / 2; //fix extent truncation with half of the singular values trimmed out
122 extentMap[sharedMode] = maxExtent;
123
124 // Create a vector of extents for each tensor
125 std::vector<int64_t> extentT;
126 for (auto mode : modesT)
127 extentT.push_back(extentMap[mode]);
128 std::vector<int64_t> extentU;
129 for (auto mode : modesU)
130 extentU.push_back(extentMap[mode]);
131 std::vector<int64_t> extentV;
132 for (auto mode : modesV)
133 extentV.push_back(extentMap[mode]);
Note
To perform fixed extent truncation, we directly set maxExtent
to half of the full extent corresponding to exact SVD.
Setup SVD truncation parameters¶
Once the SVD decomposition is defined, we can follow the same workflow as QR example for data allocation and tensor descriptor initialization.
Before querying workspace, we can choose different SVD options in cutensornetTensorSVDConfig_t
.
Meanwhile, we can create cutensornetTensorSVDInfo_t
to keep track of runtime truncation information.
220 /**********************************************
221 * Setup SVD algorithm and truncation parameters
222 ***********************************************/
223
224 cutensornetTensorSVDConfig_t svdConfig;
225 HANDLE_ERROR( cutensornetCreateTensorSVDConfig(handle, &svdConfig) );
226
227 // set up truncation parameters
228 double absCutoff = 1e-2;
229 HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle,
230 svdConfig,
231 CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF,
232 &absCutoff,
233 sizeof(absCutoff)) );
234 double relCutoff = 4e-2;
235 HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle,
236 svdConfig,
237 CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF,
238 &relCutoff,
239 sizeof(relCutoff)) );
240
241 // optional: choose gesvdj algorithm with customized parameters. Default is gesvd.
242 cutensornetTensorSVDAlgo_t svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ;
243 HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle,
244 svdConfig,
245 CUTENSORNET_TENSOR_SVD_CONFIG_ALGO,
246 &svdAlgo,
247 sizeof(svdAlgo)) );
248 cutensornetGesvdjParams_t gesvdjParams{/*tol=*/1e-12, /*maxSweeps=*/80};
249 HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle,
250 svdConfig,
251 CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS,
252 &gesvdjParams,
253 sizeof(gesvdjParams)) );
254 printf("Set up SVDConfig to use GESVDJ algorithm with truncation\n");
255
256 /********************************************************
257 * Create SVDInfo to record runtime SVD truncation details
258 *********************************************************/
259
260 cutensornetTensorSVDInfo_t svdInfo;
261 HANDLE_ERROR( cutensornetCreateTensorSVDInfo(handle, &svdInfo)) ;
Execution¶
Next, we can query and allocate the workspace with cutensornetWorkspaceComputeSVDSizes()
, which is very similar to its QR counterpart.
At this stage, we can perform the SVD decomposition by calling cutensornetTensorSVD()
.
307 /**********
308 * Execution
309 ***********/
310
311 GPUTimer timer{stream};
312 double minTimeCUTENSOR = 1e100;
313 const int numRuns = 3; // to get stable perf results
314 for (int i=0; i < numRuns; ++i)
315 {
316 // restore output
317 cudaMemsetAsync(D_U, 0, sizeU, stream);
318 cudaMemsetAsync(D_S, 0, sizeS, stream);
319 cudaMemsetAsync(D_V, 0, sizeV, stream);
320 cudaDeviceSynchronize();
321
322 // With value-based truncation, `cutensornetTensorSVD` can potentially update the shared extent in descTensorU/V.
323 // We here restore descTensorU/V to the original problem.
324 HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) );
325 HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) );
326 HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) );
327 HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) );
328
329 timer.start();
330 HANDLE_ERROR( cutensornetTensorSVD(handle,
331 descTensorIn, D_T,
332 descTensorU, D_U,
333 D_S,
334 descTensorV, D_V,
335 svdConfig,
336 svdInfo,
337 workDesc,
338 stream) );
339 // Synchronize and measure timing
340 auto time = timer.seconds();
341 minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
342 }
343
344 printf("Performing SVD\n");
345
346 HANDLE_CUDA_ERROR( cudaMemcpyAsync(U, D_U, sizeU, cudaMemcpyDeviceToHost) );
347 HANDLE_CUDA_ERROR( cudaMemcpyAsync(S, D_S, sizeS, cudaMemcpyDeviceToHost) );
348 HANDLE_CUDA_ERROR( cudaMemcpyAsync(V, D_V, sizeV, cudaMemcpyDeviceToHost) );
Note
Since we turned on weighted truncation options in this example, we need to restore the tensor descriptors for U and V if we wish to perform the same computation multiple times.
After the computation, we still need to free up all resources.