Performing tensor SVD using cuTensorNet follows a workflow very similar to that of the QR example. Here we highlight the notable differences between the two APIs. The full code can be found in the NVIDIA/cuQuantum repository.
Define SVD decomposition
As with QR decomposition, we first define the SVD decomposition to perform by specifying the data type, the mode partition, and the extents.
/******************************************************
* Tensor SVD: T_{i,j,m,n} -> U_{i,x,m} S_{x} V_{n,x,j}
*******************************************************/

typedef float floatType;
cudaDataType_t typeData = CUDA_R_32F;

// Create vector of modes
int32_t sharedMode = 'x';

std::vector<int32_t> modesT{'i','j','m','n'}; // input
std::vector<int32_t> modesU{'i', sharedMode, 'm'}; // SVD output
std::vector<int32_t> modesV{'n', sharedMode, 'j'}; // SVD output

// Extents
std::unordered_map<int32_t, int64_t> extentMap;
extentMap['i'] = 16;
extentMap['j'] = 16;
extentMap['m'] = 16;
extentMap['n'] = 16;

int64_t rowExtent = computeCombinedExtent(extentMap, modesU);
int64_t colExtent = computeCombinedExtent(extentMap, modesV);
// cuTensorNet tensor SVD operates in reduced mode, expecting k <= min(m, n)
int64_t fullSharedExtent = rowExtent <= colExtent ? rowExtent : colExtent;
const int64_t maxExtent = fullSharedExtent / 2; // fixed-extent truncation: trim out half of the singular values
extentMap[sharedMode] = maxExtent;

// Create a vector of extents for each tensor
std::vector<int64_t> extentT;
for (auto mode : modesT)
   extentT.push_back(extentMap[mode]);
std::vector<int64_t> extentU;
for (auto mode : modesU)
   extentU.push_back(extentMap[mode]);
std::vector<int64_t> extentV;
for (auto mode : modesV)
   extentV.push_back(extentMap[mode]);
Note
To perform fixed-extent truncation, we directly set maxExtent to half of the full shared extent that an exact SVD would produce.
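The snippet above relies on the helper computeCombinedExtent() defined in the sample's shared header. Below is a minimal sketch of such a helper, assuming it simply multiplies the extents of all modes already present in the map; the shipped implementation may differ.

// Hypothetical sketch of computeCombinedExtent(): multiply the extents of all
// modes that already have an entry in the extent map; modes without an entry
// (e.g., the not-yet-sized shared mode) are skipped.
int64_t computeCombinedExtent(const std::unordered_map<int32_t, int64_t>& extentMap,
                              const std::vector<int32_t>& modes)
{
   int64_t combinedExtent{1};
   for (auto mode : modes)
   {
      auto it = extentMap.find(mode);
      if (it != extentMap.end())
         combinedExtent *= it->second;
   }
   return combinedExtent;
}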
Set up SVD truncation parameters
Once the SVD decomposition is defined, we can follow the same workflow as in the QR example for data allocation and tensor descriptor initialization. Before querying the workspace size, we can choose among different SVD options via cutensornetTensorSVDConfig_t. Meanwhile, we can create a cutensornetTensorSVDInfo_t object to keep track of runtime truncation information.
/**********************************************
* Setup SVD algorithm and truncation parameters
***********************************************/

cutensornetTensorSVDConfig_t svdConfig;
HANDLE_ERROR( cutensornetCreateTensorSVDConfig(handle, &svdConfig) );

// Set up truncation parameters
double absCutoff = 1e-2;
HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle,
                  svdConfig,
                  CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF,
                  &absCutoff,
                  sizeof(absCutoff)) );
double relCutoff = 4e-2;
HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle,
                  svdConfig,
                  CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF,
                  &relCutoff,
                  sizeof(relCutoff)) );

// Optional: choose the gesvdj algorithm with customized parameters. Default is gesvd.
cutensornetTensorSVDAlgo_t svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ;
HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle,
                  svdConfig,
                  CUTENSORNET_TENSOR_SVD_CONFIG_ALGO,
                  &svdAlgo,
                  sizeof(svdAlgo)) );
cutensornetGesvdjParams_t gesvdjParams{/*tol=*/1e-12, /*maxSweeps=*/80};
HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle,
                  svdConfig,
                  CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS,
                  &gesvdjParams,
                  sizeof(gesvdjParams)) );
printf("Set up SVDConfig to use GESVDJ algorithm with truncation\n");

/********************************************************
* Create SVDInfo to record runtime SVD truncation details
*********************************************************/

cutensornetTensorSVDInfo_t svdInfo;
HANDLE_ERROR( cutensornetCreateTensorSVDInfo(handle, &svdInfo) );
Execution
Next, we can query and allocate the workspace with cutensornetWorkspaceComputeSVDSizes(), which is very similar to its QR counterpart. At this stage, we can perform the SVD decomposition by calling cutensornetTensorSVD().
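The workspace step itself is elided from the excerpt below; the following is a minimal sketch of what it might look like, assuming a device scratch buffer is sufficient (host workspace would be handled analogously).

/******************************************
* Query and set workspace (sketch, assumed)
*******************************************/

cutensornetWorkspaceDescriptor_t workDesc;
HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
// Compute the workspace sizes needed for this SVD problem
HANDLE_ERROR( cutensornetWorkspaceComputeSVDSizes(handle, descTensorIn, descTensorU, descTensorV, svdConfig, workDesc) );
int64_t deviceWorkspaceSize{0};
HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                  workDesc,
                  CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
                  CUTENSORNET_MEMSPACE_DEVICE,
                  CUTENSORNET_WORKSPACE_SCRATCH,
                  &deviceWorkspaceSize) );
// Allocate and attach the device scratch buffer
void* devWork{nullptr};
HANDLE_CUDA_ERROR( cudaMalloc(&devWork, deviceWorkspaceSize) );
HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                  workDesc,
                  CUTENSORNET_MEMSPACE_DEVICE,
                  CUTENSORNET_WORKSPACE_SCRATCH,
                  devWork,
                  deviceWorkspaceSize) );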
/**********
* Execution
***********/

GPUTimer timer{stream};
double minTimeCUTENSOR = 1e100;
const int numRuns = 3; // to get stable perf results
for (int i = 0; i < numRuns; ++i)
{
   // Reset the output buffers
   HANDLE_CUDA_ERROR( cudaMemsetAsync(D_U, 0, sizeU, stream) );
   HANDLE_CUDA_ERROR( cudaMemsetAsync(D_S, 0, sizeS, stream) );
   HANDLE_CUDA_ERROR( cudaMemsetAsync(D_V, 0, sizeV, stream) );
   HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );

   // With value-based truncation, `cutensornetTensorSVD` can potentially update the shared extent in descTensorU/V.
   // We here restore descTensorU/V to the original problem.
   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) );
   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) );
   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) );
   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) );

   timer.start();
   HANDLE_ERROR( cutensornetTensorSVD(handle,
                     descTensorIn, D_T,
                     descTensorU, D_U,
                     D_S,
                     descTensorV, D_V,
                     svdConfig,
                     svdInfo,
                     workDesc,
                     stream) );
   // Synchronize and measure timing
   auto time = timer.seconds();
   minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
}

printf("Performing SVD\n");

HANDLE_CUDA_ERROR( cudaMemcpyAsync(U, D_U, sizeU, cudaMemcpyDeviceToHost, stream) );
HANDLE_CUDA_ERROR( cudaMemcpyAsync(S, D_S, sizeS, cudaMemcpyDeviceToHost, stream) );
HANDLE_CUDA_ERROR( cudaMemcpyAsync(V, D_V, sizeV, cudaMemcpyDeviceToHost, stream) );
Note
Since we enabled value-based truncation options in this example, we need to restore the tensor descriptors for U and V if we wish to perform the same computation multiple times.
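At this point, the runtime truncation details recorded in svdInfo can be read back with cutensornetTensorSVDInfoGetAttribute(). A short sketch, assuming we query the full and reduced shared extents and the discarded weight:

/*******************************************************
* Query runtime truncation information (sketch, assumed)
********************************************************/

int64_t fullExtent{0}, reducedExtent{0};
double discardedWeight{0.0};
HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute(handle, svdInfo,
                  CUTENSORNET_TENSOR_SVD_INFO_FULL_EXTENT, &fullExtent, sizeof(fullExtent)) );
HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute(handle, svdInfo,
                  CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT, &reducedExtent, sizeof(reducedExtent)) );
HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute(handle, svdInfo,
                  CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT, &discardedWeight, sizeof(discardedWeight)) );
printf("Reduced shared extent: %ld out of %ld, discarded weight: %.6f\n",
       (long)reducedExtent, (long)fullExtent, discardedWeight);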
After the computation, we still need to free up all resources.
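A minimal sketch of the teardown, assuming the host buffers T, U, S, V were allocated with malloc and the stream with cudaStreamCreate (names follow the snippets above):

/**************************
* Free resources (sketch)
***************************/

// Make sure all work, including the asynchronous copies above, has completed
HANDLE_CUDA_ERROR( cudaStreamSynchronize(stream) );

HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) );
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) );
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) );
HANDLE_ERROR( cutensornetDestroyTensorSVDConfig(svdConfig) );
HANDLE_ERROR( cutensornetDestroyTensorSVDInfo(svdInfo) );
HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
HANDLE_ERROR( cutensornetDestroy(handle) );

free(T); free(U); free(S); free(V);
HANDLE_CUDA_ERROR( cudaFree(D_T) );
HANDLE_CUDA_ERROR( cudaFree(D_U) );
HANDLE_CUDA_ERROR( cudaFree(D_S) );
HANDLE_CUDA_ERROR( cudaFree(D_V) );
HANDLE_CUDA_ERROR( cudaFree(devWork) );
HANDLE_CUDA_ERROR( cudaStreamDestroy(stream) );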