Performing a tensor SVD using cuTensorNet follows a workflow very similar to that of the QR example. Here, we highlight the notable differences between the two APIs. The full code can be found in the NVIDIA/cuQuantum repository.
Define SVD decomposition¶
As with QR decomposition, we first define the SVD decomposition to perform by specifying the data type, the mode partition, and the extents.
/******************************************************
 * Tensor SVD: T_{i,j,m,n} -> U_{i,x,m} S_{x} V_{n,x,j}
 *******************************************************/

typedef float floatType;
cudaDataType_t typeData = CUDA_R_32F;

// Create vector of modes
int32_t sharedMode = 'x';

std::vector<int32_t> modesT{'i','j','m','n'}; // input
std::vector<int32_t> modesU{'i', sharedMode,'m'}; // SVD output
std::vector<int32_t> modesV{'n', sharedMode,'j'}; // SVD output

// Extents
std::unordered_map<int32_t, int64_t> extentMap;
extentMap['i'] = 16;
extentMap['j'] = 16;
extentMap['m'] = 16;
extentMap['n'] = 16;

int64_t rowExtent = computeCombinedExtent(extentMap, modesU);
int64_t colExtent = computeCombinedExtent(extentMap, modesV);
// cuTensorNet tensor SVD operates in reduced mode, expecting k <= min(m, n)
int64_t fullSharedExtent = rowExtent <= colExtent ? rowExtent : colExtent;
const int64_t maxExtent = fullSharedExtent / 2; // fixed-extent truncation: half of the singular values are trimmed out
extentMap[sharedMode] = maxExtent;

// Create a vector of extents for each tensor
std::vector<int64_t> extentT;
for (auto mode : modesT)
   extentT.push_back(extentMap[mode]);
std::vector<int64_t> extentU;
for (auto mode : modesU)
   extentU.push_back(extentMap[mode]);
std::vector<int64_t> extentV;
for (auto mode : modesV)
   extentV.push_back(extentMap[mode]);
Note
To perform fixed-extent truncation, we directly set maxExtent to half of the full shared extent that would correspond to an exact SVD.
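The helper computeCombinedExtent used above is defined in the full sample. Conceptually, it multiplies the extents of the requested modes, skipping any mode not yet present in the map (such as the shared mode 'x' before the truncated extent is chosen). A minimal sketch, assuming that behavior:

// Product of the extents of the given modes; modes absent from the map
// (e.g., the shared mode before extentMap[sharedMode] is assigned) are skipped.
int64_t computeCombinedExtent(const std::unordered_map<int32_t, int64_t>& extentMap,
                              const std::vector<int32_t>& modes)
{
   int64_t combinedExtent{1};
   for (auto mode : modes)
   {
      auto it = extentMap.find(mode);
      if (it != extentMap.end())
         combinedExtent *= it->second;
   }
   return combinedExtent;
}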
Setup SVD truncation parameters¶
Once the SVD decomposition is defined, we can follow the same workflow as in the QR example for data allocation and tensor descriptor initialization, as sketched below.
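That step is not reproduced in full here; the following is a condensed sketch, assuming the buffer and size names (D_T, sizeU, etc.) used by the execution snippet below and the default layout obtained by passing strides = NULL:

// Byte sizes derived from the extents defined above
size_t elementsT = 1, elementsU = 1, elementsV = 1;
for (auto e : extentT) elementsT *= e;
for (auto e : extentU) elementsU *= e;
for (auto e : extentV) elementsV *= e;
size_t sizeT = sizeof(floatType) * elementsT;
size_t sizeU = sizeof(floatType) * elementsU;
size_t sizeS = sizeof(floatType) * extentMap[sharedMode];
size_t sizeV = sizeof(floatType) * elementsV;

// Device buffers for the input tensor and the SVD outputs
void *D_T{nullptr}, *D_U{nullptr}, *D_S{nullptr}, *D_V{nullptr};
HANDLE_CUDA_ERROR( cudaMalloc(&D_T, sizeT) );
HANDLE_CUDA_ERROR( cudaMalloc(&D_U, sizeU) );
HANDLE_CUDA_ERROR( cudaMalloc(&D_S, sizeS) );
HANDLE_CUDA_ERROR( cudaMalloc(&D_V, sizeV) );

// Tensor descriptors; strides == NULL requests the default generalized column-major layout
const int64_t* strides = NULL;
const int32_t numModesIn = modesT.size();
const int32_t numModesU = modesU.size();
const int32_t numModesV = modesV.size();
cutensornetTensorDescriptor_t descTensorIn, descTensorU, descTensorV;
HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) );
HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) );
HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) );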
Before querying the workspace size, we can choose different SVD options via cutensornetTensorSVDConfig_t. Meanwhile, we can create a cutensornetTensorSVDInfo_t object to keep track of runtime truncation information.
/********************************
 * Setup SVD truncation parameters
 *********************************/

cutensornetTensorSVDConfig_t svdConfig;
HANDLE_ERROR( cutensornetCreateTensorSVDConfig(handle, &svdConfig) );

// Absolute cutoff: singular values below this value are trimmed
double absCutoff = 1e-2;
HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle,
                                          svdConfig,
                                          CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF,
                                          &absCutoff,
                                          sizeof(absCutoff)) );
// Relative cutoff: singular values below this fraction of the largest one are trimmed
double relCutoff = 4e-2;
HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle,
                                          svdConfig,
                                          CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF,
                                          &relCutoff,
                                          sizeof(relCutoff)) );

/********************************************************
 * Create SVDInfo to record runtime SVD truncation details
 *********************************************************/

cutensornetTensorSVDInfo_t svdInfo;
HANDLE_ERROR( cutensornetCreateTensorSVDInfo(handle, &svdInfo) );
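Other optional attributes can be set on the same config object in the same manner. As one illustration (not part of this example; a sketch assuming the CUTENSORNET_TENSOR_SVD_CONFIG_S_PARTITION attribute), the singular values can be partitioned onto the output tensors instead of being returned separately:

// Optionally fold the singular values equally into U and V
cutensornetTensorSVDPartition_t partition = CUTENSORNET_TENSOR_SVD_PARTITION_UV_EQUAL;
HANDLE_ERROR( cutensornetTensorSVDConfigSetAttribute(handle,
                                          svdConfig,
                                          CUTENSORNET_TENSOR_SVD_CONFIG_S_PARTITION,
                                          &partition,
                                          sizeof(partition)) );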
Execution¶
Next, we can query and allocate the workspace with cutensornetWorkspaceComputeSVDSizes(), which is very similar to its QR counterpart. At this stage, we can perform the SVD decomposition by calling cutensornetTensorSVD().
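The workspace step itself is elided from this excerpt; a minimal sketch, assuming a scratch buffer allocated with cudaMalloc():

// Create the workspace descriptor and compute the sizes needed for this SVD
cutensornetWorkspaceDescriptor_t workDesc;
HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
HANDLE_ERROR( cutensornetWorkspaceComputeSVDSizes(handle, descTensorIn, descTensorU, descTensorV, svdConfig, workDesc) );

// Query the required device scratch size and attach a buffer of that size
int64_t deviceWorkspaceSize{0};
HANDLE_ERROR( cutensornetWorkspaceGetMemorySize(handle,
                                          workDesc,
                                          CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
                                          CUTENSORNET_MEMSPACE_DEVICE,
                                          CUTENSORNET_WORKSPACE_SCRATCH,
                                          &deviceWorkspaceSize) );
void* devWork{nullptr};
HANDLE_CUDA_ERROR( cudaMalloc(&devWork, deviceWorkspaceSize) );
HANDLE_ERROR( cutensornetWorkspaceSetMemory(handle,
                                          workDesc,
                                          CUTENSORNET_MEMSPACE_DEVICE,
                                          CUTENSORNET_WORKSPACE_SCRATCH,
                                          devWork,
                                          deviceWorkspaceSize) );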
/**********
 * Execution
 ***********/

GPUTimer timer{stream};
double minTimeCUTENSOR = 1e100;
const int numRuns = 3; // to get stable perf results
for (int i = 0; i < numRuns; ++i)
{
   // restore the output buffers
   HANDLE_CUDA_ERROR( cudaMemsetAsync(D_U, 0, sizeU, stream) );
   HANDLE_CUDA_ERROR( cudaMemsetAsync(D_S, 0, sizeS, stream) );
   HANDLE_CUDA_ERROR( cudaMemsetAsync(D_V, 0, sizeV, stream) );
   HANDLE_CUDA_ERROR( cudaDeviceSynchronize() );

   // With value-based truncation, `cutensornetTensorSVD` can potentially update the shared extent in descTensorU/V.
   // We here restore descTensorU/V to the original problem.
   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) );
   HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) );
   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesU, extentU.data(), strides, modesU.data(), typeData, &descTensorU) );
   HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesV, extentV.data(), strides, modesV.data(), typeData, &descTensorV) );

   timer.start();
   HANDLE_ERROR( cutensornetTensorSVD(handle,
                     descTensorIn, D_T,
                     descTensorU, D_U,
                     D_S,
                     descTensorV, D_V,
                     svdConfig,
                     svdInfo,
                     workDesc,
                     stream) );
   // Synchronize and measure timing
   auto time = timer.seconds();
   minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
}

printf("Performing SVD\n");

HANDLE_CUDA_ERROR( cudaMemcpyAsync(U, D_U, sizeU, cudaMemcpyDeviceToHost) );
HANDLE_CUDA_ERROR( cudaMemcpyAsync(S, D_S, sizeS, cudaMemcpyDeviceToHost) );
HANDLE_CUDA_ERROR( cudaMemcpyAsync(V, D_V, sizeV, cudaMemcpyDeviceToHost) );
Note
Since we turned on value-based truncation options in this example, we need to restore the tensor descriptors for U and V if we wish to perform the same computation multiple times.
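After execution, the truncation details recorded in svdInfo can be read back with cutensornetTensorSVDInfoGetAttribute(). A brief sketch:

// Query how much the shared extent was reduced and the discarded weight
int64_t fullExtent, reducedExtent;
double discardedWeight;
HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute(handle, svdInfo,
                  CUTENSORNET_TENSOR_SVD_INFO_FULL_EXTENT, &fullExtent, sizeof(fullExtent)) );
HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute(handle, svdInfo,
                  CUTENSORNET_TENSOR_SVD_INFO_REDUCED_EXTENT, &reducedExtent, sizeof(reducedExtent)) );
HANDLE_ERROR( cutensornetTensorSVDInfoGetAttribute(handle, svdInfo,
                  CUTENSORNET_TENSOR_SVD_INFO_DISCARDED_WEIGHT, &discardedWeight, sizeof(discardedWeight)) );
printf("reduced extent: %ld (full: %ld), discarded weight: %.6f\n",
       (long)reducedExtent, (long)fullExtent, discardedWeight);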
After the computation, we still need to free up all resources.
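A sketch of that cleanup, assuming the objects created above (devWork is the scratch buffer from the workspace sketch):

// Free cuTensorNet objects and device memory
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) );
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorU) );
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorV) );
HANDLE_ERROR( cutensornetDestroyTensorSVDConfig(svdConfig) );
HANDLE_ERROR( cutensornetDestroyTensorSVDInfo(svdInfo) );
HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
HANDLE_ERROR( cutensornetDestroy(handle) );
HANDLE_CUDA_ERROR( cudaFree(D_T) );
HANDLE_CUDA_ERROR( cudaFree(D_U) );
HANDLE_CUDA_ERROR( cudaFree(D_S) );
HANDLE_CUDA_ERROR( cudaFree(D_V) );
HANDLE_CUDA_ERROR( cudaFree(devWork) );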