The following code example illustrates how to use cuTensorNet to perform a tensor QR operation. The full code can be found in the NVIDIA/cuQuantum repository (here).
Define QR decomposition¶
We first define the QR decomposition to perform, specifying the data type, the mode partition, and the extents.
/**********************************************
* Tensor QR: T_{i,j,m,n} -> Q_{i,x,m} R_{n,x,j}
***********************************************/

typedef float floatType;
cudaDataType_t typeData = CUDA_R_32F;

// Create vector of modes
int32_t sharedMode = 'x';

std::vector<int32_t> modesT{'i','j','m','n'}; // input
std::vector<int32_t> modesQ{'i', sharedMode,'m'};
std::vector<int32_t> modesR{'n', sharedMode,'j'}; // QR output

// Extents
std::unordered_map<int32_t, int64_t> extentMap;
extentMap['i'] = 16;
extentMap['j'] = 16;
extentMap['m'] = 16;
extentMap['n'] = 16;

int64_t rowExtent = computeCombinedExtent(extentMap, modesQ);
int64_t colExtent = computeCombinedExtent(extentMap, modesR);

// cuTensorNet tensor QR operates in reduced mode expecting k = min(m, n)
extentMap[sharedMode] = rowExtent <= colExtent ? rowExtent : colExtent;

// Create a vector of extents for each tensor
std::vector<int64_t> extentT;
for (auto mode : modesT)
   extentT.push_back(extentMap[mode]);
std::vector<int64_t> extentQ;
for (auto mode : modesQ)
   extentQ.push_back(extentMap[mode]);
std::vector<int64_t> extentR;
for (auto mode : modesR)
   extentR.push_back(extentMap[mode]);
Note
cutensornetTensorQR() operates in reduced mode, expecting the shared extent of the output to be the minimum of the combined row extent and the combined column extent.
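The helper computeCombinedExtent used above is defined in the full sample in the repository; conceptually it multiplies together the extents of the given modes, skipping any mode (such as the shared mode, which has not been sized yet) that is absent from the map. A minimal sketch of such a helper could look like this:

int64_t computeCombinedExtent(const std::unordered_map<int32_t, int64_t>& extentMap,
                              const std::vector<int32_t>& modes)
{
   int64_t combinedExtent{1};
   for (auto mode : modes)
   {
      // Multiply in the extent of every mode whose extent is already known;
      // the shared mode 'x' is skipped here because it has not been sized yet.
      auto it = extentMap.find(mode);
      if (it != extentMap.end())
         combinedExtent *= it->second;
   }
   return combinedExtent;
}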
Allocate memory and initialize data¶
Next, we allocate memory for the input and output tensor operands. The input operand is initialized to random values.
/***********************************
* Allocating data on host and device
************************************/

size_t elementsT = 1;
for (auto mode : modesT)
   elementsT *= extentMap[mode];
size_t elementsQ = 1;
for (auto mode : modesQ)
   elementsQ *= extentMap[mode];
size_t elementsR = 1;
for (auto mode : modesR)
   elementsR *= extentMap[mode];

size_t sizeT = sizeof(floatType) * elementsT;
size_t sizeQ = sizeof(floatType) * elementsQ;
size_t sizeR = sizeof(floatType) * elementsR;

printf("Total memory: %.2f GiB\n", (sizeT + sizeQ + sizeR)/1024./1024./1024);

floatType *T = (floatType*) malloc(sizeT);
floatType *Q = (floatType*) malloc(sizeQ);
floatType *R = (floatType*) malloc(sizeR);

if (T == NULL || Q == NULL || R == NULL)
{
   printf("Error: Host allocation of input T or output Q/R.\n");
   return -1;
}

void* D_T;
void* D_Q;
void* D_R;

HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_T, sizeT) );
HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_Q, sizeQ) );
HANDLE_CUDA_ERROR( cudaMalloc((void**) &D_R, sizeR) );

/****************
* Initialize data
*****************/

for (uint64_t i = 0; i < elementsT; i++)
   T[i] = ((floatType) rand())/RAND_MAX;

HANDLE_CUDA_ERROR( cudaMemcpy(D_T, T, sizeT, cudaMemcpyHostToDevice) );
printf("Allocate memory for data, and initialize data.\n");
Initialize cuTensorNet and create tensor descriptors¶
Then we initialize the cuTensorNet library handle and create a cutensornetTensorDescriptor_t
for each of the input and output tensors.
/******************
* cuTensorNet
*******************/

cudaStream_t stream;
HANDLE_CUDA_ERROR( cudaStreamCreate(&stream) );

cutensornetHandle_t handle;
HANDLE_ERROR( cutensornetCreate(&handle) );

/***************************
* Create tensor descriptors
****************************/

cutensornetTensorDescriptor_t descTensorIn;
cutensornetTensorDescriptor_t descTensorQ;
cutensornetTensorDescriptor_t descTensorR;

const int32_t numModesIn = modesT.size();
const int32_t numModesQ = modesQ.size();
const int32_t numModesR = modesR.size();

const int64_t* strides = NULL; // assuming fortran layout for all tensors

HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesIn, extentT.data(), strides, modesT.data(), typeData, &descTensorIn) );
HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesQ, extentQ.data(), strides, modesQ.data(), typeData, &descTensorQ) );
HANDLE_ERROR( cutensornetCreateTensorDescriptor(handle, numModesR, extentR.data(), strides, modesR.data(), typeData, &descTensorR) );

printf("Initialize the cuTensorNet library and create all tensor descriptors.\n");
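Passing strides = NULL tells the library to assume the default Fortran (column-major) layout for each tensor. If explicit strides were needed instead, a column-major stride vector equivalent to that default could be built as in the following sketch (a hypothetical helper, not part of the example):

std::vector<int64_t> columnMajorStrides(const std::vector<int64_t>& extents)
{
   std::vector<int64_t> strides(extents.size());
   int64_t stride = 1;
   for (size_t i = 0; i < extents.size(); ++i)
   {
      strides[i] = stride;  // the stride of each mode is the product of all preceding extents
      stride *= extents[i];
   }
   return strides;
}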
Query and allocate required workspace¶
Once all tensor descriptors are created, we can query the required workspace size with cutensornetWorkspaceComputeQRSizes().
/********************************************
* Query and allocate required workspace sizes
*********************************************/

cutensornetWorkspaceDescriptor_t workDesc;
HANDLE_ERROR( cutensornetCreateWorkspaceDescriptor(handle, &workDesc) );
HANDLE_ERROR( cutensornetWorkspaceComputeQRSizes(handle, descTensorIn, descTensorQ, descTensorR, workDesc) );
uint64_t hostWorkspaceSize, deviceWorkspaceSize;

// for tensor QR, it does not matter which cutensornetWorksizePref_t we pick
HANDLE_ERROR( cutensornetWorkspaceGetSize(handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, CUTENSORNET_MEMSPACE_DEVICE, &deviceWorkspaceSize) );
HANDLE_ERROR( cutensornetWorkspaceGetSize(handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED, CUTENSORNET_MEMSPACE_HOST, &hostWorkspaceSize) );

void *devWork = nullptr, *hostWork = nullptr;
if (deviceWorkspaceSize > 0) {
   HANDLE_CUDA_ERROR( cudaMalloc(&devWork, deviceWorkspaceSize) );
}
if (hostWorkspaceSize > 0) {
   hostWork = malloc(hostWorkspaceSize);
}
HANDLE_ERROR( cutensornetWorkspaceSet(handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE, devWork, deviceWorkspaceSize) );
HANDLE_ERROR( cutensornetWorkspaceSet(handle, workDesc, CUTENSORNET_MEMSPACE_HOST, hostWork, hostWorkspaceSize) );
Execution¶
At this stage, we can perform the QR decomposition by calling cutensornetTensorQR().
/**********
* Execution
***********/

GPUTimer timer{stream};
double minTimeCUTENSOR = 1e100;
const int numRuns = 3; // to get stable perf results
for (int i = 0; i < numRuns; ++i)
{
   // restore output
   cudaMemsetAsync(D_Q, 0, sizeQ, stream);
   cudaMemsetAsync(D_R, 0, sizeR, stream);
   cudaDeviceSynchronize();

   timer.start();
   HANDLE_ERROR( cutensornetTensorQR(handle,
                     descTensorIn, D_T,
                     descTensorQ, D_Q,
                     descTensorR, D_R,
                     workDesc,
                     stream) );
   // Synchronize and measure timing
   auto time = timer.seconds();
   minTimeCUTENSOR = (minTimeCUTENSOR < time) ? minTimeCUTENSOR : time;
}

printf("Performing QR\n");

HANDLE_CUDA_ERROR( cudaMemcpyAsync(Q, D_Q, sizeQ, cudaMemcpyDeviceToHost) );
HANDLE_CUDA_ERROR( cudaMemcpyAsync(R, D_R, sizeR, cudaMemcpyDeviceToHost) );

cudaDeviceSynchronize(); // device synchronization.
printf("%.2f ms\n", minTimeCUTENSOR * 1000.f);
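As an optional sanity check (not part of the example), the host copies of Q and R can be contracted over the shared mode and compared against the input T. The sketch below assumes the default column-major layout used throughout, so T is addressed as T_{i,j,m,n}, Q as Q_{i,x,m}, and R as R_{n,x,j}:

// Reconstruct T_{i,j,m,n} = sum_x Q_{i,x,m} * R_{n,x,j} on the host and
// track the largest elementwise deviation from the original input tensor.
int64_t ei = extentMap['i'], ej = extentMap['j'];
int64_t em = extentMap['m'], en = extentMap['n'];
int64_t ex = extentMap[sharedMode];
double maxError = 0.0;
for (int64_t n = 0; n < en; ++n)
 for (int64_t m = 0; m < em; ++m)
  for (int64_t j = 0; j < ej; ++j)
   for (int64_t i = 0; i < ei; ++i)
   {
      double acc = 0.0;
      for (int64_t x = 0; x < ex; ++x)
         acc += Q[i + ei * (x + ex * m)] * R[n + en * (x + ex * j)];
      double diff = acc - T[i + ei * (j + ej * (m + em * n))];
      if (diff < 0.0) diff = -diff;
      if (diff > maxError) maxError = diff;
   }
printf("Max elementwise |Q*R - T|: %e\n", maxError);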
Free resources¶
After the computation, we need to free up all resources.
/***************
* Free resources
****************/

HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorIn) );
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorQ) );
HANDLE_ERROR( cutensornetDestroyTensorDescriptor(descTensorR) );
HANDLE_ERROR( cutensornetDestroyWorkspaceDescriptor(workDesc) );
HANDLE_ERROR( cutensornetDestroy(handle) );

if (T) free(T);
if (Q) free(Q);
if (R) free(R);
if (D_T) cudaFree(D_T);
if (D_Q) cudaFree(D_Q);
if (D_R) cudaFree(D_R);
if (devWork) cudaFree(devWork);
if (hostWork) free(hostWork);

printf("Free resource and exit.\n");

return 0;
}