.. role:: cpp(code)
   :language: cpp
   :class: highlight

.. _getting-started-label:

Getting Started
===============

In this section, we show how to implement a first tensor contraction using cuTENSOR. Our code will compute the following operation using single-precision arithmetic.

.. math::

    C_{m,u,n,v} = \alpha A_{m,h,k,n} B_{u,k,v,h} + \beta C_{m,u,n,v}

We build the code up step by step, each step adding code at the end. The steps are separated by comments consisting of multiple stars.

Installation and Compilation
----------------------------

Assuming cuTENSOR has been downloaded to *CUTENSOR_DOWNLOAD_FILE*, we extract it, remember cuTENSOR's root location, and update the library path accordingly.

.. code-block::

    tar xf ${CUTENSOR_DOWNLOAD_FILE}
    export CUTENSOR_ROOT=${PWD}/libcutensor
    export LD_LIBRARY_PATH=${CUTENSOR_ROOT}/lib/10.1/:${LD_LIBRARY_PATH}

If we store the following code in a file called *contraction.cu*, we can compile it via the following command:

.. code-block::

    nvcc contraction.cu -L${CUTENSOR_ROOT}/lib/10.1/ -I${CUTENSOR_ROOT}/include -std=c++11 -lcutensor -o contraction

When compiling intermediate steps of this example, the compiler might warn about unused variables; this is because the example is not yet complete. The final step should compile without warnings.
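Before diving into the full example, it can help to verify that the include and library paths are set up correctly. The following minimal sanity check is not part of the step-by-step example; it only assumes that :code:`cutensorGetVersion()` is available in your cuTENSOR release. Compile it with the same :code:`nvcc` command as above, substituting the file name.

.. code-block:: cpp

    // version.cu -- minimal check that the cuTENSOR header and library are found.
    // Not part of the step-by-step example below.
    #include <stdio.h>
    #include <cutensor.h>

    int main()
    {
        // Returns the version of the cuTENSOR library loaded at runtime.
        printf("cuTENSOR version: %zu\n", cutensorGetVersion());
        return 0;
    }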
Headers and Data Types
----------------------

To start off, we begin with a trivial :code:`main()` function, include some headers, and define some data types.

.. code-block:: cpp

    #include <stdlib.h>
    #include <stdio.h>

    #include <cuda_runtime.h>
    #include <cutensor.h>

    int main(int argc, char** argv)
    {
        // Host element type definition
        typedef float floatTypeA;
        typedef float floatTypeB;
        typedef float floatTypeC;
        typedef float floatTypeCompute;

        // CUDA types
        cudaDataType_t typeA = CUDA_R_32F;
        cudaDataType_t typeB = CUDA_R_32F;
        cudaDataType_t typeC = CUDA_R_32F;
        cutensorComputeType_t typeCompute = CUTENSOR_R_MIN_32F;

        floatTypeCompute alpha = (floatTypeCompute)1.1f;
        floatTypeCompute beta  = (floatTypeCompute)0.9f;

        printf("Include headers and define data types\n");

        return 0;
    }

Define Tensor Sizes
-------------------

Next, we define the modes and extents of our tensors. For the sake of the example, let us say that the modes :math:`m`, :math:`n` and :math:`u` have an extent of 96, and that :math:`v`, :math:`h` and :math:`k` have an extent of 64. Note that modes are labeled by integers; since characters are integral values, this conveniently allows us to label the modes using characters. See :ref:`nomenclature-label` for an explanation of the terms mode and extent.

.. code-block:: cpp

    #include <stdlib.h>
    #include <stdio.h>

    #include <unordered_map>
    #include <vector>

    #include <cuda_runtime.h>
    #include <cutensor.h>

    int main(int argc, char** argv)
    {
        // Host element type definition
        typedef float floatTypeA;
        typedef float floatTypeB;
        typedef float floatTypeC;
        typedef float floatTypeCompute;

        // CUDA types
        cudaDataType_t typeA = CUDA_R_32F;
        cudaDataType_t typeB = CUDA_R_32F;
        cudaDataType_t typeC = CUDA_R_32F;
        cutensorComputeType_t typeCompute = CUTENSOR_R_MIN_32F;

        floatTypeCompute alpha = (floatTypeCompute)1.1f;
        floatTypeCompute beta  = (floatTypeCompute)0.9f;

        printf("Include headers and define data types\n");

        /* ***************************** */

        // Create vector of modes
        std::vector<int> modeC{'m','u','n','v'};
        std::vector<int> modeA{'m','h','k','n'};
        std::vector<int> modeB{'u','k','v','h'};
        int nmodeA = modeA.size();
        int nmodeB = modeB.size();
        int nmodeC = modeC.size();

        // Extents
        std::unordered_map<int, int64_t> extent;
        extent['m'] = 96;
        extent['n'] = 96;
        extent['u'] = 96;
        extent['v'] = 64;
        extent['h'] = 64;
        extent['k'] = 64;

        // Create a vector of extents for each tensor
        std::vector<int64_t> extentC;
        for(auto mode : modeC)
            extentC.push_back(extent[mode]);
        std::vector<int64_t> extentA;
        for(auto mode : modeA)
            extentA.push_back(extent[mode]);
        std::vector<int64_t> extentB;
        for(auto mode : modeB)
            extentB.push_back(extent[mode]);

        printf("Define modes and extents\n");

        return 0;
    }
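As an aside, the modes and extents fully determine the arithmetic cost of the contraction: one multiplication and one addition are performed for every combination of the six modes, i.e., roughly :math:`2 \cdot 96^3 \cdot 64^3 \approx 4.6 \cdot 10^{11}` floating-point operations. If desired, this can be computed at runtime; a small sketch, not part of the example:

.. code-block:: cpp

    // Rough arithmetic cost of the contraction: every mode -- free and
    // contracted alike -- contributes a factor of its extent, and each
    // resulting index tuple costs one multiply and one add.
    double flops = 2.0;
    for(const auto &modeAndExtent : extent)
        flops *= modeAndExtent.second;
    printf("This contraction performs roughly %.2e flops\n", flops);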
Initialize Tensor Data
----------------------

Next, we need to allocate and initialize host and device memory for our tensors:

.. code-block:: cpp

    #include <stdlib.h>
    #include <stdio.h>

    #include <unordered_map>
    #include <vector>

    #include <cuda_runtime.h>
    #include <cutensor.h>

    int main(int argc, char** argv)
    {
        // Host element type definition
        typedef float floatTypeA;
        typedef float floatTypeB;
        typedef float floatTypeC;
        typedef float floatTypeCompute;

        // CUDA types
        cudaDataType_t typeA = CUDA_R_32F;
        cudaDataType_t typeB = CUDA_R_32F;
        cudaDataType_t typeC = CUDA_R_32F;
        cutensorComputeType_t typeCompute = CUTENSOR_R_MIN_32F;

        floatTypeCompute alpha = (floatTypeCompute)1.1f;
        floatTypeCompute beta  = (floatTypeCompute)0.9f;

        printf("Include headers and define data types\n");

        /* ***************************** */

        // Create vector of modes
        std::vector<int> modeC{'m','u','n','v'};
        std::vector<int> modeA{'m','h','k','n'};
        std::vector<int> modeB{'u','k','v','h'};
        int nmodeA = modeA.size();
        int nmodeB = modeB.size();
        int nmodeC = modeC.size();

        // Extents
        std::unordered_map<int, int64_t> extent;
        extent['m'] = 96;
        extent['n'] = 96;
        extent['u'] = 96;
        extent['v'] = 64;
        extent['h'] = 64;
        extent['k'] = 64;

        // Create a vector of extents for each tensor
        std::vector<int64_t> extentC;
        for(auto mode : modeC)
            extentC.push_back(extent[mode]);
        std::vector<int64_t> extentA;
        for(auto mode : modeA)
            extentA.push_back(extent[mode]);
        std::vector<int64_t> extentB;
        for(auto mode : modeB)
            extentB.push_back(extent[mode]);

        printf("Define modes and extents\n");

        /* ***************************** */

        // Number of elements of each tensor
        size_t elementsA = 1;
        for(auto mode : modeA)
            elementsA *= extent[mode];
        size_t elementsB = 1;
        for(auto mode : modeB)
            elementsB *= extent[mode];
        size_t elementsC = 1;
        for(auto mode : modeC)
            elementsC *= extent[mode];

        // Size in bytes
        size_t sizeA = sizeof(floatTypeA) * elementsA;
        size_t sizeB = sizeof(floatTypeB) * elementsB;
        size_t sizeC = sizeof(floatTypeC) * elementsC;

        // Allocate on device
        void *A_d, *B_d, *C_d;
        cudaMalloc((void**)&A_d, sizeA);
        cudaMalloc((void**)&B_d, sizeB);
        cudaMalloc((void**)&C_d, sizeC);

        // Allocate on host
        floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
        floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
        floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);

        // Initialize data on host
        for(int64_t i = 0; i < elementsA; i++)
            A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
        for(int64_t i = 0; i < elementsB; i++)
            B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
        for(int64_t i = 0; i < elementsC; i++)
            C[i] = (((float) rand())/RAND_MAX - 0.5)*100;

        // Copy to device
        cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice);
        cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice);
        cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice);

        printf("Allocate, initialize and transfer tensors\n");

        return 0;
    }
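For brevity, the example does not check the return codes of the CUDA runtime calls above. In production code you would want to handle them, for instance with a macro analogous to the :code:`HANDLE_ERROR` macro introduced in the next step. A sketch; the name :code:`HANDLE_CUDA_ERROR` is our own and not part of cuTENSOR:

.. code-block:: cpp

    // Sketch of a CUDA runtime error check, mirroring the cuTENSOR error
    // macro used below; wrap cudaMalloc/cudaMemcpy calls with it.
    #define HANDLE_CUDA_ERROR(x)                                              \
    {                                                                         \
        const cudaError_t cudaErr = x;                                        \
        if( cudaErr != cudaSuccess )                                          \
        {                                                                     \
            printf("CUDA error: %s in line %d\n",                            \
                   cudaGetErrorString(cudaErr), __LINE__);                    \
            exit(-1);                                                         \
        }                                                                     \
    }

    // Example usage:
    // HANDLE_CUDA_ERROR( cudaMalloc((void**)&A_d, sizeA) );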
Create Tensor Descriptors
-------------------------

We are now ready to use the cuTENSOR library: we add a macro to handle errors, and we initialize the library by creating a cuTENSOR handle. Then, we create a descriptor for each tensor by providing its order (number of modes), extents, strides, data type, and element-wise operation. The latter sets an element-wise operation that is applied to that tensor when it is used during computation. In this case, the operator is the identity; see :ref:`cutensorOperator-label` for other possibilities.

.. code-block:: cpp

    #include <stdlib.h>
    #include <stdio.h>

    #include <unordered_map>
    #include <vector>

    #include <cuda_runtime.h>
    #include <cutensor.h>

    // Handle cuTENSOR errors
    #define HANDLE_ERROR(x)                                                   \
    {                                                                         \
        const auto err = x;                                                   \
        if( err != CUTENSOR_STATUS_SUCCESS )                                  \
        {                                                                     \
            printf("Error: %s in line %d\n",                                  \
                   cutensorGetErrorString(err), __LINE__);                    \
            exit(-1);                                                         \
        }                                                                     \
    }

    int main(int argc, char** argv)
    {
        // Host element type definition
        typedef float floatTypeA;
        typedef float floatTypeB;
        typedef float floatTypeC;
        typedef float floatTypeCompute;

        // CUDA types
        cudaDataType_t typeA = CUDA_R_32F;
        cudaDataType_t typeB = CUDA_R_32F;
        cudaDataType_t typeC = CUDA_R_32F;
        cutensorComputeType_t typeCompute = CUTENSOR_R_MIN_32F;

        floatTypeCompute alpha = (floatTypeCompute)1.1f;
        floatTypeCompute beta  = (floatTypeCompute)0.9f;

        printf("Include headers and define data types\n");

        /* ***************************** */

        // Create vector of modes
        std::vector<int> modeC{'m','u','n','v'};
        std::vector<int> modeA{'m','h','k','n'};
        std::vector<int> modeB{'u','k','v','h'};
        int nmodeA = modeA.size();
        int nmodeB = modeB.size();
        int nmodeC = modeC.size();

        // Extents
        std::unordered_map<int, int64_t> extent;
        extent['m'] = 96;
        extent['n'] = 96;
        extent['u'] = 96;
        extent['v'] = 64;
        extent['h'] = 64;
        extent['k'] = 64;

        // Create a vector of extents for each tensor
        std::vector<int64_t> extentC;
        for(auto mode : modeC)
            extentC.push_back(extent[mode]);
        std::vector<int64_t> extentA;
        for(auto mode : modeA)
            extentA.push_back(extent[mode]);
        std::vector<int64_t> extentB;
        for(auto mode : modeB)
            extentB.push_back(extent[mode]);

        printf("Define modes and extents\n");

        /* ***************************** */

        // Number of elements of each tensor
        size_t elementsA = 1;
        for(auto mode : modeA)
            elementsA *= extent[mode];
        size_t elementsB = 1;
        for(auto mode : modeB)
            elementsB *= extent[mode];
        size_t elementsC = 1;
        for(auto mode : modeC)
            elementsC *= extent[mode];

        // Size in bytes
        size_t sizeA = sizeof(floatTypeA) * elementsA;
        size_t sizeB = sizeof(floatTypeB) * elementsB;
        size_t sizeC = sizeof(floatTypeC) * elementsC;

        // Allocate on device
        void *A_d, *B_d, *C_d;
        cudaMalloc((void**)&A_d, sizeA);
        cudaMalloc((void**)&B_d, sizeB);
        cudaMalloc((void**)&C_d, sizeC);

        // Allocate on host
        floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
        floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
        floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);

        // Initialize data on host
        for(int64_t i = 0; i < elementsA; i++)
            A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
        for(int64_t i = 0; i < elementsB; i++)
            B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
        for(int64_t i = 0; i < elementsC; i++)
            C[i] = (((float) rand())/RAND_MAX - 0.5)*100;

        // Copy to device
        cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice);
        cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice);
        cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice);

        printf("Allocate, initialize and transfer tensors\n");

        /* ***************************** */

        // Initialize cuTENSOR library
        cutensorHandle_t handle;
        cutensorInit(&handle);

        // Create Tensor Descriptors
        cutensorTensorDescriptor_t descA;
        HANDLE_ERROR( cutensorInitTensorDescriptor( &handle,
                      &descA,
                      nmodeA,
                      extentA.data(),
                      NULL,/*stride*/
                      typeA, CUTENSOR_OP_IDENTITY ) );

        cutensorTensorDescriptor_t descB;
        HANDLE_ERROR( cutensorInitTensorDescriptor( &handle,
                      &descB,
                      nmodeB,
                      extentB.data(),
                      NULL,/*stride*/
                      typeB, CUTENSOR_OP_IDENTITY ) );

        cutensorTensorDescriptor_t descC;
        HANDLE_ERROR( cutensorInitTensorDescriptor( &handle,
                      &descC,
                      nmodeC,
                      extentC.data(),
                      NULL,/*stride*/
                      typeC, CUTENSOR_OP_IDENTITY ) );

        printf("Initialize cuTENSOR and tensor descriptors\n");

        return 0;
    }
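Passing :code:`NULL` for the strides tells cuTENSOR to assume a dense, packed layout in which the first mode is the fastest-varying one (a generalized column-major layout). The following sketch, which is not part of the example, shows the strides this corresponds to for tensor :math:`A`; passing :code:`strideA.data()` instead of :code:`NULL` would describe the identical layout explicitly:

.. code-block:: cpp

    // Strides implied by NULL for tensor A: packed, generalized column-major,
    // i.e., stride 1 for the first mode, and each subsequent mode strides
    // over all modes before it.
    std::vector<int64_t> strideA(nmodeA);
    int64_t currentStride = 1;
    for(int i = 0; i < nmodeA; ++i)
    {
        strideA[i] = currentStride;
        currentStride *= extentA[i];
    }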
Create Contraction Descriptor
-----------------------------

In the next step, we query the library for the best alignment requirement that each data pointer allows for, and we create the descriptor for the contraction problem:

.. code-block:: cpp

    #include <stdlib.h>
    #include <stdio.h>

    #include <unordered_map>
    #include <vector>

    #include <cuda_runtime.h>
    #include <cutensor.h>

    // Handle cuTENSOR errors
    #define HANDLE_ERROR(x)                                                   \
    {                                                                         \
        const auto err = x;                                                   \
        if( err != CUTENSOR_STATUS_SUCCESS )                                  \
        {                                                                     \
            printf("Error: %s in line %d\n",                                  \
                   cutensorGetErrorString(err), __LINE__);                    \
            exit(-1);                                                         \
        }                                                                     \
    }

    int main(int argc, char** argv)
    {
        // Host element type definition
        typedef float floatTypeA;
        typedef float floatTypeB;
        typedef float floatTypeC;
        typedef float floatTypeCompute;

        // CUDA types
        cudaDataType_t typeA = CUDA_R_32F;
        cudaDataType_t typeB = CUDA_R_32F;
        cudaDataType_t typeC = CUDA_R_32F;
        cutensorComputeType_t typeCompute = CUTENSOR_R_MIN_32F;

        floatTypeCompute alpha = (floatTypeCompute)1.1f;
        floatTypeCompute beta  = (floatTypeCompute)0.9f;

        printf("Include headers and define data types\n");

        /* ***************************** */

        // Create vector of modes
        std::vector<int> modeC{'m','u','n','v'};
        std::vector<int> modeA{'m','h','k','n'};
        std::vector<int> modeB{'u','k','v','h'};
        int nmodeA = modeA.size();
        int nmodeB = modeB.size();
        int nmodeC = modeC.size();

        // Extents
        std::unordered_map<int, int64_t> extent;
        extent['m'] = 96;
        extent['n'] = 96;
        extent['u'] = 96;
        extent['v'] = 64;
        extent['h'] = 64;
        extent['k'] = 64;

        // Create a vector of extents for each tensor
        std::vector<int64_t> extentC;
        for(auto mode : modeC)
            extentC.push_back(extent[mode]);
        std::vector<int64_t> extentA;
        for(auto mode : modeA)
            extentA.push_back(extent[mode]);
        std::vector<int64_t> extentB;
        for(auto mode : modeB)
            extentB.push_back(extent[mode]);

        printf("Define modes and extents\n");

        /* ***************************** */

        // Number of elements of each tensor
        size_t elementsA = 1;
        for(auto mode : modeA)
            elementsA *= extent[mode];
        size_t elementsB = 1;
        for(auto mode : modeB)
            elementsB *= extent[mode];
        size_t elementsC = 1;
        for(auto mode : modeC)
            elementsC *= extent[mode];

        // Size in bytes
        size_t sizeA = sizeof(floatTypeA) * elementsA;
        size_t sizeB = sizeof(floatTypeB) * elementsB;
        size_t sizeC = sizeof(floatTypeC) * elementsC;

        // Allocate on device
        void *A_d, *B_d, *C_d;
        cudaMalloc((void**)&A_d, sizeA);
        cudaMalloc((void**)&B_d, sizeB);
        cudaMalloc((void**)&C_d, sizeC);

        // Allocate on host
        floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
        floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
        floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);

        // Initialize data on host
        for(int64_t i = 0; i < elementsA; i++)
            A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
        for(int64_t i = 0; i < elementsB; i++)
            B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
        for(int64_t i = 0; i < elementsC; i++)
            C[i] = (((float) rand())/RAND_MAX - 0.5)*100;

        // Copy to device
        cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice);
        cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice);
        cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice);

        printf("Allocate, initialize and transfer tensors\n");

        /* ***************************** */

        // Initialize cuTENSOR library
        cutensorHandle_t handle;
        cutensorInit(&handle);

        // Create Tensor Descriptors
        cutensorTensorDescriptor_t descA;
        HANDLE_ERROR( cutensorInitTensorDescriptor( &handle,
                      &descA,
                      nmodeA,
                      extentA.data(),
                      NULL,/*stride*/
                      typeA, CUTENSOR_OP_IDENTITY ) );

        cutensorTensorDescriptor_t descB;
        HANDLE_ERROR( cutensorInitTensorDescriptor( &handle,
                      &descB,
                      nmodeB,
                      extentB.data(),
                      NULL,/*stride*/
                      typeB, CUTENSOR_OP_IDENTITY ) );

        cutensorTensorDescriptor_t descC;
        HANDLE_ERROR( cutensorInitTensorDescriptor( &handle,
                      &descC,
                      nmodeC,
                      extentC.data(),
                      NULL,/*stride*/
                      typeC, CUTENSOR_OP_IDENTITY ) );

        printf("Initialize cuTENSOR and tensor descriptors\n");

        /* ***************************** */

        // Retrieve the memory alignment for each tensor
        uint32_t alignmentRequirementA;
        HANDLE_ERROR( cutensorGetAlignmentRequirement( &handle,
                      A_d, &descA, &alignmentRequirementA) );

        uint32_t alignmentRequirementB;
        HANDLE_ERROR( cutensorGetAlignmentRequirement( &handle,
                      B_d, &descB, &alignmentRequirementB) );

        uint32_t alignmentRequirementC;
        HANDLE_ERROR( cutensorGetAlignmentRequirement( &handle,
                      C_d, &descC, &alignmentRequirementC) );

        printf("Query best alignment requirement for our pointers\n");

        /* ***************************** */

        // Create the Contraction Descriptor
        cutensorContractionDescriptor_t desc;
        HANDLE_ERROR( cutensorInitContractionDescriptor( &handle,
                      &desc,
                      &descA, modeA.data(), alignmentRequirementA,
                      &descB, modeB.data(), alignmentRequirementB,
                      &descC, modeC.data(), alignmentRequirementC,
                      &descC, modeC.data(), alignmentRequirementC,
                      typeCompute) );

        printf("Initialize contraction descriptor\n");

        return 0;
    }
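Since :code:`cudaMalloc` returns generously aligned pointers, the requirements queried here are typically the largest the library supports. The query becomes important when a tensor does not start at the beginning of an allocation. A sketch, not part of the example, illustrating how an offset pointer can reduce the usable alignment:

.. code-block:: cpp

    // Illustration only: the alignment requirement depends on the actual
    // pointer value, so an offset into an allocation may support a smaller
    // alignment (and thus fewer candidate kernels) than the allocation itself.
    uint32_t alignmentOffsetA;
    HANDLE_ERROR( cutensorGetAlignmentRequirement( &handle,
                  (void*)((char*)A_d + sizeof(floatTypeA)), &descA,
                  &alignmentOffsetA) );
    printf("Alignment at A_d: %u, at A_d + one element: %u\n",
           alignmentRequirementA, alignmentOffsetA);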
Determine Algorithm and Workspace
---------------------------------

Now that we have defined both the tensors and the contraction that we want to perform, we must select an algorithm to perform the contraction. That algorithm is specified by :ref:`cutensorAlgo-label`. Specifying :code:`CUTENSOR_ALGO_DEFAULT` lets cuTENSOR's internal heuristic choose the best approach. All the information needed to find a good algorithm is stored in the :ref:`cutensorFind-label` data structure. We also query the library for the amount of workspace needed to have the most options when selecting an algorithm; while workspace memory is not mandatory, some algorithms require it.

.. code-block:: cpp

    #include <stdlib.h>
    #include <stdio.h>

    #include <unordered_map>
    #include <vector>

    #include <cuda_runtime.h>
    #include <cutensor.h>

    // Handle cuTENSOR errors
    #define HANDLE_ERROR(x)                                                   \
    {                                                                         \
        const auto err = x;                                                   \
        if( err != CUTENSOR_STATUS_SUCCESS )                                  \
        {                                                                     \
            printf("Error: %s in line %d\n",                                  \
                   cutensorGetErrorString(err), __LINE__);                    \
            exit(-1);                                                         \
        }                                                                     \
    }

    int main(int argc, char** argv)
    {
        // Host element type definition
        typedef float floatTypeA;
        typedef float floatTypeB;
        typedef float floatTypeC;
        typedef float floatTypeCompute;

        // CUDA types
        cudaDataType_t typeA = CUDA_R_32F;
        cudaDataType_t typeB = CUDA_R_32F;
        cudaDataType_t typeC = CUDA_R_32F;
        cutensorComputeType_t typeCompute = CUTENSOR_R_MIN_32F;

        floatTypeCompute alpha = (floatTypeCompute)1.1f;
        floatTypeCompute beta  = (floatTypeCompute)0.9f;

        printf("Include headers and define data types\n");

        /* ***************************** */

        // Create vector of modes
        std::vector<int> modeC{'m','u','n','v'};
        std::vector<int> modeA{'m','h','k','n'};
        std::vector<int> modeB{'u','k','v','h'};
        int nmodeA = modeA.size();
        int nmodeB = modeB.size();
        int nmodeC = modeC.size();

        // Extents
        std::unordered_map<int, int64_t> extent;
        extent['m'] = 96;
        extent['n'] = 96;
        extent['u'] = 96;
        extent['v'] = 64;
        extent['h'] = 64;
        extent['k'] = 64;

        // Create a vector of extents for each tensor
        std::vector<int64_t> extentC;
        for(auto mode : modeC)
            extentC.push_back(extent[mode]);
        std::vector<int64_t> extentA;
        for(auto mode : modeA)
            extentA.push_back(extent[mode]);
        std::vector<int64_t> extentB;
        for(auto mode : modeB)
            extentB.push_back(extent[mode]);

        printf("Define modes and extents\n");

        /* ***************************** */

        // Number of elements of each tensor
        size_t elementsA = 1;
        for(auto mode : modeA)
            elementsA *= extent[mode];
        size_t elementsB = 1;
        for(auto mode : modeB)
            elementsB *= extent[mode];
        size_t elementsC = 1;
        for(auto mode : modeC)
            elementsC *= extent[mode];

        // Size in bytes
        size_t sizeA = sizeof(floatTypeA) * elementsA;
        size_t sizeB = sizeof(floatTypeB) * elementsB;
        size_t sizeC = sizeof(floatTypeC) * elementsC;

        // Allocate on device
        void *A_d, *B_d, *C_d;
        cudaMalloc((void**)&A_d, sizeA);
        cudaMalloc((void**)&B_d, sizeB);
        cudaMalloc((void**)&C_d, sizeC);

        // Allocate on host
        floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
        floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
        floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);

        // Initialize data on host
        for(int64_t i = 0; i < elementsA; i++)
            A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
        for(int64_t i = 0; i < elementsB; i++)
            B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
        for(int64_t i = 0; i < elementsC; i++)
            C[i] = (((float) rand())/RAND_MAX - 0.5)*100;

        // Copy to device
        cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice);
        cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice);
        cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice);

        printf("Allocate, initialize and transfer tensors\n");

        /* ***************************** */

        // Initialize cuTENSOR library
        cutensorHandle_t handle;
        cutensorInit(&handle);

        // Create Tensor Descriptors
        cutensorTensorDescriptor_t descA;
        HANDLE_ERROR( cutensorInitTensorDescriptor( &handle,
                      &descA,
                      nmodeA,
                      extentA.data(),
                      NULL,/*stride*/
                      typeA, CUTENSOR_OP_IDENTITY ) );

        cutensorTensorDescriptor_t descB;
        HANDLE_ERROR( cutensorInitTensorDescriptor( &handle,
                      &descB,
                      nmodeB,
                      extentB.data(),
                      NULL,/*stride*/
                      typeB, CUTENSOR_OP_IDENTITY ) );

        cutensorTensorDescriptor_t descC;
        HANDLE_ERROR( cutensorInitTensorDescriptor( &handle,
                      &descC,
                      nmodeC,
                      extentC.data(),
                      NULL,/*stride*/
                      typeC, CUTENSOR_OP_IDENTITY ) );

        printf("Initialize cuTENSOR and tensor descriptors\n");

        /* ***************************** */

        // Retrieve the memory alignment for each tensor
        uint32_t alignmentRequirementA;
        HANDLE_ERROR( cutensorGetAlignmentRequirement( &handle,
                      A_d, &descA, &alignmentRequirementA) );

        uint32_t alignmentRequirementB;
        HANDLE_ERROR( cutensorGetAlignmentRequirement( &handle,
                      B_d, &descB, &alignmentRequirementB) );

        uint32_t alignmentRequirementC;
        HANDLE_ERROR( cutensorGetAlignmentRequirement( &handle,
                      C_d, &descC, &alignmentRequirementC) );

        printf("Query best alignment requirement for our pointers\n");

        /* ***************************** */

        // Create the Contraction Descriptor
        cutensorContractionDescriptor_t desc;
        HANDLE_ERROR( cutensorInitContractionDescriptor( &handle,
                      &desc,
                      &descA, modeA.data(), alignmentRequirementA,
                      &descB, modeB.data(), alignmentRequirementB,
                      &descC, modeC.data(), alignmentRequirementC,
                      &descC, modeC.data(), alignmentRequirementC,
                      typeCompute) );

        printf("Initialize contraction descriptor\n");

        /* ***************************** */

        // Set the algorithm to use
        cutensorContractionFind_t find;
        HANDLE_ERROR( cutensorInitContractionFind( &handle,
                      &find,
                      CUTENSOR_ALGO_DEFAULT) );

        printf("Initialize settings to find algorithm\n");

        /* ***************************** */

        // Query workspace
        size_t worksize = 0;
        HANDLE_ERROR( cutensorContractionGetWorkspace(&handle,
                      &desc,
                      &find,
                      CUTENSOR_WORKSPACE_RECOMMENDED, &worksize) );

        // Allocate workspace
        void *work = nullptr;
        if(worksize > 0)
        {
            if( cudaSuccess != cudaMalloc(&work, worksize) ) // This is optional!
            {
                work = nullptr;
                worksize = 0;
            }
        }

        printf("Query recommended workspace size and allocate it\n");

        return 0;
    }
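Instead of relying on the heuristic, a specific kernel family can be requested by passing a different :ref:`cutensorAlgo-label` value, e.g. :code:`CUTENSOR_ALGO_GETT` or :code:`CUTENSOR_ALGO_TTGT`. A sketch, not part of the example; note that a fixed choice may fail for problems the chosen algorithm does not support, whereas :code:`CUTENSOR_ALGO_DEFAULT` always has a fallback:

.. code-block:: cpp

    // Sketch: request the GETT algorithm explicitly instead of relying on
    // cuTENSOR's internal heuristic.
    cutensorContractionFind_t findGett;
    HANDLE_ERROR( cutensorInitContractionFind( &handle,
                  &findGett,
                  CUTENSOR_ALGO_GETT) );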
Plan and Execute
----------------

Finally, we are ready to create the contraction plan and execute the tensor contraction:

.. code-block:: cpp

    #include <stdlib.h>
    #include <stdio.h>

    #include <unordered_map>
    #include <vector>

    #include <cuda_runtime.h>
    #include <cutensor.h>

    // Handle cuTENSOR errors
    #define HANDLE_ERROR(x)                                                   \
    {                                                                         \
        const auto err = x;                                                   \
        if( err != CUTENSOR_STATUS_SUCCESS )                                  \
        {                                                                     \
            printf("Error: %s in line %d\n",                                  \
                   cutensorGetErrorString(err), __LINE__);                    \
            exit(-1);                                                         \
        }                                                                     \
    }

    int main(int argc, char** argv)
    {
        // Host element type definition
        typedef float floatTypeA;
        typedef float floatTypeB;
        typedef float floatTypeC;
        typedef float floatTypeCompute;

        // CUDA types
        cudaDataType_t typeA = CUDA_R_32F;
        cudaDataType_t typeB = CUDA_R_32F;
        cudaDataType_t typeC = CUDA_R_32F;
        cutensorComputeType_t typeCompute = CUTENSOR_R_MIN_32F;

        floatTypeCompute alpha = (floatTypeCompute)1.1f;
        floatTypeCompute beta  = (floatTypeCompute)0.9f;

        printf("Include headers and define data types\n");

        /* ***************************** */

        // Create vector of modes
        std::vector<int> modeC{'m','u','n','v'};
        std::vector<int> modeA{'m','h','k','n'};
        std::vector<int> modeB{'u','k','v','h'};
        int nmodeA = modeA.size();
        int nmodeB = modeB.size();
        int nmodeC = modeC.size();

        // Extents
        std::unordered_map<int, int64_t> extent;
        extent['m'] = 96;
        extent['n'] = 96;
        extent['u'] = 96;
        extent['v'] = 64;
        extent['h'] = 64;
        extent['k'] = 64;

        // Create a vector of extents for each tensor
        std::vector<int64_t> extentC;
        for(auto mode : modeC)
            extentC.push_back(extent[mode]);
        std::vector<int64_t> extentA;
        for(auto mode : modeA)
            extentA.push_back(extent[mode]);
        std::vector<int64_t> extentB;
        for(auto mode : modeB)
            extentB.push_back(extent[mode]);

        printf("Define modes and extents\n");

        /* ***************************** */

        // Number of elements of each tensor
        size_t elementsA = 1;
        for(auto mode : modeA)
            elementsA *= extent[mode];
        size_t elementsB = 1;
        for(auto mode : modeB)
            elementsB *= extent[mode];
        size_t elementsC = 1;
        for(auto mode : modeC)
            elementsC *= extent[mode];

        // Size in bytes
        size_t sizeA = sizeof(floatTypeA) * elementsA;
        size_t sizeB = sizeof(floatTypeB) * elementsB;
        size_t sizeC = sizeof(floatTypeC) * elementsC;

        // Allocate on device
        void *A_d, *B_d, *C_d;
        cudaMalloc((void**)&A_d, sizeA);
        cudaMalloc((void**)&B_d, sizeB);
        cudaMalloc((void**)&C_d, sizeC);

        // Allocate on host
        floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
        floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
        floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);

        // Initialize data on host
        for(int64_t i = 0; i < elementsA; i++)
            A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
        for(int64_t i = 0; i < elementsB; i++)
            B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
        for(int64_t i = 0; i < elementsC; i++)
            C[i] = (((float) rand())/RAND_MAX - 0.5)*100;

        // Copy to device
        cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice);
        cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice);
        cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice);

        printf("Allocate, initialize and transfer tensors\n");

        /* ***************************** */

        // Initialize cuTENSOR library
        cutensorHandle_t handle;
        cutensorInit(&handle);

        // Create Tensor Descriptors
        cutensorTensorDescriptor_t descA;
        HANDLE_ERROR( cutensorInitTensorDescriptor( &handle,
                      &descA,
                      nmodeA,
                      extentA.data(),
                      NULL,/*stride*/
                      typeA, CUTENSOR_OP_IDENTITY ) );

        cutensorTensorDescriptor_t descB;
        HANDLE_ERROR( cutensorInitTensorDescriptor( &handle,
                      &descB,
                      nmodeB,
                      extentB.data(),
                      NULL,/*stride*/
                      typeB, CUTENSOR_OP_IDENTITY ) );

        cutensorTensorDescriptor_t descC;
        HANDLE_ERROR( cutensorInitTensorDescriptor( &handle,
                      &descC,
                      nmodeC,
                      extentC.data(),
                      NULL,/*stride*/
                      typeC, CUTENSOR_OP_IDENTITY ) );

        printf("Initialize cuTENSOR and tensor descriptors\n");

        /* ***************************** */

        // Retrieve the memory alignment for each tensor
        uint32_t alignmentRequirementA;
        HANDLE_ERROR( cutensorGetAlignmentRequirement( &handle,
                      A_d, &descA, &alignmentRequirementA) );

        uint32_t alignmentRequirementB;
        HANDLE_ERROR( cutensorGetAlignmentRequirement( &handle,
                      B_d, &descB, &alignmentRequirementB) );

        uint32_t alignmentRequirementC;
        HANDLE_ERROR( cutensorGetAlignmentRequirement( &handle,
                      C_d, &descC, &alignmentRequirementC) );

        printf("Query best alignment requirement for our pointers\n");

        /* ***************************** */

        // Create the Contraction Descriptor
        cutensorContractionDescriptor_t desc;
        HANDLE_ERROR( cutensorInitContractionDescriptor( &handle,
                      &desc,
                      &descA, modeA.data(), alignmentRequirementA,
                      &descB, modeB.data(), alignmentRequirementB,
                      &descC, modeC.data(), alignmentRequirementC,
                      &descC, modeC.data(), alignmentRequirementC,
                      typeCompute) );

        printf("Initialize contraction descriptor\n");

        /* ***************************** */

        // Set the algorithm to use
        cutensorContractionFind_t find;
        HANDLE_ERROR( cutensorInitContractionFind( &handle,
                      &find,
                      CUTENSOR_ALGO_DEFAULT) );

        printf("Initialize settings to find algorithm\n");

        /* ***************************** */

        // Query workspace
        size_t worksize = 0;
        HANDLE_ERROR( cutensorContractionGetWorkspace(&handle,
                      &desc,
                      &find,
                      CUTENSOR_WORKSPACE_RECOMMENDED, &worksize) );

        // Allocate workspace
        void *work = nullptr;
        if(worksize > 0)
        {
            if( cudaSuccess != cudaMalloc(&work, worksize) ) // This is optional!
            {
                work = nullptr;
                worksize = 0;
            }
        }

        printf("Query recommended workspace size and allocate it\n");

        /* ***************************** */

        // Create Contraction Plan
        cutensorContractionPlan_t plan;
        HANDLE_ERROR( cutensorInitContractionPlan(&handle,
                      &plan,
                      &desc,
                      &find,
                      worksize) );

        printf("Create plan for contraction\n");

        /* ***************************** */

        cutensorStatus_t err;

        // Execute the tensor contraction
        err = cutensorContraction(&handle,
                                  &plan,
                                  (void*) &alpha, A_d, B_d,
                                  (void*) &beta,  C_d, C_d,
                                  work, worksize, 0 /* stream */);
        cudaDeviceSynchronize();

        // Check for errors
        if(err != CUTENSOR_STATUS_SUCCESS)
        {
            printf("ERROR: %s\n", cutensorGetErrorString(err));
        }

        printf("Execute contraction from plan\n");

        /* ***************************** */

        if ( A ) free( A );
        if ( B ) free( B );
        if ( C ) free( C );
        if ( A_d ) cudaFree( A_d );
        if ( B_d ) cudaFree( B_d );
        if ( C_d ) cudaFree( C_d );
        if ( work ) cudaFree( work );

        printf("Successful completion\n");

        return 0;
    }
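To build confidence in the result, the output can be spot-checked on the host before the buffers are freed. The sketch below is not part of the original example; it re-computes a few randomly chosen elements of :math:`C` with a naive loop, relying on the packed column-major layouts described above and on the fact that the host arrays :code:`A`, :code:`B`, and :code:`C` still hold the pre-contraction values. Insert it before the :code:`free`/:code:`cudaFree` calls; GPU and CPU values should agree to within floating-point rounding.

.. code-block:: cpp

    // Spot-check: compare a few elements of the GPU result against a naive
    // host computation of alpha * A * B + beta * C.
    floatTypeC *C_result = (floatTypeC*) malloc(sizeC);
    cudaMemcpy(C_result, C_d, sizeC, cudaMemcpyDeviceToHost);

    for(int trial = 0; trial < 5; trial++)
    {
        // Random output coordinates (extents: m, u, n = 96; v = 64).
        int m = rand() % 96, u = rand() % 96, n = rand() % 96, v = rand() % 64;

        // Reduce over the contracted modes h and k (extent 64 each),
        // using packed column-major indexing for the mode orders above.
        floatTypeCompute acc = 0;
        for(int h = 0; h < 64; h++)
            for(int k = 0; k < 64; k++)
                acc += A[m + 96*(h + 64*((int64_t)k + 64*n))]          // A_{m,h,k,n}
                     * B[u + 96*(k + 64*((int64_t)v + 64*h))];         // B_{u,k,v,h}

        int64_t idxC = m + 96*(u + 96*((int64_t)n + 96*v));            // C_{m,u,n,v}
        floatTypeC ref = alpha * acc + beta * C[idxC]; // host C still holds old values
        printf("C[%lld]: GPU=%f, CPU=%f\n", (long long)idxC, C_result[idxC], ref);
    }
    free(C_result);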
That is it: we have run our first cuTENSOR contraction! You can find this and other examples in the `samples repository <https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuTENSOR>`_.