Getting Started

In this section, we show how to implement a first tensor contraction using cuTENSOR. Our code will compute the following operation using single-precision arithmetic.

\[C_{m,u,n,v} = \alpha A_{m,h,k,n} B_{u,k,v,h} + \beta C_{m,u,n,v}\]

We build up the code step by step, with each step appending code at the end. The steps are separated by comments consisting of multiple stars.
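
To make the index notation concrete, the following plain C++ sketch spells out what the contraction computes. It is not part of the cuTENSOR example; it assumes a dense, generalized column-major layout in which the leftmost mode varies fastest, which is also what cuTENSOR assumes later in this example when NULL strides are passed:

#include <stdint.h>

// Reference sketch: C[m,u,n,v] = alpha * sum_{h,k} A[m,h,k,n] * B[u,k,v,h] + beta * C[m,u,n,v]
// All tensors are assumed dense with the leftmost mode varying fastest.
void referenceContraction(float alpha, const float* A, const float* B,
                          float beta, float* C,
                          int64_t M, int64_t N, int64_t U, int64_t V,
                          int64_t H, int64_t K)
{
    for (int64_t v = 0; v < V; ++v)
    for (int64_t n = 0; n < N; ++n)
    for (int64_t u = 0; u < U; ++u)
    for (int64_t m = 0; m < M; ++m)
    {
        float acc = 0.f;
        for (int64_t k = 0; k < K; ++k)
        for (int64_t h = 0; h < H; ++h)
            acc += A[m + M*(h + H*(k + K*n))] * B[u + U*(k + K*(v + V*h))];
        const int64_t idxC = m + M*(u + U*(n + N*v)); // linear index of C[m,u,n,v]
        C[idxC] = alpha * acc + beta * C[idxC];
    }
}

In the example below, the extents are M = N = U = 96 and V = H = K = 64.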

Installation and Compilation

The download for cuTENSOR is available here. Assuming cuTENSOR has been downloaded to CUTENSOR_DOWNLOAD_FILE, we extract it, remember cuTENSOR’s root location, and update the library path accordingly (in this example we are using the library built for CUDA 12; depending on your CUDA toolkit version, you might have to choose a different subdirectory).

tar xf ${CUTENSOR_DOWNLOAD_FILE}
export CUTENSOR_ROOT=${PWD}/libcutensor
export LD_LIBRARY_PATH=${CUTENSOR_ROOT}/lib/12/:${LD_LIBRARY_PATH}

If we store the following code in a file called contraction.cu, we can compile it via the following command:

nvcc contraction.cu -L${CUTENSOR_ROOT}/lib/12/ -I${CUTENSOR_ROOT}/include -std=c++11 -lcutensor -o contraction

When compiling intermediate steps of this example, the compiler might warn about unused variables; this is expected, since the intermediate listings are not yet complete programs. The final step should compile without warnings.

Headers and Data Types

To start off, we begin with a trivial main() function, include some headers, and define some data types.

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <cutensor.h>

int main(int argc, char** argv)
{
  // Host element type definition
  typedef float floatTypeA;
  typedef float floatTypeB;
  typedef float floatTypeC;
  typedef float floatTypeCompute;

  // CUDA types
  cutensorDataType_t typeA = CUTENSOR_R_32F;
  cutensorDataType_t typeB = CUTENSOR_R_32F;
  cutensorDataType_t typeC = CUTENSOR_R_32F;
  cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;

  printf("Include headers and define data types\n");

  return 0;
}

Define Tensor Sizes

Next, we define the modes and extents of our tensors. For the sake of this example, let us say that the modes \(m\), \(n\) and \(u\) have an extent of 96, and that \(v\), \(h\) and \(k\) have an extent of 64. Note that modes are labeled by integers, which also allows us to label them using characters (each character converts to its integer value). See Nomenclature for an explanation of the terms mode and extent.

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <cutensor.h>

#include <unordered_map>
#include <vector>

int main(int argc, char** argv)
{
  // Host element type definition
  typedef float floatTypeA;
  typedef float floatTypeB;
  typedef float floatTypeC;
  typedef float floatTypeCompute;

  // CUDA types
  cutensorDataType_t typeA = CUTENSOR_R_32F;
  cutensorDataType_t typeB = CUTENSOR_R_32F;
  cutensorDataType_t typeC = CUTENSOR_R_32F;
  cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;

  printf("Include headers and define data types\n");

  /* ***************************** */

  // Create vector of modes
  std::vector<int> modeC{'m','u','n','v'};
  std::vector<int> modeA{'m','h','k','n'};
  std::vector<int> modeB{'u','k','v','h'};
  int nmodeA = modeA.size();
  int nmodeB = modeB.size();
  int nmodeC = modeC.size();

  // Extents
  std::unordered_map<int, int64_t> extent;
  extent['m'] = 96;
  extent['n'] = 96;
  extent['u'] = 96;
  extent['v'] = 64;
  extent['h'] = 64;
  extent['k'] = 64;

  // Create a vector of extents for each tensor
  std::vector<int64_t> extentC;
  for(auto mode : modeC)
      extentC.push_back(extent[mode]);
  std::vector<int64_t> extentA;
  for(auto mode : modeA)
      extentA.push_back(extent[mode]);
  std::vector<int64_t> extentB;
  for(auto mode : modeB)
      extentB.push_back(extent[mode]);

  printf("Define modes and extents\n");

  return 0;
}

Initialize Tensor Data

Next, we need to allocate and initialize host and device memory for our tensors:

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <cutensor.h>

#include <unordered_map>
#include <vector>

int main(int argc, char** argv)
{
  // Host element type definition
  typedef float floatTypeA;
  typedef float floatTypeB;
  typedef float floatTypeC;
  typedef float floatTypeCompute;

  // CUDA types
  cutensorDataType_t typeA = CUTENSOR_R_32F;
  cutensorDataType_t typeB = CUTENSOR_R_32F;
  cutensorDataType_t typeC = CUTENSOR_R_32F;
  cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;

  printf("Include headers and define data types\n");

  /* ***************************** */

  // Create vector of modes
  std::vector<int> modeC{'m','u','n','v'};
  std::vector<int> modeA{'m','h','k','n'};
  std::vector<int> modeB{'u','k','v','h'};
  int nmodeA = modeA.size();
  int nmodeB = modeB.size();
  int nmodeC = modeC.size();

  // Extents
  std::unordered_map<int, int64_t> extent;
  extent['m'] = 96;
  extent['n'] = 96;
  extent['u'] = 96;
  extent['v'] = 64;
  extent['h'] = 64;
  extent['k'] = 64;

  // Create a vector of extents for each tensor
  std::vector<int64_t> extentC;
  for(auto mode : modeC)
      extentC.push_back(extent[mode]);
  std::vector<int64_t> extentA;
  for(auto mode : modeA)
      extentA.push_back(extent[mode]);
  std::vector<int64_t> extentB;
  for(auto mode : modeB)
      extentB.push_back(extent[mode]);

  printf("Define modes and extents\n");

  /* ***************************** */

  // Number of elements of each tensor
  size_t elementsA = 1;
  for(auto mode : modeA)
      elementsA *= extent[mode];
  size_t elementsB = 1;
  for(auto mode : modeB)
      elementsB *= extent[mode];
  size_t elementsC = 1;
  for(auto mode : modeC)
      elementsC *= extent[mode];

  // Size in bytes
  size_t sizeA = sizeof(floatTypeA) * elementsA;
  size_t sizeB = sizeof(floatTypeB) * elementsB;
  size_t sizeC = sizeof(floatTypeC) * elementsC;

  // Allocate on device
  void *A_d, *B_d, *C_d;
  cudaMalloc((void**)&A_d, sizeA);
  cudaMalloc((void**)&B_d, sizeB);
  cudaMalloc((void**)&C_d, sizeC);

  // Allocate on host
  floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
  floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
  floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);

  // Initialize data on host
  for(int64_t i = 0; i < elementsA; i++)
      A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
  for(int64_t i = 0; i < elementsB; i++)
      B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
  for(int64_t i = 0; i < elementsC; i++)
      C[i] = (((float) rand())/RAND_MAX - 0.5)*100;

  // Copy to device
  cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice);
  cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice);
  cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice);

  printf("Allocate, initialize and transfer tensors\n");

  /**********************
   * Free allocated data
   **********************/
  if (A) free(A);
  if (B) free(B);
  if (C) free(C);
  if (A_d) cudaFree(A_d);
  if (B_d) cudaFree(B_d);
  if (C_d) cudaFree(C_d);

  return 0;
}

Create Tensor Descriptors

We are now ready to use the cuTENSOR library and to initialize its library handle. Then, we create a descriptor for each tensor by providing its number of modes, extents, strides (passing NULL denotes a dense, packed layout), data type, and alignment (of the data pointer for which this tensor descriptor is created, in bytes). From this step on, all cuTENSOR and CUDA calls are also wrapped in simple error-handling macros.

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <cutensor.h>

#include <unordered_map>
#include <vector>

// Handle cuTENSOR errors
#define HANDLE_ERROR(x)                                             \
{ const auto err = x;                                               \
  if( err != CUTENSOR_STATUS_SUCCESS )                              \
  { printf("Error: %s\n", cutensorGetErrorString(err)); exit(-1); } \
};

#define HANDLE_CUDA_ERROR(x)                                      \
{ const auto err = x;                                             \
  if( err != cudaSuccess )                                        \
  { printf("Error: %s\n", cudaGetErrorString(err)); exit(-1); } \
};

int main(int argc, char** argv)
{
  // Host element type definition
  typedef float floatTypeA;
  typedef float floatTypeB;
  typedef float floatTypeC;
  typedef float floatTypeCompute;

  // CUDA types
  cutensorDataType_t typeA = CUTENSOR_R_32F;
  cutensorDataType_t typeB = CUTENSOR_R_32F;
  cutensorDataType_t typeC = CUTENSOR_R_32F;
  cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;

  printf("Include headers and define data types\n");

  /* ***************************** */

  // Create vector of modes
  std::vector<int> modeC{'m','u','n','v'};
  std::vector<int> modeA{'m','h','k','n'};
  std::vector<int> modeB{'u','k','v','h'};
  int nmodeA = modeA.size();
  int nmodeB = modeB.size();
  int nmodeC = modeC.size();

  // Extents
  std::unordered_map<int, int64_t> extent;
  extent['m'] = 96;
  extent['n'] = 96;
  extent['u'] = 96;
  extent['v'] = 64;
  extent['h'] = 64;
  extent['k'] = 64;

  // Create a vector of extents for each tensor
  std::vector<int64_t> extentC;
  for(auto mode : modeC)
      extentC.push_back(extent[mode]);
  std::vector<int64_t> extentA;
  for(auto mode : modeA)
      extentA.push_back(extent[mode]);
  std::vector<int64_t> extentB;
  for(auto mode : modeB)
      extentB.push_back(extent[mode]);

  printf("Define modes and extents\n");

  /* ***************************** */

  // Number of elements of each tensor
  size_t elementsA = 1;
  for(auto mode : modeA)
      elementsA *= extent[mode];
  size_t elementsB = 1;
  for(auto mode : modeB)
      elementsB *= extent[mode];
  size_t elementsC = 1;
  for(auto mode : modeC)
      elementsC *= extent[mode];

  // Size in bytes
  size_t sizeA = sizeof(floatTypeA) * elementsA;
  size_t sizeB = sizeof(floatTypeB) * elementsB;
  size_t sizeC = sizeof(floatTypeC) * elementsC;

  // Allocate on device
  void *A_d, *B_d, *C_d;
  cudaMalloc((void**)&A_d, sizeA);
  cudaMalloc((void**)&B_d, sizeB);
  cudaMalloc((void**)&C_d, sizeC);

  // Allocate on host
  floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
  floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
  floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);

  // Initialize data on host
  for(int64_t i = 0; i < elementsA; i++)
      A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
  for(int64_t i = 0; i < elementsB; i++)
      B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
  for(int64_t i = 0; i < elementsC; i++)
      C[i] = (((float) rand())/RAND_MAX - 0.5)*100;

  // Copy to device
  HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
  HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));
  HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));

  const uint32_t kAlignment = 128; // Alignment of the global-memory device pointers (bytes)
  assert(uintptr_t(A_d) % kAlignment == 0);
  assert(uintptr_t(B_d) % kAlignment == 0);
  assert(uintptr_t(C_d) % kAlignment == 0);

  printf("Allocate, initialize and transfer tensors\n");

  /*************************
   * cuTENSOR
   *************************/

  cutensorHandle_t handle;
  HANDLE_ERROR(cutensorCreate(&handle));

  /**********************
   * Create Tensor Descriptors
   **********************/

  cutensorTensorDescriptor_t descA;
  HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
               &descA,
               nmodeA,
               extentA.data(),
               NULL,/*stride*/
               typeA, kAlignment));

  cutensorTensorDescriptor_t descB;
  HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
               &descB,
               nmodeB,
               extentB.data(),
               NULL,/*stride*/
               typeB, kAlignment));

  cutensorTensorDescriptor_t descC;
  HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
               &descC,
               nmodeC,
               extentC.data(),
               NULL,/*stride*/
               typeC, kAlignment));

  printf("Initialize cuTENSOR and tensor descriptors\n");

  return 0;
}

Create Contraction Descriptor

In this step we create the operation descriptor that encodes the contraction and ensure that the scalar type is correct:

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <cutensor.h>

#include <unordered_map>
#include <vector>

// Handle cuTENSOR errors
#define HANDLE_ERROR(x)                                             \
{ const auto err = x;                                               \
  if( err != CUTENSOR_STATUS_SUCCESS )                              \
  { printf("Error: %s\n", cutensorGetErrorString(err)); exit(-1); } \
};

#define HANDLE_CUDA_ERROR(x)                                      \
{ const auto err = x;                                             \
  if( err != cudaSuccess )                                        \
  { printf("Error: %s\n", cudaGetErrorString(err)); exit(-1); } \
};

int main(int argc, char** argv)
{
  // Host element type definition
  typedef float floatTypeA;
  typedef float floatTypeB;
  typedef float floatTypeC;
  typedef float floatTypeCompute;

  // CUDA types
  cutensorDataType_t typeA = CUTENSOR_R_32F;
  cutensorDataType_t typeB = CUTENSOR_R_32F;
  cutensorDataType_t typeC = CUTENSOR_R_32F;
  cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;

  printf("Include headers and define data types\n");

  /* ***************************** */

  // Create vector of modes
  std::vector<int> modeC{'m','u','n','v'};
  std::vector<int> modeA{'m','h','k','n'};
  std::vector<int> modeB{'u','k','v','h'};
  int nmodeA = modeA.size();
  int nmodeB = modeB.size();
  int nmodeC = modeC.size();

  // Extents
  std::unordered_map<int, int64_t> extent;
  extent['m'] = 96;
  extent['n'] = 96;
  extent['u'] = 96;
  extent['v'] = 64;
  extent['h'] = 64;
  extent['k'] = 64;

  // Create a vector of extents for each tensor
  std::vector<int64_t> extentC;
  for(auto mode : modeC)
      extentC.push_back(extent[mode]);
  std::vector<int64_t> extentA;
  for(auto mode : modeA)
      extentA.push_back(extent[mode]);
  std::vector<int64_t> extentB;
  for(auto mode : modeB)
      extentB.push_back(extent[mode]);

  printf("Define modes and extents\n");

  /* ***************************** */

  // Number of elements of each tensor
  size_t elementsA = 1;
  for(auto mode : modeA)
      elementsA *= extent[mode];
  size_t elementsB = 1;
  for(auto mode : modeB)
      elementsB *= extent[mode];
  size_t elementsC = 1;
  for(auto mode : modeC)
      elementsC *= extent[mode];

  // Size in bytes
  size_t sizeA = sizeof(floatTypeA) * elementsA;
  size_t sizeB = sizeof(floatTypeB) * elementsB;
  size_t sizeC = sizeof(floatTypeC) * elementsC;

  // Allocate on device
  void *A_d, *B_d, *C_d;
  cudaMalloc((void**)&A_d, sizeA);
  cudaMalloc((void**)&B_d, sizeB);
  cudaMalloc((void**)&C_d, sizeC);

  // Allocate on host
  floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
  floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
  floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);

  // Initialize data on host
  for(int64_t i = 0; i < elementsA; i++)
      A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
  for(int64_t i = 0; i < elementsB; i++)
      B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
  for(int64_t i = 0; i < elementsC; i++)
      C[i] = (((float) rand())/RAND_MAX - 0.5)*100;

  // Copy to device
  HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
  HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));
  HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));

  const uint32_t kAlignment = 128; // Alignment of the global-memory device pointers (bytes)
  assert(uintptr_t(A_d) % kAlignment == 0);
  assert(uintptr_t(B_d) % kAlignment == 0);
  assert(uintptr_t(C_d) % kAlignment == 0);

  printf("Allocate, initialize and transfer tensors\n");

  /*************************
   * cuTENSOR
   *************************/

  cutensorHandle_t handle;
  HANDLE_ERROR(cutensorCreate(&handle));

  /**********************
   * Create Tensor Descriptors
   **********************/

  cutensorTensorDescriptor_t descA;
  HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
               &descA,
               nmodeA,
               extentA.data(),
               NULL,/*stride*/
               typeA, kAlignment));

  cutensorTensorDescriptor_t descB;
  HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
               &descB,
               nmodeB,
               extentB.data(),
               NULL,/*stride*/
               typeB, kAlignment));

  cutensorTensorDescriptor_t descC;
  HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
               &descC,
               nmodeC,
               extentC.data(),
               NULL,/*stride*/
               typeC, kAlignment));

  printf("Initialize cuTENSOR and tensor descriptors\n");

  /*******************************
   * Create Contraction Descriptor
   *******************************/

  cutensorOperationDescriptor_t desc;
  HANDLE_ERROR(cutensorCreateContraction(handle,
               &desc,
               descA, modeA.data(), /* unary operator A*/CUTENSOR_OP_IDENTITY,
               descB, modeB.data(), /* unary operator B*/CUTENSOR_OP_IDENTITY,
               descC, modeC.data(), /* unary operator C*/CUTENSOR_OP_IDENTITY,
               descC, modeC.data(),
               descCompute));

  /*****************************
   * Optional (but recommended): ensure that the scalar type is correct.
   *****************************/

  cutensorDataType_t scalarType;
  HANDLE_ERROR(cutensorOperationDescriptorGetAttribute(handle,
      desc,
      CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
      (void*)&scalarType,
      sizeof(scalarType)));

  assert(scalarType == CUTENSOR_R_32F);
  typedef float floatTypeCompute;
  floatTypeCompute alpha = (floatTypeCompute)1.1f;
  floatTypeCompute beta  = (floatTypeCompute)0.f;

  return 0;
}

Determine Algorithm and Workspace

Now that we have defined both the tensors and the contraction that we want to perform, we must select an algorithm to perform the contraction. That algorithm is specified by cutensorAlgo_t. Specifying CUTENSOR_ALGO_DEFAULT lets cuTENSOR’s internal heuristic choose the best approach. All the information needed to find a good algorithm is stored in the cutensorPlanPreference_t data structure. We can also query the library for an estimate of the workspace required by the provided operation descriptor; users can choose between different cutensorWorksizePreference_t values. For this example we use CUTENSOR_WORKSPACE_DEFAULT, a good default choice that aims for high performance while keeping the workspace requirement moderate. While workspace memory is not strictly required, it is strongly recommended for attaining high performance.

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <cutensor.h>

#include <unordered_map>
#include <vector>

// Handle cuTENSOR errors
#define HANDLE_ERROR(x)                                             \
{ const auto err = x;                                               \
  if( err != CUTENSOR_STATUS_SUCCESS )                              \
  { printf("Error: %s\n", cutensorGetErrorString(err)); exit(-1); } \
};

#define HANDLE_CUDA_ERROR(x)                                      \
{ const auto err = x;                                             \
  if( err != cudaSuccess )                                        \
  { printf("Error: %s\n", cudaGetErrorString(err)); exit(-1); } \
};

int main(int argc, char** argv)
{
  // Host element type definition
  typedef float floatTypeA;
  typedef float floatTypeB;
  typedef float floatTypeC;
  typedef float floatTypeCompute;

  // CUDA types
  cutensorDataType_t typeA = CUTENSOR_R_32F;
  cutensorDataType_t typeB = CUTENSOR_R_32F;
  cutensorDataType_t typeC = CUTENSOR_R_32F;
  cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;

  printf("Include headers and define data types\n");

  /* ***************************** */

  // Create vector of modes
  std::vector<int> modeC{'m','u','n','v'};
  std::vector<int> modeA{'m','h','k','n'};
  std::vector<int> modeB{'u','k','v','h'};
  int nmodeA = modeA.size();
  int nmodeB = modeB.size();
  int nmodeC = modeC.size();

  // Extents
  std::unordered_map<int, int64_t> extent;
  extent['m'] = 96;
  extent['n'] = 96;
  extent['u'] = 96;
  extent['v'] = 64;
  extent['h'] = 64;
  extent['k'] = 64;

  // Create a vector of extents for each tensor
  std::vector<int64_t> extentC;
  for(auto mode : modeC)
      extentC.push_back(extent[mode]);
  std::vector<int64_t> extentA;
  for(auto mode : modeA)
      extentA.push_back(extent[mode]);
  std::vector<int64_t> extentB;
  for(auto mode : modeB)
      extentB.push_back(extent[mode]);

  printf("Define modes and extents\n");

  /* ***************************** */

  // Number of elements of each tensor
  size_t elementsA = 1;
  for(auto mode : modeA)
      elementsA *= extent[mode];
  size_t elementsB = 1;
  for(auto mode : modeB)
      elementsB *= extent[mode];
  size_t elementsC = 1;
  for(auto mode : modeC)
      elementsC *= extent[mode];

  // Size in bytes
  size_t sizeA = sizeof(floatTypeA) * elementsA;
  size_t sizeB = sizeof(floatTypeB) * elementsB;
  size_t sizeC = sizeof(floatTypeC) * elementsC;

  // Allocate on device
  void *A_d, *B_d, *C_d;
  cudaMalloc((void**)&A_d, sizeA);
  cudaMalloc((void**)&B_d, sizeB);
  cudaMalloc((void**)&C_d, sizeC);

  // Allocate on host
  floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
  floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
  floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);

  // Initialize data on host
  for(int64_t i = 0; i < elementsA; i++)
      A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
  for(int64_t i = 0; i < elementsB; i++)
      B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
  for(int64_t i = 0; i < elementsC; i++)
      C[i] = (((float) rand())/RAND_MAX - 0.5)*100;

  // Copy to device
  HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
  HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));
  HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));

  const uint32_t kAlignment = 128; // Alignment of the global-memory device pointers (bytes)
  assert(uintptr_t(A_d) % kAlignment == 0);
  assert(uintptr_t(B_d) % kAlignment == 0);
  assert(uintptr_t(C_d) % kAlignment == 0);

  printf("Allocate, initialize and transfer tensors\n");

  /*************************
   * cuTENSOR
   *************************/

  cutensorHandle_t handle;
  HANDLE_ERROR(cutensorCreate(&handle));

  /**********************
   * Create Tensor Descriptors
   **********************/

  cutensorTensorDescriptor_t descA;
  HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
               &descA,
               nmodeA,
               extentA.data(),
               NULL,/*stride*/
               typeA, kAlignment));

  cutensorTensorDescriptor_t descB;
  HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
               &descB,
               nmodeB,
               extentB.data(),
               NULL,/*stride*/
               typeB, kAlignment));

  cutensorTensorDescriptor_t descC;
  HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
               &descC,
               nmodeC,
               extentC.data(),
               NULL,/*stride*/
               typeC, kAlignment));

  printf("Initialize cuTENSOR and tensor descriptors\n");

  /*******************************
   * Create Contraction Descriptor
   *******************************/

  cutensorOperationDescriptor_t desc;
  HANDLE_ERROR(cutensorCreateContraction(handle,
               &desc,
               descA, modeA.data(), /* unary operator A*/CUTENSOR_OP_IDENTITY,
               descB, modeB.data(), /* unary operator B*/CUTENSOR_OP_IDENTITY,
               descC, modeC.data(), /* unary operator C*/CUTENSOR_OP_IDENTITY,
               descC, modeC.data(),
               descCompute));

  /*****************************
   * Optional (but recommended): ensure that the scalar type is correct.
   *****************************/

  cutensorDataType_t scalarType;
  HANDLE_ERROR(cutensorOperationDescriptorGetAttribute(handle,
      desc,
      CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
      (void*)&scalarType,
      sizeof(scalarType)));

  assert(scalarType == CUTENSOR_R_32F);
  typedef float floatTypeCompute;
  floatTypeCompute alpha = (floatTypeCompute)1.1f;
  floatTypeCompute beta  = (floatTypeCompute)0.f;

  /**************************
  * Set the algorithm to use
  ***************************/

  const cutensorAlgo_t algo = CUTENSOR_ALGO_DEFAULT;

  cutensorPlanPreference_t planPref;
  HANDLE_ERROR(cutensorCreatePlanPreference(
                             handle,
                             &planPref,
                             algo,
                             CUTENSOR_JIT_MODE_NONE));

  /**********************
   * Query workspace estimate
   **********************/

  uint64_t workspaceSizeEstimate = 0;
  const cutensorWorksizePreference_t workspacePref = CUTENSOR_WORKSPACE_DEFAULT;
  HANDLE_ERROR(cutensorEstimateWorkspaceSize(handle,
                                        desc,
                                        planPref,
                                        workspacePref,
                                        &workspaceSizeEstimate));

  return 0;
}

Notice that we are not using just-in-time compilation in this example (CUTENSOR_JIT_MODE_NONE); if you want to learn more about cuTENSOR’s JIT capabilities, please refer to Just In Time (JIT) Compilation.
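
If you do want to try JIT compilation, a minimal sketch of the change is shown below; it assumes that your cuTENSOR version exposes CUTENSOR_JIT_MODE_DEFAULT and only swaps the last argument of cutensorCreatePlanPreference (the selected kernel is then compiled at plan-creation time):

// Hedged sketch (assumption: CUTENSOR_JIT_MODE_DEFAULT is available in your cuTENSOR version).
// Only the JIT mode passed to cutensorCreatePlanPreference changes; everything else stays the same.
cutensorPlanPreference_t planPrefJit;
HANDLE_ERROR(cutensorCreatePlanPreference(
                           handle,
                           &planPrefJit,
                           CUTENSOR_ALGO_DEFAULT,
                           CUTENSOR_JIT_MODE_DEFAULT));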

Plan and Reduce Workspace

With an estimate of the workspace size at hand, we can now create the actual plan; this step relies on cuTENSOR’s heuristics to select the most suitable algorithm and kernel:

cutensorPlan_t plan;
HANDLE_ERROR(cutensorCreatePlan(handle,
             &plan,
             desc,
             planPref,
             workspaceSizeEstimate));

Once the plan is created and the kernel is selected, we can (optionally) query the workspace that is actually required by the plan:

uint64_t actualWorkspaceSize = 0;
HANDLE_ERROR(cutensorPlanGetAttribute(handle,
    plan,
    CUTENSOR_PLAN_REQUIRED_WORKSPACE,
    &actualWorkspaceSize,
    sizeof(actualWorkspaceSize)));

With that our code becomes:

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <cutensor.h>

#include <unordered_map>
#include <vector>

// Handle cuTENSOR errors
#define HANDLE_ERROR(x)                                             \
{ const auto err = x;                                               \
  if( err != CUTENSOR_STATUS_SUCCESS )                              \
  { printf("Error: %s\n", cutensorGetErrorString(err)); exit(-1); } \
};

#define HANDLE_CUDA_ERROR(x)                                      \
{ const auto err = x;                                             \
  if( err != cudaSuccess )                                        \
  { printf("Error: %s\n", cudaGetErrorString(err)); exit(-1); } \
};

int main(int argc, char** argv)
{
  // Host element type definition
  typedef float floatTypeA;
  typedef float floatTypeB;
  typedef float floatTypeC;
  typedef float floatTypeCompute;

  // CUDA types
  cutensorDataType_t typeA = CUTENSOR_R_32F;
  cutensorDataType_t typeB = CUTENSOR_R_32F;
  cutensorDataType_t typeC = CUTENSOR_R_32F;
  cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;

  printf("Include headers and define data types\n");

  /* ***************************** */

  // Create vector of modes
  std::vector<int> modeC{'m','u','n','v'};
  std::vector<int> modeA{'m','h','k','n'};
  std::vector<int> modeB{'u','k','v','h'};
  int nmodeA = modeA.size();
  int nmodeB = modeB.size();
  int nmodeC = modeC.size();

  // Extents
  std::unordered_map<int, int64_t> extent;
  extent['m'] = 96;
  extent['n'] = 96;
  extent['u'] = 96;
  extent['v'] = 64;
  extent['h'] = 64;
  extent['k'] = 64;

  // Create a vector of extents for each tensor
  std::vector<int64_t> extentC;
  for(auto mode : modeC)
      extentC.push_back(extent[mode]);
  std::vector<int64_t> extentA;
  for(auto mode : modeA)
      extentA.push_back(extent[mode]);
  std::vector<int64_t> extentB;
  for(auto mode : modeB)
      extentB.push_back(extent[mode]);

  printf("Define modes and extents\n");

  /* ***************************** */

  // Number of elements of each tensor
  size_t elementsA = 1;
  for(auto mode : modeA)
      elementsA *= extent[mode];
  size_t elementsB = 1;
  for(auto mode : modeB)
      elementsB *= extent[mode];
  size_t elementsC = 1;
  for(auto mode : modeC)
      elementsC *= extent[mode];

  // Size in bytes
  size_t sizeA = sizeof(floatTypeA) * elementsA;
  size_t sizeB = sizeof(floatTypeB) * elementsB;
  size_t sizeC = sizeof(floatTypeC) * elementsC;

  // Allocate on device
  void *A_d, *B_d, *C_d;
  cudaMalloc((void**)&A_d, sizeA);
  cudaMalloc((void**)&B_d, sizeB);
  cudaMalloc((void**)&C_d, sizeC);

  // Allocate on host
  floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
  floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
  floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);

  // Initialize data on host
  for(int64_t i = 0; i < elementsA; i++)
      A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
  for(int64_t i = 0; i < elementsB; i++)
      B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
  for(int64_t i = 0; i < elementsC; i++)
      C[i] = (((float) rand())/RAND_MAX - 0.5)*100;

  // Copy to device
  HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
  HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));
  HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));

  const uint32_t kAlignment = 128; // Alignment of the global-memory device pointers (bytes)
  assert(uintptr_t(A_d) % kAlignment == 0);
  assert(uintptr_t(B_d) % kAlignment == 0);
  assert(uintptr_t(C_d) % kAlignment == 0);

  printf("Allocate, initialize and transfer tensors\n");

  /*************************
   * cuTENSOR
   *************************/

  cutensorHandle_t handle;
  HANDLE_ERROR(cutensorCreate(&handle));

  /**********************
   * Create Tensor Descriptors
   **********************/

  cutensorTensorDescriptor_t descA;
  HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
               &descA,
               nmodeA,
               extentA.data(),
               NULL,/*stride*/
               typeA, kAlignment));

  cutensorTensorDescriptor_t descB;
  HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
               &descB,
               nmodeB,
               extentB.data(),
               NULL,/*stride*/
               typeB, kAlignment));

  cutensorTensorDescriptor_t descC;
  HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
               &descC,
               nmodeC,
               extentC.data(),
               NULL,/*stride*/
               typeC, kAlignment));

  printf("Initialize cuTENSOR and tensor descriptors\n");

  /*******************************
   * Create Contraction Descriptor
   *******************************/

  cutensorOperationDescriptor_t desc;
  HANDLE_ERROR(cutensorCreateContraction(handle,
               &desc,
               descA, modeA.data(), /* unary operator A*/CUTENSOR_OP_IDENTITY,
               descB, modeB.data(), /* unary operator B*/CUTENSOR_OP_IDENTITY,
               descC, modeC.data(), /* unary operator C*/CUTENSOR_OP_IDENTITY,
               descC, modeC.data(),
               descCompute));

  /*****************************
   * Optional (but recommended): ensure that the scalar type is correct.
   *****************************/

  cutensorDataType_t scalarType;
  HANDLE_ERROR(cutensorOperationDescriptorGetAttribute(handle,
      desc,
      CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
      (void*)&scalarType,
      sizeof(scalarType)));

  assert(scalarType == CUTENSOR_R_32F);
  typedef float floatTypeCompute;
  floatTypeCompute alpha = (floatTypeCompute)1.1f;
  floatTypeCompute beta  = (floatTypeCompute)0.f;

  /**************************
  * Set the algorithm to use
  ***************************/

  const cutensorAlgo_t algo = CUTENSOR_ALGO_DEFAULT;

  cutensorPlanPreference_t planPref;
  HANDLE_ERROR(cutensorCreatePlanPreference(
                             handle,
                             &planPref,
                             algo,
                             CUTENSOR_JIT_MODE_NONE));

  /**********************
   * Query workspace estimate
   **********************/

  uint64_t workspaceSizeEstimate = 0;
  const cutensorWorksizePreference_t workspacePref = CUTENSOR_WORKSPACE_DEFAULT;
  HANDLE_ERROR(cutensorEstimateWorkspaceSize(handle,
                                        desc,
                                        planPref,
                                        workspacePref,
                                        &workspaceSizeEstimate));

  /**************************
   * Create Contraction Plan
   **************************/

  cutensorPlan_t plan;
  HANDLE_ERROR(cutensorCreatePlan(handle,
               &plan,
               desc,
               planPref,
               workspaceSizeEstimate));

  /**************************
   * Optional: Query information about the created plan
   **************************/

  // query actually used workspace
  uint64_t actualWorkspaceSize = 0;
  HANDLE_ERROR(cutensorPlanGetAttribute(handle,
      plan,
      CUTENSOR_PLAN_REQUIRED_WORKSPACE,
      &actualWorkspaceSize,
      sizeof(actualWorkspaceSize)));

  // At this point the user knows exactly how much memory is needed by the operation and
  // only the smaller actual workspace needs to be allocated
  assert(actualWorkspaceSize <= workspaceSizeEstimate);

  void *work = nullptr;
  if (actualWorkspaceSize > 0)
  {
      HANDLE_CUDA_ERROR(cudaMalloc(&work, actualWorkspaceSize));
      assert(uintptr_t(work) % 128 == 0); // workspace must be aligned to 128 byte-boundary
  }

  return 0;
}

Execute

Finally, we are ready to execute the tensor contraction and destroy (free) all the allocated resources:

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include <cuda_runtime.h>
#include <cutensor.h>

#include <unordered_map>
#include <vector>

// Handle cuTENSOR errors
#define HANDLE_ERROR(x)                                             \
{ const auto err = x;                                               \
  if( err != CUTENSOR_STATUS_SUCCESS )                              \
  { printf("Error: %s\n", cutensorGetErrorString(err)); exit(-1); } \
};

#define HANDLE_CUDA_ERROR(x)                                      \
{ const auto err = x;                                             \
  if( err != cudaSuccess )                                        \
  { printf("Error: %s\n", cudaGetErrorString(err)); exit(-1); } \
};

int main(int argc, char** argv)
{
  // Host element type definition
  typedef float floatTypeA;
  typedef float floatTypeB;
  typedef float floatTypeC;
  typedef float floatTypeCompute;

  // CUDA types
  cutensorDataType_t typeA = CUTENSOR_R_32F;
  cutensorDataType_t typeB = CUTENSOR_R_32F;
  cutensorDataType_t typeC = CUTENSOR_R_32F;
  cutensorComputeDescriptor_t descCompute = CUTENSOR_COMPUTE_DESC_32F;

  printf("Include headers and define data types\n");

  /* ***************************** */

  // Create vector of modes
  std::vector<int> modeC{'m','u','n','v'};
  std::vector<int> modeA{'m','h','k','n'};
  std::vector<int> modeB{'u','k','v','h'};
  int nmodeA = modeA.size();
  int nmodeB = modeB.size();
  int nmodeC = modeC.size();

  // Extents
  std::unordered_map<int, int64_t> extent;
  extent['m'] = 96;
  extent['n'] = 96;
  extent['u'] = 96;
  extent['v'] = 64;
  extent['h'] = 64;
  extent['k'] = 64;

  // Create a vector of extents for each tensor
  std::vector<int64_t> extentC;
  for(auto mode : modeC)
      extentC.push_back(extent[mode]);
  std::vector<int64_t> extentA;
  for(auto mode : modeA)
      extentA.push_back(extent[mode]);
  std::vector<int64_t> extentB;
  for(auto mode : modeB)
      extentB.push_back(extent[mode]);

  printf("Define modes and extents\n");

  /* ***************************** */

  // Number of elements of each tensor
  size_t elementsA = 1;
  for(auto mode : modeA)
      elementsA *= extent[mode];
  size_t elementsB = 1;
  for(auto mode : modeB)
      elementsB *= extent[mode];
  size_t elementsC = 1;
  for(auto mode : modeC)
      elementsC *= extent[mode];

  // Size in bytes
  size_t sizeA = sizeof(floatTypeA) * elementsA;
  size_t sizeB = sizeof(floatTypeB) * elementsB;
  size_t sizeC = sizeof(floatTypeC) * elementsC;

  // Allocate on device
  void *A_d, *B_d, *C_d;
  cudaMalloc((void**)&A_d, sizeA);
  cudaMalloc((void**)&B_d, sizeB);
  cudaMalloc((void**)&C_d, sizeC);

  // Allocate on host
  floatTypeA *A = (floatTypeA*) malloc(sizeof(floatTypeA) * elementsA);
  floatTypeB *B = (floatTypeB*) malloc(sizeof(floatTypeB) * elementsB);
  floatTypeC *C = (floatTypeC*) malloc(sizeof(floatTypeC) * elementsC);

  // Initialize data on host
  for(int64_t i = 0; i < elementsA; i++)
      A[i] = (((float) rand())/RAND_MAX - 0.5)*100;
  for(int64_t i = 0; i < elementsB; i++)
      B[i] = (((float) rand())/RAND_MAX - 0.5)*100;
  for(int64_t i = 0; i < elementsC; i++)
      C[i] = (((float) rand())/RAND_MAX - 0.5)*100;

  // Copy to device
  HANDLE_CUDA_ERROR(cudaMemcpy(A_d, A, sizeA, cudaMemcpyHostToDevice));
  HANDLE_CUDA_ERROR(cudaMemcpy(B_d, B, sizeB, cudaMemcpyHostToDevice));
  HANDLE_CUDA_ERROR(cudaMemcpy(C_d, C, sizeC, cudaMemcpyHostToDevice));

  const uint32_t kAlignment = 128; // Alignment of the global-memory device pointers (bytes)
  assert(uintptr_t(A_d) % kAlignment == 0);
  assert(uintptr_t(B_d) % kAlignment == 0);
  assert(uintptr_t(C_d) % kAlignment == 0);

  printf("Allocate, initialize and transfer tensors\n");

  /*************************
   * cuTENSOR
   *************************/

  cutensorHandle_t handle;
  HANDLE_ERROR(cutensorCreate(&handle));

  /**********************
   * Create Tensor Descriptors
   **********************/

  cutensorTensorDescriptor_t descA;
  HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
               &descA,
               nmodeA,
               extentA.data(),
               NULL,/*stride*/
               typeA, kAlignment));

  cutensorTensorDescriptor_t descB;
  HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
               &descB,
               nmodeB,
               extentB.data(),
               NULL,/*stride*/
               typeB, kAlignment));

  cutensorTensorDescriptor_t descC;
  HANDLE_ERROR(cutensorCreateTensorDescriptor(handle,
               &descC,
               nmodeC,
               extentC.data(),
               NULL,/*stride*/
               typeC, kAlignment));

  printf("Initialize cuTENSOR and tensor descriptors\n");

  /*******************************
   * Create Contraction Descriptor
   *******************************/

  cutensorOperationDescriptor_t desc;
  HANDLE_ERROR(cutensorCreateContraction(handle,
               &desc,
               descA, modeA.data(), /* unary operator A*/CUTENSOR_OP_IDENTITY,
               descB, modeB.data(), /* unary operator B*/CUTENSOR_OP_IDENTITY,
               descC, modeC.data(), /* unary operator C*/CUTENSOR_OP_IDENTITY,
               descC, modeC.data(),
               descCompute));

  /*****************************
   * Optional (but recommended): ensure that the scalar type is correct.
   *****************************/

  cutensorDataType_t scalarType;
  HANDLE_ERROR(cutensorOperationDescriptorGetAttribute(handle,
      desc,
      CUTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
      (void*)&scalarType,
      sizeof(scalarType)));

  assert(scalarType == CUTENSOR_R_32F);
  typedef float floatTypeCompute;
  floatTypeCompute alpha = (floatTypeCompute)1.1f;
  floatTypeCompute beta  = (floatTypeCompute)0.f;

  /**************************
  * Set the algorithm to use
  ***************************/

  const cutensorAlgo_t algo = CUTENSOR_ALGO_DEFAULT;

  cutensorPlanPreference_t planPref;
  HANDLE_ERROR(cutensorCreatePlanPreference(
                             handle,
                             &planPref,
                             algo,
                             CUTENSOR_JIT_MODE_NONE));

  /**********************
   * Query workspace estimate
   **********************/

  uint64_t workspaceSizeEstimate = 0;
  const cutensorWorksizePreference_t workspacePref = CUTENSOR_WORKSPACE_DEFAULT;
  HANDLE_ERROR(cutensorEstimateWorkspaceSize(handle,
                                        desc,
                                        planPref,
                                        workspacePref,
                                        &workspaceSizeEstimate));

  /**************************
   * Create Contraction Plan
   **************************/

  cutensorPlan_t plan;
  HANDLE_ERROR(cutensorCreatePlan(handle,
               &plan,
               desc,
               planPref,
               workspaceSizeEstimate));

  /**************************
   * Optional: Query information about the created plan
   **************************/

  // query actually used workspace
  uint64_t actualWorkspaceSize = 0;
  HANDLE_ERROR(cutensorPlanGetAttribute(handle,
      plan,
      CUTENSOR_PLAN_REQUIRED_WORKSPACE,
      &actualWorkspaceSize,
      sizeof(actualWorkspaceSize)));

  // At this point the user knows exactly how much memory is needed by the operation and
  // only the smaller actual workspace needs to be allocated
  assert(actualWorkspaceSize <= workspaceSizeEstimate);

  void *work = nullptr;
  if (actualWorkspaceSize > 0)
  {
      HANDLE_CUDA_ERROR(cudaMalloc(&work, actualWorkspaceSize));
      assert(uintptr_t(work) % 128 == 0); // workspace must be aligned to 128 byte-boundary
  }

  /**********************
   * Execute
   **********************/

  cudaStream_t stream;
  HANDLE_CUDA_ERROR(cudaStreamCreate(&stream));

  HANDLE_ERROR(cutensorContract(handle,
                         plan,
                         (void*) &alpha, A_d, B_d,
                         (void*) &beta,  C_d, C_d,
                         work, actualWorkspaceSize, stream));

  /**********************
   * Free allocated data
   **********************/
  HANDLE_ERROR(cutensorDestroy(handle));
  HANDLE_ERROR(cutensorDestroyPlan(plan));
  HANDLE_ERROR(cutensorDestroyOperationDescriptor(desc));
  HANDLE_ERROR(cutensorDestroyTensorDescriptor(descA));
  HANDLE_ERROR(cutensorDestroyTensorDescriptor(descB));
  HANDLE_ERROR(cutensorDestroyTensorDescriptor(descC));
  HANDLE_CUDA_ERROR(cudaStreamDestroy(stream));

  if (A) free(A);
  if (B) free(B);
  if (C) free(C);
  if (A_d) cudaFree(A_d);
  if (B_d) cudaFree(B_d);
  if (C_d) cudaFree(C_d);
  if (work) cudaFree(work);

  return 0;
}
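
Optionally, before freeing the buffers, one can wait for the contraction to finish and copy the result back to the host for a quick sanity check. A minimal sketch, to be placed right after the cutensorContract call in the listing above (it only reuses variables already defined there):

// Hedged sketch: synchronize the stream and fetch the result for inspection.
HANDLE_CUDA_ERROR(cudaStreamSynchronize(stream));
HANDLE_CUDA_ERROR(cudaMemcpy(C, C_d, sizeC, cudaMemcpyDeviceToHost));
printf("C[0] = %f\n", C[0]);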

That is it. We have executed our first contraction via cuTENSOR! You can find this and other examples in the samples repository.