Getting Started

In this section, we show how to implement a first tensor contraction using nvplTENSOR. Our code will compute the following operation using single-precision arithmetic.

\[C_{0,2,1,3} = \alpha A_{0,4,5,1} B_{2,5,3,4} + \beta C_{0,2,1,3}\]

We build the code up step by step, each step adding code at the end. The steps are separated by comments consisting of multiple stars.

Headers and Data Types

To start off, we begin with a trivial main() function, include some headers, and define some data types.

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include <nvpl_tensor.h>

int main(int argc, char** argv)
{
    /* Element types of the tensors as seen by host code. */
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    /* Matching nvplTENSOR data-type tags and the compute descriptor. */
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    return 0;
}

Define Tensor Sizes

Next, we define the modes and extents of our tensors. For the sake of the example, let us say that the modes 0, 1 and 2 have an extent of 6, and that the modes 3, 4 and 5 have an extent of 4. Note how modes are labeled by integers; since a mode is just an `int32_t` value, character literals (e.g. 'a', 'b') could equally be used as labels. See Nomenclature for an explanation of the terms mode and extent.

#include <stdlib.h>
#include <stdio.h>
#include <assert.h>

#include <nvpl_tensor.h>

int main(int argc, char** argv)
{
    /* Element types of the tensors as seen by host code. */
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    /* Matching nvplTENSOR data-type tags and the compute descriptor. */
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    /* Modes are labeled by small integers; the mode lists spell out
     * C[0,2,1,3] = alpha * A[0,4,5,1] * B[2,5,3,4] + beta * C[0,2,1,3]. */
    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    const int nmodeA = 4;
    const int nmodeB = 4;
    const int nmodeC = 4;

    /* extent[m] is the extent of mode m: modes 0..2 -> 6, modes 3..5 -> 4. */
    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    /* Gather each tensor's extents in the order given by its mode list. */
    int64_t extentA[nmodeA];
    for (int m = 0; m < nmodeA; ++m)
        extentA[m] = extent[modeA[m]];

    int64_t extentB[nmodeB];
    for (int m = 0; m < nmodeB; ++m)
        extentB[m] = extent[modeB[m]];

    int64_t extentC[nmodeC];
    for (int m = 0; m < nmodeC; ++m)
        extentC[m] = extent[modeC[m]];

    printf("Define modes and extents\n");

    return 0;
}

Initialize Tensor Data

Next, we need to allocate and initialize host and device memory for our tensors:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <nvpl_tensor.h>

int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    // Modes label the axes of C[0,2,1,3] = alpha*A[0,4,5,1]*B[2,5,3,4] + beta*C
    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    // extent[m] is the extent of mode m: modes 0..2 -> 6, modes 3..5 -> 4
    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    // Per-tensor extents, in the order given by each tensor's mode list
    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor: the product of that tensor's own
    // extents. Iterating over extentA/extentB/extentC (not the global
    // extent[] array) is essential, otherwise the counts are wrong.
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128;  // Alignment of the pointers (bytes)

    // Allocate aligned host memory. Note that aligned_alloc requires the
    // size to be a multiple of the alignment; the sizes above satisfy this.
    floatTypeA* A = aligned_alloc(kAlignment, sizeA);
    floatTypeB* B = aligned_alloc(kAlignment, sizeB);
    floatTypeC* C = aligned_alloc(kAlignment, sizeC);

    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        // free(NULL) is a no-op, so releasing all three is safe here
        free(A);
        free(B);
        free(C);
        return -1;
    }

    // Initialize data with uniform pseudo-random values in [-50, 50)
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t)A % kAlignment == 0);
    assert((uintptr_t)B % kAlignment == 0);
    assert((uintptr_t)C % kAlignment == 0);

    printf("Allocate and initialize tensors\n");

    /**********************
     * Free allocated data
     **********************/

    free(A);
    free(B);
    free(C);

    return 0;
}

Create Tensor Descriptors

We are now ready to use the nvplTENSOR library and to initialize its library handle. Then, we create a descriptor for each tensor by providing its number of modes, extents, strides (a NULL stride pointer selects the default dense layout), data type, and alignment (of the data pointer for which this tensor descriptor is created, in bytes).

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <nvpl_tensor.h>

// Handle nvplTENSOR errors: print the error string and abort.
// Wrapped in do { } while (0) so the macro expands to a single statement
// (safe inside unbraced if/else) and requires the usual trailing ';'.
#define HANDLE_ERROR(x)                                           \
    do                                                            \
    {                                                             \
        const nvpltensorStatus_t err = (x);                       \
        if (err != NVPLTENSOR_STATUS_SUCCESS)                     \
        {                                                         \
            printf("Error: %s\n", nvpltensorGetErrorString(err)); \
            exit(-1);                                             \
        }                                                         \
    } while (0)

int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    // Modes label the axes of C[0,2,1,3] = alpha*A[0,4,5,1]*B[2,5,3,4] + beta*C
    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    // extent[m] is the extent of mode m: modes 0..2 -> 6, modes 3..5 -> 4
    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    // Per-tensor extents, in the order given by each tensor's mode list
    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor: the product of that tensor's own
    // extents. Iterating over extentA/extentB/extentC (not the global
    // extent[] array) is essential, otherwise the counts are wrong.
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128;  // Alignment of the pointers (bytes)

    // Allocate aligned host memory. Note that aligned_alloc requires the
    // size to be a multiple of the alignment; the sizes above satisfy this.
    floatTypeA* A = aligned_alloc(kAlignment, sizeA);
    floatTypeB* B = aligned_alloc(kAlignment, sizeB);
    floatTypeC* C = aligned_alloc(kAlignment, sizeC);

    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        // free(NULL) is a no-op, so releasing all three is safe here
        free(A);
        free(B);
        free(C);
        return -1;
    }

    // Initialize data with uniform pseudo-random values in [-50, 50)
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t)A % kAlignment == 0);
    assert((uintptr_t)B % kAlignment == 0);
    assert((uintptr_t)C % kAlignment == 0);

    printf("Allocate and initialize\n");

    /*************************
     * nvplTENSOR
     *************************/

    nvpltensorHandle_t handle;
    HANDLE_ERROR(nvpltensorCreate(&handle));

    /**********************
     * Set number of threads, that nvplTensor can use
     **********************/

    uint32_t const numThreads = 4;
    HANDLE_ERROR(nvpltensorSetNumThreads(handle, numThreads));

    /**********************
     * Create Tensor Descriptors
     **********************/

    // Each descriptor: mode count, extents, strides (NULL = dense default),
    // element type, and guaranteed alignment of the data pointer in bytes.
    nvpltensorTensorDescriptor_t descA;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descA,
                 nmodeA, extentA, NULL /*stride*/,
                 typeA, kAlignment));

    nvpltensorTensorDescriptor_t descB;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descB,
                 nmodeB, extentB, NULL /*stride*/,
                 typeB, kAlignment));

    nvpltensorTensorDescriptor_t descC;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descC,
                 nmodeC, extentC, NULL /*stride*/,
                 typeC, kAlignment));

    printf("Initialize nvplTENSOR and tensor descriptors\n");

    /**********************
     * Free allocated data
     **********************/

    HANDLE_ERROR(nvpltensorDestroy(handle));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descA));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descB));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descC));

    free(A);
    free(B);
    free(C);

    return 0;
}

Create Contraction Descriptor

In this step we create the operation descriptor that encodes the contraction and ensure that the scalar type is correct:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <nvpl_tensor.h>

// Handle nvplTENSOR errors: print the error string and abort.
// Wrapped in do { } while (0) so the macro expands to a single statement
// (safe inside unbraced if/else) and requires the usual trailing ';'.
#define HANDLE_ERROR(x)                                           \
    do                                                            \
    {                                                             \
        const nvpltensorStatus_t err = (x);                       \
        if (err != NVPLTENSOR_STATUS_SUCCESS)                     \
        {                                                         \
            printf("Error: %s\n", nvpltensorGetErrorString(err)); \
            exit(-1);                                             \
        }                                                         \
    } while (0)

int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    // Modes label the axes of C[0,2,1,3] = alpha*A[0,4,5,1]*B[2,5,3,4] + beta*C
    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    // extent[m] is the extent of mode m: modes 0..2 -> 6, modes 3..5 -> 4
    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    // Per-tensor extents, in the order given by each tensor's mode list
    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor: the product of that tensor's own
    // extents. Iterating over extentA/extentB/extentC (not the global
    // extent[] array) is essential, otherwise the counts are wrong.
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128;  // Alignment of the pointers (bytes)

    // Allocate aligned host memory. Note that aligned_alloc requires the
    // size to be a multiple of the alignment; the sizes above satisfy this.
    floatTypeA* A = aligned_alloc(kAlignment, sizeA);
    floatTypeB* B = aligned_alloc(kAlignment, sizeB);
    floatTypeC* C = aligned_alloc(kAlignment, sizeC);

    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        // free(NULL) is a no-op, so releasing all three is safe here
        free(A);
        free(B);
        free(C);
        return -1;
    }

    // Initialize data with uniform pseudo-random values in [-50, 50)
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t)A % kAlignment == 0);
    assert((uintptr_t)B % kAlignment == 0);
    assert((uintptr_t)C % kAlignment == 0);

    printf("Allocate and initialize\n");

    /*************************
     * nvplTENSOR
     *************************/

    nvpltensorHandle_t handle;
    HANDLE_ERROR(nvpltensorCreate(&handle));

    /**********************
     * Set number of threads, that nvplTensor can use
     **********************/

    uint32_t const numThreads = 4;
    HANDLE_ERROR(nvpltensorSetNumThreads(handle, numThreads));

    /**********************
     * Create Tensor Descriptors
     **********************/

    // Each descriptor: mode count, extents, strides (NULL = dense default),
    // element type, and guaranteed alignment of the data pointer in bytes.
    nvpltensorTensorDescriptor_t descA;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descA,
                 nmodeA, extentA, NULL /*stride*/,
                 typeA, kAlignment));

    nvpltensorTensorDescriptor_t descB;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descB,
                 nmodeB, extentB, NULL /*stride*/,
                 typeB, kAlignment));

    nvpltensorTensorDescriptor_t descC;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descC,
                 nmodeC, extentC, NULL /*stride*/,
                 typeC, kAlignment));

    printf("Initialize nvplTENSOR and tensor descriptors\n");

    /*******************************
     * Create Contraction Descriptor
     *******************************/

    nvpltensorOperationDescriptor_t desc;
    HANDLE_ERROR(nvpltensorCreateContraction(handle, &desc,
                 descA, modeA, /* unary operator A*/ NVPLTENSOR_OP_IDENTITY,
                 descB, modeB, /* unary operator B*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, /* unary operator C*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, descCompute));

    printf("Initialize operation descriptor\n");

    /*****************************
     * Optional (but recommended): ensure that the scalar type is correct.
     *****************************/

    nvpltensorDataType_t scalarType;
    HANDLE_ERROR(nvpltensorOperationDescriptorGetAttribute(handle, desc,
                 NVPLTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
                 (void*) &scalarType, sizeof(scalarType)));

    assert(scalarType == NVPLTENSOR_R_32F);
    // floatTypeCompute is already defined at the top of main; no need to
    // redeclare the typedef here.
    floatTypeCompute alpha = (floatTypeCompute) 1.1f;
    floatTypeCompute beta = (floatTypeCompute) 0.f;

    printf("Check scalar type required for operation\n");

    /**********************
     * Free allocated data
     **********************/

    HANDLE_ERROR(nvpltensorDestroy(handle));
    HANDLE_ERROR(nvpltensorDestroyOperationDescriptor(desc));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descA));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descB));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descC));

    free(A);
    free(B);
    free(C);

    return 0;
}

Determine Algorithm and Workspace

Now that we have defined both the tensors and the contraction that we want to perform, we must select an algorithm to perform the contraction. That algorithm is specified by nvpltensorAlgo_t. Specifying NVPLTENSOR_ALGO_DEFAULT allows us to let nvplTENSOR’s internal heuristic choose the best approach. All the information to find a good algorithm is stored in the nvpltensorPlanPreference_t data structure. We can also query the library to estimate the amount of workspace required for the provided operation descriptor; users can choose between different nvpltensorWorksizePreference_t values. For this example we are using NVPLTENSOR_WORKSPACE_DEFAULT, a good default choice that aims to attain high performance while also reducing the workspace requirement. While workspace memory is not strictly required, it is strongly recommended in order to attain high performance.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <nvpl_tensor.h>

// Handle nvplTENSOR errors: print the error string and abort.
// Wrapped in do { } while (0) so the macro expands to a single statement
// (safe inside unbraced if/else) and requires the usual trailing ';'.
#define HANDLE_ERROR(x)                                           \
    do                                                            \
    {                                                             \
        const nvpltensorStatus_t err = (x);                       \
        if (err != NVPLTENSOR_STATUS_SUCCESS)                     \
        {                                                         \
            printf("Error: %s\n", nvpltensorGetErrorString(err)); \
            exit(-1);                                             \
        }                                                         \
    } while (0)

int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    // Modes label the axes of C[0,2,1,3] = alpha*A[0,4,5,1]*B[2,5,3,4] + beta*C
    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    // extent[m] is the extent of mode m: modes 0..2 -> 6, modes 3..5 -> 4
    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    // Per-tensor extents, in the order given by each tensor's mode list
    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor: the product of that tensor's own
    // extents. Iterating over extentA/extentB/extentC (not the global
    // extent[] array) is essential, otherwise the counts are wrong.
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128;  // Alignment of the pointers (bytes)

    // Allocate aligned host memory. Note that aligned_alloc requires the
    // size to be a multiple of the alignment; the sizes above satisfy this.
    floatTypeA* A = aligned_alloc(kAlignment, sizeA);
    floatTypeB* B = aligned_alloc(kAlignment, sizeB);
    floatTypeC* C = aligned_alloc(kAlignment, sizeC);

    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        // free(NULL) is a no-op, so releasing all three is safe here
        free(A);
        free(B);
        free(C);
        return -1;
    }

    // Initialize data with uniform pseudo-random values in [-50, 50)
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t)A % kAlignment == 0);
    assert((uintptr_t)B % kAlignment == 0);
    assert((uintptr_t)C % kAlignment == 0);

    printf("Allocate and initialize\n");

    /*************************
     * nvplTENSOR
     *************************/

    nvpltensorHandle_t handle;
    HANDLE_ERROR(nvpltensorCreate(&handle));

    /**********************
     * Set number of threads, that nvplTensor can use
     **********************/

    uint32_t const numThreads = 4;
    HANDLE_ERROR(nvpltensorSetNumThreads(handle, numThreads));

    /**********************
     * Create Tensor Descriptors
     **********************/

    // Each descriptor: mode count, extents, strides (NULL = dense default),
    // element type, and guaranteed alignment of the data pointer in bytes.
    nvpltensorTensorDescriptor_t descA;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descA,
                 nmodeA, extentA, NULL /*stride*/,
                 typeA, kAlignment));

    nvpltensorTensorDescriptor_t descB;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descB,
                 nmodeB, extentB, NULL /*stride*/,
                 typeB, kAlignment));

    nvpltensorTensorDescriptor_t descC;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descC,
                 nmodeC, extentC, NULL /*stride*/,
                 typeC, kAlignment));

    printf("Initialize nvplTENSOR and tensor descriptors\n");

    /*******************************
     * Create Contraction Descriptor
     *******************************/

    nvpltensorOperationDescriptor_t desc;
    HANDLE_ERROR(nvpltensorCreateContraction(handle, &desc,
                 descA, modeA, /* unary operator A*/ NVPLTENSOR_OP_IDENTITY,
                 descB, modeB, /* unary operator B*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, /* unary operator C*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, descCompute));

    printf("Initialize operation descriptor\n");

    /*****************************
     * Optional (but recommended): ensure that the scalar type is correct.
     *****************************/

    nvpltensorDataType_t scalarType;
    HANDLE_ERROR(nvpltensorOperationDescriptorGetAttribute(handle, desc,
                 NVPLTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
                 (void*) &scalarType, sizeof(scalarType)));

    assert(scalarType == NVPLTENSOR_R_32F);
    // floatTypeCompute is already defined at the top of main; no need to
    // redeclare the typedef here.
    floatTypeCompute alpha = (floatTypeCompute) 1.1f;
    floatTypeCompute beta = (floatTypeCompute) 0.f;

    printf("Check scalar type required for operation\n");

    /**************************
     * Set the algorithm to use
     ***************************/

    nvpltensorAlgo_t const algo = NVPLTENSOR_ALGO_DEFAULT;

    nvpltensorPlanPreference_t planPref;
    HANDLE_ERROR(nvpltensorCreatePlanPreference(handle, &planPref,
                 algo, NVPLTENSOR_JIT_MODE_NONE));

    printf("Initialize plan preference\n");

    /**********************
     * Query workspace estimate
     **********************/

    uint64_t workspaceSizeEstimate = 0;
    nvpltensorWorksizePreference_t const workspacePref = NVPLTENSOR_WORKSPACE_DEFAULT;
    HANDLE_ERROR(nvpltensorEstimateWorkspaceSize(handle, desc,
                 planPref, workspacePref, &workspaceSizeEstimate));

    printf("Estimate workspace required for operation\n");

    /**********************
     * Free allocated data
     **********************/

    HANDLE_ERROR(nvpltensorDestroy(handle));
    HANDLE_ERROR(nvpltensorDestroyPlanPreference(planPref));
    HANDLE_ERROR(nvpltensorDestroyOperationDescriptor(desc));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descA));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descB));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descC));

    free(A);
    free(B);
    free(C);

    return 0;
}

Notice that we’re not using just-in-time compilation in this example (NVPLTENSOR_JIT_MODE_NONE); if you want to learn more about nvplTENSOR’s JIT capabilities, please refer to the JIT Compilation section.

Plan and reduce workspace

With an estimation of the workspace size at hand we can now go on to create the actual plan; this step relies on nvplTENSOR’s heuristics to select the most suitable algorithm and kernel:

nvpltensorPlan_t plan;
HANDLE_ERROR(nvpltensorCreatePlan(handle,
             &plan,
             desc,
             planPref,
             workspaceSizeEstimate));

Once the plan is created and the kernel is selected, we can (optionally) query the workspace that is actually required by the plan:

uint64_t actualWorkspaceSize = 0;
HANDLE_ERROR(nvpltensorPlanGetAttribute(handle,
    plan,
    NVPLTENSOR_PLAN_REQUIRED_WORKSPACE,
    &actualWorkspaceSize,
    sizeof(actualWorkspaceSize)));

With that our code becomes:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <nvpl_tensor.h>

// Handle nvplTENSOR errors: print the error string and abort.
// Wrapped in do { } while (0) so the macro expands to a single statement
// (safe inside unbraced if/else) and requires the usual trailing ';'.
#define HANDLE_ERROR(x)                                           \
    do                                                            \
    {                                                             \
        const nvpltensorStatus_t err = (x);                       \
        if (err != NVPLTENSOR_STATUS_SUCCESS)                     \
        {                                                         \
            printf("Error: %s\n", nvpltensorGetErrorString(err)); \
            exit(-1);                                             \
        }                                                         \
    } while (0)

int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    // Modes label the axes of C[0,2,1,3] = alpha*A[0,4,5,1]*B[2,5,3,4] + beta*C
    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    // extent[m] is the extent of mode m: modes 0..2 -> 6, modes 3..5 -> 4
    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    // Per-tensor extents, in the order given by each tensor's mode list
    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor: the product of that tensor's own
    // extents. Iterating over extentA/extentB/extentC (not the global
    // extent[] array) is essential, otherwise the counts are wrong.
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128;  // Alignment of the pointers (bytes)

    // Allocate aligned host memory. Note that aligned_alloc requires the
    // size to be a multiple of the alignment; the sizes above satisfy this.
    floatTypeA* A = aligned_alloc(kAlignment, sizeA);
    floatTypeB* B = aligned_alloc(kAlignment, sizeB);
    floatTypeC* C = aligned_alloc(kAlignment, sizeC);

    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        // free(NULL) is a no-op, so releasing all three is safe here
        free(A);
        free(B);
        free(C);
        return -1;
    }

    // Initialize data with uniform pseudo-random values in [-50, 50)
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t)A % kAlignment == 0);
    assert((uintptr_t)B % kAlignment == 0);
    assert((uintptr_t)C % kAlignment == 0);

    printf("Allocate and initialize\n");

    /*************************
     * nvplTENSOR
     *************************/

    nvpltensorHandle_t handle;
    HANDLE_ERROR(nvpltensorCreate(&handle));

    /**********************
     * Set number of threads, that nvplTensor can use
     **********************/

    uint32_t const numThreads = 4;
    HANDLE_ERROR(nvpltensorSetNumThreads(handle, numThreads));

    /**********************
     * Create Tensor Descriptors
     **********************/

    // Each descriptor: mode count, extents, strides (NULL = dense default),
    // element type, and guaranteed alignment of the data pointer in bytes.
    nvpltensorTensorDescriptor_t descA;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descA,
                 nmodeA, extentA, NULL /*stride*/,
                 typeA, kAlignment));

    nvpltensorTensorDescriptor_t descB;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descB,
                 nmodeB, extentB, NULL /*stride*/,
                 typeB, kAlignment));

    nvpltensorTensorDescriptor_t descC;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descC,
                 nmodeC, extentC, NULL /*stride*/,
                 typeC, kAlignment));

    printf("Initialize nvplTENSOR and tensor descriptors\n");

    /*******************************
     * Create Contraction Descriptor
     *******************************/

    nvpltensorOperationDescriptor_t desc;
    HANDLE_ERROR(nvpltensorCreateContraction(handle, &desc,
                 descA, modeA, /* unary operator A*/ NVPLTENSOR_OP_IDENTITY,
                 descB, modeB, /* unary operator B*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, /* unary operator C*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, descCompute));

    printf("Initialize operation descriptor\n");

    /*****************************
     * Optional (but recommended): ensure that the scalar type is correct.
     *****************************/

    nvpltensorDataType_t scalarType;
    HANDLE_ERROR(nvpltensorOperationDescriptorGetAttribute(handle, desc,
                 NVPLTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
                 (void*) &scalarType, sizeof(scalarType)));

    assert(scalarType == NVPLTENSOR_R_32F);
    // floatTypeCompute is already defined at the top of main; no need to
    // redeclare the typedef here.
    floatTypeCompute alpha = (floatTypeCompute) 1.1f;
    floatTypeCompute beta = (floatTypeCompute) 0.f;

    printf("Check scalar type required for operation\n");

    /**************************
     * Set the algorithm to use
     ***************************/

    nvpltensorAlgo_t const algo = NVPLTENSOR_ALGO_DEFAULT;

    nvpltensorPlanPreference_t planPref;
    HANDLE_ERROR(nvpltensorCreatePlanPreference(handle, &planPref,
                 algo, NVPLTENSOR_JIT_MODE_NONE));

    printf("Initialize plan preference\n");

    /**********************
     * Query workspace estimate
     **********************/

    uint64_t workspaceSizeEstimate = 0;
    nvpltensorWorksizePreference_t const workspacePref = NVPLTENSOR_WORKSPACE_DEFAULT;
    HANDLE_ERROR(nvpltensorEstimateWorkspaceSize(handle, desc,
                 planPref, workspacePref, &workspaceSizeEstimate));

    printf("Estimate workspace required for operation\n");

    /**************************
     * Create Contraction Plan
     **************************/

    nvpltensorPlan_t plan;
    HANDLE_ERROR(nvpltensorCreatePlan(handle, &plan,
                 desc, planPref, workspaceSizeEstimate));

    printf("Initialize plan\n");

    /**************************
     * Optional: Query information about the created plan
     **************************/

    // query actually used workspace
    uint64_t actualWorkspaceSize = 0;
    HANDLE_ERROR(nvpltensorPlanGetAttribute(handle, plan,
                 NVPLTENSOR_PLAN_REQUIRED_WORKSPACE,
                 &actualWorkspaceSize, sizeof(actualWorkspaceSize)));

    // At this point the user knows exactly how much memory is need by the operation and
    // only the smaller actual workspace needs to be allocated
    assert(actualWorkspaceSize <= workspaceSizeEstimate);
    actualWorkspaceSize += 256;  // small safety margin kept from the original sample

    printf("Query information about the created plan\n");

    void* work = NULL;
    if (actualWorkspaceSize > 0)
    {
        // aligned_alloc requires the size to be a multiple of the alignment,
        // so round the workspace size up to the next multiple of kAlignment.
        uint64_t const roundedSize =
            (actualWorkspaceSize + kAlignment - 1) / kAlignment * kAlignment;
        work = aligned_alloc(kAlignment, roundedSize);
        // NOTE(review): work is not used before being freed in this step of
        // the tutorial, so a NULL result is harmless here; a later step that
        // executes the contraction should check it.
    }

    /**********************
     * Free allocated data
     **********************/

    HANDLE_ERROR(nvpltensorDestroy(handle));
    HANDLE_ERROR(nvpltensorDestroyPlan(plan));
    HANDLE_ERROR(nvpltensorDestroyPlanPreference(planPref));
    HANDLE_ERROR(nvpltensorDestroyOperationDescriptor(desc));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descA));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descB));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descC));

    free(A);
    free(B);
    free(C);
    free(work);

    return 0;
}

Execute

Finally, we are ready to execute the tensor contraction and destroy (free) all the allocated resources:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include <nvpl_tensor.h>

// Handle nvplTENSOR errors
// Handle nvplTENSOR errors: evaluate x once, print the library's error
// string and terminate on any non-success status.
// Wrapped in do { ... } while (0) so the macro expands to a single
// statement and composes safely with if/else (the original bare
// `{ ... };` form injects a stray `;` and breaks un-braced if/else).
#define HANDLE_ERROR(x)                                           \
    do                                                            \
    {                                                             \
        const nvpltensorStatus_t err = (x);                       \
        if (err != NVPLTENSOR_STATUS_SUCCESS)                     \
        {                                                         \
            printf("Error: %s\n", nvpltensorGetErrorString(err)); \
            exit(-1);                                             \
        }                                                         \
    } while (0)

/*
 * Compute C_{0,2,1,3} = alpha * A_{0,4,5,1} * B_{2,5,3,4} + beta * C_{0,2,1,3}
 * in single precision using nvplTENSOR.
 *
 * Returns 0 on success, -1 on allocation failure; HANDLE_ERROR exits
 * on any nvplTENSOR error.
 */
int main(int argc, char** argv)
{
    // Host element type definition
    typedef float floatTypeA;
    typedef float floatTypeB;
    typedef float floatTypeC;
    typedef float floatTypeCompute;

    // nvplTENSOR types
    nvpltensorDataType_t typeA = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeB = NVPLTENSOR_R_32F;
    nvpltensorDataType_t typeC = NVPLTENSOR_R_32F;
    nvpltensorComputeDescriptor_t descCompute = NVPLTENSOR_COMPUTE_DESC_32F;

    printf("Include headers and define data types\n");

    /* ***************************** */

    // Modes are labeled by small integers; labels shared by A and B but
    // absent from C (here 4 and 5) are contracted over.
    int32_t modeC[] = {0, 2, 1, 3};
    int32_t modeA[] = {0, 4, 5, 1};
    int32_t modeB[] = {2, 5, 3, 4};
    int const nmodeA = 4;
    int const nmodeB = 4;
    int const nmodeC = 4;

    // extent[m] is the extent of mode label m: modes 0-2 have extent 6,
    // modes 3-5 have extent 4.
    int64_t extent[] = {6, 6, 6, 4, 4, 4};

    // Gather each tensor's extents in its own mode order.
    int64_t extentC[nmodeC];
    for (int i = 0; i < nmodeC; ++i)
    {
        extentC[i] = extent[modeC[i]];
    }
    int64_t extentA[nmodeA];
    for (int i = 0; i < nmodeA; ++i)
    {
        extentA[i] = extent[modeA[i]];
    }
    int64_t extentB[nmodeB];
    for (int i = 0; i < nmodeB; ++i)
    {
        extentB[i] = extent[modeB[i]];
    }

    printf("Define modes and extents\n");

    /* ***************************** */

    // Number of elements of each tensor: the product of that tensor's own
    // extents. (Indexing the global `extent` array here would mix up mode
    // labels and positions and yield the wrong element counts.)
    int64_t elementsA = 1;
    for (int i = 0; i < nmodeA; ++i)
    {
        elementsA *= extentA[i];
    }
    int64_t elementsB = 1;
    for (int i = 0; i < nmodeB; ++i)
    {
        elementsB *= extentB[i];
    }
    int64_t elementsC = 1;
    for (int i = 0; i < nmodeC; ++i)
    {
        elementsC *= extentC[i];
    }

    // Size in bytes
    int64_t sizeA = sizeof(floatTypeA) * elementsA;
    int64_t sizeB = sizeof(floatTypeB) * elementsB;
    int64_t sizeC = sizeof(floatTypeC) * elementsC;

    uint32_t const kAlignment = 128;  // Alignment of the pointers (bytes)

    // Allocate host buffers. Note: aligned_alloc requires the size to be a
    // multiple of the alignment; the sizes above satisfy that here.
    floatTypeA* A = aligned_alloc(kAlignment, sizeA);
    floatTypeB* B = aligned_alloc(kAlignment, sizeB);
    floatTypeC* C = aligned_alloc(kAlignment, sizeC);

    if (A == NULL || B == NULL || C == NULL)
    {
        printf("Error: allocation of tensor memory.\n");
        return -1;
    }

    // Initialize data with uniform pseudo-random values in [-50, 50).
    for (int64_t i = 0; i < elementsA; i++)
        A[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsB; i++)
        B[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;
    for (int64_t i = 0; i < elementsC; i++)
        C[i] = (((float) rand()) / RAND_MAX - 0.5) * 100;

    assert((uintptr_t)A % kAlignment == 0);
    assert((uintptr_t)B % kAlignment == 0);
    assert((uintptr_t)C % kAlignment == 0);

    printf("Allocate and initialize\n");

    /*************************
     * nvplTENSOR
     *************************/

    nvpltensorHandle_t handle;
    HANDLE_ERROR(nvpltensorCreate(&handle));

    /**********************
     * Set the number of threads that nvplTENSOR may use
     **********************/

    uint32_t const numThreads = 4;
    HANDLE_ERROR(nvpltensorSetNumThreads(handle, numThreads));

    /**********************
     * Create Tensor Descriptors
     **********************/

    // NULL strides mean a dense, column-major-packed layout.
    nvpltensorTensorDescriptor_t descA;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descA,
                 nmodeA, extentA, NULL /*stride*/,
                 typeA, kAlignment));

    nvpltensorTensorDescriptor_t descB;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descB,
                 nmodeB, extentB, NULL /*stride*/,
                 typeB, kAlignment));

    nvpltensorTensorDescriptor_t descC;
    HANDLE_ERROR(nvpltensorCreateTensorDescriptor(handle, &descC,
                 nmodeC, extentC, NULL /*stride*/,
                 typeC, kAlignment));

    printf("Initialize nvplTENSOR and tensor descriptors\n");

    /*******************************
     * Create Contraction Descriptor
     *******************************/

    // descC/modeC appear twice: once as the additive input C and once as
    // the output D (C = alpha*A*B + beta*C is performed in place).
    nvpltensorOperationDescriptor_t desc;
    HANDLE_ERROR(nvpltensorCreateContraction(handle, &desc,
                 descA, modeA, /* unary operator A*/ NVPLTENSOR_OP_IDENTITY,
                 descB, modeB, /* unary operator B*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, /* unary operator C*/ NVPLTENSOR_OP_IDENTITY,
                 descC, modeC, descCompute));

    printf("Initialize operation descriptor\n");

    /*****************************
     * Optional (but recommended): ensure that the scalar type is correct.
     *****************************/

    nvpltensorDataType_t scalarType;
    HANDLE_ERROR(nvpltensorOperationDescriptorGetAttribute(handle, desc,
                 NVPLTENSOR_OPERATION_DESCRIPTOR_SCALAR_TYPE,
                 (void*) &scalarType, sizeof(scalarType)));

    assert(scalarType == NVPLTENSOR_R_32F);
    floatTypeCompute alpha = (floatTypeCompute) 1.1f;
    floatTypeCompute beta = (floatTypeCompute) 0.f;

    printf("Check scalar type required for operation\n");

    /**************************
     * Set the algorithm to use
     ***************************/

    nvpltensorAlgo_t const algo = NVPLTENSOR_ALGO_DEFAULT;

    nvpltensorPlanPreference_t planPref;
    HANDLE_ERROR(nvpltensorCreatePlanPreference(handle, &planPref,
                 algo, NVPLTENSOR_JIT_MODE_NONE));

    printf("Initialize plan preference\n");

    /**********************
     * Query workspace estimate
     **********************/

    uint64_t workspaceSizeEstimate = 0;
    nvpltensorWorksizePreference_t const workspacePref = NVPLTENSOR_WORKSPACE_DEFAULT;
    HANDLE_ERROR(nvpltensorEstimateWorkspaceSize(handle, desc,
                 planPref, workspacePref, &workspaceSizeEstimate));

    printf("Estimate workspace required for operation\n");

    /**************************
     * Create Contraction Plan
     **************************/

    nvpltensorPlan_t plan;
    HANDLE_ERROR(nvpltensorCreatePlan(handle, &plan,
                 desc, planPref, workspaceSizeEstimate));

    printf("Initialize plan\n");

    /**************************
     * Optional: Query information about the created plan
     **************************/

    // query actually used workspace
    uint64_t actualWorkspaceSize = 0;
    HANDLE_ERROR(nvpltensorPlanGetAttribute(handle, plan,
                 NVPLTENSOR_PLAN_REQUIRED_WORKSPACE,
                 &actualWorkspaceSize, sizeof(actualWorkspaceSize)));

    // At this point the user knows exactly how much memory is needed by the
    // operation and only the smaller actual workspace needs to be allocated
    assert(actualWorkspaceSize <= workspaceSizeEstimate);
    actualWorkspaceSize += 256;  // small slack on top of the reported requirement

    printf("Query information about the created plan\n");

    void* work = NULL;
    if (actualWorkspaceSize > 0)
    {
        // aligned_alloc requires the size to be a multiple of the alignment
        // (C11 7.22.3.1), so round up before allocating.
        actualWorkspaceSize =
            (actualWorkspaceSize + kAlignment - 1) / kAlignment * kAlignment;
        work = aligned_alloc(kAlignment, actualWorkspaceSize);
        if (work == NULL)
        {
            printf("Error: allocation of workspace memory.\n");
            return -1;
        }
    }

    /**********************
     * Execute
     **********************/

    HANDLE_ERROR(nvpltensorContract(handle, plan,
                 (void*) &alpha, A, B, (void*) &beta, C, C,
                 work, actualWorkspaceSize));

    printf("Perform operation\n");

    /**********************
     * Free allocated data
     **********************/

    HANDLE_ERROR(nvpltensorDestroy(handle));
    HANDLE_ERROR(nvpltensorDestroyPlan(plan));
    HANDLE_ERROR(nvpltensorDestroyPlanPreference(planPref));
    HANDLE_ERROR(nvpltensorDestroyOperationDescriptor(desc));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descA));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descB));
    HANDLE_ERROR(nvpltensorDestroyTensorDescriptor(descC));

    // free(NULL) is a no-op, so no guards are needed.
    free(A);
    free(B);
    free(C);
    free(work);

    return 0;
}

That is it. We have executed our first contraction via nvplTENSOR! You can find this and other examples in the example directory.