cuBLASMp C API#
Library Management#
cublasMpCreate
#
cublasMpStatus_t cublasMpCreate(
cublasMpHandle_t *handle,
cudaStream_t stream);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
Out |
cuBLASMp library handle. |
stream |
Host |
In |
Stream that will be assigned to the handle. |
cublasMpDestroy
#
cublasMpStatus_t cublasMpDestroy(
cublasMpHandle_t handle);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In/Out |
cuBLASMp library handle to destroy. |
cublasMpStreamSet
#
cublasMpStatus_t cublasMpStreamSet(
cublasMpHandle_t handle,
cudaStream_t stream);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
stream |
Host |
In |
CUDA stream pointer to set. |
cublasMpStreamGet
#
cublasMpStatus_t cublasMpStreamGet(
cublasMpHandle_t handle,
cudaStream_t* stream);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
stream |
Host |
Out |
Pointer that receives the CUDA stream currently assigned to the handle. |
cublasMpGetVersion
#
cublasMpStatus_t cublasMpGetVersion(
int *version);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
version |
Host |
Out |
cuBLASMp library version. Value is CUBLASMP_VER_MAJOR * 1000 + CUBLASMP_VER_MINOR * 100 + CUBLASMP_VER_PATCH. |
Grid Management#
cublasMpGridCreate
#
cublasMpStatus_t cublasMpGridCreate(
int64_t nprow,
int64_t npcol,
cublasMpGridLayout_t layout,
cal_comm_t comm,
cublasMpGrid_t* grid);
Note
cuBLASMp will initialize NVSHMEM as the first grid is created, hence the user should ensure that it uses a communicator that contains all the required ranks.
If NVSHMEM was previously initialized by the user in their application, the first cuBLASMp grid should be created using the same set of ranks.
cuBLASMp will call nvshmem_finalize() as part of the cublasMpGridDestroy() call that destroys the last remaining grid.
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
nprow |
Host |
In |
How many row processes the grid contains. |
npcol |
Host |
In |
How many column processes the grid contains. |
layout |
Host |
In |
Grid’s layout (cublasMpGridLayout_t). |
comm |
Host |
In |
Communicator associated with the grid. |
grid |
Host |
In/Out |
Pointer to a grid object. |
cublasMpGridDestroy
#
cublasMpStatus_t cublasMpGridDestroy(
cublasMpGrid_t grid);
Destroys the grid object.
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
grid |
Host |
In/Out |
Grid object to destroy. |
Matrix Management#
cublasMpMatrixDescriptorCreate
#
cublasMpStatus_t cublasMpMatrixDescriptorCreate(
int64_t m,
int64_t n,
int64_t mb,
int64_t nb,
int64_t rsrc,
int64_t csrc,
int64_t lld,
cudaDataType_t type,
cublasMpGrid_t grid,
cublasMpMatrixDescriptor_t* desc);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
m |
Host |
In |
Number of rows in the global matrix. |
n |
Host |
In |
Number of columns in the global matrix. |
mb |
Host |
In |
Blocking factor used to distribute the rows of the global matrix. |
nb |
Host |
In |
Blocking factor used to distribute the columns of the global matrix. |
rsrc |
Host |
In |
Row rank of the process who owns the first row block of the global matrix. |
csrc |
Host |
In |
Column rank of the process who owns the first column block of the global matrix. |
lld |
Host |
In |
Leading dimension of the local matrix. |
type |
Host |
In |
Data type of the matrix. |
grid |
Host |
In |
Grid object associated with the matrix descriptor. |
desc |
Host |
Out |
Matrix descriptor object initialized by this function. |
The supported values for the type argument are listed below:
Data Type |
Description |
---|---|
CUDA_R_8I |
8-bit real signed integer. |
CUDA_R_32I |
32-bit real signed integer. |
CUDA_R_8F_E4M3 |
8-bit real floating point in E4M3 format. |
CUDA_R_8F_E5M2 |
8-bit real floating point in E5M2 format. |
CUDA_R_16F |
16-bit real half precision floating-point. |
CUDA_R_16BF |
16-bit real bfloat16 floating-point. |
CUDA_R_32F |
32-bit real single precision floating-point. |
CUDA_R_64F |
64-bit real double precision floating-point. |
CUDA_C_32F |
64-bit structure comprised of two single precision floating-points representing a complex number. |
CUDA_C_64F |
128-bit structure comprised of two double precision floating-points representing a complex number. |
cublasMpMatrixDescriptorDestroy
#
cublasMpStatus_t cublasMpMatrixDescriptorDestroy(
cublasMpMatrixDescriptor_t desc);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
desc |
Host |
In/Out |
Matrix descriptor object to destroy. |
cublasMpMatrixDescriptorInit
#
cublasMpStatus_t cublasMpMatrixDescriptorInit(
int64_t m,
int64_t n,
int64_t mb,
int64_t nb,
int64_t rsrc,
int64_t csrc,
int64_t lld,
cudaDataType_t type,
cublasMpGrid_t grid,
cublasMpMatrixDescriptor_t desc);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
m |
Host |
In |
Number of rows in the global matrix. |
n |
Host |
In |
Number of columns in the global matrix. |
mb |
Host |
In |
Blocking factor used to distribute the rows of the global matrix. |
nb |
Host |
In |
Blocking factor used to distribute the columns of the global matrix. |
rsrc |
Host |
In |
Row rank of the process who owns the first row block of the global matrix. |
csrc |
Host |
In |
Column rank of the process who owns the first column block of the global matrix. |
lld |
Host |
In |
Leading dimension of the local matrix. |
type |
Host |
In |
Data type of the matrix. |
grid |
Host |
In |
Grid object associated with the matrix descriptor. |
desc |
Host |
Out |
Matrix descriptor object initialized by this function. |
The supported values for the type argument are listed below:
Data Type |
Description |
---|---|
CUDA_R_8I |
8-bit real signed integer. |
CUDA_R_32I |
32-bit real signed integer. |
CUDA_R_8F_E4M3 |
8-bit real floating point in E4M3 format. |
CUDA_R_8F_E5M2 |
8-bit real floating point in E5M2 format. |
CUDA_R_16F |
16-bit real half precision floating-point. |
CUDA_R_16BF |
16-bit real bfloat16 floating-point. |
CUDA_R_32F |
32-bit real single precision floating-point. |
CUDA_R_64F |
64-bit real double precision floating-point. |
CUDA_C_32F |
64-bit structure comprised of two single precision floating-points representing a complex number. |
CUDA_C_64F |
128-bit structure comprised of two double precision floating-points representing a complex number. |
Matmul Properties#
cublasMpMatmulDescriptorCreate
#
cublasMpStatus_t cublasMpMatmulDescriptorCreate(
cublasMpMatmulDescriptor_t* matmulDesc,
cublasComputeType_t computeType);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
matmulDesc |
Host |
In/Out |
Pointer to a cublasMpMatmulDescriptor object to initialize. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
The supported values for the computeType argument are listed below:
Compute Types |
---|
CUBLAS_COMPUTE_32I |
CUBLAS_COMPUTE_32I_PEDANTIC |
CUBLAS_COMPUTE_16F |
CUBLAS_COMPUTE_16F_PEDANTIC |
CUBLAS_COMPUTE_32F |
CUBLAS_COMPUTE_32F_PEDANTIC |
CUBLAS_COMPUTE_32F_FAST_16F |
CUBLAS_COMPUTE_32F_FAST_16BF |
CUBLAS_COMPUTE_32F_FAST_TF32 |
CUBLAS_COMPUTE_64F |
CUBLAS_COMPUTE_64F_PEDANTIC |
cublasMpMatmulDescriptorDestroy
#
cublasMpStatus_t cublasMpMatmulDescriptorDestroy(
cublasMpMatmulDescriptor_t matmulDesc);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
matmulDesc |
Host |
In/Out |
Matmul descriptor object to destroy. |
cublasMpMatmulDescriptorAttributeSet
#
cublasMpStatus_t cublasMpMatmulDescriptorAttributeSet(
cublasMpMatmulDescriptor_t matmulDesc,
cublasMpMatmulDescriptorAttribute_t attr,
const void* buf,
size_t sizeInBytes);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
matmulDesc |
Host |
In |
Matmul descriptor object to set its attribute. |
attr |
Host |
In |
Matmul descriptor attribute to set. |
buf |
Host |
In |
Attribute value to set. |
sizeInBytes |
Host |
In |
Attribute buffer size in bytes. |
cublasMpMatmulDescriptorAttributeGet
#
cublasMpStatus_t cublasMpMatmulDescriptorAttributeGet(
cublasMpMatmulDescriptor_t matmulDesc,
cublasMpMatmulDescriptorAttribute_t attr,
const void* buf,
size_t sizeInBytes,
size_t* sizeWritten);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
matmulDesc |
Host |
In |
Matmul descriptor object whose attribute is queried. |
attr |
Host |
In |
Matmul descriptor attribute to retrieve. |
buf |
Host |
Out |
Buffer that receives the attribute value. |
sizeInBytes |
Host |
In |
Attribute buffer size in bytes. |
sizeWritten |
Host |
Out |
Size of the attribute written into buf. |
Utility#
cublasMpNumroc
#
int64_t cublasMpNumroc(
int64_t n,
int64_t nb,
uint32_t iproc,
uint32_t isrcproc,
uint32_t nprocs);
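Computes the number of rows or columns of the distributed matrix owned by the process indicated by the iproc argument.
Parameter |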
Memory |
In/Out |
Description |
---|---|---|---|
n |
Host |
In |
Number of rows or columns in the global distributed matrix. |
nb |
Host |
In |
Row or column blocking size of the global matrix. |
iproc |
Host |
In |
The coordinate of the process whose local array row or column is to be determined. |
isrcproc |
Host |
In |
The coordinate of the process that owns the first row or column of the distributed matrix. |
nprocs |
Host |
In |
The total number of row or column processes over which the matrix is distributed. |
cublasMpGemr2D
#
cublasMpStatus_t cublasMpGemr2D(
cublasMpHandle_t handle,
int64_t m,
int64_t n,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost,
cal_comm_t global_comm);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
m |
Host |
In |
Number of rows of sub(A) and sub(B). |
n |
Host |
In |
Number of columns of sub(A) and sub(B). |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. descA’s grid value must be set to null in processes that are not part of the grid of A. |
b |
Device |
Out |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. descB’s grid value must be set to null in processes that are not part of the grid of B. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpGemr2D_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpGemr2D_bufferSize(). |
global_comm |
Host |
In |
A communicator containing at least the union of all processes in the communicators of A and B. All processes in the communicator must call this function, even if they do not own a piece of either matrix. |
cublasMpGemr2D_bufferSize
#
cublasMpStatus_t cublasMpGemr2D_bufferSize(
cublasMpHandle_t handle,
int64_t m,
int64_t n,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost,
cal_comm_t global_comm);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
m |
Host |
In |
Number of rows of sub(A) and sub(B). |
n |
Host |
In |
Number of columns of sub(A) and sub(B). |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. descA’s grid value must be set to null in processes that are not part of the grid of A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. descB’s grid value must be set to null in processes that are not part of the grid of B. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpGemr2D(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpGemr2D(). |
global_comm |
Host |
In |
A communicator containing at least the union of all processes in the communicators of A and B. All processes in the communicator must call this function, even if they do not own a piece of either matrix. |
cublasMpTrmr2D
#
cublasMpStatus_t cublasMpTrmr2D(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasDiagType_t diag,
int64_t m,
int64_t n,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost,
cal_comm_t global_comm);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates whether the lower or upper part of matrix A is stored; the other part is not referenced and is inferred from the stored elements. |
diag |
Host |
In |
Indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. |
m |
Host |
In |
Number of rows of sub(A) and sub(B). |
n |
Host |
In |
Number of columns of sub(A) and sub(B). |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. descA’s grid value must be set to null in processes that are not part of the grid of A. |
b |
Device |
Out |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. descB’s grid value must be set to null in processes that are not part of the grid of B. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpTrmr2D_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpTrmr2D_bufferSize(). |
global_comm |
Host |
In |
A communicator containing at least the union of all processes in the communicators of A and B. All processes in the communicator must call this function, even if they do not own a piece of either matrix. |
cublasMpTrmr2D_bufferSize
#
cublasMpStatus_t cublasMpTrmr2D_bufferSize(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasDiagType_t diag,
int64_t m,
int64_t n,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost,
cal_comm_t global_comm);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates whether the lower or upper part of matrix A is stored; the other part is not referenced and is inferred from the stored elements. |
diag |
Host |
In |
Indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. |
m |
Host |
In |
Number of rows of sub(A) and sub(B). |
n |
Host |
In |
Number of columns of sub(A) and sub(B). |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. descA’s grid value must be set to null in processes that are not part of the grid of A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. descB’s grid value must be set to null in processes that are not part of the grid of B. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpTrmr2D(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpTrmr2D(). |
global_comm |
Host |
In |
A communicator containing at least the union of all processes in the communicators of A and B. All processes in the communicator must call this function, even if they do not own a piece of either matrix. |
Logging#
cublasMpLoggerSetCallback
#
cublasMpStatus_t cublasMpLoggerSetCallback(
cublasMpLoggerCallback_t callback);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
callback |
Host |
In |
Pointer to a callback function. See cublasMpLoggerCallback_t. |
Warning
This is an experimental feature.
cublasMpLoggerSetFile
#
cublasMpStatus_t cublasMpLoggerSetFile(
FILE *file);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
file |
Host |
In |
Pointer to an open file. File should have write permission. |
Warning
This is an experimental feature.
cublasMpLoggerOpenFile
#
cublasMpStatus_t cublasMpLoggerOpenFile(
const char* logFile);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
logFile |
Host |
In |
Path of the logging output file. |
Warning
This is an experimental feature.
cublasMpLoggerSetLevel
#
cublasMpStatus_t cublasMpLoggerSetLevel(
int level);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
level |
Host |
In |
Value of the logging level. See cuBLASMp Logging. |
Warning
This is an experimental feature.
cublasMpLoggerSetMask
#
cublasMpStatus_t cublasMpLoggerSetMask(
int mask);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
mask |
Host |
In |
Value of the logging mask. See cuBLASMp Logging. |
Warning
This is an experimental feature.
cublasMpLoggerForceDisable
#
cublasMpStatus_t cublasMpLoggerForceDisable();
Warning
This is an experimental feature.
Dense Linear Algebra APIs#
cublasMpTrsm
#
cublasMpStatus_t cublasMpTrsm(
cublasMpHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
cublasComputeType_t computeType,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(\left\{ \begin{matrix} {\text{op}(A)X = \alpha B} & {\text{if }\textsf{side == $\mathrm{CUBLAS\_SIDE\_LEFT}$}} \\ {X\text{op}(A) = \alpha B} & {\text{if }\textsf{side == $\mathrm{CUBLAS\_SIDE\_RIGHT}$}} \\ \end{matrix} \right.\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_T}$}} \\ A^{H} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_C}$}} \\ \end{matrix} \right.\)
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
side |
Host |
In |
Indicates if matrix A is on the left or right of X. |
uplo |
Host |
In |
Indicates whether the lower or upper part of matrix A is stored; the other part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A) that is non- or (conj.) transpose. |
diag |
Host |
In |
Indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. |
m |
Host |
In |
Number of rows of matrix sub(B), with matrix sub(A) sized accordingly. |
n |
Host |
In |
Number of columns of matrix sub(B), with matrix sub(A) sized accordingly. |
alpha |
Host |
In |
Scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
b |
Device |
In/Out |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpTrsm_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpTrsm_bufferSize(). |
Compute Type |
Scale Type (alpha and beta) |
Atype/Btype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpTrsm_bufferSize
#
cublasMpStatus_t cublasMpTrsm_bufferSize(
cublasMpHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
cublasComputeType_t computeType,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
side |
Host |
In |
Indicates if matrix A is on the left or right of X. |
uplo |
Host |
In |
Indicates whether the lower or upper part of matrix A is stored; the other part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A) that is non- or (conj.) transpose. |
diag |
Host |
In |
Indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. |
m |
Host |
In |
Number of rows of matrix sub(B), with matrix sub(A) sized accordingly. |
n |
Host |
In |
Number of columns of matrix sub(B), with matrix sub(A) sized accordingly. |
alpha |
Host |
In |
Scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpTrsm(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpTrsm(). |
Compute Type |
Scale Type (alpha and beta) |
Atype/Btype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpGemm
#
cublasMpStatus_t cublasMpGemm(
cublasMpHandle_t handle,
cublasOperation_t transA,
cublasOperation_t transB,
int64_t m,
int64_t n,
int64_t k,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
cublasComputeType_t computeType,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(C = \alpha\text{op}(A)\text{op}(B) + \beta C\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{transA == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{transA == $\mathrm{CUBLAS\_OP\_T}$}} \\ A^{H} & {\text{if }\textsf{transA == $\mathrm{CUBLAS\_OP\_C}$}} \\ \end{matrix} \right.\)
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
transA |
Host |
In |
Operation op(A) that is non- or (conj.) transpose. |
transB |
Host |
In |
Operation op(B) that is non- or (conj.) transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(B) and sub(C). |
k |
Host |
In |
Number of columns of sub(A) and rows of sub(B). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In/Out |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpGemm_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpGemm_bufferSize(). |
Note
This routine will internally call cublasMpMatmul() with d == c.
Compute Type |
Scale Type (alpha and beta) |
Atype/Btype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_16F or CUBLAS_COMPUTE_16F_PEDANTIC |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_16F |
CUDA_R_16F |
||
CUDA_R_8I |
CUDA_R_32F |
||
CUDA_R_16BF |
CUDA_R_32F |
||
CUDA_R_16F |
CUDA_R_32F |
||
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_C_32F |
CUDA_C_8I |
CUDA_C_32F |
|
CUDA_C_32F |
CUDA_C_32F |
||
CUBLAS_COMPUTE_32F_FAST_16F or CUBLAS_COMPUTE_32F_FAST_16BF or CUBLAS_COMPUTE_32F_FAST_TF32 |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F or CUBLAS_COMPUTE_64F_PEDANTIC |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpGemm_bufferSize
#
cublasMpStatus_t cublasMpGemm_bufferSize(
cublasMpHandle_t handle,
cublasOperation_t transA,
cublasOperation_t transB,
int64_t m,
int64_t n,
int64_t k,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
cublasComputeType_t computeType,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
transA |
Host |
In |
Operation op(A) that is non- or (conj.) transpose. |
transB |
Host |
In |
Operation op(B) that is non- or (conj.) transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(B) and sub(C). |
k |
Host |
In |
Number of columns of sub(A) and rows of sub(B). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpGemm(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpGemm(). |
Compute Type |
Scale Type (alpha and beta) |
Atype/Btype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_16F or CUBLAS_COMPUTE_16F_PEDANTIC |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_16F |
CUDA_R_16F |
||
CUDA_R_8I |
CUDA_R_32F |
||
CUDA_R_16BF |
CUDA_R_32F |
||
CUDA_R_16F |
CUDA_R_32F |
||
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_C_32F |
CUDA_C_8I |
CUDA_C_32F |
|
CUDA_C_32F |
CUDA_C_32F |
||
CUBLAS_COMPUTE_32F_FAST_16F or CUBLAS_COMPUTE_32F_FAST_16BF or CUBLAS_COMPUTE_32F_FAST_TF32 |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F or CUBLAS_COMPUTE_64F_PEDANTIC |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpMatmul
#
cublasMpStatus_t cublasMpMatmul(
cublasMpHandle_t handle,
cublasMpMatmulDescriptor_t matmulDesc,
int64_t m,
int64_t n,
int64_t k,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
const void* beta,
const void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
void* d,
int64_t id,
int64_t jd,
cublasMpMatrixDescriptor_t descD,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(D = \alpha\text{op}(A)\text{op}(B) + \beta C\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{TRANSA == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{TRANSA == $\mathrm{CUBLAS\_OP\_T}$}} \\ A^{H} & {\text{if }\textsf{TRANSA == $\mathrm{CUBLAS\_OP\_C}$}} \\ \end{matrix} \right.\)
Note
FP8 matrix multiplication is only supported for the TN format, i.e. \(A^T * B\), with CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_TRANSA == CUBLAS_OP_T and CUBLASMP_MATMUL_DESCRIPTOR_ATTRIBUTE_TRANSB == CUBLAS_OP_N.
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
matmulDesc |
Host |
In |
Descriptor of the operation to perform, created with cublasMpMatmulDescriptorCreate(). |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(B) and sub(C). |
k |
Host |
In |
Number of columns of sub(A) and rows of sub(B). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
d |
Device |
Out |
Pointer to the first entry of the local portion of the global matrix D. |
id |
Host |
In |
Row index of the first row of the sub(D). |
jd |
Host |
In |
Column index of the first column of the sub(D). |
descD |
Host |
In |
Matrix descriptor associated to the global matrix D. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice bytes. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpMatmul_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost bytes. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpMatmul_bufferSize(). |
Compute Type |
Scale Type (alpha and beta) |
Atype |
Btype |
Ctype |
Dtype |
---|---|---|---|---|---|
CUBLAS_COMPUTE_16F or CUBLAS_COMPUTE_16F_PEDANTIC |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_16BF |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_8I |
CUDA_R_8I |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_C_32F |
CUDA_C_8I |
CUDA_C_8I |
CUDA_C_32F |
CUDA_C_32F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_16BF |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_8F_E4M3 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_8F_E4M3 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16BF |
CUDA_R_16BF |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16BF |
CUDA_R_8F_E4M3 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16BF |
CUDA_R_8F_E5M2 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16F |
CUDA_R_8F_E4M3 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16F |
CUDA_R_8F_E5M2 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_16BF |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_8F_E4M3 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_8F_E5M2 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_8F_E4M3 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_8F_E5M2 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F_FAST_16F or CUBLAS_COMPUTE_32F_FAST_16BF or CUBLAS_COMPUTE_32F_FAST_TF32 |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F_FAST_16F or CUBLAS_COMPUTE_32F_FAST_16BF or CUBLAS_COMPUTE_32F_FAST_TF32 |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUBLAS_COMPUTE_64F or CUBLAS_COMPUTE_64F_PEDANTIC |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUBLAS_COMPUTE_64F or CUBLAS_COMPUTE_64F_PEDANTIC |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpMatmul_bufferSize
#
cublasMpStatus_t cublasMpMatmul_bufferSize(
cublasMpHandle_t handle,
cublasMpMatmulDescriptor_t matmulDesc,
int64_t m,
int64_t n,
int64_t k,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
const void* beta,
const void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
void* d,
int64_t id,
int64_t jd,
cublasMpMatrixDescriptor_t descD,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
matmulDesc |
Host |
In |
Descriptor of the operation to perform, created with cublasMpMatmulDescriptorCreate(). |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(B) and sub(C). |
k |
Host |
In |
Number of columns of sub(A) and rows of sub(B). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
d |
Device |
Out |
Pointer to the first entry of the local portion of the global matrix D. |
id |
Host |
In |
Row index of the first row of the sub(D). |
jd |
Host |
In |
Column index of the first column of the sub(D). |
descD |
Host |
In |
Matrix descriptor associated to the global matrix D. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpMatmul(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpMatmul(). |
Compute Type |
Scale Type (alpha and beta) |
Atype |
Btype |
Ctype |
Dtype |
---|---|---|---|---|---|
CUBLAS_COMPUTE_16F or CUBLAS_COMPUTE_16F_PEDANTIC |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_16BF |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_8I |
CUDA_R_8I |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_C_32F |
CUDA_C_8I |
CUDA_C_8I |
CUDA_C_32F |
CUDA_C_32F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_16BF |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_8F_E4M3 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_8F_E4M3 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16BF |
CUDA_R_16BF |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16BF |
CUDA_R_8F_E4M3 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16BF |
CUDA_R_8F_E5M2 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16F |
CUDA_R_8F_E4M3 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16F |
CUDA_R_8F_E5M2 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_16BF |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_8F_E4M3 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_8F_E5M2 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_8F_E4M3 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_8F_E5M2 |
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F_FAST_16F or CUBLAS_COMPUTE_32F_FAST_16BF or CUBLAS_COMPUTE_32F_FAST_TF32 |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F_FAST_16F or CUBLAS_COMPUTE_32F_FAST_16BF or CUBLAS_COMPUTE_32F_FAST_TF32 |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUBLAS_COMPUTE_64F or CUBLAS_COMPUTE_64F_PEDANTIC |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUBLAS_COMPUTE_64F or CUBLAS_COMPUTE_64F_PEDANTIC |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpSyrk
#
cublasMpStatus_t cublasMpSyrk(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int64_t n,
int64_t k,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
cublasComputeType_t computeType,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(C = \alpha\text{op}(A)\text{op}(A)^{T} + \beta C\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_T}$}} \\ \end{matrix} \right.\)
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates whether the lower or upper part of matrix C is stored; the other symmetric part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A): non-transpose or transpose. |
n |
Host |
In |
Number of rows of sub(A) and sub(C). |
k |
Host |
In |
Number of columns of sub(A). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In/Out |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice bytes. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpSyrk_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost bytes. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpSyrk_bufferSize(). |
Compute Type |
Scale Type (alpha and beta) |
Atype/Btype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUBLAS_COMPUTE_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUBLAS_COMPUTE_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpSyrk_bufferSize
#
cublasMpStatus_t cublasMpSyrk_bufferSize(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int64_t n,
int64_t k,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
cublasComputeType_t computeType,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates whether the lower or upper part of matrix C is stored; the other symmetric part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A): non-transpose or transpose. |
n |
Host |
In |
Number of rows of sub(A) and sub(C). |
k |
Host |
In |
Number of columns of sub(A). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpSyrk(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpSyrk(). |
Compute Type |
Scale Type (alpha and beta) |
Atype/Btype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUBLAS_COMPUTE_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUBLAS_COMPUTE_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUBLAS_COMPUTE_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpGeadd
#
cublasMpStatus_t cublasMpGeadd(
cublasMpHandle_t handle,
cublasOperation_t trans,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(C = \alpha\text{op}(A) + \beta C\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_T}$}} \\ A^{H} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_C}$}} \\ \end{matrix} \right.\)
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
trans |
Host |
In |
Operation op(A): non-transpose, transpose, or conjugate transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(A) and sub(C). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In/Out |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice bytes. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpGeadd_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost bytes. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpGeadd_bufferSize(). |
Data Type of A |
computeType |
Output Data Type |
---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cublasMpGeadd_bufferSize
#
cublasMpStatus_t cublasMpGeadd_bufferSize(
cublasMpHandle_t handle,
cublasOperation_t trans,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
trans |
Host |
In |
Operation op(A): non-transpose, transpose, or conjugate transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(A) and sub(C). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpGeadd(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpGeadd(). |
Data Type of A |
computeType |
Output Data Type |
---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cublasMpTradd
#
cublasMpStatus_t cublasMpTradd(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(C = \alpha\text{op}(A) + \beta C\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_T}$}} \\ A^{H} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_C}$}} \\ \end{matrix} \right.\)
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates whether the lower or upper part of matrix C is stored; the other part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A): non-transpose, transpose, or conjugate transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(A) and sub(C). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In/Out |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice bytes. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpTradd_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost bytes. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpTradd_bufferSize(). |
Data Type of A |
computeType |
Output Data Type |
---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cublasMpTradd_bufferSize
#
cublasMpStatus_t cublasMpTradd_bufferSize(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates whether the lower or upper part of matrix C is stored; the other part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A): non-transpose, transpose, or conjugate transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(A) and sub(C). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpTradd(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpTradd(). |
Data Type of A |
computeType |
Output Data Type |
---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |