cuBLASMp C API#
Library Management#
cublasMpCreate
#
cublasMpStatus_t cublasMpCreate(
cublasMpHandle_t *handle,
cudaStream_t stream);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
Out |
cuBLASMp library handle. |
stream |
Host |
In |
Stream that will be assigned to the handle. |
cublasMpDestroy
#
cublasMpStatus_t cublasMpDestroy(
cublasMpHandle_t handle);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In/Out |
cuBLASMp library handle to destroy. |
cublasMpStreamSet
#
cublasMpStatus_t cublasMpStreamSet(
cublasMpHandle_t handle,
cudaStream_t stream);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
stream |
Host |
In |
CUDA stream pointer to set. |
cublasMpStreamGet
#
cublasMpStatus_t cublasMpStreamGet(
cublasMpHandle_t handle,
cudaStream_t* stream);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
stream |
Host |
Out |
Pointer that receives the CUDA stream currently assigned to the handle. |
cublasMpGetVersion
#
cublasMpStatus_t cublasMpGetVersion(
int *version);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
version |
Host |
Out |
cuBLASMp library version. Value is |
Grid Management#
cublasMpGridCreate
#
cublasMpStatus_t cublasMpGridCreate(
int64_t nprow,
int64_t npcol,
cublasMpGridLayout_t layout,
cal_comm_t comm,
cublasMpGrid_t* grid);
If NVSHMEM is not initialized by the user, cuBLASMp will initialize it as the first grid is created. If NVSHMEM is initialized already, it will not be re-initialized. cuBLASMp will only call
nvshmem_finalize
if it initialized NVSHMEM.
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
nprow |
Host |
In |
How many row processes the grid contains. |
npcol |
Host |
In |
How many column processes the grid contains. |
layout |
Host |
In |
Grid’s layout (cublasMpGridLayout_t). |
comm |
Host |
In |
Communicator associated with the grid. |
grid |
Host |
In/Out |
Pointer to a grid object. |
cublasMpGridDestroy
#
cublasMpStatus_t cublasMpGridDestroy(
cublasMpGrid_t grid);
This function destroys the grid object.
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
grid |
Host |
In/Out |
Grid object to destroy. |
Matrix Management#
cublasMpMatrixDescriptorCreate
#
cublasMpStatus_t cublasMpMatrixDescriptorCreate(
int64_t m,
int64_t n,
int64_t mb,
int64_t nb,
int64_t rsrc,
int64_t csrc,
int64_t lld,
cudaDataType_t type,
cublasMpGrid_t grid,
cublasMpMatrixDescriptor_t* desc);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
m |
Host |
In |
Number of rows in the global matrix. |
n |
Host |
In |
Number of columns in the global matrix. |
mb |
Host |
In |
Blocking factor used to distribute the rows of the global matrix. |
nb |
Host |
In |
Blocking factor used to distribute the columns of the global matrix. |
rsrc |
Host |
In |
Row rank of the process who owns the first row block of the global matrix. |
csrc |
Host |
In |
Column rank of the process who owns the first column block of the global matrix. |
lld |
Host |
In |
Leading dimension of the local matrix. |
type |
Host |
In |
Data type of the matrix. |
grid |
Host |
In |
Grid object associated with the matrix descriptor. |
desc |
Host |
Out |
Matrix descriptor object initialized by this function. |
The supported values of the type argument are listed below:
Data Type |
Description |
---|---|
CUDA_R_8I |
8-bit real signed integer. |
CUDA_R_32I |
32-bit real signed integer. |
CUDA_R_8F_E4M3 |
8-bit real floating point in E4M3 format. |
CUDA_R_8F_E5M2 |
8-bit real floating point in E5M2 format. |
CUDA_R_16F |
16-bit real half precision floating-point. |
CUDA_R_16BF |
16-bit real bfloat16 floating-point. |
CUDA_R_32F |
32-bit real single precision floating-point. |
CUDA_R_64F |
64-bit real double precision floating-point. |
CUDA_C_32F |
64-bit structure comprised of two single precision floating-points representing a complex number. |
CUDA_C_64F |
128-bit structure comprised of two double precision floating-points representing a complex number. |
cublasMpMatrixDescriptorDestroy
#
cublasMpStatus_t cublasMpMatrixDescriptorDestroy(
cublasMpMatrixDescriptor_t desc);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
desc |
Host |
In/Out |
Matrix descriptor object to destroy. |
cublasMpMatrixDescriptorInit
#
cublasMpStatus_t cublasMpMatrixDescriptorInit(
int64_t m,
int64_t n,
int64_t mb,
int64_t nb,
int64_t rsrc,
int64_t csrc,
int64_t lld,
cudaDataType_t type,
cublasMpGrid_t grid,
cublasMpMatrixDescriptor_t desc);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
m |
Host |
In |
Number of rows in the global matrix. |
n |
Host |
In |
Number of columns in the global matrix. |
mb |
Host |
In |
Blocking factor used to distribute the rows of the global matrix. |
nb |
Host |
In |
Blocking factor used to distribute the columns of the global matrix. |
rsrc |
Host |
In |
Row rank of the process who owns the first row block of the global matrix. |
csrc |
Host |
In |
Column rank of the process who owns the first column block of the global matrix. |
lld |
Host |
In |
Leading dimension of the local matrix. |
type |
Host |
In |
Data type of the matrix. |
grid |
Host |
In |
Grid object associated with the matrix descriptor. |
desc |
Host |
Out |
Matrix descriptor object initialized by this function. |
The supported values of the type argument are listed below:
Data Type |
Description |
---|---|
CUDA_R_8I |
8-bit real signed integer. |
CUDA_R_32I |
32-bit real signed integer. |
CUDA_R_8F_E4M3 |
8-bit real floating point in E4M3 format. |
CUDA_R_8F_E5M2 |
8-bit real floating point in E5M2 format. |
CUDA_R_16F |
16-bit real half precision floating-point. |
CUDA_R_16BF |
16-bit real bfloat16 floating-point. |
CUDA_R_32F |
32-bit real single precision floating-point. |
CUDA_R_64F |
64-bit real double precision floating-point. |
CUDA_C_32F |
64-bit structure comprised of two single precision floating-points representing a complex number. |
CUDA_C_64F |
128-bit structure comprised of two double precision floating-points representing a complex number. |
Matmul Properties#
cublasMpMatmulDescriptorCreate
#
cublasMpStatus_t cublasMpMatmulDescriptorCreate(
cublasMpMatmulDescriptor_t* matmulDesc,
cublasComputeType_t computeType);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
matmulDesc |
Host |
In/Out |
Pointer to a cublasMpMatmulDescriptor object to initialize. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
The supported values of the computeType argument are listed below:
Compute Types |
---|
CUBLAS_COMPUTE_32I |
CUBLAS_COMPUTE_32I_PEDANTIC |
CUBLAS_COMPUTE_16F |
CUBLAS_COMPUTE_16F_PEDANTIC |
CUBLAS_COMPUTE_32F |
CUBLAS_COMPUTE_32F_PEDANTIC |
CUBLAS_COMPUTE_32F_FAST_16F |
CUBLAS_COMPUTE_32F_FAST_16BF |
CUBLAS_COMPUTE_32F_FAST_TF32 |
CUBLAS_COMPUTE_64F |
CUBLAS_COMPUTE_64F_PEDANTIC |
cublasMpMatmulDescriptorDestroy
#
cublasMpStatus_t cublasMpMatmulDescriptorDestroy(
cublasMpMatmulDescriptor_t matmulDesc);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
matmulDesc |
Host |
In/Out |
Matmul descriptor object to destroy. |
cublasMpMatmulDescriptorAttributeSet
#
cublasMpStatus_t cublasMpMatmulDescriptorAttributeSet(
cublasMpMatmulDescriptor_t matmulDesc,
cublasMpMatmulDescriptorAttribute_t attr,
const void* buf,
size_t sizeInBytes);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
matmulDesc |
Host |
In |
Matmul descriptor object to set its attribute. |
attr |
Host |
In |
Matmul descriptor attribute to set. |
buf |
Host |
In |
Attribute value to set. |
sizeInBytes |
Host |
In |
Attribute buffer size in bytes. |
cublasMpMatmulDescriptorAttributeGet
#
cublasMpStatus_t cublasMpMatmulDescriptorAttributeGet(
cublasMpMatmulDescriptor_t matmulDesc,
cublasMpMatmulDescriptorAttribute_t attr,
const void* buf,
size_t sizeInBytes,
size_t* sizeWritten);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
matmulDesc |
Host |
In |
Matmul descriptor object whose attribute is queried. |
attr |
Host |
In |
Matmul descriptor attribute to get. |
buf |
Host |
Out |
Buffer that receives the attribute value. |
sizeInBytes |
Host |
In |
Attribute buffer size in bytes. |
sizeWritten |
Host |
Out |
Size of the attribute written into buf. |
Utility#
cublasMpNumroc
#
int64_t cublasMpNumroc(
int64_t n,
int64_t nb,
uint32_t iproc,
uint32_t isrcproc,
uint32_t nprocs);
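This function computes the number of local rows or columns of the distributed matrix owned by the process indicated by the iproc argument.
Parameter |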
Memory |
In/Out |
Description |
---|---|---|---|
n |
Host |
In |
Number of rows or columns in the global distributed matrix. |
nb |
Host |
In |
Row or column blocking size of the global matrix. |
iproc |
Host |
In |
The coordinate of the process whose local array row or column is to be determined. |
isrcproc |
Host |
In |
The coordinate of the process that owns the first row or column of the distributed matrix. |
nprocs |
Host |
In |
The total number of row or column processes over which the matrix is distributed. |
cublasMpGemr2D
#
cublasMpStatus_t cublasMpGemr2D(
cublasMpHandle_t handle,
int64_t m,
int64_t n,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost,
cal_comm_t global_comm);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
m |
Host |
In |
Number of rows of sub(A) and sub(B). |
n |
Host |
In |
Number of columns of sub(A) and sub(B). |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. descA’s grid value must be set to null in processes that are not part of the grid of A. |
b |
Device |
Out |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. descB’s grid value must be set to null in processes that are not part of the grid of B. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpGemr2D_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpGemr2D_bufferSize(). |
global_comm |
Host |
In |
A communicator containing at least the union of all processes in the communicators of A and B. All processes in the communicator must call this function, even if they do not own a piece of either matrix. |
cublasMpGemr2D_bufferSize
#
cublasMpStatus_t cublasMpGemr2D_bufferSize(
cublasMpHandle_t handle,
int64_t m,
int64_t n,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost,
cal_comm_t global_comm);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
m |
Host |
In |
Number of rows of sub(A) and sub(B). |
n |
Host |
In |
Number of columns of sub(A) and sub(B). |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. descA’s grid value must be set to null in processes that are not part of the grid of A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. descB’s grid value must be set to null in processes that are not part of the grid of B. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpGemr2D(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpGemr2D(). |
global_comm |
Host |
In |
A communicator containing at least the union of all processes in the communicators of A and B. All processes in the communicator must call this function, even if they do not own a piece of either matrix. |
cublasMpTrmr2D
#
cublasMpStatus_t cublasMpTrmr2D(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasDiagType_t diag,
int64_t m,
int64_t n,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost,
cal_comm_t global_comm);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. |
diag |
Host |
In |
Indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. |
m |
Host |
In |
Number of rows of sub(A) and sub(B). |
n |
Host |
In |
Number of columns of sub(A) and sub(B). |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. descA’s grid value must be set to null in processes that are not part of the grid of A. |
b |
Device |
Out |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. descB’s grid value must be set to null in processes that are not part of the grid of B. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpTrmr2D_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpTrmr2D_bufferSize(). |
global_comm |
Host |
In |
A communicator containing at least the union of all processes in the communicators of A and B. All processes in the communicator must call this function, even if they do not own a piece of either matrix. |
cublasMpTrmr2D_bufferSize
#
cublasMpStatus_t cublasMpTrmr2D_bufferSize(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasDiagType_t diag,
int64_t m,
int64_t n,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost,
cal_comm_t global_comm);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. |
diag |
Host |
In |
Indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. |
m |
Host |
In |
Number of rows of sub(A) and sub(B). |
n |
Host |
In |
Number of columns of sub(A) and sub(B). |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. descA’s grid value must be set to null in processes that are not part of the grid of A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. descB’s grid value must be set to null in processes that are not part of the grid of B. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpTrmr2D(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpTrmr2D(). |
global_comm |
Host |
In |
A communicator containing at least the union of all processes in the communicators of A and B. All processes in the communicator must call this function, even if they do not own a piece of either matrix. |
Logging#
cublasMpLoggerSetCallback
#
cublasMpStatus_t cublasMpLoggerSetCallback(
cublasMpLoggerCallback_t callback);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
callback |
Host |
In |
Pointer to a callback function. See cublasMpLoggerCallback_t. |
Warning
This is an experimental feature.
cublasMpLoggerSetFile
#
cublasMpStatus_t cublasMpLoggerSetFile(
FILE *file);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
file |
Host |
In |
Pointer to an open file. File should have write permission. |
Warning
This is an experimental feature.
cublasMpLoggerOpenFile
#
cublasMpStatus_t cublasMpLoggerOpenFile(
const char* logFile);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
logFile |
Host |
In |
Path of the logging output file. |
Warning
This is an experimental feature.
cublasMpLoggerSetLevel
#
cublasMpStatus_t cublasMpLoggerSetLevel(
int level);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
level |
Host |
In |
Value of the logging level. See cuBLASMp Logging. |
Warning
This is an experimental feature.
cublasMpLoggerSetMask
#
cublasMpStatus_t cublasMpLoggerSetMask(
int mask);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
mask |
Host |
In |
Value of the logging mask. See cuBLASMp Logging. |
Warning
This is an experimental feature.
cublasMpLoggerForceDisable
#
cublasMpStatus_t cublasMpLoggerForceDisable();
Warning
This is an experimental feature.
Dense Linear Algebra APIs#
cublasMpTrsm
#
cublasMpStatus_t cublasMpTrsm(
cublasMpHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
cublasComputeType_t computeType,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(\left\{ \begin{matrix} {\text{op}(A)X = \alpha B} & {\text{if }\textsf{side == $\mathrm{CUBLAS\_SIDE\_LEFT}$}} \\ {X\text{op}(A) = \alpha B} & {\text{if }\textsf{side == $\mathrm{CUBLAS\_SIDE\_RIGHT}$}} \\ \end{matrix} \right.\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_T}$}} \\ A^{H} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_C}$}} \\ \end{matrix} \right.\)
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
side |
Host |
In |
Indicates if matrix A is on the left or right of X. |
uplo |
Host |
In |
Indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A) that is non- or (conj.) transpose. |
diag |
Host |
In |
Indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. |
m |
Host |
In |
Number of rows of matrix sub(B), with matrix sub(A) sized accordingly. |
n |
Host |
In |
Number of columns of matrix sub(B), with matrix sub(A) sized accordingly. |
alpha |
Host |
In |
Scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
b |
Device |
In/Out |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpTrsm_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpTrsm_bufferSize(). |
Compute Type |
Scale Type (alpha and beta) |
Atype/Btype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpTrsm_bufferSize
#
cublasMpStatus_t cublasMpTrsm_bufferSize(
cublasMpHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
cublasComputeType_t computeType,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
side |
Host |
In |
Indicates if matrix A is on the left or right of X. |
uplo |
Host |
In |
Indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A) that is non- or (conj.) transpose. |
diag |
Host |
In |
Indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. |
m |
Host |
In |
Number of rows of matrix sub(B), with matrix sub(A) sized accordingly. |
n |
Host |
In |
Number of columns of matrix sub(B), with matrix sub(A) sized accordingly. |
alpha |
Host |
In |
Scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpTrsm(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpTrsm(). |
Compute Type |
Scale Type (alpha and beta) |
Atype/Btype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpGemm
#
cublasMpStatus_t cublasMpGemm(
cublasMpHandle_t handle,
cublasOperation_t transA,
cublasOperation_t transB,
int64_t m,
int64_t n,
int64_t k,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
cublasComputeType_t computeType,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(C = \alpha\text{op}(A)\text{op}(B) + \beta C\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{transA == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{transA == $\mathrm{CUBLAS\_OP\_T}$}} \\ A^{H} & {\text{if }\textsf{transA == $\mathrm{CUBLAS\_OP\_C}$}} \\ \end{matrix} \right.\)
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
transA |
Host |
In |
Operation op(A) that is non- or (conj.) transpose. |
transB |
Host |
In |
Operation op(B) that is non- or (conj.) transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(B) and sub(C). |
k |
Host |
In |
Number of columns of sub(A) and rows of sub(B). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In/Out |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpGemm_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpGemm_bufferSize(). |
Compute Type |
Scale Type (alpha and beta) |
Atype/Btype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_16F or CUBLAS_COMPUTE_16F_PEDANTIC |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_16F |
CUDA_R_16F |
||
CUDA_R_8I |
CUDA_R_32F |
||
CUDA_R_16BF |
CUDA_R_32F |
||
CUDA_R_16F |
CUDA_R_32F |
||
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_C_32F |
CUDA_C_8I |
CUDA_C_32F |
|
CUDA_C_32F |
CUDA_C_32F |
||
CUBLAS_COMPUTE_32F_FAST_16F or CUBLAS_COMPUTE_32F_FAST_16BF or CUBLAS_COMPUTE_32F_FAST_TF32 |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F or CUBLAS_COMPUTE_64F_PEDANTIC |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpGemm_bufferSize
#
cublasMpStatus_t cublasMpGemm_bufferSize(
cublasMpHandle_t handle,
cublasOperation_t transA,
cublasOperation_t transB,
int64_t m,
int64_t n,
int64_t k,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
cublasComputeType_t computeType,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
transA |
Host |
In |
Operation op(A) that is non- or (conj.) transpose. |
transB |
Host |
In |
Operation op(B) that is non- or (conj.) transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(B) and sub(C). |
k |
Host |
In |
Number of columns of sub(A) and rows of sub(B). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpGemm(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpGemm(). |
Compute Type |
Scale Type (alpha and beta) |
Atype/Btype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_16F or CUBLAS_COMPUTE_16F_PEDANTIC |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_16F |
CUDA_R_16F |
||
CUDA_R_8I |
CUDA_R_32F |
||
CUDA_R_16BF |
CUDA_R_32F |
||
CUDA_R_16F |
CUDA_R_32F |
||
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_C_32F |
CUDA_C_8I |
CUDA_C_32F |
|
CUDA_C_32F |
CUDA_C_32F |
||
CUBLAS_COMPUTE_32F_FAST_16F or CUBLAS_COMPUTE_32F_FAST_16BF or CUBLAS_COMPUTE_32F_FAST_TF32 |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F or CUBLAS_COMPUTE_64F_PEDANTIC |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpMatmul
#
cublasMpStatus_t cublasMpMatmul(
cublasMpHandle_t handle,
cublasMpMatmulDescriptor_t matmulDesc,
int64_t m,
int64_t n,
int64_t k,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
const void* beta,
const void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
void* d,
int64_t id,
int64_t jd,
cublasMpMatrixDescriptor_t descD,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(D = \alpha\text{op}(A)\text{op}(B) + \beta C\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{TRANSA == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{TRANSA == $\mathrm{CUBLAS\_OP\_T}$}} \\ A^{H} & {\text{if }\textsf{TRANSA == $\mathrm{CUBLAS\_OP\_C}$}} \\ \end{matrix} \right.\)
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
matmulDesc |
Host |
In |
Descriptor of the operation to perform, created with cublasMpMatmulDescriptorCreate(). |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(B) and sub(C). |
k |
Host |
In |
Number of columns of sub(A) and rows of sub(B). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
d |
Device |
Out |
Pointer to the first entry of the local portion of the global matrix D. |
id |
Host |
In |
Row index of the first row of the sub(D). |
jd |
Host |
In |
Column index of the first column of the sub(D). |
descD |
Host |
In |
Matrix descriptor associated to the global matrix D. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice bytes. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpMatmul_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost bytes. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpMatmul_bufferSize(). |
Compute Type |
Scale Type (alpha and beta) |
Atype |
Btype |
Ctype |
Dtype |
---|---|---|---|---|---|
CUBLAS_COMPUTE_16F or CUBLAS_COMPUTE_16F_PEDANTIC |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
||
CUDA_R_8I |
CUDA_R_8I |
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_C_32F |
CUDA_C_8I |
CUDA_C_8I |
CUDA_C_32F |
CUDA_C_32F |
|
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
||
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_8F_E4M3 |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_16F |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_8F_E4M3 |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16BF |
CUDA_R_16BF |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16BF |
CUDA_R_8F_E4M3 |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16BF |
CUDA_R_8F_E5M2 |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16F |
CUDA_R_16F |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16F |
CUDA_R_8F_E4M3 |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16F |
CUDA_R_8F_E5M2 |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_16BF |
||
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_8F_E4M3 |
||
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_8F_E5M2 |
||
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_16F |
||
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_8F_E4M3 |
||
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_8F_E5M2 |
||
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_32F |
CUDA_R_32F |
||
CUBLAS_COMPUTE_32F_FAST_16F or CUBLAS_COMPUTE_32F_FAST_16BF or CUBLAS_COMPUTE_32F_FAST_TF32 |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F or CUBLAS_COMPUTE_64F_PEDANTIC |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
computeType
parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpMatmul_bufferSize
#
cublasMpStatus_t cublasMpMatmul_bufferSize(
cublasMpHandle_t handle,
cublasMpMatmulDescriptor_t matmulDesc,
int64_t m,
int64_t n,
int64_t k,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
const void* beta,
const void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
void* d,
int64_t id,
int64_t jd,
cublasMpMatrixDescriptor_t descD,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
matmulDesc |
Host |
In |
Descriptor of the operation to perform, created with cublasMpMatmulDescriptorCreate(). |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(B) and sub(C). |
k |
Host |
In |
Number of columns of sub(A) and rows of sub(B). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
d |
Device |
Out |
Pointer to the first entry of the local portion of the global matrix D. |
id |
Host |
In |
Row index of the first row of the sub(D). |
jd |
Host |
In |
Column index of the first column of the sub(D). |
descD |
Host |
In |
Matrix descriptor associated to the global matrix D. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpMatmul(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpMatmul(). |
Compute Type |
Scale Type (alpha and beta) |
Atype |
Btype |
Ctype |
Dtype |
---|---|---|---|---|---|
CUBLAS_COMPUTE_16F or CUBLAS_COMPUTE_16F_PEDANTIC |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
||
CUDA_R_8I |
CUDA_R_8I |
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_C_32F |
CUDA_C_8I |
CUDA_C_8I |
CUDA_C_32F |
CUDA_C_32F |
|
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
||
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_8F_E4M3 |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_16F |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_8F_E4M3 |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E4M3 |
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16BF |
CUDA_R_16BF |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16BF |
CUDA_R_8F_E4M3 |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16BF |
CUDA_R_8F_E5M2 |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16F |
CUDA_R_16F |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16F |
CUDA_R_8F_E4M3 |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_16F |
CUDA_R_8F_E5M2 |
||
CUDA_R_8F_E4M3 |
CUDA_R_8F_E5M2 |
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_16BF |
||
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_8F_E4M3 |
||
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16BF |
CUDA_R_8F_E5M2 |
||
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_16F |
||
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_8F_E4M3 |
||
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_16F |
CUDA_R_8F_E5M2 |
||
CUDA_R_8F_E5M2 |
CUDA_R_8F_E4M3 |
CUDA_R_32F |
CUDA_R_32F |
||
CUBLAS_COMPUTE_32F_FAST_16F or CUBLAS_COMPUTE_32F_FAST_16BF or CUBLAS_COMPUTE_32F_FAST_TF32 |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F or CUBLAS_COMPUTE_64F_PEDANTIC |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
computeType
parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpSyrk
#
cublasMpStatus_t cublasMpSyrk(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int64_t n,
int64_t k,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
cublasComputeType_t computeType,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(C = \alpha\text{op}(A)\text{op}(A)^{T} + \beta C\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_T}$}} \\ \end{matrix} \right.\)
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates whether the lower or upper part of matrix C is stored; the other symmetric part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A): non-transpose or transpose. |
n |
Host |
In |
Number of rows of sub(A) and sub(C). |
k |
Host |
In |
Number of columns of sub(A). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In/Out |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice bytes. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpSyrk_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost bytes. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpSyrk_bufferSize(). |
Compute Type |
Scale Type (alpha and beta) |
Atype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
computeType
parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpSyrk_bufferSize
#
cublasMpStatus_t cublasMpSyrk_bufferSize(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int64_t n,
int64_t k,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
cublasComputeType_t computeType,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates whether the lower or upper part of matrix C is stored; the other symmetric part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A): non-transpose or transpose. |
n |
Host |
In |
Number of rows of sub(A) and sub(C). |
k |
Host |
In |
Number of columns of sub(A). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpSyrk(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpSyrk(). |
Compute Type |
Scale Type (alpha and beta) |
Atype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
computeType
parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpGeadd
#
cublasMpStatus_t cublasMpGeadd(
cublasMpHandle_t handle,
cublasOperation_t trans,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(C = \alpha\text{op}(A) + \beta C\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_T}$}} \\ A^{H} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_C}$}} \\ \end{matrix} \right.\)
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
trans |
Host |
In |
Operation op(A) that is non- or (conj.) transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(A) and sub(C). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In/Out |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice bytes. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpGeadd_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost bytes. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpGeadd_bufferSize(). |
Data Type of A |
computeType |
Output Data Type |
---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cublasMpGeadd_bufferSize
#
cublasMpStatus_t cublasMpGeadd_bufferSize(
cublasMpHandle_t handle,
cublasOperation_t trans,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
trans |
Host |
In |
Operation op(A) that is non- or (conj.) transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(A) and sub(C). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpGeadd(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpGeadd(). |
Data Type of A |
computeType |
Output Data Type |
---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cublasMpTradd
#
cublasMpStatus_t cublasMpTradd(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(C = \alpha\text{op}(A) + \beta C\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_T}$}} \\ A^{H} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_C}$}} \\ \end{matrix} \right.\)
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates whether the lower or upper part of matrix C is stored; the other part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A) that is non- or (conj.) transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(A) and sub(C). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In/Out |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice bytes. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpTradd_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost bytes. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpTradd_bufferSize(). |
Data Type of A |
computeType |
Output Data Type |
---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cublasMpTradd_bufferSize
#
cublasMpStatus_t cublasMpTradd_bufferSize(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates whether the lower or upper part of matrix C is stored; the other part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A) that is non- or (conj.) transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(A) and sub(C). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpTradd(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpTradd(). |
Data Type of A |
computeType |
Output Data Type |
---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |