cuBLASMp C API¶
Library Management¶
cublasMpCreate
¶
cublasStatus_t cublasMpCreate(
cublasMpHandle_t *handle,
cudaStream_t stream);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
Out |
cuBLASMp library handle |
stream |
Host |
In |
Stream that will be assigned to the handle. |
cublasMpDestroy
¶
cublasStatus_t cublasMpDestroy(
cublasMpHandle_t handle);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In/Out |
cuBLASMp library handle to destroy |
cublasMpGetVersion
¶
cublasStatus_t cublasMpGetVersion(
cublasMpHandle_t handle,
int *version);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle |
version |
Host |
Out |
cuBLASMp library version, encoded as a single integer value. |
cublasMpSetMathMode
¶
cublasStatus_t cublasMpSetMathMode(
cublasMpHandle_t handle,
cublasMath_t mode);
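Sets the compute precision mode as defined by cublasMath_t. Modes may be combined with a bitwise OR (except the deprecated CUBLAS_TENSOR_OP_MATH).
For example, cublasMpSetMathMode(handle, CUBLAS_DEFAULT_MATH | CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION).
Please note that the default math mode is CUBLAS_DEFAULT_MATH.
Parameter |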
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle |
mode |
Host |
In |
cuBLAS math mode. |
cublasMpGetMathMode
¶
cublasStatus_t cublasMpGetMathMode(
cublasMpHandle_t handle,
cublasMath_t* mode);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle |
mode |
Host |
Out |
cuBLAS math mode. |
Grid Management¶
cublasMpGridCreate
¶
cublasStatus_t cublasMpGridCreate(
cublasMpHandle_t handle,
int64_t nprow,
int64_t npcol,
int64_t myprow,
int64_t mypcol,
cal_comm_t comm,
cublasMpGrid_t* grid);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle |
nprow |
Host |
In |
Number of process rows in the grid. |
npcol |
Host |
In |
Number of process columns in the grid. |
myprow |
Host |
In |
Row rank of the current process. |
mypcol |
Host |
In |
Column rank of the current process. |
comm |
Host |
In |
Communicator associated with the grid. |
grid |
Host |
In/Out |
Pointer to a grid object. |
cublasMpGridDestroy
¶
cublasStatus_t cublasMpGridDestroy(
cublasMpHandle_t handle,
cublasMpGrid_t grid);
Destroys the grid object.
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle |
grid |
Host |
In/Out |
Grid object to destroy. |
Matrix Management¶
cublasMpMatrixDescriptorCreate
¶
cublasStatus_t cublasMpMatrixDescriptorCreate(
cublasMpHandle_t handle,
int64_t m,
int64_t n,
int64_t mb,
int64_t nb,
int64_t rsrc,
int64_t csrc,
int64_t lld,
cudaDataType_t type,
cublasMpGrid_t grid,
cublasMpMatrixDescriptor_t* desc);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle |
m |
Host |
In |
Number of rows in the global matrix. |
n |
Host |
In |
Number of columns in the global matrix. |
mb |
Host |
In |
Blocking factor used to distribute the rows of the global matrix. |
nb |
Host |
In |
Blocking factor used to distribute the columns of the global matrix. |
rsrc |
Host |
In |
Row rank of the process that owns the first row block of the global matrix. |
csrc |
Host |
In |
Column rank of the process that owns the first column block of the global matrix. |
lld |
Host |
In |
Leading dimension of the local matrix. |
type |
Host |
In |
Data type of the matrix. |
grid |
Host |
In |
Grid object associated with the matrix descriptor |
desc |
Host |
Out |
Matrix descriptor object initialized by this function. |
Supported values of the dataType argument are listed below.
Data Type of A |
Description |
---|---|
CUDA_R_32F |
Single precision real values. |
CUDA_R_64F |
Double precision real values. |
CUDA_C_32F |
Single precision complex values. |
CUDA_C_64F |
Double precision complex values. |
cublasMpMatrixDescriptorDestroy
¶
cublasStatus_t cublasMpMatrixDescriptorDestroy(
cublasMpHandle_t handle,
cublasMpMatrixDescriptor_t desc);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle |
desc |
Host |
In/Out |
Matrix descriptor object to destroy. |
Utility¶
cublasMpNumroc
¶
int64_t cublasMpNumroc(
int64_t n,
int64_t nb,
uint32_t iproc,
uint32_t isrcproc,
uint32_t nprocs);
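Computes the number of rows or columns of the distributed matrix owned by the process indicated by the iproc argument.
Parameter |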
Memory |
In/Out |
Description |
---|---|---|---|
n |
Host |
In |
Number of rows or columns in the global distributed matrix. |
nb |
Host |
In |
Row or column blocking size of the global matrix. |
iproc |
Host |
In |
The coordinate of the process whose local array row or column is to be determined. |
isrcproc |
Host |
In |
The coordinate of the process that owns the first row or column of the distributed matrix. |
nprocs |
Host |
In |
The total number of row or column processes over which the matrix is distributed. |
cublasMpGemr2D
¶
cublasStatus_t cublasMpGemr2D(
cublasMpHandle_t handle,
int64_t m,
int64_t n,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost,
cal_comm_t global_comm);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
m |
Host |
In |
Number of rows of sub(A) and sub(B). |
n |
Host |
In |
Number of columns of sub(A) and sub(B). |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. descA’s grid value must be set to null in processes that are not part of the grid of A. |
b |
Device |
Out |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. descB’s grid value must be set to null in processes that are not part of the grid of B. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpGemr2D_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpGemr2D_bufferSize(). |
global_comm |
Host |
In |
A communicator containing at least the union of all processes in the communicators of A and B. All processes in the communicator must call this function, even if they do not own a piece of either matrix. |
cublasMpGemr2D_bufferSize
¶
cublasStatus_t cublasMpGemr2D_bufferSize(
cublasMpHandle_t handle,
int64_t m,
int64_t n,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost,
cal_comm_t global_comm);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
m |
Host |
In |
Number of rows of sub(A) and sub(B). |
n |
Host |
In |
Number of columns of sub(A) and sub(B). |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. descA’s grid value must be set to null in processes that are not part of the grid of A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. descB’s grid value must be set to null in processes that are not part of the grid of B. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpGemr2D(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpGemr2D(). |
global_comm |
Host |
In |
A communicator containing at least the union of all processes in the communicators of A and B. All processes in the communicator must call this function, even if they do not own a piece of either matrix. |
cublasMpTrmr2D
¶
cublasStatus_t cublasMpTrmr2D(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasDiagType_t diag,
int64_t m,
int64_t n,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost,
cal_comm_t global_comm);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. |
diag |
Host |
In |
Indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. |
m |
Host |
In |
Number of rows of sub(A) and sub(B). |
n |
Host |
In |
Number of columns of sub(A) and sub(B). |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. descA’s grid value must be set to null in processes that are not part of the grid of A. |
b |
Device |
Out |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. descB’s grid value must be set to null in processes that are not part of the grid of B. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpTrmr2D_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpTrmr2D_bufferSize(). |
global_comm |
Host |
In |
A communicator containing at least the union of all processes in the communicators of A and B. All processes in the communicator must call this function, even if they do not own a piece of either matrix. |
cublasMpTrmr2D_bufferSize
¶
cublasStatus_t cublasMpTrmr2D_bufferSize(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasDiagType_t diag,
int64_t m,
int64_t n,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost,
cal_comm_t global_comm);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. |
diag |
Host |
In |
Indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. |
m |
Host |
In |
Number of rows of sub(A) and sub(B). |
n |
Host |
In |
Number of columns of sub(A) and sub(B). |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. descA’s grid value must be set to null in processes that are not part of the grid of A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. descB’s grid value must be set to null in processes that are not part of the grid of B. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpTrmr2D(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpTrmr2D(). |
global_comm |
Host |
In |
A communicator containing at least the union of all processes in the communicators of A and B. All processes in the communicator must call this function, even if they do not own a piece of either matrix. |
Logging¶
cublasMpLoggerSetCallback
¶
cublasStatus_t cublasMpLoggerSetCallback(
cublasMpLoggerCallback_t callback);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
callback |
Host |
In |
Pointer to a callback function. See cublasMpLoggerCallback_t. |
Warning
This is an experimental feature.
cublasMpLoggerSetFile
¶
cublasStatus_t cublasMpLoggerSetFile(
FILE *file);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
file |
Host |
In |
Pointer to an open file. The file must be opened with write permission. |
Warning
This is an experimental feature.
cublasMpLoggerOpenFile
¶
cublasStatus_t cublasMpLoggerOpenFile(
const char* logFile);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
logFile |
Host |
In |
Path of the logging output file. |
Warning
This is an experimental feature.
cublasMpLoggerSetLevel
¶
cublasStatus_t cublasMpLoggerSetLevel(
int level);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
level |
Host |
In |
Value of the logging level. See the cuBLASMp logging documentation for the available levels. |
Warning
This is an experimental feature.
cublasMpLoggerSetMask
¶
cublasStatus_t cublasMpLoggerSetMask(
int mask);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
mask |
Host |
In |
Value of the logging mask. See the cuBLASMp logging documentation for the available mask values. |
Warning
This is an experimental feature.
cublasMpLoggerForceDisable
¶
cublasStatus_t cublasMpLoggerForceDisable(
int level);
Warning
This is an experimental feature.
Dense Linear Algebra APIs¶
cublasMpTrsm
¶
cublasStatus_t cublasMpTrsm(
cublasMpHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
cublasComputeType_t computeType,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(\left\{ \begin{matrix} {\text{op}(A)X = \alpha B} & {\text{if }\textsf{side == $\mathrm{CUBLAS\_SIDE\_LEFT}$}} \\ {X\text{op}(A) = \alpha B} & {\text{if }\textsf{side == $\mathrm{CUBLAS\_SIDE\_RIGHT}$}} \\ \end{matrix} \right.\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_T}$}} \\ A^{H} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_C}$}} \\ \end{matrix} \right.\)
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
side |
Host |
In |
Indicates if matrix A is on the left or right of X. |
uplo |
Host |
In |
Indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A) that is non- or (conj.) transpose. |
diag |
Host |
In |
Indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. |
m |
Host |
In |
Number of rows of matrix sub(B), with matrix sub(A) sized accordingly. |
n |
Host |
In |
Number of columns of matrix sub(B), with matrix sub(A) sized accordingly. |
alpha |
Host |
In |
Scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A |
b |
Device |
In/Out |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpTrsm_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpTrsm_bufferSize(). |
Compute Type |
Scale Type (alpha and beta) |
Atype/Btype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpTrsm_bufferSize
¶
cublasStatus_t cublasMpTrsm_bufferSize(
cublasMpHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
cublasDiagType_t diag,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
cublasComputeType_t computeType,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
side |
Host |
In |
Indicates if matrix A is on the left or right of X. |
uplo |
Host |
In |
Indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A) that is non- or (conj.) transpose. |
diag |
Host |
In |
Indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. |
m |
Host |
In |
Number of rows of matrix sub(B), with matrix sub(A) sized accordingly. |
n |
Host |
In |
Number of columns of matrix sub(B), with matrix sub(A) sized accordingly. |
alpha |
Host |
In |
Scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpTrsm(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpTrsm(). |
Compute Type |
Scale Type (alpha and beta) |
Atype/Btype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpGemm
¶
cublasStatus_t cublasMpGemm(
cublasMpHandle_t handle,
cublasOperation_t transA,
cublasOperation_t transB,
int64_t m,
int64_t n,
int64_t k,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
cublasComputeType_t computeType,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(C = \alpha\text{op}(A)\text{op}(B) + \beta C\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{transA == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{transA == $\mathrm{CUBLAS\_OP\_T}$}} \\ A^{H} & {\text{if }\textsf{transA == $\mathrm{CUBLAS\_OP\_C}$}} \\ \end{matrix} \right.\)
trans |
Form of the linear system |
---|---|
CUBLAS_OP_N |
\(sub(A) \cdot X = sub(B)\) |
CUBLAS_OP_T |
\(sub(A)^T \cdot X = sub(B)\) |
CUBLAS_OP_C |
\(sub(A)^H \cdot X = sub(B)\) |
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
transA |
Host |
In |
Operation op(A) that is non- or (conj.) transpose. |
transB |
Host |
In |
Operation op(B) that is non- or (conj.) transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(B) and sub(C). |
k |
Host |
In |
Number of columns of sub(A) and rows of sub(B). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In/Out |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpGemm_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpGemm_bufferSize(). |
Compute Type |
Scale Type (alpha and beta) |
Atype/Btype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_16F or CUBLAS_COMPUTE_16F_PEDANTIC |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_16F |
CUDA_R_16F |
||
CUDA_R_8I |
CUDA_R_32F |
||
CUDA_R_16BF |
CUDA_R_32F |
||
CUDA_R_16F |
CUDA_R_32F |
||
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_C_32F |
CUDA_C_8I |
CUDA_C_32F |
|
CUDA_C_32F |
CUDA_C_32F |
||
CUBLAS_COMPUTE_32F_FAST_16F or CUBLAS_COMPUTE_32F_FAST_16BF or CUBLAS_COMPUTE_32F_FAST_TF32 |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F or CUBLAS_COMPUTE_64F_PEDANTIC |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpGemm_bufferSize
¶
cublasStatus_t cublasMpGemm_bufferSize(
cublasMpHandle_t handle,
cublasOperation_t transA,
cublasOperation_t transB,
int64_t m,
int64_t n,
int64_t k,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* b,
int64_t ib,
int64_t jb,
cublasMpMatrixDescriptor_t descB,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
cublasComputeType_t computeType,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
transA |
Host |
In |
Operation op(A) that is non- or (conj.) transpose. |
transB |
Host |
In |
Operation op(B) that is non- or (conj.) transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(B) and sub(C). |
k |
Host |
In |
Number of columns of sub(A) and rows of sub(B). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
b |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. |
ib |
Host |
In |
Row index of the first row of the sub(B). |
jb |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpGemm(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpGemm(). |
Compute Type |
Scale Type (alpha and beta) |
Atype/Btype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_16F or CUBLAS_COMPUTE_16F_PEDANTIC |
CUDA_R_16F |
CUDA_R_16F |
CUDA_R_16F |
CUBLAS_COMPUTE_32F or CUBLAS_COMPUTE_32F_PEDANTIC |
CUDA_R_32F |
CUDA_R_16BF |
CUDA_R_16BF |
CUDA_R_16F |
CUDA_R_16F |
||
CUDA_R_8I |
CUDA_R_32F |
||
CUDA_R_16BF |
CUDA_R_32F |
||
CUDA_R_16F |
CUDA_R_32F |
||
CUDA_R_32F |
CUDA_R_32F |
||
CUDA_C_32F |
CUDA_C_8I |
CUDA_C_32F |
|
CUDA_C_32F |
CUDA_C_32F |
||
CUBLAS_COMPUTE_32F_FAST_16F or CUBLAS_COMPUTE_32F_FAST_16BF or CUBLAS_COMPUTE_32F_FAST_TF32 |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F or CUBLAS_COMPUTE_64F_PEDANTIC |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpSyrk
¶
cublasStatus_t cublasMpSyrk(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int64_t n,
int64_t k,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
cublasComputeType_t computeType,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(C = \alpha\text{op}(A)\text{op}(A)^{T} + \beta C\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_T}$}} \\ \end{matrix} \right.\)
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A) that is non- or transpose. |
n |
Host |
In |
Number of rows of sub(A) and sub(C). |
k |
Host |
In |
Number of columns of sub(A). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In/Out |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpSyrk_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpSyrk_bufferSize(). |
Compute Type |
Scale Type (alpha and beta) |
Atype/Btype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpSyrk_bufferSize
¶
cublasStatus_t cublasMpSyrk_bufferSize(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int64_t n,
int64_t k,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
cublasComputeType_t computeType,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A) that is non- or transpose. |
n |
Host |
In |
Number of rows of sub(A) and sub(C). |
k |
Host |
In |
Number of columns of sub(A). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
computeType |
Host |
In |
cuBLAS compute type used for computations. See table below for supported combinations. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpSyrk(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpSyrk(). |
Compute Type |
Scale Type (alpha and beta) |
Atype/Btype |
Ctype |
---|---|---|---|
CUBLAS_COMPUTE_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
|
CUBLAS_COMPUTE_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
The computeType parameter provided to this function is used only for internal matrix-matrix multiplications.
cublasMpGeadd
¶
cublasStatus_t cublasMpGeadd(
cublasMpHandle_t handle,
cublasOperation_t trans,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(C = \alpha\text{op}(A) + \beta C\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_T}$}} \\ A^{H} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_C}$}} \\ \end{matrix} \right.\)
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
trans |
Host |
In |
Operation op(A): non-transpose, transpose, or conjugate transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(A) and sub(C). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In/Out |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpGeadd_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpGeadd_bufferSize(). |
Data Type of A |
computeType |
Output Data Type |
---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cublasMpGeadd_bufferSize
¶
cublasStatus_t cublasMpGeadd_bufferSize(
cublasMpHandle_t handle,
cublasOperation_t trans,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
trans |
Host |
In |
Operation op(A): non-transpose, transpose, or conjugate transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(A) and sub(C). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpGeadd(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpGeadd(). |
Data Type of A |
computeType |
Output Data Type |
---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cublasMpTradd
¶
cublasStatus_t cublasMpTradd(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
void* d_work,
size_t workspaceSizeInBytesOnDevice,
void* h_work,
size_t workspaceSizeInBytesOnHost);
\(C = \alpha\text{op}(A) + \beta C\)
\(\text{op}(A) = \left\{ \begin{matrix} A & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_N}$}} \\ A^{T} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_T}$}} \\ A^{H} & {\text{if }\textsf{trans == $\mathrm{CUBLAS\_OP\_C}$}} \\ \end{matrix} \right.\)
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates whether the lower or upper part of matrix C is stored; the other part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A): non-transpose, transpose, or conjugate transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(A) and sub(C). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In/Out |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
d_work |
Device |
Out |
Device workspace of size workspaceSizeInBytesOnDevice. |
workspaceSizeInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cublasMpTradd_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size workspaceSizeInBytesOnHost. |
workspaceSizeInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cublasMpTradd_bufferSize(). |
Data Type of A |
computeType |
Output Data Type |
---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cublasMpTradd_bufferSize
¶
cublasStatus_t cublasMpTradd_bufferSize(
cublasMpHandle_t handle,
cublasFillMode_t uplo,
cublasOperation_t trans,
int64_t m,
int64_t n,
const void* alpha,
const void* a,
int64_t ia,
int64_t ja,
cublasMpMatrixDescriptor_t descA,
const void* beta,
void* c,
int64_t ic,
int64_t jc,
cublasMpMatrixDescriptor_t descC,
size_t* workspaceSizeInBytesOnDevice,
size_t* workspaceSizeInBytesOnHost);
Parameter |
Memory |
In/Out |
Description |
---|---|---|---|
handle |
Host |
In |
cuBLASMp library handle. |
uplo |
Host |
In |
Indicates whether the lower or upper part of matrix C is stored; the other part is not referenced and is inferred from the stored elements. |
trans |
Host |
In |
Operation op(A): non-transpose, transpose, or conjugate transpose. |
m |
Host |
In |
Number of rows of sub(A) and sub(C). |
n |
Host |
In |
Number of columns of sub(A) and sub(C). |
alpha |
Host |
In |
<type> scalar used for multiplication. |
a |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
ia |
Host |
In |
Row index of the first row of the sub(A). |
ja |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
beta |
Host |
In |
<type> scalar used for multiplication. |
c |
Device |
In |
Pointer to the first entry of the local portion of the global matrix C. |
ic |
Host |
In |
Row index of the first row of the sub(C). |
jc |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
workspaceSizeInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cublasMpTradd(). |
workspaceSizeInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cublasMpTradd(). |
Data Type of A |
computeType |
Output Data Type |
---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |