cuSOLVERMp C API#
Library Management#
cusolverMpCreate#
cusolverStatus_t cusolverMpCreate(
cusolverMpHandle_t *handle,
int device,
cudaStream_t stream)
Creates the cuSOLVERMp library handle and associates it with the given device and the CUDA stream stream.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
Out |
cuSOLVERMp library handle. |
device |
Host |
In |
Device that will be assigned to the handle. |
stream |
Host |
In |
Stream that will be assigned to the handle. |
cusolverMpDestroy#
cusolverStatus_t cusolverMpDestroy(
cusolverMpHandle_t handle)
Destroys the cuSOLVERMp library handle associated with the device. Only one handle per process and per GPU is supported.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In/Out |
cuSOLVERMp library handle. |
cusolverMpSetStream#
cusolverStatus_t cusolverMpSetStream(
cusolverMpHandle_t handle,
cudaStream_t stream)
Sets the CUDA stream associated with the handle.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
stream |
Host |
In |
New stream associated with the handle. |
cusolverMpGetStream#
cusolverStatus_t cusolverMpGetStream(
cusolverMpHandle_t handle,
cudaStream_t *stream)
Returns the CUDA stream associated with the handle.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
stream |
Host |
Out |
Stream associated with the handle. |
cusolverMpGetVersion#
cusolverStatus_t cusolverMpGetVersion(
cusolverMpHandle_t handle,
int *version)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
version |
Host |
Out |
cuSOLVERMp library version. Value is |
cusolverMpSetMathMode#
cusolverStatus_t cusolverMpSetMathMode(
cusolverMpHandle_t handle,
cusolverMathMode_t mode)
Note
Please note that the workspace sizes returned by *_bufferSize APIs may depend on the math mode.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
mode |
Host |
In |
Math mode to be set for the handle. Available options are |
cusolverMpGetMathMode#
cusolverStatus_t cusolverMpGetMathMode(
cusolverMpHandle_t handle,
cusolverMathMode_t *mode)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
mode |
Host |
Out |
Current math mode of the handle. |
cusolverMpSetEmulationStrategy#
cusolverStatus_t cusolverMpSetEmulationStrategy(
cusolverMpHandle_t handle,
cudaEmulationStrategy_t strategy)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
strategy |
Host |
In |
Emulation strategy to be set for the handle. Available options are |
CUSOLVER_FP32_EMULATED_BF16X9_MATH.
cusolverMpGetEmulationStrategy#
cusolverStatus_t cusolverMpGetEmulationStrategy(
cusolverMpHandle_t handle,
cudaEmulationStrategy_t *strategy)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
strategy |
Host |
Out |
Current emulation strategy of the handle. |
Grid Management#
cusolverMpCreateDeviceGrid#
cusolverStatus_t cusolverMpCreateDeviceGrid(
cusolverMpHandle_t handle,
cusolverMpGrid_t *grid,
ncclComm_t comm,
int32_t numRowDevices,
int32_t numColDevices,
cusolverMpGridMapping_t mapping)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
grid |
Host |
Out |
Grid object to be initialized. |
comm |
Host |
In |
Communicator that will be associated with the grid. |
numRowDevices |
Host |
In |
How many process rows the grid will contain. |
numColDevices |
Host |
In |
How many process columns the grid will contain. |
mapping |
Host |
In |
How to map processes to the grid. See description of cusolverMpGridMapping_t for further details. |
cusolverMpDestroyGrid#
cusolverStatus_t cusolverMpDestroyGrid(
cusolverMpGrid_t grid)
Destroys the grid object.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
grid |
Host |
In/Out |
Grid object to be destroyed. |
Memory Management#
cusolverMpBufferRegister#
cusolverStatus_t cusolverMpBufferRegister(
cusolverMpGrid_t grid,
void *ptr,
size_t size)
Registers a buffer with the NCCL communicator of the grid using ncclCommWindowRegister with NCCL_WIN_COLL_SYMMETRIC. Pre-registering workspace buffers can improve the performance of NCCL collective operations by enabling hardware-accelerated communication paths.
The buffer must be compatible with NCCL symmetric memory registration, for example a buffer allocated with ncclMemAlloc. Registration is idempotent: re-registering the same pointer with the same size is a no-op.
A registration is identified by its ptr and size values. Re-registering the same pointer with a different size is invalid.
If ptr is not compatible with NCCL symmetric memory registration, this function returns CUSOLVER_STATUS_NOT_SUPPORTED.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
grid |
Host |
In |
Grid object (provides the NCCL communicator). |
ptr |
Device |
In |
Device buffer compatible with NCCL symmetric memory registration. |
size |
Host |
In |
Buffer size in bytes. |
cusolverMpBufferDeregister#
cusolverStatus_t cusolverMpBufferDeregister(
cusolverMpGrid_t grid,
void *ptr)
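Deregisters a buffer previously registered with cusolverMpBufferRegister, identified by its ptr value. Should be called before freeing the underlying allocation; otherwise behavior is undefined.
If ptr does not identify a registered buffer, this function returns CUSOLVER_STATUS_INVALID_VALUE.
Parameter |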
Memory |
In/Out |
Description |
|---|---|---|---|
grid |
Host |
In |
Grid object (provides the NCCL communicator). |
ptr |
Device |
In |
Device buffer to deregister. |
Matrix Management#
cusolverMpCreateMatrixDesc#
cusolverStatus_t cusolverMpCreateMatrixDesc(
cusolverMpMatrixDescriptor_t *desc,
cusolverMpGrid_t grid,
cudaDataType dataType,
int64_t M_A,
int64_t N_A,
int64_t MB_A,
int64_t NB_A,
uint32_t RSRC_A,
uint32_t CSRC_A,
int64_t LLD_A)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
desc |
Host |
Out |
Matrix descriptor object initialized by this function. |
grid |
Host |
In |
Grid object associated with the global matrix A. |
dataType |
Host |
In |
Data type of the matrix A. |
M_A |
Host |
In |
Number of rows in the global matrix A. |
N_A |
Host |
In |
Number of columns in the global matrix A. |
MB_A |
Host |
In |
Blocking factor used to distribute the rows of the global matrix A. |
NB_A |
Host |
In |
Blocking factor used to distribute the columns of the global matrix A. |
RSRC_A |
Host |
In |
Process row over which the first row of the matrix A is distributed. Only the value of |
CSRC_A |
Host |
In |
Process column over which the first column of the matrix A is distributed. Only the value of |
LLD_A |
Host |
In |
Leading dimension of the local matrix. |
The supported values of the dataType argument are listed below:
Data Type of A |
Description |
|---|---|
CUDA_R_16F |
Half precision real values. |
CUDA_R_16BF |
bfloat16 real values. |
CUDA_R_32I |
32-bit integer values. |
CUDA_R_64I |
64-bit integer values. |
CUDA_R_32F |
Single precision real values. |
CUDA_R_64F |
Double precision real values. |
CUDA_C_32F |
Single precision complex values. |
CUDA_C_64F |
Double precision complex values. |
cusolverMpDestroyMatrixDesc#
cusolverStatus_t cusolverMpDestroyMatrixDesc(
cusolverMpMatrixDescriptor_t desc)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
desc |
Host |
In/Out |
Matrix descriptor object destroyed by this function. |
Newton-Schulz Properties#
cusolverMpNewtonSchulzDescriptorCreate#
cusolverStatus_t cusolverMpNewtonSchulzDescriptorCreate(
cusolverMpNewtonSchulzDescriptor_t *nsDesc)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
nsDesc |
Host |
Out |
cusolverMpNewtonSchulzDescriptor_t descriptor to be created. |
cusolverMpNewtonSchulzDescriptorDestroy#
cusolverStatus_t cusolverMpNewtonSchulzDescriptorDestroy(
cusolverMpNewtonSchulzDescriptor_t nsDesc)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
nsDesc |
Host |
In/Out |
Newton-Schulz descriptor to be destroyed. |
cusolverMpNewtonSchulzDescriptorSetAttribute#
cusolverStatus_t cusolverMpNewtonSchulzDescriptorSetAttribute(
cusolverMpNewtonSchulzDescriptor_t nsDesc,
cusolverMpNewtonSchulzDescriptorAttribute_t attr,
const void *buf,
size_t sizeInBytes)
CUSOLVERMP_NEWTON_SCHULZ_DESCRIPTOR_ATTRIBUTE_NORMALIZE (int, default 1): When set to 1, the input matrix is normalized by its Frobenius norm before the iterations begin. Normalization is required for convergence. Set to 0 only when the input is already normalized (e.g., to avoid redundant normalization in a pipeline that pre-normalizes the matrix).
CUSOLVERMP_NEWTON_SCHULZ_DESCRIPTOR_ATTRIBUTE_REDUCE_VIA_COMPUTE_TYPE (int, default 0): When set to 1, the distributed Gram-matrix reduction path may communicate/reduce intermediate X^T X data using the compute type when the value type differs from the compute type.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
nsDesc |
Host |
In/Out |
Newton-Schulz descriptor. |
attr |
Host |
In |
cusolverMpNewtonSchulzDescriptorAttribute_t attribute to set. |
buf |
Host |
In |
Pointer to the attribute value. |
sizeInBytes |
Host |
In |
Size of the attribute value in bytes. |
cusolverMpNewtonSchulzDescriptorGetAttribute#
cusolverStatus_t cusolverMpNewtonSchulzDescriptorGetAttribute(
cusolverMpNewtonSchulzDescriptor_t nsDesc,
cusolverMpNewtonSchulzDescriptorAttribute_t attr,
void *buf,
size_t sizeInBytes,
size_t *sizeInBytesWritten)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
nsDesc |
Host |
In |
Newton-Schulz descriptor. |
attr |
Host |
In |
cusolverMpNewtonSchulzDescriptorAttribute_t attribute to query. |
buf |
Host |
Out |
Buffer to receive the attribute value. |
sizeInBytes |
Host |
In |
Size of the output buffer in bytes. |
sizeInBytesWritten |
Host |
Out |
Number of bytes actually written to buf. |
Utility#
cusolverMpNUMROC#
int64_t cusolverMpNUMROC(
int64_t n,
int64_t nb,
uint32_t iproc,
uint32_t isrcproc,
uint32_t nprocs)
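Computes the number of rows or columns of a distributed matrix owned by the process indicated by the iproc argument.
Parameter |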
Memory |
In/Out |
Description |
|---|---|---|---|
n |
Host |
In |
Number of rows or columns in the global distributed matrix. |
nb |
Host |
In |
Row or column blocking size of the global matrix. |
iproc |
Host |
In |
The coordinate of the process whose local array row or column is to be determined. |
isrcproc |
Host |
In |
The coordinate of the process that owns the first row or column of the distributed matrix. |
nprocs |
Host |
In |
The total number of row or column processes over which the matrix is distributed. |
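Returns the number of rows or columns of the distributed matrix owned by the process indicated by the iproc argument.
cusolverMpMatrixGatherD2H#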
cusolverStatus_t cusolverMpMatrixGatherD2H(
cusolverMpHandle_t handle,
int64_t M,
int64_t N,
const void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
int root,
void *h_dst,
int64_t h_lddst)
Gathers the global distributed matrix A on a buffer provided on process root. The input matrix A is originally distributed using 2D block-cyclic format; on output, h_dst contains the matrix in column-major format.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
M |
Host |
In |
Number of rows of the global distributed matrix A. |
N |
Host |
In |
Number of columns of the global distributed matrix A. |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index in the global matrix A indicating the first row of sub(A). This function does not make any assumptions on the alignment of |
JA |
Host |
In |
Column index in the global matrix A indicating the first column of sub(A). This function does not make any assumptions on the alignment of |
descA |
Host |
In |
Matrix descriptor of the global matrix A. |
root |
Host |
In |
Process ID on which the matrix A will be gathered. |
h_dst |
Host |
Out |
Destination host buffer on |
h_lddst |
Host |
In |
Leading dimension of the |
Warning
This function is meant as a utility function to verify correctness of the data layouts and it is not intended to achieve high performance on large inputs.
cusolverMpMatrixScatterH2D#
cusolverStatus_t cusolverMpMatrixScatterH2D(
cusolverMpHandle_t handle,
int64_t M,
int64_t N,
void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
int root,
const void *h_src,
int64_t h_ldsrc)
Scatters the matrix h_src from root process to a distributed global matrix A.
h_src is stored in column-major format. On output, d_A contains the local portions of the global matrix A distributed in 2D block-cyclic format.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
M |
Host |
In |
Number of rows of the global distributed matrix A. |
N |
Host |
In |
Number of columns of the global distributed matrix A. |
d_A |
Device |
Out |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index in the global matrix A indicating the first row of sub(A). This function does not make any assumptions on the alignment of |
JA |
Host |
In |
Column index in the global matrix A indicating the first column of sub(A). This function does not make any assumptions on the alignment of |
descA |
Host |
In |
Matrix descriptor of the global matrix A. |
root |
Host |
In |
Process ID which the matrix A will be scattered from. |
h_src |
Host |
In |
Source buffer on |
h_ldsrc |
Host |
In |
Leading dimension of the |
Warning
This function is meant as a utility function to verify correctness of the data layouts and it is not intended to achieve high performance on large inputs.
Logging#
cusolverMpLoggerSetCallback#
cusolverStatus_t cusolverMpLoggerSetCallback(
cusolverMpLoggerCallback_t callback)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
callback |
Host |
In |
Pointer to a callback function. See cusolverMpLoggerCallback_t. |
Warning
This is an experimental feature.
cusolverMpLoggerSetFile#
cusolverStatus_t cusolverMpLoggerSetFile(
FILE *file)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
file |
Host |
In |
Pointer to an open file. File should have write permission. |
Warning
This is an experimental feature.
cusolverMpLoggerOpenFile#
cusolverStatus_t cusolverMpLoggerOpenFile(
const char* logFile)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
logFile |
Host |
In |
Path of the logging output file. |
Warning
This is an experimental feature.
cusolverMpLoggerSetLevel#
cusolverStatus_t cusolverMpLoggerSetLevel(
int level)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
level |
Host |
In |
Value of the logging level. See |
Warning
This is an experimental feature.
cusolverMpLoggerSetMask#
cusolverStatus_t cusolverMpLoggerSetMask(
int mask)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
mask |
Host |
In |
Value of the logging mask. See |
Warning
This is an experimental feature.
cusolverMpLoggerForceDisable#
cusolverStatus_t cusolverMpLoggerForceDisable()
Warning
This is an experimental feature.
Dense Linear Algebra APIs#
cusolverMpGetrf#
cusolverStatus_t cusolverMpGetrf(
cusolverMpHandle_t handle,
int64_t M,
int64_t N,
void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
int64_t *d_ipiv,
cudaDataType_t computeType,
void *d_work,
size_t workspaceInBytesOnDevice,
void *h_work,
size_t workspaceInBytesOnHost,
int *info)
Computes the LU factorization of sub(A). The factorization is computed without pivoting when d_ipiv=NULL.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
M |
Host |
In |
Number of rows of sub(A). |
N |
Host |
In |
Number of columns of sub(A). |
d_A |
Device |
In/Out |
Pointer to the first entry of the local portion of the global matrix A. On output, the sub(A) is overwritten with the L and U factors. |
IA |
Host |
In |
Row index of the first row of the sub(A). This function does not make any assumptions on the alignment of |
JA |
Host |
In |
Column index of the first column of the sub(A). This function does not make any assumptions on the alignment of |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_ipiv |
Device |
Out |
Local array of dimension |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size |
workspaceInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cusolverMpGetrf_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size |
workspaceInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cusolverMpGetrf_bufferSize(). |
info |
Device |
Out |
|
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpGetrf_bufferSize#
cusolverStatus_t cusolverMpGetrf_bufferSize(
cusolverMpHandle_t handle,
int64_t M,
int64_t N,
void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
int64_t *d_ipiv,
cudaDataType_t computeType,
size_t *workspaceInBytesOnDevice,
size_t *workspaceInBytesOnHost)
Computes the host and device workspace sizes needed by cusolverMpGetrf(). The user can pass d_ipiv=NULL so cusolverMpGetrf() will compute the LU factorization of the input matrix A without pivoting.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
M |
Host |
In |
Number of rows of sub(A). |
N |
Host |
In |
Number of columns of sub(A). |
d_A |
Device |
In |
Pointer to the first entry of the local portion of the global matrix A. |
IA |
Host |
In |
Row index of the first row of the sub(A). This function does not make any assumptions on the alignment of |
JA |
Host |
In |
Column index of the first column of the sub(A). This function does not make any assumptions on the alignment of |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_ipiv |
Device |
In |
Indicates a pointer to a distributed integer array. When it is not |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
workspaceInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cusolverMpGetrf(). |
workspaceInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cusolverMpGetrf(). |
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpGetrs#
cusolverStatus_t cusolverMpGetrs(
cusolverMpHandle_t handle,
cublasOperation_t trans,
int64_t N,
int64_t NRHS,
const void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
const int64_t *d_ipiv,
void *d_B,
int64_t IB,
int64_t JB,
cusolverMpMatrixDescriptor_t descB,
cudaDataType_t computeType,
void *d_work,
size_t workspaceInBytesOnDevice,
void *h_work,
size_t workspaceInBytesOnHost,
int *d_info)
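Solves a system of linear equations using the LU factorization computed by cusolverMpGetrf(). The form of the system is selected by trans, which allows solving linear systems of the form:
trans |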
Form of the linear system |
|---|---|
CUBLAS_OP_N |
\(sub(A) \cdot X = sub(B)\) |
CUBLAS_OP_T |
\(sub(A)^T \cdot X = sub(B)\) |
CUBLAS_OP_C |
\(sub(A)^H \cdot X = sub(B)\) |
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
trans |
Host |
In |
Specifies the form of the linear system. Only |
N |
Host |
In |
Number of rows of sub(A). |
NRHS |
Host |
In |
Number of columns of sub(B). |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). This function does not make any assumptions on the alignment of |
JA |
Host |
In |
Column index of the first column of the sub(A). This function does not make any assumptions on the alignment of |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_ipiv |
Device |
In |
Local array of dimension |
d_B |
Device |
In/Out |
Pointer into the local memory to an array of dimension |
IB |
Host |
In |
Row index of the first row of the sub(B). This function does not make any assumptions on the alignment of |
JB |
Host |
In |
Column index of the first column of the sub(B). This function does not make any assumptions on the alignment of |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size |
workspaceInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cusolverMpGetrs_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size |
workspaceInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cusolverMpGetrs_bufferSize(). |
d_info |
Device |
Out |
|
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpGetrs_bufferSize#
cusolverStatus_t cusolverMpGetrs_bufferSize(
cusolverMpHandle_t handle,
cublasOperation_t trans,
int64_t N,
int64_t NRHS,
const void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
const int64_t *d_ipiv,
void *d_B,
int64_t IB,
int64_t JB,
cusolverMpMatrixDescriptor_t descB,
cudaDataType_t computeType,
size_t *workspaceInBytesOnDevice,
size_t *workspaceInBytesOnHost)
d_ipiv=NULL.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
trans |
Host |
In |
Specifies the form of the linear system. Only |
N |
Host |
In |
Number of rows of sub(A). |
NRHS |
Host |
In |
Number of columns of sub(B). |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). This function does not make any assumptions on the alignment of |
JA |
Host |
In |
Column index of the first column of the sub(A). This function does not make any assumptions on the alignment of |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_ipiv |
Device |
In |
Local array of dimension |
d_B |
Device |
In |
Pointer to the first entry of the local portion of the global matrix B. On output, B is overwritten with the solution of the linear system. |
IB |
Host |
In |
Row index of the first row of the sub(B). This function does not make any assumptions on the alignment of |
JB |
Host |
In |
Column index of the first column of the sub(B). This function does not make any assumptions on the alignment of |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
workspaceInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cusolverMpGetrs(). |
workspaceInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cusolverMpGetrs(). |
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpPotrf#
cusolverStatus_t cusolverMpPotrf(
cusolverMpHandle_t handle,
cublasFillMode_t uplo,
int64_t N,
void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
cudaDataType_t computeType,
void *d_work,
size_t workspaceInBytesOnDevice,
void *h_work,
size_t workspaceInBytesOnHost,
int *info)
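Computes the Cholesky factorization of an N-by-N symmetric or Hermitian positive definite distributed matrix sub(A), where sub(A) denotes A(IA:IA+N-1, JA:JA+N-1).
If uplo=CUBLAS_FILL_MODE_UPPER, the factorization has the form
\[sub(A) = U^H \cdot U\]
where U is an upper triangular matrix. If uplo is set to CUBLAS_FILL_MODE_LOWER, the factorization has the form
\[sub(A) = L \cdot L^H\]
where L is a lower triangular matrix.
Parameter |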
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
uplo |
Host |
In |
Specifies if A is upper ( |
N |
Host |
In |
Number of rows and columns of sub(A). |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A). JA must be a multiple of the column blocking dimension NB_A. |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size |
workspaceInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cusolverMpPotrf_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size |
workspaceInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cusolverMpPotrf_bufferSize(). |
info |
Device |
Out |
|
This function requires square blocks (MB_A == NB_A).
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpPotrf_bufferSize#
cusolverStatus_t cusolverMpPotrf_bufferSize(
cusolverMpHandle_t handle,
cublasFillMode_t uplo,
int64_t N,
const void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
cudaDataType_t computeType,
size_t* workspaceInBytesOnDevice,
size_t* workspaceInBytesOnHost)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
uplo |
Host |
In |
Specifies if A is upper ( |
N |
Host |
In |
Number of rows and columns of sub(A). |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). This function does not make any assumptions on the alignment of |
JA |
Host |
In |
Column index of the first column of the sub(A). This function does not make any assumptions on the alignment of |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
workspaceInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cusolverMpPotrf(). |
workspaceInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cusolverMpPotrf(). |
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpPotrs#
cusolverStatus_t cusolverMpPotrs(
cusolverMpHandle_t handle,
cublasFillMode_t uplo,
int64_t N,
int64_t NRHS,
const void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
void *d_B,
int64_t IB,
int64_t JB,
cusolverMpMatrixDescriptor_t descB,
cudaDataType_t computeType,
void *d_work,
size_t workspaceInBytesOnDevice,
void *h_work,
size_t workspaceInBytesOnHost,
int *info)
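Solves a system of linear equations sub(A) · X = sub(B), where sub(A) denotes A(IA:IA+N-1,JA:JA+N-1) and is an N-by-N symmetric or Hermitian positive definite distributed matrix, using the Cholesky factorization:
\[sub(A) = U^H \cdot U\]
computed by cusolverMpPotrf(). sub(B) denotes B(IB:IB+N-1,JB:JB+NRHS-1).
Parameter |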
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
uplo |
Host |
In |
Specifies if A is upper ( |
N |
Host |
In |
Number of rows and columns of sub(A). |
NRHS |
Host |
In |
Number of columns of sub(B). |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_B |
Device |
In/Out |
Pointer into the local memory to an array of dimension |
IB |
Host |
In |
Row index of the first row of the sub(B). This function does not make any assumptions on the alignment of |
JB |
Host |
In |
Column index of the first column of the sub(B). This function does not make any assumptions on the alignment of |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size |
workspaceInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cusolverMpPotrs_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size |
workspaceInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cusolverMpPotrs_bufferSize(). |
info |
Device |
Out |
|
This function requires square blocks (MB_A == NB_A) and alignment of sub(A) and sub(B) matrices, meaning (MB_A == MB_B) and (IA == IB).
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
—
cusolverMpPotrs_bufferSize#
cusolverStatus_t cusolverMpPotrs_bufferSize(
cusolverMpHandle_t handle,
cublasFillMode_t uplo,
int64_t n,
int64_t nrhs,
const void *a,
int64_t ia,
int64_t ja,
cusolverMpMatrixDescriptor_t descA,
const void *b,
int64_t ib,
int64_t jb,
cusolverMpMatrixDescriptor_t descB,
cudaDataType_t computeType,
size_t* workspaceInBytesOnDevice,
size_t* workspaceInBytesOnHost)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
uplo |
Host |
In |
Specifies if A is upper ( |
N |
Host |
In |
Number of rows and columns of sub(A). |
NRHS |
Host |
In |
Number of columns of sub(B). |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_B |
Device |
In |
Pointer into the local memory to an array of dimension |
IB |
Host |
In |
Row index of the first row of the sub(B). This function does not make any assumptions on the alignment of |
JB |
Host |
In |
Column index of the first column of the sub(B). This function does not make any assumptions on the alignment of |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
workspaceInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cusolverMpPotrs(). |
workspaceInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cusolverMpPotrs(). |
This function requires square blocks (MB_A == NB_A) and alignment of sub(A) and sub(B) matrices, meaning (MB_A == MB_B) and (IA == IB).
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
—
cusolverMpGeqrf#
cusolverStatus_t cusolverMpGeqrf(
cusolverMpHandle_t handle,
int64_t M,
int64_t N,
void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
void *d_tau,
cudaDataType_t computeType,
void *d_work,
size_t workspaceInBytesOnDevice,
void *h_work,
size_t workspaceInBytesOnHost,
int *info)
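Computes the QR factorization of the M-by-N distributed matrix sub(A), where sub(A) denotes A(IA:IA+M-1, JA:JA+N-1).
The factorization has the form
\[sub(A) = Q \cdot R\]
where Q is represented by elementary reflectors stored in sub(A) together with the scalar factors tau and R is an upper triangular matrix.
Parameter |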
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
M |
Host |
In |
Number of rows of sub(A). |
N |
Host |
In |
Number of columns of sub(A). |
d_A |
Device |
In/Out |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_tau |
Device |
Out |
Pointer into the local memory to an array of dimension |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size |
workspaceInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cusolverMpGeqrf_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size |
workspaceInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cusolverMpGeqrf_bufferSize(). |
info |
Device |
Out |
|
This function requires square blocks (MB_A == NB_A).
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpGeqrf_bufferSize#
cusolverStatus_t cusolverMpGeqrf_bufferSize(
cusolverMpHandle_t handle,
int64_t M,
int64_t N,
void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
cudaDataType_t computeType,
size_t* workspaceInBytesOnDevice,
size_t* workspaceInBytesOnHost)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
M |
Host |
In |
Number of rows of sub(A). |
N |
Host |
In |
Number of columns of sub(A). |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index in the global matrix A indicating the first row of sub(A). This function does not make any assumptions on the alignment of |
JA |
Host |
In |
Column index in the global matrix A indicating the first column of sub(A). This function does not make any assumptions on the alignment of |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
workspaceInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cusolverMpGeqrf(). |
workspaceInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cusolverMpGeqrf(). |
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
—
cusolverMpOrmqr#
cusolverStatus_t cusolverMpOrmqr(
cusolverMpHandle_t handle,
cublasSideMode_t side,
cublasOperation_t trans,
int64_t M,
int64_t N,
int64_t K,
const void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
const void *d_tau,
void *d_C,
int64_t IC,
int64_t JC,
cusolverMpMatrixDescriptor_t descC,
cudaDataType_t computeType,
void *d_work,
size_t workspaceInBytesOnDevice,
void *h_work,
size_t workspaceInBytesOnHost,
int *info)
Multiplies the M-by-N matrix sub(C) = C(IC:IC+M-1, JC:JC+N-1) by the orthogonal matrix Q, where Q is given by the Householder reflectors computed by cusolverMpGeqrf(). The result is op(Q) * sub(C) or sub(C) * op(Q) for side of CUBLAS_SIDE_LEFT and CUBLAS_SIDE_RIGHT respectively. Note that the current implementation only supports CUBLAS_SIDE_LEFT. The number of reflectors K must satisfy K <= M and K <= N for CUBLAS_SIDE_LEFT and CUBLAS_SIDE_RIGHT respectively. op(Q) can be translated to \(Q\), \(Q^T\), \(Q^H\) based on the trans argument CUBLAS_OP_N, CUBLAS_OP_T and CUBLAS_OP_C.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
side |
Host |
In |
Indicate that Q is applied from left or right side. |
trans |
Host |
In |
Indicate that Q is applied with no-transpose or (conj)transpose. |
M |
Host |
In |
Number of rows of sub(C). |
N |
Host |
In |
Number of columns of sub(C). |
K |
Host |
In |
Number of Householder reflectors defining Q. |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A).`JA` must be a multiple of the column blocking dimension |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_tau |
Device |
In |
Pointer into the local memory to an array of dimension |
d_C |
Device |
In/Out |
Pointer into the local memory to an array of dimension |
IC |
Host |
In |
Row index of the first row of the sub(C). |
JC |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size |
workspaceInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cusolverMpOrmqr_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size |
workspaceInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cusolverMpOrmqr_bufferSize(). |
info |
Device |
Out |
|
Note: this routine requires square blocks (MB_A == NB_A) and alignment of sub(A) and sub(C) matrices, meaning (MB_A == MB_C) and (IA == IC).
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpOrmqr_bufferSize#
cusolverStatus_t cusolverMpOrmqr_bufferSize(
cusolverMpHandle_t handle,
cublasSideMode_t side,
cublasOperation_t trans,
int64_t M,
int64_t N,
int64_t K,
const void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
const void *d_tau,
void *d_C,
int64_t IC,
int64_t JC,
cusolverMpMatrixDescriptor_t descC,
cudaDataType_t computeType,
size_t* workspaceInBytesOnDevice,
size_t* workspaceInBytesOnHost)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
side |
Host |
In |
Indicate that Q is applied from left or right side. |
trans |
Host |
In |
Indicate that Q is applied with no-transpose or (conj)transpose. |
M |
Host |
In |
Number of rows of sub(C). |
N |
Host |
In |
Number of columns of sub(C). |
K |
Host |
In |
Number of Householder reflectors defining Q. |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A).`JA` must be a multiple of the column blocking dimension |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_tau |
Device |
In |
Pointer into the local memory to an array of dimension |
d_C |
Device |
In |
Pointer into the local memory to an array of dimension |
IC |
Host |
In |
Row index of the first row of the sub(C). |
JC |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
workspaceInBytesOnDevice |
Host |
Out |
The size in bytes of the local device workspace needed by cusolverMpOrmqr(). |
workspaceInBytesOnHost |
Host |
Out |
The size in bytes of the local host workspace needed by cusolverMpOrmqr(). |
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpOrgqr#
cusolverStatus_t cusolverMpOrgqr(
cusolverMpHandle_t handle,
int64_t M,
int64_t N,
int64_t K,
void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
const void *d_tau,
cudaDataType_t computeType,
void *d_work,
size_t workspaceInBytesOnDevice,
void *h_work,
size_t workspaceInBytesOnHost,
int *d_info)
Generates the M-by-N matrix Q with orthonormal columns from the QR factorization computed by cusolverMpGeqrf(). Q is defined as the product of K elementary Householder reflectors of order M: \(Q = H(1) H(2) \cdots H(K)\), where H(i) are the elementary reflectors stored in the lower triangular part of A(IA:IA+M-1, JA:JA+K-1) as returned by cusolverMpGeqrf(), with corresponding scalar factors in d_tau. The dimensions must satisfy M >= N >= K >= 0. When K = 0, the routine sets Q to the identity matrix. On input, d_A contains the Householder reflectors and d_tau the scalar factors as output by cusolverMpGeqrf(). On output, the submatrix A(IA:IA+M-1, JA:JA+N-1) is overwritten with the first N columns of Q.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
M |
Host |
In |
Number of rows of the matrix Q. |
N |
Host |
In |
Number of columns of the matrix Q. |
K |
Host |
In |
Number of elementary reflectors. |
d_A |
Device |
In/Out |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the submatrix. |
JA |
Host |
In |
Column index of the first column of the submatrix. |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_tau |
Device |
In |
Pointer into the local memory to an array of dimension |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size |
workspaceInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cusolverMpOrgqr_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size |
workspaceInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cusolverMpOrgqr_bufferSize(). |
d_info |
Device |
Out |
|
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpOrgqr_bufferSize#
cusolverStatus_t cusolverMpOrgqr_bufferSize(
cusolverMpHandle_t handle,
int64_t M,
int64_t N,
int64_t K,
const void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
const void *d_tau,
cudaDataType_t computeType,
size_t *workspaceInBytesOnDevice,
size_t *workspaceInBytesOnHost)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
M |
Host |
In |
Number of rows of the matrix Q. |
N |
Host |
In |
Number of columns of the matrix Q. |
K |
Host |
In |
Number of elementary reflectors. |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the submatrix. |
JA |
Host |
In |
Column index of the first column of the submatrix. |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_tau |
Device |
In |
Pointer into the local memory to an array of dimension |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
workspaceInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cusolverMpOrgqr(). |
workspaceInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cusolverMpOrgqr(). |
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpGels#
cusolverStatus_t cusolverMpGels(
cusolverMpHandle_t handle,
cublasOperation_t trans,
int64_t M,
int64_t N,
int64_t NRHS,
void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
void *d_B,
int64_t IB,
int64_t JB,
cusolverMpMatrixDescriptor_t descB,
cudaDataType_t computeType,
void *d_work,
size_t workspaceInBytesOnDevice,
void *h_work,
size_t workspaceInBytesOnHost,
int *info)
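Solves overdetermined or underdetermined linear systems involving the M-by-N matrix sub(A) = A(IA:IA+M-1, JA:JA+N-1) or its transpose, using QR or LQ factorization of sub(A). In the current implementation, only the overdetermined case (M >= N) with a no-transpose option is supported, via QR factorization cusolverMpGeqrf(). The right-hand side multi-vector is given in sub(B) = B(IB:IB+M-1, JB:JB+NRHS-1) and the solution multi-vector X is overwritten on the sub(B).
Parameter |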
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
trans |
Host |
In |
Indicates whether the linear system involves sub(A) with no-transpose or (conj)transpose. |
M |
Host |
In |
Number of rows of sub(A). |
N |
Host |
In |
Number of columns of sub(A). |
NRHS |
Host |
In |
Number of right hand side vectors i.e., number of columns of sub(B) and X. |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_B |
Device |
In/Out |
Pointer into the local memory to an array of dimension |
IB |
Host |
In |
Row index of the first row of the sub(B). |
JB |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size |
workspaceInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cusolverMpGels_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size |
workspaceInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cusolverMpGels_bufferSize(). |
info |
Device |
Out |
|
Note: this routine requires square blocks (MB_A == NB_A) and alignment of sub(A) and sub(B) matrices, meaning (MB_A == MB_B) and (IA == IB).
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpGels_bufferSize#
cusolverStatus_t cusolverMpGels_bufferSize(
cusolverMpHandle_t handle,
cublasOperation_t trans,
int64_t M,
int64_t N,
int64_t NRHS,
void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
void *d_B,
int64_t IB,
int64_t JB,
cusolverMpMatrixDescriptor_t descB,
cudaDataType_t computeType,
size_t* workspaceInBytesOnDevice,
size_t* workspaceInBytesOnHost)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
trans |
Host |
In |
Indicates whether the linear system involves sub(A) with no-transpose or (conj)transpose. |
M |
Host |
In |
Number of rows of sub(A). |
N |
Host |
In |
Number of columns of sub(A). |
NRHS |
Host |
In |
Number of right hand side vectors i.e., number of columns of sub(B) and X. |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_B |
Device |
In |
Pointer into the local memory to an array of dimension |
IB |
Host |
In |
Row index of the first row of the sub(B). |
JB |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
workspaceInBytesOnDevice |
Host |
Out |
The size in bytes of the local device workspace needed by cusolverMpGels(). |
workspaceInBytesOnHost |
Host |
Out |
The size in bytes of the local host workspace needed by cusolverMpGels(). |
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpSytrd#
cusolverStatus_t cusolverMpSytrd(
cusolverMpHandle_t handle,
cublasFillMode_t uplo,
int64_t N,
void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
void *d_d,
void *d_e,
void *d_tau,
cudaDataType_t computeType,
void *d_work,
size_t workspaceInBytesOnDevice,
void *h_work,
size_t workspaceInBytesOnHost,
int *info)
Reduces the N-by-N symmetric (Hermitian) matrix sub(A) = A(IA:IA+N-1, JA:JA+N-1) to a tridiagonal form. The current implementation supports CUBLAS_FILL_MODE_LOWER only.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
uplo |
Host |
In |
Indicate that the function uses either upper or lower triangular part of sub(A). |
N |
Host |
In |
Number of rows/columns of square matrix sub(A). |
d_A |
Device |
In/Out |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_d |
Device |
Out |
Pointer into the local memory to an array of dimension |
d_e |
Device |
Out |
Pointer into the local memory to an array of dimension |
d_tau |
Device |
Out |
Pointer into the local memory to an array of dimension |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size |
workspaceInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cusolverMpSytrd_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size |
workspaceInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cusolverMpSytrd_bufferSize(). |
info |
Device |
Out |
|
Note: this routine requires square blocks, i.e. (MB_A == NB_A).
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpSytrd_bufferSize#
cusolverStatus_t cusolverMpSytrd_bufferSize(
cusolverMpHandle_t handle,
cublasFillMode_t uplo,
int64_t N,
void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
void *d_d,
void *d_e,
void *d_tau,
cudaDataType_t computeType,
size_t *workspaceInBytesOnDevice,
size_t *workspaceInBytesOnHost)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
uplo |
Host |
In |
Indicate that the function uses either upper or lower triangular part of sub(A). |
N |
Host |
In |
Number of rows/columns of square matrix sub(A). |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_d |
Device |
In |
Pointer into the local memory to an array of dimension |
d_e |
Device |
In |
Pointer into the local memory to an array of dimension |
d_tau |
Device |
In |
Pointer into the local memory to an array of dimension |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
workspaceInBytesOnDevice |
Host |
Out |
The size in bytes of the local device workspace needed by cusolverMpSytrd(). |
workspaceInBytesOnHost |
Host |
Out |
The size in bytes of the local host workspace needed by cusolverMpSytrd(). |
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpStedc#
cusolverStatus_t cusolverMpStedc(
cusolverMpHandle_t handle,
char *compz,
int64_t N,
void *d_d,
void *d_e,
void *d_Q,
int64_t IQ,
int64_t JQ,
cusolverMpMatrixDescriptor_t descQ,
cudaDataType_t computeType,
void *d_work,
size_t workspaceInBytesOnDevice,
void *h_work,
size_t workspaceInBytesOnHost,
int *info)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
compz |
Host |
In |
Option to compute eigenvalues only( |
N |
Host |
In |
Number of rows/columns of square matrix sub(A). |
d_d |
Device |
In/Out |
Pointer to an array of dimension |
d_e |
Device |
In/Out |
Pointer to an array of dimension |
d_Q |
Device |
Out |
Pointer into the local memory to an array of dimension |
IQ |
Host |
In |
Row index of the first row of the sub(Q). |
JQ |
Host |
In |
Column index of the first column of the sub(Q). |
descQ |
Host |
In |
Matrix descriptor associated to the global matrix Q. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size |
workspaceInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cusolverMpStedc_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size |
workspaceInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cusolverMpStedc_bufferSize(). |
info |
Device |
Out |
|
Note: this routine requires square blocks, i.e. (MB_Q == NB_Q).
Data Type of Tridiagonal Matrix |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpStedc_bufferSize#
cusolverStatus_t cusolverMpStedc_bufferSize(
cusolverMpHandle_t handle,
char *compz,
int64_t N,
void *d_d,
void *d_e,
void *d_Q,
int64_t IQ,
int64_t JQ,
cusolverMpMatrixDescriptor_t descQ,
cudaDataType_t computeType,
size_t *workspaceInBytesOnDevice,
size_t *workspaceInBytesOnHost,
int *iwork)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
compz |
Host |
In |
Option to compute eigenvalues only( |
N |
Host |
In |
Number of rows/columns of square matrix sub(A). |
d_d |
Device |
In |
Pointer to an array of dimension |
d_e |
Device |
In |
Pointer to an array of dimension |
d_Q |
Device |
In |
Pointer into the local memory to an array of dimension |
IQ |
Host |
In |
Row index of the first row of the sub(Q). |
JQ |
Host |
In |
Column index of the first column of the sub(Q). |
descQ |
Host |
In |
Matrix descriptor associated to the global matrix Q. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
workspaceInBytesOnDevice |
Host |
Out |
The size in bytes of the local device workspace needed by the routine cusolverMpStedc(). |
workspaceInBytesOnHost |
Host |
Out |
The size in bytes of the local host workspace needed by cusolverMpStedc(). |
iwork |
Device |
Out |
|
Data Type of Tridiagonal Matrix |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpOrmtr#
cusolverStatus_t cusolverMpOrmtr(
cusolverMpHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
int64_t M,
int64_t N,
const void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
const void *d_tau,
void *d_C,
int64_t IC,
int64_t JC,
cusolverMpMatrixDescriptor_t descC,
cudaDataType_t computeType,
void *d_work,
size_t workspaceInBytesOnDevice,
void *h_work,
size_t workspaceInBytesOnHost,
int *info)
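Multiplies the M-by-N matrix sub(C) = C(IC:IC+M-1, JC:JC+N-1) by the orthogonal matrix Q, where Q is given by the Householder reflectors computed by cusolverMpSytrd(). The result is op(Q) * sub(C) or sub(C) * op(Q) for side of CUBLAS_SIDE_LEFT and CUBLAS_SIDE_RIGHT respectively. op(Q) can be translated to \(Q\), \(Q^T\), \(Q^H\) based on the trans argument CUBLAS_OP_N, CUBLAS_OP_T and CUBLAS_OP_C. The uplo argument indicates which triangular part of sub(A) contains the Householder reflectors: CUBLAS_FILL_MODE_UPPER or CUBLAS_FILL_MODE_LOWER. Q is of order nq, where nq is either M or N according to the side parameter of CUBLAS_SIDE_LEFT or CUBLAS_SIDE_RIGHT respectively.
Parameter |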
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
side |
Host |
In |
Indicate that Q is applied from left or right side. |
uplo |
Host |
In |
Indicate that upper or lower triangular of sub(A) contains Householder reflectors. |
trans |
Host |
In |
Indicate that Q is applied with no-transpose or (conj)transpose. |
M |
Host |
In |
Number of rows of sub(C). |
N |
Host |
In |
Number of columns of sub(C). |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A).`JA` must be a multiple of the column blocking dimension |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_tau |
Device |
In |
Pointer into the local memory to an array of dimension |
d_C |
Device |
In/Out |
Pointer into the local memory to an array of dimension |
IC |
Host |
In |
Row index of the first row of the sub(C). |
JC |
Host |
In |
Column index of the first column of the sub(C). |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size |
workspaceInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cusolverMpOrmtr_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size |
workspaceInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cusolverMpOrmtr_bufferSize(). |
info |
Device |
Out |
|
Note: this routine requires square blocks (MB_A == NB_A) and alignment of sub(A) and sub(C) matrices, meaning (MB_A == MB_C) and (IA == IC).
Data Type of A and C |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpOrmtr_bufferSize#
cusolverStatus_t cusolverMpOrmtr_bufferSize(
cusolverMpHandle_t handle,
cublasSideMode_t side,
cublasFillMode_t uplo,
cublasOperation_t trans,
int64_t M,
int64_t N,
const void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
const void *d_tau,
void *d_C,
int64_t IC,
int64_t JC,
cusolverMpMatrixDescriptor_t descC,
cudaDataType_t computeType,
size_t* workspaceInBytesOnDevice,
size_t* workspaceInBytesOnHost)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
side |
Host |
In |
Indicate that Q is applied from left or right side. |
uplo |
Host |
In |
Indicate that upper or lower triangular of sub(A) contains Householder reflectors. |
trans |
Host |
In |
Indicate that Q is applied with no-transpose or (conj)transpose. |
M |
Host |
In |
Number of rows of sub(C). |
N |
Host |
In |
Number of columns of sub(C). |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A).`JA` must be a multiple of the column blocking dimension |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_tau |
Device |
In |
Pointer into the local memory to an array of dimension |
d_C |
Device |
In |
Pointer into the local memory to an array of dimension |
IC |
Host |
In |
Row index of the first row of the sub(C). |
JC |
Host |
In |
Column index of the first column of the sub(C).`JC` must be a multiple of the column blocking dimension |
descC |
Host |
In |
Matrix descriptor associated to the global matrix C. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
workspaceInBytesOnDevice |
Host |
Out |
The size in bytes of the local device workspace needed by cusolverMpOrmtr(). |
workspaceInBytesOnHost |
Host |
Out |
The size in bytes of the local host workspace needed by cusolverMpOrmtr(). |
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpSyevd#
cusolverStatus_t cusolverMpSyevd(
cusolverMpHandle_t handle,
char *jobz,
cublasFillMode_t uplo,
int64_t N,
void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
void *d_d,
void *d_Z,
int64_t IZ,
int64_t JZ,
cusolverMpMatrixDescriptor_t descZ,
cudaDataType_t computeType,
void *d_work,
size_t workspaceInBytesOnDevice,
void *h_work,
size_t workspaceInBytesOnHost,
int *info)
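Computes the eigenvalues and, optionally (see jobz), the eigenvectors of the N-by-N symmetric (Hermitian) matrix sub(A) = A(IA:IA+N-1, JA:JA+N-1) using the divide and conquer algorithm cusolverMpStedc(). Note that the current implementation of cusolverMpStedc() may fail when the block size is not a power of two.
Parameter |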
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
jobz |
Host |
In |
If |
uplo |
Host |
In |
Indicate that upper or lower triangular of sub(A) is used to compute eigen solutions. |
N |
Host |
In |
Number of rows and columns of sub(A). |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A).`JA` must be a multiple of the column blocking dimension |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_d |
Device |
Out |
Pointer into the memory to an array of global size |
d_Z |
Device |
Out |
Pointer into the local memory to an array of dimension |
IZ |
Host |
In |
Row index of the first row of the sub(Z). |
JZ |
Host |
In |
Column index of the first column of the sub(Z). |
descZ |
Host |
In |
Matrix descriptor associated to the global matrix Z. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size |
workspaceInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cusolverMpSyevd_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size |
workspaceInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cusolverMpSyevd_bufferSize(). |
info |
Device |
Out |
|
Note: this routine requires square blocks (MB_A == NB_A) and alignment of sub(A) and sub(Z) matrices, meaning (MB_A == MB_Z) and (IA == IZ).
Data Type of A and Z |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpSyevd_bufferSize#
cusolverStatus_t cusolverMpSyevd_bufferSize(
cusolverMpHandle_t handle,
char *jobz,
cublasFillMode_t uplo,
int64_t N,
void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
void *d_d,
void *d_Z,
int64_t IZ,
int64_t JZ,
cusolverMpMatrixDescriptor_t descZ,
cudaDataType_t computeType,
size_t *workspaceInBytesOnDevice,
size_t *workspaceInBytesOnHost)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
jobz |
Host |
In |
If |
uplo |
Host |
In |
Indicate that upper or lower triangular of sub(A) is used to compute eigen solutions. |
N |
Host |
In |
Number of rows and columns of sub(A). |
d_A |
Device |
In |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A).`JA` must be a multiple of the column blocking dimension |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_d |
Device |
In |
Pointer into the memory to an array of global size |
d_Z |
Device |
In |
Pointer into the local memory to an array of dimension |
IZ |
Host |
In |
Row index of the first row of the sub(Z). |
JZ |
Host |
In |
Column index of the first column of the sub(Z). |
descZ |
Host |
In |
Matrix descriptor associated to the global matrix Z. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
workspaceInBytesOnDevice |
Host |
Out |
The size in bytes of the local device workspace needed by cusolverMpSyevd(). |
workspaceInBytesOnHost |
Host |
Out |
The size in bytes of the local host workspace needed by cusolverMpSyevd(). |
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpSygst#
cusolverStatus_t cusolverMpSygst(
cusolverMpHandle_t handle,
cusolverEigType_t ibtype,
cublasFillMode_t uplo,
int64_t N,
void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
const void *d_B,
int64_t IB,
int64_t JB,
cusolverMpMatrixDescriptor_t descB,
cudaDataType_t computeType,
void *d_work,
size_t workspaceInBytesOnDevice,
void *h_work,
size_t workspaceInBytesOnHost,
int *info)
ibtype = CUSOLVER_EIG_TYPE_1: the problem is sub(A)*x = lambda*sub(B)*x, and sub(A) is overwritten by inv(L)*sub(A)*inv(L^H) or inv(U^H)*sub(A)*inv(U).
ibtype = CUSOLVER_EIG_TYPE_2 or 3: the problem is sub(A)*sub(B)*x = lambda*x or sub(B)*sub(A)*x = lambda*x, and sub(A) is overwritten by L^H*sub(A)*L or U*sub(A)*U^H.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
ibtype |
Host |
In |
Indicate the eigen problem type sub(A)*x=(lambda)*sub(B)*x, sub(A)*sub(B)*x=(lambda)*x, or sub(B)*sub(A)*x=(lambda)*x. |
uplo |
Host |
In |
Indicate that lower |
N |
Host |
In |
Number of rows and columns of sub(A) and sub(B). |
d_A |
Device |
In/Out |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_B |
Device |
In |
Pointer into the local memory to an array of dimension |
IB |
Host |
In |
Row index of the first row of the sub(B). |
JB |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size |
workspaceInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by cusolverMpSygst(). |
h_work |
Host |
Out |
Host workspace of size |
workspaceInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by cusolverMpSygst(). |
info |
Device |
Out |
|
The same square block size (MB == NB) is used for the matrices A and B.
The beginning row and column of A and B are aligned with each other, i.e., (IA == IB) and (JA == JB).
ibtype = CUSOLVER_EIG_TYPE_1, uplo = CUBLAS_FILL_MODE_LOWER.
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpSygst_bufferSize#
cusolverStatus_t cusolverMpSygst_bufferSize(
cusolverMpHandle_t handle,
cusolverEigType_t ibtype,
cublasFillMode_t uplo,
int64_t N,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
int64_t IB,
int64_t JB,
cusolverMpMatrixDescriptor_t descB,
cudaDataType_t computeType,
size_t *workspaceInBytesOnDevice,
size_t *workspaceInBytesOnHost)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
ibtype |
Host |
In |
Indicate the eigen problem type sub(A)*x=(lambda)*sub(B)*x, sub(A)*sub(B)*x=(lambda)*x, or sub(B)*sub(A)*x=(lambda)*x. |
uplo |
Host |
In |
Indicate that lower |
N |
Host |
In |
Number of rows and columns of sub(A) and sub(B). |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
IB |
Host |
In |
Row index of the first row of the sub(B). |
JB |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
workspaceInBytesOnDevice |
Host |
Out |
The size in bytes of the local device workspace needed by cusolverMpSygst(). |
workspaceInBytesOnHost |
Host |
Out |
The size in bytes of the local host workspace needed by cusolverMpSygst(). |
The same square block size (MB == NB) is used for the matrices A and B.
The beginning row and column of A and B are aligned with each other, i.e., (IA == IB) and (JA == JB).
ibtype = CUSOLVER_EIG_TYPE_1, uplo = CUBLAS_FILL_MODE_LOWER.
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpSygvd#
cusolverStatus_t cusolverMpSygvd(
cusolverMpHandle_t handle,
cusolverEigType_t ibtype,
cusolverEigMode_t jobz,
cublasFillMode_t uplo,
int64_t N,
void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
void *d_B,
int64_t IB,
int64_t JB,
cusolverMpMatrixDescriptor_t descB,
void *d_d,
void *d_Z,
int64_t IZ,
int64_t JZ,
cusolverMpMatrixDescriptor_t descZ,
cudaDataType_t computeType,
void *d_work,
size_t workspaceInBytesOnDevice,
void *h_work,
size_t workspaceInBytesOnHost,
int *info)
ibtype = CUSOLVER_EIG_TYPE_1: the problem is sub(A)*x = lambda*sub(B)*x.
ibtype = CUSOLVER_EIG_TYPE_2: the problem is sub(A)*sub(B)*x = lambda*x.
ibtype = CUSOLVER_EIG_TYPE_3: the problem is sub(B)*sub(A)*x = lambda*x.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
ibtype |
Host |
In |
Indicate the eigen problem type sub(A)*x=(lambda)*sub(B)*x, sub(A)*sub(B)*x=(lambda)*x, or sub(B)*sub(A)*x=(lambda)*x. |
jobz |
Host |
In |
Indicate whether the routine computes eigenvalues only |
uplo |
Host |
In |
Indicate that lower |
N |
Host |
In |
Number of rows and columns of sub(A) and sub(B). |
d_A |
Device |
In/Out |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_B |
Device |
In/Out |
Pointer into the local memory to an array of dimension |
IB |
Host |
In |
Row index of the first row of the sub(B). |
JB |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
d_d |
Device |
Out |
Pointer into the memory to an array of global size |
d_Z |
Device |
Out |
Pointer into the local memory to an array of dimension |
IZ |
Host |
In |
Row index of the first row of the sub(Z). |
JZ |
Host |
In |
Column index of the first column of the sub(Z). |
descZ |
Host |
In |
Matrix descriptor associated to the global matrix Z. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size |
workspaceInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cusolverMpSygvd_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size |
workspaceInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cusolverMpSygvd_bufferSize(). |
info |
Device |
Out |
|
The same square block size (MB == NB) is used for the matrices A, B, and Z.
The beginning row and column of A, B, and Z are aligned with each other, i.e., (IA == IB == IZ) and (JA == JB == JZ).
ibtype = CUSOLVER_EIG_TYPE_1, jobz = CUSOLVER_EIG_MODE_VECTOR, uplo = CUBLAS_FILL_MODE_LOWER.
Data Type of A and C |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpSygvd_bufferSize#
cusolverStatus_t cusolverMpSygvd_bufferSize(
cusolverMpHandle_t handle,
cusolverEigType_t ibtype,
cusolverEigMode_t jobz,
cublasFillMode_t uplo,
int64_t N,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
int64_t IB,
int64_t JB,
cusolverMpMatrixDescriptor_t descB,
int64_t IZ,
int64_t JZ,
cusolverMpMatrixDescriptor_t descZ,
cudaDataType_t computeType,
size_t *workspaceInBytesOnDevice,
size_t *workspaceInBytesOnHost)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
ibtype |
Host |
In |
Indicate the eigen problem type sub(A)*x=(lambda)*sub(B)*x, sub(A)*sub(B)*x=(lambda)*x, or sub(B)*sub(A)*x=(lambda)*x. |
jobz |
Host |
In |
Indicate whether the routine computes eigenvalues only |
uplo |
Host |
In |
Indicate that lower |
N |
Host |
In |
Number of rows and columns of sub(A) and sub(B). |
IA |
Host |
In |
Row index of the first row of the sub(A). |
JA |
Host |
In |
Column index of the first column of the sub(A). |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
IB |
Host |
In |
Row index of the first row of the sub(B). |
JB |
Host |
In |
Column index of the first column of the sub(B). |
descB |
Host |
In |
Matrix descriptor associated to the global matrix B. |
IZ |
Host |
In |
Row index of the first row of the sub(Z). |
JZ |
Host |
In |
Column index of the first column of the sub(Z). |
descZ |
Host |
In |
Matrix descriptor associated to the global matrix Z. |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
workspaceInBytesOnDevice |
Host |
Out |
The size in bytes of the local device workspace needed by cusolverMpSygvd(). |
workspaceInBytesOnHost |
Host |
Out |
The size in bytes of the local host workspace needed by cusolverMpSygvd(). |
The same square block size (MB == NB) is used for the matrices A, B, and Z.
The beginning row and column of A, B, and Z are aligned with each other, i.e., (IA == IB == IZ) and (JA == JB == JZ).
ibtype = CUSOLVER_EIG_TYPE_1, jobz = CUSOLVER_EIG_MODE_VECTOR, uplo = CUBLAS_FILL_MODE_LOWER.
Data Type of A |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_32F |
CUDA_C_64F |
CUDA_C_64F |
CUDA_C_64F |
cusolverMpLaset#
cusolverStatus_t cusolverMpLaset(
cusolverMpHandle_t handle,
cublasFillMode_t uplo,
int64_t M,
int64_t N,
const void *alpha,
const void *beta,
void *d_A,
int64_t IA,
int64_t JA,
cusolverMpMatrixDescriptor_t descA,
int *d_info)
Initializes the off-diagonal elements of the M-by-N distributed submatrix A(IA:IA+M-1, JA:JA+N-1) with alpha and the diagonal elements with beta. This is the distributed equivalent of LAPACK’s xLASET.
The uplo parameter controls which part of the submatrix is initialized:
CUBLAS_FILL_MODE_LOWER: only the lower triangular part (below and including the first subdiagonal) is set to alpha, and diagonal elements are set to beta.
CUBLAS_FILL_MODE_UPPER: only the upper triangular part (above and including the first superdiagonal) is set to alpha, and diagonal elements are set to beta.
CUBLAS_FILL_MODE_FULL: all off-diagonal elements are set to alpha, and diagonal elements are set to beta.
The alpha and beta scalars may reside in either host or device memory. The pointer type is detected automatically at runtime.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
uplo |
Host |
In |
Specifies the part of the submatrix to initialize: |
M |
Host |
In |
Number of rows of the submatrix. |
N |
Host |
In |
Number of columns of the submatrix. |
alpha |
Host/Device |
In |
Scalar value for off-diagonal elements. Must match the data type of matrix A. |
beta |
Host/Device |
In |
Scalar value for diagonal elements. Must match the data type of matrix A. |
d_A |
Device |
Out |
Pointer into the local memory to an array of dimension |
IA |
Host |
In |
Row index of the first row of the submatrix. |
JA |
Host |
In |
Column index of the first column of the submatrix. |
descA |
Host |
In |
Matrix descriptor associated to the global matrix A. |
d_info |
Device |
Out |
|
Data Type of A |
|---|
CUDA_R_32F |
CUDA_R_64F |
CUDA_C_32F |
CUDA_C_64F |
cusolverMpNewtonSchulz#
cusolverStatus_t cusolverMpNewtonSchulz(
cusolverMpHandle_t handle,
cusolverMpNewtonSchulzDescriptor_t nsDesc,
int64_t M,
int64_t N,
void *d_X,
int64_t IX,
int64_t JX,
const cusolverMpMatrixDescriptor_t descX,
int64_t numberOfNewtonSchulzIterations,
const void *h_coeffs,
cudaDataType_t computeType,
void *d_work,
size_t workspaceInBytesOnDevice,
void *h_work,
size_t workspaceInBytesOnHost,
int *d_info)
Performs Newton-Schulz iterations on a tall or square (M >= N) distributed matrix X(IX:IX+M-1, JX:JX+N-1) in-place on supported Px1 process grids. The routine approximates the orthogonal polar factor U from the polar decomposition X = U * H, where U has orthonormal columns.
For tall or square matrices (M >= N), each iteration i applies a polynomial update using three user-supplied coefficients (alpha_i, beta_i, gamma_i):
X := alpha_i * X + beta_i * X * (X^T * X) + gamma_i * X * (X^T * X)^2
Before the iterations, the input is normalized by its Frobenius norm: X := X / ||X||_F. This step can be disabled via the descriptor when the input is already normalized.
h_coeffs must be provided as a host array of float triplets, with 3 * numberOfNewtonSchulzIterations elements stored as [alpha_0, beta_0, gamma_0, alpha_1, beta_1, gamma_1, ...]. See the Newton-Schulz sample for example coefficients optimized for quintic convergence in 5 iterations. The classical Newton-Schulz iteration can be recovered by setting (alpha, beta, gamma) = (1.5, -0.5, 0.0) for each iteration, though more iterations will be needed to converge.
[…] ncclMemAlloc and register it via cusolverMpBufferRegister() before calling this routine.
Only Px1 process grids (1D row distribution with numColDevices = 1) are supported. 2D block-cyclic grids are not yet implemented.
Only tall or square matrices (M >= N) are supported. Wide rectangular matrices (M < N) are not yet supported.
Only IX = JX = 1 is supported (no submatrix offsets).
Only CUDA_R_16BF (bfloat16) and CUDA_R_32F (float32) value types are supported.
The only supported compute type is CUDA_R_32F.
RSRC = CSRC = 0 is required.
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
nsDesc |
Host |
In |
cusolverMpNewtonSchulzDescriptor_t descriptor (may be |
M |
Host |
In |
Number of rows of the submatrix X. |
N |
Host |
In |
Number of columns of the submatrix X. |
d_X |
Device |
In/Out |
Pointer into the local memory to an array of dimension |
IX |
Host |
In |
Row index of the first row of the submatrix. |
JX |
Host |
In |
Column index of the first column of the submatrix. |
descX |
Host |
In |
Matrix descriptor associated to the global matrix X. |
numberOfNewtonSchulzIterations |
Host |
In |
Number of Newton-Schulz iterations to perform. |
h_coeffs |
Host |
In |
Host array of triplets with |
computeType |
Host |
In |
Data type used for computations. See table below for supported combinations. |
d_work |
Device |
Out |
Device workspace of size |
workspaceInBytesOnDevice |
Host |
In |
The size in bytes of the local device workspace needed by the routine as provided by cusolverMpNewtonSchulz_bufferSize(). |
h_work |
Host |
Out |
Host workspace of size |
workspaceInBytesOnHost |
Host |
In |
The size in bytes of the local host workspace needed by the routine as provided by cusolverMpNewtonSchulz_bufferSize(). |
d_info |
Device |
Out |
|
Data Type of X |
computeType |
Output Data Type |
|---|---|---|
CUDA_R_16BF |
CUDA_R_32F |
CUDA_R_16BF |
CUDA_R_32F |
CUDA_R_32F |
CUDA_R_32F |
cusolverMpNewtonSchulz_bufferSize#
cusolverStatus_t cusolverMpNewtonSchulz_bufferSize(
cusolverMpHandle_t handle,
cusolverMpNewtonSchulzDescriptor_t nsDesc,
int64_t M,
int64_t N,
void *d_X,
int64_t IX,
int64_t JX,
const cusolverMpMatrixDescriptor_t descX,
int64_t numberOfNewtonSchulzIterations,
const void *h_coeffs,
cudaDataType_t computeType,
size_t *workspaceInBytesOnDevice,
size_t *workspaceInBytesOnHost)
Parameter |
Memory |
In/Out |
Description |
|---|---|---|---|
handle |
Host |
In |
cuSOLVERMp library handle. |
nsDesc |
Host |
In |
Newton-Schulz descriptor (may be |
M |
Host |
In |
Number of rows of the submatrix X. |
N |
Host |
In |
Number of columns of the submatrix X. |
d_X |
Device |
In |
Pointer into the local memory to an array of dimension |
IX |
Host |
In |
Row index of the first row of the submatrix. |
JX |
Host |
In |
Column index of the first column of the submatrix. |
descX |
Host |
In |
Matrix descriptor associated to the global matrix X. |
numberOfNewtonSchulzIterations |
Host |
In |
Number of Newton-Schulz iterations to perform. |
h_coeffs |
Host |
In |
Host array of triplets with |
computeType |
Host |
In |
Data type used for computations. |
workspaceInBytesOnDevice |
Host |
Out |
On output, contains the size in bytes of the local device workspace needed by cusolverMpNewtonSchulz(). |
workspaceInBytesOnHost |
Host |
Out |
On output, contains the size in bytes of the local host workspace needed by cusolverMpNewtonSchulz(). |