Examples
This section contains examples with source code.
Using cuBLAS from OpenACC Host Code
This example demonstrates the use of the cublas module, the cublasHandle type, and several forms of BLAS calls made from OpenACC data regions.
Simple OpenACC BLAS Test
program testcublas
! compile with pgfortran -ta=tesla -cudalib=cublas -cuda testcublas.f90
call testcu1(1000)
call testcu2(1000)
end
!
subroutine testcu1(n)
use openacc
use cublas
integer :: a(n), b(n)
type(cublasHandle) :: h
istat = cublasCreate(h)
! Force OpenACC kernels and cuBLAS to use the OpenACC stream.
istat = cublasSetStream(h, acc_get_cuda_stream(acc_async_sync))
!$acc data copyout(a, b)
!$acc kernels
a = 1
b = 2
!$acc end kernels
! No host_data, we are lexically inside a data region
! sswap will accept any kind(4) data type
call sswap(n, a, 1, b, 1)
call cublasSswap(n, a, 1, b, 1)
!$acc end data
if (all(a.eq.1).and.all(b.eq.2)) then
print *,"Test PASSED"
else
print *,"Test FAILED"
endif
end
!
subroutine testcu2(n)
use openacc
use cublas
real(8) :: a(n), b(n)
a = 1.0d0
b = 2.0d0
!$acc data copy(a, b)
!$acc host_data use_device(a,b)
call dswap(n, a, 1, b, 1)
call cublasDswap(n, a, 1, b, 1)
!$acc end host_data
!$acc end data
if (all(a.eq.1.0d0).and.all(b.eq.2.0d0)) then
print *,"Test PASSED"
else
print *,"Test FAILED"
endif
end
CUBLASXT BLAS Test
This example demonstrates the use of the cublasxt module.
program testcublasxt
call testxt1(1000)
call testxt2(1000)
end
!
subroutine testxt1(n)
use cublasxt
real(4) :: a(n,n), b(n,n), c(n,n), alpha, beta
type(cublasXtHandle) :: h
integer ndevices(1)
a = 1.0
b = 2.0
c = -1.0
alpha = 1.0
beta = 0.0
istat = cublasXtCreate(h)
if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat
ndevices(1) = 0
istat = cublasXtDeviceSelect(h, 1, ndevices)
if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat
istat = cublasXtSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, &
n, n, n, &
alpha, A, n, B, n, beta, C, n)
if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat
istat = cublasXtDestroy(h)
if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat
if (all(c.eq.2.0*n)) then
print *,"Test PASSED"
else
print *,"Test FAILED"
endif
print *,c(1,1),c(n,n)
end
!
subroutine testxt2(n)
use cublasxt
real(8) :: a(n,n), b(n,n), c(n,n), alpha, beta
type(cublasXtHandle) :: h
integer ndevices(1)
a = 1.0d0
b = 2.0d0
c = -1.0d0
alpha = 1.0d0
beta = 0.0d0
istat = cublasXtCreate(h)
if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat
ndevices(1) = 0
istat = cublasXtDeviceSelect(h, 1, ndevices)
if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat
istat = cublasXtDgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, &
n, n, n, &
alpha, A, n, B, n, beta, C, n)
if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat
istat = cublasXtDestroy(h)
if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat
if (all(c.eq.2.0d0*n)) then
print *,"Test PASSED"
else
print *,"Test FAILED"
endif
print *,c(1,1),c(n,n)
end
Using cuBLAS from CUDA Fortran Host Code
This example demonstrates the use of the cublas module, the cublasHandle type, and several forms of BLAS calls.
Simple BLAS Test
program testisamax
! Compile with "pgfortran testisamax.cuf -cudalib=cublas -lblas"
! Use the NVIDIA cudafor and cublas modules
use cudafor
use cublas
!
real*4, device, allocatable :: xd(:)
real*4 x(1000)
integer, device :: kd
type(cublasHandle) :: h
call random_number(x)
! F90 way
i = maxloc(x,dim=1)
print *,i
print *,x(i-1),x(i),x(i+1)
! Host way
j = isamax(1000,x,1)
print *,j
print *,x(j-1),x(j),x(j+1)
! CUDA Generic BLAS way
allocate(xd(1000))
xd = x
k = isamax(1000,xd,1)
print *,k
print *,x(k-1),x(k),x(k+1)
! CUDA Specific BLAS way
k = cublasIsamax(1000,xd,1)
print *,k
print *,x(k-1),x(k),x(k+1)
! CUDA V2 Host Specific BLAS way
istat = cublasCreate(h)
if (istat .ne. 0) print *,"cublasCreate returned ",istat
k = 0
istat = cublasIsamax_v2(h, 1000, xd, 1, k)
if (istat .ne. 0) print *,"cublasIsamax 1 returned ",istat
print *,k
print *,x(k-1),x(k),x(k+1)
! CUDA V2 Device Specific BLAS way
k = 0
istat = cublasIsamax_v2(h, 1000, xd, 1, kd)
if (istat .ne. 0) print *,"cublasIsamax 2 returned ",istat
k = kd
print *,k
print *,x(k-1),x(k),x(k+1)
istat = cublasDestroy(h)
if (istat .ne. 0) print *,"cublasDestroy returned ",istat
end program
Multi-threaded BLAS Test
This example demonstrates the use of the cublas module in multi-threaded code. Each thread attaches to a different GPU and creates its own context, and the partial results are combined at the end.
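A typical build-and-run sequence, assuming the source is saved as tsgemm.cuf on a system with four GPUs (the file name, executable name, and GPU count are placeholders), might be:
pgfortran -mp tsgemm.cuf -cudalib=cublas -o tsgemm
export OMP_NUM_THREADS=4
./tsgemm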
program tsgemm
!
! Multi-threaded example calling sgemm from cuda fortran.
! Compile with "pgfortran -mp tsgemm.cuf -cudalib=cublas"
! Set OMP_NUM_THREADS=number of GPUs in your system.
!
use cublas
use cudafor
use omp_lib
!
! Size this according to number of GPUs
!
! Small
!integer, parameter :: K = 2500
!integer, parameter :: M = 2000
!integer, parameter :: N = 2000
! Large
integer, parameter :: K = 10000
integer, parameter :: M = 10000
integer, parameter :: N = 10000
integer, parameter :: NTIMES = 10
!
real*4, device, allocatable :: a_d(:,:), b_d(:,:), c_d(:,:)
!$omp THREADPRIVATE(a_d,b_d,c_d)
real*4 a(m,k), b(k,n), c(m,n)
real*4 alpha, beta
integer, allocatable :: offs(:)
type(cudaEvent) :: start, stop
a = 1.0; b = 0.0; c = 0.0
do i = 1, N
b(i,i) = 1.0
end do
alpha = 1.0; beta = 1.0
! Break up the B and C array into sections
nthr = omp_get_max_threads()
nsec = N / nthr
print *,"Running with ",nthr," threads, each section = ",nsec
allocate(offs(nthr))
offs = (/ (i*nsec,i=0,nthr-1) /)
! Allocate and initialize the arrays
! Each thread connects to a device and creates a CUDA context.
!$omp PARALLEL private(i,istat)
i = omp_get_thread_num() + 1
istat = cudaSetDevice(i-1)
allocate(a_d(M,K), b_d(K,nsec), c_d(M,nsec))
a_d = a
b_d = b(:,offs(i)+1:offs(i)+nsec)
c_d = c(:,offs(i)+1:offs(i)+nsec)
!$omp end parallel
istat = cudaEventCreate(start)
istat = cudaEventCreate(stop)
time = 0.0
istat = cudaEventRecord(start, 0)
! Run the traditional blas kernel
!$omp PARALLEL private(j,istat)
do j = 1, NTIMES
call sgemm('n','n', M, N/nthr, K, alpha, a_d, M, b_d, K, beta, c_d, M)
end do
istat = cudaDeviceSynchronize()
!$omp end parallel
istat = cudaEventRecord(stop, 0)
istat = cudaEventSynchronize(stop)
istat = cudaEventElapsedTime(time, start, stop)
time = time / (NTIMES*1.0e3)
!$omp PARALLEL private(i)
i = omp_get_thread_num() + 1
c(:,offs(i)+1:offs(i)+nsec) = c_d
!$omp end parallel
nerrors = 0
do j = 1, N
do i = 1, M
if (c(i,j) .ne. NTIMES) nerrors = nerrors + 1
end do
end do
print *,"Number of Errors:",nerrors
gflops = 2.0 * N * M * K/time/1e9
write (*,901) m,k,k,N,time*1.0e3,gflops
901 format(i0,'x',i0,' * ',i0,'x',i0,':\t',f8.3,' ms\t',f12.3,' GFlops/s')
end
Using cuFFT from OpenACC Host Code
This example demonstrates the use of the cufft module, the cufftHandle type, and several cuFFT library calls.
Simple cuFFT Test
program cufft2dTest
use cufft
use openacc
integer, parameter :: m=768, n=512
complex, allocatable :: a(:,:),b(:,:),c(:,:)
real, allocatable :: r(:,:),q(:,:)
integer :: iplan1, iplan2, iplan3, ierr
allocate(a(m,n),b(m,n),c(m,n))
allocate(r(m,n),q(m,n))
a = 1; r = 1
xmx = -99.0
ierr = cufftPlan2D(iplan1,n,m,CUFFT_C2C)
ierr = ierr + cufftSetStream(iplan1,acc_get_cuda_stream(acc_async_sync))
!$acc host_data use_device(a,b,c)
ierr = ierr + cufftExecC2C(iplan1,a,b,CUFFT_FORWARD)
ierr = ierr + cufftExecC2C(iplan1,b,c,CUFFT_INVERSE)
!$acc end host_data
! scale c
!$acc kernels
c = c / (m*n)
!$acc end kernels
! Check forward answer
write(*,*) 'Max error C2C FWD: ', cmplx(maxval(real(b)) - sum(real(b)), &
maxval(imag(b)))
! Check inverse answer
write(*,*) 'Max error C2C INV: ', maxval(abs(a-c))
! Real transform
ierr = ierr + cufftPlan2D(iplan2,n,m,CUFFT_R2C)
ierr = ierr + cufftPlan2D(iplan3,n,m,CUFFT_C2R)
ierr = ierr + cufftSetStream(iplan2,acc_get_cuda_stream(acc_async_sync))
ierr = ierr + cufftSetStream(iplan3,acc_get_cuda_stream(acc_async_sync))
!$acc host_data use_device(r,b,q)
ierr = ierr + cufftExecR2C(iplan2,r,b)
ierr = ierr + cufftExecC2R(iplan3,b,q)
!$acc end host_data
!$acc kernels
xmx = maxval(abs(r-q/(m*n)))
!$acc end kernels
! Check R2C + C2R answer
write(*,*) 'Max error R2C/C2R: ', xmx
ierr = ierr + cufftDestroy(iplan1)
ierr = ierr + cufftDestroy(iplan2)
ierr = ierr + cufftDestroy(iplan3)
if (ierr.eq.0) then
print *,"test PASSED"
else
print *,"test FAILED"
endif
end program cufft2dTest
Using cuFFT from CUDA Fortran Host Code
This example demonstrates the use of the cuFFT module, the cufftHandle type, and several cuFFT library calls.
Simple cuFFT Test
program cufft2dTest
use cudafor
use cufft
implicit none
integer, parameter :: m=768, n=512
complex, managed :: a(m,n),b(m,n)
real, managed :: ar(m,n),br(m,n)
real x
integer plan, ierr
logical passing
a = 1; ar = 1
ierr = cufftPlan2D(plan,n,m,CUFFT_C2C)
ierr = ierr + cufftExecC2C(plan,a,b,CUFFT_FORWARD)
ierr = ierr + cufftExecC2C(plan,b,b,CUFFT_INVERSE)
ierr = ierr + cudaDeviceSynchronize()
x = maxval(abs(a-b/(m*n)))
write(*,*) 'Max error C2C: ', x
passing = x .le. 1.0e-5
ierr = ierr + cufftPlan2D(plan,n,m,CUFFT_R2C)
ierr = ierr + cufftExecR2C(plan,ar,b)
ierr = ierr + cufftPlan2D(plan,n,m,CUFFT_C2R)
ierr = ierr + cufftExecC2R(plan,b,br)
ierr = ierr + cudaDeviceSynchronize()
x = maxval(abs(ar-br/(m*n)))
write(*,*) 'Max error R2C/C2R: ', x
passing = passing .and. (x .le. 1.0e-5)
ierr = ierr + cufftDestroy(plan)
print *,ierr
passing = passing .and. (ierr .eq. 0)
if (passing) then
print *,"Test PASSED"
else
print *,"Test FAILED"
endif
end program cufft2dTest
Using cufftXt from CUDA Fortran Host Code
This example demonstrates the use of the cufftXt module, the cudaLibXtDesc type, many of the cufftXt API calls, the padding required for real/complex FFTs, and the distribution of active modes across multiple GPUs.
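A note on the padding mentioned above: an in-place real-to-complex FFT of nx real values produces nx/2+1 complex outputs, so the real array must be padded to 2*(nx/2+1) elements in its first dimension. For the even nx = 32 used below, 2*(nx/2+1) = nx+2, which is why phi and u are declared with a first extent of nx+2.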
Simple Poisson 3D CufftXt Test
program cufftXt3DTest
use cudafor
use cufftXt ! cufftXt module includes cufft
implicit none
integer, parameter :: nGPUs = 2
integer, parameter :: M = 1, N = 1, P = 1
integer, parameter :: nx = 32, ny=32, nz=32
real, parameter :: twopi = 8.0*atan(1.0)
real, parameter :: Lx = 1.0, Ly = 1.0, Lz = 1.0
logical, parameter :: printModes = .true.
! GPU
integer :: nodeGPUs
integer :: whichGPUs(nGPUs)
! grid
real :: x(nx), y(ny), z(nz)
real :: hx, hy, hz
! wavenumbers
real :: kx(nx/2+1), ky(ny), kz(nz)
type realDeviceArray
real, device, allocatable :: v(:)
end type realDeviceArray
type(realDeviceArray) :: kx_da(nGPUs), ky_da(nGPUs), kz_da(nGPUs)
real :: phi(nx+2,ny, nz), u(nx+2,ny,nz), ua(nx,ny,nz)
integer :: status, i, j, k
! check M, N
if (M==0 .or. N==0 .or. P==0) then
write(*,*) 'M, N or P is 0 thus solution is u=0'
stop
else
write(*,"(' Running with modes M, N, P = ', i0, ', ', i0, ', ', i0)") M, N, P
write(*,"(' on a grid of ', i0, 'x', i0, 'x', i0)") nx, ny, nz
end if
! choose GPUs
! check for multiple GPUs
status = cudaGetDeviceCount(nodeGPUs)
if (status /= cudaSuccess) write(*,*) 'Error in cudaGetDeviceCount()'
if (nodeGPUs .lt. nGPUs) then
write(*,*) 'Number of GPUs on node:', nodeGPUs
write(*,*) 'Insufficient number of GPUs for this code, exiting ...'
stop
end if
! check for GPUs of same compute capability
block
type(cudaDeviceProp) :: prop
integer :: cc(0:nodeGPUs-1,2)
logical :: foundIdenticalGPUs = .false.
integer :: iWG, nIdentGPUs
write(*,*) 'Available devices:'
do i = 0, nodeGPUs-1
status = cudaGetDeviceProperties(prop, i)
if (status /= cudaSuccess) write(*,*) 'Error in cudaGetDeviceProperties()'
cc(i,1) = prop%major
cc(i,2) = prop%minor
write(*,"(' ', i0, ' CC: ', i0, '.', i0)") i, cc(i,1), cc(i,2)
enddo
do i = 0, nodeGPUs-1
nIdentGPUs = 1
do j = i+1, nodeGPUs-1
if (all(cc(i,:) == cc(j,:))) nIdentGPUs = nIdentGPUs+1
enddo
if (nIdentGPUs .ge. nGPUs) then
foundIdenticalGPUs = .true.
iWG = 1
whichGPUs(iWG) = i
do j = i+1, nodeGPUs-1
if (all(cc(i,:) == cc(j,:))) then
iWG = iWG+1
whichGPUs(iWG) = j
end if
if (iWG == nGPUs) exit
end do
exit
end if
end do
if (foundIdenticalGPUS) then
write(*,*) 'Running on GPUs:'
write(*,*) whichGPUs
else
write(*,"(' No ', i0, ' identical GPUs found, exiting ...')") nGPUs
stop
end if
end block
! Physical grid
hx = Lx/nx
do i = 1, nx
x(i) = hx*i
enddo
hy = Ly/ny
do j = 1, ny
y(j) = hy*j
enddo
hz = Lz/nz
do k = 1, nz
z(k) = hz*k
enddo
! Wavenumbers
do i = 1, nx/2+1
kx(i) = (i-1)*(twoPi/Lx)
enddo
do j = 1, ny/2
ky(j) = (j-1)*(twoPi/Ly)
enddo
do j = ny/2+1, ny
ky(j) = (j-1-ny)*(twoPi/Ly)
enddo
do k = 1, nz/2
kz(k) = (k-1)*(twoPi/Lz)
enddo
do k = nz/2+1, nz
kz(k) = (k-1-nz)*(twoPi/Lz)
enddo
! copy wavenumber arrays to each device
do i = 1, nGPUs
status = cudaSetDevice(whichGPUs(i))
allocate(kx_da(i)%v, source=kx)
allocate(ky_da(i)%v, source=ky)
allocate(kz_da(i)%v, source=kz)
enddo
! Initialize phi and get analytical solution
do k = 1, nz
do j = 1, ny
do i = 1, nx
phi(i,j,k) = sin(twoPi*M*x(i))*sin(twoPi*N*y(j))*sin(twoPi*P*z(k))
ua(i,j,k) = -phi(i,j,k)/((twoPi*M)**2 + (twoPi*N)**2 + (twoPi*P)**2)
enddo
enddo
end do
! cufft block
block
integer :: planR2C, planC2R
integer(c_size_t) :: worksize(nGPUs)
type(cudaLibXtDesc), pointer :: phi_desc
status = cufftCreate(planR2C)
if (status /= CUFFT_SUCCESS) write(*,*) 'Error in cufftCreate'
status = cufftCreate(planC2R)
if (status /= CUFFT_SUCCESS) write(*,*) 'Error in cufftCreate'
status = cufftXtSetGPUs(planR2C, nGPUs, whichGPUs)
if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtSetGPUs failed'
status = cufftXtSetGPUs(planC2R, nGPUs, whichGPUs)
if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtSetGPUs failed'
status = cufftMakePlan3d(planR2C, nz, ny, nx, CUFFT_R2C, worksize)
if (status /= CUFFT_SUCCESS) write(*,*) 'Error in cufftMakePlan3d'
status = cufftMakePlan3d(planC2R, nz, ny, nx, CUFFT_C2R, worksize)
if (status /= CUFFT_SUCCESS) write(*,*) 'Error in cufftMakePlan3d'
! allocate memory on separate devices
status = cufftXtMalloc(planR2C, phi_desc, CUFFT_XT_FORMAT_INPLACE)
if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtMalloc failed'
! H2D transfer
write(*,*) 'cufftXtMemcpy H2D ...'
status = cufftXtMemcpy(planR2C, phi_desc, phi, CUFFT_COPY_HOST_TO_DEVICE)
if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtMemcpy H2D failed'
! forward FFT
write(*,*) 'Forward transform ...'
status = cufftXtExecDescriptorR2C(planR2C, phi_desc, phi_desc)
if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtExecDescriptorR2C failed:', status
if (printModes) then
block
real :: threshold = 1.e-3
type(cudaXtDesc), pointer :: phiXtDesc
complex, device, pointer :: phi_d(:,:,:)
integer :: dev, g, jl, nyl
complex :: phi_h(nx/2+1, ny, nz)
call c_f_pointer(phi_desc%descriptor, phiXtDesc)
do g = 1, nGPUs
write(*,"(' Active modes on g = ', i0)") g
dev = phiXtDesc%GPUs(g)
status = cudaSetDevice(dev)
if (status /= cudaSuccess) write(*,*) 'cudaSetDevice failed'
! XtMalloc done with FORMAT_INPLACE, so data prior to transform
! are in natural order (split in least frequently changing index,
! in this case z), and after transform are in shuffled order (split in
! second least frequently changing index, in this case y)
nyl = ny/nGPUs
if (g .le. mod(ny, nGPUs)) nyl = nyl+1
call c_f_pointer(phiXtDesc%data(g), phi_d, [nx/2+1, nyl, nz])
!$cuf kernel do (3)
do k = 1, nz
do jl = 1, nyl
do i = 1, nx/2+1
if (abs(phi_d(i,jl,k)) .gt. threshold) print*, i, jl, k, phi_d(i,jl,k)
enddo
enddo
enddo
status = cudaDeviceSynchronize()
call flush()
end do
status = cufftXtMemcpy(planR2C, phi_h, phi_desc, CUFFT_COPY_DEVICE_TO_HOST)
if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtMemcpy D2H failed'
write(*,*) 'Active modes on host (after D2H):'
do k = 1, nz
do j = 1, ny
do i = 1, nx/2+1
if (abs(phi_h(i,j,k)) .gt. threshold) print*, i, j, k, phi_h(i,j,k)
end do
end do
enddo
end block
end if
! solve Poisson equation
write(*,*) 'Solve Poisson Eq ...'
block
type(cudaXtDesc), pointer :: phiXtDesc
complex, device, pointer :: phi_d(:,:,:)
integer :: dev, g, jl, nyl, yOffset
real :: k2
call c_f_pointer(phi_desc%descriptor, phiXtDesc)
yOffset = 0
do g = 1, nGPUs
dev = phiXtDesc%GPUs(g)
status = cudaSetDevice(dev)
if (status /= cudaSuccess) write(*,*) 'cudaSetDevice failed'
! XtMalloc done with FORMAT_INPLACE, so data prior to transform
! are in natural order (split in least frequently changing index,
! in this case z), and after transform are in shuffled order (split in
! second least frequently changing index, in this case y)
nyl = ny/nGPUs
if (g .le. mod(ny, nGPUs)) nyl = nyl+1
call c_f_pointer(phiXtDesc%data(g), phi_d, [nx/2+1, nyl, nz])
associate(kx_d => kx_da(g)%v, ky_d => ky_da(g)%v, kz_d => kz_da(g)%v)
!$cuf kernel do (3)
do k = 1, nz
do jl = 1, nyl
j = jl + yOffset
do i = 1, nx/2+1
k2 = kx_d(i)**2 + ky_d(j)**2 + kz_d(k)**2
phi_d(i,jl,k) = -phi_d(i,jl,k)/k2/(nx*ny*nz)
enddo
enddo
enddo
end associate
! specify mean (corrects division by zero wavenumber above)
if (g == 1) phi_d(1,1,1) = 0.0
yOffset = yOffset + nyl
end do
do g = 1, nGPUs
dev = phiXtDesc%GPUs(g)
status = cudaSetDevice(dev)
if (status /= cudaSuccess) write(*,*) 'cudaSetDevice failed'
status = cudaDeviceSynchronize()
if (status /= cudaSuccess) write(*,*) 'CUF kernel sync error'
end do
end block ! poisson
! inverse FFT
write(*,*) 'Inverse transform ...'
status = cufftXtExecDescriptorC2R(planC2R, phi_desc, phi_desc)
if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtExecDescriptorC2R failed'
! D2H transfer
write(*,*) 'cufftXtMemcpy D2H ...'
status = cufftXtMemcpy(planC2R, u, phi_desc, CUFFT_COPY_DEVICE_TO_HOST)
if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtMemcpy D2H failed'
! cufft block cleanup
status = cufftXtFree(phi_desc)
if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtFree failed'
status = cufftDestroy(planR2C)
if (status /= CUFFT_SUCCESS) write(*,*) 'cufftDestroy failed'
status = cufftDestroy(planC2R)
if (status /= CUFFT_SUCCESS) write(*,*) 'cufftDestroy failed'
end block
write(*,*) 'Max error: ', maxval(abs(u(1:nx,1:ny,1:nz)-ua(1:nx,1:ny,1:nz)))
! cleanup
do i = 1, nGPUs
status = cudaSetDevice(whichGPUs(i))
deallocate(kx_da(i)%v, ky_da(i)%v, kz_da(i)%v)
enddo
write(*,*) '... finished'
end program cufftXt3DTest
Using cuFFTMp from either OpenACC or CUDA Fortran
This example demonstrates the use of the cuFFTMp API from the cuFFTXt module, the cudaLibXtDesc and cudaXtDesc types, and attaching an MPI communicator to the cuFFT plan.
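Assuming one MPI rank per GPU on a node with two GPUs (the rank count and file names are placeholders), a typical build and launch might be:
mpif90 -cuda cufftmp_r2c.F90 -cudalib=cufftmp -o cufftmp_r2c
mpirun -np 2 ./cufftmp_r2c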
Real-to-Complex and Complex-to-Real cuFFTMp Test
!
! This sample illustrates a basic use of cuFFTMp using the built-in,
! optimized data distributions.
!
! It assumes the CPU data is initially distributed according to
! CUFFT_XT_FORMAT_INPLACE, a.k.a. X-Slabs.
! Given a global array of size X * Y * Z, every MPI rank owns approximately
! (X / ngpus) * Y * Z entries.
! More precisely,
! - The first (X % ngpus) MPI ranks each own (X / ngpus + 1) planes of size Y*Z,
! - The remaining MPI ranks each own (X / ngpus) planes of size Y*Z
!
! The CPU data is then copied to the GPU and a forward transform is applied.
!
! After that transform, GPU data is distributed according to
! CUFFT_XT_FORMAT_INPLACE_SHUFFLED, a.k.a. Y-Slabs.
! Given a global array of size X * Y * Z, every MPI rank owns approximately
! X * (Y / ngpus) * Z entries.
! More precisely,
! - The first (Y % ngpus) MPI ranks each own (Y / ngpus + 1) planes of size X*Z,
! - The remaining MPI ranks each own (Y / ngpus) planes of size X*Z
!
! A scaling kernel is applied to the distributed GPU data (now distributed
! according to CUFFT_XT_FORMAT_INPLACE_SHUFFLED), normalizing the entries
! by (nx * ny * nz)
!
! Finally, a backward transform is applied.
! After this, data is again distributed according to CUFFT_XT_FORMAT_INPLACE,
! same as the input data.
!
! Data is finally copied back to CPU and compared to the input data. They
! should be almost identical.
!
! This program can be used by either OpenACC or CUDA Fortran:
! mpif90 -acc cufftmp_r2c.F90 -cudalib=cufftmp
! Or:
! mpif90 -cuda cufftmp_r2c.F90 -cudalib=cufftmp
!
module cufft_required
integer :: planr2c, planc2r
integer :: local_rshape(3), local_rshape_permuted(3)
integer :: local_permuted_cshape(3)
end module cufft_required
program cufftmp_r2c
use iso_c_binding
use cufftXt
use cufft
!@cuf use cudafor
!@acc use openacc
use mpi
use cufft_required
implicit none
integer :: size, rank, ndevices, ierr
integer :: nx, ny, nz ! nx slowest
integer :: i, j, k
integer :: my_nx, my_ny, my_nz, ranks_cutoff, whichgpu(1)
real, dimension(:, :, :), allocatable :: u, ref
complex, dimension(:,:,:), allocatable :: u_permuted
real :: max_norm, max_diff
! cufft stuff
integer(c_size_t) :: worksize(1)
type(cudaLibXtDesc), pointer :: u_desc
type(cudaXtDesc), pointer :: u_descptr
#ifdef _OPENACC
complex, pointer :: u_dptr(:,:,:)
type(c_ptr) :: tmpcptr
#else
complex, pointer, device :: u_dptr(:,:,:)
#endif
call mpi_init(ierr)
call mpi_comm_size(MPI_COMM_WORLD,size,ierr)
call mpi_comm_rank(MPI_COMM_WORLD,rank,ierr)
#ifdef _OPENACC
ndevices = acc_get_num_devices(acc_device_nvidia)
call acc_set_device_num(mod(rank, ndevices), acc_device_nvidia)
#else
call checkCuda(cudaGetDeviceCount(ndevices))
call checkCuda(cudaSetDevice(mod(rank, ndevices)))
#endif
whichgpu(1) = mod(rank, ndevices)
print*,"Hello from rank ",rank," gpu id",mod(rank,ndevices),"size",size
nx = 256
ny = nx
nz = nx
! We start with X-Slabs
! Ranks 0 ... (nx % size - 1) have 1 more element in the X dimension
! and every rank own all elements in the Y and Z dimensions.
ranks_cutoff = mod(nx, size)
my_nx = nx / size
if (rank < ranks_cutoff) my_nx = my_nx + 1
my_ny = ny;
my_nz = nz;
! Note nz is first dimension, nx third
local_rshape = [2*(nz/2+1), ny, my_nx]
local_permuted_cshape = [nz/2+1, ny/size, nx]
local_rshape_permuted = [2*(nz/2+1), ny/size, nx]
if (mod(ny, size) > 0) then
print*," ny must be evenly divisible by the number of MPI ranks"
call mpi_finalize(ierr)
stop
end if
if (rank == 0) then
write(*,*) "local_rshape :", local_rshape(1:3)
write(*,*) "local_permuted_cshape :", local_permuted_cshape(1:3)
end if
! Generate local, distributed data
allocate(u(local_rshape(1), local_rshape(2), local_rshape(3)))
allocate(u_permuted(local_permuted_cshape(1), local_permuted_cshape(2), &
local_permuted_cshape(3)))
allocate(ref(local_rshape(1), local_rshape(2), local_rshape(3)))
print*,'shape of u is ', shape(u)
print*,'shape of u_permuted is ', shape(u_permuted)
call generate_random(nz, local_rshape(1), local_rshape(2), &
local_rshape(3), u)
ref = u
u_permuted = (0.0,0.0)
call checkNorm(nz, local_rshape(1), local_rshape(2), local_rshape(3), &
u, max_norm)
print*, "initial data on ", rank, " max_norm is ", max_norm
call checkCufft(cufftCreate(planr2c))
call checkCufft(cufftCreate(planc2r))
call checkCufft(cufftMpAttachComm(planr2c, CUFFT_COMM_MPI, &
MPI_COMM_WORLD), 'cufftMpAttachComm error')
call checkCufft(cufftMpAttachComm(planc2r, CUFFT_COMM_MPI, &
MPI_COMM_WORLD), 'cufftMpAttachComm error')
! Note nx, ny, nz order
call checkCufft(cufftMakePlan3d(planr2c, nx, ny, nz, CUFFT_R2C, &
worksize), 'cufftMakePlan3d r2c error')
call checkCufft(cufftMakePlan3d(planc2r, nx, ny, nz, CUFFT_C2R, &
worksize), 'cufftMakePlan3d c2r error')
call checkCufft(cufftXtMalloc(planr2c, u_desc, &
CUFFT_XT_FORMAT_INPLACE), 'cufftXtMalloc error')
! These are equivalent, and work as well
! istat = cufftXtMemcpy(planr2c, u_desc, u, CUFFT_COPY_HOST_TO_DEVICE)
! call cufft_memcpyH2D(u_desc, u, CUFFT_XT_FORMAT_INPLACE, .false.)
call cufft_memcpyH2D(u_desc, u, CUFFT_XT_FORMAT_INPLACE, .true.)
! now reset u to make sure the check later is valid
u = 0.0
!xxxxxxxxxxxxxxxxxxxxxxxxxx Forward
call checkCufft(cufftXtExecDescriptor(planr2c, u_desc, u_desc, &
CUFFT_FORWARD),'forward fft failed')
! in case we want to check the results after Forward
!call checkCufft(cufftXtMemcpy(planr2c, u_permuted, u_desc, CUFFT_COPY_DEVICE_TO_HOST), 'permuted D2H error')
!call checkNormComplex(local_permuted_cshape(1), local_permuted_cshape(2), local_permuted_cshape(3), u_permuted, max_norm)
!write(*,'(A18, I1, A14, F25.8)') "after R2C ", rank, " max_norm is ", max_norm
! Data is now distributed as Y-Slab. We need to scale the output
call c_f_pointer(u_desc%descriptor, u_descptr)
#ifdef _OPENACC
tmpcptr = transfer(u_descptr%data(1), tmpcptr)
call c_f_pointer(tmpcptr, u_dptr, local_permuted_cshape(1:3))
call scalingData(local_permuted_cshape(1), local_permuted_cshape(2), &
local_permuted_cshape(3), u_dptr, real(nx*ny*nz))
#else
call c_f_pointer(u_descptr%data(1), u_dptr, local_permuted_cshape(1:3))
!$cuf kernel do (3)
do k =1, local_permuted_cshape(3)
do j = 1, local_permuted_cshape(2)
do i = 1, local_permuted_cshape(1)
u_dptr(i,j,k) = u_dptr(i,j,k) / real(nx*ny*nz)
end do
end do
end do
call checkCuda(cudaDeviceSynchronize())
#endif
! in case we want to check again after scaling
call checkCufft(cufftXtMemcpy(planr2c, u_permuted, u_desc, &
CUFFT_COPY_DEVICE_TO_HOST), 'permuted D2H error')
call checkNormComplex(local_permuted_cshape(1), &
local_permuted_cshape(2), local_permuted_cshape(3), &
u_permuted, max_norm)
write(*,'(A18, I1, A14, F25.8)') "after scaling ", rank, &
" max_norm is ", max_norm
!xxxxxxxxxxxxxxxxxxxxxxxxxxxx inverse
call checkCufft(cufftXtExecDescriptor(planc2r, u_desc, u_desc, &
CUFFT_INVERSE), 'inverse fft failed')
! These are equivalent, and work as well
! istat = cufftXtMemcpy(planc2r, u, u_desc, CUFFT_COPY_DEVICE_TO_HOST)
! call cufft_memcpyD2H(u, u_desc, CUFFT_XT_FORMAT_INPLACE, .false.)
call cufft_memcpyD2H(u, u_desc, CUFFT_XT_FORMAT_INPLACE, .true.)
call checkCufft(cufftXtFree(u_desc))
call checkCufft(cufftDestroy(planr2c))
call checkCufft(cufftDestroy(planc2r))
call checkNormDiff(nz, local_rshape(1), local_rshape(2), &
local_rshape(3), u, ref, max_norm, max_diff)
write(*,'(A18,I1,A14,F25.8,A14,F15.8)') "after C2R ", rank, &
" max_norm is ", max_norm, " max_diff is ", max_diff
write(*,'(A25,I1,A14,F25.8)') "Relative Linf on rank ", rank, &
" is ", max_diff/max_norm
deallocate(u)
deallocate(ref)
deallocate(u_permuted)
call mpi_finalize(ierr)
if(max_diff / max_norm > 1e-5) then
print*, ">>>> FAILED on rank ", rank
stop 1
else
print*, ">>>> PASSED on rank ", rank
end if
contains
#ifdef _CUDA
subroutine checkCuda(istat, message)
implicit none
integer, intent(in) :: istat
character(len=*),intent(in), optional :: message
if (istat /= cudaSuccess) then
write(*,"('Error code: ',I0, ': ')") istat
write(*,*) cudaGetErrorString(istat)
if(present(message)) write(*,*) message
call mpi_finalize(ierr)
endif
end subroutine checkCuda
#endif
subroutine checkCufft(istat, message)
implicit none
integer, intent(in) :: istat
character(len=*),intent(in), optional :: message
if (istat /= CUFFT_SUCCESS) then
write(*,"('Error code: ',I0, ': ')") istat
!@cuf write(*,*) cudaGetErrorString(istat)
if(present(message)) write(*,*) message
call mpi_finalize(ierr)
endif
end subroutine checkCufft
subroutine generate_random(nz1, nz, ny, nx, data)
implicit none
integer, intent(in) :: nx, ny, nz, nz1
real, dimension(nz, ny, nx), intent(out) :: data
real :: rand(1)
integer :: i,j,k
!call random_seed(put=(/seed, seed+1/))
do k =1, nx
do j = 1, ny
do i = 1, nz1
call random_number(rand)
data(i,j,k) = rand(1)
end do
end do
end do
end subroutine generate_random
subroutine checkNorm(nz1, nz, ny, nx, data, max_norm)
implicit none
integer, intent(in) :: nx, ny, nz, nz1
real, dimension(nz, ny, nx), intent(in) :: data
real :: max_norm
integer :: i, j, k
max_norm = 0
do k =1, nx
do j = 1, ny
do i = 1, nz1
max_norm = max(max_norm, abs(data(i,j,k)))
end do
end do
end do
end subroutine checkNorm
subroutine checkNormComplex(nz, ny, nx, data, max_norm)
implicit none
integer, intent(in) :: nx, ny, nz
complex, dimension(nz, ny, nx), intent(in) :: data
real :: max_norm, max_diff
integer :: i,j,k
max_norm = 0
do k =1, nx
do j = 1, ny
do i = 1, nz
max_norm = max(max_norm, abs(data(i,j,k)%re))
max_norm = max(max_norm, abs(data(i,j,k)%im))
end do
end do
end do
end subroutine checkNormComplex
subroutine checkNormDiff(nz1, nz, ny, nx, data, ref, max_norm, max_diff)
implicit none
integer, intent(in) :: nx, ny, nz, nz1
real, dimension(nz, ny, nx), intent(in) :: data, ref
real :: max_norm, max_diff
integer :: i, j, k
max_norm = 0
max_diff = 0
do k =1, nx
do j = 1, ny
do i = 1, nz1
max_norm = max(max_norm, abs(data(i,j,k)))
max_diff = max(max_diff, abs(ref(i,j,k)-data(i,j,k)))
if (abs(ref(i,j,k)-data(i,j,k)) > 0.0001) then
write(*,'(A9,I3,I3,I3,A2,F18.8,A7,F18.8,A9,I2)') "diff ref[", &
i,j,k,"]",ref(i,j,k),"data ",data(i,j,k)," at rank",rank
end if
end do
end do
end do
end subroutine checkNormDiff
#ifdef _OPENACC
subroutine scalingData(nz, ny, nx, data, factor)
implicit none
integer, intent(in) :: nx, ny, nz
complex, dimension(nz, ny, nx) :: data
!$acc declare deviceptr(data)
real, intent(in) :: factor
integer :: i, j, k
!$acc parallel loop collapse(3)
do k =1, nx
do j = 1, ny
do i = 1, nz
data(i, j, k) = data(i, j, k) / factor
end do
end do
end do
end subroutine scalingData
#endif
subroutine cufft_memcpyH2D(ulibxt, u_h, data_format, ismemcpy)
implicit none
type(cudaLibXtDesc), pointer, intent(in) :: ulibxt
real, dimension(*), intent(in) :: u_h
integer, intent(in) :: data_format
logical, intent(in) :: ismemcpy
type(cudaXtDesc), pointer :: uxt
!@cuf real, dimension(:,:,:), device, pointer :: u_d
if (ismemcpy .eqv. .false.) then
call checkCufft(cufftXtMemcpy(planc2r, ulibxt, u_h, &
CUFFT_COPY_HOST_TO_DEVICE), "cufft_memcpyHToD Error")
else
call c_f_pointer(ulibxt%descriptor, uxt)
if(data_format == CUFFT_XT_FORMAT_INPLACE_SHUFFLED) then
#ifdef _OPENACC
call acc_memcpy_to_device(uxt%data(1), u_h, &
product(int(local_rshape_permuted,kind=8))*4_8) ! bytes
#else
call c_f_pointer(uxt%data(1), u_d, local_rshape_permuted)
call checkCuda(cudaMemcpy(u_d, u_h, &
product(int(local_rshape_permuted,kind=8))), &
"cudamemcpy H2D Error")
#endif
else if (data_format == CUFFT_XT_FORMAT_INPLACE) then
#ifdef _OPENACC
call acc_memcpy_to_device(uxt%data(1), u_h, &
product(int(local_rshape,kind=8))*4_8) ! bytes
#else
call c_f_pointer(uxt%data(1), u_d, local_rshape)
call checkCuda(cudaMemcpy(u_d, u_h, &
product(int(local_rshape,kind=8))), &
"cudamemcpy H2D Error")
#endif
endif
endif
end subroutine cufft_memcpyH2D
subroutine cufft_memcpyD2H(u_h, ulibxt, data_format,ismemcpy)
implicit none
type(cudaLibXtDesc), pointer, intent(in) :: ulibxt
real, dimension(*), intent(out) :: u_h
integer, intent(in) :: data_format
logical, intent(in) :: ismemcpy
type(cudaXtDesc), pointer :: uxt
!@cuf real, dimension(:,:,:), device, pointer :: u_d
if (ismemcpy .eqv. .false.) then
call checkCufft(cufftXtMemcpy(planr2c, u_h, ulibxt, &
CUFFT_COPY_DEVICE_TO_HOST), "cufft_memcpyDToH Error")
else
call c_f_pointer(ulibxt%descriptor, uxt)
if(data_format == CUFFT_XT_FORMAT_INPLACE_SHUFFLED) then
#ifdef _OPENACC
call acc_memcpy_from_device(u_h, uxt%data(1), &
product(int(local_rshape_permuted,kind=8))*4_8) ! bytes
#else
call c_f_pointer(uxt%data(1), u_d, local_rshape_permuted)
call checkCuda(cudaMemcpy(u_h, u_d, &
product(int(local_rshape_permuted,kind=8))), &
"cudamemcpy D2H Error")
#endif
else if (data_format == CUFFT_XT_FORMAT_INPLACE) then
#ifdef _OPENACC
call acc_memcpy_from_device(u_h, uxt%data(1), &
product(int(local_rshape,kind=8))*4_8) ! bytes
#else
call c_f_pointer(uxt%data(1), u_d, local_rshape)
call checkCuda(cudaMemcpy(u_h, u_d, &
product(int(local_rshape,kind=8))), &
"cudamemcpy D2H Error")
#endif
endif
endif
end subroutine cufft_memcpyD2H
end program cufftmp_r2c
Using cuRAND from OpenACC Host Code
This example demonstrates the use of the curand module, the curandGenerator type, and several forms of cuRAND calls.
Simple cuRAND Tests
program testcurand
! compile with the flags -ta=tesla -cuda -cudalib=curand
call cur1(1000, .true.); call cur1(1000, .false.)
call cur2(1000, .true.); call cur2(1000, .false.)
call cur3(1000, .true.); call cur3(1000, .false.)
end
!
subroutine cur1(n, onhost)
use curand
integer :: a(n)
type(curandGenerator) :: g
integer(8) nbits
logical onhost, passing
a = 0
passing = .true.
if (onhost) then
istat = curandCreateGeneratorHost(g,CURAND_RNG_PSEUDO_XORWOW)
istat = curandGenerate(g, a, n)
istat = curandDestroyGenerator(g)
else
!$acc data copy(a)
istat = curandCreateGenerator(g,CURAND_RNG_PSEUDO_XORWOW)
!$acc host_data use_device(a)
istat = curandGenerate(g, a, n)
!$acc end host_data
istat = curandDestroyGenerator(g)
!$acc end data
endif
nbits = 0
do i = 1, n
if (i.lt.10) print *,i,a(i)
nbits = nbits + popcnt(a(i))
end do
print *,"Should be roughly half the bits set"
nbits = nbits / n
if ((nbits .lt. 12) .or. (nbits .gt. 20)) then
passing = .false.
else
print *,"nbits is ",nbits," which passes"
endif
if (passing) then
print *,"Test PASSED"
else
print *,"Test FAILED"
endif
end
!
subroutine cur2(n, onhost)
use curand
real :: a(n)
type(curandGenerator) :: g
logical onhost, passing
a = 0.0
passing = .true.
if (onhost) then
istat = curandCreateGeneratorHost(g,CURAND_RNG_PSEUDO_XORWOW)
istat = curandGenerate(g, a, n)
istat = curandDestroyGenerator(g)
else
!$acc data copy(a)
istat = curandCreateGenerator(g,CURAND_RNG_PSEUDO_XORWOW)
!$acc host_data use_device(a)
istat = curandGenerate(g, a, n)
!$acc end host_data
istat = curandDestroyGenerator(g)
!$acc end data
endif
print *,"Should be uniform around 0.5"
do i = 1, n
if (i.lt.10) print *,i,a(i)
if ((a(i).lt.0.0) .or. (a(i).gt.1.0)) passing = .false.
end do
rmean = sum(a)/n
if ((rmean .lt. 0.4) .or. (rmean .gt. 0.6)) then
passing = .false.
else
print *,"Mean is ",rmean," which passes"
endif
if (passing) then
print *,"Test PASSED"
else
print *,"Test FAILED"
endif
end
!
subroutine cur3(n, onhost)
use curand
real(8) :: a(n)
type(curandGenerator) :: g
logical onhost, passing
a = 0.0d0
passing = .true.
if (onhost) then
istat = curandCreateGeneratorHost(g,CURAND_RNG_PSEUDO_XORWOW)
istat = curandGenerate(g, a, n)
istat = curandDestroyGenerator(g)
else
!$acc data copy(a)
istat = curandCreateGenerator(g,CURAND_RNG_PSEUDO_XORWOW)
!$acc host_data use_device(a)
istat = curandGenerate(g, a, n)
!$acc end host_data
istat = curandDestroyGenerator(g)
!$acc end data
endif
do i = 1, n
if (i.lt.10) print *,i,a(i)
if ((a(i).lt.0.0d0) .or. (a(i).gt.1.0d0)) passing = .false.
end do
rmean = sum(a)/n
if ((rmean .lt. 0.4d0) .or. (rmean .gt. 0.6d0)) then
passing = .false.
else
print *,"Mean is ",rmean," which passes"
endif
if (passing) then
print *,"Test PASSED"
else
print *,"Test FAILED"
endif
end
Using cuRAND from OpenACC Device Code
This example demonstrates the use of the openacc_curand module from within OpenACC compute regions.
Simple cuRAND Test from OpenACC Device Code
module mtests
integer, parameter :: n = 1000
contains
subroutine testrand( a, b )
use openacc_curand
real :: a(n), b(n)
type(curandStateXORWOW) :: h
integer(8) :: seed, seq, offset
!$acc parallel num_gangs(1) vector_length(1) copy(a,b) private(h)
seed = 12345
seq = 0
offset = 0
call curand_init(seed, seq, offset, h)
!$acc loop seq
do i = 1, n
a(i) = curand_uniform(h)
b(i) = curand_normal(h)
end do
!$acc end parallel
return
end subroutine
end module mtests
program t
use mtests
real :: a(n), b(n), c(n)
logical passing
a = 1.0
b = 2.0
passing = .true.
call testrand(a,b)
c = a
print *,"Should be uniform around 0.5"
do i = 1, n
if (i.lt.10) print *,i,c(i)
if ((c(i).lt.0.0) .or. (c(i).gt.1.0)) passing = .false.
end do
rmean = sum(c)/n
if ((rmean .lt. 0.4) .or. (rmean .gt. 0.6)) then
passing = .false.
else
print *,"Mean is ",rmean," which passes"
endif
c = b
print *,"Should be normal around 0.0"
nc1 = 0;
nc2 = 0;
do i = 1, n
if (i.lt.10) print *,i,c(i)
if ((c(i) .gt. -4.0) .and. (c(i) .lt. 0.0)) nc1 = nc1 + 1
if ((c(i) .gt. 0.0) .and. (c(i) .lt. 4.0)) nc2 = nc2 + 1
end do
print *,"Found on each side of zero ",nc1,nc2
if (abs(nc1-nc2) .gt. (n/10)) passing = .false.
rmean = sum(c,mask=abs(c).lt.4.0)/n
if ((rmean .lt. -0.1) .or. (rmean .gt. 0.1)) then
passing = .false.
else
print *,"Mean is ",rmean," which passes"
endif
if (passing) then
print *,"Test PASSED"
else
print *,"Test FAILED"
endif
end program
Using cuRAND from CUDA Fortran Host Code
This example demonstrates the use of the curand module, the curandGenerator type, and several forms of cuRAND calls.
Simple cuRAND Test
program testcurand1
call testr1(1000)
call testr2(1000)
call testr3(1000)
end
!
subroutine testr1(n)
use cudafor
use curand
integer, managed :: a(n)
type(curandGenerator) :: g
integer(8) nbits
logical passing
a = 0
passing = .true.
istat = curandCreateGenerator(g,CURAND_RNG_PSEUDO_XORWOW)
istat = curandGenerate(g, a, n)
istat = cudaDeviceSynchronize()
istat = curandDestroyGenerator(g)
nbits = 0
do i = 1, n
if (i.lt.10) print *,i,a(i)
nbits = nbits + popcnt(a(i))
end do
print *,"Should be roughly half the bits set"
nbits = nbits / n
if ((nbits .lt. 12) .or. (nbits .gt. 20)) then
passing = .false.
else
print *,"nbits is ",nbits," which passes"
endif
if (passing) then
print *,"Test PASSED"
else
print *,"Test FAILED"
endif
end
!
subroutine testr2(n)
use cudafor
use curand
real, managed :: a(n)
type(curandGenerator) :: g
logical passing
a = 0.0
passing = .true.
istat = curandCreateGenerator(g,CURAND_RNG_PSEUDO_XORWOW)
istat = curandGenerate(g, a, n)
istat = cudaDeviceSynchronize()
istat = curandDestroyGenerator(g)
print *,"Should be uniform around 0.5"
do i = 1, n
if (i.lt.10) print *,i,a(i)
if ((a(i).lt.0.0) .or. (a(i).gt.1.0)) passing = .false.
end do
rmean = sum(a)/n
if ((rmean .lt. 0.4) .or. (rmean .gt. 0.6)) then
passing = .false.
else
print *,"Mean is ",rmean," which passes"
endif
if (passing) then
print *,"Test PASSED"
else
print *,"Test FAILED"
endif
end
!
subroutine testr3(n)
use cudafor
use curand
real(8), managed :: a(n)
type(curandGenerator) :: g
logical passing
a = 0.0d0
passing = .true.
istat = curandCreateGenerator(g,CURAND_RNG_PSEUDO_XORWOW)
istat = curandGenerate(g, a, n)
istat = cudaDeviceSynchronize()
istat = curandDestroyGenerator(g)
do i = 1, n
if (i.lt.10) print *,i,a(i)
if ((a(i).lt.0.0d0) .or. (a(i).gt.1.0d0)) passing = .false.
end do
rmean = sum(a)/n
if ((rmean .lt. 0.4d0) .or. (rmean .gt. 0.6d0)) then
passing = .false.
else
print *,"Mean is ",rmean," which passes"
endif
if (passing) then
print *,"Test PASSED"
else
print *,"Test FAILED"
endif
end
Using cuRAND from CUDA Fortran Device Code
This example demonstrates the use of the curand_device module from a CUDA Fortran global subroutine.
Simple cuRAND Test from Device Code
module mtests
use curand_device
integer, parameter :: n = 10000
contains
attributes(global) subroutine testr( a, b )
real, device :: a(n), b(n)
type(curandStateXORWOW) :: h
integer(8) :: seed, seq, offset
integer :: iam
iam = threadIdx%x
seed = iam*64 + 12345
seq = 0
offset = 0
call curand_init(seed, seq, offset, h)
do i = iam, n, blockdim%x
a(i) = curand_uniform(h)
b(i) = curand_normal(h)
end do
return
end subroutine
end module mtests
program t
use mtests
real, allocatable, device :: a(:), b(:)
real c(n), rmean
logical passing
allocate(a(n))
allocate(b(n))
a = 0.0
b = 0.0
passing = .true.
call testr<<<1,32>>> (a,b)
c = a
print *,"Should be uniform around 0.5"
do i = 1, n
if (i.lt.10) print *,i,c(i)
if ((c(i).lt.0.0) .or. (c(i).gt.1.0)) passing = .false.
end do
rmean = sum(c)/n
if ((rmean .lt. 0.4) .or. (rmean .gt. 0.6)) then
passing = .false.
else
print *,"Mean is ",rmean," which passes"
endif
c = b
print *,"Should be normal around 0.0"
nc1 = 0;
nc2 = 0;
do i = 1, n
if (i.lt.10) print *,i,c(i)
if ((c(i) .gt. -4.0) .and. (c(i) .lt. 0.0)) nc1 = nc1 + 1
if ((c(i) .gt. 0.0) .and. (c(i) .lt. 4.0)) nc2 = nc2 + 1
end do
print *,"Found on each side of zero ",nc1,nc2
if (abs(nc1-nc2) .gt. (n/10)) passing = .false.
rmean = sum(c,mask=abs(c).lt.4.0)/n
if ((rmean .lt. -0.1) .or. (rmean .gt. 0.1)) then
passing = .false.
else
print *,"Mean is ",rmean," which passes"
endif
if (passing) then
print *,"Test PASSED"
else
print *,"Test FAILED"
endif
end
Using cuSPARSE from OpenACC Host Code
This example demonstrates the use of the cuSPARSE module, the cusparseHandle type, and several calls to the cuSPARSE library.
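The example builds a dense upper circular shift matrix and converts it to CSR (compressed sparse row) format before performing the matrix-vector products. As a small worked illustration of the format: for n = 4, each row contains a single 1.0 (row i in column i+1, and row n in column 1), so with the one-based indexing selected below the CSR arrays would be csrValA = [1.0, 1.0, 1.0, 1.0], csrColIndA = [2, 3, 4, 1], and csrRowPtrA = [1, 2, 3, 4, 5].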
Simple cuSPARSE Test
program sparseMatVec
integer n
n = 25 ! # rows/cols in dense matrix
call sparseMatVecSub1(n)
n = 45 ! # rows/cols in dense matrix
call sparseMatVecSub1(n)
end program
subroutine sparseMatVecSub1(n)
use openacc
use cusparse
implicit none
integer n
! dense data
real(4), allocatable :: Ade(:,:), x(:), y(:)
! sparse CSR arrays
real(4), allocatable :: csrValA(:)
integer, allocatable :: nnzPerRowA(:), csrRowPtrA(:), csrColIndA(:)
allocate(Ade(n,n), x(n), y(n))
allocate(csrValA(n))
allocate(nnzPerRowA(n), csrRowPtrA(n+1), csrColIndA(n))
call sparseMatVecSub2(Ade, x, y, csrValA, nnzPerRowA, csrRowPtrA, &
csrColIndA, n)
deallocate(Ade)
deallocate(x)
deallocate(y)
deallocate(csrValA)
deallocate(nnzPerRowA)
deallocate(csrRowPtrA)
deallocate(csrColIndA)
end subroutine
subroutine sparseMatVecSub2(Ade, x, y, csrValA, nnzPerRowA, csrRowPtrA, &
csrColIndA, n)
use openacc
use cusparse
implicit none
! dense data
real(4) :: Ade(n,n), x(n), y(n)
! sparse CSR arrays
real(4) :: csrValA(n)
integer :: nnzPerRowA(n), csrRowPtrA(n+1), csrColIndA(n)
integer :: n, nnz, status, i
type(cusparseHandle) :: h
type(cusparseMatDescr) :: descrA
! parameters
real(4) :: alpha, beta
! result
real(4) :: xerr
! initialize CUSPARSE and matrix descriptor
status = cusparseCreate(h)
if (status /= CUSPARSE_STATUS_SUCCESS) &
write(*,*) 'cusparseCreate error: ', status
status = cusparseCreateMatDescr(descrA)
status = cusparseSetMatType(descrA, &
CUSPARSE_MATRIX_TYPE_GENERAL)
status = cusparseSetMatIndexBase(descrA, &
CUSPARSE_INDEX_BASE_ONE)
status = cusparseSetStream(h, acc_get_cuda_stream(acc_async_sync))
!$acc data create(Ade, x, y, csrValA, nnzPerRowA, csrRowPtrA, csrColIndA)
! Initialize matrix (upper circular shift matrix)
!$acc kernels
Ade = 0.0
do i = 1, n-1
Ade(i,i+1) = 1.0
end do
Ade(n,1) = 1.0
! Initialize vectors and constants
do i = 1, n
x(i) = i
enddo
y = 0.0
!$acc end kernels
!$acc update host(x)
write(*,*) 'Original vector:'
write(*,'(5(1x,f7.2))') x
! convert matrix from dense to csr format
!$acc host_data use_device(Ade, nnzPerRowA, csrValA, csrRowPtrA, csrColIndA)
status = cusparseSnnz_v2(h, CUSPARSE_DIRECTION_ROW, &
n, n, descrA, Ade, n, nnzPerRowA, nnz)
status = cusparseSdense2csr(h, n, n, descrA, Ade, n, &
nnzPerRowA, csrValA, csrRowPtrA, csrColIndA)
!$acc end host_data
! A is upper circular shift matrix
! y = alpha*A*x + beta*y
alpha = 1.0
beta = 0.0
!$acc host_data use_device(csrValA, csrRowPtrA, csrColIndA, x, y)
status = cusparseScsrmv(h, CUSPARSE_OPERATION_NON_TRANSPOSE, &
n, n, n, alpha, descrA, csrValA, csrRowPtrA, &
csrColIndA, x, beta, y)
!$acc end host_data
!$acc wait
write(*,*) 'Shifted vector:'
write(*,'(5(1x,f7.2))') y
! shift-down y and add original x
! A' is lower circular shift matrix
! x = alpha*A'*y + beta*x
beta = -1.0
!$acc host_data use_device(csrValA, csrRowPtrA, csrColIndA, x, y)
status = cusparseScsrmv(h, CUSPARSE_OPERATION_TRANSPOSE, &
n, n, n, alpha, descrA, csrValA, csrRowPtrA, &
csrColIndA, y, beta, x)
!$acc end host_data
!$acc kernels
xerr = maxval(abs(x))
!$acc end kernels
!$acc end data
write(*,*) 'Max error = ', xerr
if (xerr.le.1.e-5) then
write(*,*) 'Test PASSED'
else
write(*,*) 'Test FAILED'
endif
end subroutine
Using cuSPARSE from CUDA Fortran Host Code
This example demonstrates the use of the cuSPARSE module, the cusparseHandle type, and several forms of cuSPARSE calls.
Simple cuSPARSE Test
program sparseMatVec
use cudafor
use cusparse
implicit none
integer, parameter :: n = 25 ! # rows/cols in dense matrix
type(cusparseHandle) :: h
type(cusparseMatDescr) :: descrA
! dense data
real(4), managed :: Ade(n,n), x(n), y(n)
! sparse CSR arrays
real(4), managed :: csrValA(n)
integer, managed :: nnzPerRowA(n), &
csrRowPtrA(n+1), csrColIndA(n)
integer :: nnz, status, i
! parameters
real(4) :: alpha, beta
! initialize CUSPARSE and matrix descriptor
status = cusparseCreate(h)
if (status /= CUSPARSE_STATUS_SUCCESS) &
write(*,*) 'cusparseCreate error: ', status
status = cusparseCreateMatDescr(descrA)
status = cusparseSetMatType(descrA, &
CUSPARSE_MATRIX_TYPE_GENERAL)
status = cusparseSetMatIndexBase(descrA, &
CUSPARSE_INDEX_BASE_ONE)
! Initialize matrix (upper circular shift matrix)
Ade = 0.0
do i = 1, n-1
Ade(i,i+1) = 1.0
end do
Ade(n,1) = 1.0
! Initialize vectors and constants
x = [(i,i=1,n)]
y = 0.0
write(*,*) 'Original vector:'
write(*,'(5(1x,f7.2))') x
! convert matrix from dense to csr format
status = cusparseSnnz_v2(h, CUSPARSE_DIRECTION_ROW, &
n, n, descrA, Ade, n, nnzPerRowA, nnz)
status = cusparseSdense2csr(h, n, n, descrA, Ade, n, &
nnzPerRowA, csrValA, csrRowPtrA, csrColIndA)
! A is upper circular shift matrix
! y = alpha*A*x + beta*y
alpha = 1.0
beta = 0.0
status = cusparseScsrmv(h, CUSPARSE_OPERATION_NON_TRANSPOSE, &
n, n, n, alpha, descrA, csrValA, csrRowPtrA, &
csrColIndA, x, beta, y)
! shift-down y and add original x
! A' is lower circular shift matrix
! x = alpha*A'*y + beta*x
beta = -1.0
status = cusparseScsrmv(h, CUSPARSE_OPERATION_TRANSPOSE, &
n, n, n, alpha, descrA, csrValA, csrRowPtrA, &
csrColIndA, y, beta, x)
status = cudaDeviceSynchronize()
write(*,*) 'Shifted vector:'
write(*,'(5(1x,f7.2))') y
write(*,*) 'Max error = ', maxval(abs(x))
if (maxval(abs(x)).le.1.e-5) then
write(*,*) 'Test PASSED'
else
write(*,*) 'Test FAILED'
endif
end program sparseMatVec
Using cuTENSOR from CUDA Fortran Host Code
This example demonstrates the use of the low-level cuTENSOR module, version 2, from CUDA Fortran to perform a reshape permutation and a sum reduction across one dimension.
This example can be compiled and linked as a normal CUDA Fortran subprogram by adding the “-cudalib=cutensor” option to the link line.
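For example, assuming the source is saved as testcutensor.cuf (the file and executable names are arbitrary):
pgfortran testcutensor.cuf -cudalib=cutensor -o testcutensor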
Simple cuTENSOR Test from CUDA Fortran
program testcutensorcuf1
use cudafor
use cutensor
integer, parameter :: ndim = 3
real, managed, allocatable :: dA(:,:,:), dC(:,:)
real, allocatable :: hA(:,:,:), hC(:,:)
real, device, allocatable :: workbuf(:)
real :: alpha, beta
integer(4) :: numModesA, numModesC
integer(8), dimension(ndim) :: extA, strA, extC, strC
integer(4), dimension(ndim) :: modeA, modeC
integer(8) :: workbufsize
type(cutensorStatus) :: cstat
type(cutensorHandle) :: handle
type(cutensorTensorDescriptor) :: descA, descC
type(cutensorOperationDescriptor) :: rdesc
type(cutensorComputeDescriptor) :: descCompute
type(cutensorPlan) :: plan
type(cutensorAlgo) :: algo
type(cutensorPlanPreference) :: pref
! Init
cstat = cutensorCreate(handle)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
descCompute = CUTENSOR_COMPUTE_DESC_32F
allocate(dA(100,60,80))
! This is the operation we are going to perform
! dC = sum(reshape(dA,shape=[80,60,100],order=[3,2,1]),dim=2)
!
call random_number(dA); dA = real(int(dA * 10.0))
extA = shape(dA)
strA(1) = 1; strA(2) = 100; strA(3) = 6000
modeA(1) = 3; modeA(2) = 2; modeA(3) = 1
numModesA = ndim
ialign = 256
print *,"Desc A"
cstat = cutensorCreateTensorDescriptor(handle, descA, numModesA, extA, strA, &
CUTENSOR_R_32F, ialign)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
allocate(dC(80,100))
extC(1:ndim-1) = shape(dC)
strC(1) = 1; strC(2) = 80
numModesC = ndim-1
dC = 0.0
modeC(1) = 1; modeC(2) = 3 ! Skip mode 2 for reduction across that dim
print *,"Desc C"
cstat = cutensorCreateTensorDescriptor(handle, descC, numModesC, extC, strC, &
CUTENSOR_R_32F, ialign)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
print *,"CreateRed "
cstat = cutensorCreateReduction(handle, rdesc, &
descA, modeA, CUTENSOR_OP_IDENTITY, &
descC, modeC, CUTENSOR_OP_IDENTITY, &
descC, modeC, CUTENSOR_OP_ADD, descCompute)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
print *,"CreatePlanPref "
cstat = cutensorCreatePlanPreference(handle, pref, CUTENSOR_ALGO_DEFAULT, &
CUTENSOR_JIT_MODE_NONE)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
print *,"EstimateWork "
cstat = cutensorEstimateWorkspaceSize(handle, rdesc, pref, &
CUTENSOR_WORKSPACE_DEFAULT, workbufsize)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
print *,"Estimated workspace size: ",workbufsize
print *,"CreatePlan "
cstat = cutensorCreatePlan(handle, plan, rdesc, pref, workbufsize)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
allocate(workbuf((workbufsize+3)/4))
alpha = 1.0; beta = 0.0
print *,"Reduce "
cstat = cutensorReduce(handle, plan, alpha, dA, beta, dC, &
dC, workbuf, workbufsize, 0)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
hA = dA
hC = sum(reshape(hA,[80,60,100],order=[3,2,1]), dim=2)
istat = cudaDeviceSynchronize() ! Managed memory, to be sure
if (all(hC.eq.dC)) then
print *,"test PASSED"
else
print *,"test FAILED"
end if
cstat = cutensorDestroy(handle)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
cstat = cutensorDestroyPlan(plan)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
cstat = cutensorDestroyPlanPreference(pref)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
cstat = cutensorDestroyOperationDescriptor(rdesc)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
cstat = cutensorDestroyTensorDescriptor(descA)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
cstat = cutensorDestroyTensorDescriptor(descC)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
deallocate(workbuf)
deallocate(dA, dC)
deallocate(hA, hC)
end program
Using cuTENSOREX from CUDA Fortran Host Code
This example demonstrates the use of the higher-level cuTENSOREX module from CUDA Fortran to perform a large matrix multiplication using multiple OpenMP threads.
This example can be compiled and linked as a normal CUDA Fortran subprogram by adding the “-mp -cudalib=cutensor” options to the compile and link line.
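For example, assuming the source is saved as testcutensorex.cuf (the file and executable names are arbitrary):
pgfortran -mp -cudalib=cutensor testcutensorex.cuf -o testcutensorex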
Multi-threaded cuTENSOREX Example from CUDA Fortran
! Test cuTensor + cuda Fortran + OMP multi-stream matmul
program testCuCufMsMatmul
use cudafor
use cutensorex
integer, parameter :: m=8192, k=1280, n=1024
integer, parameter :: mblksize = 128
integer, parameter :: mslices = m / mblksize
integer, parameter :: nstreams = 4
integer, parameter :: numtimes = mslices / nstreams
real(8), allocatable, dimension(:,:), device :: a_d, d_d
real(8), allocatable, dimension(:,:,:), pinned :: ha
real(8), dimension(k,n), device :: b_d
real(8), allocatable, dimension(:,:), pinned :: d
real(8) :: alpha
integer(kind=cuda_stream_kind) :: mystream
!$OMP THREADPRIVATE(a_d, d_d, mystream)
allocate( ha(k,mblksize,nstreams))
allocate( d(1:m,1:n))
b_d = 1.0d0
alpha = 1.0d0
!$OMP PARALLEL NUM_THREADS(nstreams) PRIVATE(istat)
istat = cudaStreamCreate(mystream)
istat = cudaforSetDefaultStream(mystream)
istat = cutensorExSetStream(mystream)
! At this point, all new allocations will pick up default stream
allocate(a_d(k,mblksize))
allocate(d_d(mblksize,n))
!$OMP END PARALLEL
! Test matmul
!$OMP PARALLEL DO NUM_THREADS(nstreams) PRIVATE(jlcl,jgbl,jend)
do ns = 1, nstreams
do nt = 1, numtimes
jgbl = 1 + ((ns-1) + (nt-1)*nstreams)*mblksize
jend = jgbl + mblksize - 1
! Do some host work
do jlcl = 1, mblksize
ha(:,jlcl,ns) = dble(jlcl+jgbl-1)
end do
! Move data to the device on default stream
a_d = ha(:,:,ns)
! Matrix multiply on my thread cutensorEx stream
d_d = alpha * matmul(transpose(a_d),b_d)
! Move result back to host on default stream
d(jgbl:jend,:) = d_d
end do
end do
! Wait for all threads to finish GPU work
istat = cudaDeviceSynchronize()
nfailed = 0
do j = 1, n
do i = 1, m
if (d(i,j) .ne. i*k) then
if (nfailed .lt. 100) print *,i,j,d(i,j)
nfailed = nfailed + 1
end if
end do
end do
if (nfailed .eq. 0) then
print *,"test PASSED"
else
print *,"test FAILED"
endif
end program
Using cuTENSOR from OpenACC Host Code
This example demonstrates the use of the cuTENSOREX module, calling Matmul() using OpenACC device data, and setting the cuTENSOR library stream to be consistent with the OpenACC default stream.
This example can be compiled and run with or without OpenACC. To compile with OpenACC, the options are “-ta=tesla -cuda -cudalib=cutensor”. To run on the CPU, leave off those options.
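For example, assuming the source is saved as testcutensoracc.f90 (the file and executable names are arbitrary), the first command below builds for the GPU and the second builds the same source for the CPU:
pgfortran -ta=tesla -cuda -cudalib=cutensor testcutensoracc.f90 -o testcutensoracc
pgfortran testcutensoracc.f90 -o testcutensoracc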
Simple cuTENSOREX Test from OpenACC
program testcutensorOpenACC
!@acc use openacc
!@acc use cutensorex
integer, parameter :: ni=1280, nj=1024, nk=960, ntimes=1
real(8) :: a(ni,nk), b(nk,nj), c(ni,nj), d(ni,nj)
call random_number(a)
call random_number(b)
a = dble(int(4.0d0*a - 2.0d0))
b = dble(int(8.0d0*b - 4.0d0))
c = 2.0; d = 0.0
!$acc enter data copyin(a,b,c) create(d)
!@acc istat = cutensorExSetStream(acc_get_cuda_stream(acc_async_sync))
!$acc host_data use_device(a,b,c,d)
do nt = 1, ntimes
d = c + matmul(a,b)
end do
!$acc end host_data
!$acc update host(d)
print *,sum(d)
do nt = 1, ntimes
!$acc kernels
do j = 1, nj
do i = 1, ni
d(i,j) = c(i,j)
do k = 1, nk
d(i,j) = d(i,j) + a(i,k) * b(k,j)
end do
end do
end do
!$acc end kernels
end do
!$acc exit data copyout(d)
print *,sum(d)
end program