Examples

This section contains examples with source code.

Using cuBLAS from OpenACC Host Code

This example demonstrates the use of the cublas module, the cublasHandle type, and several forms of BLAS calls made from OpenACC data regions.

Simple OpenACC BLAS Test

program testcublas
! compile with pgfortran -ta=tesla -cudalib=cublas -cuda testcublas.f90
call testcu1(1000)
call testcu2(1000)
end
!
subroutine testcu1(n)
use openacc
use cublas
integer :: a(n), b(n)
type(cublasHandle) :: h
istat = cublasCreate(h)
! Force OpenACC kernels and cuBLAS to use the OpenACC stream.
istat = cublasSetStream(h, acc_get_cuda_stream(acc_async_sync))
!$acc data copyout(a, b)
!$acc kernels
a = 1
b = 2
!$acc end kernels
! No host_data, we are lexically inside a data region
! sswap will accept any kind(4) data type
call sswap(n, a, 1, b, 1)
call cublasSswap(n, a, 1, b, 1)
!$acc end data
if (all(a.eq.1).and.all(b.eq.2)) then
  print *,"Test PASSED"
else
  print *,"Test FAILED"
endif
end
!
subroutine testcu2(n)
use openacc
use cublas
real(8) :: a(n), b(n)
a = 1.0d0
b = 2.0d0
!$acc data copy(a, b)
!$acc host_data use_device(a,b)
call dswap(n, a, 1, b, 1)
call cublasDswap(n, a, 1, b, 1)
!$acc end host_data
!$acc end data
if (all(a.eq.1.0d0).and.all(b.eq.2.0d0)) then
  print *,"Test PASSED"
else
  print *,"Test FAILED"
endif
end
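
For completeness, the same swap can also be written against the v2 interfaces, which take the cublasHandle argument explicitly and return a status. The subroutine below is a minimal sketch along the lines of testcu2, assuming the cublasSswap_v2 interface provided by the cublas module.

subroutine testcu3(n)
use openacc
use cublas
real(4) :: a(n), b(n)
type(cublasHandle) :: h
istat = cublasCreate(h)
! Keep cuBLAS on the OpenACC synchronous stream, as in testcu1
istat = cublasSetStream(h, acc_get_cuda_stream(acc_async_sync))
a = 1.0
b = 2.0
!$acc data copy(a, b)
!$acc host_data use_device(a, b)
! v2 interface: handle argument first, status returned
istat = cublasSswap_v2(h, n, a, 1, b, 1)
!$acc end host_data
!$acc end data
istat = cublasDestroy(h)
! One swap, so a and b should have exchanged values
if (all(a.eq.2.0).and.all(b.eq.1.0)) then
  print *,"Test PASSED"
else
  print *,"Test FAILED"
endif
end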

CUBLASXT BLAS Test

This example demonstrates the use of the cublasxt module.

program testcublasxt
call testxt1(1000)
call testxt2(1000)
end
!
subroutine testxt1(n)
use cublasxt
real(4) :: a(n,n), b(n,n), c(n,n), alpha, beta
type(cublasXtHandle) :: h
integer ndevices(1)
a = 1.0
b = 2.0
c = -1.0
alpha = 1.0
beta = 0.0
istat = cublasXtCreate(h)
if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat
ndevices(1) = 0
istat = cublasXtDeviceSelect(h, 1, ndevices)
if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat
istat = cublasXtSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, &
                      n, n, n, &
                      alpha, A, n, B, n, beta, C, n)
if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat
istat = cublasXtDestroy(h)
if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat
if (all(c.eq.2.0*n)) then
  print *,"Test PASSED"
else
  print *,"Test FAILED"
endif
print *,c(1,1),c(n,n)
end
!
subroutine testxt2(n)
use cublasxt
real(8) :: a(n,n), b(n,n), c(n,n), alpha, beta
type(cublasXtHandle) :: h
integer ndevices(1)
a = 1.0d0
b = 2.0d0
c = -1.0d0
alpha = 1.0d0
beta = 0.0d0
istat = cublasXtCreate(h)
if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat
ndevices(1) = 0
istat = cublasXtDeviceSelect(h, 1, ndevices)
if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat
istat = cublasXtDgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, &
                      n, n, n, &
                      alpha, A, n, B, n, beta, C, n)
if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat
istat = cublasXtDestroy(h)
if (istat .ne. CUBLAS_STATUS_SUCCESS) print *,istat
if (all(c.eq.2.0d0*n)) then
  print *,"Test PASSED"
else
  print *,"Test FAILED"
endif
print *,c(1,1),c(n,n)
end

Using cuBLAS from CUDA Fortran Host Code

This example demonstrates the use of the cublas module, the cublasHandle type, and several forms of BLAS calls.

Simple BLAS Test

program testisamax
! Compile with "pgfortran testisamax.cuf -cudalib=cublas -lblas"
! Use the NVIDIA cudafor and cublas modules
use cudafor
use cublas
!
real*4, device, allocatable :: xd(:)
real*4 x(1000)
integer, device :: kd
type(cublasHandle) :: h

call random_number(x)

! F90 way
i = maxloc(x,dim=1)
print *,i
print *,x(i-1),x(i),x(i+1)

! Host way
j = isamax(1000,x,1)
print *,j
print *,x(j-1),x(j),x(j+1)

! CUDA Generic BLAS way
allocate(xd(1000))
xd = x
k = isamax(1000,xd,1)
print *,k
print *,x(k-1),x(k),x(k+1)

! CUDA Specific BLAS way
k = cublasIsamax(1000,xd,1)
print *,k
print *,x(k-1),x(k),x(k+1)

! CUDA V2 Host Specific BLAS way
istat = cublasCreate(h)
if (istat .ne. 0) print *,"cublasCreate returned ",istat
k = 0
istat = cublasIsamax_v2(h, 1000, xd, 1, k)
if (istat .ne. 0) print *,"cublasIsamax 1 returned ",istat
print *,k
print *,x(k-1),x(k),x(k+1)

! CUDA V2 Device Specific BLAS way
k = 0
istat = cublasIsamax_v2(h, 1000, xd, 1, kd)
if (istat .ne. 0) print *,"cublasIsamax 2 returned ",istat
k = kd
print *,k
print *,x(k-1),x(k),x(k+1)

istat = cublasDestroy(h)
if (istat .ne. 0) print *,"cublasDestroy returned ",istat

end program

Multi-threaded BLAS Test

This example demonstrates the use of the cublas module in a multi-threaded code. Each thread attaches to a different GPU and creates its own context; the partial results are combined at the end.

program tsgemm
!
! Multi-threaded example calling sgemm from cuda fortran.
! Compile with "pgfortran -mp tsgemm.cuf -cudalib=cublas"
! Set OMP_NUM_THREADS=number of GPUs in your system.
!
use cublas
use cudafor
use omp_lib
!
! Size this according to number of GPUs
!
! Small
!integer, parameter :: K = 2500
!integer, parameter :: M = 2000
!integer, parameter :: N = 2000
! Large
integer, parameter :: K = 10000
integer, parameter :: M = 10000
integer, parameter :: N = 10000
integer, parameter :: NTIMES = 10
!
real*4, device, allocatable :: a_d(:,:), b_d(:,:), c_d(:,:)
!$omp THREADPRIVATE(a_d,b_d,c_d)
real*4 a(m,k), b(k,n), c(m,n)
real*4 alpha, beta
integer, allocatable :: offs(:)
type(cudaEvent) :: start, stop

a = 1.0; b = 0.0; c = 0.0
do i = 1, N
  b(i,i) = 1.0
end do
alpha = 1.0; beta  = 1.0

! Break up the B and C array into sections
nthr = omp_get_max_threads()
nsec = N / nthr
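! Note: this assumes N divides evenly by the number of threads; any
! remainder columns would be left uncomputed.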
print *,"Running with ",nthr," threads, each section = ",nsec
allocate(offs(nthr))
offs = (/ (i*nsec,i=0,nthr-1) /)

! Allocate and initialize the arrays
! Each thread connects to a device and creates a CUDA context.
!$omp PARALLEL private(i,istat)
    i = omp_get_thread_num() + 1
    istat = cudaSetDevice(i-1)
    allocate(a_d(M,K), b_d(K,nsec), c_d(M,nsec))
    a_d = a
    b_d = b(:,offs(i)+1:offs(i)+nsec)
    c_d = c(:,offs(i)+1:offs(i)+nsec)
!$omp end parallel

istat = cudaEventCreate(start)
istat = cudaEventCreate(stop)

time = 0.0
istat = cudaEventRecord(start, 0)
! Run the traditional blas kernel
!$omp PARALLEL private(j,istat)
    do j = 1, NTIMES
      call sgemm('n','n', M, N/nthr, K, alpha, a_d, M, b_d, K, beta, c_d, M)
    end do
    istat = cudaDeviceSynchronize()
!$omp end parallel

istat = cudaEventRecord(stop, 0)
istat = cudaEventSynchronize(stop)
istat = cudaEventElapsedTime(time, start, stop)
time = time / (NTIMES*1.0e3)
!$omp PARALLEL private(i)
    i = omp_get_thread_num() + 1
    c(:,offs(i)+1:offs(i)+nsec) = c_d
!$omp end parallel
nerrors = 0
do j = 1, N
    do i = 1, M
        if (c(i,j) .ne. NTIMES) nerrors = nerrors + 1
    end do
end do
print *,"Number of Errors:",nerrors
gflops = 2.0 * N * M * K/time/1e9
write (*,901) m,k,k,N,time*1.0e3,gflops
901 format(i0,'x',i0,' * ',i0,'x',i0,':\t',f8.3,' ms\t',f12.3,' GFlops/s')
end

Using cuFFT from OpenACC Host Code

This example demonstrates the use of the cufft module, the cufftHandle type, and several cuFFT library calls.

Simple cuFFT Test

program cufft2dTest
  use cufft
  use openacc
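  ! Note: the arrays below are used inside host_data and kernels regions
  ! with no enclosing data directives, so this example assumes a
  ! managed-memory build (for example, nvfortran -acc -gpu=managed).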
  integer, parameter :: m=768, n=512
  complex, allocatable  :: a(:,:),b(:,:),c(:,:)
  real, allocatable     :: r(:,:),q(:,:)
  integer :: iplan1, iplan2, iplan3, ierr

  allocate(a(m,n),b(m,n),c(m,n))
  allocate(r(m,n),q(m,n))

  a = 1; r = 1
  xmx = -99.0

  ierr = cufftPlan2D(iplan1,n,m,CUFFT_C2C)
  ierr = ierr + cufftSetStream(iplan1,acc_get_cuda_stream(acc_async_sync))
  !$acc host_data use_device(a,b,c)
  ierr = ierr + cufftExecC2C(iplan1,a,b,CUFFT_FORWARD)
  ierr = ierr + cufftExecC2C(iplan1,b,c,CUFFT_INVERSE)
  !$acc end host_data

  ! scale c
  !$acc kernels
  c = c / (m*n)
  !$acc end kernels

  ! Check forward answer
  write(*,*) 'Max error C2C FWD: ', cmplx(maxval(real(b)) - sum(real(b)), &
                                          maxval(imag(b)))
  ! Check inverse answer
  write(*,*) 'Max error C2C INV: ', maxval(abs(a-c))

  ! Real transform
  ierr = ierr + cufftPlan2D(iplan2,n,m,CUFFT_R2C)
  ierr = ierr + cufftPlan2D(iplan3,n,m,CUFFT_C2R)
  ierr = ierr + cufftSetStream(iplan2,acc_get_cuda_stream(acc_async_sync))
  ierr = ierr + cufftSetStream(iplan3,acc_get_cuda_stream(acc_async_sync))

  !$acc host_data use_device(r,b,q)
  ierr = ierr + cufftExecR2C(iplan2,r,b)
  ierr = ierr + cufftExecC2R(iplan3,b,q)
  !$acc end host_data

  !$acc kernels
  xmx = maxval(abs(r-q/(m*n)))
  !$acc end kernels

  ! Check R2C + C2R answer
  write(*,*) 'Max error R2C/C2R: ', xmx

  ierr = ierr + cufftDestroy(iplan1)
  ierr = ierr + cufftDestroy(iplan2)
  ierr = ierr + cufftDestroy(iplan3)

  if (ierr.eq.0) then
    print *,"test PASSED"
  else
    print *,"test FAILED"
  endif

end program cufft2dTest

Using cuFFT from CUDA Fortran Host Code

This example demonstrates the use of the cufft module, the cufftHandle type, and several cuFFT library calls.

Simple cuFFT Test

program cufft2dTest
  use cudafor
  use cufft
  implicit none
  integer, parameter :: m=768, n=512
  complex, managed :: a(m,n),b(m,n)
  real, managed :: ar(m,n),br(m,n)
  real    x
  integer plan, ierr
  logical passing

  a = 1; ar = 1

  ierr = cufftPlan2D(plan,n,m,CUFFT_C2C)
  ierr = ierr + cufftExecC2C(plan,a,b,CUFFT_FORWARD)
  ierr = ierr + cufftExecC2C(plan,b,b,CUFFT_INVERSE)
  ierr = ierr + cudaDeviceSynchronize()
  x = maxval(abs(a-b/(m*n)))
  write(*,*) 'Max error C2C: ', x
  passing = x .le. 1.0e-5
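
  ! Note: the plan variable is reused for the R2C and C2R plans below
  ! without destroying the C2C plan first, so those handles are leaked;
  ! a production code would call cufftDestroy for each plan it creates.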

  ierr = ierr + cufftPlan2D(plan,n,m,CUFFT_R2C)
  ierr = ierr + cufftExecR2C(plan,ar,b)
  ierr = ierr + cufftPlan2D(plan,n,m,CUFFT_C2R)
  ierr = ierr + cufftExecC2R(plan,b,br)
  ierr = ierr + cudaDeviceSynchronize()
  x = maxval(abs(ar-br/(m*n)))
  write(*,*) 'Max error R2C/C2R: ', x
  passing = passing .and. (x .le. 1.0e-5)

  ierr = ierr + cufftDestroy(plan)
  print *,ierr
  passing = passing .and. (ierr .eq. 0)
  if (passing) then
    print *,"Test PASSED"
  else
    print *,"Test FAILED"
  endif
end program cufft2dTest

Using cufftXt from CUDA Fortran Host Code

This example demonstrates the use of the cufftXt module, the cudaLibXtDesc type, many of the cufftXt API calls, the padding required for real/complex FFTs, and the distribution of active modes across multiple GPUs.

Simple Poisson 3D CufftXt Test

program cufftXt3DTest
  use cudafor
  use cufftXt   ! cufftXt module includes cufft
  implicit none

  integer, parameter :: nGPUs = 2
  integer, parameter :: M = 1, N = 1, P = 1
  integer, parameter :: nx = 32, ny=32, nz=32
  real, parameter :: twopi = 8.0*atan(1.0)
  real, parameter :: Lx = 1.0, Ly = 1.0, Lz = 1.0
  logical, parameter :: printModes = .true.

  ! GPU
  integer :: nodeGPUs
  integer :: whichGPUs(nGPUs)

  ! grid
  real :: x(nx), y(ny), z(nz)
  real :: hx, hy, hz

  ! wavenumbers
  real :: kx(nx/2+1), ky(ny), kz(nz)
  type realDeviceArray
     real, device, allocatable :: v(:)
  end type realDeviceArray
  type(realDeviceArray) :: kx_da(nGPUs), ky_da(nGPUs), kz_da(nGPUs)
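
  ! In-place R2C/C2R transforms need padding: the x dimension holds
  ! nx+2 reals so each x-pencil can store nx/2+1 complex values.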

  real :: phi(nx+2,ny, nz), u(nx+2,ny,nz), ua(nx,ny,nz)
  integer :: status, i, j, k

  ! check M, N

  if (M==0 .or. N==0 .or. P==0) then
     write(*,*) 'M, N, or P is 0, so the solution is u = 0'
     stop
  else
     write(*,"(' Running with modes M, N, P = ', i0, ', ', i0, ', ', i0)") M, N, P
     write(*,"('  on a grid of ', i0, 'x', i0, 'x', i0)") nx, ny, nz
  end if

  ! choose GPUs

  ! check for multiple GPUs
  status = cudaGetDeviceCount(nodeGPUs)
  if (status /= cudaSuccess) write(*,*) 'Error in cudaGetDeviceCount()'
  if (nodeGPUs .lt. nGPUs) then
     write(*,*) 'Number of GPUs on node:', nodeGPUs
     write(*,*) 'Insufficient number of GPUs for this code, exiting ...'
     stop
  end if

  ! check for GPUs of same compute capability
  block
    type(cudaDeviceProp) :: prop
    integer :: cc(0:nodeGPUs-1,2)
    logical :: foundIdenticalGPUs = .false.
    integer :: iWG, nIdentGPUs

    write(*,*) 'Available devices:'
    do i = 0, nodeGPUs-1
       status = cudaGetDeviceProperties(prop, i)
       if (status /= cudaSuccess) write(*,*) 'Error in cudaGetDeviceProperties()'
       cc(i,1) = prop%major
       cc(i,2) = prop%minor
       write(*,"('  ', i0, ' CC: ', i0, '.', i0)") i, cc(i,1), cc(i,2)
    enddo

    do i = 0, nodeGPUs-1
       nIdentGPUs = 1
       do j = i+1, nodeGPUs-1
          if (all(cc(i,:) == cc(j,:))) nIdentGPUs = nIdentGPUs+1
       enddo

       if (nIdentGPUs .ge. nGPUs) then
          foundIdenticalGPUs = .true.
          iWG = 1
          whichGPUs(iWG) = i
          do j = i+1, nodeGPUs-1
             if (all(cc(i,:) == cc(j,:))) then
                iWG = iWG+1
                whichGPUs(iWG) = j
             end if
             if (iWG == nGPUs) exit
          end do
          exit
       end if
    end do

    if (foundIdenticalGPUS) then
       write(*,*) 'Running on GPUs:'
       write(*,*) whichGPUs
    else
       write(*,"(' No ', i0, ' identical GPUs found, exiting ...')"), nGPUs
       stop
    end if

  end block

  ! Physical grid

  hx = Lx/nx
  do i = 1, nx
     x(i) = hx*i
  enddo

  hy = Ly/ny
  do j = 1, ny
     y(j) = hy*j
  enddo

  hz = Lz/nz
  do k = 1, nz
     z(k) = hz*k
  enddo

  ! Wavenumbers

  do i = 1, nx/2+1
     kx(i) = (i-1)*(twoPi/Lx)
  enddo

  do j = 1, ny/2
     ky(j) = (j-1)*(twoPi/Ly)
  enddo
  do j = ny/2+1, ny
     ky(j) = (j-1-ny)*(twoPi/Ly)
  enddo

  do k = 1, nz/2
     kz(k) = (k-1)*(twoPi/Lz)
  enddo
  do k = nz/2+1, nz
     kz(k) = (k-1-nz)*(twoPi/Lz)
  enddo

  ! copy wavenumber arrays to each device
  do i = 1, nGPUs
     status = cudaSetDevice(whichGPUs(i))
     allocate(kx_da(i)%v, source=kx)
     allocate(ky_da(i)%v, source=ky)
     allocate(kz_da(i)%v, source=kz)
  enddo

  ! Initialize phi and get analytical solution

  do k = 1, nz
     do j = 1, ny
        do i = 1, nx
           phi(i,j,k) = sin(twoPi*M*x(i))*sin(twoPi*N*y(j))*sin(twoPi*P*z(k))
           ua(i,j,k) = -phi(i,j,k)/((twoPi*M)**2 + (twoPi*N)**2 + (twoPi*P)**2)
        enddo
     enddo
  end do

  ! cufft block

  block
    integer :: planR2C, planC2R
    integer(c_size_t) :: worksize(nGPUs)
    type(cudaLibXtDesc), pointer :: phi_desc

    status = cufftCreate(planR2C)
    if (status /= CUFFT_SUCCESS) write(*,*) 'Error in cufftCreate'

    status = cufftCreate(planC2R)
    if (status /= CUFFT_SUCCESS) write(*,*) 'Error in cufftCreate'

    status = cufftXtSetGPUs(planR2C, nGPUs, whichGPUs)
    if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtSetGPUs failed'

    status = cufftXtSetGPUs(planC2R, nGPUs, whichGPUs)
    if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtSetGPUs failed'

    status = cufftMakePlan3d(planR2C, nz, ny, nx, CUFFT_R2C, worksize)
    if (status /= CUFFT_SUCCESS) write(*,*) 'Error in cufftMakePlan3d'

    status = cufftMakePlan3d(planC2R, nz, ny, nx, CUFFT_C2R, worksize)
    if (status /= CUFFT_SUCCESS) write(*,*) 'Error in cufftMakePlan3d'

    ! allocate memory on separate devices
    status = cufftXtMalloc(planR2C, phi_desc, CUFFT_XT_FORMAT_INPLACE)
    if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtMalloc failed'

    ! H2D transfer
    write(*,*) 'cufftXtMemcpy H2D ...'
    status = cufftXtMemcpy(planR2C, phi_desc, phi, CUFFT_COPY_HOST_TO_DEVICE)
    if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtMemcpy H2D failed'

    ! forward FFT
    write(*,*) 'Forward transform ...'
    status = cufftXtExecDescriptorR2C(planR2C, phi_desc, phi_desc)
    if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtExecDescriptorR2C failed:', status

    if (printModes) then
       block
         real :: threshold = 1.e-3
         type(cudaXtDesc), pointer :: phiXtDesc
         complex, device, pointer :: phi_d(:,:,:)
         integer :: dev, g, jl, nyl
         complex :: phi_h(nx/2+1, ny, nz)

         call c_f_pointer(phi_desc%descriptor, phiXtDesc)

         do g = 1, nGPUs
            write(*,"(' Active modes on g = ', i0)") g
            dev = phiXtDesc%GPUs(g)
            status = cudaSetDevice(dev)
            if (status /= cudaSuccess) write(*,*) 'cudaSetDevice failed'

            ! XtMalloc done with FORMAT_INPLACE, so data prior to transform
            ! are in natural order (split in least frequently changing index,
            ! in this case z), and after transform are in shuffled order (split in
            ! second least frequently changing index, in this case y)

            nyl = ny/nGPUs
            if (g .le. mod(ny, nGPUs)) nyl = nyl+1

            call c_f_pointer(phiXtDesc%data(g), phi_d, [nx/2+1, nyl, nz])

            !$cuf kernel do (3)
            do k = 1, nz
               do jl = 1, nyl
                  do i = 1, nx/2+1
                     if (abs(phi_d(i,jl,k)) .gt. threshold) print*, i, jl, k, phi_d(i,jl,k)
                  enddo
               enddo
            enddo
            status = cudaDeviceSynchronize()
            flush(6)
         end do

         status = cufftXtMemcpy(planR2C, phi_h, phi_desc, CUFFT_COPY_DEVICE_TO_HOST)
         if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtMemcpy D2H failed'
         write(*,*) 'Active modes on host (after D2H):'
         do k = 1, nz
            do j = 1, ny
               do i = 1, nx/2+1
                  if (abs(phi_h(i,j,k)) .gt. threshold) print*, i, j, k, phi_h(i,j,k)
               end do
            end do
         enddo
       end block
    end if

    ! solve Poisson equation
    write(*,*) 'Solve Poisson Eq ...'
    block
      type(cudaXtDesc), pointer :: phiXtDesc
      complex, device, pointer :: phi_d(:,:,:)
      integer :: dev, g, jl, nyl, yOffset
      real :: k2

      call c_f_pointer(phi_desc%descriptor, phiXtDesc)

      yOffset = 0
      do g = 1, nGPUs
         dev = phiXtDesc%GPUs(g)
         status = cudaSetDevice(dev)
         if (status /= cudaSuccess) write(*,*) 'cudaSetDevice failed'

         ! XtMalloc done with FORMAT_INPLACE, so data prior to transform
         ! are in natural order (split in least frequently changing index,
         ! in this case z), and after transform are in shuffled order (split in
         ! second least frequently changing index, in this case y)

         nyl = ny/nGPUs
         if (g .le. mod(ny, nGPUs)) nyl = nyl+1

         call c_f_pointer(phiXtDesc%data(g), phi_d, [nx/2+1, nyl, nz])

         associate(kx_d => kx_da(g)%v, ky_d => ky_da(g)%v, kz_d => kz_da(g)%v)
           !$cuf kernel do (3)
           do k = 1, nz
              do jl = 1, nyl
                 j = jl + yOffset
                 do i = 1, nx/2+1
                    k2 = kx_d(i)**2 + ky_d(j)**2 + kz_d(k)**2
                    phi_d(i,jl,k) = -phi_d(i,jl,k)/k2/(nx*ny*nz)
                 enddo
              enddo
           enddo
         end associate

         ! specify mean (corrects division by zero wavenumber above)
         if (g == 1) phi_d(1,1,1) = 0.0

         yOffset = yOffset + nyl
      end do

      do g = 1, nGPUs
         dev = phiXtDesc%GPUs(g)
         status = cudaSetDevice(dev)
         if (status /= cudaSuccess) write(*,*) 'cudaSetDevice failed'
         status = cudaDeviceSynchronize()
         if (status /= cudaSuccess) write(*,*) 'CUF kernel sync error'
      end do

    end block ! poisson

    ! inverse FFT
    write(*,*) 'Inverse transform ...'
    status = cufftXtExecDescriptorC2R(planC2R, phi_desc, phi_desc)
    if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtExecDescriptorC2R failed'

    ! D2H transfer
    write(*,*) 'cufftXtMemcpy D2H ...'
    status = cufftXtMemcpy(planC2R, u, phi_desc, CUFFT_COPY_DEVICE_TO_HOST)
    if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtMemcpy D2H failed'

    ! cufft block cleanup
    status = cufftXtFree(phi_desc)
    if (status /= CUFFT_SUCCESS) write(*,*) 'cufftXtFree failed'

    status = cufftDestroy(planR2C)
    if (status /= CUFFT_SUCCESS) write(*,*) 'cufftDestroy failed'

    status = cufftDestroy(planC2R)
    if (status /= CUFFT_SUCCESS) write(*,*) 'cufftDestroy failed'

  end block

  write(*,*) 'Max error: ', maxval(abs(u(1:nx,1:ny,1:nz)-ua(1:nx,1:ny,1:nz)))

  ! cleanup

  do i = 1, nGPUs
     status = cudaSetDevice(whichGPUs(i))
     deallocate(kx_da(i)%v, ky_da(i)%v, kz_da(i)%v)
  enddo

  write(*,*) '... finished'

end program cufftXt3DTest

Using cuFFTMp from either OpenACC or CUDA Fortran

This example demonstrates the use of the cuFFTMp API from the cuFFTXt module, the cudaLibXtDesc and cudaXtDesc types, and attaching the MPI COMM to the cuFFT plan.

Real-to-Complex and Complex-to-Real cuFFTMp Test

!
! This sample illustrates a basic use of cuFFTMp using the built-in,
! optimized data distributions.
!
!  It assumes the CPU data is initially distributed according to
!  CUFFT_XT_FORMAT_INPLACE, a.k.a. X-Slabs.
!  Given a global array of size X * Y * Z, every MPI rank owns approximately
!  (X / ngpus) * Y * Z entries.
!  More precisely,
!  - The first (X % ngpus) MPI ranks each own (X / ngpus + 1) planes of size Y*Z,
!  - The remaining MPI ranks each own (X / ngpus) planes of size Y*Z
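!  (Illustrative arithmetic, not in the original sample: with X = 256 on
!  3 ranks, mod(256,3) = 1, so rank 0 owns 86 X-planes and ranks 1 and 2
!  own 85 each.)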
!
!  The CPU data is then copied to the GPU and a forward transform is applied.
!
!  After that transform, GPU data is distributed according to
!  CUFFT_XT_FORMAT_INPLACE_SHUFFLED, a.k.a. Y-Slabs.
!  Given a global array of size X * Y * Z, every MPI rank owns approximately
!  X * (Y / ngpus) * Z entries.
!  More precisely,
!  - The first (Y % ngpus) MPI ranks each own (Y / ngpus + 1) planes of size X*Z,
!  - The remaining MPI ranks each own (Y / ngpus) planes of size X*Z
!
!  A scaling kernel is applied to the distributed GPU data (distributed
!  according to CUFFT_XT_FORMAT_INPLACE_SHUFFLED).
!  This kernel prints some elements to illustrate the
!  CUFFT_XT_FORMAT_INPLACE_SHUFFLED data distribution and normalizes
!  entries by (nx * ny * nz).
!
!  Finally, a backward transform is applied.
!  After this, data is again distributed according to CUFFT_XT_FORMAT_INPLACE,
!  same as the input data.
!
!  Data is finally copied back to CPU and compared to the input data. They
!  should be almost identical.
!
!  This program can be used by either OpenACC or CUDA Fortran:
!    mpif90 -acc cufftmp_r2c.F90 -cudalib=cufftmp
!  Or:
!    mpif90 -cuda cufftmp_r2c.F90 -cudalib=cufftmp
!
module cufft_required
    integer :: planr2c, planc2r
    integer :: local_rshape(3), local_rshape_permuted(3)
    integer :: local_permuted_cshape(3)
end module cufft_required

program cufftmp_r2c
    use iso_c_binding
    use cufftXt
    use cufft
!@cuf use cudafor
!@acc use openacc
    use mpi
    use cufft_required
    implicit none

    integer :: size, rank, ndevices, ierr
    integer :: nx, ny, nz ! nx slowest
    integer :: i, j, k
    integer :: my_nx, my_ny, my_nz, ranks_cutoff, whichgpu(1)
    real, dimension(:, :, :), allocatable :: u, ref
    complex, dimension(:,:,:), allocatable :: u_permuted
    real :: max_norm, max_diff

    ! cufft stuff
    integer(c_size_t) :: worksize(1)
    type(cudaLibXtDesc), pointer :: u_desc
    type(cudaXtDesc), pointer    :: u_descptr
#ifdef _OPENACC
    complex, pointer             :: u_dptr(:,:,:)
    type(c_ptr)                  :: tmpcptr
#else
    complex, pointer, device     :: u_dptr(:,:,:)
#endif

    call mpi_init(ierr)
    call mpi_comm_size(MPI_COMM_WORLD,size,ierr)
    call mpi_comm_rank(MPI_COMM_WORLD,rank,ierr)

#ifdef _OPENACC
    ndevices = acc_get_num_devices(acc_device_nvidia)
    call acc_set_device_num(mod(rank, ndevices), acc_device_nvidia)
#else
    call checkCuda(cudaGetDeviceCount(ndevices))
    call checkCuda(cudaSetDevice(mod(rank, ndevices)))
#endif
    whichgpu(1) = mod(rank, ndevices)

    print*,"Hello from rank ",rank," gpu id",mod(rank,ndevices),"size",size

    nx = 256
    ny = nx
    nz = nx

    ! We start with X-Slabs
    ! Ranks 0 ... (nx % size - 1) have 1 more element in the X dimension
    ! and every rank owns all elements in the Y and Z dimensions.
    ranks_cutoff = mod(nx, size)
    my_nx = nx / size
    if (rank < ranks_cutoff) my_nx = my_nx + 1
    my_ny =  ny;
    my_nz =  nz;
    ! Note nz is first dimension, nx third
    local_rshape = [2*(nz/2+1), ny, my_nx]
    local_permuted_cshape = [nz/2+1, ny/size, nx]
    local_rshape_permuted = [2*(nz/2+1), ny/size, nx]
    if (mod(ny, size) > 0) then
        print*," ny has to be divisible by the number of MPI ranks"
        call mpi_finalize(ierr)
        stop
    end if
    if (rank == 0) then
        write(*,*) "local_rshape          :", local_rshape(1:3)
        write(*,*) "local_permuted_cshape :", local_permuted_cshape(1:3)
    end if

    ! Generate local, distributed data
    allocate(u(local_rshape(1), local_rshape(2), local_rshape(3)))
    allocate(u_permuted(local_permuted_cshape(1), local_permuted_cshape(2), &
        local_permuted_cshape(3)))
    allocate(ref(local_rshape(1), local_rshape(2), local_rshape(3)))
    print*,'shape of u is ', shape(u)
    print*,'shape of u_permuted is ', shape(u_permuted)
    call generate_random(nz, local_rshape(1), local_rshape(2), &
        local_rshape(3), u)
    ref = u
    u_permuted = (0.0,0.0)

    call checkNorm(nz, local_rshape(1), local_rshape(2), local_rshape(3), &
        u, max_norm)
    print*, "initial data on ", rank, " max_norm is ", max_norm

    call checkCufft(cufftCreate(planr2c))
    call checkCufft(cufftCreate(planc2r))
    call checkCufft(cufftMpAttachComm(planr2c, CUFFT_COMM_MPI, &
        MPI_COMM_WORLD), 'cufftMpAttachComm error')
    call checkCufft(cufftMpAttachComm(planc2r, CUFFT_COMM_MPI, &
        MPI_COMM_WORLD), 'cufftMpAttachComm error')

    ! Note nx, ny, nz order
    call checkCufft(cufftMakePlan3d(planr2c, nx, ny, nz, CUFFT_R2C, &
        worksize), 'cufftMakePlan3d r2c error')
    call checkCufft(cufftMakePlan3d(planc2r, nx, ny, nz, CUFFT_C2R, &
        worksize), 'cufftMakePlan3d c2r error')

    call checkCufft(cufftXtMalloc(planr2c, u_desc, &
        CUFFT_XT_FORMAT_INPLACE), 'cufftXtMalloc error')

    ! These are equivalent, and work as well
    ! istat = cufftXtMemcpy(planr2c, u_desc, u, CUFFT_COPY_HOST_TO_DEVICE)
    ! call cufft_memcpyH2D(u_desc, u, CUFFT_XT_FORMAT_INPLACE, .false.)
    call cufft_memcpyH2D(u_desc, u, CUFFT_XT_FORMAT_INPLACE, .true.)

    ! now reset u to make sure the check later is valid
    u = 0.0

    !xxxxxxxxxxxxxxxxxxxxxxxxxx Forward
    call checkCufft(cufftXtExecDescriptor(planr2c, u_desc, u_desc, &
        CUFFT_FORWARD),'forward fft failed')

    ! in case we want to check the results after Forward
    !call checkCufft(cufftXtMemcpy(planr2c, u_permuted, u_desc, CUFFT_COPY_DEVICE_TO_HOST), 'permuted D2H error')
    !call checkNormComplex(local_permuted_cshape(1), local_permuted_cshape(2), local_permuted_cshape(3), u_permuted, max_norm)
    !write(*,'(A18, I1, A14, F25.8)') "after R2C ", rank, " max_norm is ", max_norm

    ! Data is now distributed as Y-Slab. We need to scale the output
    call c_f_pointer(u_desc%descriptor, u_descptr)

#ifdef _OPENACC
    tmpcptr = transfer(u_descptr%data(1), tmpcptr)
    call c_f_pointer(tmpcptr, u_dptr, local_permuted_cshape(1:3))

    call scalingData(local_permuted_cshape(1), local_permuted_cshape(2), &
        local_permuted_cshape(3), u_dptr, real(nx*ny*nz))
#else
    call c_f_pointer(u_descptr%data(1), u_dptr, local_permuted_cshape(1:3))
    !$cuf kernel do (3)
    do k =1, local_permuted_cshape(3)
        do j = 1, local_permuted_cshape(2)
            do i = 1, local_permuted_cshape(1)
                u_dptr(i,j,k) = u_dptr(i,j,k) / real(nx*ny*nz)
            end do
        end do
    end do
    call checkCuda(cudaDeviceSynchronize())
#endif

    ! in case we want to check again after scaling
    call checkCufft(cufftXtMemcpy(planr2c, u_permuted, u_desc, &
        CUFFT_COPY_DEVICE_TO_HOST), 'permuted D2H error')
    call checkNormComplex(local_permuted_cshape(1), &
        local_permuted_cshape(2), local_permuted_cshape(3), &
        u_permuted, max_norm)
    write(*,'(A18, I1, A14, F25.8)') "after scaling ", rank, &
        " max_norm is ", max_norm

    !xxxxxxxxxxxxxxxxxxxxxxxxxxxx inverse
    call checkCufft(cufftXtExecDescriptor(planc2r, u_desc, u_desc, &
        CUFFT_INVERSE), 'inverse fft failed')

    ! These are equivalent, and work as well
    ! istat = cufftXtMemcpy(planc2r, u, u_desc, CUFFT_COPY_DEVICE_TO_HOST)
    ! call cufft_memcpyD2H(u, u_desc, CUFFT_XT_FORMAT_INPLACE, .false.)
    call cufft_memcpyD2H(u, u_desc, CUFFT_XT_FORMAT_INPLACE, .true.)

    call checkCufft(cufftXtFree(u_desc))
    call checkCufft(cufftDestroy(planr2c))
    call checkCufft(cufftDestroy(planc2r))

    call checkNormDiff(nz, local_rshape(1), local_rshape(2), &
        local_rshape(3), u, ref, max_norm, max_diff)
    write(*,'(A18,I1,A14,F25.8,A14,F15.8)') "after C2R ", rank, &
        " max_norm is ", max_norm, " max_diff is ", max_diff
    write(*,'(A25,I1,A14,F25.8)') "Relative Linf on rank ", rank, &
        " is ", max_diff/max_norm
    deallocate(u)
    deallocate(ref)
    deallocate(u_permuted)

    call mpi_finalize(ierr)

    if(max_diff / max_norm > 1e-5) then
        print*, ">>>> FAILED on rank ", rank
        stop 1
    else
        print*, ">>>> PASSED on rank ", rank
    end if

contains
#ifdef _CUDA
    subroutine checkCuda(istat, message)
        implicit none
        integer, intent(in)                   :: istat
        character(len=*),intent(in), optional :: message
        if (istat /= cudaSuccess) then
            write(*,"('Error code: ',I0, ': ')") istat
            write(*,*) cudaGetErrorString(istat)
            if(present(message)) write(*,*) message
            call mpi_finalize(ierr)
        endif
    end subroutine checkCuda
#endif

    subroutine checkCufft(istat, message)
        implicit none
        integer, intent(in)                   :: istat
        character(len=*),intent(in), optional :: message
        if (istat /= CUFFT_SUCCESS) then
            write(*,"('Error code: ',I0, ': ')") istat
!@cuf       write(*,*) cudaGetErrorString(istat)
            if(present(message)) write(*,*) message
            call mpi_finalize(ierr)
        endif
    end subroutine checkCufft

    subroutine generate_random(nz1, nz, ny, nx, data)
        implicit none
        integer, intent(in) :: nx, ny, nz, nz1
        real, dimension(nz, ny, nx), intent(out) :: data
        real :: rand(1)
        integer :: i,j,k
        !call random_seed(put=(/seed, seed+1/))
        do k =1, nx
            do j = 1, ny
                do i = 1, nz1
                    call random_number(rand)
                    data(i,j,k) = rand(1)
                end do
            end do
        end do

    end subroutine generate_random

    subroutine checkNorm(nz1, nz, ny, nx, data, max_norm)
        implicit none
        integer, intent(in)  :: nx, ny, nz, nz1
        real, dimension(nz, ny, nx), intent(in) :: data
        real :: max_norm
        integer :: i, j, k
        max_norm = 0
        do k =1, nx
            do j = 1, ny
                do i = 1, nz1
                    max_norm = max(max_norm, abs(data(i,j,k)))
                end do
            end do
        end do
    end subroutine checkNorm

    subroutine checkNormComplex(nz, ny, nx, data, max_norm)
        implicit none
        integer, intent(in)  :: nx, ny, nz
        complex, dimension(nz, ny, nx), intent(in) :: data
        real :: max_norm, max_diff
        integer :: i,j,k
        max_norm = 0
        do k =1, nx
            do j = 1, ny
                do i = 1, nz
                    max_norm = max(max_norm, abs(data(i,j,k)%re))
                    max_norm = max(max_norm, abs(data(i,j,k)%im))
                end do
            end do
        end do
    end subroutine checkNormComplex

    subroutine checkNormDiff(nz1, nz, ny, nx, data, ref, max_norm, max_diff)
        implicit none
        integer, intent(in)  :: nx, ny, nz, nz1
        real, dimension(nz, ny, nx), intent(in) :: data, ref
        real :: max_norm, max_diff
        integer :: i, j, k
        max_norm = 0
        max_diff = 0
        do k =1, nx
          do j = 1, ny
            do i = 1, nz1
              max_norm = max(max_norm, abs(data(i,j,k)))
              max_diff = max(max_diff, abs(ref(i,j,k)-data(i,j,k)))
              if (abs(ref(i,j,k)-data(i,j,k)) > 0.0001) then
                write(*,'(A9,I3,I3,I3,A2,F18.8,A7,F18.8,A9,I2)') "diff ref[", &
                  i,j,k,"]",ref(i,j,k),"data ",data(i,j,k)," at rank",rank
              end if
            end do
          end do
        end do
    end subroutine checkNormDiff

#ifdef _OPENACC
    subroutine scalingData(nz, ny, nx, data, factor)
        implicit none
        integer, intent(in)  :: nx, ny, nz
        complex, dimension(nz, ny, nx) :: data
        !$acc declare deviceptr(data)
        real, intent(in)     :: factor
        integer :: i, j, k

    !$acc parallel loop collapse(3)
        do k =1, nx
            do j = 1, ny
                do i = 1, nz
                    data(i, j, k) = data(i, j, k) / factor
                end do
            end do
        end do

    end subroutine scalingData
#endif

subroutine cufft_memcpyH2D(ulibxt, u_h, data_format, ismemcpy)
    implicit none
    type(cudaLibXtDesc), pointer, intent(in)  :: ulibxt
    real, dimension(*), intent(in)          :: u_h
    integer, intent(in)                         :: data_format
    logical, intent(in)                         :: ismemcpy
    type(cudaXtDesc), pointer  :: uxt
!@cuf real, dimension(:,:,:), device, pointer :: u_d

    if (.not. ismemcpy) then
        call checkCufft(cufftXtMemcpy(planc2r, ulibxt, u_h, &
                 CUFFT_COPY_HOST_TO_DEVICE), "cufft_memcpyHToD Error")
    else
        call c_f_pointer(ulibxt%descriptor, uxt)
        if(data_format == CUFFT_XT_FORMAT_INPLACE_SHUFFLED) then
#ifdef _OPENACC
            call acc_memcpy_to_device(uxt%data(1), u_h, &
                     product(int(local_rshape_permuted,kind=8))*4_8) ! bytes
#else
            call c_f_pointer(uxt%data(1), u_d, local_rshape_permuted)
            call checkCuda(cudaMemcpy(u_d, u_h, &
                     product(int(local_rshape_permuted,kind=8))), &
                     "cudamemcpy H2D Error")
#endif
        else if (data_format == CUFFT_XT_FORMAT_INPLACE) then
#ifdef _OPENACC
            call acc_memcpy_to_device(uxt%data(1), u_h, &
                     product(int(local_rshape,kind=8))*4_8) ! bytes
#else
            call c_f_pointer(uxt%data(1), u_d, local_rshape)
            call checkCuda(cudaMemcpy(u_d, u_h, &
                     product(int(local_rshape,kind=8))), &
                     "cudamemcpy H2D Error")
#endif
        endif
    endif
end subroutine cufft_memcpyH2D


subroutine cufft_memcpyD2H(u_h, ulibxt, data_format,ismemcpy)
    implicit none
    type(cudaLibXtDesc), pointer, intent(in) :: ulibxt
    real, dimension(*), intent(out)      :: u_h
    integer, intent(in)                      :: data_format
    logical, intent(in)                      :: ismemcpy
    type(cudaXtDesc), pointer  :: uxt
!@cuf real, dimension(:,:,:), device, pointer :: u_d

    if (.not. ismemcpy) then
        call checkCufft(cufftXtMemcpy(planr2c, u_h, ulibxt, &
                 CUFFT_COPY_DEVICE_TO_HOST), "cufft_memcpyDToH Error")
    else
        call c_f_pointer(ulibxt%descriptor, uxt)
        if(data_format == CUFFT_XT_FORMAT_INPLACE_SHUFFLED) then
#ifdef _OPENACC
            call acc_memcpy_from_device(u_h, uxt%data(1), &
                     product(int(local_rshape_permuted,kind=8))*4_8) ! bytes
#else
            call c_f_pointer(uxt%data(1), u_d, local_rshape_permuted)
            call checkCuda(cudaMemcpy(u_h, u_d, &
                     product(int(local_rshape_permuted,kind=8))), &
                     "cudamemcpy D2H Error")
#endif
        else if (data_format == CUFFT_XT_FORMAT_INPLACE) then
#ifdef _OPENACC
            call acc_memcpy_from_device(u_h, uxt%data(1), &
                     product(int(local_rshape,kind=8))*4_8) ! bytes
#else
            call c_f_pointer(uxt%data(1), u_d, local_rshape)
            call checkCuda(cudaMemcpy(u_h, u_d, &
                     product(int(local_rshape,kind=8))), &
                     "cudamemcpy D2H Error")
#endif
        endif
    endif
end subroutine cufft_memcpyD2H


end program cufftmp_r2c

Using cuRAND from OpenACC Host Code

This example demonstrates the use of the curand module, the curandGenerator type, and several forms of cuRAND calls.

Simple cuRAND Tests

program testcurand
! compile with the flags -ta=tesla -cuda -cudalib=curand
call cur1(1000, .true.); call cur1(1000, .false.)
call cur2(1000, .true.); call cur2(1000, .false.)
call cur3(1000, .true.); call cur3(1000, .false.)
end
!
subroutine cur1(n, onhost)
use curand
integer :: a(n)
type(curandGenerator) :: g
integer(8) nbits
logical onhost, passing
a = 0
passing = .true.
if (onhost) then
  istat = curandCreateGeneratorHost(g,CURAND_RNG_PSEUDO_XORWOW)
  istat = curandGenerate(g, a, n)
  istat = curandDestroyGenerator(g)
else
  !$acc data copy(a)
  istat = curandCreateGenerator(g,CURAND_RNG_PSEUDO_XORWOW)
  !$acc host_data use_device(a)
  istat = curandGenerate(g, a, n)
  !$acc end host_data
  istat = curandDestroyGenerator(g)
  !$acc end data
endif
nbits = 0
do i = 1, n
  if (i.lt.10) print *,i,a(i)
  nbits = nbits + popcnt(a(i))
end do
print *,"Should be roughly half the bits set"
nbits = nbits / n
if ((nbits .lt. 12) .or. (nbits .gt. 20)) then
  passing = .false.
else
  print *,"nbits is ",nbits," which passes"
endif
if (passing) then
  print *,"Test PASSED"
else
  print *,"Test FAILED"
endif
end
!
subroutine cur2(n, onhost)
use curand
real :: a(n)
type(curandGenerator) :: g
logical onhost, passing
a = 0.0
passing = .true.
if (onhost) then
  istat = curandCreateGeneratorHost(g,CURAND_RNG_PSEUDO_XORWOW)
  istat = curandGenerate(g, a, n)
  istat = curandDestroyGenerator(g)
else
  !$acc data copy(a)
  istat = curandCreateGenerator(g,CURAND_RNG_PSEUDO_XORWOW)
  !$acc host_data use_device(a)
  istat = curandGenerate(g, a, n)
  !$acc end host_data
  istat = curandDestroyGenerator(g)
  !$acc end data
endif
print *,"Should be uniform around 0.5"
do i = 1, n
  if (i.lt.10) print *,i,a(i)
  if ((a(i).lt.0.0) .or. (a(i).gt.1.0)) passing = .false.
end do
rmean = sum(a)/n
if ((rmean .lt. 0.4) .or. (rmean .gt. 0.6)) then
  passing = .false.
else
  print *,"Mean is ",rmean," which passes"
endif
if (passing) then
  print *,"Test PASSED"
else
  print *,"Test FAILED"
endif
end
!
subroutine cur3(n, onhost)
use curand
real(8) :: a(n)
type(curandGenerator) :: g
logical onhost, passing
a = 0.0d0
passing = .true.
if (onhost) then
  istat = curandCreateGeneratorHost(g,CURAND_RNG_PSEUDO_XORWOW)
  istat = curandGenerate(g, a, n)
  istat = curandDestroyGenerator(g)
else
  !$acc data copy(a)
  istat = curandCreateGenerator(g,CURAND_RNG_PSEUDO_XORWOW)
  !$acc host_data use_device(a)
  istat = curandGenerate(g, a, n)
  !$acc end host_data
  istat = curandDestroyGenerator(g)
  !$acc end data
endif
do i = 1, n
  if (i.lt.10) print *,i,a(i)
  if ((a(i).lt.0.0d0) .or. (a(i).gt.1.0d0)) passing = .false.
end do
rmean = sum(a)/n
if ((rmean .lt. 0.4d0) .or. (rmean .gt. 0.6d0)) then
  passing = .false.
else
  print *,"Mean is ",rmean," which passes"
endif
if (passing) then
  print *,"Test PASSED"
else
  print *,"Test FAILED"
endif
end
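
The generic curandGenerate calls above draw from a uniform distribution. The cuRAND host API also provides distribution-specific generators; the subroutine below is a minimal sketch, not part of the original tests, which assumes the curandGenerateNormal interface in the curand module follows the C API argument order (generator, array, n, mean, stddev).

subroutine cur4(n)
use curand
real :: a(n)
type(curandGenerator) :: g
! Assumed interface: normal distribution with mean 0.0 and stddev 1.0.
! n should be even, since normal generation produces values in pairs.
istat = curandCreateGeneratorHost(g,CURAND_RNG_PSEUDO_XORWOW)
istat = curandGenerateNormal(g, a, n, 0.0, 1.0)
istat = curandDestroyGenerator(g)
print *,"Sample mean (should be near 0.0) ",sum(a)/n
end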

Using cuRAND from OpenACC Device Code

This example demonstrates the use of the openacc_curand module from within OpenACC device code.

Simple cuRAND Test from OpenACC Device Code

module mtests
  integer, parameter :: n = 1000
  contains
    subroutine testrand( a, b )
    use openacc_curand
    real :: a(n), b(n)
    type(curandStateXORWOW) :: h
    integer(8) :: seed, seq, offset
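    ! num_gangs(1) vector_length(1) below runs the region on a single
    ! thread, so one private cuRAND state yields a single sequential stream.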

    !$acc parallel num_gangs(1) vector_length(1) copy(a,b) private(h)
    seed = 12345
    seq = 0
    offset = 0
    call curand_init(seed, seq, offset, h)
    !$acc loop seq
    do i = 1, n
      a(i) = curand_uniform(h)
      b(i) = curand_normal(h)
    end do
    !$acc end parallel
    return
    end subroutine
end module mtests

program t
use mtests
real :: a(n), b(n), c(n)
logical passing
a = 1.0
b = 2.0
passing = .true.
call testrand(a,b)
c = a
print *,"Should be uniform around 0.5"
do i = 1, n
  if (i.lt.10) print *,i,c(i)
  if ((c(i).lt.0.0) .or. (c(i).gt.1.0)) passing = .false.
end do
rmean = sum(c)/n
if ((rmean .lt. 0.4) .or. (rmean .gt. 0.6)) then
  passing = .false.
else
  print *,"Mean is ",rmean," which passes"
endif
c = b
print *,"Should be normal around 0.0"
nc1 = 0;
nc2 = 0;
do i = 1, n
  if (i.lt.10) print *,i,c(i)
  if ((c(i) .gt. -4.0) .and. (c(i) .lt. 0.0)) nc1 = nc1 + 1
  if ((c(i) .gt.  0.0) .and. (c(i) .lt. 4.0)) nc2 = nc2 + 1
end do
print *,"Found on each side of zero ",nc1,nc2
if (abs(nc1-nc2) .gt. (n/10)) passing = .false.
rmean = sum(c,mask=abs(c).lt.4.0)/n
if ((rmean .lt. -0.1) .or. (rmean .gt. 0.1)) then
  passing = .false.
else
  print *,"Mean is ",rmean," which passes"
endif

if (passing) then
  print *,"Test PASSED"
else
  print *,"Test FAILED"
endif
end program

Using cuRAND from CUDA Fortran Host Code

This example demonstrates the use of the curand module, the curandGenerator type, and several forms of cuRAND calls.

Simple cuRAND Test

program testcurand1
call testr1(1000)
call testr2(1000)
call testr3(1000)
end
!
subroutine testr1(n)
use cudafor
use curand
integer, managed :: a(n)
type(curandGenerator) :: g
integer(8) nbits
logical passing
a = 0
passing = .true.
istat = curandCreateGenerator(g,CURAND_RNG_PSEUDO_XORWOW)
istat = curandGenerate(g, a, n)
istat = cudaDeviceSynchronize()
istat = curandDestroyGenerator(g)
nbits = 0
do i = 1, n
  if (i.lt.10) print *,i,a(i)
  nbits = nbits + popcnt(a(i))
end do
print *,"Should be roughly half the bits set"
nbits = nbits / n
if ((nbits .lt. 12) .or. (nbits .gt. 20)) then
  passing = .false.
else
  print *,"nbits is ",nbits," which passes"
endif
if (passing) then
  print *,"Test PASSED"
else
  print *,"Test FAILED"
endif
end
!
subroutine testr2(n)
use cudafor
use curand
real, managed :: a(n)
type(curandGenerator) :: g
logical passing
a = 0.0
passing = .true.
istat = curandCreateGenerator(g,CURAND_RNG_PSEUDO_XORWOW)
istat = curandGenerate(g, a, n)
istat = cudaDeviceSynchronize()
istat = curandDestroyGenerator(g)
print *,"Should be uniform around 0.5"
do i = 1, n
  if (i.lt.10) print *,i,a(i)
  if ((a(i).lt.0.0) .or. (a(i).gt.1.0)) passing = .false.
end do
rmean = sum(a)/n
if ((rmean .lt. 0.4) .or. (rmean .gt. 0.6)) then
  passing = .false.
else
  print *,"Mean is ",rmean," which passes"
endif
if (passing) then
  print *,"Test PASSED"
else
  print *,"Test FAILED"
endif
end
!
subroutine testr3(n)
use cudafor
use curand
real(8), managed :: a(n)
type(curandGenerator) :: g
logical passing
a = 0.0d0
passing = .true.
istat = curandCreateGenerator(g,CURAND_RNG_PSEUDO_XORWOW)
istat = curandGenerate(g, a, n)
istat = cudaDeviceSynchronize()
istat = curandDestroyGenerator(g)
do i = 1, n
  if (i.lt.10) print *,i,a(i)
  if ((a(i).lt.0.0d0) .or. (a(i).gt.1.0d0)) passing = .false.
end do
rmean = sum(a)/n
if ((rmean .lt. 0.4d0) .or. (rmean .gt. 0.6d0)) then
  passing = .false.
else
  print *,"Mean is ",rmean," which passes"
endif
if (passing) then
  print *,"Test PASSED"
else
  print *,"Test FAILED"
endif
end

Using cuRAND from CUDA Fortran Device Code

This example demonstrates the use of the curand_device module from a CUDA Fortran global subroutine.

Simple cuRAND Test from Device Code

module mtests
  use curand_device
  integer, parameter :: n = 10000
  contains
    attributes(global) subroutine testr( a, b )
    real, device :: a(n), b(n)
    type(curandStateXORWOW) :: h
    integer(8) :: seed, seq, offset
    integer :: iam
    iam = threadIdx%x
    seed = iam*64 + 12345
    seq = 0
    offset = 0
    call curand_init(seed, seq, offset, h)
    do i = iam, n, blockdim%x
      a(i) = curand_uniform(h)
      b(i) = curand_normal(h)
    end do
    return
    end subroutine
end module mtests

program t
use mtests
real, allocatable, device :: a(:), b(:)
real c(n), rmean
logical passing
allocate(a(n))
allocate(b(n))
a = 0.0
b = 0.0
passing = .true.
call testr<<<1,32>>> (a,b)
c = a
print *,"Should be uniform around 0.5"
do i = 1, n
  if (i.lt.10) print *,i,c(i)
  if ((c(i).lt.0.0) .or. (c(i).gt.1.0)) passing = .false.
end do
rmean = sum(c)/n
if ((rmean .lt. 0.4) .or. (rmean .gt. 0.6)) then
  passing = .false.
else
  print *,"Mean is ",rmean," which passes"
endif

c = b
print *,"Should be normal around 0.0"
nc1 = 0;
nc2 = 0;
do i = 1, n
  if (i.lt.10) print *,i,c(i)
  if ((c(i) .gt. -4.0) .and. (c(i) .lt. 0.0)) nc1 = nc1 + 1
  if ((c(i) .gt.  0.0) .and. (c(i) .lt. 4.0)) nc2 = nc2 + 1
end do
print *,"Found on each side of zero ",nc1,nc2
if (abs(nc1-nc2) .gt. (n/10)) passing = .false.
rmean = sum(c,mask=abs(c).lt.4.0)/n
if ((rmean .lt. -0.1) .or. (rmean .gt. 0.1)) then
  passing = .false.
else
  print *,"Mean is ",rmean," which passes"
endif

if (passing) then
  print *,"Test PASSED"
else
  print *,"Test FAILED"
endif
end program

Using cuSPARSE from OpenACC Host Code

This example demonstrates the use of the cuSPARSE module, the cusparseHandle type, and several calls to the cuSPARSE library.

Simple cuSPARSE Test

program sparseMatVec
    integer n
    n = 25 ! # rows/cols in dense matrix
    call sparseMatVecSub1(n)
    n = 45 ! # rows/cols in dense matrix
    call sparseMatVecSub1(n)
end program

subroutine sparseMatVecSub1(n)
  use openacc
  use cusparse

  implicit none

  integer n

  ! dense data
  real(4), allocatable :: Ade(:,:), x(:), y(:)

  ! sparse CSR arrays
  real(4), allocatable :: csrValA(:)
  integer, allocatable :: nnzPerRowA(:), csrRowPtrA(:), csrColIndA(:)

  allocate(Ade(n,n), x(n), y(n))
  allocate(csrValA(n))
  allocate(nnzPerRowA(n), csrRowPtrA(n+1), csrColIndA(n))

  call sparseMatVecSub2(Ade, x, y, csrValA, nnzPerRowA, csrRowPtrA, &
                                                         csrColIndA, n)
  deallocate(Ade)
  deallocate(x)
  deallocate(y)
  deallocate(csrValA)
  deallocate(nnzPerRowA)
  deallocate(csrRowPtrA)
  deallocate(csrColIndA)
end subroutine

subroutine sparseMatVecSub2(Ade, x, y, csrValA, nnzPerRowA, csrRowPtrA, &
                                                             csrColIndA, n)
  use openacc
  use cusparse

  implicit none

  ! dense data
  real(4) :: Ade(n,n), x(n), y(n)

  ! sparse CSR arrays
  real(4) :: csrValA(n)
  integer :: nnzPerRowA(n), csrRowPtrA(n+1), csrColIndA(n)

  integer :: n, nnz, status, i
  type(cusparseHandle) :: h
  type(cusparseMatDescr) :: descrA

  ! parameters
  real(4) :: alpha, beta

  ! result
  real(4) :: xerr

  ! initialize CUSPARSE and matrix descriptor
  status = cusparseCreate(h)
  if (status /= CUSPARSE_STATUS_SUCCESS) &
       write(*,*) 'cusparseCreate error: ', status
  status = cusparseCreateMatDescr(descrA)
  status = cusparseSetMatType(descrA, &
       CUSPARSE_MATRIX_TYPE_GENERAL)
  status = cusparseSetMatIndexBase(descrA, &
       CUSPARSE_INDEX_BASE_ONE)
  status = cusparseSetStream(h, acc_get_cuda_stream(acc_async_sync))

  !$acc data create(Ade, x, y, csrValA, nnzPerRowA, csrRowPtrA, csrColIndA)

  ! Initialize matrix (upper circular shift matrix)
  !$acc kernels
  Ade = 0.0
  do i = 1, n-1
     Ade(i,i+1) = 1.0
  end do
  Ade(n,1) = 1.0

  ! Initialize vectors and constants
  do i = 1, n
     x(i) = i
  enddo
  y = 0.0
  !$acc end kernels

  !$acc update host(x)
  write(*,*) 'Original vector:'
  write(*,'(5(1x,f7.2))') x

  ! convert matrix from dense to csr format
  !$acc host_data use_device(Ade, nnzPerRowA, csrValA, csrRowPtrA, csrColIndA)
  status = cusparseSnnz_v2(h, CUSPARSE_DIRECTION_ROW, &
       n, n, descrA, Ade, n, nnzPerRowA, nnz)
  status = cusparseSdense2csr(h, n, n, descrA, Ade, n, &
       nnzPerRowA, csrValA, csrRowPtrA, csrColIndA)
  !$acc end host_data

  ! A is upper circular shift matrix
  ! y = alpha*A*x + beta*y
  alpha = 1.0
  beta = 0.0
  !$acc host_data use_device(csrValA, csrRowPtrA, csrColIndA, x, y)
  status = cusparseScsrmv(h, CUSPARSE_OPERATION_NON_TRANSPOSE, &
       n, n, n, alpha, descrA, csrValA, csrRowPtrA, &
       csrColIndA, x, beta, y)
  !$acc end host_data

  !$acc wait
  !$acc update host(y)
  write(*,*) 'Shifted vector:'
  write(*,'(5(1x,f7.2))') y

  ! shift-down y and add original x
  ! A' is lower circular shift matrix
  ! x = alpha*A'*y + beta*x
  beta = -1.0
  !$acc host_data use_device(csrValA, csrRowPtrA, csrColIndA, x, y)
  status = cusparseScsrmv(h, CUSPARSE_OPERATION_TRANSPOSE, &
       n, n, n, alpha, descrA, csrValA, csrRowPtrA, &
       csrColIndA, y, beta, x)
  !$acc end host_data

  !$acc kernels
  xerr = maxval(abs(x))
  !$acc end kernels
  !$acc end data

  write(*,*) 'Max error = ', xerr
  if (xerr.le.1.e-5) then
    write(*,*) 'Test PASSED'
  else
    write(*,*) 'Test FAILED'
  endif

end subroutine

Using cuSPARSE from CUDA Fortran Host Code

This example demonstrates the use of the cuSPARSE module, the cusparseHandle type, and several forms of cuSPARSE calls.

Simple cuSPARSE Test

program sparseMatVec
  use cudafor
  use cusparse

  implicit none

  integer, parameter :: n = 25 ! # rows/cols in dense matrix

  type(cusparseHandle) :: h
  type(cusparseMatDescr) :: descrA

  ! dense data
  real(4), managed :: Ade(n,n), x(n), y(n)

  ! sparse CSR arrays
  real(4), managed :: csrValA(n)
  integer, managed :: nnzPerRowA(n), &
       csrRowPtrA(n+1), csrColIndA(n)
  integer :: nnz, status, i

  ! parameters
  real(4) :: alpha, beta

  ! initialize CUSPARSE and matrix descriptor
  status = cusparseCreate(h)
  if (status /= CUSPARSE_STATUS_SUCCESS) &
       write(*,*) 'cusparseCreate error: ', status
  status = cusparseCreateMatDescr(descrA)
  status = cusparseSetMatType(descrA, &
       CUSPARSE_MATRIX_TYPE_GENERAL)
  status = cusparseSetMatIndexBase(descrA, &
       CUSPARSE_INDEX_BASE_ONE)

  ! Initialize matrix (upper circular shift matrix)
  Ade = 0.0
  do i = 1, n-1
     Ade(i,i+1) = 1.0
  end do
  Ade(n,1) = 1.0

  ! Initialize vectors and constants
  x = [(i,i=1,n)]
  y = 0.0

  write(*,*) 'Original vector:'
  write(*,'(5(1x,f7.2))') x

  ! convert matrix from dense to csr format
  status = cusparseSnnz_v2(h, CUSPARSE_DIRECTION_ROW, &
       n, n, descrA, Ade, n, nnzPerRowA, nnz)
  status = cusparseSdense2csr(h, n, n, descrA, Ade, n, &
       nnzPerRowA, csrValA, csrRowPtrA, csrColIndA)

  ! A is upper circular shift matrix
  ! y = alpha*A*x + beta*y
  alpha = 1.0
  beta = 0.0
  status = cusparseScsrmv(h, CUSPARSE_OPERATION_NON_TRANSPOSE, &
       n, n, n, alpha, descrA, csrValA, csrRowPtrA, &
       csrColIndA, x, beta, y)

  ! shift-down y and add original x
  ! A' is lower circular shift matrix
  ! x = alpha*A'*y + beta*x
  beta = -1.0
  status = cusparseScsrmv(h, CUSPARSE_OPERATION_TRANSPOSE, &
       n, n, n, alpha, descrA, csrValA, csrRowPtrA, &
       csrColIndA, y, beta, x)

  status = cudaDeviceSynchronize()

  write(*,*) 'Shifted vector:'
  write(*,'(5(1x,f7.2))') y

  write(*,*) 'Max error = ', maxval(abs(x))

  if (maxval(abs(x)).le.1.e-5) then
    write(*,*) 'Test PASSED'
  else
    write(*,*) 'Test FAILED'
  endif

end program sparseMatVec

Using cuTENSOR from CUDA Fortran Host Code

This example demonstrates the use of the low-level cuTENSOR module, version 2, from CUDA Fortran to perform a reshape permutation and a sum reduction across one dimension.

This example can be compiled and linked as a normal CUDA Fortran subprogram by adding the “-cudalib=cutensor” option to the link line.

Simple cuTENSOR Test from CUDA Fortran

program testcutensorcuf1
use cudafor
use cutensor

integer, parameter :: ndim = 3
real, managed, allocatable :: dA(:,:,:), dC(:,:)
real, allocatable :: hA(:,:,:), hC(:,:)
real, device, allocatable :: workbuf(:)
real :: alpha, beta
integer(4) :: numModesA, numModesC
integer(8), dimension(ndim) :: extA, strA, extC, strC
integer(4), dimension(ndim) :: modeA, modeC
integer(8) :: workbufsize

type(cutensorStatus) :: cstat
type(cutensorHandle) :: handle
type(cutensorTensorDescriptor) :: descA, descC
type(cutensorOperationDescriptor) :: rdesc
type(cutensorComputeDescriptor) :: descCompute
type(cutensorPlan) :: plan
type(cutensorAlgo) :: algo
type(cutensorPlanPreference) :: pref

! Init
cstat = cutensorCreate(handle)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)

descCompute = CUTENSOR_COMPUTE_DESC_32F

allocate(dA(100,60,80))
! This is the operation we are going to perform
! dC = sum(reshape(dA,shape=[80,60,100],order=[3,2,1]),dim=2)
!
call random_number(dA); dA = real(int(dA * 10.0))
extA = shape(dA)
strA(1) =  1; strA(2) = 100; strA(3) = 6000
modeA(1) = 3; modeA(2) = 2; modeA(3) = 1
numModesA = ndim
ialign = 256

print *,"Desc A"
cstat = cutensorCreateTensorDescriptor(handle, descA, numModesA, extA, strA, &
              CUTENSOR_R_32F, ialign)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)

allocate(dC(80,100))
extC(1:ndim-1) = shape(dC)
strC(1) = 1; strC(2) = 80
numModesC = ndim-1
dC = 0.0
modeC(1) = 1; modeC(2) = 3  ! Skip mode 2 for reduction across that dim

print *,"Desc C"
cstat = cutensorCreateTensorDescriptor(handle, descC, numModesC, extC, strC, &
              CUTENSOR_R_32F, ialign)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)

print *,"CreateRed "
cstat = cutensorCreateReduction(handle, rdesc, &
              descA, modeA, CUTENSOR_OP_IDENTITY, &
              descC, modeC, CUTENSOR_OP_IDENTITY, &
              descC, modeC, CUTENSOR_OP_ADD, descCompute)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)

print *,"CreatePlanPref "
cstat = cutensorCreatePlanPreference(handle, pref, CUTENSOR_ALGO_DEFAULT, &
              CUTENSOR_JIT_MODE_NONE)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)

print *,"EstimateWork "
cstat = cutensorEstimateWorkspaceSize(handle, rdesc, pref, &
              CUTENSOR_WORKSPACE_DEFAULT, workbufsize)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)

print *,"Estimated workspace size: ",workbufsize

print *,"CreatePlan "
cstat = cutensorCreatePlan(handle, plan, rdesc, pref, workbufsize)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)

allocate(workbuf((workbufsize+3)/4))

alpha = 1.0; beta = 0.0
print *,"Reduce "
cstat = cutensorReduce(handle, plan, alpha, dA, beta, dC, &
              dC, workbuf, workbufsize, 0)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)

hA = dA
hC = sum(reshape(hA,[80,60,100],order=[3,2,1]), dim=2)
istat = cudaDeviceSynchronize()  ! Managed memory, to be sure

if (all(hC.eq.dC)) then
  print *,"test PASSED"
else
  print *,"test FAILED"
end if

cstat = cutensorDestroy(handle)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
cstat = cutensorDestroyPlan(plan)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
cstat = cutensorDestroyPlanPreference(pref)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
cstat = cutensorDestroyOperationDescriptor(rdesc)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
cstat = cutensorDestroyTensorDescriptor(descA)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)
cstat = cutensorDestroyTensorDescriptor(descC)
if (cstat.ne.CUTENSOR_STATUS_SUCCESS) print *,cutensorGetErrorString(cstat)

deallocate(workbuf)
deallocate(dA, dC)
deallocate(hA, hC)

end program

Using cuTENSOREX from CUDA Fortran Host Code

This example demonstrates the use of the higher-level cuTENSOREX module from CUDA Fortran to perform a large matrix multiplication using multiple OpenMP threads.

This example can be compiled and linked as a normal CUDA Fortran subprogram by adding the “-mp -cudalib=cutensor” options to the compile and link line.

Multi-threaded cuTENSOREX Example from CUDA Fortran

! Test cuTensor + cuda Fortran + OMP multi-stream matmul
program testCuCufMsMatmul
use cudafor
use cutensorex

integer, parameter :: m=8192, k=1280, n=1024
integer, parameter :: mblksize = 128
integer, parameter :: mslices = m / mblksize
integer, parameter :: nstreams = 4
integer, parameter :: numtimes = mslices / nstreams

real(8), allocatable, dimension(:,:), device :: a_d, d_d
real(8), allocatable, dimension(:,:,:), pinned :: ha
real(8), dimension(k,n), device :: b_d
real(8), allocatable, dimension(:,:), pinned :: d
real(8) :: alpha
integer(kind=cuda_stream_kind) :: mystream
!$OMP THREADPRIVATE(a_d, d_d, mystream)

allocate( ha(k,mblksize,nstreams))
allocate(  d(1:m,1:n))

b_d   = 1.0d0
alpha = 1.0d0

!$OMP PARALLEL NUM_THREADS(nstreams) PRIVATE(istat)
istat = cudaStreamCreate(mystream)
istat = cudaforSetDefaultStream(mystream)
istat = cutensorExSetStream(mystream)
! At this point, all new allocations will pick up default stream
allocate(a_d(k,mblksize))
allocate(d_d(mblksize,n))
!$OMP END PARALLEL

! Test matmul
!$OMP PARALLEL DO NUM_THREADS(nstreams) PRIVATE(jlcl,jgbl,jend)
do ns = 1, nstreams
    do nt = 1, numtimes
        jgbl = 1 + ((ns-1) + (nt-1)*nstreams)*mblksize
        jend = jgbl + mblksize - 1
        ! Do some host work
        do jlcl = 1, mblksize
            ha(:,jlcl,ns) = dble(jlcl+jgbl-1)
        end do
        ! Move data to the device on default stream
        a_d = ha(:,:,ns)
        ! Matrix multiply on my thread cutensorEx stream
        d_d = alpha * matmul(transpose(a_d),b_d)
        ! Move result back to host on default stream
        d(jgbl:jend,:) = d_d
    end do
end do
! Wait for all threads to finish GPU work
istat = cudaDeviceSynchronize()
nfailed = 0
do j = 1, n
    do i = 1, m
        if (d(i,j) .ne. i*k) then
            if (nfailed .lt. 100) print *,i,j,d(i,j)
            nfailed = nfailed + 1
        end if
    end do
end do
if (nfailed .eq. 0) then
    print *,"test PASSED"
else
    print *,"test FAILED"
endif
end program

Using cuTENSOR from OpenACC Host Code

This example demonstrates the use of the cuTENSOREX module, calling Matmul() using OpenACC device data, and setting the cuTENSOR library stream to be consistent with the OpenACC default stream.

This example can be compiled and run with or without OpenACC. To compile with OpenACC, the options are “-ta=tesla -cuda -cudalib=cutensor”. To run on the CPU, leave off those options.

Simple cuTENSOREX Test from OpenACC

program testcutensorOpenACC
!@acc use openacc
!@acc use cutensorex
integer, parameter :: ni=1280, nj=1024, nk=960, ntimes=1
real(8) :: a(ni,nk), b(nk,nj), c(ni,nj), d(ni,nj)

call random_number(a)
call random_number(b)
a = dble(int(4.0d0*a - 2.0d0))
b = dble(int(8.0d0*b - 4.0d0))
c = 2.0; d = 0.0

!$acc enter data copyin(a,b,c) create(d)
!@acc istat = cutensorExSetStream(acc_get_cuda_stream(acc_async_sync))
!$acc host_data use_device(a,b,c,d)
do nt = 1, ntimes
  d = c + matmul(a,b)
end do
!$acc end host_data

!$acc update host(d)
print *,sum(d)

do nt = 1, ntimes
!$acc kernels
  do j = 1, nj
    do i = 1, ni
      d(i,j) = c(i,j)
      do k = 1, nk
        d(i,j) = d(i,j) + a(i,k) * b(k,j)
      end do
    end do
  end do
!$acc end kernels
end do
!$acc exit data copyout(d)

print *,sum(d)
end program