Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
68f95a0
add select-rank-less versions of mpp_pack and mpp_global_field
May 14, 2026
e0be911
quick fix for class(*) in diag_data
May 14, 2026
f247654
remove whitespace
May 14, 2026
f41c8ac
fix indent in diag_data
May 18, 2026
e523d22
add gpu2gpu mpi transer with flag for do_group_update
edoyango Aug 28, 2025
c2a0391
add missing collapse(3) clauses
edoyango Sep 2, 2025
e0172fd
Use __NVCOMPILER macro for target regions
edoyango Oct 10, 2025
3947369
add back old omp directive wrapped in #ifndef __NVCOMPILER
edoyango Oct 10, 2025
d622346
port remaining un/pack loops
edoyango Oct 10, 2025
57b1989
add multi gpu support (#2)
JorgeG94 Oct 30, 2025
715763e
sub __NVCOMPILER with __NVCOMPILER_OPENMP_GPU
edoyango Oct 14, 2025
72fbc7b
allow choice of gpu or cpu parallel
edoyango Dec 16, 2025
8b84417
fix omp set device call
edoyango Jan 15, 2026
fc36b70
Revert "allow choice of gpu or cpu parallel"
edoyango Feb 16, 2026
487cbfc
OMP MPI: Minor cleanups
marshallward Mar 16, 2026
af47ce2
OMP target MPI: line length compliance
marshallward Mar 16, 2026
dab46c3
OMP MPI: Update nocomm interface
marshallward Mar 17, 2026
51754e0
use openmp cpu if ompoffload=.false.
edoyango Apr 17, 2026
9b45a30
remove trailing whitespace in comments
edoyango May 5, 2026
f883340
add changes for group update test
Apr 29, 2026
93e03ac
fix k outside path and normalise omp offload clauses
edoyango May 6, 2026
11f93e7
offload folded-north post-processing
edoyango May 6, 2026
7b28243
enable group update omp offload tests
edoyango May 6, 2026
d317c75
mpp: fix uninitialized error text on single-PE runs
edoyango Jul 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions diag_manager/diag_data.F90
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,9 @@ MODULE diag_data_mod
class(*), allocatable :: att_value(:) !< Value of the attribute
character(len=:), allocatable :: att_name !< Name of the attribute
contains
#ifndef __NVCOMPILER
procedure :: add => fms_add_attribute
#endif
procedure :: write_metadata
end type fmsDiagAttribute_type
! Include variable "version" to be written to log file.
Expand Down Expand Up @@ -558,6 +560,7 @@ function get_base_second() &
res = base_second
end function get_base_second

#ifndef __NVCOMPILER
!> @brief Adds an attribute to the attribute type
subroutine fms_add_attribute(this, att_name, att_value)
class(fmsDiagAttribute_type), intent(inout) :: this !< Diag attribute type
Expand Down Expand Up @@ -589,6 +592,7 @@ subroutine fms_add_attribute(this, att_name, att_value)
end select
end select
end subroutine fms_add_attribute
#endif

!> @brief gets the type of a variable
!> @return the type of the variable (r4,r8,i4,i8,string)
Expand Down
347 changes: 275 additions & 72 deletions mpp/include/group_update_pack.inc

Large diffs are not rendered by default.

94 changes: 74 additions & 20 deletions mpp/include/group_update_unpack.inc
Original file line number Diff line number Diff line change
Expand Up @@ -17,92 +17,146 @@
!***********************************************************************

if( group%k_loop_inside ) then
!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
! nvfortran + cray pointers imposes some restrictions on the loops below:
! * the compiler cannot privatise OpenMP cray pointers in offloaded loops. Hence, inner loops
! must be ported rather than the whole outer loop.
! * the more verbose form of openmp offload loops must be used. Would prefer "target teams loop".
! * default(shared) must be used otherwise loops hang or segfault. Would prefer "default(none)".
#ifndef __NVCOMPILER_OPENMP_GPU
!$OMP parallel do default(shared) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, &
!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k )
!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx)
#endif
do n = nunpack, 1, -1
buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos
pos = buffer_pos
is = group%unpack_is(n); ie = group%unpack_ie(n)
js = group%unpack_js(n); je = group%unpack_je(n)
is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1
js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1
if( group%unpack_type(n) == FIELD_S ) then
do l=1,nscalar ! loop over number of fields
ptr_field = group%addrs_s(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
!$omp default(shared) &
!$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) &
!$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) &
!$omp map(from: field(is:ie,js:je,1:ksize))
#endif
do k = 1, ksize
do j = js, je
do i = is, ie
pos = pos + 1
field(i,j,k) = buffer(pos)
idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How are these two implementations equivalent? Is new idx = old pos always?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, they're equivalent. For any iteration, idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 produces the same value that pos would have had at that point. The formula accounts for all the iterations that would have occurred in the nested loops up to that (i,j,k) position.

The reason for the change is that each nested iteration is now independent and can be performed in parallel.

field(i,j,k) = buffer(idx)
end do
end do
end do
pos = pos + ksize*nj*ni
end do
else if( group%unpack_type(n) == FIELD_X ) then
do l=1,nvector ! loop over number of fields
ptr_fieldx = group%addrs_x(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
!$omp default(shared) &
!$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) &
!$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) &
!$omp map(from: fieldx(is:ie,js:je,1:ksize))
#endif
do k = 1, ksize
do j = js, je
do i = is, ie
pos = pos + 1
fieldx(i,j,k) = buffer(pos)
idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
fieldx(i,j,k) = buffer(idx)
end do
end do
end do
pos = pos + ksize*nj*ni
end do
else if( group%unpack_type(n) == FIELD_Y ) then
do l=1,nvector ! loop over number of fields
ptr_fieldy = group%addrs_y(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(3) if(use_device_ptr) &
!$omp default(shared) &
!$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldy,ptr) &
!$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) &
!$omp map(from: fieldy(is:ie,js:je,1:ksize))
#endif
do k = 1, ksize
do j = js, je
do i = is, ie
pos = pos + 1
fieldy(i,j,k) = buffer(pos)
idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
fieldy(i,j,k) = buffer(idx)
end do
end do
end do
pos = pos + ksize*nj*ni
end do
endif
enddo
else
!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
#ifndef __NVCOMPILER_OPENMP_GPU
!$OMP parallel do default(shared) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, &
!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k)
!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx)
#endif
do nk = nunpack*ksize, 1, -1
n = (nk-1)/ksize + 1
k = mod((nk-1), ksize) + 1
buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos
pos = buffer_pos + (k-1)*group%unpack_size(n)
is = group%unpack_is(n); ie = group%unpack_ie(n)
js = group%unpack_js(n); je = group%unpack_je(n)
is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1
js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1
if( group%unpack_type(n) == FIELD_S ) then
do l=1,nscalar ! loop over number of fields
ptr_field = group%addrs_s(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
!$omp default(shared) &
!$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) &
!$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: field(is:ie,js:je,k))
#endif
do j = js, je
do i = is, ie
pos = pos + 1
field(i,j,k) = buffer(pos)
idx = pos + (j-js)*ni + (i-is) + 1
field(i,j,k) = buffer(idx)
end do
end do
pos = pos + ni*nj
end do
else if( group%unpack_type(n) == FIELD_X ) then
do l=1,nvector ! loop over number of fields
ptr_fieldx = group%addrs_x(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
!$omp default(shared) &
!$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) &
!$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldx(is:ie,js:je,k))
#endif
do j = js, je
do i = is, ie
pos = pos + 1
fieldx(i,j,k) = buffer(pos)
idx = pos + (j-js)*ni + (i-is) + 1
fieldx(i,j,k) = buffer(idx)
end do
end do
pos = pos + ni*nj
end do
else if( group%unpack_type(n) == FIELD_Y ) then
do l=1,nvector ! loop over number of fields
ptr_fieldy = group%addrs_y(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(2) if(use_device_ptr) &
!$omp default(shared) &
!$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) &
!$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldy(is:ie,js:je,k))
#endif
do j = js, je
do i = is, ie
pos = pos + 1
fieldy(i,j,k) = buffer(pos)
idx = pos + (j-js)*ni + (i-is) + 1
fieldy(i,j,k) = buffer(idx)
end do
end do
pos = pos + ni*nj
end do
endif
enddo
Expand Down
9 changes: 9 additions & 0 deletions mpp/include/mpp_comm_mpi.inc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!> @brief Initialize the @ref mpp_mod module. Must be called before any usage.
subroutine mpp_init_f08( flags, localcomm, test_level, alt_input_nml_path )
!$ use omp_lib
integer, optional, intent(in) :: flags !< Flags for debug output, can be MPP_VERBOSE or MPP_DEBUG
type(mpi_comm), optional, intent(in) :: localcomm !< MPI communicator to use. Only relevant if MPI has already
!! been initialized by an external call to mpi_init.
Expand Down Expand Up @@ -60,6 +61,14 @@
call MPI_COMM_RANK( mpp_comm_private, pe, error )
call MPI_COMM_SIZE( mpp_comm_private, npes, error )

! set default device to enable multi GPU parallelism
! calls to both OpenACC and OpenMP runtimes are needed
! because we use both do-concurrent and openmp
! if you remove either, the code will run multiple
! ranks on a _single_ GPU. Be careful out there!
!$ call omp_set_default_device(pe)
!$acc set device_num(pe)

module_is_initialized = .TRUE.
if (present(test_level)) then
t_level = test_level
Expand Down
123 changes: 123 additions & 0 deletions mpp/include/mpp_domains_reduce.inc
Original file line number Diff line number Diff line change
Expand Up @@ -905,6 +905,8 @@
#undef MPP_TYPE_INIT_VALUE

!****************************************************
#ifndef __NVCOMPILER

#undef MPP_GLOBAL_FIELD_
#define MPP_GLOBAL_FIELD_ mpp_global_field_r8
#undef MPP_TYPE_
Expand Down Expand Up @@ -972,6 +974,127 @@
#undef DEFAULT_VALUE_
#define DEFAULT_VALUE_ .false._l4_kind
#include <mpp_global_field.fh>


!! if __NVCOMPILER is defined, use compatibility version
#else

#undef MPP_TYPE_
#define MPP_TYPE_ real(r8_kind)
#undef DEFAULT_VALUE_
#define DEFAULT_VALUE_ 0._r8_kind
#undef MPP_GLOBAL_FIELD_2D_
#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_r8_2d
#undef MPP_GLOBAL_FIELD_3D_
#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_r8_3d
#undef MPP_GLOBAL_FIELD_4D_
#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_r8_4d
#undef MPP_GLOBAL_FIELD_5D_
#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_r8_5d
#include <mpp_global_field_compat.fh>

#undef MPP_TYPE_
#define MPP_TYPE_ integer(i8_kind)
#undef DEFAULT_VALUE_
#define DEFAULT_VALUE_ 0_i8_kind
#undef MPP_GLOBAL_FIELD_2D_
#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_i8_2d
#undef MPP_GLOBAL_FIELD_3D_
#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_i8_3d
#undef MPP_GLOBAL_FIELD_4D_
#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_i8_4d
#undef MPP_GLOBAL_FIELD_5D_
#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_i8_5d
#include <mpp_global_field_compat.fh>

#ifdef OVERLOAD_C8
#undef MPP_TYPE_
#define MPP_TYPE_ complex(c8_kind)
#undef DEFAULT_VALUE_
#define DEFAULT_VALUE_ (0._r8_kind,0._r8_kind)
#undef MPP_GLOBAL_FIELD_2D_
#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_c8_2d
#undef MPP_GLOBAL_FIELD_3D_
#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_c8_3d
#undef MPP_GLOBAL_FIELD_4D_
#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_c8_4d
#undef MPP_GLOBAL_FIELD_5D_
#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_c8_5d
#include <mpp_global_field_compat.fh>
#endif

#ifdef OVERLOAD_C4
#undef MPP_TYPE_
#define MPP_TYPE_ complex(c4_kind)
#undef DEFAULT_VALUE_
#define DEFAULT_VALUE_ (0._r4_kind,0._r4_kind)
#undef MPP_GLOBAL_FIELD_2D_
#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_c4_2d
#undef MPP_GLOBAL_FIELD_3D_
#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_c4_3d
#undef MPP_GLOBAL_FIELD_4D_
#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_c4_4d
#undef MPP_GLOBAL_FIELD_5D_
#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_c4_5d
#include <mpp_global_field_compat.fh>
#endif

#undef MPP_TYPE_
#define MPP_TYPE_ logical(l8_kind)
#undef DEFAULT_VALUE_
#define DEFAULT_VALUE_ .false._l8_kind
#undef MPP_GLOBAL_FIELD_2D_
#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_l8_2d
#undef MPP_GLOBAL_FIELD_3D_
#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_l8_3d
#undef MPP_GLOBAL_FIELD_4D_
#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_l8_4d
#undef MPP_GLOBAL_FIELD_5D_
#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_l8_5d
#include <mpp_global_field_compat.fh>

#undef MPP_TYPE_
#define MPP_TYPE_ real(r4_kind)
#undef DEFAULT_VALUE_
#define DEFAULT_VALUE_ 0._r4_kind
#undef MPP_GLOBAL_FIELD_2D_
#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_r4_2d
#undef MPP_GLOBAL_FIELD_3D_
#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_r4_3d
#undef MPP_GLOBAL_FIELD_4D_
#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_r4_4d
#undef MPP_GLOBAL_FIELD_5D_
#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_r4_5d
#include <mpp_global_field_compat.fh>

#undef MPP_TYPE_
#define MPP_TYPE_ integer(i4_kind)
#undef DEFAULT_VALUE_
#define DEFAULT_VALUE_ 0_i4_kind
#undef MPP_GLOBAL_FIELD_2D_
#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_i4_2d
#undef MPP_GLOBAL_FIELD_3D_
#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_i4_3d
#undef MPP_GLOBAL_FIELD_4D_
#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_i4_4d
#undef MPP_GLOBAL_FIELD_5D_
#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_i4_5d
#include <mpp_global_field_compat.fh>

#undef MPP_TYPE_
#define MPP_TYPE_ logical(l4_kind)
#undef DEFAULT_VALUE_
#define DEFAULT_VALUE_ .false._l4_kind
#undef MPP_GLOBAL_FIELD_2D_
#define MPP_GLOBAL_FIELD_2D_ mpp_global_field_l4_2d
#undef MPP_GLOBAL_FIELD_3D_
#define MPP_GLOBAL_FIELD_3D_ mpp_global_field_l4_3d
#undef MPP_GLOBAL_FIELD_4D_
#define MPP_GLOBAL_FIELD_4D_ mpp_global_field_l4_4d
#undef MPP_GLOBAL_FIELD_5D_
#define MPP_GLOBAL_FIELD_5D_ mpp_global_field_l4_5d
#include <mpp_global_field_compat.fh>
#endif
!****************************************************
#undef MPP_DO_GLOBAL_FIELD_3D_AD_
#define MPP_DO_GLOBAL_FIELD_3D_AD_ mpp_do_global_field2D_r8_3d_ad
Expand Down
Loading
Loading