From 59192133a885e88c83e7ea8da2eabe284dd31522 Mon Sep 17 00:00:00 2001 From: Jesse Lentz Date: Thu, 4 Jun 2026 19:56:01 -0400 Subject: [PATCH 1/5] Refactor group_update for improved performance Refactor the packing and unpacking routines of mpp_group_update for improved performance with generalized indices. The outer k-loop versions of the packing and unpacking routines have been removed. --- mpp/include/group_update_pack.inc | 574 ++++++++++------------------ mpp/include/group_update_unpack.inc | 210 ++++++---- mpp/include/mpp_domains_misc.inc | 28 +- mpp/include/mpp_domains_util.inc | 11 - mpp/include/mpp_group_update.fh | 531 +++++++++++-------------- mpp/mpp_domains.F90 | 11 +- test_fms/mpp/test_mpp_domains2.sh | 3 - 7 files changed, 593 insertions(+), 775 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 98b7645836..7134a72bc5 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -16,414 +16,264 @@ !* governing permissions and limitations under the License. !*********************************************************************** -if( group%k_loop_inside ) then -!$OMP parallel do default(none) shared(buffer,npack,group,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation,n,k) - do n = 1, npack +subroutine GROUP_UPDATE_PACK_ (group, buffer, buffer_start_pos) + type(mpp_group_update_type), intent(in) :: group + MPP_TYPE_, intent(out) :: buffer(:) + integer, intent(in) :: buffer_start_pos + integer, parameter :: SWAP_IJ=1, REVERSE_I=2, REVERSE_J=4, MINUS=8 !< Flags which affect packing order + integer :: n, buffer_pos, pos, ltop, ni, nj, nk, nijk, l + + do n=1,group%npack buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos - pos = buffer_pos - is = group%pack_is(n); ie = group%pack_ie(n) - js = group%pack_js(n); je = group%pack_je(n) - rotation = group%pack_rotation(n) + ni = group%pack_ie(n) - group%pack_is(n) + 1 + nj = group%pack_je(n) - group%pack_js(n) + 1 if( group%pack_type(n) == FIELD_S ) then - select case( rotation ) - case(ZERO) - do l=1, group%nscalar ! loop over number of fields - do k = 1, ksize - do j = js, je - do i = is, ie - pos = pos + 1 - !buffer(pos) = field(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_S, l, i, j, k) - end do - end do - enddo - enddo - case( MINUS_NINETY ) - do l=1,group%nscalar ! loop over number of fields - do k = 1, ksize - do i = is, ie - do j = je, js, -1 - pos = pos + 1 - !buffer(pos) = field(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_S, l, i, j, k) - end do - end do - end do - end do - case( NINETY ) - do l=1,group%nscalar ! loop over number of fields - do k = 1, ksize - do i = ie, is, -1 - do j = js, je - pos = pos + 1 - !buffer(pos) = field(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_S, l, i, j, k) - end do - end do - end do - end do - case( ONE_HUNDRED_EIGHTY ) - do l=1,group%nscalar ! loop over number of fields - do k = 1, ksize - do j = je, js, -1 - do i = ie, is, -1 - pos = pos + 1 - !buffer(pos) = field(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_S, l, i, j, k) - end do - end do - end do - end do - end select - else if( group%pack_type(n) == FIELD_X ) then - select case( rotation ) - case(ZERO) - do l=1, nvector ! loop over number of fields - do k = 1, ksize - do j = js, je - do i = is, ie - pos = pos + 1 - !buffer(pos) = fieldx(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_X, l, i, j, k) - end do - end do - end do - end do - case( MINUS_NINETY ) - if( BTEST(group%flags_v,SCALAR_BIT) ) then - do l=1,nvector ! loop over number of fields - do k = 1, ksize - do i = is, ie - do j = je, js, -1 - pos = pos + 1 - !buffer(pos) = fieldy(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_Y, l, i, j, k) - end do - end do - end do - end do - else - do l=1,nvector ! loop over number of fields - do k = 1, ksize - do i = is, ie - do j = je, js, -1 - pos = pos + 1 - !buffer(pos) = -fieldy(i,j,k) - buffer(pos) = - GET_VALUE_ (group, FIELD_Y, l, i, j, k) - end do - end do - end do - end do - end if - case( NINETY ) - do l=1, nvector ! loop over number of fields - do k = 1, ksize - do i = ie, is, -1 - do j = js, je - pos = pos + 1 - !buffer(pos) = fieldy(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_Y, l, i, j, k) - end do - end do - end do - end do - case( ONE_HUNDRED_EIGHTY ) - if( BTEST(group%flags_v,SCALAR_BIT) ) then - do l=1,nvector ! loop over number of fields - do k = 1, ksize - do j = je, js, -1 - do i = ie, is, -1 - pos = pos + 1 - !buffer(pos) = fieldx(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_X, l, i, j, k) - end do - end do - end do - end do - else - do l=1,nvector ! loop over number of fields - do k = 1, ksize - do j = je, js, -1 - do i = ie, is, -1 - pos = pos + 1 - !buffer(pos) = -fieldx(i,j,k) - buffer(pos) = - GET_VALUE_ (group, FIELD_X, l, i, j, k) - end do - end do - end do - end do - end if - end select ! select case( rotation(n) ) - else if( group%pack_type(n) == FIELD_Y ) then - select case( rotation ) - case(ZERO) - do l=1, nvector ! loop over number of fields - do k = 1, ksize - do j = js, je - do i = is, ie - pos = pos + 1 - !buffer(pos) = fieldy(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_Y, l, i, j, k) - end do - end do - end do - end do - case( MINUS_NINETY ) - do l=1,nvector ! loop over number of fields - do k = 1, ksize - do i = is, ie - do j = je, js, -1 - pos = pos + 1 - !buffer(pos) = fieldx(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_X, l, i, j, k) - end do - end do - end do - end do - case( NINETY ) - if( BTEST(group%flags_v,SCALAR_BIT) ) then - do l=1, nvector ! loop over number of fields - do k = 1, ksize - do i = ie, is, -1 - do j = js, je - pos = pos + 1 - !buffer(pos) = fieldx(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_X, l, i, j, k) - end do - end do - end do - end do - else - do l=1,nvector ! loop over number of fields - do k = 1, ksize - do i = ie, is, -1 - do j = js, je - pos = pos + 1 - !buffer(pos) = -fieldx(i,j,k) - buffer(pos) = - GET_VALUE_ (group, FIELD_X, l, i, j, k) - end do - end do - end do - end do - end if - case( ONE_HUNDRED_EIGHTY ) - if( BTEST(group%flags_v,SCALAR_BIT) ) then - do l=1,nvector ! loop over number of fields - do k = 1, ksize - do j = je, js, -1 - do i = ie, is, -1 - pos = pos + 1 - !buffer(pos) = fieldy(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_Y, l, i, j, k) - end do - end do - end do - end do - else - do l=1,nvector ! loop over number of fields - do k = 1, ksize - do j = je, js, -1 - do i = ie, is, -1 - pos = pos + 1 - !buffer(pos) = -fieldy(i,j,k) - buffer(pos) = - GET_VALUE_ (group, FIELD_Y, l, i, j, k) - end do - end do - end do - end do - end if - end select ! select case( rotation(n) ) + ltop = group%nscalar + nk = group%ksize_s + else + ltop = group%nvector + nk = group%ksize_v endif - enddo -else -!$OMP parallel do default(none) shared(buffer,npack,group,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation,n,k) - do nk = 1, npack*ksize - n = (nk-1)/ksize + 1 - k = mod((nk-1), ksize) + 1 - buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos - pos = buffer_pos + (k-1)*group%pack_size(n) - is = group%pack_is(n); ie = group%pack_ie(n) - js = group%pack_js(n); je = group%pack_je(n) - rotation = group%pack_rotation(n) + nijk = ni*nj*nk if( group%pack_type(n) == FIELD_S ) then - select case( rotation ) + select case( group%pack_rotation(n) ) case(ZERO) - do l=1, group%nscalar ! loop over number of fields - do j = js, je - do i = is, ie - pos = pos + 1 - !buffer(pos) = field(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_S, l, i, j, k) - end do - end do + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call pack_inner_loop (buffer(pos:), group, FIELD_S, n, l, 0) enddo case( MINUS_NINETY ) - do l=1,group%nscalar ! loop over number of fields - do i = is, ie - do j = je, js, -1 - pos = pos + 1 - !buffer(pos) = field(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_S, l, i, j, k) - end do - end do + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call pack_inner_loop (buffer(pos:), group, FIELD_S, n, l, SWAP_IJ + REVERSE_J) end do case( NINETY ) - do l=1,group%nscalar ! loop over number of fields - do i = ie, is, -1 - do j = js, je - pos = pos + 1 - !buffer(pos) = field(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_S, l, i, j, k) - end do - end do + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call pack_inner_loop (buffer(pos:), group, FIELD_S, n, l, SWAP_IJ + REVERSE_I) end do case( ONE_HUNDRED_EIGHTY ) - do l=1,group%nscalar ! loop over number of fields - do j = je, js, -1 - do i = ie, is, -1 - pos = pos + 1 - !buffer(pos) = field(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_S, l, i, j, k) - end do - end do + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call pack_inner_loop (buffer(pos:), group, FIELD_S, n, l, REVERSE_I + REVERSE_J) end do end select else if( group%pack_type(n) == FIELD_X ) then - select case( rotation ) + select case( group%pack_rotation(n) ) case(ZERO) - do l=1, nvector ! loop over number of fields - do j = js, je - do i = is, ie - pos = pos + 1 - !buffer(pos) = fieldx(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_X, l, i, j, k) - end do - end do + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call pack_inner_loop (buffer(pos:), group, FIELD_X, n, l, 0) end do case( MINUS_NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then - do l=1,nvector ! loop over number of fields - do i = is, ie - do j = je, js, -1 - pos = pos + 1 - !buffer(pos) = fieldy(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_Y, l, i, j, k) - end do - end do + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call pack_inner_loop (buffer(pos:), group, FIELD_Y, n, l, SWAP_IJ + REVERSE_J) end do else - do l=1,nvector ! loop over number of fields - do i = is, ie - do j = je, js, -1 - pos = pos + 1 - !buffer(pos) = -fieldy(i,j,k) - buffer(pos) = - GET_VALUE_ (group, FIELD_Y, l, i, j, k) - end do - end do + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call pack_inner_loop (buffer(pos:), group, FIELD_Y, n, l, SWAP_IJ + REVERSE_J + MINUS) end do end if case( NINETY ) - do l=1, nvector ! loop over number of fields - do i = ie, is, -1 - do j = js, je - pos = pos + 1 - !buffer(pos) = fieldy(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_Y, l, i, j, k) - end do - end do + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call pack_inner_loop (buffer(pos:), group, FIELD_Y, n, l, SWAP_IJ + REVERSE_I) end do case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then - do l=1,nvector ! loop over number of fields - do j = je, js, -1 - do i = ie, is, -1 - pos = pos + 1 - !buffer(pos) = fieldx(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_X, l, i, j, k) - end do - end do + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call pack_inner_loop (buffer(pos:), group, FIELD_X, n, l, REVERSE_I + REVERSE_J) end do else - do l=1,nvector ! loop over number of fields - do j = je, js, -1 - do i = ie, is, -1 - pos = pos + 1 - !buffer(pos) = -fieldx(i,j,k) - buffer(pos) = - GET_VALUE_ (group, FIELD_X, l, i, j, k) - end do - end do + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call pack_inner_loop (buffer(pos:), group, FIELD_X, n, l, REVERSE_I + REVERSE_J + MINUS) end do end if - end select ! select case( rotation(n) ) + end select else if( group%pack_type(n) == FIELD_Y ) then - select case( rotation ) + select case( group%pack_rotation(n) ) case(ZERO) - do l=1, nvector ! loop over number of fields - do j = js, je - do i = is, ie - pos = pos + 1 - !buffer(pos) = fieldy(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_Y, l, i, j, k) - end do - end do + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call pack_inner_loop (buffer(pos:), group, FIELD_Y, n, l, 0) end do case( MINUS_NINETY ) - do l=1,nvector ! loop over number of fields - do i = is, ie - do j = je, js, -1 - pos = pos + 1 - !buffer(pos) = fieldx(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_X, l, i, j, k) - end do - end do + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call pack_inner_loop (buffer(pos:), group, FIELD_X, n, l, SWAP_IJ + REVERSE_J) end do case( NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then - do l=1, nvector ! loop over number of fields - do i = ie, is, -1 - do j = js, je - pos = pos + 1 - !buffer(pos) = fieldx(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_X, l, i, j, k) - end do - end do + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call pack_inner_loop (buffer(pos:), group, FIELD_X, n, l, SWAP_IJ + REVERSE_I) end do else - do l=1,nvector ! loop over number of fields - do i = ie, is, -1 - do j = js, je - pos = pos + 1 - !buffer(pos) = -fieldx(i,j,k) - buffer(pos) = - GET_VALUE_ (group, FIELD_X, l, i, j, k) - end do - end do + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call pack_inner_loop (buffer(pos:), group, FIELD_X, n, l, SWAP_IJ + REVERSE_I + MINUS) end do end if case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then - do l=1,nvector ! loop over number of fields - do j = je, js, -1 - do i = ie, is, -1 - pos = pos + 1 - !buffer(pos) = fieldy(i,j,k) - buffer(pos) = GET_VALUE_ (group, FIELD_Y, l, i, j, k) - end do - end do + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call pack_inner_loop (buffer(pos:), group, FIELD_Y, n, l, REVERSE_I + REVERSE_J) end do else - do l=1,nvector ! loop over number of fields - do j = je, js, -1 - do i = ie, is, -1 - pos = pos + 1 - !buffer(pos) = -fieldy(i,j,k) - buffer(pos) = - GET_VALUE_ (group, FIELD_Y, l, i, j, k) - end do - end do + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call pack_inner_loop (buffer(pos:), group, FIELD_Y, n, l, REVERSE_I + REVERSE_J + MINUS) end do end if - end select ! select case( rotation(n) ) + end select endif enddo -endif + +contains + + subroutine pack_inner_loop (buffer, group, field_type, ni, l, packing_flags) + MPP_TYPE_, intent(out) :: buffer(:) + type(mpp_group_update_type), intent(in), target :: group + integer, intent(in) :: field_type !< FIELD_S, FIELD_X, or FIELD_Y + integer, intent(in) :: ni + integer, intent(in) :: l !< Index of the field within group + integer, intent(in) :: packing_flags !< Any combination of: SWAP_IJ, REVERSE_I, REVERSE_J, MINUS + + integer, allocatable, dimension(:) :: lb, ub, n, stride + MPP_TYPE_, pointer :: field2d(:,:) + MPP_TYPE_, pointer :: field3d(:,:,:) + MPP_TYPE_, pointer :: field4d(:,:,:,:) + + integer :: ix, iy, iz1, iz2 !< Indices of the domain-decomposed dimensions + integer :: nd !< Number of dimensions + integer :: idx !< Index into the 1D buffer + integer :: sgn !< Factor to multiply the filled values by + integer :: i1, i2, i3, i4, i(4) + integer :: ix2, iy2 + + select case (field_type) + case (FIELD_S) + ix = group%ix_s + iy = group%iy_s + + nd = size(group%shape_s) + allocate (lb(nd), ub(nd), n(nd), stride(nd)) + + lb = 1 + ub = group%shape_s + + select case(nd) + case (2) + call c_f_pointer(group%addrs_s(l), field2d, group%shape_s) + case (3) + call c_f_pointer(group%addrs_s(l), field3d, group%shape_s) + case (4) + call c_f_pointer(group%addrs_s(l), field4d, group%shape_s) + end select + case (FIELD_X) + ix = group%ix_v + iy = group%iy_v + + nd = size(group%shape_x) + allocate (lb(nd), ub(nd), n(nd), stride(nd)) + + lb = 1 + ub = group%shape_x + + select case(nd) + case (2) + call c_f_pointer(group%addrs_x(l), field2d, group%shape_x) + case (3) + call c_f_pointer(group%addrs_x(l), field3d, group%shape_x) + case (4) + call c_f_pointer(group%addrs_x(l), field4d, group%shape_x) + end select + case (FIELD_Y) + ix = group%ix_v + iy = group%iy_v + + nd = size(group%shape_y) + allocate (lb(nd), ub(nd), n(nd), stride(nd)) + + lb = 1 + ub = group%shape_y + + select case(nd) + case (2) + call c_f_pointer(group%addrs_y(l), field2d, group%shape_y) + case (3) + call c_f_pointer(group%addrs_y(l), field3d, group%shape_y) + case (4) + call c_f_pointer(group%addrs_y(l), field4d, group%shape_y) + end select + end select + + lb(ix) = group%pack_is(ni) + lb(iy) = group%pack_js(ni) + + ub(ix) = group%pack_ie(ni) + ub(iy) = group%pack_je(ni) + + n = ub - lb + 1 + + stride = 1 + + if (iand(packing_flags, REVERSE_I).ne.0) then + lb(ix) = group%pack_ie(ni) + ub(ix) = group%pack_is(ni) + stride(ix) = -1 + endif + + if (iand(packing_flags, REVERSE_J).ne.0) then + lb(iy) = group%pack_je(ni) + ub(iy) = group%pack_js(ni) + stride(iy) = -1 + endif + + if (iand(packing_flags, SWAP_IJ).eq.0) then + ix2 = ix + iy2 = iy + else + ix2 = iy + iy2 = ix + endif + + sgn = 1 + if (iand(packing_flags, MINUS).ne.0) sgn = -1 + + select case (nd) + case (2) + do concurrent (i1=lb(1):ub(1):stride(1), i2=lb(2):ub(2):stride(2)) local(i, idx) + i(1:2) = [i1, i2] + idx = n(ix2)*stride(iy2)*(i(iy2)-lb(iy2)) + & + stride(ix2)*(i(ix2)-lb(ix2)) + 1 + buffer(idx) = sgn * field2d(i1,i2) + end do + case (3) + iz1 = 1 + 2 + 3 - ix - iy + do concurrent (i1=lb(1):ub(1):stride(1), i2=lb(2):ub(2):stride(2), & + i3=lb(3):ub(3):stride(3)) local(i, idx) + i(1:3) = [i1, i2, i3] + idx = n(ix2)*( n(iy2)*(i(iz1)-1) + stride(iy2)*(i(iy2)-lb(iy2))) + & + stride(ix2)*(i(ix2)-lb(ix2)) + 1 + buffer(idx) = sgn * field3d(i1,i2,i3) + end do + case (4) + iz1 = trailz(sum(2**[1,2,3,4]) - 2**ix - 2**iy) + iz2 = 1 + 2 + 3 + 4 - ix - iy - iz1 + do concurrent (i1=lb(1):ub(1):stride(1), i2=lb(2):ub(2):stride(2), & + i3=lb(3):ub(3):stride(3), i4=lb(4):ub(4):stride(4)) local(i, idx) + i(1:4) = [i1, i2, i3, i4] + idx = n(ix2)*( n(iy2)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + stride(iy2)*(i(iy2)-lb(iy2))) + & + stride(ix2)*(i(ix2)-lb(ix2)) + 1 + buffer(idx) = sgn * field4d(i1,i2,i3,i4) + end do + end select + end subroutine pack_inner_loop +end subroutine GROUP_UPDATE_PACK_ diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 06c8a36bed..24d946273e 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -16,92 +16,138 @@ !* governing permissions and limitations under the License. !*********************************************************************** -if( group%k_loop_inside ) then -!$OMP parallel do default(none) shared(buffer,nunpack,group,nscalar,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation,n,k ) - do n = nunpack, 1, -1 +subroutine GROUP_UPDATE_UNPACK_ (group, buffer, buffer_start_pos) + type(mpp_group_update_type), intent(in) :: group + MPP_TYPE_, intent(in) :: buffer(:) + integer, intent(in) :: buffer_start_pos + integer :: n, buffer_pos, pos, ltop, ni, nj, nk, nijk, l + + do n=group%nunpack,1,-1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos - pos = buffer_pos - is = group%unpack_is(n); ie = group%unpack_ie(n) - js = group%unpack_js(n); je = group%unpack_je(n) + ni = group%unpack_ie(n) - group%unpack_is(n) + 1 + nj = group%unpack_je(n) - group%unpack_js(n) + 1 if( group%unpack_type(n) == FIELD_S ) then - do l=1,nscalar ! loop over number of fields - do k = 1, ksize - do j = js, je - do i = is, ie - pos = pos + 1 - !field(i,j,k) = buffer(pos) - call SET_VALUE_ (group, FIELD_S, l, i, j, k, buffer(pos)) - end do - end do - end do - end do - else if( group%unpack_type(n) == FIELD_X ) then - do l=1,nvector ! loop over number of fields - do k = 1, ksize - do j = js, je - do i = is, ie - pos = pos + 1 - !fieldx(i,j,k) = buffer(pos) - call SET_VALUE_ (group, FIELD_X, l, i, j, k, buffer(pos)) - end do - end do - end do - end do - else if( group%unpack_type(n) == FIELD_Y ) then - do l=1,nvector ! loop over number of fields - do k = 1, ksize - do j = js, je - do i = is, ie - pos = pos + 1 - !fieldy(i,j,k) = buffer(pos) - call SET_VALUE_ (group, FIELD_Y, l, i, j, k, buffer(pos)) - end do - end do - end do - end do + ltop = group%nscalar + nk = group%ksize_s + else + ltop = group%nvector + nk = group%ksize_v endif + nijk = ni*nj*nk + + do l=1,ltop + pos = buffer_pos + (l-1)*nijk + 1 + call unpack_inner_loop (buffer(pos:), group, group%unpack_type(n), n, l) + end do enddo -else -!$OMP parallel do default(none) shared(buffer,nunpack,group,nscalar,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation,n,k) - do nk = nunpack*ksize, 1, -1 - n = (nk-1)/ksize + 1 - k = mod((nk-1), ksize) + 1 - buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos - pos = buffer_pos + (k-1)*group%unpack_size(n) - is = group%unpack_is(n); ie = group%unpack_ie(n) - js = group%unpack_js(n); je = group%unpack_je(n) - if( group%unpack_type(n) == FIELD_S ) then - do l=1,nscalar ! loop over number of fields - do j = js, je - do i = is, ie - pos = pos + 1 - !field(i,j,k) = buffer(pos) - call SET_VALUE_ (group, FIELD_S, l, i, j, k, buffer(pos)) - end do - end do + +contains + + subroutine unpack_inner_loop (buf, group, field_type, ni, l) + MPP_TYPE_, intent(in) :: buf(:) + type(mpp_group_update_type), intent(in), target :: group + integer, intent(in) :: field_type !< FIELD_S, FIELD_X, or FIELD_Y + integer, intent(in) :: ni + integer, intent(in) :: l !< Index of the field within group + + integer, allocatable, dimension(:) :: lb, ub, n + MPP_TYPE_, pointer :: field2d(:,:) + MPP_TYPE_, pointer :: field3d(:,:,:) + MPP_TYPE_, pointer :: field4d(:,:,:,:) + + integer :: ix, iy, iz1, iz2 !< Indices of the domain-decomposed dimensions + integer :: nd !< Number of dimensions + integer :: idx !< Index into the 1D buffer + integer :: i1, i2, i3, i4, i(4) + + select case (field_type) + case (FIELD_S) + ix = group%ix_s + iy = group%iy_s + + nd = size(group%shape_s) + allocate (lb(nd), ub(nd), n(nd)) + + lb = 1 + ub = group%shape_s + + select case(nd) + case (2) + call c_f_pointer(group%addrs_s(l), field2d, group%shape_s) + case (3) + call c_f_pointer(group%addrs_s(l), field3d, group%shape_s) + case (4) + call c_f_pointer(group%addrs_s(l), field4d, group%shape_s) + end select + case (FIELD_X) + ix = group%ix_v + iy = group%iy_v + + nd = size(group%shape_x) + allocate (lb(nd), ub(nd), n(nd)) + + lb = 1 + ub = group%shape_x + + select case(nd) + case (2) + call c_f_pointer(group%addrs_x(l), field2d, group%shape_x) + case (3) + call c_f_pointer(group%addrs_x(l), field3d, group%shape_x) + case (4) + call c_f_pointer(group%addrs_x(l), field4d, group%shape_x) + end select + case (FIELD_Y) + ix = group%ix_v + iy = group%iy_v + + nd = size(group%shape_y) + allocate (lb(nd), ub(nd), n(nd)) + + lb = 1 + ub = group%shape_y + + select case(nd) + case (2) + call c_f_pointer(group%addrs_y(l), field2d, group%shape_y) + case (3) + call c_f_pointer(group%addrs_y(l), field3d, group%shape_y) + case (4) + call c_f_pointer(group%addrs_y(l), field4d, group%shape_y) + end select + end select + + lb(ix) = group%unpack_is(ni) + lb(iy) = group%unpack_js(ni) + + ub(ix) = group%unpack_ie(ni) + ub(iy) = group%unpack_je(ni) + + n = ub - lb + 1 + + select case (nd) + case (2) + do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2)) local(i, idx) + i(1:2) = [i1, i2] + idx = n(ix)*(i(iy)-lb(iy)) + (i(ix)-lb(ix)) + 1 + field2d(i1,i2) = buffer(idx) end do - else if( group%unpack_type(n) == FIELD_X ) then - do l=1,nvector ! loop over number of fields - do j = js, je - do i = is, ie - pos = pos + 1 - !fieldx(i,j,k) = buffer(pos) - call SET_VALUE_ (group, FIELD_X, l, i, j, k, buffer(pos)) - end do - end do + case (3) + iz1 = 1 + 2 + 3 - ix - iy + do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3)) local(i, idx) + i(1:3) = [i1, i2, i3] + idx = n(ix)*( n(iy)*(i(iz1)-1) + (i(iy)-lb(iy))) + (i(ix)-lb(ix)) + 1 + field3d(i1,i2,i3) = buffer(idx) end do - else if( group%unpack_type(n) == FIELD_Y ) then - do l=1,nvector ! loop over number of fields - do j = js, je - do i = is, ie - pos = pos + 1 - !fieldy(i,j,k) = buffer(pos) - call SET_VALUE_ (group, FIELD_Y, l, i, j, k, buffer(pos)) - end do - end do + case (4) + iz1 = trailz(sum(2**[1,2,3,4]) - 2**ix - 2**iy) + iz2 = 1 + 2 + 3 + 4 - ix - iy - iz1 + do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3), i4=lb(4):ub(4)) local(i, idx) + i(1:4) = [i1, i2, i3, i4] + idx = n(ix)*( n(iy)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + (i(iy)-lb(iy))) + (i(ix)-lb(ix)) + 1 + field4d(i1,i2,i3,i4) = buffer(idx) end do - endif - enddo -endif + end select + end subroutine unpack_inner_loop + +end subroutine GROUP_UPDATE_UNPACK_ diff --git a/mpp/include/mpp_domains_misc.inc b/mpp/include/mpp_domains_misc.inc index cf26a466b2..a9dcf70be3 100644 --- a/mpp/include/mpp_domains_misc.inc +++ b/mpp/include/mpp_domains_misc.inc @@ -1970,10 +1970,16 @@ end subroutine init_nonblock_type #define MPP_RESET_GROUP_UPDATE_FIELD_ mpp_reset_group_update_field_r8 #undef MPP_RESET_GROUP_UPDATE_FIELD_V_ #define MPP_RESET_GROUP_UPDATE_FIELD_V_ mpp_reset_group_update_field_r8_v -#undef GET_VALUE_ -#define GET_VALUE_ get_value_r8 -#undef SET_VALUE_ -#define SET_VALUE_ set_value_r8 +#undef GROUP_UPDATE_PACK_ +#define GROUP_UPDATE_PACK_ group_update_pack_r8 +#undef GROUP_UPDATE_UNPACK_ +#define GROUP_UPDATE_UNPACK_ group_update_unpack_r8 +#undef SET_ZERO_ +#define SET_ZERO_ set_zero_r8 +#undef NEGATE_HALO_ +#define NEGATE_HALO_ negate_halo_r8 +#undef FLIP_I_ +#define FLIP_I_ flip_i_r8 #include #undef MPP_TYPE_ @@ -1994,8 +2000,14 @@ end subroutine init_nonblock_type #define MPP_RESET_GROUP_UPDATE_FIELD_ mpp_reset_group_update_field_r4 #undef MPP_RESET_GROUP_UPDATE_FIELD_V_ #define MPP_RESET_GROUP_UPDATE_FIELD_V_ mpp_reset_group_update_field_r4_v -#undef GET_VALUE_ -#define GET_VALUE_ get_value_r4 -#undef SET_VALUE_ -#define SET_VALUE_ set_value_r4 +#undef GROUP_UPDATE_PACK_ +#define GROUP_UPDATE_PACK_ group_update_pack_r4 +#undef GROUP_UPDATE_UNPACK_ +#define GROUP_UPDATE_UNPACK_ group_update_unpack_r4 +#undef SET_ZERO_ +#define SET_ZERO_ set_zero_r4 +#undef NEGATE_HALO_ +#define NEGATE_HALO_ negate_halo_r4 +#undef FLIP_I_ +#define FLIP_I_ flip_i_r4 #include diff --git a/mpp/include/mpp_domains_util.inc b/mpp/include/mpp_domains_util.inc index 1e78a011dc..839a607fd5 100644 --- a/mpp/include/mpp_domains_util.inc +++ b/mpp/include/mpp_domains_util.inc @@ -1955,7 +1955,6 @@ end subroutine mpp_get_tile_compute_domains integer :: send_size(3*MAXOVERLAP) integer :: position_x, position_y, npack, nunpack, dir integer :: pack_buffer_pos, unpack_buffer_pos - integer :: omp_get_num_threads, nthreads character(len=8) :: text type(overlap_type), pointer :: overPtr => NULL() type(overlapSpec), pointer :: update_s => NULL() @@ -2030,16 +2029,6 @@ end subroutine mpp_get_tile_compute_domains call mpp_error(FATAL, "set_group_update: nscalar and nvector are all 0") endif - nthreads = 1 -!$OMP PARALLEL -!$ nthreads = omp_get_num_threads() -!$OMP END PARALLEL - if( nthreads > nthread_control_loop ) then - group%k_loop_inside = .FALSE. - else - group%k_loop_inside = .TRUE. - endif - if(nvector > 0) then !--- This check could not be done because of memory domain ! if( group%isize_x .NE. (group%ie_x-group%is_x+1) .OR. group%jsize_x .NE. (group%je_x-group%js_x+1)) & diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh index c84325ac6c..abc4dca866 100644 --- a/mpp/include/mpp_group_update.fh +++ b/mpp/include/mpp_group_update.fh @@ -372,13 +372,12 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) integer :: msgsize integer :: from_pe, to_pe, buffer_pos, pos integer :: ksize, is, ie, js, je - integer :: n, l, m, i, j, k, buffer_start_pos, nk + integer :: n, l, m, i, j, k, nk integer :: shift, gridtype, midpoint - integer :: npack, nunpack, rotation, isd + integer :: rotation, isd type(c_ptr) :: stack_cptr !< Workaround for GFortran bug MPP_TYPE_, pointer :: buffer(:) - MPP_TYPE_, parameter :: zero_ = 0. nscalar = group%nscalar nvector = group%nvector @@ -427,12 +426,10 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) call mpp_clock_end(group_recv_clock) flags_v = group%flags_v - npack = group%npack call mpp_clock_begin(group_pack_clock) !pack the data - buffer_start_pos = 0 -#include + call GROUP_UPDATE_PACK_ (group, buffer, 0) call mpp_clock_end(group_pack_clock) call mpp_clock_begin(group_send_clock) @@ -453,9 +450,8 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) endif !---unpack the buffer - nunpack = group%nunpack call mpp_clock_begin(group_unpk_clock) -#include + call GROUP_UPDATE_UNPACK_ (group, buffer, 0) call mpp_clock_end(group_unpk_clock) ! ---northern boundary fold @@ -472,12 +468,8 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) if( .NOT. domain%symmetry ) is = is - 1 do i = is ,ie, midpoint if( domain%x(1)%domain_data%begin.LE.i .AND. i.LE. domain%x(1)%domain_data%end+shift )then - do l=1,nvector - do k = 1,ksize - call SET_VALUE_ (group, FIELD_X, l, i, j, k, zero_) - call SET_VALUE_ (group, FIELD_Y, l, i, j, k, zero_) - end do - end do + call SET_ZERO_ (group, FIELD_X, i, j) + call SET_ZERO_ (group, FIELD_Y, i, j) end if end do endif @@ -497,16 +489,9 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) if( 2*is-domain%x(1)%domain_data%begin.GT.domain%x(1)%domain_data%end+shift ) & call mpp_error( FATAL, 'MPP_DO_UPDATE_V: folded-north BGRID_NE west edge ubound error.' ) - do l=1,nvector - do k = 1,ksize - do i = domain%x(1)%domain_data%begin,is-1 - !fieldx(i,j,k) = fieldx(2*is-i,j,k) - !fieldy(i,j,k) = fieldy(2*is-i,j,k) - call SET_VALUE_ (group, FIELD_X, l, i, j, k, GET_VALUE_ (group, FIELD_X, l, 2*is-i, j, k)) - call SET_VALUE_ (group, FIELD_Y, l, i, j, k, GET_VALUE_ (group, FIELD_Y, l, 2*is-i, j, k)) - end do - end do - end do + + call FLIP_I_ (group, FIELD_X, j, domain%x(1)%domain_data%begin, is-1, 2*is) + call FLIP_I_ (group, FIELD_Y, j, domain%x(1)%domain_data%begin, is-1, 2*is) end if case(CGRID_NE) is = domain%x(1)%global%begin @@ -514,14 +499,8 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) if( is.GT.isd )then if( 2*is-domain%x(1)%domain_data%begin-1.GT.domain%x(1)%domain_data%end ) & call mpp_error( FATAL, 'MPP_DO_UPDATE_V: folded-north CGRID_NE west edge ubound error.' ) - do l=1,nvector - do k = 1,ksize - do i = isd,is-1 - !fieldy(i,j,k) = fieldy(2*is-i-1,j,k) - call SET_VALUE_ (group, FIELD_Y, l, i, j, k, GET_VALUE_ (group, FIELD_Y, l, 2*is-i-1, j, k)) - end do - end do - end do + + call FLIP_I_ (group, FIELD_Y, j, isd, is-1, 2*is-1) end if end select end if @@ -534,25 +513,10 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) case(BGRID_NE) is = is + shift ie = ie + shift - do l=1,nvector - do k = 1,ksize - do i = is,ie - !fieldx(i,j,k) = -fieldx(i,j,k) - !fieldy(i,j,k) = -fieldy(i,j,k) - call SET_VALUE_ (group, FIELD_X, l, i, j, k, - GET_VALUE_ (group, FIELD_X, l, i, j, k)) - call SET_VALUE_ (group, FIELD_Y, l, i, j, k, - GET_VALUE_ (group, FIELD_Y, l, i, j, k)) - end do - end do - end do + call NEGATE_HALO_ (group, FIELD_X, j, is, ie) + call NEGATE_HALO_ (group, FIELD_Y, j, is, ie) case(CGRID_NE) - do l=1,nvector - do k = 1,ksize - do i = is, ie - !fieldy(i,j,k) = -fieldy(i,j,k) - call SET_VALUE_ (group, FIELD_Y, l, i, j, k, - GET_VALUE_ (group, FIELD_Y, l, i, j, k)) - end do - end do - end do + call NEGATE_HALO_ (group, FIELD_Y, j, is, ie) end select end if end if @@ -578,10 +542,10 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer) integer :: nscalar, nvector integer :: nsend, nrecv, flags_v - integer :: msgsize, npack, rotation + integer :: msgsize, rotation integer :: from_pe, to_pe, buffer_pos, pos integer :: ksize, is, ie, js, je - integer :: n, l, m, i, j, k, buffer_start_pos, nk + integer :: n, l, m, i, j, k, nk logical :: reuse_buf_pos character(len=8) :: text type(c_ptr) :: stack_cptr !< Workaround for GFortran bug @@ -652,9 +616,7 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer) !pack the data call mpp_clock_begin(nonblock_group_pack_clock) - npack = group%npack - buffer_start_pos = group%buffer_start_pos -#include + call GROUP_UPDATE_PACK_ (group, buffer, group%buffer_start_pos) call mpp_clock_end(nonblock_group_pack_clock) call mpp_clock_begin(nonblock_group_send_clock) @@ -677,15 +639,14 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type) MPP_TYPE_, intent(in) :: d_type integer :: nsend, nrecv, nscalar, nvector - integer :: k, buffer_pos, pos, m, n, l + integer :: k, pos, m, n, l integer :: is, ie, js, je, ksize, i, j integer :: shift, gridtype, midpoint, flags_v - integer :: nunpack, rotation, buffer_start_pos, nk, isd + integer :: rotation, nk, isd logical :: recv_y(8) type(c_ptr) :: stack_cptr !< Workaround for GFortran bug MPP_TYPE_, pointer :: buffer(:) - MPP_TYPE_, parameter :: zero_ = 0. gridtype = group%gridtype flags_v = group%flags_v @@ -716,11 +677,9 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type) endif !---unpack the buffer - nunpack = group%nunpack call mpp_clock_begin(nonblock_group_unpk_clock) - buffer_start_pos = group%buffer_start_pos -#include + call GROUP_UPDATE_UNPACK_ (group, buffer, group%buffer_start_pos) call mpp_clock_end(nonblock_group_unpk_clock) ! ---northern boundary fold @@ -737,12 +696,8 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type) if( .NOT. domain%symmetry ) is = is - 1 do i = is ,ie, midpoint if( domain%x(1)%domain_data%begin.LE.i .AND. i.LE. domain%x(1)%domain_data%end+shift )then - do l=1,nvector - do k = 1,ksize - call SET_VALUE_ (group, FIELD_X, l, i, j, k, zero_) - call SET_VALUE_ (group, FIELD_Y, l, i, j, k, zero_) - end do - end do + call SET_ZERO_ (group, FIELD_X, i, j) + call SET_ZERO_ (group, FIELD_Y, i, j) end if end do endif @@ -762,16 +717,8 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type) if( 2*is-domain%x(1)%domain_data%begin.GT.domain%x(1)%domain_data%end+shift ) & call mpp_error( FATAL, 'MPP_DO_UPDATE_V: folded-north BGRID_NE west edge ubound error.' ) - do l=1,nvector - do k = 1,ksize - do i = domain%x(1)%domain_data%begin,is-1 - !fieldx(i,j,k) = fieldx(2*is-i,j,k) - !fieldy(i,j,k) = fieldy(2*is-i,j,k) - call SET_VALUE_ (group, FIELD_X, l, i, j, k, GET_VALUE_ (group, FIELD_X, l, 2*is-i, j, k)) - call SET_VALUE_ (group, FIELD_Y, l, i, j, k, GET_VALUE_ (group, FIELD_Y, l, 2*is-i, j, k)) - end do - end do - end do + call FLIP_I_ (group, FIELD_X, j, domain%x(1)%domain_data%begin, is-1, 2*is) + call FLIP_I_ (group, FIELD_Y, j, domain%x(1)%domain_data%begin, is-1, 2*is) end if case(CGRID_NE) is = domain%x(1)%global%begin @@ -779,14 +726,7 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type) if( is.GT.isd)then if( 2*is-domain%x(1)%domain_data%begin-1.GT.domain%x(1)%domain_data%end ) & call mpp_error( FATAL, 'MPP_DO_UPDATE_V: folded-north CGRID_NE west edge ubound error.' ) - do l=1,nvector - do k = 1,ksize - do i = isd,is-1 - !fieldy(i,j,k) = fieldy(2*is-i-1,j,k) - call SET_VALUE_ (group, FIELD_Y, l, i, j, k, GET_VALUE_ (group, FIELD_Y, l, 2*is-i-1, j, k)) - end do - end do - end do + call FLIP_I_ (group, FIELD_Y, j, isd, is-1, 2*is-1) end if end select end if @@ -799,25 +739,10 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type) case(BGRID_NE) is = is + shift ie = ie + shift - do l=1,nvector - do k = 1,ksize - do i = is,ie - !fieldx(i,j,k) = -fieldx(i,j,k) - !fieldy(i,j,k) = -fieldy(i,j,k) - call SET_VALUE_ (group, FIELD_X, l, i, j, k, - GET_VALUE_ (group, FIELD_X, l, i, j, k)) - call SET_VALUE_ (group, FIELD_Y, l, i, j, k, - GET_VALUE_ (group, FIELD_Y, l, i, j, k)) - end do - end do - end do + call NEGATE_HALO_ (group, FIELD_X, j, is, ie) + call NEGATE_HALO_ (group, FIELD_Y, j, is, ie) case(CGRID_NE) - do l=1,nvector - do k = 1,ksize - do i = is, ie - !fieldy(i,j,k) = -fieldy(i,j,k) - call SET_VALUE_ (group, FIELD_Y, l, i, j, k, - GET_VALUE_ (group, FIELD_Y, l, i, j, k)) - end do - end do - end do + call NEGATE_HALO_ (group, FIELD_Y, j, is, ie) end select end if end if @@ -890,224 +815,230 @@ subroutine MPP_RESET_GROUP_UPDATE_FIELD_V_(group, fieldx, fieldy) group%addrs_y(group%reset_index_v) = c_loc(fieldy) end subroutine MPP_RESET_GROUP_UPDATE_FIELD_V_ -function GET_VALUE_ (group, field_type, l, i, j, k) result(res) +subroutine SET_ZERO_ (group, field_type, i0, j0) type(mpp_group_update_type), intent(in), target :: group integer, intent(in) :: field_type !< FIELD_S, FIELD_X, or FIELD_Y - integer, intent(in) :: l !< Index of the field within group - integer, intent(in) :: i, j, k !< i,j,k indices of the value to retrieve - MPP_TYPE_ :: res + integer, intent(in) :: i0, j0 !< i,j point to initialize to zero - integer, allocatable, dimension(:) :: lb, n, indx + integer, allocatable, dimension(:) :: lb, ub MPP_TYPE_, pointer :: field2d(:,:) MPP_TYPE_, pointer :: field3d(:,:,:) MPP_TYPE_, pointer :: field4d(:,:,:,:) + MPP_TYPE_, parameter :: zero_ = 0. + integer :: nd !< Number of dimensions integer :: ix, iy !< Indices of the domain-decomposed dimensions - integer :: is, ie, js, je !< Starting and ending indices of the x and y dimensions - integer :: nd, m, kp, d - - select case (field_type) - case (FIELD_S) - ix = group%ix_s - iy = group%iy_s - is = group%is_s - ie = group%ie_s - js = group%js_s - je = group%je_s - - nd = size(group%shape_s) - allocate (lb(nd), n(nd), indx(nd)) - - lb = 1 - n = group%shape_s - - select case(nd) - case (2) - call c_f_pointer(group%addrs_s(l), field2d, group%shape_s) - case (3) - call c_f_pointer(group%addrs_s(l), field3d, group%shape_s) - case (4) - call c_f_pointer(group%addrs_s(l), field4d, group%shape_s) - end select - case (FIELD_X) - ix = group%ix_v - iy = group%iy_v - is = group%is_x - ie = group%ie_x - js = group%js_x - je = group%je_x - - nd = size(group%shape_x) - allocate (lb(nd), n(nd), indx(nd)) - - lb = 1 - n = group%shape_x - - select case(nd) - case (2) - call c_f_pointer(group%addrs_x(l), field2d, group%shape_x) - case (3) - call c_f_pointer(group%addrs_x(l), field3d, group%shape_x) - case (4) - call c_f_pointer(group%addrs_x(l), field4d, group%shape_x) - end select - case (FIELD_Y) - ix = group%ix_v - iy = group%iy_v - is = group%is_y - ie = group%ie_y - js = group%js_y - je = group%je_y - - nd = size(group%shape_y) - allocate (lb(nd), n(nd), indx(nd)) - - lb = 1 - n = group%shape_y - - select case(nd) - case (2) - call c_f_pointer(group%addrs_y(l), field2d, group%shape_y) - case (3) - call c_f_pointer(group%addrs_y(l), field3d, group%shape_y) - case (4) - call c_f_pointer(group%addrs_y(l), field4d, group%shape_y) - end select - end select - - lb(ix) = i - is + 1 - lb(iy) = j - js + 1 - - n(ix) = 1 - n(iy) = 1 - - kp = k - 1 - do d=nd,1,-1 - m = product(n(1:d-1)) ! Product of all lower dimension sizes - indx(d) = kp / m - kp = kp - indx(d)*m + integer :: l, i1, i2, i3, i4 + + ix = group%ix_v + iy = group%iy_v + + nd = size(group%shape_x) ! Assume that dimensions of x and y fields must agree + allocate (lb(nd), ub(nd)) + + do l=1,group%nvector + select case (field_type) + case (FIELD_X) + lb = 1 + ub = group%shape_x + + select case(nd) + case (2) + call c_f_pointer(group%addrs_x(l), field2d, group%shape_x) + case (3) + call c_f_pointer(group%addrs_x(l), field3d, group%shape_x) + case (4) + call c_f_pointer(group%addrs_x(l), field4d, group%shape_x) + end select + case (FIELD_Y) + lb = 1 + ub = group%shape_y + + select case(nd) + case (2) + call c_f_pointer(group%addrs_y(l), field2d, group%shape_y) + case (3) + call c_f_pointer(group%addrs_y(l), field3d, group%shape_y) + case (4) + call c_f_pointer(group%addrs_y(l), field4d, group%shape_y) + end select + end select + + lb(ix) = i0 + ub(ix) = i0 + + lb(iy) = j0 + ub(iy) = j0 + + select case(nd) + case (2) + do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2)) + field2d(i1, i2) = zero_ + end do + case (3) + do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3)) + field3d(i1, i2, i3) = zero_ + end do + case (4) + do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3), i4=lb(4):ub(4)) + field4d(i1, i2, i3, i4) = zero_ + end do + end select enddo +end subroutine SET_ZERO_ - indx = lb + indx +subroutine NEGATE_HALO_ (group, field_type, j0, is, ie) + type(mpp_group_update_type), intent(in), target :: group + integer, intent(in) :: field_type !< FIELD_S, FIELD_X, or FIELD_Y + integer, intent(in) :: j0 !< j point to negate + integer, intent(in) :: is, ie !< range of i indices to negate - select case (nd) - case (2) - res = field2d(indx(1), indx(2)) - case (3) - res = field3d(indx(1), indx(2), indx(3)) - case (4) - res = field4d(indx(1), indx(2), indx(3), indx(4)) - end select -end function GET_VALUE_ + integer, allocatable, dimension(:) :: lb, ub + MPP_TYPE_, pointer :: field2d(:,:) + MPP_TYPE_, pointer :: field3d(:,:,:) + MPP_TYPE_, pointer :: field4d(:,:,:,:) + + integer :: nd !< Number of dimensions + integer :: ix, iy !< Indices of the domain-decomposed dimensions + integer :: l, i1, i2, i3, i4 + + ix = group%ix_v + iy = group%iy_v + + nd = size(group%shape_x) ! Assume that dimensions of x and y fields must agree + allocate (lb(nd), ub(nd)) + + do l=1,group%nvector + select case (field_type) + case (FIELD_X) + lb = 1 + ub = group%shape_x + + select case(nd) + case (2) + call c_f_pointer(group%addrs_x(l), field2d, group%shape_x) + case (3) + call c_f_pointer(group%addrs_x(l), field3d, group%shape_x) + case (4) + call c_f_pointer(group%addrs_x(l), field4d, group%shape_x) + end select + case (FIELD_Y) + lb = 1 + ub = group%shape_y + + select case(nd) + case (2) + call c_f_pointer(group%addrs_y(l), field2d, group%shape_y) + case (3) + call c_f_pointer(group%addrs_y(l), field3d, group%shape_y) + case (4) + call c_f_pointer(group%addrs_y(l), field4d, group%shape_y) + end select + end select + + lb(ix) = is + ub(ix) = ie + + lb(iy) = j0 + ub(iy) = j0 + + select case(nd) + case (2) + do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2)) + field2d(i1,i2) = -field2d(i1,i2) + end do + case (3) + do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3)) + field3d(i1,i2,i3) = -field3d(i1,i2,i3) + end do + case (4) + do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3), i4=lb(4):ub(4)) + field4d(i1,i2,i3,i4) = -field4d(i1,i2,i3,i4) + end do + end select + enddo +end subroutine NEGATE_HALO_ -subroutine SET_VALUE_ (group, field_type, l, i, j, k, x) +subroutine FLIP_I_ (group, field_type, j0, is, ie, offset) type(mpp_group_update_type), intent(in), target :: group integer, intent(in) :: field_type !< FIELD_S, FIELD_X, or FIELD_Y - integer, intent(in) :: l !< Index of the field within group - integer, intent(in) :: i, j, k !< i,j,k indices of the value to retrieve - MPP_TYPE_, intent(in) :: x + integer, intent(in) :: j0 !< j point to negate + integer, intent(in) :: is, ie !< range of i indices to negate + integer, intent(in) :: offset !< Offset when flipping the i axis - integer, allocatable, dimension(:) :: lb, n, indx + integer, allocatable, dimension(:) :: lb, ub, src MPP_TYPE_, pointer :: field2d(:,:) MPP_TYPE_, pointer :: field3d(:,:,:) MPP_TYPE_, pointer :: field4d(:,:,:,:) + integer :: nd !< Number of dimensions integer :: ix, iy !< Indices of the domain-decomposed dimensions - integer :: is, ie, js, je !< Starting and ending indices of the x and y dimensions - integer :: nd, m, kp, d - - select case (field_type) - case (FIELD_S) - ix = group%ix_s - iy = group%iy_s - is = group%is_s - ie = group%ie_s - js = group%js_s - je = group%je_s - - nd = size(group%shape_s) - allocate (lb(nd), n(nd), indx(nd)) - - lb = 1 - n = group%shape_s - - select case(nd) - case (2) - call c_f_pointer(group%addrs_s(l), field2d, group%shape_s) - case (3) - call c_f_pointer(group%addrs_s(l), field3d, group%shape_s) - case (4) - call c_f_pointer(group%addrs_s(l), field4d, group%shape_s) - end select - case (FIELD_X) - ix = group%ix_v - iy = group%iy_v - is = group%is_x - ie = group%ie_x - js = group%js_x - je = group%je_x - - nd = size(group%shape_x) - allocate (lb(nd), n(nd), indx(nd)) - - lb = 1 - n = group%shape_x - - select case(nd) - case (2) - call c_f_pointer(group%addrs_x(l), field2d, group%shape_x) - case (3) - call c_f_pointer(group%addrs_x(l), field3d, group%shape_x) - case (4) - call c_f_pointer(group%addrs_x(l), field4d, group%shape_x) - end select - case (FIELD_Y) - ix = group%ix_v - iy = group%iy_v - is = group%is_y - ie = group%ie_y - js = group%js_y - je = group%je_y - - nd = size(group%shape_y) - allocate (lb(nd), n(nd), indx(nd)) - - lb = 1 - n = group%shape_y - - select case(nd) - case (2) - call c_f_pointer(group%addrs_y(l), field2d, group%shape_y) - case (3) - call c_f_pointer(group%addrs_y(l), field3d, group%shape_y) - case (4) - call c_f_pointer(group%addrs_y(l), field4d, group%shape_y) - end select - end select - - lb(ix) = i - is + 1 - lb(iy) = j - js + 1 - - n(ix) = 1 - n(iy) = 1 - - kp = k - 1 - do d=nd,1,-1 - m = product(n(1:d-1)) ! Product of all lower dimension sizes - indx(d) = kp / m - kp = kp - indx(d)*m + integer :: l, i1, i2, i3, i4 + + ix = group%ix_v + iy = group%iy_v + + nd = size(group%shape_x) ! Assume that dimensions of x and y fields must agree + allocate (lb(nd), ub(nd), src(nd)) + + do l=1,group%nvector + select case (field_type) + case (FIELD_X) + lb = 1 + ub = group%shape_x + + select case(nd) + case (2) + call c_f_pointer(group%addrs_x(l), field2d, group%shape_x) + case (3) + call c_f_pointer(group%addrs_x(l), field3d, group%shape_x) + case (4) + call c_f_pointer(group%addrs_x(l), field4d, group%shape_x) + end select + case (FIELD_Y) + lb = 1 + ub = group%shape_y + + select case(nd) + case (2) + call c_f_pointer(group%addrs_y(l), field2d, group%shape_y) + case (3) + call c_f_pointer(group%addrs_y(l), field3d, group%shape_y) + case (4) + call c_f_pointer(group%addrs_y(l), field4d, group%shape_y) + end select + end select + + lb(ix) = is + ub(ix) = ie + + lb(iy) = j0 + ub(iy) = j0 + + select case(nd) + case (2) + do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2)) + src = [i1, i2] + src(ix) = offset - src(ix) + field2d(i1,i2) = field2d(src(1),src(2)) + end do + case (3) + do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3)) + src = [i1, i2, i3] + src(ix) = offset - src(ix) + field3d(i1,i2,i3) = field3d(src(1),src(2),src(3)) + end do + case (4) + do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3), i4=lb(4):ub(4)) + src = [i1, i2, i3, i4] + src(ix) = offset - src(ix) + field4d(i1,i2,i3,i4) = field4d(src(1),src(2),src(3),src(4)) + end do + end select enddo +end subroutine FLIP_I_ - indx = lb + indx - - select case (nd) - case (2) - field2d(indx(1), indx(2)) = x - case (3) - field3d(indx(1), indx(2), indx(3)) = x - case (4) - field4d(indx(1), indx(2), indx(3), indx(4)) = x - end select -end subroutine SET_VALUE_ +#include "group_update_pack.inc" +#include "group_update_unpack.inc" !> @} diff --git a/mpp/mpp_domains.F90 b/mpp/mpp_domains.F90 index d09e5f262b..bd2f10f129 100644 --- a/mpp/mpp_domains.F90 +++ b/mpp/mpp_domains.F90 @@ -578,7 +578,6 @@ module mpp_domains_mod type :: mpp_group_update_type private logical :: initialized = .FALSE. - logical :: k_loop_inside = .TRUE. logical :: nonsym_edge = .FALSE. integer :: nscalar = 0 integer :: nvector = 0 @@ -745,17 +744,11 @@ module mpp_domains_mod logical :: debug_message_passing = .false. !< Will check the consistency on the boundary between !! processor/tile when updating domain for symmetric domain and !! check the consistency on the north folded edge. - integer :: nthread_control_loop = 8 !< Determine the loop order for packing and unpacking. - !! When number of threads is greater than nthread_control_loop, - !! the k-loop will be moved outside and combined with number - !! of pack and unpack. When the number of threads is - !! less than or equal to nthread_control_loop, the k-loop - !! is moved inside, but still outside, of j,i loop. logical :: efp_sum_overflow_check = .false. !< If .true., always do overflow_check !! when doing EFP bitwise mpp_global_sum. logical :: use_alltoallw = .false. - namelist /mpp_domains_nml/ debug_update_domain, domain_clocks_on, debug_message_passing, nthread_control_loop, & - efp_sum_overflow_check, use_alltoallw + namelist /mpp_domains_nml/ debug_update_domain, domain_clocks_on, debug_message_passing, efp_sum_overflow_check, & + use_alltoallw !*********************************************************************** diff --git a/test_fms/mpp/test_mpp_domains2.sh b/test_fms/mpp/test_mpp_domains2.sh index a9b26f2cd3..d4ed7aaeec 100755 --- a/test_fms/mpp/test_mpp_domains2.sh +++ b/test_fms/mpp/test_mpp_domains2.sh @@ -27,9 +27,6 @@ # Set common test settings. . ../test-lib.sh -# TODO: Enable this test once generalized indices work is complete -SKIP_TESTS="test_mpp_domains2.12" - # TODO edge update, fails on non-blocking with gnu #SKIP_TESTS="$SKIP_TESTS $(basename $0 .sh).6" From 40c55c396f07a9f3f383f87fac29fc81027832c4 Mon Sep 17 00:00:00 2001 From: Jesse Lentz Date: Fri, 12 Jun 2026 14:01:29 -0400 Subject: [PATCH 2/5] Use locally scoped variables in do concurrent loops Use locally scoped variables within do concurrent loops rather than subroutine-scope variables. --- mpp/include/group_update_pack.inc | 46 ++++++++++++++++++----------- mpp/include/group_update_unpack.inc | 39 +++++++++++++++--------- mpp/include/mpp_group_update.fh | 34 ++++++++++++++------- 3 files changed, 76 insertions(+), 43 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 7134a72bc5..35201a96c1 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -152,9 +152,8 @@ contains integer :: ix, iy, iz1, iz2 !< Indices of the domain-decomposed dimensions integer :: nd !< Number of dimensions - integer :: idx !< Index into the 1D buffer integer :: sgn !< Factor to multiply the filled values by - integer :: i1, i2, i3, i4, i(4) + integer :: i1, i2, i3, i4 integer :: ix2, iy2 select case (field_type) @@ -249,30 +248,41 @@ contains select case (nd) case (2) - do concurrent (i1=lb(1):ub(1):stride(1), i2=lb(2):ub(2):stride(2)) local(i, idx) - i(1:2) = [i1, i2] - idx = n(ix2)*stride(iy2)*(i(iy2)-lb(iy2)) + & - stride(ix2)*(i(ix2)-lb(ix2)) + 1 - buffer(idx) = sgn * field2d(i1,i2) + do concurrent (i1=lb(1):ub(1):stride(1), i2=lb(2):ub(2):stride(2)) + block + integer :: i(2), idx + + i = [i1, i2] + idx = n(ix2)*stride(iy2)*(i(iy2)-lb(iy2)) + & + stride(ix2)*(i(ix2)-lb(ix2)) + 1 + buffer(idx) = sgn * field2d(i1,i2) + end block end do case (3) iz1 = 1 + 2 + 3 - ix - iy - do concurrent (i1=lb(1):ub(1):stride(1), i2=lb(2):ub(2):stride(2), & - i3=lb(3):ub(3):stride(3)) local(i, idx) - i(1:3) = [i1, i2, i3] - idx = n(ix2)*( n(iy2)*(i(iz1)-1) + stride(iy2)*(i(iy2)-lb(iy2))) + & - stride(ix2)*(i(ix2)-lb(ix2)) + 1 - buffer(idx) = sgn * field3d(i1,i2,i3) + do concurrent (i1=lb(1):ub(1):stride(1), i2=lb(2):ub(2):stride(2), i3=lb(3):ub(3):stride(3)) + block + integer :: i(3), idx + + i = [i1, i2, i3] + idx = n(ix2)*( n(iy2)*(i(iz1)-1) + stride(iy2)*(i(iy2)-lb(iy2))) + & + stride(ix2)*(i(ix2)-lb(ix2)) + 1 + buffer(idx) = sgn * field3d(i1,i2,i3) + end block end do case (4) iz1 = trailz(sum(2**[1,2,3,4]) - 2**ix - 2**iy) iz2 = 1 + 2 + 3 + 4 - ix - iy - iz1 do concurrent (i1=lb(1):ub(1):stride(1), i2=lb(2):ub(2):stride(2), & - i3=lb(3):ub(3):stride(3), i4=lb(4):ub(4):stride(4)) local(i, idx) - i(1:4) = [i1, i2, i3, i4] - idx = n(ix2)*( n(iy2)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + stride(iy2)*(i(iy2)-lb(iy2))) + & - stride(ix2)*(i(ix2)-lb(ix2)) + 1 - buffer(idx) = sgn * field4d(i1,i2,i3,i4) + i3=lb(3):ub(3):stride(3), i4=lb(4):ub(4):stride(4)) + block + integer :: i(4), idx + + i = [i1, i2, i3, i4] + idx = n(ix2)*( n(iy2)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + stride(iy2)*(i(iy2)-lb(iy2))) + & + stride(ix2)*(i(ix2)-lb(ix2)) + 1 + buffer(idx) = sgn * field4d(i1,i2,i3,i4) + end block end do end select end subroutine pack_inner_loop diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 24d946273e..48b24c708a 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -57,8 +57,7 @@ contains integer :: ix, iy, iz1, iz2 !< Indices of the domain-decomposed dimensions integer :: nd !< Number of dimensions - integer :: idx !< Index into the 1D buffer - integer :: i1, i2, i3, i4, i(4) + integer :: i1, i2, i3, i4 select case (field_type) case (FIELD_S) @@ -127,25 +126,37 @@ contains select case (nd) case (2) - do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2)) local(i, idx) - i(1:2) = [i1, i2] - idx = n(ix)*(i(iy)-lb(iy)) + (i(ix)-lb(ix)) + 1 - field2d(i1,i2) = buffer(idx) + do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2)) + block + integer :: i(2), idx + + i = [i1, i2] + idx = n(ix)*(i(iy)-lb(iy)) + (i(ix)-lb(ix)) + 1 + field2d(i1,i2) = buffer(idx) + end block end do case (3) iz1 = 1 + 2 + 3 - ix - iy - do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3)) local(i, idx) - i(1:3) = [i1, i2, i3] - idx = n(ix)*( n(iy)*(i(iz1)-1) + (i(iy)-lb(iy))) + (i(ix)-lb(ix)) + 1 - field3d(i1,i2,i3) = buffer(idx) + do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3)) + block + integer :: i(3), idx + + i = [i1, i2, i3] + idx = n(ix)*( n(iy)*(i(iz1)-1) + (i(iy)-lb(iy))) + (i(ix)-lb(ix)) + 1 + field3d(i1,i2,i3) = buffer(idx) + end block end do case (4) iz1 = trailz(sum(2**[1,2,3,4]) - 2**ix - 2**iy) iz2 = 1 + 2 + 3 + 4 - ix - iy - iz1 - do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3), i4=lb(4):ub(4)) local(i, idx) - i(1:4) = [i1, i2, i3, i4] - idx = n(ix)*( n(iy)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + (i(iy)-lb(iy))) + (i(ix)-lb(ix)) + 1 - field4d(i1,i2,i3,i4) = buffer(idx) + do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3), i4=lb(4):ub(4)) + block + integer :: i(4), idx + + i = [i1, i2, i3, i4] + idx = n(ix)*( n(iy)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + (i(iy)-lb(iy))) + (i(ix)-lb(ix)) + 1 + field4d(i1,i2,i3,i4) = buffer(idx) + end block end do end select end subroutine unpack_inner_loop diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh index abc4dca866..48657c951a 100644 --- a/mpp/include/mpp_group_update.fh +++ b/mpp/include/mpp_group_update.fh @@ -966,7 +966,7 @@ subroutine FLIP_I_ (group, field_type, j0, is, ie, offset) integer, intent(in) :: is, ie !< range of i indices to negate integer, intent(in) :: offset !< Offset when flipping the i axis - integer, allocatable, dimension(:) :: lb, ub, src + integer, allocatable, dimension(:) :: lb, ub MPP_TYPE_, pointer :: field2d(:,:) MPP_TYPE_, pointer :: field3d(:,:,:) MPP_TYPE_, pointer :: field4d(:,:,:,:) @@ -979,7 +979,7 @@ subroutine FLIP_I_ (group, field_type, j0, is, ie, offset) iy = group%iy_v nd = size(group%shape_x) ! Assume that dimensions of x and y fields must agree - allocate (lb(nd), ub(nd), src(nd)) + allocate (lb(nd), ub(nd)) do l=1,group%nvector select case (field_type) @@ -1018,21 +1018,33 @@ subroutine FLIP_I_ (group, field_type, j0, is, ie, offset) select case(nd) case (2) do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2)) - src = [i1, i2] - src(ix) = offset - src(ix) - field2d(i1,i2) = field2d(src(1),src(2)) + block + integer :: src(2) + + src = [i1, i2] + src(ix) = offset - src(ix) + field2d(i1,i2) = field2d(src(1),src(2)) + end block end do case (3) do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3)) - src = [i1, i2, i3] - src(ix) = offset - src(ix) - field3d(i1,i2,i3) = field3d(src(1),src(2),src(3)) + block + integer :: src(3) + + src = [i1, i2, i3] + src(ix) = offset - src(ix) + field3d(i1,i2,i3) = field3d(src(1),src(2),src(3)) + end block end do case (4) do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3), i4=lb(4):ub(4)) - src = [i1, i2, i3, i4] - src(ix) = offset - src(ix) - field4d(i1,i2,i3,i4) = field4d(src(1),src(2),src(3),src(4)) + block + integer :: src(4) + + src = [i1, i2, i3, i4] + src(ix) = offset - src(ix) + field4d(i1,i2,i3,i4) = field4d(src(1),src(2),src(3),src(4)) + end block end do end select enddo From 93f0dfd5aae8ce8bdb8d95d6019485e0600daa06 Mon Sep 17 00:00:00 2001 From: Jesse Lentz Date: Wed, 17 Jun 2026 11:02:35 -0400 Subject: [PATCH 3/5] Various corrections --- mpp/include/group_update_pack.inc | 45 +++++++++++++++------------ mpp/include/group_update_unpack.inc | 15 ++++++--- mpp/include/mpp_group_update.fh | 47 ++++++++++++++++++++--------- 3 files changed, 69 insertions(+), 38 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 35201a96c1..700d2c7c45 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -154,12 +154,15 @@ contains integer :: nd !< Number of dimensions integer :: sgn !< Factor to multiply the filled values by integer :: i1, i2, i3, i4 - integer :: ix2, iy2 + integer :: ij1, ij2 + integer :: is0, js0 select case (field_type) case (FIELD_S) ix = group%ix_s iy = group%iy_s + is0 = group%is_s + js0 = group%js_s nd = size(group%shape_s) allocate (lb(nd), ub(nd), n(nd), stride(nd)) @@ -178,6 +181,8 @@ contains case (FIELD_X) ix = group%ix_v iy = group%iy_v + is0 = group%is_x + js0 = group%js_x nd = size(group%shape_x) allocate (lb(nd), ub(nd), n(nd), stride(nd)) @@ -196,6 +201,8 @@ contains case (FIELD_Y) ix = group%ix_v iy = group%iy_v + is0 = group%is_y + js0 = group%js_y nd = size(group%shape_y) allocate (lb(nd), ub(nd), n(nd), stride(nd)) @@ -213,34 +220,34 @@ contains end select end select - lb(ix) = group%pack_is(ni) - lb(iy) = group%pack_js(ni) + lb(ix) = group%pack_is(ni) - is0 + 1 + lb(iy) = group%pack_js(ni) - js0 + 1 - ub(ix) = group%pack_ie(ni) - ub(iy) = group%pack_je(ni) + ub(ix) = group%pack_ie(ni) - is0 + 1 + ub(iy) = group%pack_je(ni) - js0 + 1 n = ub - lb + 1 stride = 1 if (iand(packing_flags, REVERSE_I).ne.0) then - lb(ix) = group%pack_ie(ni) - ub(ix) = group%pack_is(ni) + lb(ix) = group%pack_ie(ni) - is0 + 1 + ub(ix) = group%pack_is(ni) - is0 + 1 stride(ix) = -1 endif if (iand(packing_flags, REVERSE_J).ne.0) then - lb(iy) = group%pack_je(ni) - ub(iy) = group%pack_js(ni) + lb(iy) = group%pack_je(ni) - js0 + 1 + ub(iy) = group%pack_js(ni) - js0 + 1 stride(iy) = -1 endif if (iand(packing_flags, SWAP_IJ).eq.0) then - ix2 = ix - iy2 = iy + ij1 = ix + ij2 = iy else - ix2 = iy - iy2 = ix + ij1 = iy + ij2 = ix endif sgn = 1 @@ -253,8 +260,8 @@ contains integer :: i(2), idx i = [i1, i2] - idx = n(ix2)*stride(iy2)*(i(iy2)-lb(iy2)) + & - stride(ix2)*(i(ix2)-lb(ix2)) + 1 + idx = n(ij1)*stride(ij2)*(i(ij2)-lb(ij2)) + & + stride(ij1)*(i(ij1)-lb(ij1)) + 1 buffer(idx) = sgn * field2d(i1,i2) end block end do @@ -265,8 +272,8 @@ contains integer :: i(3), idx i = [i1, i2, i3] - idx = n(ix2)*( n(iy2)*(i(iz1)-1) + stride(iy2)*(i(iy2)-lb(iy2))) + & - stride(ix2)*(i(ix2)-lb(ix2)) + 1 + idx = n(ij1)*( n(ij2)*(i(iz1)-1) + stride(ij2)*(i(ij2)-lb(ij2))) + & + stride(ij1)*(i(ij1)-lb(ij1)) + 1 buffer(idx) = sgn * field3d(i1,i2,i3) end block end do @@ -279,8 +286,8 @@ contains integer :: i(4), idx i = [i1, i2, i3, i4] - idx = n(ix2)*( n(iy2)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + stride(iy2)*(i(iy2)-lb(iy2))) + & - stride(ix2)*(i(ix2)-lb(ix2)) + 1 + idx = n(ij1)*( n(ij2)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + stride(ij2)*(i(ij2)-lb(ij2))) + & + stride(ij1)*(i(ij1)-lb(ij1)) + 1 buffer(idx) = sgn * field4d(i1,i2,i3,i4) end block end do diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 48b24c708a..a49ceb684c 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -58,11 +58,14 @@ contains integer :: ix, iy, iz1, iz2 !< Indices of the domain-decomposed dimensions integer :: nd !< Number of dimensions integer :: i1, i2, i3, i4 + integer :: is0, js0 select case (field_type) case (FIELD_S) ix = group%ix_s iy = group%iy_s + is0 = group%is_s + js0 = group%js_s nd = size(group%shape_s) allocate (lb(nd), ub(nd), n(nd)) @@ -81,6 +84,8 @@ contains case (FIELD_X) ix = group%ix_v iy = group%iy_v + is0 = group%is_x + js0 = group%js_x nd = size(group%shape_x) allocate (lb(nd), ub(nd), n(nd)) @@ -99,6 +104,8 @@ contains case (FIELD_Y) ix = group%ix_v iy = group%iy_v + is0 = group%is_y + js0 = group%js_y nd = size(group%shape_y) allocate (lb(nd), ub(nd), n(nd)) @@ -116,11 +123,11 @@ contains end select end select - lb(ix) = group%unpack_is(ni) - lb(iy) = group%unpack_js(ni) + lb(ix) = group%unpack_is(ni) - is0 + 1 + lb(iy) = group%unpack_js(ni) - js0 + 1 - ub(ix) = group%unpack_ie(ni) - ub(iy) = group%unpack_je(ni) + ub(ix) = group%unpack_ie(ni) - is0 + 1 + ub(iy) = group%unpack_je(ni) - js0 + 1 n = ub - lb + 1 diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh index 48657c951a..4b68382094 100644 --- a/mpp/include/mpp_group_update.fh +++ b/mpp/include/mpp_group_update.fh @@ -829,6 +829,7 @@ subroutine SET_ZERO_ (group, field_type, i0, j0) integer :: nd !< Number of dimensions integer :: ix, iy !< Indices of the domain-decomposed dimensions integer :: l, i1, i2, i3, i4 + integer :: is0, js0 ix = group%ix_v iy = group%iy_v @@ -841,6 +842,8 @@ subroutine SET_ZERO_ (group, field_type, i0, j0) case (FIELD_X) lb = 1 ub = group%shape_x + is0 = group%is_x + js0 = group%js_x select case(nd) case (2) @@ -853,6 +856,8 @@ subroutine SET_ZERO_ (group, field_type, i0, j0) case (FIELD_Y) lb = 1 ub = group%shape_y + is0 = group%is_y + js0 = group%js_y select case(nd) case (2) @@ -864,11 +869,11 @@ subroutine SET_ZERO_ (group, field_type, i0, j0) end select end select - lb(ix) = i0 - ub(ix) = i0 + lb(ix) = i0 - is0 + 1 + ub(ix) = i0 - is0 + 1 - lb(iy) = j0 - ub(iy) = j0 + lb(iy) = j0 - js0 + 1 + ub(iy) = j0 - js0 + 1 select case(nd) case (2) @@ -901,6 +906,7 @@ subroutine NEGATE_HALO_ (group, field_type, j0, is, ie) integer :: nd !< Number of dimensions integer :: ix, iy !< Indices of the domain-decomposed dimensions integer :: l, i1, i2, i3, i4 + integer :: is0, js0 ix = group%ix_v iy = group%iy_v @@ -913,6 +919,8 @@ subroutine NEGATE_HALO_ (group, field_type, j0, is, ie) case (FIELD_X) lb = 1 ub = group%shape_x + is0 = group%is_x + js0 = group%js_x select case(nd) case (2) @@ -925,6 +933,8 @@ subroutine NEGATE_HALO_ (group, field_type, j0, is, ie) case (FIELD_Y) lb = 1 ub = group%shape_y + is0 = group%is_y + js0 = group%js_y select case(nd) case (2) @@ -936,11 +946,11 @@ subroutine NEGATE_HALO_ (group, field_type, j0, is, ie) end select end select - lb(ix) = is - ub(ix) = ie + lb(ix) = is - is0 + 1 + ub(ix) = ie - is0 + 1 - lb(iy) = j0 - ub(iy) = j0 + lb(iy) = j0 - js0 + 1 + ub(iy) = j0 - js0 + 1 select case(nd) case (2) @@ -974,6 +984,7 @@ subroutine FLIP_I_ (group, field_type, j0, is, ie, offset) integer :: nd !< Number of dimensions integer :: ix, iy !< Indices of the domain-decomposed dimensions integer :: l, i1, i2, i3, i4 + integer :: is0, js0, offset1 ix = group%ix_v iy = group%iy_v @@ -986,6 +997,8 @@ subroutine FLIP_I_ (group, field_type, j0, is, ie, offset) case (FIELD_X) lb = 1 ub = group%shape_x + is0 = group%is_x + js0 = group%js_x select case(nd) case (2) @@ -998,6 +1011,8 @@ subroutine FLIP_I_ (group, field_type, j0, is, ie, offset) case (FIELD_Y) lb = 1 ub = group%shape_y + is0 = group%is_y + js0 = group%js_y select case(nd) case (2) @@ -1009,11 +1024,13 @@ subroutine FLIP_I_ (group, field_type, j0, is, ie, offset) end select end select - lb(ix) = is - ub(ix) = ie + lb(ix) = is - is0 + 1 + ub(ix) = ie - is0 + 1 - lb(iy) = j0 - ub(iy) = j0 + lb(iy) = j0 - js0 + 1 + ub(iy) = j0 - js0 + 1 + + offset1 = offset - 2*is0 + 2 select case(nd) case (2) @@ -1022,7 +1039,7 @@ subroutine FLIP_I_ (group, field_type, j0, is, ie, offset) integer :: src(2) src = [i1, i2] - src(ix) = offset - src(ix) + src(ix) = offset1 - src(ix) field2d(i1,i2) = field2d(src(1),src(2)) end block end do @@ -1032,7 +1049,7 @@ subroutine FLIP_I_ (group, field_type, j0, is, ie, offset) integer :: src(3) src = [i1, i2, i3] - src(ix) = offset - src(ix) + src(ix) = offset1 - src(ix) field3d(i1,i2,i3) = field3d(src(1),src(2),src(3)) end block end do @@ -1042,7 +1059,7 @@ subroutine FLIP_I_ (group, field_type, j0, is, ie, offset) integer :: src(4) src = [i1, i2, i3, i4] - src(ix) = offset - src(ix) + src(ix) = offset1 - src(ix) field4d(i1,i2,i3,i4) = field4d(src(1),src(2),src(3),src(4)) end block end do From b9298822486a39af04a5433365135e71ec7676b9 Mon Sep 17 00:00:00 2001 From: Jesse Lentz Date: Wed, 17 Jun 2026 12:09:59 -0400 Subject: [PATCH 4/5] Rename `buffer` to `buf` within inner loops --- mpp/include/group_update_pack.inc | 10 +++++----- mpp/include/group_update_unpack.inc | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 700d2c7c45..29b03f5bc5 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -137,8 +137,8 @@ subroutine GROUP_UPDATE_PACK_ (group, buffer, buffer_start_pos) contains - subroutine pack_inner_loop (buffer, group, field_type, ni, l, packing_flags) - MPP_TYPE_, intent(out) :: buffer(:) + subroutine pack_inner_loop (buf, group, field_type, ni, l, packing_flags) + MPP_TYPE_, intent(out) :: buf(:) type(mpp_group_update_type), intent(in), target :: group integer, intent(in) :: field_type !< FIELD_S, FIELD_X, or FIELD_Y integer, intent(in) :: ni @@ -262,7 +262,7 @@ contains i = [i1, i2] idx = n(ij1)*stride(ij2)*(i(ij2)-lb(ij2)) + & stride(ij1)*(i(ij1)-lb(ij1)) + 1 - buffer(idx) = sgn * field2d(i1,i2) + buf(idx) = sgn * field2d(i1,i2) end block end do case (3) @@ -274,7 +274,7 @@ contains i = [i1, i2, i3] idx = n(ij1)*( n(ij2)*(i(iz1)-1) + stride(ij2)*(i(ij2)-lb(ij2))) + & stride(ij1)*(i(ij1)-lb(ij1)) + 1 - buffer(idx) = sgn * field3d(i1,i2,i3) + buf(idx) = sgn * field3d(i1,i2,i3) end block end do case (4) @@ -288,7 +288,7 @@ contains i = [i1, i2, i3, i4] idx = n(ij1)*( n(ij2)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + stride(ij2)*(i(ij2)-lb(ij2))) + & stride(ij1)*(i(ij1)-lb(ij1)) + 1 - buffer(idx) = sgn * field4d(i1,i2,i3,i4) + buf(idx) = sgn * field4d(i1,i2,i3,i4) end block end do end select diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index a49ceb684c..bac3d37a5d 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -139,7 +139,7 @@ contains i = [i1, i2] idx = n(ix)*(i(iy)-lb(iy)) + (i(ix)-lb(ix)) + 1 - field2d(i1,i2) = buffer(idx) + field2d(i1,i2) = buf(idx) end block end do case (3) @@ -150,7 +150,7 @@ contains i = [i1, i2, i3] idx = n(ix)*( n(iy)*(i(iz1)-1) + (i(iy)-lb(iy))) + (i(ix)-lb(ix)) + 1 - field3d(i1,i2,i3) = buffer(idx) + field3d(i1,i2,i3) = buf(idx) end block end do case (4) @@ -162,7 +162,7 @@ contains i = [i1, i2, i3, i4] idx = n(ix)*( n(iy)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + (i(iy)-lb(iy))) + (i(ix)-lb(ix)) + 1 - field4d(i1,i2,i3,i4) = buffer(idx) + field4d(i1,i2,i3,i4) = buf(idx) end block end do end select From f895ec5732063a18f99d056457458c77517e06d3 Mon Sep 17 00:00:00 2001 From: Jesse Lentz Date: Thu, 18 Jun 2026 14:50:32 -0400 Subject: [PATCH 5/5] Deallocate shape arrays --- mpp/include/mpp_domains_util.inc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mpp/include/mpp_domains_util.inc b/mpp/include/mpp_domains_util.inc index 839a607fd5..e2d292b4b6 100644 --- a/mpp/include/mpp_domains_util.inc +++ b/mpp/include/mpp_domains_util.inc @@ -2390,6 +2390,10 @@ end subroutine set_group_update group%nunpack = 0 group%initialized = .false. + if (allocated(group%shape_s)) deallocate(group%shape_s) + if (allocated(group%shape_x)) deallocate(group%shape_x) + if (allocated(group%shape_y)) deallocate(group%shape_y) + end subroutine mpp_clear_group_update !#####################################################################