From 59192133a885e88c83e7ea8da2eabe284dd31522 Mon Sep 17 00:00:00 2001
From: Jesse Lentz <jesse.lentz@noaa.gov>
Date: Thu, 4 Jun 2026 19:56:01 -0400
Subject: [PATCH 1/5] Refactor group_update for improved performance

Refactor the packing and unpacking routines of mpp_group_update for
improved performance with generalized indices. The outer k-loop versions
of the packing and unpacking routines have been removed.
---
 mpp/include/group_update_pack.inc   | 574 ++++++++++------------------
 mpp/include/group_update_unpack.inc | 210 ++++++----
 mpp/include/mpp_domains_misc.inc    |  28 +-
 mpp/include/mpp_domains_util.inc    |  11 -
 mpp/include/mpp_group_update.fh     | 531 +++++++++++--------------
 mpp/mpp_domains.F90                 |  11 +-
 test_fms/mpp/test_mpp_domains2.sh   |   3 -
 7 files changed, 593 insertions(+), 775 deletions(-)

diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc
index 98b7645836..7134a72bc5 100644
--- a/mpp/include/group_update_pack.inc
+++ b/mpp/include/group_update_pack.inc
@@ -16,414 +16,264 @@
 !* governing permissions and limitations under the License.
 !***********************************************************************
 
-if( group%k_loop_inside ) then
-!$OMP parallel do default(none) shared(buffer,npack,group,nvector,ksize,buffer_start_pos) &
-!$OMP                          private(buffer_pos,pos,m,is,ie,js,je,rotation,n,k)
-  do n = 1, npack
+subroutine GROUP_UPDATE_PACK_ (group, buffer, buffer_start_pos)
+  type(mpp_group_update_type), intent(in) :: group
+  MPP_TYPE_, intent(out) :: buffer(:)
+  integer, intent(in) :: buffer_start_pos
+  integer, parameter :: SWAP_IJ=1, REVERSE_I=2, REVERSE_J=4, MINUS=8 !< Flags which affect packing order
+  integer :: n, buffer_pos, pos, ltop, ni, nj, nk, nijk, l
+
+  do n=1,group%npack
      buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos
-     pos  = buffer_pos
-     is = group%pack_is(n); ie = group%pack_ie(n)
-     js = group%pack_js(n); je = group%pack_je(n)
-     rotation = group%pack_rotation(n)
+     ni = group%pack_ie(n) - group%pack_is(n) + 1
+     nj = group%pack_je(n) - group%pack_js(n) + 1
      if( group%pack_type(n) == FIELD_S ) then
-        select case( rotation )
-        case(ZERO)
-           do l=1, group%nscalar  ! loop over number of fields
-              do k = 1, ksize
-                 do j = js, je
-                    do i = is, ie
-                       pos = pos + 1
-                       !buffer(pos) = field(i,j,k)
-                       buffer(pos) = GET_VALUE_ (group, FIELD_S, l, i, j, k)
-                    end do
-                 end do
-              enddo
-           enddo
-        case( MINUS_NINETY )
-           do l=1,group%nscalar  ! loop over number of fields
-              do k = 1, ksize
-                 do i = is, ie
-                    do j = je, js, -1
-                       pos = pos + 1
-                       !buffer(pos) = field(i,j,k)
-                       buffer(pos) = GET_VALUE_ (group, FIELD_S, l, i, j, k)
-                    end do
-                 end do
-              end do
-           end do
-        case( NINETY )
-           do l=1,group%nscalar  ! loop over number of fields
-              do k = 1, ksize
-                 do i = ie, is, -1
-                    do j = js, je
-                       pos = pos + 1
-                       !buffer(pos) = field(i,j,k)
-                       buffer(pos) = GET_VALUE_ (group, FIELD_S, l, i, j, k)
-                    end do
-                 end do
-              end do
-           end do
-        case( ONE_HUNDRED_EIGHTY )
-           do l=1,group%nscalar  ! loop over number of fields
-              do k = 1, ksize
-                 do j = je, js, -1
-                    do i = ie, is, -1
-                       pos = pos + 1
-                       !buffer(pos) = field(i,j,k)
-                       buffer(pos) = GET_VALUE_ (group, FIELD_S, l, i, j, k)
-                    end do
-                 end do
-              end do
-           end do
-        end select
-     else if( group%pack_type(n) == FIELD_X  ) then
-        select case( rotation )
-        case(ZERO)
-           do l=1, nvector  ! loop over number of fields
-              do k = 1, ksize
-                 do j = js, je
-                    do i = is, ie
-                       pos = pos + 1
-                       !buffer(pos) = fieldx(i,j,k)
-                       buffer(pos) = GET_VALUE_ (group, FIELD_X, l, i, j, k)
-                    end do
-                 end do
-              end do
-           end do
-        case( MINUS_NINETY )
-           if( BTEST(group%flags_v,SCALAR_BIT) ) then
-              do l=1,nvector  ! loop over number of fields
-                 do k = 1, ksize
-                    do i = is, ie
-                       do j = je, js, -1
-                          pos = pos + 1
-                          !buffer(pos) = fieldy(i,j,k)
-                          buffer(pos) = GET_VALUE_ (group, FIELD_Y, l, i, j, k)
-                       end do
-                    end do
-                 end do
-              end do
-           else
-              do l=1,nvector  ! loop over number of fields
-                 do k = 1, ksize
-                    do i = is, ie
-                       do j = je, js, -1
-                          pos = pos + 1
-                          !buffer(pos) = -fieldy(i,j,k)
-                          buffer(pos) = - GET_VALUE_ (group, FIELD_Y, l, i, j, k)
-                       end do
-                    end do
-                 end do
-              end do
-           end if
-        case( NINETY )
-           do l=1, nvector  ! loop over number of fields
-              do k = 1, ksize
-                 do i = ie, is, -1
-                    do j = js, je
-                       pos = pos + 1
-                       !buffer(pos) = fieldy(i,j,k)
-                       buffer(pos) = GET_VALUE_ (group, FIELD_Y, l, i, j, k)
-                    end do
-                 end do
-              end do
-           end do
-        case( ONE_HUNDRED_EIGHTY )
-           if( BTEST(group%flags_v,SCALAR_BIT) ) then
-              do l=1,nvector  ! loop over number of fields
-                 do k = 1, ksize
-                    do j = je, js, -1
-                       do i = ie, is, -1
-                          pos = pos + 1
-                          !buffer(pos) =  fieldx(i,j,k)
-                          buffer(pos) = GET_VALUE_ (group, FIELD_X, l, i, j, k)
-                       end do
-                    end do
-                 end do
-              end do
-           else
-              do l=1,nvector  ! loop over number of fields
-                 do k = 1, ksize
-                    do j = je, js, -1
-                       do i = ie, is, -1
-                          pos = pos + 1
-                          !buffer(pos) =  -fieldx(i,j,k)
-                          buffer(pos) = - GET_VALUE_ (group, FIELD_X, l, i, j, k)
-                       end do
-                    end do
-                 end do
-              end do
-           end if
-        end select ! select case( rotation(n) )
-     else if( group%pack_type(n) == FIELD_Y ) then
-        select case( rotation )
-        case(ZERO)
-           do l=1, nvector  ! loop over number of fields
-              do k = 1, ksize
-                 do j = js, je
-                    do i = is, ie
-                       pos = pos + 1
-                       !buffer(pos) = fieldy(i,j,k)
-                       buffer(pos) = GET_VALUE_ (group, FIELD_Y, l, i, j, k)
-                    end do
-                 end do
-              end do
-           end do
-        case( MINUS_NINETY )
-           do l=1,nvector  ! loop over number of fields
-              do k = 1, ksize
-                 do i = is, ie
-                    do j = je, js, -1
-                       pos = pos + 1
-                       !buffer(pos) = fieldx(i,j,k)
-                       buffer(pos) = GET_VALUE_ (group, FIELD_X, l, i, j, k)
-                    end do
-                 end do
-              end do
-           end do
-        case( NINETY )
-           if( BTEST(group%flags_v,SCALAR_BIT) ) then
-              do l=1, nvector  ! loop over number of fields
-                 do k = 1, ksize
-                    do i = ie, is, -1
-                       do j = js, je
-                          pos = pos + 1
-                          !buffer(pos) = fieldx(i,j,k)
-                          buffer(pos) = GET_VALUE_ (group, FIELD_X, l, i, j, k)
-                       end do
-                    end do
-                 end do
-              end do
-           else
-              do l=1,nvector  ! loop over number of fields
-                 do k = 1, ksize
-                    do i = ie, is, -1
-                       do j = js, je
-                          pos = pos + 1
-                          !buffer(pos) = -fieldx(i,j,k)
-                          buffer(pos) = - GET_VALUE_ (group, FIELD_X, l, i, j, k)
-                       end do
-                    end do
-                 end do
-              end do
-           end if
-        case( ONE_HUNDRED_EIGHTY )
-           if( BTEST(group%flags_v,SCALAR_BIT) ) then
-              do l=1,nvector  ! loop over number of fields
-                 do k = 1, ksize
-                    do j = je, js, -1
-                       do i = ie, is, -1
-                          pos = pos + 1
-                          !buffer(pos) =  fieldy(i,j,k)
-                          buffer(pos) = GET_VALUE_ (group, FIELD_Y, l, i, j, k)
-                       end do
-                    end do
-                 end do
-              end do
-           else
-              do l=1,nvector  ! loop over number of fields
-                 do k = 1, ksize
-                    do j = je, js, -1
-                       do i = ie, is, -1
-                          pos = pos + 1
-                          !buffer(pos) =  -fieldy(i,j,k)
-                          buffer(pos) = - GET_VALUE_ (group, FIELD_Y, l, i, j, k)
-                       end do
-                    end do
-                 end do
-              end do
-           end if
-        end select ! select case( rotation(n) )
+       ltop = group%nscalar
+       nk = group%ksize_s
+     else
+       ltop = group%nvector
+       nk = group%ksize_v
      endif
-  enddo
-else
-!$OMP parallel do default(none) shared(buffer,npack,group,nvector,ksize,buffer_start_pos) &
-!$OMP                          private(buffer_pos,pos,m,is,ie,js,je,rotation,n,k)
-  do nk = 1, npack*ksize
-     n = (nk-1)/ksize + 1
-     k = mod((nk-1), ksize) + 1
-     buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos
-     pos  = buffer_pos + (k-1)*group%pack_size(n)
-     is = group%pack_is(n); ie = group%pack_ie(n)
-     js = group%pack_js(n); je = group%pack_je(n)
-     rotation = group%pack_rotation(n)
+     nijk = ni*nj*nk
      if( group%pack_type(n) == FIELD_S ) then
-        select case( rotation )
+        select case( group%pack_rotation(n) )
         case(ZERO)
-           do l=1, group%nscalar  ! loop over number of fields
-              do j = js, je
-                 do i = is, ie
-                    pos = pos + 1
-                    !buffer(pos) = field(i,j,k)
-                    buffer(pos) = GET_VALUE_ (group, FIELD_S, l, i, j, k)
-                 end do
-              end do
+           do l=1,ltop
+              pos = buffer_pos + (l-1)*nijk + 1
+              call pack_inner_loop (buffer(pos:), group, FIELD_S, n, l, 0)
            enddo
         case( MINUS_NINETY )
-           do l=1,group%nscalar  ! loop over number of fields
-              do i = is, ie
-                 do j = je, js, -1
-                    pos = pos + 1
-                    !buffer(pos) = field(i,j,k)
-                    buffer(pos) = GET_VALUE_ (group, FIELD_S, l, i, j, k)
-                 end do
-              end do
+           do l=1,ltop
+              pos = buffer_pos + (l-1)*nijk + 1
+              call pack_inner_loop (buffer(pos:), group, FIELD_S, n, l, SWAP_IJ + REVERSE_J)
            end do
         case( NINETY )
-           do l=1,group%nscalar  ! loop over number of fields
-              do i = ie, is, -1
-                 do j = js, je
-                    pos = pos + 1
-                    !buffer(pos) = field(i,j,k)
-                    buffer(pos) = GET_VALUE_ (group, FIELD_S, l, i, j, k)
-                 end do
-              end do
+           do l=1,ltop
+              pos = buffer_pos + (l-1)*nijk + 1
+              call pack_inner_loop (buffer(pos:), group, FIELD_S, n, l, SWAP_IJ + REVERSE_I)
            end do
         case( ONE_HUNDRED_EIGHTY )
-           do l=1,group%nscalar  ! loop over number of fields
-              do j = je, js, -1
-                 do i = ie, is, -1
-                    pos = pos + 1
-                    !buffer(pos) = field(i,j,k)
-                    buffer(pos) = GET_VALUE_ (group, FIELD_S, l, i, j, k)
-                 end do
-              end do
+           do l=1,ltop
+              pos = buffer_pos + (l-1)*nijk + 1
+              call pack_inner_loop (buffer(pos:), group, FIELD_S, n, l, REVERSE_I + REVERSE_J)
            end do
         end select
      else if( group%pack_type(n) == FIELD_X  ) then
-        select case( rotation )
+        select case( group%pack_rotation(n) )
         case(ZERO)
-           do l=1, nvector  ! loop over number of fields
-              do j = js, je
-                 do i = is, ie
-                    pos = pos + 1
-                    !buffer(pos) = fieldx(i,j,k)
-                    buffer(pos) = GET_VALUE_ (group, FIELD_X, l, i, j, k)
-                 end do
-              end do
+           do l=1,ltop
+              pos = buffer_pos + (l-1)*nijk + 1
+              call pack_inner_loop (buffer(pos:), group, FIELD_X, n, l, 0)
            end do
         case( MINUS_NINETY )
            if( BTEST(group%flags_v,SCALAR_BIT) ) then
-              do l=1,nvector  ! loop over number of fields
-                 do i = is, ie
-                    do j = je, js, -1
-                       pos = pos + 1
-                       !buffer(pos) = fieldy(i,j,k)
-                       buffer(pos) = GET_VALUE_ (group, FIELD_Y, l, i, j, k)
-                    end do
-                 end do
+              do l=1,ltop
+                 pos = buffer_pos + (l-1)*nijk + 1
+                 call pack_inner_loop (buffer(pos:), group, FIELD_Y, n, l, SWAP_IJ + REVERSE_J)
               end do
            else
-              do l=1,nvector  ! loop over number of fields
-                 do i = is, ie
-                    do j = je, js, -1
-                       pos = pos + 1
-                       !buffer(pos) = -fieldy(i,j,k)
-                       buffer(pos) = - GET_VALUE_ (group, FIELD_Y, l, i, j, k)
-                    end do
-                 end do
+              do l=1,ltop
+                 pos = buffer_pos + (l-1)*nijk + 1
+                 call pack_inner_loop (buffer(pos:), group, FIELD_Y, n, l, SWAP_IJ + REVERSE_J + MINUS)
               end do
            end if
         case( NINETY )
-           do l=1, nvector  ! loop over number of fields
-              do i = ie, is, -1
-                 do j = js, je
-                    pos = pos + 1
-                    !buffer(pos) = fieldy(i,j,k)
-                    buffer(pos) = GET_VALUE_ (group, FIELD_Y, l, i, j, k)
-                 end do
-              end do
+           do l=1,ltop
+              pos = buffer_pos + (l-1)*nijk + 1
+              call pack_inner_loop (buffer(pos:), group, FIELD_Y, n, l, SWAP_IJ + REVERSE_I)
            end do
         case( ONE_HUNDRED_EIGHTY )
            if( BTEST(group%flags_v,SCALAR_BIT) ) then
-              do l=1,nvector  ! loop over number of fields
-                 do j = je, js, -1
-                    do i = ie, is, -1
-                       pos = pos + 1
-                       !buffer(pos) =  fieldx(i,j,k)
-                       buffer(pos) = GET_VALUE_ (group, FIELD_X, l, i, j, k)
-                    end do
-                 end do
+              do l=1,ltop
+                 pos = buffer_pos + (l-1)*nijk + 1
+                 call pack_inner_loop (buffer(pos:), group, FIELD_X, n, l, REVERSE_I + REVERSE_J)
               end do
            else
-              do l=1,nvector  ! loop over number of fields
-                 do j = je, js, -1
-                    do i = ie, is, -1
-                       pos = pos + 1
-                       !buffer(pos) =  -fieldx(i,j,k)
-                       buffer(pos) = - GET_VALUE_ (group, FIELD_X, l, i, j, k)
-                    end do
-                 end do
+              do l=1,ltop
+                 pos = buffer_pos + (l-1)*nijk + 1
+                 call pack_inner_loop (buffer(pos:), group, FIELD_X, n, l, REVERSE_I + REVERSE_J + MINUS)
               end do
            end if
-        end select ! select case( rotation(n) )
+        end select
      else if( group%pack_type(n) == FIELD_Y ) then
-        select case( rotation )
+        select case( group%pack_rotation(n) )
         case(ZERO)
-           do l=1, nvector  ! loop over number of fields
-              do j = js, je
-                 do i = is, ie
-                    pos = pos + 1
-                    !buffer(pos) = fieldy(i,j,k)
-                    buffer(pos) = GET_VALUE_ (group, FIELD_Y, l, i, j, k)
-                 end do
-              end do
+           do l=1,ltop
+              pos = buffer_pos + (l-1)*nijk + 1
+              call pack_inner_loop (buffer(pos:), group, FIELD_Y, n, l, 0)
            end do
         case( MINUS_NINETY )
-           do l=1,nvector  ! loop over number of fields
-              do i = is, ie
-                 do j = je, js, -1
-                    pos = pos + 1
-                    !buffer(pos) = fieldx(i,j,k)
-                    buffer(pos) = GET_VALUE_ (group, FIELD_X, l, i, j, k)
-                 end do
-              end do
+           do l=1,ltop
+              pos = buffer_pos + (l-1)*nijk + 1
+              call pack_inner_loop (buffer(pos:), group, FIELD_X, n, l, SWAP_IJ + REVERSE_J)
            end do
         case( NINETY )
            if( BTEST(group%flags_v,SCALAR_BIT) ) then
-              do l=1, nvector  ! loop over number of fields
-                 do i = ie, is, -1
-                    do j = js, je
-                       pos = pos + 1
-                       !buffer(pos) = fieldx(i,j,k)
-                       buffer(pos) = GET_VALUE_ (group, FIELD_X, l, i, j, k)
-                    end do
-                 end do
+              do l=1,ltop
+                 pos = buffer_pos + (l-1)*nijk + 1
+                 call pack_inner_loop (buffer(pos:), group, FIELD_X, n, l, SWAP_IJ + REVERSE_I)
               end do
            else
-              do l=1,nvector  ! loop over number of fields
-                 do i = ie, is, -1
-                    do j = js, je
-                       pos = pos + 1
-                       !buffer(pos) = -fieldx(i,j,k)
-                       buffer(pos) = - GET_VALUE_ (group, FIELD_X, l, i, j, k)
-                    end do
-                 end do
+              do l=1,ltop
+                 pos = buffer_pos + (l-1)*nijk + 1
+                 call pack_inner_loop (buffer(pos:), group, FIELD_X, n, l, SWAP_IJ + REVERSE_I + MINUS)
               end do
            end if
         case( ONE_HUNDRED_EIGHTY )
            if( BTEST(group%flags_v,SCALAR_BIT) ) then
-              do l=1,nvector  ! loop over number of fields
-                 do j = je, js, -1
-                    do i = ie, is, -1
-                       pos = pos + 1
-                       !buffer(pos) =  fieldy(i,j,k)
-                       buffer(pos) = GET_VALUE_ (group, FIELD_Y, l, i, j, k)
-                    end do
-                 end do
+              do l=1,ltop
+                 pos = buffer_pos + (l-1)*nijk + 1
+                 call pack_inner_loop (buffer(pos:), group, FIELD_Y, n, l, REVERSE_I + REVERSE_J)
               end do
            else
-              do l=1,nvector  ! loop over number of fields
-                 do j = je, js, -1
-                    do i = ie, is, -1
-                       pos = pos + 1
-                       !buffer(pos) =  -fieldy(i,j,k)
-                       buffer(pos) = - GET_VALUE_ (group, FIELD_Y, l, i, j, k)
-                    end do
-                 end do
+              do l=1,ltop
+                 pos = buffer_pos + (l-1)*nijk + 1
+                 call pack_inner_loop (buffer(pos:), group, FIELD_Y, n, l, REVERSE_I + REVERSE_J + MINUS)
               end do
            end if
-        end select ! select case( rotation(n) )
+        end select
      endif
   enddo
-endif
+
+contains
+
+  subroutine pack_inner_loop (buffer, group, field_type, ni, l, packing_flags)
+    MPP_TYPE_, intent(out) :: buffer(:)
+    type(mpp_group_update_type), intent(in), target :: group
+    integer, intent(in) :: field_type !< FIELD_S, FIELD_X, or FIELD_Y
+    integer, intent(in) :: ni
+    integer, intent(in) :: l !< Index of the field within group
+    integer, intent(in) :: packing_flags !< Any combination of: SWAP_IJ, REVERSE_I, REVERSE_J, MINUS
+
+    integer, allocatable, dimension(:) :: lb, ub, n, stride
+    MPP_TYPE_, pointer :: field2d(:,:)
+    MPP_TYPE_, pointer :: field3d(:,:,:)
+    MPP_TYPE_, pointer :: field4d(:,:,:,:)
+
+    integer :: ix, iy, iz1, iz2 !< Indices of the domain-decomposed dimensions
+    integer :: nd !< Number of dimensions
+    integer :: idx !< Index into the 1D buffer
+    integer :: sgn !< Factor to multiply the filled values by
+    integer :: i1, i2, i3, i4, i(4)
+    integer :: ix2, iy2
+
+    select case (field_type)
+      case (FIELD_S)
+        ix = group%ix_s
+        iy = group%iy_s
+
+        nd = size(group%shape_s)
+        allocate (lb(nd), ub(nd), n(nd), stride(nd))
+
+        lb = 1
+        ub = group%shape_s
+
+        select case(nd)
+          case (2)
+            call c_f_pointer(group%addrs_s(l), field2d, group%shape_s)
+          case (3)
+            call c_f_pointer(group%addrs_s(l), field3d, group%shape_s)
+          case (4)
+            call c_f_pointer(group%addrs_s(l), field4d, group%shape_s)
+        end select
+      case (FIELD_X)
+        ix = group%ix_v
+        iy = group%iy_v
+
+        nd = size(group%shape_x)
+        allocate (lb(nd), ub(nd), n(nd), stride(nd))
+
+        lb = 1
+        ub = group%shape_x
+
+        select case(nd)
+          case (2)
+            call c_f_pointer(group%addrs_x(l), field2d, group%shape_x)
+          case (3)
+            call c_f_pointer(group%addrs_x(l), field3d, group%shape_x)
+          case (4)
+            call c_f_pointer(group%addrs_x(l), field4d, group%shape_x)
+        end select
+      case (FIELD_Y)
+        ix = group%ix_v
+        iy = group%iy_v
+
+        nd = size(group%shape_y)
+        allocate (lb(nd), ub(nd), n(nd), stride(nd))
+
+        lb = 1
+        ub = group%shape_y
+
+        select case(nd)
+          case (2)
+            call c_f_pointer(group%addrs_y(l), field2d, group%shape_y)
+          case (3)
+            call c_f_pointer(group%addrs_y(l), field3d, group%shape_y)
+          case (4)
+            call c_f_pointer(group%addrs_y(l), field4d, group%shape_y)
+        end select
+    end select
+
+    lb(ix) = group%pack_is(ni)
+    lb(iy) = group%pack_js(ni)
+
+    ub(ix) = group%pack_ie(ni)
+    ub(iy) = group%pack_je(ni)
+
+    n = ub - lb + 1
+
+    stride = 1
+
+    if (iand(packing_flags, REVERSE_I).ne.0) then
+      lb(ix) = group%pack_ie(ni)
+      ub(ix) = group%pack_is(ni)
+      stride(ix) = -1
+    endif
+
+    if (iand(packing_flags, REVERSE_J).ne.0) then
+      lb(iy) = group%pack_je(ni)
+      ub(iy) = group%pack_js(ni)
+      stride(iy) = -1
+    endif
+
+    if (iand(packing_flags, SWAP_IJ).eq.0) then
+      ix2 = ix
+      iy2 = iy
+    else
+      ix2 = iy
+      iy2 = ix
+    endif
+
+    sgn = 1
+    if (iand(packing_flags, MINUS).ne.0) sgn = -1
+
+    select case (nd)
+      case (2)
+        do concurrent (i1=lb(1):ub(1):stride(1), i2=lb(2):ub(2):stride(2)) local(i, idx)
+          i(1:2) = [i1, i2]
+          idx = n(ix2)*stride(iy2)*(i(iy2)-lb(iy2)) + &
+                stride(ix2)*(i(ix2)-lb(ix2)) + 1
+          buffer(idx) = sgn * field2d(i1,i2)
+        end do
+      case (3)
+        iz1 = 1 + 2 + 3 - ix - iy
+        do concurrent (i1=lb(1):ub(1):stride(1), i2=lb(2):ub(2):stride(2), &
+                       i3=lb(3):ub(3):stride(3)) local(i, idx)
+          i(1:3) = [i1, i2, i3]
+          idx = n(ix2)*( n(iy2)*(i(iz1)-1) + stride(iy2)*(i(iy2)-lb(iy2))) + &
+                stride(ix2)*(i(ix2)-lb(ix2)) + 1
+          buffer(idx) = sgn * field3d(i1,i2,i3)
+        end do
+      case (4)
+        iz1 = trailz(sum(2**[1,2,3,4]) - 2**ix - 2**iy)
+        iz2 = 1 + 2 + 3 + 4 - ix - iy - iz1
+        do concurrent (i1=lb(1):ub(1):stride(1), i2=lb(2):ub(2):stride(2), &
+                       i3=lb(3):ub(3):stride(3), i4=lb(4):ub(4):stride(4)) local(i, idx)
+          i(1:4) = [i1, i2, i3, i4]
+          idx = n(ix2)*( n(iy2)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + stride(iy2)*(i(iy2)-lb(iy2))) + &
+                stride(ix2)*(i(ix2)-lb(ix2)) + 1
+          buffer(idx) = sgn * field4d(i1,i2,i3,i4)
+        end do
+    end select
+  end subroutine pack_inner_loop
+end subroutine GROUP_UPDATE_PACK_
diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc
index 06c8a36bed..24d946273e 100644
--- a/mpp/include/group_update_unpack.inc
+++ b/mpp/include/group_update_unpack.inc
@@ -16,92 +16,138 @@
 !* governing permissions and limitations under the License.
 !***********************************************************************
 
-if( group%k_loop_inside ) then
-!$OMP parallel do default(none) shared(buffer,nunpack,group,nscalar,nvector,ksize,buffer_start_pos) &
-!$OMP                          private(buffer_pos,pos,m,is, ie, js, je,rotation,n,k )
-  do n = nunpack, 1, -1
+subroutine GROUP_UPDATE_UNPACK_ (group, buffer, buffer_start_pos)
+  type(mpp_group_update_type), intent(in) :: group
+  MPP_TYPE_, intent(in) :: buffer(:)
+  integer, intent(in) :: buffer_start_pos
+  integer :: n, buffer_pos, pos, ltop, ni, nj, nk, nijk, l
+
+  do n=group%nunpack,1,-1
      buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos
-     pos = buffer_pos
-     is = group%unpack_is(n); ie = group%unpack_ie(n)
-     js = group%unpack_js(n); je = group%unpack_je(n)
+     ni = group%unpack_ie(n) - group%unpack_is(n) + 1
+     nj = group%unpack_je(n) - group%unpack_js(n) + 1
      if( group%unpack_type(n) == FIELD_S ) then
-        do l=1,nscalar  ! loop over number of fields
-           do k = 1, ksize
-              do j = js, je
-                 do i = is, ie
-                    pos = pos + 1
-                    !field(i,j,k) = buffer(pos)
-                    call SET_VALUE_ (group, FIELD_S, l, i, j, k, buffer(pos))
-                 end do
-              end do
-           end do
-        end do
-     else if( group%unpack_type(n) == FIELD_X ) then
-        do l=1,nvector  ! loop over number of fields
-           do k = 1, ksize
-              do j = js, je
-                 do i = is, ie
-                    pos = pos + 1
-                    !fieldx(i,j,k) = buffer(pos)
-                    call SET_VALUE_ (group, FIELD_X, l, i, j, k, buffer(pos))
-                 end do
-              end do
-           end do
-        end do
-     else if( group%unpack_type(n) == FIELD_Y ) then
-        do l=1,nvector  ! loop over number of fields
-           do k = 1, ksize
-              do j = js, je
-                 do i = is, ie
-                    pos = pos + 1
-                    !fieldy(i,j,k) = buffer(pos)
-                    call SET_VALUE_ (group, FIELD_Y, l, i, j, k, buffer(pos))
-                 end do
-              end do
-           end do
-        end do
+       ltop = group%nscalar
+       nk = group%ksize_s
+     else
+       ltop = group%nvector
+       nk = group%ksize_v
      endif
+     nijk = ni*nj*nk
+
+     do l=1,ltop
+        pos = buffer_pos + (l-1)*nijk + 1
+        call unpack_inner_loop (buffer(pos:), group, group%unpack_type(n), n, l)
+     end do
   enddo
-else
-!$OMP parallel do default(none) shared(buffer,nunpack,group,nscalar,nvector,ksize,buffer_start_pos) &
-!$OMP                          private(buffer_pos,pos,m,is, ie, js, je,rotation,n,k)
-  do nk = nunpack*ksize, 1, -1
-     n = (nk-1)/ksize + 1
-     k = mod((nk-1), ksize) + 1
-     buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos
-     pos = buffer_pos + (k-1)*group%unpack_size(n)
-     is = group%unpack_is(n); ie = group%unpack_ie(n)
-     js = group%unpack_js(n); je = group%unpack_je(n)
-     if( group%unpack_type(n) == FIELD_S ) then
-        do l=1,nscalar  ! loop over number of fields
-           do j = js, je
-              do i = is, ie
-                 pos = pos + 1
-                 !field(i,j,k) = buffer(pos)
-                 call SET_VALUE_ (group, FIELD_S, l, i, j, k, buffer(pos))
-              end do
-           end do
+
+contains
+
+  subroutine unpack_inner_loop (buf, group, field_type, ni, l)
+    MPP_TYPE_, intent(in) :: buf(:)
+    type(mpp_group_update_type), intent(in), target :: group
+    integer, intent(in) :: field_type !< FIELD_S, FIELD_X, or FIELD_Y
+    integer, intent(in) :: ni
+    integer, intent(in) :: l !< Index of the field within group
+
+    integer, allocatable, dimension(:) :: lb, ub, n
+    MPP_TYPE_, pointer :: field2d(:,:)
+    MPP_TYPE_, pointer :: field3d(:,:,:)
+    MPP_TYPE_, pointer :: field4d(:,:,:,:)
+
+    integer :: ix, iy, iz1, iz2 !< Indices of the domain-decomposed dimensions
+    integer :: nd !< Number of dimensions
+    integer :: idx !< Index into the 1D buffer
+    integer :: i1, i2, i3, i4, i(4)
+
+    select case (field_type)
+      case (FIELD_S)
+        ix = group%ix_s
+        iy = group%iy_s
+
+        nd = size(group%shape_s)
+        allocate (lb(nd), ub(nd), n(nd))
+
+        lb = 1
+        ub = group%shape_s
+
+        select case(nd)
+          case (2)
+            call c_f_pointer(group%addrs_s(l), field2d, group%shape_s)
+          case (3)
+            call c_f_pointer(group%addrs_s(l), field3d, group%shape_s)
+          case (4)
+            call c_f_pointer(group%addrs_s(l), field4d, group%shape_s)
+        end select
+      case (FIELD_X)
+        ix = group%ix_v
+        iy = group%iy_v
+
+        nd = size(group%shape_x)
+        allocate (lb(nd), ub(nd), n(nd))
+
+        lb = 1
+        ub = group%shape_x
+
+        select case(nd)
+          case (2)
+            call c_f_pointer(group%addrs_x(l), field2d, group%shape_x)
+          case (3)
+            call c_f_pointer(group%addrs_x(l), field3d, group%shape_x)
+          case (4)
+            call c_f_pointer(group%addrs_x(l), field4d, group%shape_x)
+        end select
+      case (FIELD_Y)
+        ix = group%ix_v
+        iy = group%iy_v
+
+        nd = size(group%shape_y)
+        allocate (lb(nd), ub(nd), n(nd))
+
+        lb = 1
+        ub = group%shape_y
+
+        select case(nd)
+          case (2)
+            call c_f_pointer(group%addrs_y(l), field2d, group%shape_y)
+          case (3)
+            call c_f_pointer(group%addrs_y(l), field3d, group%shape_y)
+          case (4)
+            call c_f_pointer(group%addrs_y(l), field4d, group%shape_y)
+        end select
+    end select
+
+    lb(ix) = group%unpack_is(ni)
+    lb(iy) = group%unpack_js(ni)
+
+    ub(ix) = group%unpack_ie(ni)
+    ub(iy) = group%unpack_je(ni)
+
+    n = ub - lb + 1
+
+    select case (nd)
+      case (2)
+        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2)) local(i, idx)
+            i(1:2) = [i1, i2]
+            idx = n(ix)*(i(iy)-lb(iy)) + (i(ix)-lb(ix)) + 1
+            field2d(i1,i2) = buffer(idx)
         end do
-     else if( group%unpack_type(n) == FIELD_X ) then
-        do l=1,nvector  ! loop over number of fields
-           do j = js, je
-              do i = is, ie
-                 pos = pos + 1
-                 !fieldx(i,j,k) = buffer(pos)
-                 call SET_VALUE_ (group, FIELD_X, l, i, j, k, buffer(pos))
-              end do
-           end do
+      case (3)
+        iz1 = 1 + 2 + 3 - ix - iy
+        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3)) local(i, idx)
+            i(1:3) = [i1, i2, i3]
+            idx = n(ix)*( n(iy)*(i(iz1)-1) + (i(iy)-lb(iy))) + (i(ix)-lb(ix)) + 1
+            field3d(i1,i2,i3) = buffer(idx)
         end do
-     else if( group%unpack_type(n) == FIELD_Y ) then
-        do l=1,nvector  ! loop over number of fields
-           do j = js, je
-              do i = is, ie
-                 pos = pos + 1
-                 !fieldy(i,j,k) = buffer(pos)
-                 call SET_VALUE_ (group, FIELD_Y, l, i, j, k, buffer(pos))
-              end do
-           end do
+      case (4)
+        iz1 = trailz(sum(2**[1,2,3,4]) - 2**ix - 2**iy)
+        iz2 = 1 + 2 + 3 + 4 - ix - iy - iz1
+        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3), i4=lb(4):ub(4)) local(i, idx)
+            i(1:4) = [i1, i2, i3, i4]
+            idx = n(ix)*( n(iy)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + (i(iy)-lb(iy))) + (i(ix)-lb(ix)) + 1
+            field4d(i1,i2,i3,i4) = buffer(idx)
         end do
-     endif
-  enddo
-endif
+    end select
+  end subroutine unpack_inner_loop
+
+end subroutine GROUP_UPDATE_UNPACK_
diff --git a/mpp/include/mpp_domains_misc.inc b/mpp/include/mpp_domains_misc.inc
index cf26a466b2..a9dcf70be3 100644
--- a/mpp/include/mpp_domains_misc.inc
+++ b/mpp/include/mpp_domains_misc.inc
@@ -1970,10 +1970,16 @@ end subroutine init_nonblock_type
 #define MPP_RESET_GROUP_UPDATE_FIELD_ mpp_reset_group_update_field_r8
 #undef MPP_RESET_GROUP_UPDATE_FIELD_V_
 #define MPP_RESET_GROUP_UPDATE_FIELD_V_ mpp_reset_group_update_field_r8_v
-#undef GET_VALUE_
-#define GET_VALUE_ get_value_r8
-#undef SET_VALUE_
-#define SET_VALUE_ set_value_r8
+#undef GROUP_UPDATE_PACK_
+#define GROUP_UPDATE_PACK_ group_update_pack_r8
+#undef GROUP_UPDATE_UNPACK_
+#define GROUP_UPDATE_UNPACK_ group_update_unpack_r8
+#undef SET_ZERO_
+#define SET_ZERO_ set_zero_r8
+#undef NEGATE_HALO_
+#define NEGATE_HALO_ negate_halo_r8
+#undef FLIP_I_
+#define FLIP_I_ flip_i_r8
 #include <mpp_group_update.fh>
 
 #undef MPP_TYPE_
@@ -1994,8 +2000,14 @@ end subroutine init_nonblock_type
 #define MPP_RESET_GROUP_UPDATE_FIELD_ mpp_reset_group_update_field_r4
 #undef MPP_RESET_GROUP_UPDATE_FIELD_V_
 #define MPP_RESET_GROUP_UPDATE_FIELD_V_ mpp_reset_group_update_field_r4_v
-#undef GET_VALUE_
-#define GET_VALUE_ get_value_r4
-#undef SET_VALUE_
-#define SET_VALUE_ set_value_r4
+#undef GROUP_UPDATE_PACK_
+#define GROUP_UPDATE_PACK_ group_update_pack_r4
+#undef GROUP_UPDATE_UNPACK_
+#define GROUP_UPDATE_UNPACK_ group_update_unpack_r4
+#undef SET_ZERO_
+#define SET_ZERO_ set_zero_r4
+#undef NEGATE_HALO_
+#define NEGATE_HALO_ negate_halo_r4
+#undef FLIP_I_
+#define FLIP_I_ flip_i_r4
 #include <mpp_group_update.fh>
diff --git a/mpp/include/mpp_domains_util.inc b/mpp/include/mpp_domains_util.inc
index 1e78a011dc..839a607fd5 100644
--- a/mpp/include/mpp_domains_util.inc
+++ b/mpp/include/mpp_domains_util.inc
@@ -1955,7 +1955,6 @@ end subroutine mpp_get_tile_compute_domains
      integer   :: send_size(3*MAXOVERLAP)
      integer   :: position_x, position_y, npack, nunpack, dir
      integer   :: pack_buffer_pos, unpack_buffer_pos
-     integer   :: omp_get_num_threads, nthreads
      character(len=8)            :: text
      type(overlap_type), pointer :: overPtr => NULL()
      type(overlapSpec), pointer  :: update_s => NULL()
@@ -2030,16 +2029,6 @@ end subroutine mpp_get_tile_compute_domains
         call mpp_error(FATAL, "set_group_update: nscalar and nvector are all 0")
      endif
 
-    nthreads = 1
-!$OMP PARALLEL
-!$   nthreads = omp_get_num_threads()
-!$OMP END PARALLEL
-     if( nthreads > nthread_control_loop ) then
-        group%k_loop_inside = .FALSE.
-     else
-        group%k_loop_inside = .TRUE.
-     endif
-
      if(nvector > 0) then
         !--- This check could not be done because of memory domain
 !        if( group%isize_x .NE. (group%ie_x-group%is_x+1) .OR. group%jsize_x .NE. (group%je_x-group%js_x+1))  &
diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh
index c84325ac6c..abc4dca866 100644
--- a/mpp/include/mpp_group_update.fh
+++ b/mpp/include/mpp_group_update.fh
@@ -372,13 +372,12 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
   integer   :: msgsize
   integer   :: from_pe, to_pe, buffer_pos, pos
   integer   :: ksize, is, ie, js, je
-  integer   :: n, l, m, i, j, k, buffer_start_pos, nk
+  integer   :: n, l, m, i, j, k, nk
   integer   :: shift, gridtype, midpoint
-  integer   :: npack, nunpack, rotation, isd
+  integer   :: rotation, isd
   type(c_ptr) :: stack_cptr !< Workaround for GFortran bug
 
   MPP_TYPE_, pointer :: buffer(:)
-  MPP_TYPE_, parameter :: zero_ = 0.
 
   nscalar = group%nscalar
   nvector = group%nvector
@@ -427,12 +426,10 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
   call mpp_clock_end(group_recv_clock)
 
   flags_v = group%flags_v
-  npack = group%npack
 
   call mpp_clock_begin(group_pack_clock)
   !pack the data
-  buffer_start_pos = 0
-#include <group_update_pack.inc>
+  call GROUP_UPDATE_PACK_ (group, buffer, 0)
   call mpp_clock_end(group_pack_clock)
 
   call mpp_clock_begin(group_send_clock)
@@ -453,9 +450,8 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
   endif
 
   !---unpack the buffer
-  nunpack = group%nunpack
   call mpp_clock_begin(group_unpk_clock)
-#include <group_update_unpack.inc>
+  call GROUP_UPDATE_UNPACK_ (group, buffer, 0)
   call mpp_clock_end(group_unpk_clock)
 
   ! ---northern boundary fold
@@ -472,12 +468,8 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
            if( .NOT. domain%symmetry ) is = is - 1
            do i = is ,ie, midpoint
               if( domain%x(1)%domain_data%begin.LE.i .AND. i.LE. domain%x(1)%domain_data%end+shift )then
-                 do l=1,nvector
-                    do k = 1,ksize
-                       call SET_VALUE_ (group, FIELD_X, l, i, j, k, zero_)
-                       call SET_VALUE_ (group, FIELD_Y, l, i, j, k, zero_)
-                    end do
-                 end do
+                 call SET_ZERO_ (group, FIELD_X, i, j)
+                 call SET_ZERO_ (group, FIELD_Y, i, j)
               end if
            end do
         endif
@@ -497,16 +489,9 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
 
                  if( 2*is-domain%x(1)%domain_data%begin.GT.domain%x(1)%domain_data%end+shift ) &
                       call mpp_error( FATAL, 'MPP_DO_UPDATE_V: folded-north BGRID_NE west edge ubound error.' )
-                 do l=1,nvector
-                    do k = 1,ksize
-                       do i = domain%x(1)%domain_data%begin,is-1
-                          !fieldx(i,j,k) = fieldx(2*is-i,j,k)
-                          !fieldy(i,j,k) = fieldy(2*is-i,j,k)
-                          call SET_VALUE_ (group, FIELD_X, l, i, j, k, GET_VALUE_ (group, FIELD_X, l, 2*is-i, j, k))
-                          call SET_VALUE_ (group, FIELD_Y, l, i, j, k, GET_VALUE_ (group, FIELD_Y, l, 2*is-i, j, k))
-                       end do
-                    end do
-                 end do
+
+                 call FLIP_I_ (group, FIELD_X, j, domain%x(1)%domain_data%begin, is-1, 2*is)
+                 call FLIP_I_ (group, FIELD_Y, j, domain%x(1)%domain_data%begin, is-1, 2*is)
               end if
            case(CGRID_NE)
               is = domain%x(1)%global%begin
@@ -514,14 +499,8 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
               if( is.GT.isd )then
                  if( 2*is-domain%x(1)%domain_data%begin-1.GT.domain%x(1)%domain_data%end ) &
                       call mpp_error( FATAL, 'MPP_DO_UPDATE_V: folded-north CGRID_NE west edge ubound error.' )
-                 do l=1,nvector
-                    do k = 1,ksize
-                       do i = isd,is-1
-                          !fieldy(i,j,k) = fieldy(2*is-i-1,j,k)
-                          call SET_VALUE_ (group, FIELD_Y, l, i, j, k, GET_VALUE_ (group, FIELD_Y, l, 2*is-i-1, j, k))
-                       end do
-                    end do
-                 end do
+
+                 call FLIP_I_ (group, FIELD_Y, j, isd, is-1, 2*is-1)
               end if
            end select
         end if
@@ -534,25 +513,10 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
            case(BGRID_NE)
               is = is + shift
               ie = ie + shift
-              do l=1,nvector
-                 do k = 1,ksize
-                    do i = is,ie
-                       !fieldx(i,j,k) = -fieldx(i,j,k)
-                       !fieldy(i,j,k) = -fieldy(i,j,k)
-                       call SET_VALUE_ (group, FIELD_X, l, i, j, k, - GET_VALUE_ (group, FIELD_X, l, i, j, k))
-                       call SET_VALUE_ (group, FIELD_Y, l, i, j, k, - GET_VALUE_ (group, FIELD_Y, l, i, j, k))
-                    end do
-                 end do
-              end do
+              call NEGATE_HALO_ (group, FIELD_X, j, is, ie)
+              call NEGATE_HALO_ (group, FIELD_Y, j, is, ie)
            case(CGRID_NE)
-              do l=1,nvector
-                 do k = 1,ksize
-                    do i = is, ie
-                       !fieldy(i,j,k) = -fieldy(i,j,k)
-                       call SET_VALUE_ (group, FIELD_Y, l, i, j, k, - GET_VALUE_ (group, FIELD_Y, l, i, j, k))
-                    end do
-                 end do
-              end do
+              call NEGATE_HALO_ (group, FIELD_Y, j, is, ie)
            end select
         end if
      end if
@@ -578,10 +542,10 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer)
 
   integer   :: nscalar, nvector
   integer   :: nsend, nrecv, flags_v
-  integer   :: msgsize, npack, rotation
+  integer   :: msgsize, rotation
   integer   :: from_pe, to_pe, buffer_pos, pos
   integer   :: ksize, is, ie, js, je
-  integer   :: n, l, m, i, j, k, buffer_start_pos, nk
+  integer   :: n, l, m, i, j, k, nk
   logical   :: reuse_buf_pos
   character(len=8)            :: text
   type(c_ptr) :: stack_cptr !< Workaround for GFortran bug
@@ -652,9 +616,7 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer)
 
   !pack the data
   call mpp_clock_begin(nonblock_group_pack_clock)
-  npack = group%npack
-  buffer_start_pos = group%buffer_start_pos
-#include <group_update_pack.inc>
+  call GROUP_UPDATE_PACK_ (group, buffer, group%buffer_start_pos)
   call mpp_clock_end(nonblock_group_pack_clock)
 
   call mpp_clock_begin(nonblock_group_send_clock)
@@ -677,15 +639,14 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type)
   MPP_TYPE_,                   intent(in)    :: d_type
 
   integer   :: nsend, nrecv, nscalar, nvector
-  integer   :: k, buffer_pos, pos, m, n, l
+  integer   :: k, pos, m, n, l
   integer   :: is, ie, js, je, ksize, i, j
   integer   :: shift, gridtype, midpoint, flags_v
-  integer   :: nunpack, rotation, buffer_start_pos, nk, isd
+  integer   :: rotation, nk, isd
   logical   :: recv_y(8)
   type(c_ptr) :: stack_cptr !< Workaround for GFortran bug
 
   MPP_TYPE_, pointer :: buffer(:)
-  MPP_TYPE_, parameter :: zero_ = 0.
 
   gridtype = group%gridtype
   flags_v = group%flags_v
@@ -716,11 +677,9 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type)
   endif
 
   !---unpack the buffer
-  nunpack = group%nunpack
 
   call mpp_clock_begin(nonblock_group_unpk_clock)
-  buffer_start_pos = group%buffer_start_pos
-#include <group_update_unpack.inc>
+  call GROUP_UPDATE_UNPACK_ (group, buffer, group%buffer_start_pos)
   call mpp_clock_end(nonblock_group_unpk_clock)
 
   ! ---northern boundary fold
@@ -737,12 +696,8 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type)
            if( .NOT. domain%symmetry ) is = is - 1
            do i = is ,ie, midpoint
               if( domain%x(1)%domain_data%begin.LE.i .AND. i.LE. domain%x(1)%domain_data%end+shift )then
-                 do l=1,nvector
-                    do k = 1,ksize
-                       call SET_VALUE_ (group, FIELD_X, l, i, j, k, zero_)
-                       call SET_VALUE_ (group, FIELD_Y, l, i, j, k, zero_)
-                    end do
-                 end do
+                 call SET_ZERO_ (group, FIELD_X, i, j)
+                 call SET_ZERO_ (group, FIELD_Y, i, j)
               end if
            end do
         endif
@@ -762,16 +717,8 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type)
 
                  if( 2*is-domain%x(1)%domain_data%begin.GT.domain%x(1)%domain_data%end+shift ) &
                       call mpp_error( FATAL, 'MPP_DO_UPDATE_V: folded-north BGRID_NE west edge ubound error.' )
-                 do l=1,nvector
-                    do k = 1,ksize
-                       do i = domain%x(1)%domain_data%begin,is-1
-                          !fieldx(i,j,k) = fieldx(2*is-i,j,k)
-                          !fieldy(i,j,k) = fieldy(2*is-i,j,k)
-                          call SET_VALUE_ (group, FIELD_X, l, i, j, k, GET_VALUE_ (group, FIELD_X, l, 2*is-i, j, k))
-                          call SET_VALUE_ (group, FIELD_Y, l, i, j, k, GET_VALUE_ (group, FIELD_Y, l, 2*is-i, j, k))
-                       end do
-                    end do
-                 end do
+                 call FLIP_I_ (group, FIELD_X, j, domain%x(1)%domain_data%begin, is-1, 2*is)
+                 call FLIP_I_ (group, FIELD_Y, j, domain%x(1)%domain_data%begin, is-1, 2*is)
               end if
            case(CGRID_NE)
               is = domain%x(1)%global%begin
@@ -779,14 +726,7 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type)
               if( is.GT.isd)then
                  if( 2*is-domain%x(1)%domain_data%begin-1.GT.domain%x(1)%domain_data%end ) &
                       call mpp_error( FATAL, 'MPP_DO_UPDATE_V: folded-north CGRID_NE west edge ubound error.' )
-                 do l=1,nvector
-                    do k = 1,ksize
-                       do i = isd,is-1
-                          !fieldy(i,j,k) = fieldy(2*is-i-1,j,k)
-                          call SET_VALUE_ (group, FIELD_Y, l, i, j, k, GET_VALUE_ (group, FIELD_Y, l, 2*is-i-1, j, k))
-                       end do
-                    end do
-                 end do
+                 call FLIP_I_ (group, FIELD_Y, j, isd, is-1, 2*is-1)
               end if
            end select
         end if
@@ -799,25 +739,10 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type)
            case(BGRID_NE)
               is = is + shift
               ie = ie + shift
-              do l=1,nvector
-                 do k = 1,ksize
-                    do i = is,ie
-                       !fieldx(i,j,k) = -fieldx(i,j,k)
-                       !fieldy(i,j,k) = -fieldy(i,j,k)
-                       call SET_VALUE_ (group, FIELD_X, l, i, j, k, - GET_VALUE_ (group, FIELD_X, l, i, j, k))
-                       call SET_VALUE_ (group, FIELD_Y, l, i, j, k, - GET_VALUE_ (group, FIELD_Y, l, i, j, k))
-                    end do
-                 end do
-              end do
+              call NEGATE_HALO_ (group, FIELD_X, j, is, ie)
+              call NEGATE_HALO_ (group, FIELD_Y, j, is, ie)
            case(CGRID_NE)
-              do l=1,nvector
-                 do k = 1,ksize
-                    do i = is, ie
-                       !fieldy(i,j,k) = -fieldy(i,j,k)
-                       call SET_VALUE_ (group, FIELD_Y, l, i, j, k, - GET_VALUE_ (group, FIELD_Y, l, i, j, k))
-                    end do
-                 end do
-              end do
+              call NEGATE_HALO_ (group, FIELD_Y, j, is, ie)
            end select
         end if
      end if
@@ -890,224 +815,230 @@ subroutine MPP_RESET_GROUP_UPDATE_FIELD_V_(group, fieldx, fieldy)
   group%addrs_y(group%reset_index_v) = c_loc(fieldy)
 end subroutine MPP_RESET_GROUP_UPDATE_FIELD_V_
 
-function GET_VALUE_ (group, field_type, l, i, j, k) result(res)
+subroutine SET_ZERO_  (group, field_type, i0, j0)
   type(mpp_group_update_type), intent(in), target :: group
   integer, intent(in) :: field_type !< FIELD_S, FIELD_X, or FIELD_Y
-  integer, intent(in) :: l !< Index of the field within group
-  integer, intent(in) :: i, j, k !< i,j,k indices of the value to retrieve
-  MPP_TYPE_ :: res
+  integer, intent(in) :: i0, j0 !< i,j point to initialize to zero
 
-  integer, allocatable, dimension(:) :: lb, n, indx
+  integer, allocatable, dimension(:) :: lb, ub
   MPP_TYPE_, pointer :: field2d(:,:)
   MPP_TYPE_, pointer :: field3d(:,:,:)
   MPP_TYPE_, pointer :: field4d(:,:,:,:)
+  MPP_TYPE_, parameter :: zero_ = 0.
 
+  integer :: nd !< Number of dimensions
   integer :: ix, iy !< Indices of the domain-decomposed dimensions
-  integer :: is, ie, js, je !< Starting and ending indices of the x and y dimensions
-  integer :: nd, m, kp, d
-
-  select case (field_type)
-    case (FIELD_S)
-      ix = group%ix_s
-      iy = group%iy_s
-      is = group%is_s
-      ie = group%ie_s
-      js = group%js_s
-      je = group%je_s
-
-      nd = size(group%shape_s)
-      allocate (lb(nd), n(nd), indx(nd))
-
-      lb = 1
-      n = group%shape_s
-
-      select case(nd)
-        case (2)
-          call c_f_pointer(group%addrs_s(l), field2d, group%shape_s)
-        case (3)
-          call c_f_pointer(group%addrs_s(l), field3d, group%shape_s)
-        case (4)
-          call c_f_pointer(group%addrs_s(l), field4d, group%shape_s)
-      end select
-    case (FIELD_X)
-      ix = group%ix_v
-      iy = group%iy_v
-      is = group%is_x
-      ie = group%ie_x
-      js = group%js_x
-      je = group%je_x
-
-      nd = size(group%shape_x)
-      allocate (lb(nd), n(nd), indx(nd))
-
-      lb = 1
-      n = group%shape_x
-
-      select case(nd)
-        case (2)
-          call c_f_pointer(group%addrs_x(l), field2d, group%shape_x)
-        case (3)
-          call c_f_pointer(group%addrs_x(l), field3d, group%shape_x)
-        case (4)
-          call c_f_pointer(group%addrs_x(l), field4d, group%shape_x)
-      end select
-    case (FIELD_Y)
-      ix = group%ix_v
-      iy = group%iy_v
-      is = group%is_y
-      ie = group%ie_y
-      js = group%js_y
-      je = group%je_y
-
-      nd = size(group%shape_y)
-      allocate (lb(nd), n(nd), indx(nd))
-
-      lb = 1
-      n = group%shape_y
-
-      select case(nd)
-        case (2)
-          call c_f_pointer(group%addrs_y(l), field2d, group%shape_y)
-        case (3)
-          call c_f_pointer(group%addrs_y(l), field3d, group%shape_y)
-        case (4)
-          call c_f_pointer(group%addrs_y(l), field4d, group%shape_y)
-      end select
-  end select
-
-  lb(ix) = i - is + 1
-  lb(iy) = j - js + 1
-
-  n(ix) = 1
-  n(iy) = 1
-
-  kp = k - 1
-  do d=nd,1,-1
-    m = product(n(1:d-1)) ! Product of all lower dimension sizes
-    indx(d) = kp / m
-    kp = kp - indx(d)*m
+  integer :: l, i1, i2, i3, i4
+
+  ix = group%ix_v
+  iy = group%iy_v
+
+  nd = size(group%shape_x) ! Assume that dimensions of x and y fields must agree
+  allocate (lb(nd), ub(nd))
+
+  do l=1,group%nvector
+    select case (field_type)
+      case (FIELD_X)
+        lb = 1
+        ub = group%shape_x
+
+        select case(nd)
+          case (2)
+            call c_f_pointer(group%addrs_x(l), field2d, group%shape_x)
+          case (3)
+            call c_f_pointer(group%addrs_x(l), field3d, group%shape_x)
+          case (4)
+            call c_f_pointer(group%addrs_x(l), field4d, group%shape_x)
+        end select
+      case (FIELD_Y)
+        lb = 1
+        ub = group%shape_y
+
+        select case(nd)
+          case (2)
+            call c_f_pointer(group%addrs_y(l), field2d, group%shape_y)
+          case (3)
+            call c_f_pointer(group%addrs_y(l), field3d, group%shape_y)
+          case (4)
+            call c_f_pointer(group%addrs_y(l), field4d, group%shape_y)
+        end select
+    end select
+
+    lb(ix) = i0
+    ub(ix) = i0
+
+    lb(iy) = j0
+    ub(iy) = j0
+
+    select case(nd)
+      case (2)
+        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2))
+          field2d(i1, i2) = zero_
+        end do
+      case (3)
+        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3))
+          field3d(i1, i2, i3) = zero_
+        end do
+      case (4)
+        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3), i4=lb(4):ub(4))
+          field4d(i1, i2, i3, i4) = zero_
+        end do
+    end select
   enddo
+end subroutine SET_ZERO_
 
-  indx = lb + indx
+subroutine NEGATE_HALO_  (group, field_type, j0, is, ie)
+  type(mpp_group_update_type), intent(in), target :: group
+  integer, intent(in) :: field_type !< FIELD_S, FIELD_X, or FIELD_Y
+  integer, intent(in) :: j0 !< j point to negate
+  integer, intent(in) :: is, ie !< range of i indices to negate
 
-  select case (nd)
-    case (2)
-      res = field2d(indx(1), indx(2))
-    case (3)
-      res = field3d(indx(1), indx(2), indx(3))
-    case (4)
-      res = field4d(indx(1), indx(2), indx(3), indx(4))
-  end select
-end function GET_VALUE_
+  integer, allocatable, dimension(:) :: lb, ub
+  MPP_TYPE_, pointer :: field2d(:,:)
+  MPP_TYPE_, pointer :: field3d(:,:,:)
+  MPP_TYPE_, pointer :: field4d(:,:,:,:)
+
+  integer :: nd !< Number of dimensions
+  integer :: ix, iy !< Indices of the domain-decomposed dimensions
+  integer :: l, i1, i2, i3, i4
+
+  ix = group%ix_v
+  iy = group%iy_v
+
+  nd = size(group%shape_x) ! Assume that dimensions of x and y fields must agree
+  allocate (lb(nd), ub(nd))
+
+  do l=1,group%nvector
+    select case (field_type)
+      case (FIELD_X)
+        lb = 1
+        ub = group%shape_x
+
+        select case(nd)
+          case (2)
+            call c_f_pointer(group%addrs_x(l), field2d, group%shape_x)
+          case (3)
+            call c_f_pointer(group%addrs_x(l), field3d, group%shape_x)
+          case (4)
+            call c_f_pointer(group%addrs_x(l), field4d, group%shape_x)
+        end select
+      case (FIELD_Y)
+        lb = 1
+        ub = group%shape_y
+
+        select case(nd)
+          case (2)
+            call c_f_pointer(group%addrs_y(l), field2d, group%shape_y)
+          case (3)
+            call c_f_pointer(group%addrs_y(l), field3d, group%shape_y)
+          case (4)
+            call c_f_pointer(group%addrs_y(l), field4d, group%shape_y)
+        end select
+    end select
+
+    lb(ix) = is
+    ub(ix) = ie
+
+    lb(iy) = j0
+    ub(iy) = j0
+
+    select case(nd)
+      case (2)
+        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2))
+          field2d(i1,i2) = -field2d(i1,i2)
+        end do
+      case (3)
+        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3))
+          field3d(i1,i2,i3) = -field3d(i1,i2,i3)
+        end do
+      case (4)
+        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3), i4=lb(4):ub(4))
+          field4d(i1,i2,i3,i4) = -field4d(i1,i2,i3,i4)
+        end do
+    end select
+  enddo
+end subroutine NEGATE_HALO_
 
-subroutine SET_VALUE_ (group, field_type, l, i, j, k, x)
+subroutine FLIP_I_  (group, field_type, j0, is, ie, offset)
   type(mpp_group_update_type), intent(in), target :: group
   integer, intent(in) :: field_type !< FIELD_S, FIELD_X, or FIELD_Y
-  integer, intent(in) :: l !< Index of the field within group
-  integer, intent(in) :: i, j, k !< i,j,k indices of the value to retrieve
-  MPP_TYPE_, intent(in) :: x
+  integer, intent(in) :: j0 !< j point to negate
+  integer, intent(in) :: is, ie !< range of i indices to negate
+  integer, intent(in) :: offset !< Offset when flipping the i axis
 
-  integer, allocatable, dimension(:) :: lb, n, indx
+  integer, allocatable, dimension(:) :: lb, ub, src
   MPP_TYPE_, pointer :: field2d(:,:)
   MPP_TYPE_, pointer :: field3d(:,:,:)
   MPP_TYPE_, pointer :: field4d(:,:,:,:)
 
+  integer :: nd !< Number of dimensions
   integer :: ix, iy !< Indices of the domain-decomposed dimensions
-  integer :: is, ie, js, je !< Starting and ending indices of the x and y dimensions
-  integer :: nd, m, kp, d
-
-  select case (field_type)
-    case (FIELD_S)
-      ix = group%ix_s
-      iy = group%iy_s
-      is = group%is_s
-      ie = group%ie_s
-      js = group%js_s
-      je = group%je_s
-
-      nd = size(group%shape_s)
-      allocate (lb(nd), n(nd), indx(nd))
-
-      lb = 1
-      n = group%shape_s
-
-      select case(nd)
-        case (2)
-          call c_f_pointer(group%addrs_s(l), field2d, group%shape_s)
-        case (3)
-          call c_f_pointer(group%addrs_s(l), field3d, group%shape_s)
-        case (4)
-          call c_f_pointer(group%addrs_s(l), field4d, group%shape_s)
-      end select
-    case (FIELD_X)
-      ix = group%ix_v
-      iy = group%iy_v
-      is = group%is_x
-      ie = group%ie_x
-      js = group%js_x
-      je = group%je_x
-
-      nd = size(group%shape_x)
-      allocate (lb(nd), n(nd), indx(nd))
-
-      lb = 1
-      n = group%shape_x
-
-      select case(nd)
-        case (2)
-          call c_f_pointer(group%addrs_x(l), field2d, group%shape_x)
-        case (3)
-          call c_f_pointer(group%addrs_x(l), field3d, group%shape_x)
-        case (4)
-          call c_f_pointer(group%addrs_x(l), field4d, group%shape_x)
-      end select
-    case (FIELD_Y)
-      ix = group%ix_v
-      iy = group%iy_v
-      is = group%is_y
-      ie = group%ie_y
-      js = group%js_y
-      je = group%je_y
-
-      nd = size(group%shape_y)
-      allocate (lb(nd), n(nd), indx(nd))
-
-      lb = 1
-      n = group%shape_y
-
-      select case(nd)
-        case (2)
-          call c_f_pointer(group%addrs_y(l), field2d, group%shape_y)
-        case (3)
-          call c_f_pointer(group%addrs_y(l), field3d, group%shape_y)
-        case (4)
-          call c_f_pointer(group%addrs_y(l), field4d, group%shape_y)
-      end select
-  end select
-
-  lb(ix) = i - is + 1
-  lb(iy) = j - js + 1
-
-  n(ix) = 1
-  n(iy) = 1
-
-  kp = k - 1
-  do d=nd,1,-1
-    m = product(n(1:d-1)) ! Product of all lower dimension sizes
-    indx(d) = kp / m
-    kp = kp - indx(d)*m
+  integer :: l, i1, i2, i3, i4
+
+  ix = group%ix_v
+  iy = group%iy_v
+
+  nd = size(group%shape_x) ! Assume that dimensions of x and y fields must agree
+  allocate (lb(nd), ub(nd), src(nd))
+
+  do l=1,group%nvector
+    select case (field_type)
+      case (FIELD_X)
+        lb = 1
+        ub = group%shape_x
+
+        select case(nd)
+          case (2)
+            call c_f_pointer(group%addrs_x(l), field2d, group%shape_x)
+          case (3)
+            call c_f_pointer(group%addrs_x(l), field3d, group%shape_x)
+          case (4)
+            call c_f_pointer(group%addrs_x(l), field4d, group%shape_x)
+        end select
+      case (FIELD_Y)
+        lb = 1
+        ub = group%shape_y
+
+        select case(nd)
+          case (2)
+            call c_f_pointer(group%addrs_y(l), field2d, group%shape_y)
+          case (3)
+            call c_f_pointer(group%addrs_y(l), field3d, group%shape_y)
+          case (4)
+            call c_f_pointer(group%addrs_y(l), field4d, group%shape_y)
+        end select
+    end select
+
+    lb(ix) = is
+    ub(ix) = ie
+
+    lb(iy) = j0
+    ub(iy) = j0
+
+    select case(nd)
+      case (2)
+        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2))
+          src = [i1, i2]
+          src(ix) = offset - src(ix)
+          field2d(i1,i2) = field2d(src(1),src(2))
+        end do
+      case (3)
+        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3))
+          src = [i1, i2, i3]
+          src(ix) = offset - src(ix)
+          field3d(i1,i2,i3) = field3d(src(1),src(2),src(3))
+        end do
+      case (4)
+        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3), i4=lb(4):ub(4))
+          src = [i1, i2, i3, i4]
+          src(ix) = offset - src(ix)
+          field4d(i1,i2,i3,i4) = field4d(src(1),src(2),src(3),src(4))
+        end do
+    end select
   enddo
+end subroutine FLIP_I_
 
-  indx = lb + indx
-
-  select case (nd)
-    case (2)
-      field2d(indx(1), indx(2)) = x
-    case (3)
-      field3d(indx(1), indx(2), indx(3)) = x
-    case (4)
-      field4d(indx(1), indx(2), indx(3), indx(4)) = x
-  end select
-end subroutine SET_VALUE_
+#include "group_update_pack.inc"
+#include "group_update_unpack.inc"
 
 !> @}
diff --git a/mpp/mpp_domains.F90 b/mpp/mpp_domains.F90
index d09e5f262b..bd2f10f129 100644
--- a/mpp/mpp_domains.F90
+++ b/mpp/mpp_domains.F90
@@ -578,7 +578,6 @@ module mpp_domains_mod
   type :: mpp_group_update_type
      private
      logical            :: initialized = .FALSE.
-     logical            :: k_loop_inside = .TRUE.
      logical            :: nonsym_edge = .FALSE.
      integer            :: nscalar = 0
      integer            :: nvector = 0
@@ -745,17 +744,11 @@ module mpp_domains_mod
   logical           :: debug_message_passing = .false. !<  Will check the consistency on the boundary between
                                                        !! processor/tile when updating domain for symmetric domain and
                                                        !! check the consistency on the north folded edge.
-  integer           :: nthread_control_loop = 8 !< Determine the loop order for packing and unpacking.
-                                                !! When number of threads is greater than nthread_control_loop,
-                                                !! the k-loop will be moved outside and combined with number
-                                                !! of pack and unpack. When the number of threads is
-                                                !! less than or equal to nthread_control_loop, the k-loop
-                                                !! is moved inside, but still outside, of j,i loop.
   logical           :: efp_sum_overflow_check = .false. !< If .true., always do overflow_check
                                                         !! when doing EFP bitwise mpp_global_sum.
   logical           :: use_alltoallw = .false.
-  namelist /mpp_domains_nml/ debug_update_domain, domain_clocks_on, debug_message_passing, nthread_control_loop, &
-                             efp_sum_overflow_check, use_alltoallw
+  namelist /mpp_domains_nml/ debug_update_domain, domain_clocks_on, debug_message_passing, efp_sum_overflow_check, &
+                             use_alltoallw
 
   !***********************************************************************
 
diff --git a/test_fms/mpp/test_mpp_domains2.sh b/test_fms/mpp/test_mpp_domains2.sh
index a9b26f2cd3..d4ed7aaeec 100755
--- a/test_fms/mpp/test_mpp_domains2.sh
+++ b/test_fms/mpp/test_mpp_domains2.sh
@@ -27,9 +27,6 @@
 # Set common test settings.
 . ../test-lib.sh
 
-# TODO: Enable this test once generalized indices work is complete
-SKIP_TESTS="test_mpp_domains2.12"
-
 # TODO edge update, fails on non-blocking with gnu
 #SKIP_TESTS="$SKIP_TESTS $(basename $0 .sh).6"
 

From 40c55c396f07a9f3f383f87fac29fc81027832c4 Mon Sep 17 00:00:00 2001
From: Jesse Lentz <jesse.lentz@noaa.gov>
Date: Fri, 12 Jun 2026 14:01:29 -0400
Subject: [PATCH 2/5] Use locally scoped variables in do concurrent loops

Use locally scoped variables within do concurrent loops rather than
subroutine-scope variables.
---
 mpp/include/group_update_pack.inc   | 46 ++++++++++++++++++-----------
 mpp/include/group_update_unpack.inc | 39 +++++++++++++++---------
 mpp/include/mpp_group_update.fh     | 34 ++++++++++++++-------
 3 files changed, 76 insertions(+), 43 deletions(-)

diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc
index 7134a72bc5..35201a96c1 100644
--- a/mpp/include/group_update_pack.inc
+++ b/mpp/include/group_update_pack.inc
@@ -152,9 +152,8 @@ contains
 
     integer :: ix, iy, iz1, iz2 !< Indices of the domain-decomposed dimensions
     integer :: nd !< Number of dimensions
-    integer :: idx !< Index into the 1D buffer
     integer :: sgn !< Factor to multiply the filled values by
-    integer :: i1, i2, i3, i4, i(4)
+    integer :: i1, i2, i3, i4
     integer :: ix2, iy2
 
     select case (field_type)
@@ -249,30 +248,41 @@ contains
 
     select case (nd)
       case (2)
-        do concurrent (i1=lb(1):ub(1):stride(1), i2=lb(2):ub(2):stride(2)) local(i, idx)
-          i(1:2) = [i1, i2]
-          idx = n(ix2)*stride(iy2)*(i(iy2)-lb(iy2)) + &
-                stride(ix2)*(i(ix2)-lb(ix2)) + 1
-          buffer(idx) = sgn * field2d(i1,i2)
+        do concurrent (i1=lb(1):ub(1):stride(1), i2=lb(2):ub(2):stride(2))
+          block
+            integer :: i(2), idx
+
+            i = [i1, i2]
+            idx = n(ix2)*stride(iy2)*(i(iy2)-lb(iy2)) + &
+                  stride(ix2)*(i(ix2)-lb(ix2)) + 1
+            buffer(idx) = sgn * field2d(i1,i2)
+          end block
         end do
       case (3)
         iz1 = 1 + 2 + 3 - ix - iy
-        do concurrent (i1=lb(1):ub(1):stride(1), i2=lb(2):ub(2):stride(2), &
-                       i3=lb(3):ub(3):stride(3)) local(i, idx)
-          i(1:3) = [i1, i2, i3]
-          idx = n(ix2)*( n(iy2)*(i(iz1)-1) + stride(iy2)*(i(iy2)-lb(iy2))) + &
-                stride(ix2)*(i(ix2)-lb(ix2)) + 1
-          buffer(idx) = sgn * field3d(i1,i2,i3)
+        do concurrent (i1=lb(1):ub(1):stride(1), i2=lb(2):ub(2):stride(2), i3=lb(3):ub(3):stride(3))
+          block
+            integer :: i(3), idx
+
+            i = [i1, i2, i3]
+            idx = n(ix2)*( n(iy2)*(i(iz1)-1) + stride(iy2)*(i(iy2)-lb(iy2))) + &
+                  stride(ix2)*(i(ix2)-lb(ix2)) + 1
+            buffer(idx) = sgn * field3d(i1,i2,i3)
+          end block
         end do
       case (4)
         iz1 = trailz(sum(2**[1,2,3,4]) - 2**ix - 2**iy)
         iz2 = 1 + 2 + 3 + 4 - ix - iy - iz1
         do concurrent (i1=lb(1):ub(1):stride(1), i2=lb(2):ub(2):stride(2), &
-                       i3=lb(3):ub(3):stride(3), i4=lb(4):ub(4):stride(4)) local(i, idx)
-          i(1:4) = [i1, i2, i3, i4]
-          idx = n(ix2)*( n(iy2)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + stride(iy2)*(i(iy2)-lb(iy2))) + &
-                stride(ix2)*(i(ix2)-lb(ix2)) + 1
-          buffer(idx) = sgn * field4d(i1,i2,i3,i4)
+                       i3=lb(3):ub(3):stride(3), i4=lb(4):ub(4):stride(4))
+          block
+            integer :: i(4), idx
+
+            i = [i1, i2, i3, i4]
+            idx = n(ix2)*( n(iy2)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + stride(iy2)*(i(iy2)-lb(iy2))) + &
+                  stride(ix2)*(i(ix2)-lb(ix2)) + 1
+            buffer(idx) = sgn * field4d(i1,i2,i3,i4)
+          end block
         end do
     end select
   end subroutine pack_inner_loop
diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc
index 24d946273e..48b24c708a 100644
--- a/mpp/include/group_update_unpack.inc
+++ b/mpp/include/group_update_unpack.inc
@@ -57,8 +57,7 @@ contains
 
     integer :: ix, iy, iz1, iz2 !< Indices of the domain-decomposed dimensions
     integer :: nd !< Number of dimensions
-    integer :: idx !< Index into the 1D buffer
-    integer :: i1, i2, i3, i4, i(4)
+    integer :: i1, i2, i3, i4
 
     select case (field_type)
       case (FIELD_S)
@@ -127,25 +126,37 @@ contains
 
     select case (nd)
       case (2)
-        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2)) local(i, idx)
-            i(1:2) = [i1, i2]
-            idx = n(ix)*(i(iy)-lb(iy)) + (i(ix)-lb(ix)) + 1
-            field2d(i1,i2) = buffer(idx)
+        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2))
+            block
+              integer :: i(2), idx
+
+              i = [i1, i2]
+              idx = n(ix)*(i(iy)-lb(iy)) + (i(ix)-lb(ix)) + 1
+              field2d(i1,i2) = buffer(idx)
+            end block
         end do
       case (3)
         iz1 = 1 + 2 + 3 - ix - iy
-        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3)) local(i, idx)
-            i(1:3) = [i1, i2, i3]
-            idx = n(ix)*( n(iy)*(i(iz1)-1) + (i(iy)-lb(iy))) + (i(ix)-lb(ix)) + 1
-            field3d(i1,i2,i3) = buffer(idx)
+        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3))
+            block
+              integer :: i(3), idx
+
+              i = [i1, i2, i3]
+              idx = n(ix)*( n(iy)*(i(iz1)-1) + (i(iy)-lb(iy))) + (i(ix)-lb(ix)) + 1
+              field3d(i1,i2,i3) = buffer(idx)
+            end block
         end do
       case (4)
         iz1 = trailz(sum(2**[1,2,3,4]) - 2**ix - 2**iy)
         iz2 = 1 + 2 + 3 + 4 - ix - iy - iz1
-        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3), i4=lb(4):ub(4)) local(i, idx)
-            i(1:4) = [i1, i2, i3, i4]
-            idx = n(ix)*( n(iy)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + (i(iy)-lb(iy))) + (i(ix)-lb(ix)) + 1
-            field4d(i1,i2,i3,i4) = buffer(idx)
+        do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3), i4=lb(4):ub(4))
+            block
+              integer :: i(4), idx
+
+              i = [i1, i2, i3, i4]
+              idx = n(ix)*( n(iy)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + (i(iy)-lb(iy))) + (i(ix)-lb(ix)) + 1
+              field4d(i1,i2,i3,i4) = buffer(idx)
+            end block
         end do
     end select
   end subroutine unpack_inner_loop
diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh
index abc4dca866..48657c951a 100644
--- a/mpp/include/mpp_group_update.fh
+++ b/mpp/include/mpp_group_update.fh
@@ -966,7 +966,7 @@ subroutine FLIP_I_  (group, field_type, j0, is, ie, offset)
   integer, intent(in) :: is, ie !< range of i indices to negate
   integer, intent(in) :: offset !< Offset when flipping the i axis
 
-  integer, allocatable, dimension(:) :: lb, ub, src
+  integer, allocatable, dimension(:) :: lb, ub
   MPP_TYPE_, pointer :: field2d(:,:)
   MPP_TYPE_, pointer :: field3d(:,:,:)
   MPP_TYPE_, pointer :: field4d(:,:,:,:)
@@ -979,7 +979,7 @@ subroutine FLIP_I_  (group, field_type, j0, is, ie, offset)
   iy = group%iy_v
 
   nd = size(group%shape_x) ! Assume that dimensions of x and y fields must agree
-  allocate (lb(nd), ub(nd), src(nd))
+  allocate (lb(nd), ub(nd))
 
   do l=1,group%nvector
     select case (field_type)
@@ -1018,21 +1018,33 @@ subroutine FLIP_I_  (group, field_type, j0, is, ie, offset)
     select case(nd)
       case (2)
         do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2))
-          src = [i1, i2]
-          src(ix) = offset - src(ix)
-          field2d(i1,i2) = field2d(src(1),src(2))
+          block
+            integer :: src(2)
+
+            src = [i1, i2]
+            src(ix) = offset - src(ix)
+            field2d(i1,i2) = field2d(src(1),src(2))
+          end block
         end do
       case (3)
         do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3))
-          src = [i1, i2, i3]
-          src(ix) = offset - src(ix)
-          field3d(i1,i2,i3) = field3d(src(1),src(2),src(3))
+          block
+            integer :: src(3)
+
+            src = [i1, i2, i3]
+            src(ix) = offset - src(ix)
+            field3d(i1,i2,i3) = field3d(src(1),src(2),src(3))
+          end block
         end do
       case (4)
         do concurrent (i1=lb(1):ub(1), i2=lb(2):ub(2), i3=lb(3):ub(3), i4=lb(4):ub(4))
-          src = [i1, i2, i3, i4]
-          src(ix) = offset - src(ix)
-          field4d(i1,i2,i3,i4) = field4d(src(1),src(2),src(3),src(4))
+          block
+            integer :: src(4)
+
+            src = [i1, i2, i3, i4]
+            src(ix) = offset - src(ix)
+            field4d(i1,i2,i3,i4) = field4d(src(1),src(2),src(3),src(4))
+          end block
         end do
     end select
   enddo

From 93f0dfd5aae8ce8bdb8d95d6019485e0600daa06 Mon Sep 17 00:00:00 2001
From: Jesse Lentz <jesse.lentz@noaa.gov>
Date: Wed, 17 Jun 2026 11:02:35 -0400
Subject: [PATCH 3/5] Various corrections

---
 mpp/include/group_update_pack.inc   | 45 +++++++++++++++------------
 mpp/include/group_update_unpack.inc | 15 ++++++---
 mpp/include/mpp_group_update.fh     | 47 ++++++++++++++++++++---------
 3 files changed, 69 insertions(+), 38 deletions(-)

diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc
index 35201a96c1..700d2c7c45 100644
--- a/mpp/include/group_update_pack.inc
+++ b/mpp/include/group_update_pack.inc
@@ -154,12 +154,15 @@ contains
     integer :: nd !< Number of dimensions
     integer :: sgn !< Factor to multiply the filled values by
     integer :: i1, i2, i3, i4
-    integer :: ix2, iy2
+    integer :: ij1, ij2
+    integer :: is0, js0
 
     select case (field_type)
       case (FIELD_S)
         ix = group%ix_s
         iy = group%iy_s
+        is0 = group%is_s
+        js0 = group%js_s
 
         nd = size(group%shape_s)
         allocate (lb(nd), ub(nd), n(nd), stride(nd))
@@ -178,6 +181,8 @@ contains
       case (FIELD_X)
         ix = group%ix_v
         iy = group%iy_v
+        is0 = group%is_x
+        js0 = group%js_x
 
         nd = size(group%shape_x)
         allocate (lb(nd), ub(nd), n(nd), stride(nd))
@@ -196,6 +201,8 @@ contains
       case (FIELD_Y)
         ix = group%ix_v
         iy = group%iy_v
+        is0 = group%is_y
+        js0 = group%js_y
 
         nd = size(group%shape_y)
         allocate (lb(nd), ub(nd), n(nd), stride(nd))
@@ -213,34 +220,34 @@ contains
         end select
     end select
 
-    lb(ix) = group%pack_is(ni)
-    lb(iy) = group%pack_js(ni)
+    lb(ix) = group%pack_is(ni) - is0 + 1
+    lb(iy) = group%pack_js(ni) - js0 + 1
 
-    ub(ix) = group%pack_ie(ni)
-    ub(iy) = group%pack_je(ni)
+    ub(ix) = group%pack_ie(ni) - is0 + 1
+    ub(iy) = group%pack_je(ni) - js0 + 1
 
     n = ub - lb + 1
 
     stride = 1
 
     if (iand(packing_flags, REVERSE_I).ne.0) then
-      lb(ix) = group%pack_ie(ni)
-      ub(ix) = group%pack_is(ni)
+      lb(ix) = group%pack_ie(ni) - is0 + 1
+      ub(ix) = group%pack_is(ni) - is0 + 1
       stride(ix) = -1
     endif
 
     if (iand(packing_flags, REVERSE_J).ne.0) then
-      lb(iy) = group%pack_je(ni)
-      ub(iy) = group%pack_js(ni)
+      lb(iy) = group%pack_je(ni) - js0 + 1
+      ub(iy) = group%pack_js(ni) - js0 + 1
       stride(iy) = -1
     endif
 
     if (iand(packing_flags, SWAP_IJ).eq.0) then
-      ix2 = ix
-      iy2 = iy
+      ij1 = ix
+      ij2 = iy
     else
-      ix2 = iy
-      iy2 = ix
+      ij1 = iy
+      ij2 = ix
     endif
 
     sgn = 1
@@ -253,8 +260,8 @@ contains
             integer :: i(2), idx
 
             i = [i1, i2]
-            idx = n(ix2)*stride(iy2)*(i(iy2)-lb(iy2)) + &
-                  stride(ix2)*(i(ix2)-lb(ix2)) + 1
+            idx = n(ij1)*stride(ij2)*(i(ij2)-lb(ij2)) + &
+                  stride(ij1)*(i(ij1)-lb(ij1)) + 1
             buffer(idx) = sgn * field2d(i1,i2)
           end block
         end do
@@ -265,8 +272,8 @@ contains
             integer :: i(3), idx
 
             i = [i1, i2, i3]
-            idx = n(ix2)*( n(iy2)*(i(iz1)-1) + stride(iy2)*(i(iy2)-lb(iy2))) + &
-                  stride(ix2)*(i(ix2)-lb(ix2)) + 1
+            idx = n(ij1)*( n(ij2)*(i(iz1)-1) + stride(ij2)*(i(ij2)-lb(ij2))) + &
+                  stride(ij1)*(i(ij1)-lb(ij1)) + 1
             buffer(idx) = sgn * field3d(i1,i2,i3)
           end block
         end do
@@ -279,8 +286,8 @@ contains
             integer :: i(4), idx
 
             i = [i1, i2, i3, i4]
-            idx = n(ix2)*( n(iy2)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + stride(iy2)*(i(iy2)-lb(iy2))) + &
-                  stride(ix2)*(i(ix2)-lb(ix2)) + 1
+            idx = n(ij1)*( n(ij2)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + stride(ij2)*(i(ij2)-lb(ij2))) + &
+                  stride(ij1)*(i(ij1)-lb(ij1)) + 1
             buffer(idx) = sgn * field4d(i1,i2,i3,i4)
           end block
         end do
diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc
index 48b24c708a..a49ceb684c 100644
--- a/mpp/include/group_update_unpack.inc
+++ b/mpp/include/group_update_unpack.inc
@@ -58,11 +58,14 @@ contains
     integer :: ix, iy, iz1, iz2 !< Indices of the domain-decomposed dimensions
     integer :: nd !< Number of dimensions
     integer :: i1, i2, i3, i4
+    integer :: is0, js0
 
     select case (field_type)
       case (FIELD_S)
         ix = group%ix_s
         iy = group%iy_s
+        is0 = group%is_s
+        js0 = group%js_s
 
         nd = size(group%shape_s)
         allocate (lb(nd), ub(nd), n(nd))
@@ -81,6 +84,8 @@ contains
       case (FIELD_X)
         ix = group%ix_v
         iy = group%iy_v
+        is0 = group%is_x
+        js0 = group%js_x
 
         nd = size(group%shape_x)
         allocate (lb(nd), ub(nd), n(nd))
@@ -99,6 +104,8 @@ contains
       case (FIELD_Y)
         ix = group%ix_v
         iy = group%iy_v
+        is0 = group%is_y
+        js0 = group%js_y
 
         nd = size(group%shape_y)
         allocate (lb(nd), ub(nd), n(nd))
@@ -116,11 +123,11 @@ contains
         end select
     end select
 
-    lb(ix) = group%unpack_is(ni)
-    lb(iy) = group%unpack_js(ni)
+    lb(ix) = group%unpack_is(ni) - is0 + 1
+    lb(iy) = group%unpack_js(ni) - js0 + 1
 
-    ub(ix) = group%unpack_ie(ni)
-    ub(iy) = group%unpack_je(ni)
+    ub(ix) = group%unpack_ie(ni) - is0 + 1
+    ub(iy) = group%unpack_je(ni) - js0 + 1
 
     n = ub - lb + 1
 
diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh
index 48657c951a..4b68382094 100644
--- a/mpp/include/mpp_group_update.fh
+++ b/mpp/include/mpp_group_update.fh
@@ -829,6 +829,7 @@ subroutine SET_ZERO_  (group, field_type, i0, j0)
   integer :: nd !< Number of dimensions
   integer :: ix, iy !< Indices of the domain-decomposed dimensions
   integer :: l, i1, i2, i3, i4
+  integer :: is0, js0
 
   ix = group%ix_v
   iy = group%iy_v
@@ -841,6 +842,8 @@ subroutine SET_ZERO_  (group, field_type, i0, j0)
       case (FIELD_X)
         lb = 1
         ub = group%shape_x
+        is0 = group%is_x
+        js0 = group%js_x
 
         select case(nd)
           case (2)
@@ -853,6 +856,8 @@ subroutine SET_ZERO_  (group, field_type, i0, j0)
       case (FIELD_Y)
         lb = 1
         ub = group%shape_y
+        is0 = group%is_y
+        js0 = group%js_y
 
         select case(nd)
           case (2)
@@ -864,11 +869,11 @@ subroutine SET_ZERO_  (group, field_type, i0, j0)
         end select
     end select
 
-    lb(ix) = i0
-    ub(ix) = i0
+    lb(ix) = i0 - is0 + 1
+    ub(ix) = i0 - is0 + 1
 
-    lb(iy) = j0
-    ub(iy) = j0
+    lb(iy) = j0 - js0 + 1
+    ub(iy) = j0 - js0 + 1
 
     select case(nd)
       case (2)
@@ -901,6 +906,7 @@ subroutine NEGATE_HALO_  (group, field_type, j0, is, ie)
   integer :: nd !< Number of dimensions
   integer :: ix, iy !< Indices of the domain-decomposed dimensions
   integer :: l, i1, i2, i3, i4
+  integer :: is0, js0
 
   ix = group%ix_v
   iy = group%iy_v
@@ -913,6 +919,8 @@ subroutine NEGATE_HALO_  (group, field_type, j0, is, ie)
       case (FIELD_X)
         lb = 1
         ub = group%shape_x
+        is0 = group%is_x
+        js0 = group%js_x
 
         select case(nd)
           case (2)
@@ -925,6 +933,8 @@ subroutine NEGATE_HALO_  (group, field_type, j0, is, ie)
       case (FIELD_Y)
         lb = 1
         ub = group%shape_y
+        is0 = group%is_y
+        js0 = group%js_y
 
         select case(nd)
           case (2)
@@ -936,11 +946,11 @@ subroutine NEGATE_HALO_  (group, field_type, j0, is, ie)
         end select
     end select
 
-    lb(ix) = is
-    ub(ix) = ie
+    lb(ix) = is - is0 + 1
+    ub(ix) = ie - is0 + 1
 
-    lb(iy) = j0
-    ub(iy) = j0
+    lb(iy) = j0 - js0 + 1
+    ub(iy) = j0 - js0 + 1
 
     select case(nd)
       case (2)
@@ -974,6 +984,7 @@ subroutine FLIP_I_  (group, field_type, j0, is, ie, offset)
   integer :: nd !< Number of dimensions
   integer :: ix, iy !< Indices of the domain-decomposed dimensions
   integer :: l, i1, i2, i3, i4
+  integer :: is0, js0, offset1
 
   ix = group%ix_v
   iy = group%iy_v
@@ -986,6 +997,8 @@ subroutine FLIP_I_  (group, field_type, j0, is, ie, offset)
       case (FIELD_X)
         lb = 1
         ub = group%shape_x
+        is0 = group%is_x
+        js0 = group%js_x
 
         select case(nd)
           case (2)
@@ -998,6 +1011,8 @@ subroutine FLIP_I_  (group, field_type, j0, is, ie, offset)
       case (FIELD_Y)
         lb = 1
         ub = group%shape_y
+        is0 = group%is_y
+        js0 = group%js_y
 
         select case(nd)
           case (2)
@@ -1009,11 +1024,13 @@ subroutine FLIP_I_  (group, field_type, j0, is, ie, offset)
         end select
     end select
 
-    lb(ix) = is
-    ub(ix) = ie
+    lb(ix) = is - is0 + 1
+    ub(ix) = ie - is0 + 1
 
-    lb(iy) = j0
-    ub(iy) = j0
+    lb(iy) = j0 - js0 + 1
+    ub(iy) = j0 - js0 + 1
+
+    offset1 = offset - 2*is0 + 2
 
     select case(nd)
       case (2)
@@ -1022,7 +1039,7 @@ subroutine FLIP_I_  (group, field_type, j0, is, ie, offset)
             integer :: src(2)
 
             src = [i1, i2]
-            src(ix) = offset - src(ix)
+            src(ix) = offset1 - src(ix)
             field2d(i1,i2) = field2d(src(1),src(2))
           end block
         end do
@@ -1032,7 +1049,7 @@ subroutine FLIP_I_  (group, field_type, j0, is, ie, offset)
             integer :: src(3)
 
             src = [i1, i2, i3]
-            src(ix) = offset - src(ix)
+            src(ix) = offset1 - src(ix)
             field3d(i1,i2,i3) = field3d(src(1),src(2),src(3))
           end block
         end do
@@ -1042,7 +1059,7 @@ subroutine FLIP_I_  (group, field_type, j0, is, ie, offset)
             integer :: src(4)
 
             src = [i1, i2, i3, i4]
-            src(ix) = offset - src(ix)
+            src(ix) = offset1 - src(ix)
             field4d(i1,i2,i3,i4) = field4d(src(1),src(2),src(3),src(4))
           end block
         end do

From b9298822486a39af04a5433365135e71ec7676b9 Mon Sep 17 00:00:00 2001
From: Jesse Lentz <jesse.lentz@noaa.gov>
Date: Wed, 17 Jun 2026 12:09:59 -0400
Subject: [PATCH 4/5] Rename `buffer` to `buf` within inner loops

---
 mpp/include/group_update_pack.inc   | 10 +++++-----
 mpp/include/group_update_unpack.inc |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc
index 700d2c7c45..29b03f5bc5 100644
--- a/mpp/include/group_update_pack.inc
+++ b/mpp/include/group_update_pack.inc
@@ -137,8 +137,8 @@ subroutine GROUP_UPDATE_PACK_ (group, buffer, buffer_start_pos)
 
 contains
 
-  subroutine pack_inner_loop (buffer, group, field_type, ni, l, packing_flags)
-    MPP_TYPE_, intent(out) :: buffer(:)
+  subroutine pack_inner_loop (buf, group, field_type, ni, l, packing_flags)
+    MPP_TYPE_, intent(out) :: buf(:)
     type(mpp_group_update_type), intent(in), target :: group
     integer, intent(in) :: field_type !< FIELD_S, FIELD_X, or FIELD_Y
     integer, intent(in) :: ni
@@ -262,7 +262,7 @@ contains
             i = [i1, i2]
             idx = n(ij1)*stride(ij2)*(i(ij2)-lb(ij2)) + &
                   stride(ij1)*(i(ij1)-lb(ij1)) + 1
-            buffer(idx) = sgn * field2d(i1,i2)
+            buf(idx) = sgn * field2d(i1,i2)
           end block
         end do
       case (3)
@@ -274,7 +274,7 @@ contains
             i = [i1, i2, i3]
             idx = n(ij1)*( n(ij2)*(i(iz1)-1) + stride(ij2)*(i(ij2)-lb(ij2))) + &
                   stride(ij1)*(i(ij1)-lb(ij1)) + 1
-            buffer(idx) = sgn * field3d(i1,i2,i3)
+            buf(idx) = sgn * field3d(i1,i2,i3)
           end block
         end do
       case (4)
@@ -288,7 +288,7 @@ contains
             i = [i1, i2, i3, i4]
             idx = n(ij1)*( n(ij2)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + stride(ij2)*(i(ij2)-lb(ij2))) + &
                   stride(ij1)*(i(ij1)-lb(ij1)) + 1
-            buffer(idx) = sgn * field4d(i1,i2,i3,i4)
+            buf(idx) = sgn * field4d(i1,i2,i3,i4)
           end block
         end do
     end select
diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc
index a49ceb684c..bac3d37a5d 100644
--- a/mpp/include/group_update_unpack.inc
+++ b/mpp/include/group_update_unpack.inc
@@ -139,7 +139,7 @@ contains
 
               i = [i1, i2]
               idx = n(ix)*(i(iy)-lb(iy)) + (i(ix)-lb(ix)) + 1
-              field2d(i1,i2) = buffer(idx)
+              field2d(i1,i2) = buf(idx)
             end block
         end do
       case (3)
@@ -150,7 +150,7 @@ contains
 
               i = [i1, i2, i3]
               idx = n(ix)*( n(iy)*(i(iz1)-1) + (i(iy)-lb(iy))) + (i(ix)-lb(ix)) + 1
-              field3d(i1,i2,i3) = buffer(idx)
+              field3d(i1,i2,i3) = buf(idx)
             end block
         end do
       case (4)
@@ -162,7 +162,7 @@ contains
 
               i = [i1, i2, i3, i4]
               idx = n(ix)*( n(iy)*( n(iz1)*(i(iz2)-1) + (i(iz1)-1)) + (i(iy)-lb(iy))) + (i(ix)-lb(ix)) + 1
-              field4d(i1,i2,i3,i4) = buffer(idx)
+              field4d(i1,i2,i3,i4) = buf(idx)
             end block
         end do
     end select

From f895ec5732063a18f99d056457458c77517e06d3 Mon Sep 17 00:00:00 2001
From: Jesse Lentz <jesse.lentz@noaa.gov>
Date: Thu, 18 Jun 2026 14:50:32 -0400
Subject: [PATCH 5/5] Deallocate shape arrays

---
 mpp/include/mpp_domains_util.inc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mpp/include/mpp_domains_util.inc b/mpp/include/mpp_domains_util.inc
index 839a607fd5..e2d292b4b6 100644
--- a/mpp/include/mpp_domains_util.inc
+++ b/mpp/include/mpp_domains_util.inc
@@ -2390,6 +2390,10 @@ end subroutine set_group_update
      group%nunpack = 0
      group%initialized = .false.
 
+     if (allocated(group%shape_s)) deallocate(group%shape_s)
+     if (allocated(group%shape_x)) deallocate(group%shape_x)
+     if (allocated(group%shape_y)) deallocate(group%shape_y)
+
   end subroutine mpp_clear_group_update
 
 !#####################################################################